A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

import subprocess, sys
def pip(*pkgs):
    """Install the given requirement specifiers with pip, quietly.

    Args:
        *pkgs: Requirement strings forwarded verbatim to ``pip install``.
            When none are given the call is a no-op.

    Raises:
        subprocess.CalledProcessError: If the pip process exits non-zero.
    """
    if not pkgs:
        # A bare "pip install" exits with an error; nothing to do here.
        return
    # Run pip through sys.executable so the install targets the interpreter
    # that is executing this script.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])
# Install the third-party packages that the imports below rely on.
pip("llmcompressor", "compressed-tensors",
    "transformers>=4.45", "accelerate", "datasets")
import os, gc, time, json, math
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Fail fast with an actionable message: the code below calls torch.cuda
# directly and loads models with device_map="cuda".
assert torch.cuda.is_available(), \
    "Enable a GPU: Runtime > Change runtime type > T4 GPU"
print("GPU:", torch.cuda.get_device_name(0),
      "| CUDA:", torch.version.cuda,
      "| torch:", torch.__version__)

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
WORKDIR = Path("/content/quant_lab")
# parents=True so the directory is also created when /content is missing.
WORKDIR.mkdir(parents=True, exist_ok=True)
# Relative paths used from here on resolve inside WORKDIR.
os.chdir(WORKDIR)
def free_mem():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so tensors that are only kept alive by garbage are
    # dropped before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the total size of all files under ``path`` in gigabytes.

    Args:
        path: Directory to walk recursively.

    Returns:
        Sum of the file sizes in bytes divided by 1e9 (decimal GB). A path
        that ``os.walk`` yields nothing for gives 0.
    """
    total = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, files in os.walk(path)
        for name in files
    )
    return total / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy-decode ``prompt`` and time it after a brief warmup.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tok: Tokenizer used to encode ``prompt`` and decode the output.
        prompt: Text to continue.
        max_new_tokens: Upper bound on tokens generated in the timed run.

    Returns:
        Tuple ``(text, latency_s, tokens_per_s)``. ``text`` is the decoded
        continuation without special tokens; ``tokens_per_s`` is computed
        from the number of tokens actually generated.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    gen_kwargs = dict(do_sample=False, pad_token_id=tok.eos_token_id)

    # Short untimed run so one-off startup cost stays out of the measurement.
    model.generate(**inputs, max_new_tokens=4, **gen_kwargs)
    torch.cuda.synchronize()

    t0 = time.perf_counter()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
    # Wait for queued GPU work to finish before stopping the clock.
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Drop the prompt tokens; what remains is the newly generated part.
    new_ids = out[0][inputs["input_ids"].shape[1]:]
    # Generation can stop at EOS before max_new_tokens, so count what was
    # really produced instead of assuming the upper bound was reached.
    n_new = int(new_ids.shape[0])
    return tok.decode(new_ids, skip_special_tokens=True), dt, n_new / dt
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Joins the non-blank entries among the first 400 rows of the WikiText-2
    raw test split, tokenizes them as one sequence, and scores up to
    ``max_chunks`` windows of ``seq_len`` tokens taken every ``stride`` tokens.

    Args:
        model: Model called as ``model(chunk, labels=chunk)`` whose output
            has a ``loss`` attribute.
        tok: Tokenizer used to encode the text.
        seq_len: Number of tokens per scored window.
        max_chunks: Maximum number of windows to score.
        stride: Offset in tokens between consecutive window starts.

    Returns:
        ``exp`` of the average per-window loss.

    Raises:
        ValueError: If the tokenized text is shorter than ``seq_len`` so that
            no window can be scored.
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)
    n_tokens = enc.size(1)

    nll_sum, tok_count = 0.0, 0
    # "+ 1" keeps the last full-length window (begin == n_tokens - seq_len).
    starts = range(0, n_tokens - seq_len + 1, stride)
    for n_chunks, begin in enumerate(starts, start=1):
        chunk = enc[:, begin:begin + seq_len]
        out = model(chunk, labels=chunk)
        # Every window has exactly seq_len tokens, so this weighting makes
        # the final ratio the plain mean of the per-window losses.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if n_chunks >= max_chunks:
            break

    if tok_count == 0:
        raise ValueError(
            f"tokenized text has {n_tokens} tokens; need at least "
            f"seq_len={seq_len} to score one window")
    return math.exp(nll_sum / tok_count)
# Maps a benchmark label to its metrics dict; filled in by benchmark().
results = {}
# Prompt written out by hand with <|im_start|>/<|im_end|> markers; it ends
# right after the assistant marker so generation continues from there.
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
def benchmark(label, model_path_or_id):
    """Load a model, measure it, and record the metrics in ``results``.

    Stores under ``results[label]`` a dict with the on-disk size in GB
    (``None`` when ``model_path_or_id`` is not a local directory), the
    perplexity from ``wikitext_ppl``, the latency and tokens/sec from
    ``time_generation``, and the first 180 characters of the generated
    sample. The entry is also printed as JSON.

    Args:
        label: Key under which the metrics are stored in ``results``.
        model_path_or_id: Value passed to ``from_pretrained`` for both the
            tokenizer and the model.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = None
    try:
        m = AutoModelForCausalLM.from_pretrained(
            model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
        sample, dt, tps = time_generation(m, tok, PROMPT)
        ppl = wikitext_ppl(m, tok)
    finally:
        # Release the model even when a measurement fails, so the next
        # benchmark does not start with this one still held in GPU memory.
        del m
        free_mem()

    is_local_dir = os.path.isdir(str(model_path_or_id))
    size = dir_size_gb(model_path_or_id) if is_local_dir else None
    results[label] = {"size_gb": size, "ppl": round(ppl, 3),
                      "latency_s": round(dt, 3), "tok_per_s": round(tps, 1),
                      "sample": sample.strip().replace("\n", " ")[:180]}
    print(json.dumps(results[label], indent=2))

Source link

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

Toward a future that preserves benefits of neurotechnology for all | MIT News

How America's 250th birthday became a test of AI-powered collective intelligence

Takeda signs US$600M AI drug discovery deal with Insilico

Mistral AI Releases Leanstral 1.5: An Apache-2.0 Lean 4 Code Agent Model Solving 587 of 672 PutnamBench Problems

MIT in the media: Innovating and educating for the next 250 years of America | MIT News

HP accelerates enterprise workflows with OpenAI Frontier

Moonbeam Pivots From Polkadot to Base to Build AI Agents

Vitalik Buterin Unveils New ‘Lean Ethereum’ Strawmap

Bitcoin Bounces Above $63K Following Strategy-fueled Selloff

Trader Turns $2 Million of ETH Into $14,208 as Lighter Token Rallies 53%

What Does the Average Canadian’s TFSA Look Like at 55?

Top Insights

Bitcoin Shrugs Off Strategy FUD, Hits New 2-Week Peak in Early Signs of Structural Stabilization

Stock Indexes Settle Higher as Big Tech and Chip Stocks Rally

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

Related Posts