安装
git clone https://github.com/AutoGPTQ/AutoGPTQ
cd AutoGPTQ
pip install -vvv --no-build-isolation -e .
代码:
# Quantize a local Qwen2.5-3B-Instruct checkpoint to 4-bit GPTQ with AutoGPTQ
# and save the quantized weights plus tokenizer to `quant_path`.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer


# Paths: the source (full-precision) model and the output directory for the
# quantized artifacts.
model_path = "/data/qwen3b/Qwen/Qwen2___5-3B-Instruct/"
quant_path = "/data/qwen3b/Qwen/Qwen2___5-3B-Instruct-4bit-gptq/"

quantize_config = BaseQuantizeConfig(
    bits=4,  # 4 or 8
    group_size=128,
    damp_percent=0.01,
    # set to False can significantly speed up inference but the perplexity may slightly bad
    desc_act=False,
    static_groups=False,
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
)

# Load your tokenizer and model with AutoGPTQ.
# To learn about loading model to multiple GPUs, visit
# https://github.com/AutoGPTQ/AutoGPTQ/blob/main/docs/tutorial/02-Advanced-Model-Loading-and-Best-Practice.md
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)

# Calibration data for GPTQ: a list of tokenized text samples.
# NOTE(review): a single short sentence is a very small calibration set —
# quantization quality usually improves with more/longer samples; confirm
# this is intentional.
examples = [
    tokenizer(
        "Auto-GPTQ 是一个简单易用的模型量化库,基于 GPTQ 算法,具有用户友好的 API。"
    )
]

# cache_examples_on_gpu=False keeps calibration tensors off the GPU,
# trading speed for lower GPU memory usage during quantization.
model.quantize(examples, cache_examples_on_gpu=False)

# Persist the quantized model (safetensors format) and its tokenizer so the
# output directory is directly loadable.
model.save_quantized(quant_path, use_safetensors=True)
tokenizer.save_pretrained(quant_path)