# Install llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
scl enable gcc-toolset-13 bash # GCC 13 is required
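Once inside the toolset shell, a quick sanity check (this assumes an RHEL/CentOS-family system using Software Collections):
gcc --version # should report 13.x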
# CPU-only build
cmake -B build
cmake --build build --config Release
# GPU (CUDA) build, in a separate directory so it does not overwrite the CPU build
cmake -B build_gpu -DGGML_CUDA=ON
cmake --build build_gpu --config Release
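To verify that a build succeeded, list the produced binaries and print the version string (shown for the CPU build; substitute build_gpu for the CUDA build):
ls build/bin/
build/bin/llama-cli --version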
Convert the Hugging Face model to GGUF:
python convert_hf_to_gguf.py /data/qwen3b/Qwen/Qwen2___5-3B-Instruct/ --outtype f16 --outfile qwen2.5-3b-instruct-f16.gguf
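Before quantizing, the f16 GGUF can be smoke-tested with llama-cli (the prompt and token count here are arbitrary illustrative values):
build/bin/llama-cli -m qwen2.5-3b-instruct-f16.gguf -p "Hello, introduce yourself." -n 64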
Sometimes it is preferable to use fp32 as the starting point for quantization. In that case, use:
python convert_hf_to_gguf.py /data/qwen3b/Qwen/Qwen2___5-3B-Instruct/ --outtype f32 --outfile qwen2.5-3b-instruct-f32.gguf
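Since fp32 stores 4 bytes per weight versus 2 for fp16, expect the f32 file to be roughly twice the size of the f16 one; a quick check:
ls -lh qwen2.5-3b-instruct-*.gguf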
Quantize to Q8_0:
build_gpu/bin/llama-quantize qwen2.5-3b-instruct-f16.gguf qwen2.5-3b-instruct-q8_0.gguf Q8_0
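The quantized model can then be run interactively or served over HTTP (the port is arbitrary; -ngl 99 offloads all layers to the GPU and only applies to the CUDA build):
build_gpu/bin/llama-cli -m qwen2.5-3b-instruct-q8_0.gguf -ngl 99
# or serve it:
build_gpu/bin/llama-server -m qwen2.5-3b-instruct-q8_0.gguf --port 8080 -ngl 99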