
vLLM Cheat Sheet

Quick reference for commonly used vLLM commands, parameters, and code snippets.

Installation Commands

# Install with pip (recommended)
pip install vllm

# Install the nightly development build
pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly

# Install Flash Attention (optional, improves performance)
pip install flash-attn --no-build-isolation

# Install the benchmark tooling
pip install vllm[benchmark]

# Pull the Docker image
docker pull vllm/vllm-openai:latest

Starting the Server

Basic Commands

# Simplest startup
vllm serve MODEL_NAME

# Specify host and port
vllm serve MODEL_NAME --host 0.0.0.0 --port 8000

# Set an API key
vllm serve MODEL_NAME --api-key sk-your-key

Full Parameter Example

vllm serve meta-llama/Llama-2-7b-chat-hf \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 1 \
--pipeline-parallel-size 1 \
--data-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 4096 \
--max-num-seqs 256 \
--quantization awq \
--dtype auto \
--enable-prefix-caching \
--enable-chunked-prefill \
--api-key sk-your-key
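
Once the server is running, a quick sanity check is to list the models it exposes via the OpenAI-compatible /v1/models endpoint. The sketch below assumes the server started with the command above and the placeholder key sk-your-key.

from openai import OpenAI

# Minimal health check: list the models served by the endpoint above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-your-key")
print([m.id for m in client.models.list()])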

Python API

Basic Inference

from vllm import LLM, SamplingParams

# Load the model
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")

# Configure sampling parameters
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=100
)

# Generate text
outputs = llm.generate(["Hello, world!"], sampling_params)
print(outputs[0].outputs[0].text)

LLM Initialization Parameters

LLM(
    model="MODEL_NAME",                # Model name or path

    # Parallelism
    tensor_parallel_size=1,            # Tensor parallel size
    pipeline_parallel_size=1,          # Pipeline parallel size
    data_parallel_size=1,              # Data parallel size

    # GPU memory
    gpu_memory_utilization=0.9,        # GPU memory utilization (0-1)
    max_model_len=4096,                # Maximum model (context) length
    max_num_seqs=256,                  # Maximum number of concurrent sequences
    max_num_batched_tokens=8192,       # Maximum tokens per batch

    # Quantization
    quantization=None,                 # Quantization type: awq, gptq, fp8
    dtype="auto",                      # Data type: auto, float16, bfloat16
    load_format="auto",                # Weight loading format

    # Performance tuning
    enable_prefix_caching=False,       # Enable prefix caching
    enable_chunked_prefill=False,      # Enable chunked prefill
    enforce_eager=False,               # Force eager mode (disable CUDA graphs)

    # Speculative decoding
    speculative_model=None,            # Draft model
    num_speculative_tokens=5,          # Number of speculative tokens

    # LoRA
    enable_lora=False,                 # Enable LoRA
    max_lora_rank=16,                  # Maximum LoRA rank
    max_loras=1,                       # Number of LoRAs loaded concurrently
)

SamplingParams Parameters

SamplingParams(
    # Basics
    n=1,                         # Number of completions per prompt
    temperature=1.0,             # Temperature (0-2); higher is more random
    top_p=1.0,                   # Nucleus sampling threshold (0-1)
    top_k=-1,                    # Top-k sampling; -1 disables it
    max_tokens=16,               # Maximum number of generated tokens
    min_tokens=0,                # Minimum number of generated tokens

    # Penalties
    presence_penalty=0.0,        # Presence penalty (-2 to 2)
    frequency_penalty=0.0,       # Frequency penalty (-2 to 2)
    repetition_penalty=1.0,      # Repetition penalty (0-2)

    # Stopping conditions
    stop=None,                   # List of stop strings
    stop_token_ids=None,         # List of stop token IDs
    ignore_eos=False,            # Whether to ignore the EOS token

    # Other
    logprobs=None,               # Number of logprobs to return
    prompt_logprobs=None,        # Number of prompt logprobs to return
    skip_special_tokens=True,    # Skip special tokens in the output
)

Sampling Parameter Presets

# Creative writing - high randomness
creative = SamplingParams(temperature=1.2, top_p=0.95, max_tokens=500)

# Code generation - low randomness, discourage repetition
code = SamplingParams(temperature=0.2, top_p=0.95, repetition_penalty=1.1, max_tokens=200)

# Question answering - near-deterministic output
qa = SamplingParams(temperature=0.1, top_p=0.9, max_tokens=200)

# Chat - moderate randomness, discourage repetition
chat = SamplingParams(temperature=0.7, top_p=0.9, frequency_penalty=0.3, max_tokens=150)

# Multiple candidates per prompt
multi_choice = SamplingParams(n=3, temperature=0.8, max_tokens=100)

# Structured output (JSON) - deterministic. Note: response_format is a server-side
# API parameter, not a SamplingParams field; for offline inference combine this
# with StructuredOutputsParams (see the Structured Output section below).
structured = SamplingParams(temperature=0.0, max_tokens=200)

Extra Request Parameters

# Pass vLLM-specific parameters via extra_body
# (client is the OpenAI client configured in the Python Client section below)
response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[...],
    extra_body={
        # Sampling
        "top_k": 50,                    # Top-k sampling
        "min_p": 0.05,                  # Minimum probability threshold
        "repetition_penalty": 1.1,      # Repetition penalty

        # Request control
        "priority": 1,                  # Request priority (lower = higher priority)
        "request_id": "req-001",        # Request tracing ID

        # Stopping conditions
        "stop_token_ids": [2, 3],       # Stop token IDs
        "min_tokens": 10,               # Minimum number of generated tokens
        "ignore_eos": False,            # Ignore EOS

        # Security
        "cache_salt": "random-salt",    # Prefix-cache salt (multi-tenant isolation)

        # Repetition detection
        "repetition_detection": {
            "ngram_size": 3,
            "num_repeats": 5
        }
    }
)

HTTP API

Completions API

curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "MODEL_NAME",
"prompt": "Hello, world!",
"max_tokens": 100,
"temperature": 0.7,
"top_p": 0.95
}'

Chat Completions API

curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "MODEL_NAME",
"messages": [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "Hello!"}
],
"max_tokens": 100,
"temperature": 0.7
}'

Streaming Output

curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "MODEL_NAME",
"messages": [{"role": "user", "content": "讲一个故事"}],
"stream": true,
"max_tokens": 500
}'

Python Client

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"  # or your API key
)

# Completions
response = client.completions.create(
    model="MODEL_NAME",
    prompt="Hello, world!",
    max_tokens=100
)
print(response.choices[0].text)

# Chat Completions
response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Hello!"}]
)
print(response.choices[0].message.content)

# Streaming
stream = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

# Async client
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

async def generate():
    response = await async_client.chat.completions.create(
        model="MODEL_NAME",
        messages=[{"role": "user", "content": "Hello!"}]
    )
    return response.choices[0].message.content

# Run it
# asyncio.run(generate())

Quantization Options

| Type | Flag                       | Memory savings | Speed    | Accuracy loss | Use case              |
| ---- | -------------------------- | -------------- | -------- | ------------- | --------------------- |
| FP16 | --dtype float16            | baseline       | baseline | –             | general purpose       |
| BF16 | --dtype bfloat16           | 0%             | baseline | –             | recommended (Ampere+) |
| FP8  | --quantization fp8         | ~50%           | –        | –             | Hopper GPUs           |
| INT8 | --load-format bitsandbytes | ~50%           | –        | –             | general purpose       |
| AWQ  | --quantization awq         | ~65%           | fast     | moderate      | memory-constrained    |
| GPTQ | --quantization gptq        | ~65%           | fast     | moderate      | memory-constrained    |

# FP8 quantization (requires Hopper GPUs)
vllm serve MODEL_NAME --quantization fp8

# AWQ quantization
vllm serve MODEL_NAME --quantization awq

# GPTQ quantization
vllm serve MODEL_NAME --quantization gptq

# bitsandbytes INT8
vllm serve MODEL_NAME --load-format bitsandbytes
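
The same quantization modes can be selected from the Python API. A minimal sketch (AWQ_MODEL_NAME is a placeholder for a checkpoint that is already quantized in AWQ format):

from vllm import LLM

# Load an AWQ-quantized checkpoint; the quantization argument must match
# how the checkpoint was quantized (awq, gptq, fp8, ...)
llm = LLM(
    model="AWQ_MODEL_NAME",
    quantization="awq",
    dtype="auto",
)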

Parallelism Configuration

Tensor Parallelism (TP)

# Single node, multiple GPUs
vllm serve MODEL_NAME --tensor-parallel-size 4

# Python
llm = LLM(model="MODEL_NAME", tensor_parallel_size=4)

Pipeline Parallelism (PP)

vllm serve MODEL_NAME --pipeline-parallel-size 2

# Python
llm = LLM(model="MODEL_NAME", pipeline_parallel_size=2)

Data Parallelism (DP)

# Built-in load balancing
vllm serve MODEL_NAME --data-parallel-size 4

# Combined with TP
vllm serve MODEL_NAME --data-parallel-size 2 --tensor-parallel-size 2

# Using the Ray backend
vllm serve MODEL_NAME --data-parallel-size 4 --data-parallel-backend ray

Hybrid Parallelism

# TP=2, PP=2, DP=2 (8 GPUs total)
vllm serve MODEL_NAME \
--tensor-parallel-size 2 \
--pipeline-parallel-size 2 \
--data-parallel-size 2
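
For offline inference the same parallel sizes can be passed to the LLM constructor. A minimal sketch (data parallelism is usually handled by the serve command or by running multiple engine instances, so only TP and PP are shown here):

from vllm import LLM

# TP=2 x PP=2 -> one engine replica spanning 4 GPUs
llm = LLM(
    model="MODEL_NAME",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,
)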

Structured Output

Choice (select from a fixed set of options)

# Online serving
response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Sentiment analysis: This product is great!"}],
    extra_body={
        "structured_outputs": {
            "choice": ["positive", "negative", "neutral"]
        }
    }
)

# Offline inference
from vllm.sampling_params import StructuredOutputsParams

structured_outputs = StructuredOutputsParams(
    choice=["positive", "negative", "neutral"]
)
sampling_params = SamplingParams(structured_outputs=structured_outputs)

JSON Schema

from pydantic import BaseModel

class Person(BaseModel):
    name: str
    age: int

# Online serving
response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Generate a person profile"}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "person",
            "schema": Person.model_json_schema()
        }
    }
)

# Offline inference
structured_outputs = StructuredOutputsParams(
    json=Person.model_json_schema()
)

Regex (Regular Expressions)

response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Generate an email address"}],
    extra_body={
        "structured_outputs": {
            "regex": r"\w+@example\.com"
        }
    }
)

Grammar

sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " FROM " table
column ::= "name " | "age "
table ::= "users "
"""

response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Generate SQL"}],
    extra_body={"structured_outputs": {"grammar": sql_grammar}}
)

Configuring the Backend

# Choose the structured-output backend at startup
vllm serve MODEL_NAME \
--structured-outputs-config.backend xgrammar
# Options: auto, xgrammar, guidance, outlines, lm-format-enforcer

Performance Optimization

# Enable prefix caching
--enable-prefix-caching

# Enable chunked prefill
--enable-chunked-prefill --max-num-batched-tokens 2048

# Speculative decoding
--speculative-model DRAFT_MODEL --num-speculative-tokens 5

# Tune batching
--max-num-seqs 512 --max-num-batched-tokens 16384

# Expert parallelism (MoE models)
--enable-expert-parallel
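
The same switches exist on the LLM constructor for offline inference. A minimal sketch using the flag values from above (illustrative, not tuned recommendations):

from vllm import LLM

llm = LLM(
    model="MODEL_NAME",
    enable_prefix_caching=True,      # reuse KV cache across shared prompt prefixes
    enable_chunked_prefill=True,     # split long prefills into smaller chunks
    max_num_batched_tokens=2048,     # cap tokens processed per scheduler step
    max_num_seqs=512,                # allow more concurrent sequences
)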

LoRA Adapters

Starting the Server

# Enable LoRA support
vllm serve meta-llama/Llama-2-7b-hf \
--enable-lora \
--lora-modules sql-lora=/path/to/sql-lora \
--max-lora-rank 64

# Multiple LoRA adapters
vllm serve meta-llama/Llama-2-7b-hf \
--enable-lora \
--lora-modules \
sql-lora=/path/to/sql-lora \
chat-lora=/path/to/chat-lora \
code-lora=/path/to/code-lora

Python Offline Inference

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_lora_rank=64
)

outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("adapter_name", 1, "/path/to/lora")
)

Dynamic Load/Unload

# Enable runtime LoRA updates
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True

# Load an adapter
curl -X POST http://localhost:8000/v1/load_lora_adapter \
-H "Content-Type: application/json" \
-d '{"lora_name": "new-adapter", "lora_path": "/path/to/lora"}'

# Unload an adapter
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
-H "Content-Type: application/json" \
-d '{"lora_name": "new-adapter"}'

LoRA Parameters

| Parameter             | Description                                | Default           |
| --------------------- | ------------------------------------------ | ----------------- |
| max_lora_rank         | Maximum LoRA rank                          | 16                |
| max_loras             | Maximum number of adapters loaded at once  | 1                 |
| max_cpu_loras         | Number of adapters cached on CPU           | same as max_loras |
| lora_extra_vocab_size | Extra vocabulary size for LoRA             | 256               |
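
These parameters map directly onto the LLM constructor. A minimal sketch with illustrative values (not tuned recommendations):

from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_lora_rank=64,   # largest adapter rank that will be accepted
    max_loras=4,        # adapters resident on the GPU at the same time
    max_cpu_loras=8,    # additional adapters cached in CPU memory
)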

Multi-Node Deployment

Using Ray

# Head node
ray start --head --port=6379

# Worker nodes
ray start --address="HEAD_NODE_IP:6379"

# Launch vLLM
vllm serve MODEL_NAME \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--distributed-executor-backend ray

Using multiprocessing

# Head node
vllm serve MODEL_NAME \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--nnodes 2 \
--node-rank 0 \
--master-addr HEAD_NODE_IP

# Worker node
vllm serve MODEL_NAME \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--nnodes 2 \
--node-rank 1 \
--master-addr HEAD_NODE_IP \
--headless

Environment Variables

# Logging level
export VLLM_LOGGING_LEVEL=DEBUG

# CUDA devices
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Hugging Face mirror (useful in mainland China)
export HF_ENDPOINT=https://hf-mirror.com

# NCCL debugging
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=1800

# Dynamic LoRA updates
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True

Common Issues Quick Reference

Out of GPU Memory

llm = LLM(
    model="MODEL_NAME",
    gpu_memory_utilization=0.7,  # leave more memory headroom
    max_model_len=2048,          # shrink the KV cache
    quantization="awq"           # use a quantized checkpoint
)

Model Download Issues

# Use a mirror
export HF_ENDPOINT=https://hf-mirror.com

# Download manually
huggingface-cli download MODEL_NAME --local-dir ./models

Selecting GPUs

# Option 1: environment variable
CUDA_VISIBLE_DEVICES=0,1 vllm serve MODEL_NAME

# Option 2: command-line flag
vllm serve MODEL_NAME --tensor-parallel-size 2

Benchmarking

# Throughput benchmark
python -m vllm benchmark throughput \
--model MODEL_NAME \
--num-prompts 1000 \
--input-len 512 \
--output-len 128

# Latency benchmark
python -m vllm benchmark latency \
--model MODEL_NAME \
--input-len 512 \
--output-len 128

Model Memory Estimates

| Model        | Parameters | FP16 memory | INT8 memory | 4-bit memory |
| ------------ | ---------- | ----------- | ----------- | ------------ |
| Llama-2-7B   | 7B         | ~14 GB      | ~7 GB       | ~4 GB        |
| Llama-2-13B  | 13B        | ~26 GB      | ~13 GB      | ~7 GB        |
| Llama-2-70B  | 70B        | ~140 GB     | ~70 GB      | ~35 GB       |
| Qwen-72B     | 72B        | ~144 GB     | ~72 GB      | ~36 GB       |
| DeepSeek-67B | 67B        | ~134 GB     | ~67 GB      | ~34 GB       |
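
These figures follow a simple rule of thumb: weight memory is roughly parameter count times bytes per parameter (2 bytes for FP16/BF16, 1 for INT8, about 0.5 for 4-bit), excluding KV cache, activations, and runtime overhead. A quick sketch of the arithmetic:

def weight_memory_gb(params_billion: float, bits: int) -> float:
    """Rough weight-only estimate; ignores KV cache, activations, and overhead."""
    return params_billion * 1e9 * bits / 8 / 1e9  # 1 GB = 1e9 bytes, as in the table

print(weight_memory_gb(7, 16))   # ~14 GB  (Llama-2-7B, FP16)
print(weight_memory_gb(70, 8))   # ~70 GB  (Llama-2-70B, INT8)
print(weight_memory_gb(72, 4))   # ~36 GB  (Qwen-72B, 4-bit)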

Reference Links