RAG 速查表
本文档提供 RAG 开发中常用的代码片段、配置模板和问题排查指南。
1. 快速开始
最小可行 RAG
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
# 1. Load the source document from disk
loader = TextLoader("document.txt")
documents = loader.load()
# 2. Split into overlapping chunks (500 chars, 100-char overlap)
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)
# 3. Embed the chunks and build a Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(chunks, embeddings)
# 4. Wrap the store as a retriever returning the top-5 matches
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# 5. Assemble the RAG chain (retriever + LLM)
from langchain.chains import RetrievalQA
llm = ChatOpenAI(model="gpt-4o-mini")
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
# 6. Query end to end; the answer text is under the "result" key
answer = qa.invoke("文档的主要内容是什么?")
print(answer["result"])
2. 分块参数速查
| 文档类型 | 推荐块大小 | 重叠 | 分隔符优先级 |
|---|---|---|---|
| 技术文档 | 800-1000 | 15-20% | \n\n, \n, 。 |
| FAQ | 按问答对 | 0 | 按 Q: A: 分割 |
| 法律文档 | 1000-1500 | 20% | 按条款分割 |
| 新闻文章 | 500-800 | 10-15% | \n\n, \n, 。 |
| 代码文件 | 按函数 | 0 | 按函数/类分割 |
# Chinese-aware splitter: separator priority falls back from paragraph
# breaks to sentence-ending punctuation, then clause punctuation, then
# spaces, then character-level splits as a last resort.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=160,  # 20% of chunk_size, matching the table above
    separators=["\n\n", "\n", "。", "!", "?", ";", ",", " ", ""]
)
3. 嵌入模型速查
中文场景
# Recommended for Chinese: BGE-large-zh
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
# Usage: BGE retrieval works best with the model's instruction string
# prefixed to the query (the Chinese prefix in the literal below);
# documents are encoded without the prefix.
query = "为这个句子生成表示以用于检索相关文章:如何学习Python?"
embedding = model.encode([query], normalize_embeddings=True)
英文场景
# OpenAI hosted embeddings
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# E5 models require a role prefix: "query: " for queries, "passage: " for documents
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
query_embedding = model.encode(["query: What is RAG?"])
doc_embedding = model.encode(["passage: RAG is retrieval-augmented generation..."])
多语言
# BGE-M3: self-hosted multilingual embedding model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-m3')
# Or Cohere's hosted multilingual model
from langchain_cohere import CohereEmbeddings
embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
4. 向量数据库连接
Chroma(本地开发)
import chromadb
from langchain_community.vectorstores import Chroma
# Persist vectors on local disk under ./chroma_db
client = chromadb.PersistentClient(path="./chroma_db")
vectorstore = Chroma(
    client=client,
    collection_name="documents",
    embedding_function=embeddings  # reuses the `embeddings` object defined earlier
)
Pinecone(生产云服务)
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
pc = Pinecone(api_key="your-api-key")  # replace with a real key (prefer an env var)
index = pc.Index("your-index-name")
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"  # metadata field that stores the raw chunk text
)
Milvus(大规模自托管)
from pymilvus import connections
from langchain_milvus import Milvus
# Connect to a locally running Milvus server (default gRPC port 19530)
connections.connect("default", host="localhost", port="19530")
vectorstore = Milvus(
    embedding_function=embeddings,
    collection_name="documents",
    connection_args={"host": "localhost", "port": "19530"}
)
Qdrant(高性能自托管)
from qdrant_client import QdrantClient
from langchain_qdrant import Qdrant
# Connect to a locally running Qdrant instance (default REST port 6333)
client = QdrantClient(host="localhost", port=6333)
vectorstore = Qdrant(
    client=client,
    collection_name="documents",
    embeddings=embeddings
)
5. 检索策略代码
基础向量检索
# Plain vector similarity search: top-5 most similar chunks
results = vectorstore.similarity_search(query, k=5)
带分数检索
# NOTE(review): score semantics are backend-specific — e.g. Chroma returns a
# distance (lower = more similar) — verify for your vector store.
results = vectorstore.similarity_search_with_score(query, k=5)
for doc, score in results:
    print(f"分数: {score:.4f}")
混合检索
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
# Dense (vector) and sparse (BM25 keyword) retrievers over the same corpus
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 5
# Fuse both result lists, weighting dense hits slightly above keyword hits
ensemble = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.4, 0.6]
)
多查询检索
from langchain.retrievers import MultiQueryRetriever
# Uses the LLM to generate query variants and merges their retrieval results
multi_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)
元数据过滤
# Restrict the search to chunks whose metadata matches the filter.
# NOTE(review): operator syntax ($gte) is store-specific — verify that your
# backend supports it.
results = vectorstore.similarity_search(
    query,
    k=5,
    filter={"category": "技术文档", "year": {"$gte": 2023}}
)
6. 重排序代码
BGE Reranker
from sentence_transformers import CrossEncoder
# Cross-encoder reranker: scores each (query, passage) pair jointly,
# which is slower but more accurate than bi-encoder similarity
reranker = CrossEncoder('BAAI/bge-reranker-large')
def rerank(query, documents, top_k=5):
    """Return the top_k (document, score) pairs, highest score first.

    Scores every (query, passage) pair with the module-level cross-encoder
    and orders the documents by descending relevance.
    """
    query_doc_pairs = []
    for doc in documents:
        query_doc_pairs.append((query, doc.page_content))
    relevance = reranker.predict(query_doc_pairs)
    ranked = sorted(zip(documents, relevance), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]
Cohere Rerank
import cohere
co = cohere.Client("your-api-key")  # replace with a real key (prefer an env var)
def rerank_cohere(query, documents, top_n=5):
    """Rerank documents with Cohere's hosted reranker.

    Sends the passage texts to the rerank API and returns the raw API
    response (ranked indices with relevance scores).
    """
    passages = [doc.page_content for doc in documents]
    response = co.rerank(
        model="rerank-multilingual-v3.0",
        query=query,
        documents=passages,
        top_n=top_n,
    )
    return response
LangChain 集成
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
# Retrieve a wide candidate set (k=20), then let the reranker compress it
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 20})
)
7. 提示词模板
基础 RAG 提示词
# Basic RAG prompt: grounds the answer in {context} and instructs the model
# not to fabricate when the documents lack the answer
prompt_template = """
你是一个智能助手。请根据以下参考文档回答问题。
如果参考文档中没有相关信息,请明确告知用户,不要编造答案。
参考文档:
{context}
问题:{question}
请提供准确、有帮助的回答:
"""
带来源引用
# Variant that asks the model to cite which document fragments it used
prompt_template = """
请根据参考文档回答问题,并在回答中标注信息来源。
参考文档:
{context}
问题:{question}
请按以下格式回答:
回答:[你的回答]
来源:[引用的文档片段]
"""
对话式 RAG
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# Conversational RAG prompt: system rules, then running chat history,
# then the user's turn followed by the retrieved context
prompt = ChatPromptTemplate.from_messages([
    ("system", "你是一个智能助手,根据上下文回答问题。"),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
    ("human", "相关文档:\n{context}")
])
8. 常见问题排查
问题:检索不到相关文档
# Troubleshooting steps for "no relevant documents retrieved"
import math
# 1. Check that chunking produced sensible chunks
print(f"分块数量: {len(chunks)}")
print(f"示例分块: {chunks[0].page_content[:200]}")
# 2. Check that the embedding model returns a sane vector
embedding = embeddings.embed_query("测试查询")
print(f"向量维度: {len(embedding)}")
print(f"向量范数: {sum(x**2 for x in embedding)**0.5}")
# 3. Test query/document similarity directly.
#    Fixed: the original called sklearn's cosine_similarity without ever
#    importing it (NameError at runtime); compute cosine similarity with the
#    stdlib instead. embed_documents returns a list of vectors, so take [0].
test_doc_embedding = embeddings.embed_documents([chunks[0].page_content])[0]
dot = sum(a * b for a, b in zip(embedding, test_doc_embedding))
query_norm = math.sqrt(sum(a * a for a in embedding))
doc_norm = math.sqrt(sum(b * b for b in test_doc_embedding))
similarity = dot / (query_norm * doc_norm)
print(f"相似度: {similarity}")
问题:回答包含幻觉
# Mitigation for hallucinated answers: tighten the prompt constraints so the
# model only answers from the provided documents
prompt = """
请严格根据参考文档回答问题。
- 只使用参考文档中的信息
- 如果参考文档没有相关信息,请说"参考文档中没有相关信息"
- 不要添加任何参考文档以外的信息
参考文档:
{context}
问题:{question}
"""
问题:响应太慢
# Diagnosis: time the retrieval and LLM stages separately to find the bottleneck
import time
start = time.time()
results = retriever.invoke(query)
print(f"检索耗时: {time.time() - start:.2f}s")
start = time.time()
answer = llm.invoke(prompt)
print(f"LLM 耗时: {time.time() - start:.2f}s")
# Optimizations, in rough order of effort:
# 1. Reduce the retrieval count k
# 2. Use a faster embedding model
# 3. Stream the LLM output
# 4. Enable caching
9. 评估代码
使用 RAGAS
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
# Prepare the evaluation data: questions, generated answers, retrieved
# contexts, and reference answers, index-aligned across the four lists.
# NOTE(review): recent RAGAS versions expect a datasets.Dataset /
# EvaluationDataset rather than a plain dict — verify against the installed
# ragas version.
dataset = {
    "question": ["问题1", "问题2"],
    "answer": ["回答1", "回答2"],
    "contexts": [["文档1"], ["文档2"]],
    "ground_truth": ["标准答案1", "标准答案2"]
}
# Run the selected metrics over the dataset
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision]
)
print(results)
自定义评估
def evaluate_hit_rate(test_cases, retriever):
    """Fraction of test cases where retrieval returns at least one relevant doc.

    Each test case is a dict with:
        "query": the query string passed to ``retriever.invoke``
        "relevant_ids": iterable of document ids that count as hits
    Retrieved documents are matched on their ``"id"`` metadata field.

    Returns 0.0 for an empty test set (the original raised ZeroDivisionError).
    """
    if not test_cases:
        return 0.0
    hits = 0
    for case in test_cases:
        results = retriever.invoke(case["query"])
        retrieved_ids = {r.metadata.get("id") for r in results}
        # A case is a hit if any retrieved id is in the relevant set
        if retrieved_ids & set(case["relevant_ids"]):
            hits += 1
    return hits / len(test_cases)
10. 生产配置清单
环境变量
# .env — runtime configuration (load with python-dotenv or export in the shell)
OPENAI_API_KEY=sk-xxx
EMBEDDING_MODEL=text-embedding-3-small
LLM_MODEL=gpt-4o-mini
CHROMA_PERSIST_DIR=./data/chroma
# Chunking: 800-char chunks with 20% overlap
CHUNK_SIZE=800
CHUNK_OVERLAP=160
# Retrieve 20 candidates, keep the top 5 after reranking
RETRIEVAL_TOP_K=20
RERANK_TOP_K=5
日志配置
import logging
# Log INFO and above to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rag.log'),
        logging.StreamHandler()
    ]
)
健康检查
def health_check():
    """Return (healthy, checks): overall status plus per-component results.

    Verifies that the vector store contains documents, the embedding model
    returns a vector, and the LLM produces a response.
    """
    checks = {
        # NOTE(review): _collection is Chroma's private API — may break on upgrade
        "vectorstore": vectorstore._collection.count() > 0,
        "embeddings": len(embeddings.embed_query("test")) > 0,
        # Fixed: the original required the reply to equal exactly "Ok", which
        # almost always fails (models answer "ok", "Ok.", etc.). A non-empty
        # response is the meaningful liveness signal.
        "llm": bool(llm.invoke("Say ok").content.strip()),
    }
    return all(checks.values()), checks