
RAG Cheat Sheet

This document collects commonly used code snippets, configuration templates, and troubleshooting guides for RAG development.

1. Quick Start

Minimal viable RAG

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# 1. Load documents
loader = TextLoader("document.txt")
documents = loader.load()

# 2. Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# 3. Create the vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(chunks, embeddings)

# 4. Create the retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# 5. Build the RAG chain
from langchain.chains import RetrievalQA

llm = ChatOpenAI(model="gpt-4o-mini")
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# 6. Query
answer = qa.invoke("What are the main topics of the document?")
print(answer["result"])

2. Chunking Parameters at a Glance

| Document type | Recommended chunk size | Overlap | Separator priority |
| --- | --- | --- | --- |
| Technical docs | 800-1000 | 15-20% | \n\n, \n, ... |
| FAQ | one Q&A pair per chunk | 0 | split on Q: / A: |
| Legal documents | 1000-1500 | 20% | split by clause |
| News articles | 500-800 | 10-15% | \n\n, \n, ... |
| Code files | one function per chunk | 0 | split by function/class |

# Chunker tuned for Chinese text
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=160,
    separators=["\n\n", "\n", "。", "!", "?", ";", ",", " ", ""]
)
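
For the FAQ row in the table above, the recommendation is one chunk per Q&A pair rather than a fixed size. A minimal sketch, assuming the FAQ text follows a "Q: ... A: ..." format (the split_faq helper is hypothetical, not a library API):

import re
from langchain_core.documents import Document

def split_faq(text):
    """Split FAQ text so each chunk holds exactly one Q&A pair (hypothetical helper)."""
    # start a new chunk at every line beginning with "Q:" (halfwidth or fullwidth colon)
    pairs = re.split(r"(?=^Q[::])", text, flags=re.MULTILINE)
    return [Document(page_content=p.strip()) for p in pairs if p.strip()]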

3. Embedding Models at a Glance

Chinese-language scenarios

# Recommended: BGE-large-zh
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')

# Usage: BGE-zh expects retrieval queries to carry its Chinese instruction prefix
# ("generate a representation of this sentence for retrieving related articles:");
# passages are encoded without the prefix
query = "为这个句子生成表示以用于检索相关文章:如何学习Python?"
embedding = model.encode([query], normalize_embeddings=True)

English-language scenarios

# OpenAI API
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# E5 (requires "query:" / "passage:" prefixes)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
query_embedding = model.encode(["query: What is RAG?"])
doc_embedding = model.encode(["passage: RAG is retrieval-augmented generation..."])

Multilingual

# BGE-M3
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-m3')

# Or Cohere
from langchain_cohere import CohereEmbeddings
embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")

4. Vector Database Connections

Chroma (local development)

import chromadb
from langchain_community.vectorstores import Chroma

client = chromadb.PersistentClient(path="./chroma_db")
vectorstore = Chroma(
    client=client,
    collection_name="documents",
    embedding_function=embeddings
)

Pinecone (managed cloud service)

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key="your-api-key")
index = pc.Index("your-index-name")

vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"
)

Milvus (large-scale, self-hosted)

from pymilvus import connections
from langchain_milvus import Milvus

connections.connect("default", host="localhost", port="19530")

vectorstore = Milvus(
    embedding_function=embeddings,
    collection_name="documents",
    connection_args={"host": "localhost", "port": "19530"}
)

Qdrant (high-performance, self-hosted)

from qdrant_client import QdrantClient
from langchain_qdrant import Qdrant

client = QdrantClient(host="localhost", port=6333)

vectorstore = Qdrant(
    client=client,
    collection_name="documents",
    embeddings=embeddings
)
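
One gotcha: this wrapper assumes the collection already exists. A minimal creation sketch, assuming the 1536-dimensional output of text-embedding-3-small (adjust size to your embedding model):

from qdrant_client.models import Distance, VectorParams

# create the collection once, sized to the embedding model's output dimension
client.create_collection(
    collection_name="documents",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)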

5. Retrieval Strategy Code

Basic vector search

results = vectorstore.similarity_search(query, k=5)

Search with scores

results = vectorstore.similarity_search_with_score(query, k=5)
for doc, score in results:
    print(f"Score: {score:.4f}")

Hybrid retrieval

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 5

ensemble = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.4, 0.6]
)

Multi-query retrieval

from langchain.retrievers import MultiQueryRetriever

multi_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

Metadata filtering

# Filter syntax varies by vector store; check your store's documentation
results = vectorstore.similarity_search(
    query,
    k=5,
    filter={"category": "technical_docs", "year": {"$gte": 2023}}
)

6. Reranking Code

BGE Reranker

from sentence_transformers import CrossEncoder

reranker = CrossEncoder('BAAI/bge-reranker-large')

def rerank(query, documents, top_k=5):
    # score each (query, document) pair with the cross-encoder
    pairs = [(query, doc.page_content) for doc in documents]
    scores = reranker.predict(pairs)

    # sort documents by score, highest first, and keep the top_k
    results = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
    return results[:top_k]
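
Typical usage is to over-retrieve and then cut down, e.g. fetch 20 candidates and keep the top 5; a sketch reusing the vectorstore from section 4:

candidates = vectorstore.as_retriever(search_kwargs={"k": 20}).invoke(query)
top_docs = [doc for doc, score in rerank(query, candidates)]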

Cohere Rerank

import cohere

co = cohere.Client("your-api-key")

def rerank_cohere(query, documents, top_n=5):
    response = co.rerank(
        model="rerank-multilingual-v3.0",
        query=query,
        documents=[doc.page_content for doc in documents],
        top_n=top_n
    )
    # map the ranked indices back to the original Document objects
    return [documents[r.index] for r in response.results]

LangChain integration

from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

compressor = CohereRerank(model="rerank-multilingual-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 20})
)

7. Prompt Templates

Basic RAG prompt

prompt_template = """
You are an intelligent assistant. Answer the question based on the reference documents below.
If the reference documents contain no relevant information, tell the user so clearly; do not make up an answer.

Reference documents:
{context}

Question: {question}

Provide an accurate, helpful answer:
"""

With source citations

prompt_template = """
Answer the question based on the reference documents, and cite your sources in the answer.

Reference documents:
{context}

Question: {question}

Answer in the following format:
Answer: [your answer]
Sources: [the document snippets you cited]
"""

Conversational RAG

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an intelligent assistant. Answer questions based on the provided context."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
    ("human", "Relevant documents:\n{context}")
])
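
To wire this prompt into a full chain, LangChain's retrieval helpers can be used; a minimal sketch assuming the retriever and llm from section 1:

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

combine_docs_chain = create_stuff_documents_chain(llm, prompt)  # fills {context}
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

result = rag_chain.invoke({"input": "What is RAG?", "chat_history": []})
print(result["answer"])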

8. Troubleshooting Common Problems

Problem: relevant documents are not retrieved

# Debugging steps
# 1. Check that chunking worked
print(f"Number of chunks: {len(chunks)}")
print(f"Sample chunk: {chunks[0].page_content[:200]}")

# 2. Check the embedding vectors
embedding = embeddings.embed_query("test query")
print(f"Vector dimension: {len(embedding)}")
print(f"Vector norm: {sum(x**2 for x in embedding)**0.5}")

# 3. Test similarity (cosine_similarity needs an import)
from sklearn.metrics.pairwise import cosine_similarity

test_doc_embedding = embeddings.embed_documents([chunks[0].page_content])
similarity = cosine_similarity([embedding], test_doc_embedding)
print(f"Similarity: {similarity[0][0]}")

Problem: answers contain hallucinations

# Fix: tighten the prompt constraints
prompt = """
Answer the question strictly based on the reference documents.
- Use only information from the reference documents
- If the reference documents contain no relevant information, say "The reference documents contain no relevant information"
- Do not add anything beyond the reference documents

Reference documents:
{context}

Question: {question}
"""

Problem: responses are too slow

# Diagnosis
import time

start = time.time()
results = retriever.invoke(query)
print(f"Retrieval time: {time.time() - start:.2f}s")

start = time.time()
answer = llm.invoke(prompt)
print(f"LLM time: {time.time() - start:.2f}s")

# Optimizations (see the sketch below for 3 and 4)
# 1. Reduce the retrieval count k
# 2. Use a faster embedding model
# 3. Stream the output
# 4. Enable caching
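
A minimal sketch of options 3 and 4, assuming LangChain's built-in cache and streaming interfaces:

from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache

# Option 4: cache repeated prompts in memory (SQLiteCache persists across restarts)
set_llm_cache(InMemoryCache())

# Option 3: stream tokens as they are generated so users see output immediately
for chunk in llm.stream(prompt):
    print(chunk.content, end="", flush=True)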

9. Evaluation Code

Using RAGAS

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision

# Prepare the data (ragas expects a datasets.Dataset, not a plain dict)
dataset = Dataset.from_dict({
    "question": ["question 1", "question 2"],
    "answer": ["answer 1", "answer 2"],
    "contexts": [["document 1"], ["document 2"]],
    "ground_truth": ["reference answer 1", "reference answer 2"]
})

# Evaluate
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision]
)

print(results)

Custom evaluation

def evaluate_hit_rate(test_cases, retriever):
    """Hit rate: fraction of queries that retrieve at least one relevant document."""
    hits = 0
    for case in test_cases:
        results = retriever.invoke(case["query"])
        retrieved_ids = {r.metadata.get("id") for r in results}
        if retrieved_ids & set(case["relevant_ids"]):
            hits += 1
    return hits / len(test_cases)
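
A related metric is Mean Reciprocal Rank, which also rewards ranking relevant documents higher; a sketch assuming the same test_cases format as above:

def evaluate_mrr(test_cases, retriever):
    """Mean Reciprocal Rank: averages 1/rank of the first relevant document."""
    total = 0.0
    for case in test_cases:
        results = retriever.invoke(case["query"])
        for rank, doc in enumerate(results, start=1):
            if doc.metadata.get("id") in set(case["relevant_ids"]):
                total += 1.0 / rank
                break
    return total / len(test_cases)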

10. Production Configuration Checklist

Environment variables

# .env
OPENAI_API_KEY=sk-xxx
EMBEDDING_MODEL=text-embedding-3-small
LLM_MODEL=gpt-4o-mini
CHROMA_PERSIST_DIR=./data/chroma
CHUNK_SIZE=800
CHUNK_OVERLAP=160
RETRIEVAL_TOP_K=20
RERANK_TOP_K=5
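
These variables can be loaded at startup; a minimal sketch assuming the python-dotenv package:

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env into the process environment

CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "160"))
RETRIEVAL_TOP_K = int(os.getenv("RETRIEVAL_TOP_K", "20"))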

Logging configuration

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rag.log'),
        logging.StreamHandler()
    ]
)

Health check

def health_check():
    """System health check."""
    checks = {
        # _collection is Chroma-specific; other stores expose their own count API
        "vectorstore": vectorstore._collection.count() > 0,
        "embeddings": len(embeddings.embed_query("test")) > 0,
        # an exact string match on the reply is brittle; just check for "ok"
        "llm": "ok" in llm.invoke("Say ok").content.lower()
    }
    return all(checks.values()), checks
