Weaviate 向量数据库
Weaviate 是一个开源的向量搜索引擎,采用模块化设计,支持 GraphQL 查询接口。它将向量搜索与传统搜索引擎的功能相结合,提供灵活的查询能力和丰富的数据类型支持。
概述
为什么选择 Weaviate
| 特性 | 说明 |
|---|---|
| GraphQL 接口 | 灵活的查询语法,精确控制返回数据 |
| 模块化设计 | 可插拔的嵌入模型和生成模型 |
| 多模态支持 | 文本、图像、音频的统一处理 |
| 混合搜索 | 向量相似度 + BM25 关键词搜索 |
| 向量+语义 | 内置向量化,无需预处理 |
| 云原生 | 支持 Kubernetes 部署 |
适用场景
- 复杂查询需求:需要灵活过滤和关联查询
- 多模态应用:同时处理文本和图像
- 知识图谱:需要实体关系建模
- 混合搜索:结合语义和关键词搜索
快速开始
1. 部署 Weaviate
Docker Compose 部署
# docker-compose.yml
version: '3.4'
services:
  weaviate:
    image: semitechnologies/weaviate:1.24.0
    ports:
      - "8080:8080"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
      ENABLE_MODULES: 'text2vec-transformers'
      # text2vec-transformers delegates embedding to a separate inference
      # container, so Weaviate must be told where to reach it.
      TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
    volumes:
      - ./weaviate_data:/var/lib/weaviate
    depends_on:
      - t2v-transformers
  # Inference container that actually computes the embeddings.
  # A multilingual model is used so the Chinese examples below work well.
  t2v-transformers:
    image: semitechnologies/transformers-inference:sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2
    environment:
      ENABLE_CUDA: '0'  # set to '1' when a GPU is available
docker-compose up -d
使用 Weaviate Cloud
import weaviate
# 连接到 Weaviate Cloud
client = weaviate.Client(
url="https://your-cluster.weaviate.network",
auth_client_secret=weaviate.AuthApiKey(api_key="your-api-key")
)
2. 安装 Python 客户端
# The examples in this tutorial use the v3 client API (weaviate.Client),
# which newer 4.x releases of weaviate-client no longer ship, so pin to v3.
pip install "weaviate-client>=3.26.7,<4.0.0"
3. 第一个示例
import weaviate
# 连接到本地 Weaviate
client = weaviate.Client("http://localhost:8080")
# 定义 Schema
class_obj = {
"class": "Article",
"vectorizer": "text2vec-transformers", # 使用内置向量化模块
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
},
"properties": [
{
"name": "title",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}
}
},
{
"name": "content",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}
}
},
{
"name": "category",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": True} # 不向量化此字段
}
}
]
}
# 创建 Class
client.schema.create_class(class_obj)
# 添加数据
client.data_object.create(
data_object={
"title": "向量数据库介绍",
"content": "向量数据库是专为 AI 应用设计的数据库...",
"category": "技术"
},
class_name="Article"
)
# 语义搜索
result = (
client.query
.get("Article", ["title", "content", "category"])
.with_near_text({"concepts": ["什么是向量数据库"]}) # 自动向量化查询
.with_limit(3)
.do()
)
print(result)
核心概念
Schema(模式)
Weaviate 使用 Schema 定义数据结构,类似于关系型数据库的表结构。
# 定义一个完整的 Schema
schema = {
"classes": [
{
"class": "Product",
"description": "产品信息",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
},
"properties": [
{
"name": "name",
"dataType": ["text"],
"description": "产品名称",
"moduleConfig": {
"text2vec-transformers": {
"skip": False,
"vectorizePropertyName": False
}
}
},
{
"name": "description",
"dataType": ["text"],
"description": "产品描述"
},
{
"name": "price",
"dataType": ["number"],
"description": "价格",
"moduleConfig": {
"text2vec-transformers": {"skip": True} # 不向量化数字字段
}
},
{
"name": "category",
"dataType": ["text"],
"description": "分类"
},
{
"name": "inStock",
"dataType": ["boolean"],
"description": "是否有库存"
}
]
}
]
}
# 创建 Schema
client.schema.create(schema)
# 查看 Schema
print(client.schema.get())
# 删除整个 Schema(危险操作:会删除所有 Class 及其中的全部数据)
client.schema.delete_all()
支持的数据类型:
| 类型 | 说明 | 示例 |
|---|---|---|
| text | 文本 | "Hello World" |
| text[] | 文本数组 | ["tag1", "tag2"] |
| number | 数字 | 3.14 |
| int | 整数 | 42 |
| boolean | 布尔 | true |
| date | 日期 | "2024-01-01T00:00:00Z" |
| geoCoordinates | 地理坐标 | {"latitude": 51.5, "longitude": -0.1} |
| phoneNumber | 电话号码 | {"number": "+1234567890"} |
| blob | 二进制数据 | 图片、音频 |
Class(类)
Class 是 Weaviate 中的数据集合,类似于其他数据库的表。
# 创建单个 Class
class_obj = {
"class": "Book",
"vectorizer": "text2vec-transformers",
"properties": [
{"name": "title", "dataType": ["text"]},
{"name": "author", "dataType": ["text"]},
{"name": "publishedYear", "dataType": ["int"]}
]
}
client.schema.create_class(class_obj)
# 获取所有 Classes
classes = client.schema.get()["classes"]
# 删除 Class
client.schema.delete_class("Book")
向量化模块
Weaviate 支持多种向量化模块:
# 1. text2vec-transformers(本地 Transformer 模型)
{
"class": "Article",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
}
}
# 2. text2vec-openai(OpenAI API)
{
"class": "Article",
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
}
}
# 3. text2vec-cohere(Cohere API)
{
"class": "Article",
"vectorizer": "text2vec-cohere",
"moduleConfig": {
"text2vec-cohere": {
"model": "embed-multilingual-v2.0"
}
}
}
# 4. img2vec-neural(图像向量化)
{
"class": "Image",
"vectorizer": "img2vec-neural",
"moduleConfig": {
"img2vec-neural": {
"imageFields": ["image"]
}
},
"properties": [
{"name": "image", "dataType": ["blob"]},
{"name": "filename", "dataType": ["text"]}
]
}
数据操作
添加数据
# 基本添加
client.data_object.create(
data_object={
"title": "Python 编程入门",
"content": "Python 是一种简单易学的编程语言...",
"category": "编程"
},
class_name="Article"
)
# 指定自定义向量(跳过自动向量化)
import json
client.data_object.create(
data_object={"title": "自定义向量文档"},
class_name="Article",
vector=[0.1, 0.2, 0.3, ...] # 自定义向量
)
# 批量添加
from weaviate.util import generate_uuid5
with client.batch as batch:
batch.batch_size = 100
for article in articles:
batch.add_data_object(
data_object=article,
class_name="Article",
uuid=generate_uuid5(article["title"]) # 生成确定性 UUID
)
查询数据
基本查询
# 获取所有数据
result = client.query.get("Article", ["title", "content"]).do()
# 带过滤条件的查询
result = (
client.query
.get("Article", ["title", "content", "category"])
.with_where({
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
})
.do()
)
向量搜索(Near Vector)
# 使用向量搜索
query_vector = [0.1, 0.2, 0.3, ...]
result = (
client.query
.get("Article", ["title", "content"])
.with_near_vector({"vector": query_vector})
.with_limit(5)
.do()
)
文本搜索(Near Text)
# 自动将文本转换为向量进行搜索
result = (
client.query
.get("Article", ["title", "content"])
.with_near_text({"concepts": ["人工智能", "机器学习"]})
.with_limit(5)
.do()
)
# 带距离限制的搜索
result = (
client.query
.get("Article", ["title"])
.with_near_text({
"concepts": ["深度学习"],
"distance": 0.2 # 最大距离
})
.do()
)
混合搜索
结合向量搜索和 BM25 关键词搜索:
result = (
client.query
.get("Article", ["title", "content"])
.with_hybrid(query="向量数据库", alpha=0.75) # alpha: 向量权重
.with_limit(10)
.do()
)
过滤条件
# 等于
.where({
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
})
# 不等于
.where({
"path": ["status"],
"operator": "NotEqual",
"valueText": "deleted"
})
# 大于/大于等于
.where({
"path": ["price"],
"operator": "GreaterThan",
"valueNumber": 100
})
# 小于/小于等于
.where({
"path": ["price"],
"operator": "LessThanEqual",
"valueNumber": 1000
})
# 包含在列表中
.where({
"path": ["category"],
"operator": "ContainsAny",
"valueText": ["技术", "编程"]
})
# 字符串包含
.where({
"path": ["title"],
"operator": "Like",
"valueText": "*Python*"
})
# AND 条件
.where({
"operator": "And",
"operands": [
{
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
},
{
"path": ["price"],
"operator": "LessThan",
"valueNumber": 1000
}
]
})
# OR 条件
.where({
"operator": "Or",
"operands": [
{
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
},
{
"path": ["category"],
"operator": "Equal",
"valueText": "编程"
}
]
})
更新和删除
# 更新数据
client.data_object.update(
data_object={"price": 899},
class_name="Product",
uuid="文章-uuid"
)
# 替换数据(完全覆盖)
client.data_object.replace(
data_object={
"title": "新标题",
"content": "新内容"
},
class_name="Article",
uuid="文章-uuid"
)
# 删除数据
client.data_object.delete(
uuid="文章-uuid",
class_name="Article"
)
# 批量删除(带过滤条件)
client.batch.delete_objects(
class_name="Article",
where={
"path": ["category"],
"operator": "Equal",
"valueText": "obsolete"
}
)
GraphQL 查询
Weaviate 的核心查询语言是 GraphQL,提供强大的查询能力。
基本查询
# 构建 GraphQL 查询
gql_query = """
{
Get {
Article(limit: 5) {
title
content
category
}
}
}
"""
result = client.query.raw(gql_query)
向量搜索查询
# Near Text 查询
gql_query = """
{
Get {
Article(
nearText: {
concepts: ["向量数据库"]
}
limit: 5
) {
title
content
_additional {
distance
certainty
}
}
}
}
"""
# 带过滤的向量搜索
gql_query = """
{
Get {
Article(
nearText: {
concepts: ["AI"]
}
where: {
path: ["category"]
operator: Equal
valueText: "技术"
}
limit: 5
) {
title
category
}
}
}
"""
聚合查询
# 统计
gql_query = """
{
Aggregate {
Article {
meta {
count
}
}
}
}
"""
# 分组统计
gql_query = """
{
Aggregate {
Article(groupBy: ["category"]) {
category {
count
topOccurrences {
value
occurs
}
}
}
}
}
"""
RAG 应用示例
import weaviate
from openai import OpenAI
import os
class WeaviateRAG:
    """Small retrieval-augmented generation pipeline on top of Weaviate.

    Documents are stored in a ``Document`` class, retrieved with a
    ``nearText`` query, and handed to an OpenAI chat model as context.
    """

    def __init__(self):
        """Connect to Weaviate and OpenAI, then make sure the class exists."""
        # Local Weaviate instance.
        self.client = weaviate.Client("http://localhost:8080")
        # OpenAI client; the key is read from the OPENAI_API_KEY env variable.
        self.openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        self._setup_schema()

    def _setup_schema(self):
        """Create the ``Document`` class unless it is already in the schema."""
        schema = {
            "class": "Document",
            "vectorizer": "text2vec-transformers",
            "properties": [
                {"name": "content", "dataType": ["text"]},
                {"name": "source", "dataType": ["text"]},
                {"name": "title", "dataType": ["text"]},
            ],
        }
        # Only create the class when it is missing.
        if not self.client.schema.exists("Document"):
            self.client.schema.create_class(schema)

    def add_documents(self, documents):
        """Batch-import documents.

        Args:
            documents: sized iterable of dicts. ``content`` is required;
                ``source`` and ``title`` default to an empty string.
        """
        with self.client.batch as batch:
            batch.batch_size = 100
            for doc in documents:
                batch.add_data_object(
                    data_object={
                        "content": doc["content"],
                        "source": doc.get("source", ""),
                        "title": doc.get("title", ""),
                    },
                    class_name="Document",
                )
        print(f"已添加 {len(documents)} 篇文档")

    def search(self, query, top_k=5):
        """Return up to ``top_k`` documents semantically close to ``query``.

        Returns:
            A list of dicts with ``content``, ``source``, ``title`` and
            ``distance`` keys.

        Raises:
            RuntimeError: if the response carries a GraphQL ``errors`` entry.
        """
        result = (
            self.client.query
            .get("Document", ["content", "source", "title"])
            .with_near_text({"concepts": [query]})
            .with_limit(top_k)
            .with_additional(["distance"])
            .do()
        )
        # Surface query errors explicitly instead of failing with an opaque
        # KeyError/TypeError while indexing into the response below.
        if "errors" in result:
            raise RuntimeError(f"Weaviate query failed: {result['errors']}")
        documents = result["data"]["Get"]["Document"]
        return [
            {
                "content": doc["content"],
                "source": doc["source"],
                "title": doc["title"],
                "distance": doc["_additional"]["distance"],
            }
            for doc in documents
        ]

    def answer(self, question, top_k=3):
        """Answer ``question`` using the ``top_k`` retrieved documents.

        Returns:
            A dict with the generated ``answer`` text and the ``sources``
            (the documents returned by ``search``).
        """
        # Retrieve the supporting documents.
        docs = self.search(question, top_k)
        # Build the context; documents are numbered from 1.
        context = "\n\n".join(
            f"[文档 {i}] {doc['content']}"
            for i, doc in enumerate(docs, start=1)
        )
        # The prompt body stays flush-left so no indentation leaks into it.
        prompt = f"""基于以下文档回答问题。如果文档中没有相关信息,请说明。
文档:
{context}
问题:{question}
回答:"""
        response = self.openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个基于文档回答问题的助手。"},
                {"role": "user", "content": prompt},
            ],
        )
        return {
            "answer": response.choices[0].message.content,
            "sources": docs,
        }
# 使用示例
rag = WeaviateRAG()
docs = [
{"content": "Weaviate 是一个开源向量搜索引擎...", "source": "weaviate.io"},
{"content": "向量数据库用于 AI 应用的语义搜索...", "source": "blog"}
]
rag.add_documents(docs)
result = rag.answer("什么是 Weaviate?")
print(result["answer"])
高级功能
多模态搜索
# 定义图像类
image_class = {
"class": "Image",
"vectorizer": "img2vec-neural",
"moduleConfig": {
"img2vec-neural": {
"imageFields": ["image"]
}
},
"properties": [
{"name": "image", "dataType": ["blob"]},
{"name": "filename", "dataType": ["text"]},
{"name": "description", "dataType": ["text"]}
]
}
client.schema.create_class(image_class)
# 添加图像
import base64
with open("image.jpg", "rb") as f:
encoded_image = base64.b64encode(f.read()).decode("utf-8")
client.data_object.create(
data_object={
"image": encoded_image,
"filename": "image.jpg",
"description": "一只猫"
},
class_name="Image"
)
# 以图搜图
result = (
client.query
.get("Image", ["filename", "description"])
.with_near_image({"image": encoded_image})
.with_limit(5)
.do()
)
生成式搜索(Generative Search)
结合 LLM 进行生成式搜索:
# 需要配置 generative-openai 模块
result = (
client.query
.get("Article", ["title", "content"])
.with_near_text({"concepts": ["AI 发展"]})
.with_generate(
single_prompt="总结这篇文章的主要内容:{content}"
)
.with_limit(3)
.do()
)
# 访问生成结果
for article in result["data"]["Get"]["Article"]:
print(f"标题: {article['title']}")
print(f"生成摘要: {article['_additional']['generate']['singleResult']}")
向量化配置详解
# 细粒度控制向量化行为
class_obj = {
"class": "Article",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False, # 不向量化类名
"poolingStrategy": "masked_mean" # 池化策略
}
},
"properties": [
{
"name": "title",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {
"skip": False, # 向量化此字段(skip 为 True 时才会跳过)
"vectorizePropertyName": False # 不向量化属性名
}
}
},
{
"name": "internal_notes",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": True} # 跳过向量化
}
}
]
}
性能优化
批量导入
# 配置批量导入参数
client.batch.configure(
batch_size=100, # 每批数量
dynamic=True, # 动态调整批量大小
timeout_retries=3,
callback=None # 可设置回调函数处理错误
)
# 批量导入
with client.batch as batch:
for doc in documents:
batch.add_data_object(
data_object=doc,
class_name="Article"
)
索引调优
# 在 Schema 中配置向量索引参数
class_obj = {
"class": "Article",
"vectorIndexType": "hnsw", # 或 "flat"
"vectorIndexConfig": {
"ef": 100, # 搜索时探索因子;只有设为 -1 时才会启用下面的 dynamicEf* 动态调整参数
"efConstruction": 128, # 构建时探索因子
"maxConnections": 64, # 最大连接数
"dynamicEfFactor": 8,
"dynamicEfMin": 100,
"dynamicEfMax": 500
},
"properties": [...]
}
查询优化
# 使用 cursor 进行分页
result = (
client.query
.get("Article", ["title"])
.with_limit(100)
.with_after("最后一条的-uuid") # 分页游标
.do()
)
# 只获取需要的字段
result = (
client.query
.get("Article", ["title"]) # 只获取标题
.with_near_text({"concepts": ["AI"]})
.do()
)
常见问题
Q: 如何选择向量化模块?
| 模块 | 优点 | 缺点 |
|---|---|---|
| text2vec-transformers | 本地运行,无 API 费用 | 需要 GPU 获得好性能 |
| text2vec-openai | 高质量嵌入 | 需要 API Key,有费用 |
| text2vec-cohere | 多语言支持好 | 需要 API Key |
Q: 如何处理中文?
使用支持中文的向量化模块:
# docker-compose.yml 配置
services:
weaviate:
image: semitechnologies/weaviate:1.24.0
environment:
DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
ENABLE_MODULES: 'text2vec-transformers'
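# 使用支持中文的 transformer 模型:将 t2v-transformers 推理容器的镜像换成多语言模型,
# 例如 semitechnologies/transformers-inference:sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2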
或使用 OpenAI/Cohere 的多语言模型。
Q: 数据如何备份?
# 1. 直接备份数据目录
cp -r ./weaviate_data ./weaviate_backup
# 2. 使用 API 导出
# 查询所有数据并保存到文件
Q: 如何监控性能?
# 查看元数据
result = (
client.query
.get("Article", ["title"])
.with_near_text({"concepts": ["test"]})
.with_additional(["vector", "distance", "certainty"])
.do()
)
# 注意:返回结果本身不包含查询耗时;可在客户端自行计时,
# 或启用 Prometheus 监控(环境变量 PROMETHEUS_MONITORING_ENABLED: 'true')查看查询延迟指标
下一步
- Qdrant 教程 - Rust 编写的高性能向量数据库
- pgvector 教程 - PostgreSQL 的向量扩展
- 实践案例 - 完整项目示例