跳到主要内容

Weaviate 向量数据库

Weaviate 是一个开源的向量搜索引擎,采用模块化设计,支持 GraphQL 查询接口。它将向量搜索与传统搜索引擎的功能相结合,提供灵活的查询能力和丰富的数据类型支持。

概述

为什么选择 Weaviate

| 特性 | 说明 |
| --- | --- |
| GraphQL 接口 | 灵活的查询语法,精确控制返回数据 |
| 模块化设计 | 可插拔的嵌入模型和生成模型 |
| 多模态支持 | 文本、图像、音频的统一处理 |
| 混合搜索 | 向量相似度 + BM25 关键词搜索 |
| 内置向量化 | 向量 + 语义,无需预处理 |
| 云原生 | 支持 Kubernetes 部署 |

适用场景

  • 复杂查询需求:需要灵活过滤和关联查询
  • 多模态应用:同时处理文本和图像
  • 知识图谱:需要实体关系建模
  • 混合搜索:结合语义和关键词搜索

快速开始

1. 部署 Weaviate

Docker Compose 部署

# docker-compose.yml
version: '3.4'
services:
  weaviate:
    image: semitechnologies/weaviate:1.24.0
    ports:
      - "8080:8080"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
      ENABLE_MODULES: 'text2vec-transformers'
    volumes:
      - ./weaviate_data:/var/lib/weaviate
docker-compose up -d

使用 Weaviate Cloud

import weaviate

# 连接到 Weaviate Cloud
client = weaviate.Client(
url="https://your-cluster.weaviate.network",
auth_client_secret=weaviate.AuthApiKey(api_key="your-api-key")
)

2. 安装 Python 客户端

pip install weaviate-client

3. 第一个示例

import weaviate

# 连接到本地 Weaviate
client = weaviate.Client("http://localhost:8080")

# 定义 Schema
class_obj = {
"class": "Article",
"vectorizer": "text2vec-transformers", # 使用内置向量化模块
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
},
"properties": [
{
"name": "title",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}
}
},
{
"name": "content",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": False, "vectorizePropertyName": False}
}
},
{
"name": "category",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": True} # 不向量化此字段
}
}
]
}

# 创建 Class
client.schema.create_class(class_obj)

# 添加数据
client.data_object.create(
data_object={
"title": "向量数据库介绍",
"content": "向量数据库是专为 AI 应用设计的数据库...",
"category": "技术"
},
class_name="Article"
)

# 语义搜索
result = (
client.query
.get("Article", ["title", "content", "category"])
.with_near_text({"concepts": ["什么是向量数据库"]}) # 自动向量化查询
.with_limit(3)
.do()
)

print(result)

核心概念

Schema(模式)

Weaviate 使用 Schema 定义数据结构,类似于关系型数据库的表结构。

# 定义一个完整的 Schema
schema = {
"classes": [
{
"class": "Product",
"description": "产品信息",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
},
"properties": [
{
"name": "name",
"dataType": ["text"],
"description": "产品名称",
"moduleConfig": {
"text2vec-transformers": {
"skip": False,
"vectorizePropertyName": False
}
}
},
{
"name": "description",
"dataType": ["text"],
"description": "产品描述"
},
{
"name": "price",
"dataType": ["number"],
"description": "价格",
"moduleConfig": {
"text2vec-transformers": {"skip": True} # 不向量化数字字段
}
},
{
"name": "category",
"dataType": ["text"],
"description": "分类"
},
{
"name": "inStock",
"dataType": ["boolean"],
"description": "是否有库存"
}
]
}
]
}

# 创建 Schema
client.schema.create(schema)

# 查看 Schema
print(client.schema.get())

# 删除 Schema
client.schema.delete_all()

支持的数据类型

| 类型 | 说明 | 示例 |
| --- | --- | --- |
| text | 文本 | "Hello World" |
| text[] | 文本数组 | ["tag1", "tag2"] |
| number | 数字 | 3.14 |
| int | 整数 | 42 |
| boolean | 布尔 | true |
| date | 日期 | "2024-01-01T00:00:00Z" |
| geoCoordinates | 地理坐标 | {"latitude": 51.5, "longitude": -0.1} |
| phoneNumber | 电话号码 | {"number": "+1234567890"} |
| blob | 二进制数据 | 图片、音频 |

Class(类)

Class 是 Weaviate 中的数据集合,类似于其他数据库的表。

# 创建单个 Class
class_obj = {
"class": "Book",
"vectorizer": "text2vec-transformers",
"properties": [
{"name": "title", "dataType": ["text"]},
{"name": "author", "dataType": ["text"]},
{"name": "publishedYear", "dataType": ["int"]}
]
}

client.schema.create_class(class_obj)

# 获取所有 Classes
classes = client.schema.get()["classes"]

# 删除 Class
client.schema.delete_class("Book")

向量化模块

Weaviate 支持多种向量化模块:

# 1. text2vec-transformers(本地 Transformer 模型)
{
"class": "Article",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False
}
}
}

# 2. text2vec-openai(OpenAI API)
{
"class": "Article",
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
}
}

# 3. text2vec-cohere(Cohere API)
{
"class": "Article",
"vectorizer": "text2vec-cohere",
"moduleConfig": {
"text2vec-cohere": {
"model": "embed-multilingual-v2.0"
}
}
}

# 4. img2vec-neural(图像向量化)
{
"class": "Image",
"vectorizer": "img2vec-neural",
"moduleConfig": {
"img2vec-neural": {
"imageFields": ["image"]
}
},
"properties": [
{"name": "image", "dataType": ["blob"]},
{"name": "filename", "dataType": ["text"]}
]
}

数据操作

添加数据

# 基本添加
client.data_object.create(
data_object={
"title": "Python 编程入门",
"content": "Python 是一种简单易学的编程语言...",
"category": "编程"
},
class_name="Article"
)

# 指定自定义向量(跳过自动向量化)
import json

client.data_object.create(
data_object={"title": "自定义向量文档"},
class_name="Article",
vector=[0.1, 0.2, 0.3, ...] # 自定义向量
)

# 批量添加
from weaviate.util import generate_uuid5

# Batch import inside a context manager: pending objects are flushed
# automatically when the `with` block exits.
with client.batch as batch:
    batch.batch_size = 100  # number of objects per HTTP request

    for article in articles:
        batch.add_data_object(
            data_object=article,
            class_name="Article",
            uuid=generate_uuid5(article["title"])  # deterministic UUID derived from the title
        )

查询数据

基本查询

# 获取所有数据
result = client.query.get("Article", ["title", "content"]).do()

# 带过滤条件的查询
result = (
client.query
.get("Article", ["title", "content", "category"])
.with_where({
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
})
.do()
)

向量搜索(Near Vector)

# 使用向量搜索
query_vector = [0.1, 0.2, 0.3, ...]

result = (
client.query
.get("Article", ["title", "content"])
.with_near_vector({"vector": query_vector})
.with_limit(5)
.do()
)

文本搜索(Near Text)

# 自动将文本转换为向量进行搜索
result = (
client.query
.get("Article", ["title", "content"])
.with_near_text({"concepts": ["人工智能", "机器学习"]})
.with_limit(5)
.do()
)

# 带距离限制的搜索
result = (
client.query
.get("Article", ["title"])
.with_near_text({
"concepts": ["深度学习"],
"distance": 0.2 # 最大距离
})
.do()
)

混合搜索

结合向量搜索和 BM25 关键词搜索:

result = (
client.query
.get("Article", ["title", "content"])
.with_hybrid(query="向量数据库", alpha=0.75) # alpha: 向量权重
.with_limit(10)
.do()
)

过滤条件

# 等于
.where({
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
})

# 不等于
.where({
"path": ["status"],
"operator": "NotEqual",
"valueText": "deleted"
})

# 大于/大于等于
.where({
"path": ["price"],
"operator": "GreaterThan",
"valueNumber": 100
})

# 小于/小于等于
.where({
"path": ["price"],
"operator": "LessThanEqual",
"valueNumber": 1000
})

# 包含在列表中
.where({
"path": ["category"],
"operator": "ContainsAny",
"valueText": ["技术", "编程"]
})

# 字符串包含
.where({
"path": ["title"],
"operator": "Like",
"valueText": "*Python*"
})

# AND 条件
.where({
"operator": "And",
"operands": [
{
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
},
{
"path": ["price"],
"operator": "LessThan",
"valueNumber": 1000
}
]
})

# OR 条件
.where({
"operator": "Or",
"operands": [
{
"path": ["category"],
"operator": "Equal",
"valueText": "技术"
},
{
"path": ["category"],
"operator": "Equal",
"valueText": "编程"
}
]
})

更新和删除

# 更新数据
client.data_object.update(
data_object={"price": 899},
class_name="Product",
uuid="文章-uuid"
)

# 替换数据(完全覆盖)
client.data_object.replace(
data_object={
"title": "新标题",
"content": "新内容"
},
class_name="Article",
uuid="文章-uuid"
)

# 删除数据
client.data_object.delete(
uuid="文章-uuid",
class_name="Article"
)

# 批量删除(带过滤条件)
client.batch.delete_objects(
class_name="Article",
where={
"path": ["category"],
"operator": "Equal",
"valueText": "obsolete"
}
)

GraphQL 查询

Weaviate 的核心查询语言是 GraphQL,提供强大的查询能力。

基本查询

# 构建 GraphQL 查询
gql_query = """
{
Get {
Article(limit: 5) {
title
content
category
}
}
}
"""

result = client.query.raw(gql_query)

向量搜索查询

# Near Text 查询
gql_query = """
{
Get {
Article(
nearText: {
concepts: ["向量数据库"]
}
limit: 5
) {
title
content
_additional {
distance
certainty
}
}
}
}
"""

# 带过滤的向量搜索
gql_query = """
{
Get {
Article(
nearText: {
concepts: ["AI"]
}
where: {
path: ["category"]
operator: Equal
valueText: "技术"
}
limit: 5
) {
title
category
}
}
}
"""

聚合查询

# 统计
gql_query = """
{
Aggregate {
Article {
meta {
count
}
}
}
}
"""

# 分组统计
gql_query = """
{
Aggregate {
Article(groupBy: ["category"]) {
category {
count
topOccurrences {
value
occurs
}
}
}
}
}
"""

RAG 应用示例

import weaviate
from openai import OpenAI
import os

class WeaviateRAG:
    """Minimal RAG (retrieval-augmented generation) pipeline on Weaviate.

    Retrieval relies on Weaviate's built-in text2vec-transformers
    vectorizer (queries are vectorized server-side via `with_near_text`);
    answer generation calls the OpenAI chat completions API.
    """

    def __init__(self):
        # Connect to the local Weaviate instance.
        self.client = weaviate.Client("http://localhost:8080")

        # OpenAI client used for the generation step.
        self.openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # Make sure the "Document" class exists before any reads/writes.
        self._setup_schema()

    def _setup_schema(self):
        """Create the "Document" class if it does not exist yet."""
        schema = {
            "class": "Document",
            "vectorizer": "text2vec-transformers",
            "properties": [
                {"name": "content", "dataType": ["text"]},
                {"name": "source", "dataType": ["text"]},
                {"name": "title", "dataType": ["text"]}
            ]
        }

        # create_class fails on duplicates, so only create when missing.
        if not self.client.schema.exists("Document"):
            self.client.schema.create_class(schema)

    def add_documents(self, documents):
        """Batch-import documents.

        Each document is a dict with a required "content" key and
        optional "source" / "title" keys (defaulting to "").
        """
        with self.client.batch as batch:
            batch.batch_size = 100

            for doc in documents:
                batch.add_data_object(
                    data_object={
                        "content": doc["content"],
                        "source": doc.get("source", ""),
                        "title": doc.get("title", "")
                    },
                    class_name="Document"
                )

        print(f"已添加 {len(documents)} 篇文档")

    def search(self, query, top_k=5):
        """Return the top_k documents semantically closest to *query*.

        Each hit is a dict with content/source/title plus the vector
        distance reported by Weaviate (smaller = more similar).
        """
        result = (
            self.client.query
            .get("Document", ["content", "source", "title"])
            .with_near_text({"concepts": [query]})
            .with_limit(top_k)
            .with_additional(["distance"])
            .do()
        )

        documents = result["data"]["Get"]["Document"]
        return [
            {
                "content": doc["content"],
                "source": doc["source"],
                "title": doc["title"],
                "distance": doc["_additional"]["distance"]
            }
            for doc in documents
        ]

    def answer(self, question, top_k=3):
        """Retrieve context for *question* and generate an answer.

        Returns a dict with the generated "answer" text and the
        retrieved "sources" used as context.
        """
        # Retrieve the most relevant documents.
        docs = self.search(question, top_k)

        # Build the numbered context block fed to the LLM.
        context = "\n\n".join([
            f"[文档 {i+1}] {doc['content']}"
            for i, doc in enumerate(docs)
        ])

        # Call the LLM with the retrieved context.
        prompt = f"""基于以下文档回答问题。如果文档中没有相关信息,请说明。

文档:
{context}

问题:{question}

回答:"""

        response = self.openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个基于文档回答问题的助手。"},
                {"role": "user", "content": prompt}
            ]
        )

        return {
            "answer": response.choices[0].message.content,
            "sources": docs
        }

# Usage example: index two documents, then ask a question.
rag = WeaviateRAG()

docs = [
{"content": "Weaviate 是一个开源向量搜索引擎...", "source": "weaviate.io"},
{"content": "向量数据库用于 AI 应用的语义搜索...", "source": "blog"}
]
rag.add_documents(docs)

result = rag.answer("什么是 Weaviate?")
print(result["answer"])

高级功能

多模态搜索

# 定义图像类
image_class = {
"class": "Image",
"vectorizer": "img2vec-neural",
"moduleConfig": {
"img2vec-neural": {
"imageFields": ["image"]
}
},
"properties": [
{"name": "image", "dataType": ["blob"]},
{"name": "filename", "dataType": ["text"]},
{"name": "description", "dataType": ["text"]}
]
}

client.schema.create_class(image_class)

# 添加图像
import base64

# Read the image and base64-encode it, as required by Weaviate blob fields.
with open("image.jpg", "rb") as f:
    encoded_image = base64.b64encode(f.read()).decode("utf-8")

client.data_object.create(
data_object={
"image": encoded_image,
"filename": "image.jpg",
"description": "一只猫"
},
class_name="Image"
)

# 以图搜图
result = (
client.query
.get("Image", ["filename", "description"])
.with_near_image({"image": encoded_image})
.with_limit(5)
.do()
)

结合 LLM 进行生成式搜索:

# 需要配置 generative-openai 模块
result = (
client.query
.get("Article", ["title", "content"])
.with_near_text({"concepts": ["AI 发展"]})
.with_generate(
single_prompt="总结这篇文章的主要内容:{content}"
)
.with_limit(3)
.do()
)

# Access the per-object generated text attached under `_additional.generate`.
for article in result["data"]["Get"]["Article"]:
    print(f"标题: {article['title']}")
    print(f"生成摘要: {article['_additional']['generate']['singleResult']}")

向量化配置详解

# Fine-grained control over vectorization behavior.
class_obj = {
"class": "Article",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-transformers": {
"vectorizeClassName": False, # do not include the class name in the vector
"poolingStrategy": "masked_mean" # pooling strategy for token embeddings
}
},
"properties": [
{
"name": "title",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {
"skip": False, # DO vectorize this field (skip=False; original comment was inverted)
"vectorizePropertyName": False # do not include the property name in the vector
}
}
},
{
"name": "internal_notes",
"dataType": ["text"],
"moduleConfig": {
"text2vec-transformers": {"skip": True} # skip vectorization entirely
}
}
]
}

性能优化

批量导入

# 配置批量导入参数
client.batch.configure(
batch_size=100, # 每批数量
dynamic=True, # 动态调整批量大小
timeout_retries=3,
callback=None # 可设置回调函数处理错误
)

# Batch import using the parameters configured above; objects are
# flushed automatically when the `with` block exits.
with client.batch as batch:
    for doc in documents:
        batch.add_data_object(
            data_object=doc,
            class_name="Article"
        )

索引调优

# 在 Schema 中配置向量索引参数
class_obj = {
"class": "Article",
"vectorIndexType": "hnsw", # 或 "flat"
"vectorIndexConfig": {
"ef": 100, # 搜索时探索因子
"efConstruction": 128, # 构建时探索因子
"maxConnections": 64, # 最大连接数
"dynamicEfFactor": 8,
"dynamicEfMin": 100,
"dynamicEfMax": 500
},
"properties": [...]
}

查询优化

# 使用 cursor 进行分页
result = (
client.query
.get("Article", ["title"])
.with_limit(100)
.with_after("最后一条的-uuid") # 分页游标
.do()
)

# 只获取需要的字段
result = (
client.query
.get("Article", ["title"]) # 只获取标题
.with_near_text({"concepts": ["AI"]})
.do()
)

常见问题

Q: 如何选择向量化模块?

| 模块 | 优点 | 缺点 |
| --- | --- | --- |
| text2vec-transformers | 本地运行,无 API 费用 | 需要 GPU 获得好性能 |
| text2vec-openai | 高质量嵌入 | 需要 API Key,有费用 |
| text2vec-cohere | 多语言支持好 | 需要 API Key |

Q: 如何处理中文?

使用支持中文的向量化模块:

# docker-compose.yml configuration
services:
  weaviate:
    image: semitechnologies/weaviate:1.24.0
    environment:
      DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
      ENABLE_MODULES: 'text2vec-transformers'
      # use a transformer inference container whose model supports Chinese

或使用 OpenAI/Cohere 的多语言模型。

Q: 数据如何备份?

# 1. 直接备份数据目录
cp -r ./weaviate_data ./weaviate_backup

# 2. 使用 API 导出
# 查询所有数据并保存到文件

Q: 如何监控性能?

# 查看元数据
result = (
client.query
.get("Article", ["title"])
.with_near_text({"concepts": ["test"]})
.with_additional(["vector", "distance", "certainty"])
.do()
)

# 查看查询时间(在返回结果中)

下一步