跳到主要内容

pgvector - PostgreSQL 向量扩展

pgvector 是 PostgreSQL 的扩展,为 PostgreSQL 添加向量相似度搜索功能。它允许你在现有的 PostgreSQL 数据库中存储和查询向量,无需额外的数据库系统。

概述

为什么选择 pgvector

特性说明
SQL 兼容:使用熟悉的 SQL 语法
事务支持:ACID 保证,数据一致性
Join 查询:向量与关系数据联合查询
成熟生态:PostgreSQL 的丰富工具和扩展
多种索引:HNSW、IVFFlat 索引支持
混合查询:向量相似度 + SQL 过滤

适用场景

  • 已有 PostgreSQL 基础设施:最小迁移成本
  • 需要事务支持:向量数据需要 ACID 保证
  • 复杂关联查询:向量与关系数据的联合查询
  • 统一数据存储:避免多数据库管理的复杂性

快速开始

1. 安装 pgvector

Docker 部署

# Use a PostgreSQL image that ships with pgvector preinstalled
docker run -d \
--name pgvector \
-e POSTGRES_PASSWORD=postgres \
-p 5432:5432 \
pgvector/pgvector:pg16

现有 PostgreSQL 安装

# Clone a tagged release and build from source
git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git
cd pgvector
make
make install

# Create the extension in the target database
psql -d mydb -c "CREATE EXTENSION vector;"

2. 安装 Python 客户端

pip install psycopg2-binary
# or (async driver)
pip install asyncpg

3. 第一个示例

import psycopg2

# Connect to the database (demo credentials; use environment variables in production)
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="postgres",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

# Enable the pgvector extension (no-op if already installed)
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# Create the demo table.
# NOTE: the declared dimension must match the vectors inserted below — the
# sample vectors are 5-dimensional, so declare VECTOR(5). The original
# VECTOR(384) would make the INSERT fail with a dimension-mismatch error.
cur.execute("""
CREATE TABLE IF NOT EXISTS items (
id SERIAL PRIMARY KEY,
content TEXT,
embedding VECTOR(5)
);
""")

# Insert sample rows (vector literals are passed as strings)
cur.execute("""
INSERT INTO items (content, embedding)
VALUES
('第一篇文档', '[0.1, 0.2, 0.3, 0.4, 0.5]'),
('第二篇文档', '[0.2, 0.3, 0.4, 0.5, 0.6]'),
('第三篇文档', '[0.3, 0.4, 0.5, 0.6, 0.7]');
""")

# Nearest-neighbour search: <=> is cosine distance (smaller = more similar)
cur.execute("""
SELECT id, content, embedding <=> '[0.1, 0.2, 0.3, 0.4, 0.5]' AS distance
FROM items
ORDER BY embedding <=> '[0.1, 0.2, 0.3, 0.4, 0.5]'
LIMIT 3;
""")

results = cur.fetchall()
for row in results:
    print(f"ID: {row[0]}, Content: {row[1]}, Distance: {row[2]}")

conn.commit()
cur.close()
conn.close()

核心概念

向量类型

pgvector 提供了 vector 类型来存储向量:

-- Create a table with a vector column
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
embedding VECTOR(1536) -- dimension is fixed when the table is created
);

-- Insert a vector (the literal below is truncated for display;
-- a real insert must supply exactly 1536 comma-separated values)
INSERT INTO documents (title, content, embedding)
VALUES (
'示例文档',
'这是文档内容',
'[0.1, 0.2, 0.3, ...]' -- 1536-dim vector (placeholder, not runnable as-is)
);

相似度运算符

运算符 | 说明 | 使用场景
<-> | 欧几里得距离 | 几何距离度量
<#> | 负点积 | 点积相似度
<=> | 余弦距离 | 余弦相似度(最常用)
<+> | 曼哈顿距离 | L1 距离度量
-- Cosine-distance search (smaller distance = more similar)
SELECT * FROM documents
ORDER BY embedding <=> '[0.1, 0.2, ...]'
LIMIT 10;

-- Euclidean (L2) distance search
SELECT * FROM documents
ORDER BY embedding <-> '[0.1, 0.2, ...]'
LIMIT 10;

-- Inner-product search. <#> returns the NEGATIVE inner product, so the
-- default ascending order already puts the most similar rows first.
-- (The original used DESC, which would return the LEAST similar rows.)
SELECT * FROM documents
ORDER BY embedding <#> '[0.1, 0.2, ...]'
LIMIT 10;

索引

-- HNSW index (recommended: fast queries, slower build, more memory)
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- IVFFlat index (faster build, less memory; best created after data is loaded)
CREATE INDEX ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);

-- List the table's indexes (psql meta-command)
\d documents

-- Drop an index
DROP INDEX documents_embedding_idx;

索引参数说明

HNSW 索引:

  • m: 最大连接数(默认 16,范围 2-100)
  • ef_construction: 构建时的搜索范围(默认 64)

IVFFlat 索引:

  • lists: 聚类中心数(官方建议:行数 ≤ 100 万时取 行数/1000,超过 100 万时取 √行数)

数据操作

插入数据

import psycopg2
import numpy as np

# Demo connection string; use environment variables in production
conn = psycopg2.connect("postgresql://postgres:postgres@localhost:5432/postgres")
cur = conn.cursor()

# Create the articles table (embedding dimension must match your model: 384 here)
cur.execute("""
CREATE TABLE IF NOT EXISTS articles (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
content TEXT,
category TEXT,
author TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
embedding VECTOR(384)
);
""")

# Single insert — build a full 384-dim vector
embedding = [0.1, 0.2, 0.3] + [0.0] * 381
cur.execute("""
INSERT INTO articles (title, content, category, embedding)
VALUES (%s, %s, %s, %s);
""", ("文章标题", "文章内容", "技术", embedding))


def _demo_vec(seed):
    """Return a deterministic 384-dim demo vector derived from *seed*."""
    return np.full(384, seed, dtype=float).tolist()


# Batch insert (recommended). The original used '...' placeholders — that is
# Python's Ellipsis object and would fail at insert time, so we build real
# 384-dim vectors instead.
articles = [
    ("文章1", "内容1", "技术", _demo_vec(0.1)),
    ("文章2", "内容2", "生活", _demo_vec(0.2)),
    ("文章3", "内容3", "技术", _demo_vec(0.3)),
]

cur.executemany("""
INSERT INTO articles (title, content, category, embedding)
VALUES (%s, %s, %s, %s);
""", articles)

conn.commit()

查询数据

基本向量搜索

# Cosine-distance search
# (assumes `cur` and `query_embedding` are defined as in the earlier examples)
cur.execute("""
SELECT id, title, content, embedding <=> %s AS distance
FROM articles
ORDER BY embedding <=> %s
LIMIT 10;
""", (query_embedding, query_embedding))

results = cur.fetchall()
for row in results:
    # distance is the 4th selected column (index 3)
    print(f"ID: {row[0]}, Title: {row[1]}, Distance: {row[3]}")

带过滤条件的搜索

# Combine vector ordering with a SQL WHERE clause
# (assumes `cur` and `query_embedding` are defined as in earlier examples)
cur.execute("""
SELECT id, title, content, embedding <=> %s AS distance
FROM articles
WHERE category = %s AND created_at > %s
ORDER BY embedding <=> %s
LIMIT 10;
""", (query_embedding, "技术", "2024-01-01", query_embedding))

# More complex filtering: IN lists, equality, and date ranges all compose
cur.execute("""
SELECT id, title, content, embedding <=> %s AS distance
FROM articles
WHERE category IN ('技术', '编程')
AND author = %s
AND created_at BETWEEN %s AND %s
ORDER BY embedding <=> %s
LIMIT 10;
""", (query_embedding, "张三", "2024-01-01", "2024-12-31", query_embedding))

联合查询

# Vector search combined with a relational JOIN
# NOTE(review): this joins on articles.author_id, a column the earlier
# CREATE TABLE for articles does not define — confirm the intended schema.
cur.execute("""
SELECT
a.id,
a.title,
a.embedding <=> %s AS distance,
u.username,
u.email
FROM articles a
JOIN users u ON a.author_id = u.id
WHERE a.category = %s
ORDER BY a.embedding <=> %s
LIMIT 10;
""", (query_embedding, "技术", query_embedding))

更新和删除

# Replace an article's embedding (e.g. after re-embedding its content)
cur.execute("""
UPDATE articles
SET embedding = %s
WHERE id = %s;
""", (new_embedding, article_id))

# Update non-vector columns
cur.execute("""
UPDATE articles
SET title = %s, content = %s
WHERE id = %s;
""", (new_title, new_content, article_id))

# Delete a single row by primary key
cur.execute("DELETE FROM articles WHERE id = %s;", (article_id,))

# Delete by condition
cur.execute("DELETE FROM articles WHERE category = %s;", ("obsolete",))

conn.commit()

完整 RAG 示例

import psycopg2
from openai import OpenAI
import os

class PgvectorRAG:
    """Minimal Retrieval-Augmented Generation pipeline on pgvector + OpenAI.

    The original listing lost all indentation in extraction and was not valid
    Python; this restores the class structure without changing behavior.
    Connection parameters and model names are hard-coded for the demo.
    """

    def __init__(self):
        # Database connection (demo credentials; use env vars in production)
        self.conn = psycopg2.connect(
            host="localhost",
            port=5432,
            database="postgres",
            user="postgres",
            password="postgres"
        )
        self.cur = self.conn.cursor()

        # OpenAI client (API key read from the environment)
        self.openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # Create extension, table and index if they don't exist yet
        self._setup_table()

    def _setup_table(self):
        """Create the vector extension, the knowledge_base table and its HNSW index."""
        self.cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

        # VECTOR(1536) matches the output dimension of text-embedding-3-small
        self.cur.execute("""
CREATE TABLE IF NOT EXISTS knowledge_base (
id SERIAL PRIMARY KEY,
content TEXT NOT NULL,
source TEXT,
title TEXT,
category TEXT DEFAULT 'general',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
embedding VECTOR(1536)
);
""")

        # HNSW index for fast approximate cosine-distance search
        self.cur.execute("""
CREATE INDEX IF NOT EXISTS idx_kb_embedding
ON knowledge_base
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
""")

        self.conn.commit()

    def add_documents(self, documents):
        """Embed each document and insert it into knowledge_base.

        documents: iterable of dicts with 'content' (required) and optional
        'source', 'title', 'category' keys.
        """
        for doc in documents:
            # One embedding API call per document — batch for large corpora
            embedding = self._get_embedding(doc["content"])

            # NOTE(review): passing a Python list as a vector parameter
            # normally requires an adapter (pgvector-python's
            # register_vector) — confirm it is registered in real use.
            self.cur.execute("""
INSERT INTO knowledge_base (content, source, title, category, embedding)
VALUES (%s, %s, %s, %s, %s);
""", (
                doc["content"],
                doc.get("source", ""),
                doc.get("title", ""),
                doc.get("category", "general"),
                embedding
            ))

        self.conn.commit()
        print(f"已添加 {len(documents)} 篇文档")

    def _get_embedding(self, text):
        """Return the embedding vector for *text* via the OpenAI API."""
        response = self.openai.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        return response.data[0].embedding

    def search(self, query, top_k=5, category=None):
        """Return the top_k most similar documents, optionally filtered by category."""
        query_embedding = self._get_embedding(query)

        if category:
            self.cur.execute("""
SELECT id, content, source, title, embedding <=> %s AS distance
FROM knowledge_base
WHERE category = %s
ORDER BY embedding <=> %s
LIMIT %s;
""", (query_embedding, category, query_embedding, top_k))
        else:
            self.cur.execute("""
SELECT id, content, source, title, embedding <=> %s AS distance
FROM knowledge_base
ORDER BY embedding <=> %s
LIMIT %s;
""", (query_embedding, query_embedding, top_k))

        results = self.cur.fetchall()
        return [
            {
                "id": row[0],
                "content": row[1],
                "source": row[2],
                "title": row[3],
                "distance": row[4]
            }
            for row in results
        ]

    def answer(self, question, top_k=3):
        """Retrieve top_k documents and ask the chat model to answer from them.

        Returns a dict with 'answer' (model text) and 'sources' (retrieved docs).
        """
        docs = self.search(question, top_k)

        # Number the retrieved documents so the model can reference them
        context = "\n\n".join([
            f"[文档 {i+1}] {doc['content']}"
            for i, doc in enumerate(docs)
        ])

        prompt = f"""基于以下文档回答问题。如果文档中没有相关信息,请说明。

文档:
{context}

问题:{question}

回答:"""

        response = self.openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个基于文档回答问题的助手。"},
                {"role": "user", "content": prompt}
            ]
        )

        return {
            "answer": response.choices[0].message.content,
            "sources": docs
        }

    def close(self):
        """Close the cursor and the database connection."""
        self.cur.close()
        self.conn.close()

# Usage example (requires a running PostgreSQL and an OPENAI_API_KEY)
rag = PgvectorRAG()

# Seed the knowledge base
docs = [
{"content": "pgvector 是 PostgreSQL 的向量扩展...", "source": "github.com", "category": "database"},
{"content": "PostgreSQL 是一个强大的开源关系型数据库...", "source": "postgresql.org", "category": "database"}
]
rag.add_documents(docs)

# Ask a question grounded in the stored documents
result = rag.answer("什么是 pgvector?")
print(f"回答:{result['answer']}")

rag.close()

高级功能

混合搜索(全文 + 向量)

-- pg_trgm provides trigram-based text similarity (the % operator below)
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Trigram GIN index to accelerate the % similarity operator
CREATE INDEX idx_articles_content_trgm ON articles USING gin (content gin_trgm_ops);

-- Hybrid search: take top-100 candidates from each method, full-outer-join
-- them, and rank by an equal-weight blend of the two scores
WITH vector_results AS (
SELECT
id,
title,
content,
embedding <=> '[0.1, 0.2, ...]' AS vector_distance
FROM articles
ORDER BY embedding <=> '[0.1, 0.2, ...]'
LIMIT 100
),
fulltext_results AS (
SELECT
id,
title,
content,
similarity(content, '搜索关键词') AS text_score
FROM articles
WHERE content % '搜索关键词'
ORDER BY similarity(content, '搜索关键词') DESC
LIMIT 100
)
SELECT
COALESCE(v.id, f.id) AS id,
COALESCE(v.title, f.title) AS title,
COALESCE(v.content, f.content) AS content,
v.vector_distance,
f.text_score
FROM vector_results v
FULL OUTER JOIN fulltext_results f ON v.id = f.id
ORDER BY
COALESCE(v.vector_distance, 1) * 0.5 +
COALESCE(1 - f.text_score, 1) * 0.5
LIMIT 10;

向量聚合

-- Element-wise average of the vectors in one category
SELECT avg(embedding) AS avg_embedding
FROM articles
WHERE category = '技术';

-- Per-category centroid vectors
SELECT
category,
avg(embedding) AS center_embedding
FROM articles
GROUP BY category;

子向量搜索

-- Search on a slice of each vector via subvector(v, start, count).
-- NOTE(review): subvector() was added in pgvector 0.7.0 (the install section
-- pins v0.5.1), and per the docs its start index is 1-based — confirm the
-- '0' start used here before relying on this example.
SELECT * FROM documents
ORDER BY subvector(embedding, 0, 128) <=> subvector('[0.1, 0.2, ...]', 0, 128)
LIMIT 10;

性能优化

索引优化

-- Pick an index type based on data volume

-- Small (< 100k rows): no index needed, a sequential scan is fine

-- Medium (100k – 10M rows): IVFFlat
CREATE INDEX ON articles
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100); -- lists ≈ rows / 1000

-- Large (> 10M rows): HNSW
CREATE INDEX ON articles
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);

-- Tune HNSW search breadth (default 40; higher = better recall, slower)
SET hnsw.ef_search = 100;

查询优化

# Use EXPLAIN ANALYZE to inspect the plan and verify the vector index is used
# (assumes `cur` and `query_embedding` are defined as in earlier examples)
cur.execute("""
EXPLAIN ANALYZE
SELECT id, title, embedding <=> %s AS distance
FROM articles
ORDER BY embedding <=> %s
LIMIT 10;
""", (query_embedding, query_embedding))

print(cur.fetchall())

# Pre-filter with WHERE before the vector ORDER BY to shrink the candidate set
cur.execute("""
SELECT id, title, embedding <=> %s AS distance
FROM articles
WHERE category = '技术' -- 先过滤减少数据量
ORDER BY embedding <=> %s
LIMIT 10;
""", (query_embedding, query_embedding))

分区表

-- 对大规模数据使用分区
CREATE TABLE articles (
id SERIAL,
title TEXT,
content TEXT,
category TEXT,
created_at TIMESTAMP,
embedding VECTOR(384)
) PARTITION BY RANGE (created_at);

-- 创建分区
CREATE TABLE articles_2024_q1 PARTITION OF articles
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');

CREATE TABLE articles_2024_q2 PARTITION OF articles
FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');

-- 在每个分区上创建索引
CREATE INDEX ON articles_2024_q1 USING hnsw (embedding vector_cosine_ops);
CREATE INDEX ON articles_2024_q2 USING hnsw (embedding vector_cosine_ops);

常见问题

Q: pgvector 与专用向量数据库如何选择?

维度 | pgvector | 专用向量数据库
数据规模 | 适合千万级以下 | 适合亿级以上
查询性能 | 中等 | 高
事务支持 | 完整 ACID | 有限
Join 查询 | 支持 | 有限
运维成本 | 低(复用 PG 基础设施) | 较高(需独立部署运维)

Q: 如何升级 pgvector?

# 1. Back up the database first
pg_dump mydb > backup.sql

# 2. Upgrade the extension (after installing the new version's files)
psql -d mydb -c "ALTER EXTENSION vector UPDATE;"

# 3. Verify the installed version
psql -d mydb -c "SELECT * FROM pg_extension WHERE extname = 'vector';"

Q: 向量维度有限制吗?

pgvector 支持最高 16000 维,但建议:

  • 常用维度:384、768、1536
  • 超过 2000 维可能影响性能

Q: 如何监控性能?

-- Check index usage statistics
SELECT
schemaname,
tablename,
indexname,
idx_scan,
idx_tup_read
FROM pg_stat_user_indexes
WHERE tablename = 'articles';

-- Find slow queries (requires the pg_stat_statements extension).
-- The timing column was renamed in PostgreSQL 13: use mean_exec_time,
-- not the pre-13 mean_time (this guide targets the pg16 image).
SELECT query, mean_exec_time, calls
FROM pg_stat_statements
WHERE query LIKE '%articles%'
ORDER BY mean_exec_time DESC
LIMIT 10;

下一步

  • 实践案例 - 完整项目示例
  • 向量数据库对比 - 各产品对比分析(即将推出)