Embedding与向量表示完整教程
目录
Embedding原理
什么是Embedding
Embedding是将文本转换为高维向量的技术,使得语义相似的文本在向量空间中距离更近。
┌────────────────────────────────────────────────────┐
│ 文本 → 向量转换过程 │
├────────────────────────────────────────────────────┤
│ │
│ 原始文本 │
│ "人工智能改变世界" │
│ │ │
│ ▼ │
│ Tokenization (分词) │
│ ["人工智能", "改变", "世界"] │
│ │ │
│ ▼ │
│ Embedding Model │
│ (深度学习模型) │
│ │ │
│ ▼ │
│ 向量表示 (1536维) │
│ [0.023, -0.015, 0.048, ..., 0.012] │
│ │
│ 特性: │
│ • 相似语义 → 相似向量 │
│ • 捕获语义关系 │
│ • 支持数学运算 │
│ │
└────────────────────────────────────────────────────┘向量空间可视化
维度2
│
│ • "机器学习"
│ • • "深度学习"
│ "AI"
│
│
─────────┼─────────────────── 维度1
│
│ • "足球"
│ • "篮球"
│
│相似概念在向量空间中聚集在一起。
OpenAI Embeddings API
基础使用
python
from openai import OpenAI
client = OpenAI()
def get_embedding(text, model="text-embedding-3-small"):
"""获取文本的Embedding"""
text = text.replace("\n", " ") # 替换换行符
response = client.embeddings.create(
input=text,
model=model
)
return response.data[0].embedding
# 使用示例
text = "人工智能正在改变世界"
embedding = get_embedding(text)
print(f"文本: {text}")
print(f"向量维度: {len(embedding)}")
print(f"前10个值: {embedding[:10]}")模型对比
python
class EmbeddingComparison:
"""Embedding模型对比"""
MODELS = {
"text-embedding-3-small": {
"dimensions": 1536,
"price_per_1m": 0.02, # USD
"performance": "高性价比"
},
"text-embedding-3-large": {
"dimensions": 3072,
"price_per_1m": 0.13,
"performance": "最高精度"
},
"text-embedding-ada-002": {
"dimensions": 1536,
"price_per_1m": 0.10,
"performance": "旧版模型"
}
}
@staticmethod
def compare_models(text):
"""对比不同模型"""
client = OpenAI()
results = {}
for model, info in EmbeddingComparison.MODELS.items():
print(f"\n测试模型: {model}")
response = client.embeddings.create(
input=text,
model=model
)
embedding = response.data[0].embedding
results[model] = {
"dimensions": len(embedding),
"sample_values": embedding[:5],
"info": info
}
return results
# 使用示例
text = "深度学习是机器学习的一个分支"
results = EmbeddingComparison.compare_models(text)
for model, data in results.items():
print(f"\n{model}:")
print(f" 维度: {data['dimensions']}")
print(f" 性能: {data['info']['performance']}")
print(f" 价格: ${data['info']['price_per_1m']}/1M tokens")批量处理
python
def batch_embed(texts, batch_size=100, model="text-embedding-3-small"):
"""批量生成Embeddings"""
client = OpenAI()
embeddings = []
# 分批处理
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
# 清理文本
batch = [text.replace("\n", " ") for text in batch]
# 调用API
response = client.embeddings.create(
input=batch,
model=model
)
# 提取embeddings
batch_embeddings = [item.embedding for item in response.data]
embeddings.extend(batch_embeddings)
print(f"处理进度: {min(i + batch_size, len(texts))}/{len(texts)}")
return embeddings
# 使用示例
texts = [
"人工智能",
"机器学习",
"深度学习",
"神经网络",
"自然语言处理"
]
embeddings = batch_embed(texts)
print(f"\n生成了{len(embeddings)}个向量")成本计算
python
import tiktoken
class EmbeddingCostCalculator:
"""Embedding成本计算器"""
# 价格 (USD per 1M tokens)
PRICING = {
"text-embedding-3-small": 0.02,
"text-embedding-3-large": 0.13,
"text-embedding-ada-002": 0.10
}
@staticmethod
def count_tokens(texts):
"""计算token数量"""
encoding = tiktoken.get_encoding("cl100k_base")
total_tokens = 0
for text in texts:
tokens = encoding.encode(text)
total_tokens += len(tokens)
return total_tokens
@staticmethod
def calculate_cost(texts, model="text-embedding-3-small"):
"""计算成本"""
tokens = EmbeddingCostCalculator.count_tokens(texts)
price_per_token = EmbeddingCostCalculator.PRICING[model] / 1_000_000
cost = tokens * price_per_token
return {
"texts_count": len(texts),
"total_tokens": tokens,
"avg_tokens": tokens / len(texts),
"model": model,
"cost_usd": cost,
"cost_per_1000": (tokens / 1000) * (EmbeddingCostCalculator.PRICING[model] / 1000)
}
# 使用示例
texts = ["这是一段测试文本" * 50 for _ in range(100)]
cost_info = EmbeddingCostCalculator.calculate_cost(
texts,
model="text-embedding-3-small"
)
print(f"文本数量: {cost_info['texts_count']}")
print(f"总Token数: {cost_info['total_tokens']}")
print(f"平均Token数: {cost_info['avg_tokens']:.1f}")
print(f"总成本: ${cost_info['cost_usd']:.4f}")Sentence Transformers
安装与配置
bash
pip install sentence-transformers基础使用
python
from sentence_transformers import SentenceTransformer
import numpy as np
# 加载模型
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# 生成Embedding
texts = [
"人工智能正在改变世界",
"机器学习是AI的核心技术",
"今天天气很好"
]
embeddings = model.encode(texts)
print(f"生成了{len(embeddings)}个向量")
print(f"每个向量的维度: {embeddings[0].shape}")推荐模型
python
class SentenceTransformerModels:
"""Sentence-Transformers推荐模型"""
MODELS = {
# 中文模型
"chinese": [
{
"name": "moka-ai/m3e-base",
"dimensions": 768,
"language": "中文",
"use_case": "通用中文embedding"
},
{
"name": "shibing624/text2vec-base-chinese",
"dimensions": 768,
"language": "中文",
"use_case": "中文语义匹配"
}
],
# 英文模型
"english": [
{
"name": "all-MiniLM-L6-v2",
"dimensions": 384,
"language": "英文",
"use_case": "轻量级,速度快"
},
{
"name": "all-mpnet-base-v2",
"dimensions": 768,
"language": "英文",
"use_case": "高性能,精度高"
}
],
# 多语言模型
"multilingual": [
{
"name": "paraphrase-multilingual-MiniLM-L12-v2",
"dimensions": 384,
"language": "多语言",
"use_case": "支持50+语言"
},
{
"name": "paraphrase-multilingual-mpnet-base-v2",
"dimensions": 768,
"language": "多语言",
"use_case": "多语言高精度"
}
],
# 代码模型
"code": [
{
"name": "microsoft/codebert-base",
"dimensions": 768,
"language": "代码",
"use_case": "代码语义理解"
}
]
}
@staticmethod
def list_models(category=None):
"""列出推荐模型"""
if category:
return SentenceTransformerModels.MODELS.get(category, [])
return SentenceTransformerModels.MODELS
# 使用示例
print("中文模型推荐:")
for model in SentenceTransformerModels.list_models("chinese"):
print(f" {model['name']}")
print(f" - 维度: {model['dimensions']}")
print(f" - 用途: {model['use_case']}\n")自定义模型训练
python
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
class CustomEmbeddingTrainer:
"""自定义Embedding模型训练"""
def __init__(self, base_model="paraphrase-multilingual-MiniLM-L12-v2"):
self.model = SentenceTransformer(base_model)
def prepare_training_data(self, data):
"""准备训练数据
data格式: [
{"text1": "句子1", "text2": "句子2", "label": 1.0},
...
]
"""
examples = []
for item in data:
example = InputExample(
texts=[item["text1"], item["text2"]],
label=float(item["label"])
)
examples.append(example)
return examples
def train(self, train_data, epochs=1, batch_size=16):
"""训练模型"""
# 准备数据
examples = self.prepare_training_data(train_data)
dataloader = DataLoader(examples, batch_size=batch_size, shuffle=True)
# 定义损失函数
loss = losses.CosineSimilarityLoss(self.model)
# 训练
self.model.fit(
train_objectives=[(dataloader, loss)],
epochs=epochs,
warmup_steps=100
)
def save(self, path):
"""保存模型"""
self.model.save(path)
# 使用示例
training_data = [
{"text1": "人工智能", "text2": "机器学习", "label": 0.8},
{"text1": "人工智能", "text2": "足球", "label": 0.1},
{"text1": "深度学习", "text2": "神经网络", "label": 0.9},
]
trainer = CustomEmbeddingTrainer()
trainer.train(training_data, epochs=1)
trainer.save("./my_embedding_model")向量相似度计算
余弦相似度
python
import numpy as np
def cosine_similarity(vec1, vec2):
"""计算余弦相似度
公式: cos(θ) = (A·B) / (||A|| * ||B||)
范围: [-1, 1],越接近1越相似
"""
dot_product = np.dot(vec1, vec2)
norm1 = np.linalg.norm(vec1)
norm2 = np.linalg.norm(vec2)
return dot_product / (norm1 * norm2)
# 使用示例
vec1 = np.array([1, 2, 3])
vec2 = np.array([2, 3, 4])
vec3 = np.array([-1, -2, -3])
sim_12 = cosine_similarity(vec1, vec2)
sim_13 = cosine_similarity(vec1, vec3)
print(f"vec1 vs vec2: {sim_12:.4f}") # 相似
print(f"vec1 vs vec3: {sim_13:.4f}") # 相反欧几里得距离
python
def euclidean_distance(vec1, vec2):
"""计算欧几里得距离
公式: d = sqrt(Σ(xi - yi)²)
范围: [0, ∞],越小越相似
"""
return np.linalg.norm(vec1 - vec2)
# 使用示例
vec1 = np.array([1, 2, 3])
vec2 = np.array([2, 3, 4])
vec3 = np.array([10, 20, 30])
dist_12 = euclidean_distance(vec1, vec2)
dist_13 = euclidean_distance(vec1, vec3)
print(f"vec1 vs vec2: {dist_12:.4f}") # 相近
print(f"vec1 vs vec3: {dist_13:.4f}") # 距离远点积
python
def dot_product(vec1, vec2):
"""计算点积
公式: A·B = Σ(ai * bi)
"""
return np.dot(vec1, vec2)
# 使用示例
vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
dp = dot_product(vec1, vec2)
print(f"点积: {dp}") # 32相似度计算工具类
python
class SimilarityCalculator:
"""向量相似度计算工具"""
@staticmethod
def cosine(vec1, vec2):
"""余弦相似度"""
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
@staticmethod
def euclidean(vec1, vec2):
"""欧几里得距离"""
return np.linalg.norm(vec1 - vec2)
@staticmethod
def manhattan(vec1, vec2):
"""曼哈顿距离"""
return np.sum(np.abs(vec1 - vec2))
@staticmethod
def dot_product(vec1, vec2):
"""点积"""
return np.dot(vec1, vec2)
@staticmethod
def batch_cosine_similarity(query_vec, vectors):
"""批量计算余弦相似度"""
# 归一化
query_norm = query_vec / np.linalg.norm(query_vec)
vectors_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
# 批量点积
similarities = np.dot(vectors_norm, query_norm)
return similarities
@staticmethod
def find_most_similar(query_vec, vectors, top_k=5, metric="cosine"):
"""找到最相似的向量"""
if metric == "cosine":
similarities = SimilarityCalculator.batch_cosine_similarity(
query_vec, vectors
)
# 余弦相似度越大越相似
top_indices = np.argsort(similarities)[::-1][:top_k]
top_scores = similarities[top_indices]
elif metric == "euclidean":
distances = np.array([
np.linalg.norm(query_vec - vec) for vec in vectors
])
# 欧氏距离越小越相似
top_indices = np.argsort(distances)[:top_k]
top_scores = distances[top_indices]
else:
raise ValueError(f"不支持的度量: {metric}")
return top_indices, top_scores
# 使用示例
from openai import OpenAI
client = OpenAI()
# 生成测试向量
texts = [
"人工智能正在改变世界",
"机器学习是AI的重要分支",
"深度学习使用神经网络",
"今天天气很好",
"我喜欢吃苹果"
]
embeddings = [
client.embeddings.create(input=text, model="text-embedding-3-small").data[0].embedding
for text in texts
]
vectors = np.array(embeddings)
# 查询
query = "什么是人工智能?"
query_vec = np.array(
client.embeddings.create(input=query, model="text-embedding-3-small").data[0].embedding
)
# 找到最相似的
indices, scores = SimilarityCalculator.find_most_similar(
query_vec, vectors, top_k=3, metric="cosine"
)
print(f"查询: {query}\n")
print("最相似的文本:")
for i, (idx, score) in enumerate(zip(indices, scores), 1):
print(f"{i}. {texts[idx]} (相似度: {score:.4f})")语义搜索实现
简单语义搜索
python
class SimpleSemanticSearch:
"""简单的语义搜索引擎"""
def __init__(self, model="text-embedding-3-small"):
self.client = OpenAI()
self.model = model
self.documents = []
self.embeddings = []
def add_documents(self, documents):
"""添加文档"""
self.documents.extend(documents)
# 生成embeddings
new_embeddings = []
for doc in documents:
embedding = self.client.embeddings.create(
input=doc,
model=self.model
).data[0].embedding
new_embeddings.append(embedding)
self.embeddings.extend(new_embeddings)
print(f"已添加{len(documents)}个文档,总共{len(self.documents)}个")
def search(self, query, top_k=5):
"""搜索"""
if not self.documents:
return []
# 生成查询向量
query_embedding = self.client.embeddings.create(
input=query,
model=self.model
).data[0].embedding
query_vec = np.array(query_embedding)
vectors = np.array(self.embeddings)
# 计算相似度
indices, scores = SimilarityCalculator.find_most_similar(
query_vec, vectors, top_k=top_k
)
# 返回结果
results = []
for idx, score in zip(indices, scores):
results.append({
"document": self.documents[idx],
"score": float(score)
})
return results
# 使用示例
search_engine = SimpleSemanticSearch()
# 添加文档
documents = [
"Python是一种高级编程语言,广泛用于Web开发、数据分析和人工智能。",
"JavaScript是Web开发的核心语言,运行在浏览器中。",
"机器学习是人工智能的一个分支,让计算机从数据中学习。",
"深度学习使用多层神经网络处理复杂模式。",
"自然语言处理让计算机理解和生成人类语言。",
"计算机视觉使机器能够理解图像和视频。",
"数据库用于存储和管理大量结构化数据。",
"云计算提供按需的计算资源和服务。"
]
search_engine.add_documents(documents)
# 搜索
results = search_engine.search("AI技术有哪些?", top_k=3)
print("\n搜索结果:")
for i, result in enumerate(results, 1):
print(f"\n{i}. {result['document']}")
print(f" 相似度: {result['score']:.4f}")高级语义搜索
python
from datetime import datetime
import json
class AdvancedSemanticSearch:
"""高级语义搜索引擎"""
def __init__(self, model="text-embedding-3-small"):
self.client = OpenAI()
self.model = model
self.documents = []
self.embeddings = []
self.metadata = []
def add_documents(self, documents, metadata=None):
"""添加文档及元数据
documents: 文档列表
metadata: 元数据列表 [{"title": "...", "category": "...", ...}, ...]
"""
if metadata and len(documents) != len(metadata):
raise ValueError("文档和元数据数量不匹配")
# 生成embeddings
for i, doc in enumerate(documents):
embedding = self.client.embeddings.create(
input=doc,
model=self.model
).data[0].embedding
self.documents.append(doc)
self.embeddings.append(embedding)
# 添加元数据
doc_metadata = metadata[i] if metadata else {}
doc_metadata["added_at"] = datetime.now().isoformat()
doc_metadata["index"] = len(self.documents) - 1
self.metadata.append(doc_metadata)
def search(self, query, top_k=5, filter_fn=None, rerank=False):
"""搜索
query: 查询文本
top_k: 返回结果数量
filter_fn: 过滤函数 lambda metadata: bool
rerank: 是否重排序
"""
if not self.documents:
return []
# 生成查询向量
query_embedding = self.client.embeddings.create(
input=query,
model=self.model
).data[0].embedding
query_vec = np.array(query_embedding)
# 应用过滤
if filter_fn:
valid_indices = [
i for i, meta in enumerate(self.metadata)
if filter_fn(meta)
]
vectors = np.array([self.embeddings[i] for i in valid_indices])
else:
valid_indices = list(range(len(self.documents)))
vectors = np.array(self.embeddings)
if len(vectors) == 0:
return []
# 计算相似度
similarities = SimilarityCalculator.batch_cosine_similarity(
query_vec, vectors
)
# 获取top_k结果
top_indices = np.argsort(similarities)[::-1][:top_k * 2] # 获取2倍数量用于重排
results = []
for idx in top_indices[:top_k]:
original_idx = valid_indices[idx]
results.append({
"document": self.documents[original_idx],
"score": float(similarities[idx]),
"metadata": self.metadata[original_idx]
})
# 重排序 (可以基于其他因素)
if rerank:
results = self._rerank(results, query)
return results[:top_k]
def _rerank(self, results, query):
"""重排序结果"""
# 简单的重排策略: 结合相似度和文档长度
for result in results:
doc_length = len(result["document"])
length_score = min(doc_length / 200, 1.0) # 标准化长度分数
result["final_score"] = result["score"] * 0.7 + length_score * 0.3
return sorted(results, key=lambda x: x["final_score"], reverse=True)
def save(self, filepath):
"""保存索引"""
data = {
"documents": self.documents,
"embeddings": [emb for emb in self.embeddings],
"metadata": self.metadata,
"model": self.model
}
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
def load(self, filepath):
"""加载索引"""
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
self.documents = data["documents"]
self.embeddings = data["embeddings"]
self.metadata = data["metadata"]
self.model = data["model"]
# 使用示例
search = AdvancedSemanticSearch()
# 添加文档和元数据
documents = [
"Python是一种高级编程语言",
"机器学习是AI的重要分支",
"深度学习使用神经网络"
]
metadata = [
{"title": "Python简介", "category": "编程"},
{"title": "机器学习", "category": "AI"},
{"title": "深度学习", "category": "AI"}
]
search.add_documents(documents, metadata)
# 普通搜索
results = search.search("人工智能技术", top_k=2)
print("普通搜索:")
for r in results:
print(f" {r['metadata']['title']}: {r['document']}")
# 过滤搜索 (只搜索AI类别)
results = search.search(
"学习技术",
top_k=2,
filter_fn=lambda meta: meta["category"] == "AI"
)
print("\n过滤搜索(只看AI类别):")
for r in results:
print(f" {r['metadata']['title']}: {r['document']}")
# 保存和加载
search.save("search_index.json")完整代码示例
构建知识库问答系统
python
from typing import List, Dict
import numpy as np
from openai import OpenAI
class KnowledgeBase:
"""知识库系统"""
def __init__(self, model="text-embedding-3-small"):
self.client = OpenAI()
self.model = model
self.chunks = []
self.embeddings = []
self.sources = []
def load_documents(self, documents: List[Dict[str, str]]):
"""加载文档
documents: [
{"content": "文档内容", "source": "来源", "metadata": {...}},
...
]
"""
for doc in documents:
# 分块
chunks = self._chunk_text(doc["content"])
# 为每个块生成embedding
for chunk in chunks:
embedding = self.client.embeddings.create(
input=chunk,
model=self.model
).data[0].embedding
self.chunks.append(chunk)
self.embeddings.append(embedding)
self.sources.append({
"source": doc.get("source", "unknown"),
"metadata": doc.get("metadata", {})
})
print(f"已加载{len(documents)}个文档,生成{len(self.chunks)}个chunks")
def _chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
"""文本分块"""
words = text.split()
chunks = []
for i in range(0, len(words), chunk_size):
chunk = " ".join(words[i:i + chunk_size])
chunks.append(chunk)
return chunks
def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
"""检索相关文档"""
if not self.chunks:
return []
# 生成查询向量
query_embedding = self.client.embeddings.create(
input=query,
model=self.model
).data[0].embedding
query_vec = np.array(query_embedding)
vectors = np.array(self.embeddings)
# 计算相似度
indices, scores = SimilarityCalculator.find_most_similar(
query_vec, vectors, top_k=top_k
)
# 返回结果
results = []
for idx, score in zip(indices, scores):
results.append({
"content": self.chunks[idx],
"score": float(score),
"source": self.sources[idx]
})
return results
def answer_question(self, question: str, top_k: int = 3) -> str:
"""回答问题"""
# 检索相关文档
relevant_docs = self.retrieve(question, top_k=top_k)
if not relevant_docs:
return "抱歉,我没有找到相关信息。"
# 构建上下文
context = "\n\n".join([
f"[来源: {doc['source']['source']}]\n{doc['content']}"
for doc in relevant_docs
])
# 生成答案
prompt = f"""
基于以下上下文回答问题。如果上下文中没有相关信息,请说"我不知道"。
上下文:
{context}
问题: {question}
答案:
"""
response = self.client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": prompt}],
temperature=0.3
)
answer = response.choices[0].message.content
return {
"answer": answer,
"sources": [doc["source"] for doc in relevant_docs]
}
# 使用示例
kb = KnowledgeBase()
# 加载文档
documents = [
{
"content": "Python是一种解释型、面向对象、动态数据类型的高级程序设计语言。Python由Guido van Rossum于1989年底发明。Python的设计哲学强调代码的可读性和简洁的语法。",
"source": "Python文档",
"metadata": {"category": "编程语言"}
},
{
"content": "机器学习是人工智能的一个分支。它是一门多领域交叉学科,涉及概率论、统计学、逼近论、凸分析、算法复杂度理论等多门学科。机器学习的核心是使用算法解析数据,从中学习,然后对真实世界中的事件做出决策和预测。",
"source": "AI百科",
"metadata": {"category": "人工智能"}
},
{
"content": "深度学习是机器学习的一个子集,它使用多层神经网络来学习数据的表示。深度学习在图像识别、语音识别、自然语言处理等领域取得了突破性进展。",
"source": "深度学习指南",
"metadata": {"category": "人工智能"}
}
]
kb.load_documents(documents)
# 问答
result = kb.answer_question("什么是Python?")
print(f"问题: 什么是Python?")
print(f"答案: {result['answer']}")
print(f"\n参考来源:")
for source in result['sources']:
print(f" - {source['source']}")总结
本教程涵盖了Embedding和向量表示的核心知识:
- Embedding原理: 文本到向量的转换
- OpenAI API: 使用最先进的embedding模型
- Sentence Transformers: 开源替代方案
- 相似度计算: 余弦、欧氏、点积等度量
- 语义搜索: 从简单到高级的实现
- 实战应用: 构建知识库问答系统
关键要点
- Embedding捕获文本的语义信息
- 相似文本的向量在空间中距离近
- 余弦相似度是最常用的度量
- 批量处理可以提高效率
- 元数据和过滤增强搜索能力
下一步
- 学习向量数据库 (Milvus, Pinecone)
- 探索RAG (检索增强生成)
- 优化分块策略
- 实现混合搜索 (关键词+语义)
💬 讨论
使用 GitHub 账号登录后即可参与讨论