Embedding

Embedding：1、先基于模型实现向量数据转换。 2、基于HuggingFace安装模型实现向量数据转换。 3、美食评论小案例。 向量数据库：1、FAISS基本操作。
5 months ago · 45612161d6
parent ab2b6e2ce4
commit 45612161d6
5 changed files with 345 additions and 1 deletions
--- a/05-Embedding/03-HuggingFace下载BGE模型实现向量转换.py
+++ b/05-Embedding/03-HuggingFace下载BGE模型实现向量转换.py
@ -7,7 +7,6 @@ model_kwargs = {'device': 'cpu'} # 没有显卡就用cpu，有英伟达显卡写
 encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

 #  第一次运行，会自动下载模型（去huggingface上下载），下载到hf默认的缓存目录。
-
 hf_embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
--- a/05-Embedding/04-美食评论小案例.py
+++ b/05-Embedding/04-美食评论小案例.py
@ -0,0 +1,96 @@
+import ast
+import pandas as pd
+import numpy as np
+from langchain_huggingface import HuggingFaceEmbeddings
+
+model_name = "BAAI/bge-small-zh-v1.5"
+model_kwargs = {'device': 'cpu'}
+encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
+
+
+
+bge_hf_embedding = HuggingFaceEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+
+# 【2】该函数就是把数据变为向量的函数
+def text_2_embedding(text):
+    resp = bge_hf_embedding.embed_documents(
+        [text]
+    )
+    return resp[0]
+
+# 【1】读取原始文件中的美食评论数据，通过调用Embedding模型，得到对应的向量，并保持到新文件中
+def embedding_2_file(source_file, output_file):
+    """读取原始的美食评论数据，通过调用Embedding模型，得到向量，并保持到新文件中"""
+    #  步骤：1、准备数据，并读取，从第index_col个字段读取
+    # pandas 是 Python 中用于数据处理和分析的核心工具，专门解决表格类数据的各类操作需求。
+    # pandas 支持读取几乎所有常见的结构化数据格式（Excel、CSV、SQL、JSON 等），并能将处理后的数据便捷地保存为这些格式，解决了数据导入导出的基础问题。
+    df = pd.read_csv(source_file, index_col=0)
+    # 读取后提取所需要的字段
+    df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]
+
+    print(df.head(2)) # 打印前两行看看
+
+    # 步骤2： 清洗数据 和 合并数据
+    df = df.dropna() # 如果有空数据，就删掉
+    # 把评论的摘要  和 内容字段 合并成 一个字段（方便后续处理），放入新字段text_content
+    df['text_content'] = 'Summary: ' + df.Summary.str.strip() + "; Text: " + df.Text.str.strip()
+    print(df.head(2))  # 打印前两行看看 ，发现确实增加一个字段text_content
+
+    # 步骤3: 对text_content向量化，存到一个新的文件中，向量单独存入embedding字段
+    df['embedding'] = df.text_content.apply(lambda x: text_2_embedding(x))
+    # 保存到新文件
+    df.to_csv(output_file)
+
+
+def cosine_distance(a, b):
+    """Return the cosine similarity of vectors ``a`` and ``b``.
+
+    Despite the function name, the value computed is the similarity (dot
+    product divided by the product of the two norms), not
+    ``1 - similarity``: a larger result means the vectors are more alike.
+    """
+    return  np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+# 【3】input：用户输入的问题   ，embedding_file 要检索的文件， top_n 按照相似性最多检索3个
+def search_text(input, embedding_file, top_n=3):
+    """
+    根据用户输入的问题，进行语义检索，返回最相似的前top_n个结果
+    :param input:
+    :param top_n:
+    :return:
+    """
+    # 读取新文件
+    df_data = pd.read_csv(embedding_file)
+    # 取文件中df_data['embedding']字段，该字段在文件中是以字符串存储的
+    # 所以你从文件读取出来这个字段也是字符串类型的啊
+    # 要把这个字符串变成向量进行后续的数学计算，保持到新字段embedding_vector
+    # 目前这个新字段是在内存中的，没有在文件里
+    df_data['embedding_vector'] = df_data['embedding'].apply(ast.literal_eval)
+
+
+    # 把输入问题转化为向量
+    input_vector = text_2_embedding(input)
+
+
+    # 内存中embedding_vector字段和input_vector进行相似度比较
+    # 按照余弦相似度比较，余弦相似度单独封装到函数 cosine_distance中
+    # 产生一个新字段  similarity
+    df_data['similarity'] = df_data.embedding_vector.apply(lambda x: cosine_distance(x, input_vector))
+
+    res = (
+        # 对similarity字段排序，降序排
+        df_data.sort_values('similarity', ascending=False)
+        # 返回topn,此案例是：返回3个
+        .head(top_n)
+        # 把text_content字段的Summary: 替换为""，把; Text: 替换为""
+        .text_content.str.replace('Summary: ', "")  # text_content是字段名
+        .str.replace('; Text: ', ';')
+    )
+
+    # 把topn的结果打印，每打印一行画个虚线
+    for r in res:
+        print(r)
+        print('-' * 30)
+
+if __name__ == '__main__':
+    # The embedding file can be built first by running on its own:
+    # embedding_2_file('../datas/fine_food_reviews_1k.csv', '../datas/output_embedding.csv')
+    # NOTE(review): that example writes to '../datas/', while the search
+    # below reads './output_embedding.csv' - confirm which path is intended.
+    search_text('delicious beans', './output_embedding.csv')
--- a/05-Embedding/output_embedding.csv
+++ b/05-Embedding/output_embedding.csv
--- a/06-向量数据库/01-faiss入门.py
+++ b/06-向量数据库/01-faiss入门.py
@ -0,0 +1,113 @@
+from langchain_community.docstore import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+import faiss;
+from langchain_huggingface import HuggingFaceEmbeddings
+
+# 下载好FAISS
+
+# 准备好向量化的对象
+model_name = "BAAI/bge-small-zh-v1.5" # 模型名
+model_kwargs = {'device': 'cpu'} # 没有显卡就用cpu，有英伟达显卡写cuda
+encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+
+#  第一次运行，会自动下载模型（去huggingface上下载），下载到hf默认的缓存目录。
+hf_embedding = HuggingFaceEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+# 1、初始化数据库
+# 创建索引
+index = faiss.IndexFlatL2(len(hf_embedding.embed_query('Hello world!')));
+db = FAISS(
+    # 指定好初始化的Embedding的算法模型引用
+    embedding_function = hf_embedding,
+    # 索引长度
+    index = index,
+    # 内存存储文档 doc嘛，非关系型数据库
+    docstore = InMemoryDocstore(),
+    # 字典
+    index_to_docstore_id = {}
+);
+
+# 2、准备数据
+document_1 = Document(
+    # 具体数据，key要定死是page_content
+    page_content = "今天早餐我吃了巧克力薄煎饼和炒蛋。",
+    # 额外数据，随便写，key：value格式
+    metadata = {"source": "tweet"}
+);
+document_2 = Document(
+    page_content="明天的天气预报是阴天多云，最高气温62华氏度。",
+    metadata={"source": "news"},
+)
+
+document_3 = Document(
+    page_content="正在用LangChain构建一个激动人心的新项目——快来看看吧！",
+    metadata={"source": "tweet"},
+)
+
+document_4 = Document(
+    page_content="劫匪闯入城市银行，盗走了100万美元现金。",
+    metadata={"source": "news"},
+)
+
+document_5 = Document(
+    page_content="哇！那部电影太精彩了，我已经迫不及待想再看一遍。",
+    metadata={"source": "tweet"},
+)
+
+document_6 = Document(
+    page_content="新iPhone值得这个价格吗？阅读这篇评测一探究竟。",
+    metadata={"source": "website"},
+)
+
+document_7 = Document(
+    page_content="当今世界排名前十的足球运动员。",
+    metadata={"source": "website"},
+)
+
+document_8 = Document(
+    page_content="LangGraph是构建有状态智能体应用的最佳框架！",
+    metadata={"source": "tweet"},
+)
+
+document_9 = Document(
+    page_content="由于对经济衰退的担忧，今日股市下跌500点。",
+    metadata={"source": "news"},
+)
+
+document_10 = Document(
+    page_content="我有种不好的预感，我要被删除了 :(",
+    metadata={"source": "tweet"},
+)
+
+documents = [
+    document_1,
+    document_2,
+    document_3,
+    document_4,
+    document_5,
+    document_6,
+    document_7,
+    document_8,
+    document_9,
+    document_10,
+]
+# 生成个id，1~documents长度的
+ids = [ 'id'+str(i+1) for i in range(len(documents))]
+# 数据插入到faiss,并且指定ID，保存到内存~
+db.add_documents(documents,ids = ids);
+
+# 把数据库写入磁盘
+db.save_local('./faiss_db')
+
+
+# 语义检索，k=返回几条
+# results = db.similarity_search('今天的金融投资新闻', k=2)
+results = db.similarity_search('有美食的内容吗', k=2)
+for res in results:
+    print(type(res))
+    print(res.id)
+    print(f"* {res.page_content} [{res.metadata}]")
--- a/06-向量数据库/02-faiss删除操作.py
+++ b/06-向量数据库/02-faiss删除操作.py
@ -0,0 +1,34 @@
+from typing import List
+
+import faiss
+from langchain_community.docstore import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+
+
+
+# 准备好向量化的对象
+model_name = "BAAI/bge-small-zh-v1.5" # 模型名
+model_kwargs = {'device': 'cpu'} # 没有显卡就用cpu，有英伟达显卡写cuda
+encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+
+#  第一次运行，会自动下载模型（去huggingface上下载），下载到hf默认的缓存目录。
+hf_embedding = HuggingFaceEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+# 1、
+
+# 把数据库写入磁盘
+vector_store = FAISS.load_local('./faiss_db', embeddings=hf_embedding, allow_dangerous_deserialization=True)
+
+vector_store.delete(ids=['id8'])
+
+# results = vector_store.similarity_search('今天的金融投资新闻', k=2)
+results = vector_store.similarity_search_with_score('有美食的内容吗', k=4, filter={"source": 'tweet'})  # 带分数
+for res, score in results:
+    print(type(res))
+    print(res.id)
+    print(f"* [Score={score:3f}] {res.page_content} [{res.metadata}]")