# Third-party imports, grouped and sorted per PEP 8.
import faiss

from langchain_community.docstore import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
|
# FAISS must already be installed.

# Configure the embedding model.
model_name = "BAAI/bge-small-zh-v1.5"  # model name on Hugging Face
model_kwargs = {'device': 'cpu'}  # 'cpu' if there is no GPU; 'cuda' for an NVIDIA GPU
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

# On the first run the model is downloaded automatically from Hugging Face
# into the default HF cache directory.
hf_embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
|
# 1. Initialize the vector database.

# Create a flat L2 index sized to the embedding dimension
# (probe the dimension by embedding a throwaway query).
index = faiss.IndexFlatL2(len(hf_embedding.embed_query('Hello world!')))

db = FAISS(
    # embedding model used to vectorize incoming texts
    embedding_function=hf_embedding,
    # FAISS index that stores the raw vectors
    index=index,
    # in-memory document store (a simple non-relational doc map)
    docstore=InMemoryDocstore(),
    # mapping from FAISS index positions to docstore ids
    index_to_docstore_id={},
)
|
# 2. Prepare the data.
document_1 = Document(
    # the actual text; the keyword must be page_content
    page_content="今天早餐我吃了巧克力薄煎饼和炒蛋。",
    # arbitrary extra data, free-form key/value metadata
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="明天的天气预报是阴天多云,最高气温62华氏度。",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="正在用LangChain构建一个激动人心的新项目——快来看看吧!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="劫匪闯入城市银行,盗走了100万美元现金。",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="哇!那部电影太精彩了,我已经迫不及待想再看一遍。",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="新iPhone值得这个价格吗?阅读这篇评测一探究竟。",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="当今世界排名前十的足球运动员。",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph是构建有状态智能体应用的最佳框架!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="由于对经济衰退的担忧,今日股市下跌500点。",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="我有种不好的预感,我要被删除了 :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
|
# Generate ids "id1" .. "idN", one per document.
ids = [f"id{i}" for i in range(1, len(documents) + 1)]

# Insert the documents into FAISS under those explicit ids (held in memory).
db.add_documents(documents, ids=ids)
|
|
|
# Persist the vector store (FAISS index + docstore) to disk.
db.save_local('./faiss_db')
|
|
|
|
|
# Semantic search; k = number of results to return.
# results = db.similarity_search('今天的金融投资新闻', k=2)
results = db.similarity_search('有美食的内容吗', k=2)
for res in results:
    print(type(res))  # result type (a langchain Document)
    print(res.id)     # the id assigned at insertion time
    print(f"* {res.page_content} [{res.metadata}]")
|