lindorm_search_store
Lindorm
Lindorm is Alibaba Cloud's cloud-native, multimodal database. It provides storage, indexing, and retrieval for massive vector data, supports a variety of indexing algorithms and distance functions, and offers a rich set of fusion retrieval methods. Beyond the full-text and vector fusion retrieval that RAG systems need to improve the accuracy of model responses, it also serves AI scenarios such as personalized recommendation, NLP services, and intelligent Q&A.
This notebook covers how to get started with Lindorm's one-stop AI + vector retrieval service.
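Running this notebook requires the LangChain community integrations plus the environs package used below for configuration. The exact search client dependency may vary with your deployment; a typical install (the opensearch-py dependency is an assumption, based on the Lindorm search endpoint speaking an OpenSearch-compatible protocol) looks like:
# Install the packages imported below; opensearch-py is assumed to be the
# underlying client for the Lindorm search endpoint.
%pip install -qU langchain langchain-community langchain-text-splitters environs opensearch-py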
from langchain_community.embeddings.lindorm_embedding import LindormAIEmbeddings
import environs

env = environs.Env()
env.read_env(".env")


class Config:
    AI_EMB_ENDPOINT = env.str("AI_EMB_ENDPOINT", "<EMB_ENDPOINT>")
    AI_USERNAME = env.str("AI_USERNAME", "root")
    AI_PWD = env.str("AI_PWD", "<PASSWORD>")
    AI_DEFAULT_RERANK_MODEL = "rerank_bge_large"
    AI_DEFAULT_EMBEDDING_MODEL = "bge-large-zh-v1.5"
    SEARCH_ENDPOINT = env.str("SEARCH_ENDPOINT", "<SEARCH_ENDPOINT>")
    SEARCH_USERNAME = env.str("SEARCH_USERNAME", "root")
    SEARCH_PWD = env.str("SEARCH_PWD", "<PASSWORD>")


ldai_emb = LindormAIEmbeddings(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_EMBEDDING_MODEL,
)
API Reference: LindormAIEmbeddings
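Before creating the index, it is worth confirming that the model's output dimension matches the dimension=1024 passed to the vector store below; embed_query is part of the standard LangChain embeddings interface:
# Sanity check: the vector length must equal the `dimension` argument
# used when the index is created.
sample_vector = ldai_emb.embed_query("hello world")
print("embedding dimension:", len(sample_vector))  # expect 1024 for bge-large-zh-v1.5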
Define helper functions
# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i + 1}:\n\n{d.page_content}\n\nMetadata: {d.metadata}" for i, d in enumerate(docs)]
        )
    )


# Helper function for printing (doc, score) tuples
def pretty_print_docs_with_score(docs_with_score):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i + 1}:\n\n{doc.page_content}\n\nMetadata: {doc.metadata}, score: {score}"
                for i, (doc, score) in enumerate(docs_with_score)
            ]
        )
    )
Load & chunk documents
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

loader = TextLoader("baike_documents.txt")
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print("chunks:", len(docs))
pretty_print_docs(docs[0:1])
API Reference: CharacterTextSplitter | TextLoader
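CharacterTextSplitter splits on its separator first and only then packs pieces up to chunk_size (counted in characters), so individual chunks can exceed 100 characters when a single piece is longer than that. A quick look at the chunk-length distribution confirms what the splitter produced:
# Inspect chunk lengths; the splitter measures len() of page_content.
lengths = [len(d.page_content) for d in docs]
print("min/avg/max chunk length:", min(lengths), sum(lengths) // len(lengths), max(lengths))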
Adding metadata to documents
import copy

USE_ROUTE = False

if USE_ROUTE:
    # Training an ivfpq index needs more vectors than max(256, nlist), and
    # nlist defaults to 1000, so duplicate the chunks to get enough data.
    docs = [copy.deepcopy(doc) for doc in docs for _ in range(100)]
    print("total doc:", len(docs))  # 1100

for i, doc in enumerate(docs):
    doc.metadata["chunk_id"] = i
    doc.metadata["date"] = f"{range(2010, 2020)[i % 10]}-01-01"
    doc.metadata["rating"] = range(1, 6)[i % 5]
    doc.metadata["author"] = ["John Doe", "Jane Doe"][i % 2]
    doc.metadata["routing"] = str(i % 2)

pretty_print_docs(docs[0:1])
Configure Lindorm Search Vector Store & Index the documents
from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        routing_field="routing",  # use metadata["routing"] as the routing field
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        data_type="float",
        method_name="ivfpq",
        # ivfpq-specific arguments
        nlist=32,  # defaults to 1000
    )
else:
    INDEX_NAME = "search_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        data_type="float",
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        method_name="hnsw",
    )
API Reference: LindormSearchStore
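from_documents is a one-shot constructor; once the index exists, further documents can be appended without rebuilding it. A minimal sketch, assuming LindormSearchStore implements the standard VectorStore.add_texts interface:
# Append one more chunk to the existing index (sketch; add_texts is the
# standard VectorStore method for incremental ingestion).
new_ids = ld_search_store.add_texts(
    texts=["辛弃疾,南宋词人。"],  # "Xin Qiji, a ci poet of the Southern Song."
    metadatas=[{"author": "John Doe", "rating": 5, "date": "2015-01-01"}],
)
print("ingested ids:", new_ids)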
Configure Lindorm Search Vector Store & Index the documents with chunk ids
from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        ids=[str(d.metadata["chunk_id"]) for d in docs],
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        routing_field="routing",  # use metadata["routing"] as the routing field
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        data_type="float",
        method_name="ivfpq",  # routed indexes support only ivfpq
        overwrite=False,  # skip docs whose _id exists; overwrite them when True
        # ivfpq-specific arguments
        nlist=32,  # defaults to 1000
    )
else:
    INDEX_NAME = "search_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        ids=[str(d.metadata["chunk_id"]) for d in docs],
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        data_type="float",
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        method_name="hnsw",
        overwrite=False,  # skip docs whose _id exists; overwrite them when True
    )
API Reference: LindormSearchStore
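Because each chunk now has a stable _id, re-running the cell above is idempotent with overwrite=False (existing ids are skipped) and an upsert with overwrite=True. The same ids also enable targeted deletion; a minimal sketch, assuming this store implements the ids-based delete of the base VectorStore API:
# Remove specific chunks by the ids assigned at ingest time (sketch;
# assumes VectorStore.delete(ids=...) is supported by this store).
ld_search_store.delete(ids=["0", "1"])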
Routing
When the index is created with a routing_field, each document is placed on a shard according to its routing value, and a query that passes the same routing value searches only those shards.
if USE_ROUTE:
    assert ld_search_store.kwargs.get("method_name") == "ivfpq"
    assert ld_search_store.kwargs.get("routing_field") is not None

    query = "辛弃疾的纪念馆在哪里?"  # "Where is Xin Qiji's memorial hall?"
    docs_with_score = ld_search_store.similarity_search_with_score(
        query=query,
        routing="0",  # restrict the search to one routing partition ("0" or "1")
        k=5,
        hybrid=True,
        nprobe="200",
        reorder_factor="2",
        client_refactor="true",
    )
    print(docs_with_score[0:1])
Dense vector search
if not USE_ROUTE:
    query = "辛弃疾的纪念馆在哪里?"  # "Where is Xin Qiji's memorial hall?"
    # docs = ld_search_store.similarity_search(query, k=10)
    # pretty_print_docs(docs)
    docs_with_score = ld_search_store.similarity_search_with_score(
        query, k=10, hybrid=True, rrf_rank_constant="60", _source=True
    )
    print(docs_with_score)
    pretty_print_docs_with_score(docs_with_score[0:1])
Dense vector search with metadata filtering
query = "辛弃疾"
#Filter by Partial Match
docs = ld_search_store.similarity_search(query, k=10, filter=[{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}])
print(docs[0].metadata['author'])
#Filter by Date Range
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.date": {"gte": "2016-01-01"}}}])
print(docs[0].metadata['date'])
#Filter by Numeric Range
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}])
print(docs[0].metadata['rating'])
#pre_filter
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="pre_filter")
print(docs[0].metadata['rating'])
#post_filter
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="post_filter")
print(docs[0].metadata['rating'])
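Filters are expressed as a list of OpenSearch-style query clauses, so several conditions can be combined by supplying more than one clause; a minimal sketch, assuming the clauses in the list are AND-combined:
# Combine a date-range clause and a rating clause in one filter list
# (assumes the list is AND-combined, as in a bool filter).
docs = ld_search_store.similarity_search(
    query,
    k=10,
    filter=[
        {"range": {"metadata.date": {"gte": "2016-01-01"}}},
        {"range": {"metadata.rating": {"gte": 3}}},
    ],
)
print(docs[0].metadata["date"], docs[0].metadata["rating"])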
Full-text search
query = "辛弃疾的纪念馆在哪里?"
#docs = ld_search_store.similarity_search(query, k=10, search_type="text_search")
#pretty_print_docs(docs)
docs_with_score = ld_search_store.similarity_search_with_score(query, k=10, search_type="text_search")
pretty_print_docs_with_score(docs_with_score)
Full-text search with metadata filtering
query = "辛弃疾"
#Filter by Partial Match
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}])
print(docs[0].metadata['author'])
#Filter by Date Range
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.date": {"gte": "2016-01-01"}}}])
print(docs[0].metadata['date'])
#Filter by Numeric Range
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}])
print(docs[0].metadata['rating'])
#pre_filter
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="pre_filter")
print(docs[0].metadata['rating'])
#post_filter
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="post_filter")
print(docs[0].metadata['rating'])
Hybrid retrieval with dense vector & keyword search
Hybrid retrieval runs the dense-vector and full-text queries together and fuses the two ranked lists with reciprocal rank fusion (RRF); rrf_rank_constant controls how strongly top-ranked hits dominate the fused score.
query = "辛弃疾是谁?"
#docs = ld_search_store.similarity_search(query, k=10, hybrid=True, rrf_rank_constant="60")
#pretty_print_docs(docs)
docs_with_score = ld_search_store.similarity_search_with_score(query, k=10, hybrid=True, rrf_rank_constant="60")
pretty_print_docs_with_score(docs_with_score)
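To use hybrid retrieval inside a chain, the extra keyword arguments can be carried by as_retriever; a minimal sketch, assuming search_kwargs are forwarded unchanged to similarity_search:
# A retriever that always performs hybrid RRF retrieval (sketch; assumes
# search_kwargs pass through to similarity_search).
hybrid_retriever = ld_search_store.as_retriever(
    search_kwargs={"k": 10, "hybrid": True, "rrf_rank_constant": "60"}
)
hybrid_docs = hybrid_retriever.invoke("辛弃疾是谁?")  # "Who is Xin Qiji?"
pretty_print_docs(hybrid_docs[0:1])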
Reranking with LindormAIRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors.lindormai_rerank import LindormAIRerank

ldai_rerank = LindormAIRerank(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_RERANK_MODEL,
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=ldai_rerank, base_retriever=ld_search_store.as_retriever()
)

compressed_docs = compression_retriever.invoke("辛弃疾的纪念馆在哪里?")  # "Where is Xin Qiji's memorial hall?"
pretty_print_docs(compressed_docs)
API Reference: ContextualCompressionRetriever | LindormAIRerank
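Rerankers in langchain_community commonly attach their score to each compressed document's metadata, often under a relevance_score key; if LindormAIRerank follows that convention (an assumption, not confirmed here), the scores can be inspected directly:
# Print reranker scores if present; the "relevance_score" key is an
# assumption based on other langchain_community rerankers.
for doc in compressed_docs:
    print(doc.metadata.get("relevance_score"), "-", doc.page_content[:30])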
Delete Index
ld_search_store.delete_index()
Related
- Vector store conceptual guide
- Vector store how-to guides