lindorm_search_store
Lindorm
Lindorm is Alibaba Cloud's cloud-native, multimodal database. It provides storage, indexing, and retrieval for massive vector data, supports a variety of indexing algorithms and distance functions, and offers a rich set of fusion retrieval methods. Beyond the full-text and vector fusion retrieval that RAG systems need to improve the accuracy of model responses, it also serves AI scenarios such as personalized recommendation, NLP services, and intelligent Q&A.
This notebook covers how to get started with Lindorm's one-stop AI + vector retrieval service.
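Running this notebook requires the LangChain community integrations plus the environs package used below for configuration. The exact search client dependency may vary with your deployment; a typical install (the opensearch-py dependency is an assumption, based on the Lindorm search endpoint speaking an OpenSearch-compatible protocol) looks like:
# Install the packages imported below; opensearch-py is assumed to be the
# underlying client for the Lindorm search endpoint.
%pip install -qU langchain langchain-community langchain-text-splitters environs opensearch-py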
from langchain_community.embeddings.lindorm_embedding import LindormAIEmbeddings
import environs

env = environs.Env()
env.read_env(".env")


class Config:
    AI_EMB_ENDPOINT = env.str("AI_EMB_ENDPOINT", "<EMB_ENDPOINT>")
    AI_USERNAME = env.str("AI_USERNAME", "root")
    AI_PWD = env.str("AI_PWD", "<PASSWORD>")
    AI_DEFAULT_RERANK_MODEL = "rerank_bge_large"
    AI_DEFAULT_EMBEDDING_MODEL = "bge-large-zh-v1.5"
    SEARCH_ENDPOINT = env.str("SEARCH_ENDPOINT", "<SEARCH_ENDPOINT>")
    SEARCH_USERNAME = env.str("SEARCH_USERNAME", "root")
    SEARCH_PWD = env.str("SEARCH_PWD", "<PASSWORD>")


ldai_emb = LindormAIEmbeddings(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_EMBEDDING_MODEL,
)
API Reference: LindormAIEmbeddings
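Before creating the index, it is worth confirming that the model's output dimension matches the dimension=1024 passed to the vector store below; embed_query is part of the standard LangChain embeddings interface:
# Sanity check: the vector length must equal the `dimension` argument
# used when the index is created.
sample_vector = ldai_emb.embed_query("hello world")
print("embedding dimension:", len(sample_vector))  # expect 1024 for bge-large-zh-v1.5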
Define helper functions
# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i + 1}:\n\n{d.page_content}\n\nMetadata: {d.metadata}" for i, d in enumerate(docs)]
        )
    )


# Helper function for printing (doc, score) tuples
def pretty_print_docs_with_score(docs_with_score):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i + 1}:\n\n{doc.page_content}\n\nMetadata: {doc.metadata}, score: {score}"
                for i, (doc, score) in enumerate(docs_with_score)
            ]
        )
    )
Load & chunk documents
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

loader = TextLoader("baike_documents.txt")
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print("chunks:", len(docs))
pretty_print_docs(docs[0:1])
API Reference: CharacterTextSplitter | TextLoader
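CharacterTextSplitter splits on its separator first and only then packs pieces up to chunk_size (counted in characters), so individual chunks can exceed 100 characters when a single piece is longer than that. A quick look at the chunk-length distribution confirms what the splitter produced:
# Inspect chunk lengths; the splitter measures len() of page_content.
lengths = [len(d.page_content) for d in docs]
print("min/avg/max chunk length:", min(lengths), sum(lengths) // len(lengths), max(lengths))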
Adding metadata to documents
import copy

USE_ROUTE = False

if USE_ROUTE:
    # Training an ivfpq index needs more vectors than max(256, nlist), and
    # nlist defaults to 1000, so duplicate the chunks to get enough data.
    docs = [copy.deepcopy(doc) for doc in docs for _ in range(100)]
    print("total doc:", len(docs))  # 1100

for i, doc in enumerate(docs):
    doc.metadata["chunk_id"] = i
    doc.metadata["date"] = f"{range(2010, 2020)[i % 10]}-01-01"
    doc.metadata["rating"] = range(1, 6)[i % 5]
    doc.metadata["author"] = ["John Doe", "Jane Doe"][i % 2]
    doc.metadata["routing"] = str(i % 2)

pretty_print_docs(docs[0:1])
Configure Lindorm Search Vector Store & Index the documents
from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        routing_field="routing",  # use metadata["routing"] as the routing field
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        data_type="float",
        method_name="ivfpq",
        # ivfpq-specific arguments
        nlist=32,  # defaults to 1000
    )
else:
    INDEX_NAME = "search_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        data_type="float",
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        method_name="hnsw",
    )
API Reference: LindormSearchStore
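from_documents is a one-shot constructor; once the index exists, further documents can be appended without rebuilding it. A minimal sketch, assuming LindormSearchStore implements the standard VectorStore.add_texts interface:
# Append one more chunk to the existing index (sketch; add_texts is the
# standard VectorStore method for incremental ingestion).
new_ids = ld_search_store.add_texts(
    texts=["辛弃疾,南宋词人。"],  # "Xin Qiji, a ci poet of the Southern Song."
    metadatas=[{"author": "John Doe", "rating": 5, "date": "2015-01-01"}],
)
print("ingested ids:", new_ids)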
Configure Lindorm Search Vector Store & Index the documents with chunk ids
from langchain_community.vectorstores.lindorm_search_store import LindormSearchStore

LDSEARCH_ENDPOINT = Config.SEARCH_ENDPOINT
LDSEARCH_USERNAME = Config.SEARCH_USERNAME
LDSEARCH_PWD = Config.SEARCH_PWD

if USE_ROUTE:
    INDEX_NAME = "search_route_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        ids=[str(d.metadata["chunk_id"]) for d in docs],
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        routing_field="routing",  # use metadata["routing"] as the routing field
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        data_type="float",
        method_name="ivfpq",  # routed indexes support only ivfpq
        overwrite=False,  # skip docs whose _id exists; overwrite them when True
        # ivfpq-specific arguments
        nlist=32,  # defaults to 1000
    )
else:
    INDEX_NAME = "search_test_idx"
    ld_search_store = LindormSearchStore.from_documents(
        docs,
        ids=[str(d.metadata["chunk_id"]) for d in docs],
        lindorm_search_url=LDSEARCH_ENDPOINT,
        index_name=INDEX_NAME,
        embedding=ldai_emb,
        http_auth=(LDSEARCH_USERNAME, LDSEARCH_PWD),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=60,
        embed_thread_num=2,  # threads used for text -> embedding
        write_thread_num=5,  # threads used for embedding ingestion
        pool_maxsize=10,  # search client pool size
        analyzer="ik_smart",  # full-text analyzer
        data_type="float",
        space_type="cosinesimil",  # alternatives: l2, innerproduct
        dimension=1024,  # must match the embedding model's output dimension
        method_name="hnsw",
        overwrite=False,  # skip docs whose _id exists; overwrite them when True
    )
API Reference: LindormSearchStore
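Because each chunk now has a stable _id, re-running the cell above is idempotent with overwrite=False (existing ids are skipped) and an upsert with overwrite=True. The same ids also enable targeted deletion; a minimal sketch, assuming this store implements the ids-based delete of the base VectorStore API:
# Remove specific chunks by the ids assigned at ingest time (sketch;
# assumes VectorStore.delete(ids=...) is supported by this store).
ld_search_store.delete(ids=["0", "1"])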
Routing
When the index is created with a routing_field, each document is placed on a shard according to its routing value, and a query that passes the same routing value searches only those shards.
if USE_ROUTE:
    assert ld_search_store.kwargs.get("method_name") == "ivfpq"
    assert ld_search_store.kwargs.get("routing_field") is not None

    query = "辛弃疾的纪念馆在哪里?"  # "Where is Xin Qiji's memorial hall?"
    docs_with_score = ld_search_store.similarity_search_with_score(
        query=query,
        routing="0",  # restrict the search to one routing partition ("0" or "1")
        k=5,
        hybrid=True,
        nprobe="200",
        reorder_factor="2",
        client_refactor="true",
    )
    print(docs_with_score[0:1])
Dense vector search
if not USE_ROUTE:
    query = "辛弃疾的纪念馆在哪里?"  # "Where is Xin Qiji's memorial hall?"
    # docs = ld_search_store.similarity_search(query, k=10)
    # pretty_print_docs(docs)
    docs_with_score = ld_search_store.similarity_search_with_score(
        query, k=10, hybrid=True, rrf_rank_constant="60", _source=True
    )
    print(docs_with_score)
    pretty_print_docs_with_score(docs_with_score[0:1])
Dense vector search with metadata filtering
query = "辛弃疾"
#Filter by Partial Match
docs = ld_search_store.similarity_search(query, k=10, filter=[{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}])
print(docs[0].metadata['author'])
#Filter by Date Range
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.date": {"gte": "2016-01-01"}}}])
print(docs[0].metadata['date'])
#Filter by Numeric Range
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}])
print(docs[0].metadata['rating'])
#pre_filter
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="pre_filter")
print(docs[0].metadata['rating'])
#post_filter
docs = ld_search_store.similarity_search(query, k=10, filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="post_filter")
print(docs[0].metadata['rating'])
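Filters are expressed as a list of OpenSearch-style query clauses, so several conditions can be combined by supplying more than one clause; a minimal sketch, assuming the clauses in the list are AND-combined:
# Combine a date-range clause and a rating clause in one filter list
# (assumes the list is AND-combined, as in a bool filter).
docs = ld_search_store.similarity_search(
    query,
    k=10,
    filter=[
        {"range": {"metadata.date": {"gte": "2016-01-01"}}},
        {"range": {"metadata.rating": {"gte": 3}}},
    ],
)
print(docs[0].metadata["date"], docs[0].metadata["rating"])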
Full-text search
query = "辛弃疾的纪念馆在哪里?"
#docs = ld_search_store.similarity_search(query, k=10, search_type="text_search")
#pretty_print_docs(docs)
docs_with_score = ld_search_store.similarity_search_with_score(query, k=10, search_type="text_search")
pretty_print_docs_with_score(docs_with_score)
Full-text search with metadata filtering
query = "辛弃疾"
#Filter by Partial Match
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"match": {"metadata.author": {"query": "Jon", "fuzziness": "AUTO"}}}])
print(docs[0].metadata['author'])
#Filter by Date Range
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.date": {"gte": "2016-01-01"}}}])
print(docs[0].metadata['date'])
#Filter by Numeric Range
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}])
print(docs[0].metadata['rating'])
#pre_filter
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="pre_filter")
print(docs[0].metadata['rating'])
#post_filter
docs = ld_search_store.similarity_search(query, k=10, search_type="text_search", filter=[{"range": {"metadata.rating": {"gte": 3}}}], filter_type="post_filter")
print(docs[0].metadata['rating'])
Hybrid retrieval with dense vector & keyword search
Hybrid retrieval runs the dense-vector and full-text queries together and fuses the two ranked lists with reciprocal rank fusion (RRF); rrf_rank_constant controls how strongly top-ranked hits dominate the fused score.
query = "辛弃疾是谁?"
#docs = ld_search_store.similarity_search(query, k=10, hybrid=True, rrf_rank_constant="60")
#pretty_print_docs(docs)
docs_with_score = ld_search_store.similarity_search_with_score(query, k=10, hybrid=True, rrf_rank_constant="60")
pretty_print_docs_with_score(docs_with_score)
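To use hybrid retrieval inside a chain, the extra keyword arguments can be carried by as_retriever; a minimal sketch, assuming search_kwargs are forwarded unchanged to similarity_search:
# A retriever that always performs hybrid RRF retrieval (sketch; assumes
# search_kwargs pass through to similarity_search).
hybrid_retriever = ld_search_store.as_retriever(
    search_kwargs={"k": 10, "hybrid": True, "rrf_rank_constant": "60"}
)
hybrid_docs = hybrid_retriever.invoke("辛弃疾是谁?")  # "Who is Xin Qiji?"
pretty_print_docs(hybrid_docs[0:1])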
Reranking with LindormAIRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors.lindormai_rerank import LindormAIRerank

ldai_rerank = LindormAIRerank(
    endpoint=Config.AI_EMB_ENDPOINT,
    username=Config.AI_USERNAME,
    password=Config.AI_PWD,
    model_name=Config.AI_DEFAULT_RERANK_MODEL,
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=ldai_rerank, base_retriever=ld_search_store.as_retriever()
)

compressed_docs = compression_retriever.invoke("辛弃疾的纪念馆在哪里?")  # "Where is Xin Qiji's memorial hall?"
pretty_print_docs(compressed_docs)
API Reference: ContextualCompressionRetriever | LindormAIRerank
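Rerankers in langchain_community commonly attach their score to each compressed document's metadata, often under a relevance_score key; if LindormAIRerank follows that convention (an assumption, not confirmed here), the scores can be inspected directly:
# Print reranker scores if present; the "relevance_score" key is an
# assumption based on other langchain_community rerankers.
for doc in compressed_docs:
    print(doc.metadata.get("relevance_score"), "-", doc.page_content[:30])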
Delete Index
ld_search_store.delete_index()
Related
- Vector store conceptual guide
- Vector store how-to guides