File size: 2,026 Bytes
25fcb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# =================================================================================
# build_knowledge_base.py: One-time script to build and save the vector store
# =================================================================================
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import config
import data_processing
import os

def build_vector_store():
    """Build a LlamaIndex vector store from processed documents and persist it.

    Loads documents via ``data_processing.load_and_process_all()``, embeds
    them with the HuggingFace model named by ``config.EMBEDDING_MODEL_NAME``,
    splits them into chunks with a ``SentenceSplitter`` (chunk_size=1000,
    chunk_overlap=150), and persists the resulting index to
    ``config.LLAMA_INDEX_STORE_PATH``. Prints progress and returns nothing.
    """
    documents = data_processing.load_and_process_all()

    # Nothing to index — bail out early rather than building an empty store.
    if not documents:
        print("No documents were created. Exiting.")
        return

    # The loader already yields LlamaIndex Document objects, so they can be
    # passed straight to the index builder.
    print(f"Loading embedding model: {config.EMBEDDING_MODEL_NAME}...")
    embedder = HuggingFaceEmbedding(model_name=config.EMBEDDING_MODEL_NAME)

    print("Creating the LlamaIndex vector store...")
    splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=150)
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embedder,
        transformations=[splitter],
    )

    # Write the index (vectors + metadata) to disk for later loading.
    print(f"Saving the vector store to: {config.LLAMA_INDEX_STORE_PATH}")
    index.storage_context.persist(persist_dir=config.LLAMA_INDEX_STORE_PATH)
    print("Vector store built and saved successfully.")

def main():
    """Entry point: build the knowledge base if it has not been built yet.

    Skips the (expensive) build whenever a persisted store already exists
    at ``config.LLAMA_INDEX_STORE_PATH``.
    """
    if not os.path.exists(config.LLAMA_INDEX_STORE_PATH):
        build_vector_store()
    else:
        print("Vector store already exists. Skipping build process.")

if __name__ == "__main__":
    main()