File size: 2,026 Bytes
25fcb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# =================================================================================
# build_knowledge_base.py: One-time script to build and save the vector store
# =================================================================================
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import config
import data_processing
import os

def build_vector_store():
    """Build a LlamaIndex vector store from processed documents and persist it.

    Loads documents via ``data_processing.load_and_process_all()``, embeds
    them with the HuggingFace model named by ``config.EMBEDDING_MODEL_NAME``,
    splits them into chunks with a ``SentenceSplitter`` (chunk_size=1000,
    chunk_overlap=150), and persists the resulting index to
    ``config.LLAMA_INDEX_STORE_PATH``. Prints progress and returns nothing.
    """
    documents = data_processing.load_and_process_all()

    # Nothing to index — bail out early rather than building an empty store.
    if not documents:
        print("No documents were created. Exiting.")
        return

    # The loader already yields LlamaIndex Document objects, so they can be
    # passed straight to the index builder.
    print(f"Loading embedding model: {config.EMBEDDING_MODEL_NAME}...")
    embedder = HuggingFaceEmbedding(model_name=config.EMBEDDING_MODEL_NAME)

    print("Creating the LlamaIndex vector store...")
    splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=150)
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embedder,
        transformations=[splitter],
    )

    # Write the index (vectors + metadata) to disk for later loading.
    print(f"Saving the vector store to: {config.LLAMA_INDEX_STORE_PATH}")
    index.storage_context.persist(persist_dir=config.LLAMA_INDEX_STORE_PATH)
    print("Vector store built and saved successfully.")

def main():
    """Entry point: build the knowledge base if it has not been built yet.

    Skips the (expensive) build whenever a persisted store already exists
    at ``config.LLAMA_INDEX_STORE_PATH``.
    """
    if not os.path.exists(config.LLAMA_INDEX_STORE_PATH):
        build_vector_store()
    else:
        print("Vector store already exists. Skipping build process.")

if __name__ == "__main__":
    main()