alperensn committed on
Commit 25fcb73 · verified · 1 Parent(s): cb96db0

Upload files

Files changed (9)
  1. app.py +121 -0
  2. build_knowledge_base.py +55 -0
  3. config.py +34 -0
  4. dataFetch.py +79 -0
  5. dataPrep.py +199 -0
  6. data_processing.py +138 -0
  7. rag_pipeline.py +139 -0
  8. requirements.txt +130 -0
  9. vector_store_manager.py +29 -0
app.py ADDED
@@ -0,0 +1,121 @@
+ # =================================================================================
+ # app.py: Main application file for the Streamlit web interface
+ # =================================================================================
+ import time
+ import streamlit as st
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Import the modules we've created
+ import config
+ import rag_pipeline  # Now using the LlamaIndex pipeline
+
+ # --- Page Configuration ---
+ # st.set_page_config must be the first Streamlit command and may only be called once.
+ st.set_page_config(
+     page_title="PharmaBot",
+     page_icon="🤖",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # --- State Management ---
+ def initialize_state():
+     """Initializes session state variables."""
+     if "messages" not in st.session_state:
+         st.session_state.messages = [{"role": "assistant", "content": "Welcome to PharmaBot! How can I help you today?"}]
+     if "query_engine" not in st.session_state:
+         st.session_state.query_engine = None
+     if "initialized" not in st.session_state:
+         st.session_state.initialized = False
+
+ # --- UI Components ---
+ def setup_sidebar():
+     """Sets up the sidebar with app information."""
+     with st.sidebar:
+         st.header("About PharmaBot")
+         st.info(
+             "PharmaBot is an AI assistant designed to answer questions about "
+             "pharmaceuticals based on a knowledge base of FDA drug label documents. "
+             "It uses a Retrieval-Augmented Generation (RAG) pipeline to provide accurate, "
+             "context-aware answers."
+         )
+         st.warning(
+             "**Disclaimer: I am an AI assistant, not a medical professional. This information is for educational purposes only. Please consult with a qualified healthcare provider for any health concerns or before making any medical decisions.**"
+         )
+         st.markdown("---")
+         st.header("Technical Details")
+         st.markdown(
+             f"""
+             - **LLM Model:** `{config.LLM_MODEL_ID}`
+             - **Embedding Model:** `{config.EMBEDDING_MODEL_NAME}`
+             - **Vector Type:** `LlamaIndex Vector Store`
+             - **Vector Store:** `{config.VECTOR_STORE_PATH}`
+             """
+         )
+
+ def display_chat_history():
+     """Displays the chat history."""
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.write(message["content"])
+
+ def handle_user_input(chat_engine):
+     """Handles user input and displays the response."""
+     if prompt := st.chat_input("Ask me anything about pharmaceuticals..."):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.write(prompt)
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 response = chat_engine.chat(prompt)
+                 st.write(str(response))
+
+         st.session_state.messages.append({"role": "assistant", "content": str(response)})
+
+ # --- Main Application Logic ---
+ def main():
+     """Main function to run the Streamlit app."""
+     initialize_state()
+     st.title("💊 PharmaBot: Your AI Pharmaceutical Assistant")
+     setup_sidebar()
+
+     # Initialize the RAG pipeline if it hasn't been already
+     if not st.session_state.initialized:
+         with st.status("Initializing the RAG pipeline...", expanded=True) as status:
+             try:
+                 status.write("Step 1/3: Initializing LLM and embedding models...")
+                 rag_pipeline.initialize_llm_and_embed_model()
+
+                 status.write("Step 2/3: Loading vector index from storage...")
+                 index = rag_pipeline.load_vector_index()
+
+                 status.write("Step 3/3: Building the conversational chat engine...")
+                 st.session_state.query_engine = rag_pipeline.build_query_engine(index)
+
+                 st.session_state.initialized = True
+                 status.update(label="Initialization Complete!", state="complete", expanded=False)
+                 time.sleep(1)  # Brief pause to show completion
+
+             except FileNotFoundError as e:
+                 status.update(label="Initialization Failed", state="error")
+                 st.error(f"Error: {e}. Please make sure the vector store is built.")
+                 st.warning("To build the vector store, run `python build_knowledge_base.py` from your terminal.")
+                 return
+             except Exception as e:
+                 status.update(label="Initialization Failed", state="error")
+                 st.error(f"An unexpected error occurred during initialization: {e}")
+                 return
+         st.rerun()
+
+     # Display chat and handle input if initialized
+     if st.session_state.initialized:
+         display_chat_history()
+         handle_user_input(st.session_state.query_engine)
+
+ if __name__ == "__main__":
+     main()
+
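Note on configuration: load_dotenv() above implies a local .env file. The only key read explicitly in this commit is HUGGING_FACE_TOKEN (see rag_pipeline.py); the Gemini LLM is assumed to pick up a GOOGLE_API_KEY from the environment as well, per the google-generativeai client's defaults, so treat that variable name as an assumption. A minimal pre-flight check, as a sketch under those assumptions:

# check_env.py (hypothetical helper, not part of this commit)
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory
for key in ("GOOGLE_API_KEY", "HUGGING_FACE_TOKEN"):
    print(f"{key}: {'set' if os.getenv(key) else 'MISSING'}")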
build_knowledge_base.py ADDED
@@ -0,0 +1,55 @@
+ # =================================================================================
+ # build_knowledge_base.py: One-time script to build and save the vector store
+ # =================================================================================
+ from llama_index.core import VectorStoreIndex, Document
+ from llama_index.core.node_parser import SentenceSplitter
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ import config
+ import data_processing
+ import os
+
+ def build_vector_store():
+     """
+     Builds and saves a LlamaIndex vector store from the processed documents.
+     """
+     # Load and process documents from all sources
+     all_docs = data_processing.load_and_process_all()
+
+     # If no documents were created, exit
+     if not all_docs:
+         print("No documents were created. Exiting.")
+         return
+
+     # The documents are already in the correct LlamaIndex format.
+     llama_documents = all_docs
+
+     # Initialize the embedding model
+     print(f"Loading embedding model: {config.EMBEDDING_MODEL_NAME}...")
+     embed_model = HuggingFaceEmbedding(model_name=config.EMBEDDING_MODEL_NAME)
+
+     # Create the LlamaIndex VectorStoreIndex
+     print("Creating the LlamaIndex vector store...")
+     index = VectorStoreIndex.from_documents(
+         llama_documents,
+         embed_model=embed_model,
+         transformations=[SentenceSplitter(chunk_size=1000, chunk_overlap=150)]
+     )
+
+     # Persist the index to disk
+     print(f"Saving the vector store to: {config.LLAMA_INDEX_STORE_PATH}")
+     index.storage_context.persist(persist_dir=config.LLAMA_INDEX_STORE_PATH)
+     print("Vector store built and saved successfully.")
+
+ def main():
+     """
+     Main function to build the knowledge base.
+     """
+     # Check if the vector store already exists
+     if os.path.exists(config.LLAMA_INDEX_STORE_PATH):
+         print("Vector store already exists. Skipping build process.")
+     else:
+         build_vector_store()
+
+ if __name__ == "__main__":
+     main()
+
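Once the index has been persisted, a quick smoke test can confirm it loads and answers. A minimal sketch that reuses the helpers from rag_pipeline.py, assuming the .env keys are in place and the store was built as above:

# smoke_test.py (hypothetical, not part of this commit)
from dotenv import load_dotenv
import rag_pipeline

load_dotenv()
rag_pipeline.initialize_llm_and_embed_model()   # sets Settings.llm / Settings.embed_model
index = rag_pipeline.load_vector_index()        # raises FileNotFoundError if the store is missing
print(index.as_query_engine(similarity_top_k=3).query("What is ibuprofen used for?"))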
config.py ADDED
@@ -0,0 +1,34 @@
+ # =================================================================================
+ # config.py: Project configuration settings
+ # =================================================================================
+ # This file contains constant parameters like model names, file paths, etc.
+ # Sensitive information like API keys will be read from the .env file.
+
+ # --- Model Settings ---
+ # The main language model to be used in the RAG chain
+ LLM_MODEL_ID = "gemini-2.0-flash-001"
+
+ # The embedding model for converting text to vectors
+
+ EMBEDDING_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
+
+ # --- File Paths ---
+ # Path to the raw data downloaded from the openFDA API
+ RAW_DATA_PATH = "./fda_data/drug_labels_all.json"
+ # Path to the cleaned/processed data
+ CLEANED_DATA_PATH = "./fda_data/fda_data_processed.jsonl"
+
+ # The name of the folder where the vector database will be saved
+ VECTOR_STORE_PATH = "llamaIndexVectorBase_fda"
+
+ # =================================================================================
+ # LlamaIndex Settings
+ # =================================================================================
+ LLAMA_INDEX_STORE_PATH = "./llamaIndexVectorBase_fda"
+
+ # =================================================================================
+ # Data Source Paths
+ # =================================================================================
+ ##HEALTHCARE_MAGIC_PATH = "../healthCareMagic/HealthCareMagic-100k.json"
+ ##MEDQUAD_PATH = "../medQuad/medDataset.json"
+
dataFetch.py ADDED
@@ -0,0 +1,79 @@
+ import requests
+ import json
+ import os
+ import math
+
+ # Define the API endpoint
+ API_URL = "https://api.fda.gov/drug/label.json"
+
+ # Define the output directory and file for all data
+ OUTPUT_DIR = "fda_data"
+ OUTPUT_FILE = os.path.join(OUTPUT_DIR, "drug_labels_all.json")
+
+ # The API's maximum limit per request is 1000
+ CHUNK_SIZE = 1000
+ MAX_RECORDS = 25000  # openFDA caps skip-based paging at 25,000 records
+
+ def fetch_all_fda_data():
+     """
+     Fetches drug label data from the openFDA API using pagination
+     and saves it to a single file.
+     """
+     print("Starting to fetch data from the openFDA endpoint...")
+
+     try:
+         # Step 1: Make an initial request to get the total number of records
+         print("Determining the total number of records...")
+         initial_response = requests.get(API_URL, params={"limit": 1})
+         initial_response.raise_for_status()
+         total_records = initial_response.json()['meta']['results']['total']
+
+         records_to_fetch = min(total_records, MAX_RECORDS)
+         print(f"Found a total of {total_records} records. Fetching up to {records_to_fetch} records.")
+
+         all_results = []
+
+         # Step 2: Loop through the data in chunks
+         num_chunks = math.ceil(records_to_fetch / CHUNK_SIZE)
+         for i in range(num_chunks):
+             skip = i * CHUNK_SIZE
+
+             # Ensure we don't request more than records_to_fetch
+             limit = min(CHUNK_SIZE, records_to_fetch - skip)
+             if limit <= 0:
+                 break
+
+             params = {"limit": limit, "skip": skip}
+
+             print(f"Fetching chunk {i+1}/{num_chunks} (records {skip} to {skip + limit - 1})...")
+
+             response = requests.get(API_URL, params=params)
+             response.raise_for_status()
+
+             chunk_data = response.json()
+             if 'results' in chunk_data:
+                 all_results.extend(chunk_data['results'])
+
+         print("\nAll data has been fetched successfully.")
+
+         # Step 3: Save all the data to a single file
+         if not os.path.exists(OUTPUT_DIR):
+             os.makedirs(OUTPUT_DIR)
+             print(f"Created directory: {OUTPUT_DIR}")
+
+         with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+             json.dump({"results": all_results}, f, ensure_ascii=False, indent=4)
+
+         print(f"All {len(all_results)} records saved to: {OUTPUT_FILE}")
+
+     except requests.exceptions.HTTPError as http_err:
+         print(f"HTTP error occurred: {http_err}")
+     except requests.exceptions.RequestException as req_err:
+         print(f"An error occurred while fetching data: {req_err}")
+     except json.JSONDecodeError:
+         print("Failed to parse the response as JSON.")
+     except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+
+ if __name__ == "__main__":
+     fetch_all_fda_data()
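For orientation, the file this script writes mirrors the openFDA response shape, which is what dataPrep.py and data_processing.py below expect. A trimmed example, shown as a Python-style literal with made-up field values:

# Rough shape of fda_data/drug_labels_all.json (illustrative values only)
{
    "results": [
        {
            "openfda": {"brand_name": ["Advil"], "generic_name": ["IBUPROFEN"]},
            "indications_and_usage": ["temporarily relieves minor aches and pains ..."],
            "warnings": ["..."]
        }
    ]
}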
dataPrep.py ADDED
@@ -0,0 +1,199 @@
+ import json
+ import re
+ from tqdm import tqdm
+ import os
+ import config
+
+ # --- Functions from dataOrganize.py ---
+
+ def clean_text(text: str) -> str:
+     """
+     Cleans the input text by removing common noise from FDA documents.
+     """
+     if not text:
+         return ""
+     text = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
+     text = re.sub(r'\s{2,}', ' ', text).strip()
+     text = re.sub(r'[\-=*]{3,}', '', text)
+     return text
+
+ def organize_drug_data(input_path):
+     """
+     Loads raw drug data, filters for high-quality entries, cleans the text,
+     and returns the organized data as a list.
+     """
+     print(f"Loading raw data from: {input_path}...")
+     try:
+         with open(input_path, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+     except FileNotFoundError:
+         print(f"Error: The file '{input_path}' was not found.")
+         return []
+     except json.JSONDecodeError:
+         print(f"Error: Could not decode JSON from '{input_path}'.")
+         return []
+
+     entries = data.get('results', data) if isinstance(data, dict) else data
+
+     if not isinstance(entries, list):
+         print("Error: The JSON data is not in the expected list format.")
+         return []
+
+     organized_data = []
+     print("Filtering, cleaning, and organizing drug data...")
+
+     for entry in tqdm(entries, desc="Processing drug entries"):
+         if not isinstance(entry, dict):
+             continue
+
+         openfda = entry.get("openfda", {})
+         brand_name_list = openfda.get("brand_name")
+         generic_name_list = openfda.get("generic_name")
+
+         if not brand_name_list and not generic_name_list:
+             continue
+
+         if "indications_and_usage" not in entry:
+             continue
+
+         brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
+         generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"
+
+         sections_to_extract = {
+             "indications_and_usage": "Indications and Usage", "adverse_reactions": "Adverse Reactions",
+             "drug_interactions": "Drug Interactions", "contraindications": "Contraindications",
+             "warnings": "Warnings", "boxed_warning": "Boxed Warning",
+             "mechanism_of_action": "Mechanism of Action", "pharmacokinetics": "Pharmacokinetics",
+             "dosage_and_administration": "Dosage and Administration", "how_supplied": "How Supplied",
+             "storage_and_handling": "Storage and Handling", "information_for_patients": "Information for Patients",
+             "pregnancy": "Pregnancy", "nursing_mothers": "Nursing Mothers",
+             "pediatric_use": "Pediatric Use", "geriatric_use": "Geriatric Use"
+         }
+
+         processed_sections = {}
+         for key, section_name in sections_to_extract.items():
+             text_list = entry.get(key)
+             if text_list and isinstance(text_list, list) and text_list[0]:
+                 cleaned_text = clean_text(text_list[0])
+                 if cleaned_text:
+                     processed_sections[section_name] = cleaned_text
+
+         if processed_sections:
+             organized_entry = {
+                 "brand_name": brand_name,
+                 "generic_name": generic_name,
+                 "sections": processed_sections
+             }
+             organized_data.append(organized_entry)
+
+     print(f"Found {len(organized_data)} high-quality drug entries.")
+     return organized_data
+
+ # --- Functions from deduplicate_drugs.py ---
+
+ def deduplicate_drugs(data):
+     """
+     Deduplicates a list of drugs based on brand_name and generic_name.
+     """
+     print(f"Deduplicating {len(data)} drugs...")
+     seen_drugs = set()
+     deduplicated_drugs = []
+
+     for drug in data:
+         brand_name = drug.get('brand_name')
+         generic_name = drug.get('generic_name')
+
+         if isinstance(brand_name, list):
+             brand_name = brand_name[0] if brand_name else None
+         if isinstance(generic_name, list):
+             generic_name = generic_name[0] if generic_name else None
+
+         brand_name_lower = brand_name.lower() if brand_name else None
+         generic_name_lower = generic_name.lower() if generic_name else None
+
+         drug_identifier = (brand_name_lower, generic_name_lower)
+
+         if drug_identifier not in seen_drugs:
+             seen_drugs.add(drug_identifier)
+             deduplicated_drugs.append(drug)
+
+     print(f"Deduplication complete. Found {len(deduplicated_drugs)} unique drugs.")
+     return deduplicated_drugs
+
+ # --- Functions from format_fda_data.py ---
+
+ def generate_section_id(section_title):
+     """Generates a simplified, lowercase, underscore-separated ID from a section title."""
+     s = re.sub(r'[/\-&]', ' ', section_title)
+     s = re.sub(r'[^a-zA-Z0-9\s]', '', s)
+     parts = s.lower().split()
+     if len(parts) >= 2:
+         return '_'.join(parts[:2])
+     elif len(parts) == 1:
+         return parts[0]
+     else:
+         return "section"
+
+ def transform_drug_data(drugs, output_file_path):
+     """
+     Transforms drug data to a JSON Lines format.
+     """
+     print(f"Transforming {len(drugs)} drugs to JSONL format...")
+     processed_records = []
+
+     for drug in drugs:
+         generic_name = drug.get('generic_name')
+         sections = drug.get('sections')
+
+         if not generic_name or not isinstance(sections, dict):
+             continue
+
+         if isinstance(generic_name, list):
+             generic_name = generic_name[0] if generic_name else None
+
+         if not generic_name:
+             continue
+
+         generic_name_upper = generic_name.upper()
+
+         for section_title, section_content in sections.items():
+             if not section_title or not section_content:
+                 continue
+
+             section_id = generate_section_id(section_title)
+             doc_id = f"{generic_name_upper.replace(' ', '_')}_{section_id}"
+
+             record = {
+                 "doc_id": doc_id,
+                 "generic_name": generic_name_upper,
+                 "section": section_title,
+                 "content": section_content.strip()
+             }
+             processed_records.append(json.dumps(record))
+
+     os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+     with open(output_file_path, 'w') as f_out:
+         f_out.write('\n'.join(processed_records))
+
+     print(f"Transformation complete. {len(processed_records)} records created.")
+     print(f"Transformed data saved to: {output_file_path}")
+
+
+ if __name__ == '__main__':
+     # Define file paths using config
+     raw_data_path = config.RAW_DATA_PATH
+     cleaned_data_path = config.CLEANED_DATA_PATH
+
+     # --- Run the full pipeline ---
+     print("--- Starting Data Preparation Pipeline ---")
+
+     # Step 1: Organize and clean the raw data in memory
+     organized_data = organize_drug_data(raw_data_path)
+
+     # Step 2: Deduplicate the cleaned data in memory
+     deduplicated_data = deduplicate_drugs(organized_data)
+
+     # Step 3: Transform the deduplicated data and write to the final file
+     transform_drug_data(deduplicated_data, cleaned_data_path)
+
+     print("--- Data Preparation Pipeline Finished ---")
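Each line of the resulting JSONL file is a standalone JSON object with the four keys assembled in transform_drug_data. For example, the "Indications and Usage" section of a drug with generic name IBUPROFEN would produce a record like the following (the content text is illustrative only):

# One line of fda_data/fda_data_processed.jsonl (content abbreviated for the example)
{"doc_id": "IBUPROFEN_indications_and", "generic_name": "IBUPROFEN", "section": "Indications and Usage", "content": "temporarily relieves minor aches and pains ..."}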
data_processing.py ADDED
@@ -0,0 +1,138 @@
+ # =================================================================================
+ # data_processing.py: Process and prepare raw data
+ # =================================================================================
+ import json
+ import re
+ from llama_index.core import Document
+ from tqdm import tqdm
+ import config
+
+ def clean_text(text: str) -> str:
+     """
+     Cleans the input text by removing common noise from FDA documents.
+     """
+     text = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
+     text = re.sub(r'\s{2,}', ' ', text).strip()
+     text = re.sub(r'[\-=*]{3,}', '', text)
+     return text
+
+ def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
+     """
+     Loads drug data from a JSON file, filters for high-quality entries,
+     cleans the text, and returns a list of LlamaIndex Document objects.
+     """
+     print(f"Loading data from: {json_path}...")
+     with open(json_path, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     all_docs = []
+     print("Filtering, cleaning, and converting data to 'Document' objects...")
+     for entry in tqdm(data.get("results", []), desc="Processing drug data"):
+         if not entry: continue
+
+         # --- NEW FILTERING LOGIC ---
+         # 1. Ensure the entry has a brand or generic name.
+         brand_name_list = entry.get("openfda", {}).get("brand_name")
+         generic_name_list = entry.get("openfda", {}).get("generic_name")
+
+         if not brand_name_list and not generic_name_list:
+             continue  # Skip entries with no name
+
+         # 2. Ensure it's likely a real drug by checking for a crucial section.
+         if "indications_and_usage" not in entry:
+             continue  # Skip entries that don't say what the drug is for
+
+         brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
+         generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"
+         # ---------------------------
+
+         sections_to_process = {
+             "indications_and_usage": "Indications and Usage",
+             "adverse_reactions": "Adverse Reactions",
+             "drug_interactions": "Drug Interactions",
+             "contraindications": "Contraindications",
+             "warnings": "Warnings",
+             "boxed_warning": "Boxed Warning",
+             "mechanism_of_action": "Mechanism of Action",
+             "pharmacokinetics": "Pharmacokinetics",
+             "dosage_and_administration": "Dosage and Administration",
+             "how_supplied": "How Supplied",
+             "storage_and_handling": "Storage and Handling",
+             "information_for_patients": "Information for Patients",
+             "pregnancy": "Pregnancy",
+             "nursing_mothers": "Nursing Mothers",
+             "pediatric_use": "Pediatric Use",
+             "geriatric_use": "Geriatric Use"
+         }
+
+         for key, section_name in sections_to_process.items():
+             text_list = entry.get(key)
+             if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
+                 cleaned_text = clean_text(text_list[0])
+                 if cleaned_text:
+                     metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
+                     doc = Document(text=cleaned_text, metadata=metadata)
+                     all_docs.append(doc)
+
+     print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
+     return all_docs
+
+ def load_and_process_all():
+     """
+     Loads and processes documents from all configured data sources.
+     """
+     all_docs = []
+
+     # Process FDA drug data
+     fda_docs = load_and_prepare_fda_documents()
+     all_docs.extend(fda_docs)
+
+     # Process HealthCareMagic data
+     # healthcare_docs = healthcare_data_processing.load_and_prepare_documents(config.HEALTHCARE_MAGIC_PATH)
+     # all_docs.extend(healthcare_docs)
+
+     '''# Process MedQuad data
+     medquad_docs = medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)
+     all_docs.extend(medquad_docs)'''
+
+     print(f"Total documents loaded from all sources: {len(all_docs)}")
+     return all_docs
+
+ def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
+     """
+     Loads cleaned drug data from a JSON Lines file and converts it into
+     a list of LlamaIndex Document objects for the RAG pipeline.
+     """
+     print(f"Loading cleaned drug data from: {json_path}...")
+     all_docs = []
+     try:
+         with open(json_path, 'r', encoding='utf-8') as f:
+             for line in tqdm(f, desc="Processing cleaned drug data"):
+                 entry = json.loads(line)
+
+                 content = entry.get("content")
+                 if not content:
+                     continue
+
+                 metadata = {
+                     "doc_id": entry.get("doc_id"),
+                     "brand_name": entry.get("brand_name"),  # not written by dataPrep.py, so this is usually None
+                     "generic_name": entry.get("generic_name"),
+                     "section": entry.get("section"),
+                     "source": "FDA Drug Labels"
+                 }
+
+                 # The text for the document is just the content of the section
+                 doc = Document(text=content, metadata=metadata)
+                 all_docs.append(doc)
+
+     except FileNotFoundError:
+         print(f"Error: The file '{json_path}' was not found.")
+         return []
+     except json.JSONDecodeError as e:
+         print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
+         return []
+
+     print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
+     return all_docs
+
rag_pipeline.py ADDED
@@ -0,0 +1,139 @@
+ # =================================================================================
+ # rag_pipeline.py: Create the Gemini model and the RAG chain
+ # =================================================================================
+ from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
+ from llama_index.llms.gemini import Gemini
+ from llama_index.core.prompts.base import PromptTemplate
+ from llama_index.core.prompts import ChatPromptTemplate
+ from llama_index.core.llms import ChatMessage, MessageRole
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.core import Settings
+ from llama_index.core.memory import ChatMemoryBuffer
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
+ import config
+ import os
+
+ def initialize_llm_and_embed_model():
+     """
+     Initializes and sets the global LLM and embedding model for LlamaIndex.
+     """
+     print(f"Initializing Gemini model: {config.LLM_MODEL_ID}...")
+
+     # Define safety settings to be less restrictive, especially for medical content
+     safety_settings = {
+         HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+     }
+
+     # System instruction for Gemini (if supported by your version)
+     system_instruction = (
+         "You are PharmaBot, an AI pharmaceutical information assistant. "
+         "You provide accurate information from FDA drug labels but never give medical advice or diagnose conditions. "
+         "You always respond in the user's language and maintain conversation context throughout the session."
+     )
+
+     llm = Gemini(
+         model_name=config.LLM_MODEL_ID,
+         temperature=0.3,
+         safety_settings=safety_settings,
+         generation_config={"candidate_count": 1},
+         system_instruction=system_instruction  # Add system instruction
+     )
+
+     print(f"Loading embedding model: {config.EMBEDDING_MODEL_NAME}...")
+
+     # Get the token from environment variables
+     hf_token = os.getenv("HUGGING_FACE_TOKEN")
+     if not hf_token:
+         print("Warning: HUGGING_FACE_TOKEN environment variable not set.")
+
+     embed_model = HuggingFaceEmbedding(
+         model_name=config.EMBEDDING_MODEL_NAME,
+         token=hf_token
+     )
+
+     # Set the global models for LlamaIndex
+     Settings.llm = llm
+     Settings.embed_model = embed_model
+
+ def load_vector_index():
+     """
+     Loads the LlamaIndex vector index from storage.
+     """
+     if not os.path.exists(config.LLAMA_INDEX_STORE_PATH):
+         raise FileNotFoundError(f"LlamaIndex store not found at {config.LLAMA_INDEX_STORE_PATH}. Please run build_knowledge_base.py first.")
+
+     print("Loading LlamaIndex vector store...")
+     storage_context = StorageContext.from_defaults(persist_dir=config.LLAMA_INDEX_STORE_PATH)
+     index = load_index_from_storage(storage_context)
+     return index
+
+
+ def build_query_engine(index):
+     """
+     Builds a query engine from the LlamaIndex vector index.
+     """
+
+     # Condensed, action-oriented prompt that guides behavior without being conversational
+     qa_template_str = (
+         "Context information from FDA drug labels:\n"
+         "---------------------\n"
+         "{context_str}\n"
+         "---------------------\n\n"
+         "Instructions:\n"
+         "1. LANGUAGE: Respond entirely in the same language as the query. Detect: English, Turkish, Spanish, French, German, Arabic, etc.\n"
+         "2. QUERY TYPE:\n"
+         "   - Medical/Drug query (medications, symptoms, dosages, interactions) → Use context to provide structured response\n"
+         "   - General conversation (greetings, small talk) → Respond conversationally, no context needed\n"
+         "3. CONTEXT CHECK:\n"
+         "   - If context is empty/irrelevant → State you couldn't find information, ask for clarification\n"
+         "   - If context is relevant → Proceed with response\n"
+         "4. RESPONSE FORMAT FOR DRUG QUERIES:\n"
+         "   **Drug Name:** [from brand_name/generic_name]\n"
+         "   **What It's Used For:** [summarize indications_and_usage]\n"
+         "   **How to Take It:** [summarize dosage_and_administration]\n"
+         "   **Important Warnings:** [list 4-5 critical points from warnings/adverse_reactions/contraindications]\n"
+         "   **Drug Interactions:** [if available from drug_interactions]\n"
+         "5. RESPONSE FORMAT FOR DRUG INTERACTIONS:\n"
+         "   **Drug Interaction: [Drug A] and [Drug B]**\n"
+         "   **Interaction Found:** [describe]\n"
+         "   **Clinical Significance:** [explain risks]\n"
+         "   **Recommendation:** [FDA guidance]\n"
+         "6. RESPONSE FORMAT FOR SYMPTOM QUERIES (first ask):\n"
+         "   Ask 5 clarifying questions: duration, severity, prior medications, current medications, allergies\n"
+         "7. RESPONSE FORMAT FOR SYMPTOM QUERIES (after details):\n"
+         "   Present 2-3 FDA-approved medication options with: Type, Used For, Dosage, Key Warning\n"
+         "8. SAFETY:\n"
+         "   - Only use info from context for medical responses\n"
+         "   - If details missing from context, state explicitly\n"
+         "   - ALWAYS end medical responses with:\n"
+         "   ⚠️ Disclaimer: I am an AI assistant, not a medical professional. This information is from FDA labels and is for educational purposes only. Always consult your doctor or pharmacist before taking any medication.\n"
+         "9. MEMORY: Reference previous drugs/symptoms/allergies mentioned in conversation\n\n"
+         "Query: {query_str}\n\n"
+         "Answer (in same language as query):"
+     )
+
+     qa_template = PromptTemplate(qa_template_str)
+
+     print("Building query engine...")
+
+     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
+
+     # Use simple chat mode to avoid condense_question_prompt issues
+     # The chat mode will still maintain conversation history through memory
+     query_engine = index.as_chat_engine(
+         chat_mode="context",  # Changed from "condense_question" to "context"
+         memory=memory,
+         system_prompt=(
+             "You are PharmaBot, an AI pharmaceutical information assistant. "
+             "Always respond in the user's language. Use FDA drug label data to answer medical queries. "
+             "Never diagnose or prescribe. Include disclaimers on medical responses."
+         ),
+         context_template=qa_template,  # Use our custom template
+         similarity_top_k=5,
+         verbose=True
+     )
+
+     return query_engine
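The three functions above are consumed by app.py, but they can also be exercised from a plain terminal. A minimal sketch, assuming the vector store has been built and the API keys are loaded from .env:

# chat_cli.py (hypothetical, not part of this commit)
from dotenv import load_dotenv
import rag_pipeline

load_dotenv()
rag_pipeline.initialize_llm_and_embed_model()
engine = rag_pipeline.build_query_engine(rag_pipeline.load_vector_index())
while True:
    question = input("You: ")
    if question.lower() in {"exit", "quit"}:
        break
    print("PharmaBot:", engine.chat(question))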
requirements.txt ADDED
@@ -0,0 +1,130 @@
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.1
+ aiosignal==1.4.0
+ aiosqlite==0.21.0
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.11.0
+ attrs==25.4.0
+ banks==2.2.0
+ beautifulsoup4==4.14.2
+ blinker==1.9.0
+ cachetools==6.2.1
+ certifi==2025.10.5
+ charset-normalizer==3.4.4
+ click==8.3.0
+ colorama==0.4.6
+ dataclasses-json==0.6.7
+ defusedxml==0.7.1
+ Deprecated==1.2.18
+ dirtyjson==1.0.8
+ distro==1.9.0
+ filelock==3.20.0
+ filetype==1.2.0
+ frozenlist==1.8.0
+ fsspec==2025.9.0
+ gitdb==4.0.12
+ GitPython==3.1.45
+ google-ai-generativelanguage==0.6.15
+ google-api-core==2.26.0
+ google-api-python-client==2.185.0
+ google-auth==2.41.1
+ google-auth-httplib2==0.2.0
+ google-generativeai==0.8.5
+ googleapis-common-protos==1.70.0
+ greenlet==3.2.4
+ griffe==1.14.0
+ grpcio==1.75.1
+ grpcio-status==1.71.2
+ h11==0.16.0
+ hf-xet==1.1.10
+ httpcore==1.0.9
+ httplib2==0.31.0
+ httpx==0.28.1
+ huggingface-hub==0.35.3
+ idna==3.11
+ Jinja2==3.1.6
+ jiter==0.11.1
+ joblib==1.5.2
+ jsonschema==4.25.1
+ jsonschema-specifications==2025.9.1
+ llama-cloud==0.1.35
+ llama-cloud-services==0.6.54
+ llama-index==0.14.5
+ llama-index-cli==0.5.3
+ llama-index-core==0.14.5
+ llama-index-embeddings-huggingface==0.6.1
+ llama-index-embeddings-openai==0.5.1
+ llama-index-indices-managed-llama-cloud==0.9.4
+ llama-index-instrumentation==0.4.2
+ llama-index-llms-gemini==0.6.1
+ llama-index-llms-openai==0.6.5
+ llama-index-readers-file==0.5.4
+ llama-index-readers-llama-parse==0.5.1
+ llama-index-workflows==2.8.3
+ llama-parse==0.6.54
+ MarkupSafe==3.0.3
+ marshmallow==3.26.1
+ mpmath==1.3.0
+ multidict==6.7.0
+ mypy_extensions==1.1.0
+ narwhals==2.9.0
+ nest-asyncio==1.6.0
+ networkx==3.5
+ nltk==3.9.2
+ numpy==2.3.4
+ openai==1.109.1
+ packaging==25.0
+ pandas==2.2.3
+ pillow==10.4.0
+ platformdirs==4.5.0
+ propcache==0.4.1
+ proto-plus==1.26.1
+ protobuf==5.29.5
+ pyarrow==21.0.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pydantic==2.12.3
+ pydantic_core==2.41.4
+ pydeck==0.9.1
+ pyparsing==3.2.5
+ pypdf==6.1.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ PyYAML==6.0.3
+ referencing==0.37.0
+ regex==2025.9.18
+ requests==2.32.5
+ rpds-py==0.27.1
+ rsa==4.9.1
+ safetensors==0.6.2
+ scikit-learn==1.7.2
+ scipy==1.16.2
+ sentence-transformers==5.1.1
+ setuptools==80.9.0
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.8
+ SQLAlchemy==2.0.44
+ streamlit==1.50.0
+ striprtf==0.0.26
+ sympy==1.14.0
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tiktoken==0.12.0
+ tokenizers==0.22.1
+ toml==0.10.2
+ torch==2.9.0
+ tornado==6.5.2
+ tqdm==4.67.1
+ transformers==4.57.1
+ typing-inspect==0.9.0
+ typing-inspection==0.4.2
+ typing_extensions==4.15.0
+ tzdata==2025.2
+ uritemplate==4.2.0
+ urllib3==2.5.0
+ wrapt==1.17.3
+ yarl==1.22.0
vector_store_manager.py ADDED
@@ -0,0 +1,29 @@
+ # =================================================================================
+ # vector_store_manager.py: Management of the FAISS vector database (legacy LangChain path, not used by the LlamaIndex pipeline)
+ # =================================================================================
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ import config
+
+ def get_embeddings_model(model_name=config.EMBEDDING_MODEL_NAME):
+     """Loads and returns the embedding model."""
+     print(f"Loading embedding model: {model_name}...")
+     return HuggingFaceEmbeddings(model_name=model_name)
+
+ def create_and_save_store(documents, embeddings, save_path=config.VECTOR_STORE_PATH):
+     """
+     Creates a FAISS vector database from the given documents and saves it to disk.
+     """
+     print("Creating and saving the FAISS vector store...")
+     vector_store = FAISS.from_documents(documents, embeddings)
+     vector_store.save_local(save_path)
+     print(f"✅ Vector store successfully saved to '{save_path}'.")
+
+ def load_store(embeddings, load_path=config.VECTOR_STORE_PATH):
+     """
+     Loads the FAISS vector database from a local path.
+     """
+     print(f"Loading vector store from: {load_path}...")
+     # The allow_dangerous_deserialization flag is required for loading FAISS indexes with LangChain.
+     return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
+
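If this legacy FAISS path is ever exercised, note that langchain-community and faiss-cpu are not listed in requirements.txt and would need to be installed separately, and that FAISS.from_documents expects LangChain Document objects rather than the LlamaIndex Documents produced by data_processing.py. A minimal sketch under those assumptions:

# faiss_demo.py (hypothetical, not part of this commit; requires langchain-community and faiss-cpu)
from langchain_core.documents import Document
import vector_store_manager as vsm

embeddings = vsm.get_embeddings_model()
docs = [Document(page_content="IBUPROFEN - Indications and Usage: ...", metadata={"generic_name": "IBUPROFEN"})]
vsm.create_and_save_store(docs, embeddings)
store = vsm.load_store(embeddings)
print(store.similarity_search("ibuprofen", k=1))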