PharmaBot / data_processing.py
alperensn's picture
Upload files
25fcb73 verified
# =================================================================================
# data_processing.py: Process and prepare raw data
# =================================================================================
import json
import re
from llama_index.core import Document
from tqdm import tqdm
import config
def clean_text(text: str) -> str:
    """
    Cleans the input text by removing common noise from FDA documents.

    Removes "REVISED: MM/YYYY" stamps and runs of three or more separator
    characters (-, = or *), then collapses every run of two or more
    whitespace characters into a single space and strips both ends.
    """
    text = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    text = re.sub(r'[\-=*]{3,}', '', text)
    # Normalise whitespace last: deleting the noise above can itself leave
    # double spaces or leading/trailing blanks behind.
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text
def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """
    Loads drug data from a JSON file, filters for high-quality entries,
    cleans the text, and returns a list of LlamaIndex Document objects.

    An entry is kept only if it has a brand or generic name under "openfda"
    and contains an "indications_and_usage" section. One Document is created
    per non-empty known section; its metadata carries the brand name, the
    generic name and the human-readable section title.

    Args:
        json_path: Path to the raw JSON file (expected to hold a list of
            label entries).

    Returns:
        A list of Document objects, one per cleaned section.
    """
    # Raw label keys mapped to the section title stored in the metadata.
    # Built once, outside the loop, because it never changes per entry.
    sections_to_process = {
        "indications_and_usage": "Indications and Usage",
        "adverse_reactions": "Adverse Reactions",
        "drug_interactions": "Drug Interactions",
        "contraindications": "Contraindications",
        "warnings": "Warnings",
        "boxed_warning": "Boxed Warning",
        "mechanism_of_action": "Mechanism of Action",
        "pharmacokinetics": "Pharmacokinetics",
        "dosage_and_administration": "Dosage and Administration",
        "how_supplied": "How Supplied",
        "storage_and_handling": "Storage and Handling",
        "information_for_patients": "Information for Patients",
        "pregnancy": "Pregnancy",
        "nursing_mothers": "Nursing Mothers",
        "pediatric_use": "Pediatric Use",
        "geriatric_use": "Geriatric Use",
    }

    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue

        # 1. Ensure the entry has a brand or generic name.
        # `or {}` also covers an "openfda" key that is present but null.
        openfda = entry.get("openfda") or {}
        brand_name_list = openfda.get("brand_name")
        generic_name_list = openfda.get("generic_name")
        if not brand_name_list and not generic_name_list:
            continue  # Skip entries with no name

        # 2. Ensure it's likely a real drug by checking for a crucial section.
        if "indications_and_usage" not in entry:
            continue  # Skip entries that don't say what the drug is for

        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"

        for key, section_name in sections_to_process.items():
            text_list = entry.get(key)
            if not (text_list and isinstance(text_list, list)):
                continue
            # Only the first element of each section list is used.
            raw_text = text_list[0]
            if not isinstance(raw_text, str) or not raw_text.strip():
                continue
            cleaned_text = clean_text(raw_text)
            if not cleaned_text:
                continue
            metadata = {
                "brand_name": brand_name,
                "generic_name": generic_name,
                "section": section_name,
            }
            # LlamaIndex's Document takes the body as `text`;
            # `page_content` is the LangChain keyword.
            all_docs.append(Document(text=cleaned_text, metadata=metadata))

    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs
def load_and_process_all():
    """
    Gathers the documents of every enabled data source into one list.

    Only the FDA drug-label source is active at the moment; the other
    sources are kept below as disabled code.
    """
    # FDA drug data
    documents = list(load_and_prepare_fda_documents())

    # HealthCareMagic data (disabled)
    # healthcare_docs = healthcare_data_processing.load_and_prepare_documents(config.HEALTHCARE_MAGIC_PATH)
    # documents.extend(healthcare_docs)

    # MedQuad data (disabled)
    # medquad_docs = medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)
    # documents.extend(medquad_docs)

    print(f"Total documents loaded from all sources: {len(documents)}")
    return documents
def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """
    Loads cleaned drug data from a JSON Lines file and converts it into
    a list of LlamaIndex Document objects for the RAG pipeline.

    Each non-blank line is parsed as one JSON object. Entries without a
    truthy "content" value are skipped. The document text is the entry's
    "content"; "doc_id", "brand_name", "generic_name" and "section" are
    copied into the metadata together with a fixed "source" label.

    Args:
        json_path: Path to the cleaned JSON Lines file.

    Returns:
        A list of Document objects. An empty list is returned if the file
        does not exist or any non-blank line is not valid JSON.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                # Blank lines (e.g. a trailing empty line) are not JSON;
                # skip them rather than letting one discard the whole file.
                if not line.strip():
                    continue
                entry = json.loads(line)
                content = entry.get("content")
                if not content:
                    continue
                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels"
                }
                # The text for the document is just the content of the section
                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)
    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []
    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs