import json
import re

from llama_index.core import Document
from tqdm import tqdm

import config


def clean_text(text: str) -> str:
    """
    Cleans the input text by removing common noise from FDA documents.
    """
    # Drop revision date stamps such as "REVISED: 11/2023".
    text = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    # Collapse runs of whitespace into single spaces and trim the ends.
    text = re.sub(r'\s{2,}', ' ', text).strip()
    # Remove separator artifacts such as "-----", "=====", or "***".
    text = re.sub(r'[\-=*]{3,}', '', text)
    return text
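# Illustrative behaviour of clean_text (the sample string is hypothetical, not
# taken from the source data):
#   clean_text("REVISED: 11/2023  Take   with food. -----")
# drops the revision stamp, squeezes the whitespace, and strips the separator,
# yielding roughly "Take with food." (a stray space can remain where a separator
# was removed, because the whitespace collapse runs before the final substitution).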
def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """
    Loads drug data from a JSON file, filters for high-quality entries,
    cleans the text, and returns a list of LlamaIndex Document objects.
    """
    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
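    # The loop below assumes entries shaped like records from the openFDA drug
    # label export (illustrative, abbreviated):
    #   {
    #     "openfda": {"brand_name": ["..."], "generic_name": ["..."]},
    #     "indications_and_usage": ["..."],
    #     "warnings": ["..."],
    #     ...
    #   }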
    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue

        brand_name_list = entry.get("openfda", {}).get("brand_name")
        generic_name_list = entry.get("openfda", {}).get("generic_name")

        # Skip entries that carry neither a brand nor a generic name.
        if not brand_name_list and not generic_name_list:
            continue

        # Require at least an indications section as a quality filter.
        if "indications_and_usage" not in entry:
            continue

        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"

        sections_to_process = {
            "indications_and_usage": "Indications and Usage",
            "adverse_reactions": "Adverse Reactions",
            "drug_interactions": "Drug Interactions",
            "contraindications": "Contraindications",
            "warnings": "Warnings",
            "boxed_warning": "Boxed Warning",
            "mechanism_of_action": "Mechanism of Action",
            "pharmacokinetics": "Pharmacokinetics",
            "dosage_and_administration": "Dosage and Administration",
            "how_supplied": "How Supplied",
            "storage_and_handling": "Storage and Handling",
            "information_for_patients": "Information for Patients",
            "pregnancy": "Pregnancy",
            "nursing_mothers": "Nursing Mothers",
            "pediatric_use": "Pediatric Use",
            "geriatric_use": "Geriatric Use"
        }

        for key, section_name in sections_to_process.items():
            text_list = entry.get(key)
            if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
                cleaned_text = clean_text(text_list[0])
                if cleaned_text:
                    metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
                    doc = Document(text=cleaned_text, metadata=metadata)
                    all_docs.append(doc)

    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs
def load_and_process_all():
    """
    Loads and processes documents from all configured data sources.
    """
    all_docs = []

    # Process the cleaned FDA drug label data.
    fda_docs = load_and_prepare_fda_documents()
    all_docs.extend(fda_docs)

    # Process MedQuad data (currently disabled).
    # medquad_docs = medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)
    # all_docs.extend(medquad_docs)

    print(f"Total documents loaded from all sources: {len(all_docs)}")
    return all_docs
def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """
    Loads cleaned drug data from a JSON Lines file and converts it into
    a list of LlamaIndex Document objects for the RAG pipeline.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
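    # Assumed input format (illustrative): one JSON object per line, for example
    #   {"doc_id": "...", "brand_name": "...", "generic_name": "...",
    #    "section": "Warnings", "content": "..."}
    # These keys mirror what the loop below reads from each record.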
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                entry = json.loads(line)

                # Skip records without usable text.
                content = entry.get("content")
                if not content:
                    continue

                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels"
                }

                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)

    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []

    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs
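# Illustrative entry point, not part of the original module: a minimal sketch
# that assumes config.CLEANED_DATA_PATH points at the cleaned JSONL export
# read by load_and_prepare_fda_documents above.
if __name__ == "__main__":
    docs = load_and_process_all()
    if docs:
        sample = docs[0]
        print(f"Sample section: {sample.metadata.get('section')}")
        print(f"Sample text: {sample.text[:200]}...")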