# =================================================================================
# data_processing.py: Process and prepare raw data
# =================================================================================
import json
import re

from llama_index.core import Document
from tqdm import tqdm

import config

# Mapping of raw FDA label field keys -> human-readable section names.
# Hoisted to module level so the dict is not rebuilt for every drug entry.
SECTIONS_TO_PROCESS = {
    "indications_and_usage": "Indications and Usage",
    "adverse_reactions": "Adverse Reactions",
    "drug_interactions": "Drug Interactions",
    "contraindications": "Contraindications",
    "warnings": "Warnings",
    "boxed_warning": "Boxed Warning",
    "mechanism_of_action": "Mechanism of Action",
    "pharmacokinetics": "Pharmacokinetics",
    "dosage_and_administration": "Dosage and Administration",
    "how_supplied": "How Supplied",
    "storage_and_handling": "Storage and Handling",
    "information_for_patients": "Information for Patients",
    "pregnancy": "Pregnancy",
    "nursing_mothers": "Nursing Mothers",
    "pediatric_use": "Pediatric Use",
    "geriatric_use": "Geriatric Use",
}


def clean_text(text: str) -> str:
    """
    Clean the input text by removing common noise from FDA documents.

    Args:
        text: Raw section text from an FDA drug label.

    Returns:
        The cleaned text with revision stamps, divider lines, and
        runs of whitespace removed.
    """
    # Drop revision stamps such as "REVISED: 6/2023".
    text = re.sub(r'REVISED:\s*\d{1,2}/\d{4}', '', text)
    # Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s{2,}', ' ', text).strip()
    # Remove ASCII divider lines made of -, =, or * characters.
    text = re.sub(r'[\-=*]{3,}', '', text)
    return text


def load_and_prepare_documents(json_path=config.RAW_DATA_PATH):
    """
    Load drug data from a JSON file, filter for high-quality entries,
    clean the text, and return a list of LlamaIndex Document objects.

    Args:
        json_path: Path to the raw JSON file (an array of drug entries).

    Returns:
        A list of Document objects, one per non-empty section per drug.
    """
    print(f"Loading data from: {json_path}...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_docs = []
    print("Filtering, cleaning, and converting data to 'Document' objects...")
    for entry in tqdm(data, desc="Processing drug data"):
        if not entry:
            continue

        # --- FILTERING LOGIC ---
        # 1. Ensure the entry has a brand or generic name.
        openfda = entry.get("openfda", {})
        brand_name_list = openfda.get("brand_name")
        generic_name_list = openfda.get("generic_name")
        if not brand_name_list and not generic_name_list:
            continue  # Skip entries with no name

        # 2. Ensure it's likely a real drug by checking for a crucial section.
        if "indications_and_usage" not in entry:
            continue  # Skip entries that don't say what the drug is for

        brand_name = brand_name_list[0] if brand_name_list else "Unknown Brand"
        generic_name = generic_name_list[0] if generic_name_list else "Unknown Generic"
        # -----------------------

        for key, section_name in SECTIONS_TO_PROCESS.items():
            text_list = entry.get(key)
            # Each section value is expected to be a non-empty list whose first
            # element is the section text; skip anything that is blank.
            if text_list and isinstance(text_list, list) and text_list[0] and text_list[0].strip():
                cleaned_text = clean_text(text_list[0])
                if cleaned_text:
                    metadata = {
                        "brand_name": brand_name,
                        "generic_name": generic_name,
                        "section": section_name,
                    }
                    # NOTE: llama_index.core.Document takes its body via `text=`
                    # (`page_content` is a LangChain keyword and would not set
                    # the document text here) — see load_and_prepare_fda_documents.
                    doc = Document(text=cleaned_text, metadata=metadata)
                    all_docs.append(doc)

    print(f"Created a total of {len(all_docs)} 'Document' objects after filtering.")
    return all_docs


def load_and_process_all():
    """
    Load and process documents from all configured data sources.

    Returns:
        A combined list of Document objects from every active source.
    """
    all_docs = []

    # Process FDA drug data
    fda_docs = load_and_prepare_fda_documents()
    all_docs.extend(fda_docs)

    # TODO: Process HealthCareMagic data via
    #   healthcare_data_processing.load_and_prepare_documents(config.HEALTHCARE_MAGIC_PATH)
    # TODO: Process MedQuad data via
    #   medquad_data_processing.load_and_prepare_documents(config.MEDQUAD_PATH)

    print(f"Total documents loaded from all sources: {len(all_docs)}")
    return all_docs


def load_and_prepare_fda_documents(json_path=config.CLEANED_DATA_PATH):
    """
    Load cleaned drug data from a JSON Lines file and convert it into a
    list of LlamaIndex Document objects for the RAG pipeline.

    Args:
        json_path: Path to the cleaned JSONL file (one JSON object per line).

    Returns:
        A list of Document objects, or an empty list if the file is missing
        or contains a malformed line.
    """
    print(f"Loading cleaned drug data from: {json_path}...")
    all_docs = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing cleaned drug data"):
                entry = json.loads(line)
                content = entry.get("content")
                if not content:
                    continue
                metadata = {
                    "doc_id": entry.get("doc_id"),
                    "brand_name": entry.get("brand_name"),
                    "generic_name": entry.get("generic_name"),
                    "section": entry.get("section"),
                    "source": "FDA Drug Labels",
                }
                # The text for the document is just the content of the section
                doc = Document(text=content, metadata=metadata)
                all_docs.append(doc)
    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in '{json_path}'. Details: {e}")
        return []

    print(f"Created {len(all_docs)} 'Document' objects from the cleaned FDA data.")
    return all_docs