Sami Ali committed
Commit b098829 · 1 Parent(s): cacd7d3

feat: improve model and prompt

.gitignore CHANGED
@@ -7,6 +7,8 @@ __pycache__/
 data
 demo
 
+test.ipynb
+
 # C extensions
 *.so
 
src/data_processor.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import os
 from src.constant import BASE_DIR
 from langchain.schema import Document
@@ -8,36 +9,31 @@ DATA_DIR = os.path.join(BASE_DIR, "data", "pmc")
 
 class DataProcessor:
     """
-    Handles loading, cleaning, and chunking of text files
+    Handles loading, cleaning, and chunking of text files
     from the PubMed Central (PMC) dataset.
     """
 
-    def __init__(self, data_path: str = DATA_DIR, limit=100):
+    def __init__(self, data_path: str = DATA_DIR):
         self.data_path = data_path
-        self.limit = limit
 
     def _load_files(self) -> list[dict]:
        """
         Load raw text files from the dataset directory.
         Returns a list of dictionaries with file name and raw content.
         """
-        count = 0
         data_list = []
         for file_name in os.listdir(self.data_path):
             if not file_name.endswith(".txt"):
                 continue
             file_path = os.path.join(self.data_path, file_name)
-            with open(file_path, "r", encoding="utf-8") as file_ref:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as file_ref:
                 data_list.append(
                     {
                         "file_name": file_name,
                         "page_content": file_ref.read()
                     }
                 )
-            if count >= self.limit:
-                break
-            count += 1
-
+
         return data_list
 
     @staticmethod
@@ -48,10 +44,103 @@ class DataProcessor:
         if not isinstance(text, str):
             return text
         try:
-            return text.encode("utf-8").decode("unicode-escape")
+            return text.encode("utf-8", "ignore").decode("utf-8", "ignore")
         except Exception:
             return text
 
+    def _extract_body(self, text: str) -> str:
+
+        if not text:
+            return ""
+
+        text = text.replace("\r\n", "\n").replace("\r", "\n")
+        text = re.sub(r'-\n', '', text)
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        start_patterns = [
+            r"====\s*Body", r"^Body\s*$", r"^BODY\s*$",
+            r"^Abstract\s*$", r"^ABSTRACT\s*$", r"^Introduction\s*$", r"^INTRODUCTION\s*$"
+        ]
+        end_patterns = [
+            r"====\s*Back", r"^Back\s*$", r"^BACK\s*$",
+            r"^References\s*$", r"^REFERENCES\s*$", r"^Bibliography\s*$",
+            r"^Acknowledg", r"^Acknowledgments\s*$", r"^ACKNOWLEDGMENTS\s*$"
+        ]
+
+        start_idx = None
+        for pat in start_patterns:
+            m = re.search(pat, text, flags=re.IGNORECASE | re.MULTILINE)
+            if m:
+                start_idx = m.end()
+                break
+
+        if start_idx is not None:
+            # find end after start_idx
+            end_idx = None
+            for pat in end_patterns:
+                m = re.search(pat, text[start_idx:], flags=re.IGNORECASE | re.MULTILINE)
+                if m:
+                    end_idx = start_idx + m.start()
+                    break
+            body = text[start_idx:end_idx] if end_idx else text[start_idx:]
+
+        else:
+            paragraphs = re.split(r'\n{2,}', text)
+            paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
+            def is_metadata_para(p: str) -> bool:
+                # DOI / arXiv / ISSN / PMCID / PMID
+                if re.search(r'\b10\.\d{4,9}/\S+\b', p):
+                    return True
+                if re.search(r'\bPMCID\b|\bPMID\b', p, re.I):
+                    return True
+                if re.search(r'ISSN[:\s]', p, re.I):
+                    return True
+                # common metadata keywords
+                if re.search(r'Correspondence:|Affiliat|Author|ORCID|E-mail:|Contact:', p, re.I):
+                    return True
+                if re.search(r'©|license|all rights reserved|Published|Received|Accepted', p, re.I):
+                    return True
+
+                words = p.split()
+                if len(p) < 200 and len(words) <= 12 and sum(1 for w in words if w.isupper()) / max(1, len(words)) > 0.6:
+                    return True
+                return False
+
+            wc = [len(p.split()) for p in paragraphs]
+            good = [(wc_i >= 40 and not is_metadata_para(p)) for p, wc_i in zip(paragraphs, wc)]
+
+            best_start = best_len = 0
+            cur_start = cur_len = 0
+            for i, g in enumerate(good):
+                if g:
+                    if cur_len == 0:
+                        cur_start = i
+                    cur_len += 1
+                    if cur_len > best_len:
+                        best_len = cur_len
+                        best_start = cur_start
+                else:
+                    cur_len = 0
+            if best_len > 0:
+                body = "\n\n".join(paragraphs[best_start: best_start + best_len])
+            else:
+                # final fallback: pick the top N paragraphs by length (they likely contain body content)
+                top_idxs = sorted(range(len(paragraphs)), key=lambda i: wc[i], reverse=True)[:5]
+                top_idxs.sort()
+                body = "\n\n".join(paragraphs[i] for i in top_idxs)
+
+        body = re.sub(r'\n{2,}References[\s\S]*$', '', body, flags=re.IGNORECASE)
+        body = re.sub(r'\n{2,}Bibliography[\s\S]*$', '', body, flags=re.IGNORECASE)
+        body = re.sub(r'\n{2,}Acknowledg[\s\S]*$', '', body, flags=re.IGNORECASE)
+
+        # Clean junk: remove URLs/emails, collapse whitespace
+        body = re.sub(r'https?://\S+', ' ', body)
+        body = re.sub(r'\S+@\S+', ' ', body)
+        body = re.sub(r'\s+', ' ', body).strip()
+
+        return body
+
     def _preprocess(self, data: list[dict]) -> list[dict]:
         """
         Apply preprocessing steps (e.g., unicode decoding) to raw data.
@@ -59,17 +148,18 @@ class DataProcessor:
         cleaned_data = []
         for record in data:
             decoded_text = self._decode_unicode(record["page_content"])
+            main_body = self._extract_body(decoded_text)
             cleaned_data.append(
                 {
                     "file_name": record["file_name"],
-                    "page_content": decoded_text
+                    "page_content": main_body
                 }
            )
         return cleaned_data
 
     def load_documents(self) -> list[Document]:
         """
-        Load and preprocess text files, converting them into
+        Load and preprocess text files, converting them into
         LangChain Document objects.
         """
         raw_data = self._load_files()
@@ -106,4 +196,4 @@ class DataProcessor:
         """
         documents = self.load_documents()
         chunks = self.chunk_documents(documents)
-        return chunks, documents
+        return chunks, documents
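For context, a minimal sketch of what the new body-extraction path does, assuming the repository layout above; the sample text below is hypothetical, and relies on the "==== Body" / "==== Back" markers that the patterns in _extract_body look for:

from src.data_processor import DataProcessor

# Hypothetical PMC-style file content with explicit Body/Back markers.
sample = (
    "Journal of Examples, PMID 123456, Correspondence: someone@example.org\n\n"
    "==== Body\n"
    "Aspirin irreversibly inhibits cyclooxygenase-1 in platelets...\n\n"
    "==== Back\n"
    "References\n1. A citation.\n"
)

dp = DataProcessor()               # data_path defaults to DATA_DIR; the directory is not read here
print(dp._extract_body(sample))    # -> "Aspirin irreversibly inhibits cyclooxygenase-1 in platelets..."

# Full pipeline (requires the downloaded .txt files under the data directory):
# chunks, documents = dp.build()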
src/download_data.py CHANGED
@@ -17,6 +17,9 @@ def download_pmc_docs(
     target_dir=TARGET_DIR,
     limit=1000
 ):
+    if os.path.isdir(target_dir) and len(os.listdir(target_dir)) > 0:
+        return
+
     os.makedirs(target_dir, exist_ok=True)
 
     s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
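For reference, a minimal invocation sketch, assuming TARGET_DIR in src/download_data.py points at the local data directory and the unsigned S3 client above can reach the public PMC bucket:

from src.download_data import download_pmc_docs

# Pulls up to `limit` PMC .txt files into TARGET_DIR;
# with the new guard it returns immediately if files are already present.
download_pmc_docs(limit=1000)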
src/embedding.py CHANGED
@@ -1,17 +1,20 @@
-from typing import List
 import numpy as np
-from langchain_huggingface import HuggingFaceEmbeddings
+import torch
+from typing import List
+from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
 
 class EmbeddingManager:
     def __init__(self, model_name: str = "pritamdeka/S-BioBERT-snli-multinli-stsb"):
         self.model_name = model_name
         self.model = None
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.load_model()
 
     def load_model(self):
         print("Loading embedding model:", self.model_name)
-        self.model = HuggingFaceEmbeddings(model_name=self.model_name)
+        print('Using device', self.device)
+        self.model = SentenceTransformer(self.model_name, device=self.device)
         print("Model loaded.")
 
     def get_model(self):
@@ -24,10 +27,9 @@ class EmbeddingManager:
         embeddings = []
         for i in tqdm(range(0, len(texts), batch_size), desc="Embedding texts"):
             batch = texts[i:i + batch_size]
-            emb = self.model.embed_documents(batch)
+            emb = self.model.encode(batch, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
             embeddings.extend(emb)
+        return np.vstack(embeddings)
 
-        return np.array(embeddings)
-
-    def embed_one(self, text: str) -> np.ndarray:
-        return self.model.embed_query(text)
+    def embed_query(self, text: str) -> np.ndarray:
+        return self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True).flatten()
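A short usage sketch of the reworked embedder. The batch method's name is not visible in this hunk, so `embed_texts` below is an assumed stand-in for it; sentence-transformers must be installed and the model is fetched from the Hub on first use. Because embeddings are normalized, cosine similarity reduces to a dot product:

from src.embedding import EmbeddingManager

em = EmbeddingManager()   # picks CUDA when available, otherwise CPU

# `embed_texts` stands in for the batch method whose body is patched above.
doc_vecs = em.embed_texts([
    "Aspirin inhibits platelet aggregation.",
    "Metformin reduces hepatic glucose output.",
])
query_vec = em.embed_query("How does aspirin affect platelets?")

# normalize_embeddings=True makes cosine similarity equal to the dot product.
scores = doc_vecs @ query_vec
print(scores.argmax())    # index of the best-matching text (expected: 0)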
 
 
src/prompt.py CHANGED
@@ -1,22 +1,16 @@
 from langchain_core.prompts import PromptTemplate
 
-BIOMED_PROMPT = """
-You are MedRAG, an assistant specialized in biomedical research.
-Your job is to answer the question using ONLY the provided context.
+BIOMED_PROMPT = """You are a scholarly assistant analyzing historical medical texts.
+Use the retrieved documents to answer the user's question as completely as possible.
+If the context implies but does not explicitly state a detail, you may infer it cautiously.
 
-If the answer cannot be found in the context, say clearly:
-"I could not find an exact answer in the provided research papers."
-
-Always cite the source PMC IDs in your answer.
+Context:
+{context}
 
 Question:
 {question}
 
-Context:
-{context}
-
-Answer:
-"""
+Answer in a clear, factual summary style."""
 
 prompt = PromptTemplate(
     template=BIOMED_PROMPT,
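To illustrate how the new template is filled: the PromptTemplate constructor is truncated in this hunk, so input_variables=["context", "question"] is assumed, and the context string below is invented for the example:

from src.prompt import prompt

filled = prompt.format(
    context="PMC1234567: Aspirin irreversibly acetylates COX-1 in platelets ...",
    question="How does aspirin affect platelet function?",
)
print(filled)   # the rendered prompt string handed to the model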
streamlit_app.py CHANGED
@@ -9,7 +9,7 @@ import streamlit as st
 
 @st.cache_resource(show_spinner="🔄 Building pipeline...")
 def load_pipeline():
-    limit = 1000
+    limit = 2000
     download_pmc_docs(limit=limit)
     dp = DataProcessor()
     chunks, document = dp.build()