Sami Ali committed on
Commit
5f540b8
·
1 Parent(s): 1660531

implement data downloader script

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. app.py +13 -7
  3. src/download_data.py +45 -0
  4. src/llm.py +22 -0
  5. src/vectorstore.py +1 -1
.gitignore CHANGED
@@ -5,6 +5,7 @@ __pycache__/
5
 
6
  # data
7
  data
 
8
 
9
  # C extensions
10
  *.so
 
5
 
6
  # data
7
  data
8
+ demo
9
 
10
  # C extensions
11
  *.so
app.py CHANGED
@@ -1,12 +1,18 @@
1
  from src.data_processor import DataProcessor
2
  from src.embedding import EmbeddingManager
3
  from src.vectorstore import VectorStore
 
4
 
5
  if __name__ == '__main__':
6
- dp = DataProcessor()
7
- chunks, document = dp.build()
8
- embd = EmbeddingManager()
9
- chunks_embedding = embd.embed_texts(chunks)
10
- vectorstore = VectorStore()
11
- vectorstore.add_documents(chunks, chunks_embedding)
12
- retriver = vectorstore.get_retriever()
 
 
 
 
 
 
1
from src.data_processor import DataProcessor
from src.embedding import EmbeddingManager
from src.vectorstore import VectorStore
from src.download_data import download_pmc_docs

if __name__ == '__main__':
    # Build the vector index end-to-end: (optionally) download the corpus,
    # chunk it, embed the chunks, and store them for retrieval.
    #
    # NOTE(review): the download step is deliberately disabled here; restore
    # `flag = download_pmc_docs()` to fetch the PMC documents on first run.
    flag = True  # download_pmc_docs()
    if flag:
        dp = DataProcessor()
        chunks, documents = dp.build()  # `documents` is currently unused
        # VectorStore/embedding APIs take raw strings, not Document objects.
        chunks_list = [c.page_content for c in chunks]
        embd = EmbeddingManager()
        embd_model = embd.get_model()
        chunks_embedding = embd.embed_texts(chunks_list)
        vectorstore = VectorStore()
        vectorstore.add_documents(chunks, chunks_embedding)
        retriever = vectorstore.get_retriever(embd_model)  # fixed typo: "retriver"
src/download_data.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from src.constant import BASE_DIR

import boto3
from botocore import UNSIGNED
from botocore.client import Config


# Local directory the PMC text files are downloaded into.
TARGET_DIR = os.path.join(BASE_DIR, "data", "demo")


def download_pmc_docs(
    bucket="pmc-oa-opendata",
    prefix="oa_comm/txt/all",
    target_dir=TARGET_DIR,
    limit=10,
):
    """Download up to `limit` PMC open-access ``.txt`` documents from S3.

    Uses anonymous (unsigned) access against the public PMC bucket and
    skips keys that already exist locally (pre-existing files do not
    count toward `limit`).

    Args:
        bucket: Source S3 bucket name.
        prefix: Key prefix to list under.
        target_dir: Local directory; created if missing.
        limit: Maximum number of new files to download.

    Returns:
        True on completion — both when the limit is reached and when the
        listing is exhausted — so callers can treat the result as a
        success flag.
    """
    os.makedirs(target_dir, exist_ok=True)

    # Public bucket: no AWS credentials required.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    paginator = s3.get_paginator('list_objects_v2')

    downloaded = 0

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".txt"):
                continue
            filename = os.path.basename(key)
            local_path = os.path.join(target_dir, filename)

            if not os.path.exists(local_path):
                s3.download_file(bucket, key, local_path)
                downloaded += 1

            if downloaded >= limit:
                print(f"✅ Reached limit of {limit} documents.")
                # BUG FIX: was a bare `return` (None) — falsy, so callers
                # checking the result saw "failure" on the successful
                # limit-reached path. Return True like the other exit.
                return True

    print(f"✅ Finished. Total downloaded: {downloaded}")

    return True
+
src/llm.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from dotenv import load_dotenv

# Pull GOOGLE_API_KEY (and any other settings) from a local .env file.
load_dotenv()


class LLM:
    """Thin wrapper tying a Gemini chat model to a retrieval-QA chain."""

    def __init__(self, retriever):
        api_key = os.getenv("GOOGLE_API_KEY")
        model = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            google_api_key=api_key,
        )
        self.llm = model
        # "stuff" chain type: all retrieved documents are placed into a
        # single prompt; sources are returned alongside the answer.
        self.qa = RetrievalQA.from_chain_type(
            llm=model,
            retriever=retriever,
            chain_type="stuff",
            return_source_documents=True
        )

    def invoke(self, query: str):
        """Run the QA chain on `query` and return its full result dict."""
        return self.qa.invoke({"query": query})
src/vectorstore.py CHANGED
@@ -8,7 +8,7 @@ from langchain.vectorstores import Chroma
8
  from langchain.schema import Document
9
  from uuid import uuid4
10
 
11
- DATA_DIR = os.path.join(BASE_DIR, "db")
12
 
13
 
14
  class VectorStore:
 
8
  from langchain.schema import Document
9
  from uuid import uuid4
10
 
11
+ DATA_DIR = os.path.join(BASE_DIR, "data", "db")
12
 
13
 
14
  class VectorStore: