luciagomez committed
Commit c175b07 · verified · 1 Parent(s): 1e1e0e1

upload v1 of Dockerfile, rag, app, requirements and utils

Files changed (5)
  1. Dockerfile +24 -0
  2. app.py +191 -0
  3. rag.py +47 -0
  4. requirements.txt +11 -0
  5. utils.py +91 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # System deps
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential git curl && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Python deps
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # App
+ COPY . .
+
+ ENV HF_HOME=/app/.cache/huggingface
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
+
+ # Cache tokenizer (optional; uses the cache dirs set above)
+ RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3')"
+
+ EXPOSE 7860
+ CMD ["python", "app.py"]
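A quick local sanity check, assuming Docker is installed (the image tag is illustrative): `docker build -t philanthropy-assistant .`, then `docker run -p 7860:7860 -e SERPAPI_KEY=<your-key> philanthropy-assistant`, and open http://localhost:7860. Note that the tokenizer-warming RUN step may require a Hugging Face token at build time if the Mistral repo is gated for your account.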
app.py ADDED
@@ -0,0 +1,191 @@
+ import os, re, json, pandas as pd, gradio as gr, torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from typing import Optional
+
+ from utils import (init_provenance_db, log_provenance, ensure_foundation_year_dir,
+                    download_pdf, find_best_report_url, DATA_DIR)
+ from rag import add_pdf_to_index, get_retriever
+
+ # ---------- Data & DB ----------
+ FOUNDATIONS_CSV = "data/foundations.csv"
+ foundations = pd.read_csv(FOUNDATIONS_CSV, dtype={"id": "int"})
+ init_provenance_db()
+
+ # ---------- LLM (local Mistral) ----------
+ MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
+ DEVICE = 0 if torch.cuda.is_available() else -1
+ gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=DEVICE,
+                max_new_tokens=512, do_sample=False)
+
+ # ---------- MCP tool: fetch_annual_report ----------
+ def tool_fetch_annual_report(foundation_id: int, foundation_name: Optional[str] = None,
+                              year: Optional[int] = None, search_terms: Optional[str] = None,
+                              save_title: Optional[str] = None) -> dict:
+     if not foundation_name:
+         row = foundations[foundations["id"] == int(foundation_id)]
+         if row.empty:
+             return {"status": "error", "message": f"Unknown foundation_id={foundation_id}"}
+         foundation_name = row.iloc[0]["name"]
+
+     try:
+         best = find_best_report_url(foundation_name, year, search_terms, serpapi_key=os.getenv("SERPAPI_KEY"))
+         if not best:
+             return {"status": "not_found", "message": "No suitable report URL found."}
+         url = best.get("link")
+         title = save_title or best.get("title") or f"{foundation_name}-annual-report-{year or ''}".strip("-")
+
+         save_dir = ensure_foundation_year_dir(int(foundation_id), year)
+         saved_path = download_pdf(url, save_dir, preferred_name=f"{title}.pdf")
+
+         # Ingest into FAISS for RAG
+         add_pdf_to_index(saved_path, metadata={
+             "foundation_id": int(foundation_id),
+             "year": year,
+             "file_path": saved_path,
+             "source_url": url,
+             "title": title,
+             "doc_type": "annual_report"
+         })
+
+         # Provenance
+         log_provenance(int(foundation_id), year, title, "annual_report", saved_path, url)
+
+         return {"status": "ok", "url": url, "saved_path": saved_path, "message": f"Stored & indexed: {saved_path}"}
+     except Exception as e:
+         return {"status": "error", "message": str(e)}
+
+ # ---------- MCP extraction ----------
+ def extract_function_call(text: str):
+     try:
+         data = json.loads(text.strip())
+         if isinstance(data, dict) and "function" in data and "parameters" in data:
+             return data["function"], data["parameters"]
+     except Exception:
+         pass
+     return None, None
+
+ def system_prompt(context: str, user_question: str) -> str:
+     return f"""You are a Swiss philanthropy assistant with a tool (MCP-style).
+
+ TOOL CALL FORMAT (STRICT JSON ONLY when calling a tool):
+ {{
+   "function": "fetch_annual_report",
+   "parameters": {{
+     "foundation_id": <int>,
+     "foundation_name": "<string, optional>",
+     "year": <int, optional>,
+     "search_terms": "<string, optional>"
+   }}
+ }}
+
+ RULES:
+ - If you need an annual report PDF URL, output ONLY the JSON tool call above.
+ - Prefer precise PDF URLs; the tool will download and index the PDF automatically.
+ - If you already have enough info to answer, reply normally in concise plain text.
+
+ Context:
+ {context}
+
+ User:
+ {user_question}
+
+ Your response (either JSON tool call or plain text):
+ """
+
+ def llm(prompt: str) -> str:
+     out = gen(prompt)[0]["generated_text"]
+     # Return only the new segment after the prompt to avoid echoing it
+     return out[len(prompt):].strip() if out.startswith(prompt) else out.strip()
+
+ def mcp_orchestrate(user_question: str):
+     context = ""
+     used_tool = False
+     for _ in range(3):
+         raw = llm(system_prompt(context, user_question))
+         fname, params = extract_function_call(raw)
+         if fname == "fetch_annual_report":
+             # Fill a missing year heuristically from the question
+             if not params.get("year"):
+                 m = re.search(r"\b(20\d{2}|19\d{2})\b", user_question)
+                 if m: params["year"] = int(m.group(1))
+             res = tool_fetch_annual_report(
+                 foundation_id=int(params["foundation_id"]),
+                 foundation_name=params.get("foundation_name"),
+                 year=params.get("year"),
+                 search_terms=params.get("search_terms"),
+             )
+             context += f"\n[tool:fetch_annual_report -> {json.dumps(res, ensure_ascii=False)}]\n"
+             used_tool = True
+             continue
+         else:
+             return raw, used_tool
+     # Final pass to get a text response after tool use
+     final = llm(system_prompt(context, user_question))
+     return final, used_tool
+
+ # ---------- RAG answering ----------
+ def rag_answer(question: str):
+     retriever = get_retriever(k=5)
+     # Simple manual RAG: fetch relevant chunks and stuff them into the prompt
+     docs = retriever.get_relevant_documents(question)
+     sources = []
+     context = ""
+     for d in docs:
+         sources.append({
+             "page_content": d.page_content[:500],
+             "file_path": d.metadata.get("file_path"),
+             "page": d.metadata.get("page", "N/A"),
+             "year": d.metadata.get("year"),
+             "foundation_id": d.metadata.get("foundation_id")
+         })
+         context += f"\n[Source chunk]\n{d.page_content}\n"
+
+     prompt = f"""You are answering based ONLY on the context chunks below. If unsure, say you don't know.
+ Context:
+ {context}
+
+ Question: {question}
+ Answer concisely:"""
+     answer = llm(prompt)
+     return answer, sources
+
+ # ---------- Gradio UI ----------
+ def ask(user_input: str):
+     # 1) Let the model decide if it needs to call the fetch tool
+     model_reply, used_tool = mcp_orchestrate(user_input)
+
+     # 2) Always try a RAG answer (in case the user asked about content)
+     rag_resp, sources = rag_answer(user_input)
+
+     # If a tool ran, show the model's confirmation plus the RAG answer;
+     # otherwise prioritize the grounded RAG answer.
+     if used_tool and model_reply:
+         header = "✅ Tool used: report fetched/indexed.\n\n"
+         final = header + model_reply + "\n\n" + "— RAG answer —\n" + rag_resp
+     else:
+         final = rag_resp
+
+     # Pretty-print the top sources
+     src_lines = []
+     for s in sources[:3]:
+         src_lines.append(f"- {s.get('file_path')} (page {s.get('page')}, year={s.get('year')}, id={s.get('foundation_id')})")
+     if src_lines:
+         final += "\n\nSources:\n" + "\n".join(src_lines)
+
+     return final
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## Swiss Philanthropy Assistant (Mistral + MCP/SerpAPI + RAG/FAISS)")
+     gr.Markdown("Ask to fetch a foundation’s annual report (by ID/name/year), then ask questions about its content. PDFs are downloaded, indexed, and made queryable.")
+     inp = gr.Textbox(label="Your question", placeholder="e.g., Fetch the 2023 annual report for foundation ID 1, then summarize grants by theme.")
+     out = gr.Textbox(label="Assistant", lines=18)
+     btn = gr.Button("Ask")
+     btn.click(ask, inputs=inp, outputs=out)
+     inp.submit(ask, inputs=inp, outputs=out)
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
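For reference, a minimal sketch of the round trip the orchestrator expects: the model emits strict JSON matching the prompt's tool-call format, `extract_function_call` parses it, and `mcp_orchestrate` dispatches to `tool_fetch_annual_report`. The foundation values below are illustrative:

```python
import json

# Illustrative model output following the STRICT JSON tool-call format
raw_reply = json.dumps({
    "function": "fetch_annual_report",
    "parameters": {"foundation_id": 1, "year": 2023}
})

fname, params = extract_function_call(raw_reply)
assert fname == "fetch_annual_report" and params["foundation_id"] == 1
# mcp_orchestrate would now call tool_fetch_annual_report(...) and append the
# JSON result to the context for the next LLM pass
```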
rag.py ADDED
@@ -0,0 +1,47 @@
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+
+ import faiss
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ INDEX_DIR = Path("data/vectorstore/faiss_index")
+ INDEX_DIR.mkdir(parents=True, exist_ok=True)
+
+ # Small but strong enough embedding model for CPU
+ EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ def load_embeddings():
+     return HuggingFaceEmbeddings(model_name=EMB_MODEL)
+
+ def split_pdf(file_path: str):
+     loader = PyPDFLoader(file_path)
+     pages = loader.load()
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
+     return splitter.split_documents(pages)
+
+ def _faiss_paths():
+     return str(INDEX_DIR / "index.faiss"), str(INDEX_DIR / "index.pkl")
+
+ def load_or_create_faiss(emb):
+     faiss_path, pkl_path = _faiss_paths()
+     if Path(faiss_path).exists() and Path(pkl_path).exists():
+         return FAISS.load_local(str(INDEX_DIR), emb, allow_dangerous_deserialization=True)
+     # No persisted index yet: build an empty one sized to the embedding dimension
+     dim = len(emb.embed_query("probe"))
+     return FAISS(embedding_function=emb, index=faiss.IndexFlatL2(dim),
+                  docstore=InMemoryDocstore(), index_to_docstore_id={})
+
+ def add_pdf_to_index(file_path: str, metadata: Optional[Dict[str, Any]] = None):
+     emb = load_embeddings()
+     vectordb = load_or_create_faiss(emb)
+     splits = split_pdf(file_path)
+     # Attach metadata to each chunk
+     md = metadata or {}
+     for d in splits:
+         d.metadata.update(md)
+     vectordb.add_documents(splits)
+     vectordb.save_local(str(INDEX_DIR))
+
+ def get_retriever(k: int = 4):
+     emb = load_embeddings()
+     vectordb = load_or_create_faiss(emb)
+     return vectordb.as_retriever(search_kwargs={"k": k})
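A minimal usage sketch of this module, assuming a PDF already sits on disk (the path and metadata below are illustrative):

```python
from rag import add_pdf_to_index, get_retriever

# Chunk, embed, and persist a locally stored report (hypothetical path/metadata)
add_pdf_to_index("data/1_data/2023/example_annual_report.pdf",
                 metadata={"foundation_id": 1, "year": 2023, "doc_type": "annual_report"})

# Query the persisted index the same way app.py does
retriever = get_retriever(k=3)
for doc in retriever.get_relevant_documents("total grants awarded in 2023"):
    print(doc.metadata.get("file_path"), doc.page_content[:120])
```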
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==4.44.1
+ transformers>=4.42.0
+ torch>=2.1.0
+ pandas
+ requests
+ python-dateutil
+ faiss-cpu
+ pypdf
+ langchain>=0.2.7
+ langchain-community>=0.2.7
+ sentence-transformers>=2.6.1
utils.py ADDED
@@ -0,0 +1,91 @@
+ import os, re, sqlite3, datetime, requests
+ from pathlib import Path
+ from typing import Optional, List, Dict
+
+ DATA_DIR = Path("data")
+ PROV_DB = "provenance.db"
+
+ # ---------- SQLite provenance ----------
+ def init_provenance_db(db_path: str = PROV_DB):
+     conn = sqlite3.connect(db_path)
+     c = conn.cursor()
+     c.execute("""
+         CREATE TABLE IF NOT EXISTS retrieved_docs (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             foundation_id INTEGER NOT NULL,
+             year INTEGER,
+             title TEXT,
+             doc_type TEXT,
+             file_path TEXT,
+             source_url TEXT,
+             fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
+         )""")
+     conn.commit(); conn.close()
+
+ def log_provenance(foundation_id: int, year: Optional[int], title: str,
+                    doc_type: str, file_path: str, source_url: str,
+                    db_path: str = PROV_DB):
+     conn = sqlite3.connect(db_path); c = conn.cursor()
+     c.execute("""INSERT INTO retrieved_docs
+                  (foundation_id, year, title, doc_type, file_path, source_url, fetched_at)
+                  VALUES (?,?,?,?,?,?,?)""",
+               (foundation_id, year, title, doc_type, file_path, source_url,
+                datetime.datetime.now().isoformat()))
+     conn.commit(); conn.close()
+
+ # ---------- Filesystem ----------
+ def safe_filename(name: str) -> str:
+     name = re.sub(r"[^\w\-. ]+", "_", name)
+     return re.sub(r"\s+", "_", name).strip("_")
+
+ def ensure_foundation_year_dir(fid: int, year: Optional[int]) -> Path:
+     base = DATA_DIR / f"{fid}_data"
+     if year: base = base / str(year)
+     base.mkdir(parents=True, exist_ok=True)
+     return base
+
+ def download_pdf(url: str, save_dir: Path, preferred_name: Optional[str] = None) -> str:
+     filename = preferred_name or url.split("/")[-1].split("?")[0]
+     if not filename.lower().endswith(".pdf"):
+         filename += ".pdf"
+     filename = safe_filename(filename)
+     target = save_dir / filename
+     r = requests.get(url, stream=True, timeout=30); r.raise_for_status()
+     with open(target, "wb") as f:
+         for chunk in r.iter_content(8192):
+             if chunk: f.write(chunk)
+     return str(target)
+
+ # ---------- SerpAPI search ----------
+ def serpapi_search(query: str, num_results: int = 20, serpapi_key: Optional[str] = None) -> List[Dict]:
+     key = serpapi_key or os.getenv("SERPAPI_KEY")
+     if not key:
+         raise RuntimeError("SERPAPI_KEY not set (add it in HF Space Secrets).")
+     params = {"engine": "google", "q": query, "num": num_results, "api_key": key}
+     resp = requests.get("https://serpapi.com/search", params=params, timeout=20)
+     resp.raise_for_status()
+     return resp.json().get("organic_results", [])
+
+ def _is_pdf_link(link: str) -> bool:
+     # endswith(".pdf") is subsumed by the substring check
+     return ".pdf" in link.lower()
+
+ def score_candidate(item: Dict, foundation_name: str, year: Optional[int]) -> float:
+     title = (item.get("title") or "").lower()
+     link = (item.get("link") or "").lower()
+     score = 0.0
+     if any(k in title for k in ["annual", "report", "jahresbericht", "rapport", "rapport annuel"]): score += 2
+     if foundation_name.lower()[:10] in title or foundation_name.lower()[:10] in link: score += 1.5
+     if year and (str(year) in title or str(year) in link): score += 1.5
+     if _is_pdf_link(link): score += 1.0
+     return score
+
+ def find_best_report_url(foundation_name: str, year: Optional[int], extra_terms: Optional[str], serpapi_key: Optional[str]) -> Optional[Dict]:
+     q = f'{foundation_name} annual report'
+     if year: q += f' {year}'
+     if extra_terms: q += f' {extra_terms}'
+     q += ' filetype:pdf site:org | site:ch | site:foundation | site:stiftung | site:fondation'
+     results = serpapi_search(q, num_results=20, serpapi_key=serpapi_key)
+     if not results: return None
+     ranked = sorted(results, key=lambda r: score_candidate(r, foundation_name, year), reverse=True)
+     return ranked[0]
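Because every successful fetch is logged by `log_provenance`, provenance can be audited directly from SQLite; a minimal sketch against the schema defined above:

```python
import sqlite3

# List the five most recently fetched documents with their sources
conn = sqlite3.connect("provenance.db")
rows = conn.execute("""SELECT foundation_id, year, title, source_url, fetched_at
                       FROM retrieved_docs ORDER BY fetched_at DESC LIMIT 5""").fetchall()
conn.close()
for r in rows:
    print(r)
```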