"""Similarity search over the foundations dataset using the bge-m3 model.

On import, this module downloads ``bgem3/foundations.parquet`` from the
Hugging Face dataset repository named by ``DATASET_REPO`` and loads it into
the module-level DataFrame ``df``.
"""

import heapq
import json
import os
from operator import itemgetter

import huggingface_hub
import pandas as pd
from huggingface_hub import InferenceClient
from huggingface_hub import hf_hub_download

HF_TOKEN_read = os.getenv("HF_TOKEN_read")
DATASET_REPO = "luciagomez/MrPhil_vector"

# -------------------------------------------------------------------
# 1. Download files from Hugging Face dataset
# -------------------------------------------------------------------
parquet_path = hf_hub_download(
    repo_id=DATASET_REPO,
    filename="bgem3/foundations.parquet",
    repo_type="dataset",
    token=HF_TOKEN_read,
    cache_dir="/tmp/huggingface",
)
# This bge-m3 flow relies only on similarity search, so no FAISS index is
# needed. It is simpler than the ICL approach; it is another flavour of RAG.

# -------------------------------------------------------------------
# 2. Load data
# -------------------------------------------------------------------
df = pd.read_parquet(parquet_path, engine="pyarrow")


# -------------------------------------------------------------------
# 3. Function to find similar foundations via API
# -------------------------------------------------------------------
# The inference client is defined in app.py; only the function lives here.
def find_similar_foundations_api(query: str, client, top_k: int = 5) -> list:
    """Return the ``top_k`` foundations whose purpose best matches ``query``.

    Every non-null ``Purpose`` in the module-level ``df`` is scored against
    ``query`` with ``client.sentence_similarity`` (model ``BAAI/bge-m3``), and
    the highest-scoring rows are returned in descending score order.

    Args:
        query: Source sentence to compare against each foundation purpose.
        client: Object exposing ``sentence_similarity(source_sentence=...,
            other_sentences=..., model=...)`` that returns one score per
            sentence, in the same order as ``other_sentences``.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list without calling the API.

    Returns:
        A list of dicts with keys ``"score"`` (float), ``"Title"`` and
        ``"Purpose"``, best match first.

    Raises:
        ValueError: If ``df`` lacks a ``Title`` or ``Purpose`` column.
    """
    if "Title" not in df.columns or "Purpose" not in df.columns:
        raise ValueError("Dataset must contain 'Title' and 'Purpose' columns.")

    # A negative top_k would otherwise slice from the end of the ranking, and
    # zero would pay for a remote call only to discard every score.
    if top_k <= 0:
        return []

    # Rows without a purpose cannot be scored as sentences; drop them while
    # keeping each title aligned with its purpose.
    candidates = df.loc[df["Purpose"].notna(), ["Title", "Purpose"]]
    titles = candidates["Title"].tolist()
    purposes = candidates["Purpose"].tolist()
    if not purposes:
        return []

    # Compute similarity against all missions; scores align with `purposes`.
    scores = client.sentence_similarity(
        source_sentence=query,
        other_sentences=purposes,
        model="BAAI/bge-m3",
    )

    # nlargest is equivalent to sorted(..., reverse=True)[:top_k] without
    # ordering the rows that will not be returned.
    best = heapq.nlargest(
        top_k,
        zip(scores, titles, purposes),
        key=itemgetter(0),
    )
    return [
        {"score": float(score), "Title": title, "Purpose": purpose}
        for score, title, purpose in best
    ]