import heapq
import json
import os
from operator import itemgetter

import huggingface_hub
import pandas as pd
from huggingface_hub import InferenceClient
from huggingface_hub import hf_hub_download

# Hugging Face dataset repository that holds the foundations table.
DATASET_REPO = "luciagomez/MrPhil_vector"

# Read token taken from the environment; None when the variable is unset.
HF_TOKEN_read = os.environ.get("HF_TOKEN_read")


# -------------------------------------------------------------------
# 1. Download files from Hugging Face dataset
# -------------------------------------------------------------------
# Only the parquet table is fetched: bge-m3 is used purely for similarity
# search through the API, so no FAISS index is needed. This is simpler than
# the ICL approach and is another flavour of RAG.
_PARQUET_REQUEST = {
    "repo_id": DATASET_REPO,
    "filename": "bgem3/foundations.parquet",
    "repo_type": "dataset",
    "token": HF_TOKEN_read,
    "cache_dir": "/tmp/huggingface",
}
parquet_path = hf_hub_download(**_PARQUET_REQUEST)


# -------------------------------------------------------------------
# 2. Load data
# -------------------------------------------------------------------
# Module-level table read by find_similar_foundations_api.
df = pd.read_parquet(parquet_path, engine="pyarrow")


# -------------------------------------------------------------------
# 3. Function to find similar foundations via API
# -------------------------------------------------------------------

# The inference client is created in app.py and passed in as an argument;
# this module only defines the search function.

def find_similar_foundations_api(
    query: str,
    client,
    top_k: int = 5,
    *,
    foundations: "pd.DataFrame | None" = None,
) -> "list[dict]":
    """Return the foundations whose purpose is most similar to ``query``.

    Every non-missing 'Purpose' text is scored against ``query`` through
    ``client.sentence_similarity`` using the ``BAAI/bge-m3`` model, and the
    highest-scoring rows are returned.

    Args:
        query: Text to compare against each foundation's purpose.
        client: Object exposing ``sentence_similarity(source_sentence=...,
            other_sentences=..., model=...)`` and returning one score per
            sentence (e.g. a ``huggingface_hub.InferenceClient``).
        top_k: Maximum number of matches to return. Values <= 0 yield an
            empty list.
        foundations: Optional table with 'Title' and 'Purpose' columns to
            search instead of the module-level ``df``.

    Returns:
        At most ``top_k`` dicts with the keys ``"score"``, ``"Title"`` and
        ``"Purpose"``, ordered from most to least similar.

    Raises:
        ValueError: If the table lacks the 'Title' or 'Purpose' column, or
            if the client returns a different number of scores than the
            number of purposes sent.
    """
    table = df if foundations is None else foundations
    if "Title" not in table.columns or "Purpose" not in table.columns:
        raise ValueError("Dataset must contain 'Title' and 'Purpose' columns.")

    # A negative top_k used as a slice bound would return almost every row,
    # so treat any non-positive request as "no results" and skip the API call.
    if top_k <= 0:
        return []

    # Rows without a purpose cannot be compared; sending null values to the
    # similarity endpoint is assumed to be invalid, so they are left out.
    candidates = table.dropna(subset=["Purpose"])
    if candidates.empty:
        return []

    titles = candidates["Title"].tolist()
    purposes = candidates["Purpose"].tolist()

    # Compute similarity of the query against all purposes in one request.
    scores = list(
        client.sentence_similarity(
            source_sentence=query,
            other_sentences=purposes,
            model="BAAI/bge-m3",
        )
    )

    # zip() would silently truncate on a length mismatch and hide the fact
    # that scores no longer line up with the rows they belong to.
    if len(scores) != len(purposes):
        raise ValueError(
            f"Expected {len(purposes)} similarity scores "
            f"but received {len(scores)}."
        )

    # nlargest yields the same order as sorted(..., reverse=True)[:top_k]
    # without sorting the entire table.
    best = heapq.nlargest(top_k, zip(scores, titles, purposes), key=itemgetter(0))

    return [
        {"score": float(score), "Title": title, "Purpose": purpose}
        for score, title, purpose in best
    ]