# embeddingsv3 / app.py
# Author: lyimo
# Latest commit: "Update app.py" (7a45593, verified)
import os
import pandas as pd
import numpy as np
import gzip
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from together import Together
# LaBSE sentence-embedding model used to embed questions and queries.
model = SentenceTransformer("LaBSE")

# Question/answer knowledge base; this file reads its 'Question' and
# 'Answer' columns.
df = pd.read_csv("combined_questions_and_answers.csv")

# Embed every stored question once at import time so that answering a query
# only requires encoding the query itself.
question_embeddings = model.encode(df["Question"].tolist())

# Together client; the API key is taken from the TOGETHER_API_KEY
# environment variable (None is passed through if it is unset).
client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
def compressed_length(s):
    """
    Return the number of bytes in the gzip-compressed UTF-8 encoding of s.
    """
    encoded = s.encode("utf-8")
    compressed = gzip.compress(encoded)
    return len(compressed)
def ncd(x, y):
    """
    Normalized Compression Distance between strings x and y.

    Computed as (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), where C is the
    gzip-compressed length and xy is x and y joined by a single space.
    """
    size_x = compressed_length(x)
    size_y = compressed_length(y)
    size_joined = compressed_length(" ".join((x, y)))
    smaller, larger = sorted((size_x, size_y))
    return (size_joined - smaller) / larger
def normalize_scores(scores, reverse=False):
    """
    Min-max scale scores to the range [0, 1].

    Args:
        scores: Iterable of numeric scores (e.g. a list or a 1-D NumPy array).
        reverse: If True, return the reversed scale (1 - x), so the smallest
            input maps to 1 and the largest to 0.

    Returns:
        A list with one scaled value per input score. An empty input yields
        an empty list; if all scores are equal, every entry is 0.
    """
    # Materialize once so one-shot iterables are not exhausted by min()
    # before max() and the final pass get to see them.
    values = list(scores)
    # min()/max() raise ValueError on an empty sequence; nothing to scale.
    if not values:
        return []
    low = min(values)
    high = max(values)
    # All scores identical: avoid dividing by a zero range.
    if high == low:
        return [0] * len(values)
    span = high - low
    if reverse:
        return [(high - value) / span for value in values]
    return [(value - low) / span for value in values]
def hybrid_retrieval(query, passages, embeddings, alpha=0.7, beta=0.3):
    """
    Select the passage that best matches the query using a weighted blend of
    embedding cosine similarity and Normalized Compression Distance (NCD).

    Both signals are min-max normalized over the candidate set (NCD is
    reversed so a smaller distance gives a higher score) and combined as
    alpha * cosine + beta * ncd. Because of that normalization, the combined
    score is relative to the candidates supplied in this call.

    Returns a tuple (index, raw cosine similarity, raw NCD, combined score)
    for the highest-scoring passage.
    """
    # Embedding signal: cosine similarity of the query against every passage.
    query_vector = model.encode(query)
    raw_cosine = cosine_similarity([query_vector], embeddings)[0]
    scaled_cosine = normalize_scores(raw_cosine)

    # Compression signal: NCD is a distance, so reverse it while scaling.
    raw_ncd = []
    for passage in passages:
        raw_ncd.append(ncd(query, passage))
    scaled_ncd = normalize_scores(raw_ncd, reverse=True)

    # Weighted blend of the two normalized signals.
    combined = []
    for cosine_part, ncd_part in zip(scaled_cosine, scaled_ncd):
        combined.append(alpha * cosine_part + beta * ncd_part)

    best = np.argmax(combined)
    return best, raw_cosine[best], raw_ncd[best], combined[best]
def llama_query(prompt, system_content):
    """
    Run a single-turn chat completion against the Together-hosted LLaMa model.

    system_content is sent as the system message and prompt as the user
    message; the text content of the first returned choice is returned.
    """
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=conversation,
        max_tokens=512,
        temperature=0.7,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def check_blood_donation_relevance(question):
    """
    Use LLaMa to check whether 'question' is about blood donation.

    Returns True when the model's reply begins with the word "yes"
    (case-insensitive, ignoring surrounding punctuation or quotes),
    otherwise False.
    """
    prompt = f"Is the following question related to blood donation? Answer ONLY with 'Yes' or 'No': {question}"
    system_content = "You are an assistant that determines if a question is related to blood donation."
    response = llama_query(prompt, system_content).strip().lower()
    words = response.split()
    if not words:
        return False
    # An exact comparison with 'yes' would treat replies such as "Yes." or
    # "Yes, it is related" as a "No"; judge by the first word with any
    # surrounding punctuation/quote characters removed instead.
    first_word = words[0].strip(".,!?:;'\"*")
    return first_word == 'yes'
def detect_language(text):
    """
    Ask LLaMa whether the text is English or Swahili.

    Returns 'swahili' if the model's reply mentions Swahili; otherwise
    returns 'english' (which also serves as the fallback when the reply
    names neither language).
    """
    instructions = (
        "Detect the language of this text. If it's Swahili, return 'Swahili'. "
        "If it's English, return 'English'. Here's the text:\n\n"
    )
    reply = llama_query(
        f"{instructions}{text}",
        "You are a language detection assistant.",
    )
    normalized_reply = reply.strip().lower()
    # Anything that does not mention Swahili is treated as English.
    return "swahili" if "swahili" in normalized_reply else "english"
def translate_to_english(text):
    """
    Ask LLaMa to translate Swahili text into English and return the reply
    with surrounding whitespace removed.
    """
    translation = llama_query(
        f"Translate the following Swahili text to English: {text}",
        "You are a translation assistant that translates from Swahili to English.",
    )
    return translation.strip()
def translate_to_swahili(text):
    """
    Ask LLaMa to translate the text into simple Swahili and return the reply
    with surrounding whitespace removed.
    """
    translation = llama_query(
        f"Translate the following text to simple Swahili, avoiding difficult words: {text}",
        "You are a translation assistant that translates to simple Swahili.",
    )
    return translation.strip()
def refine_answer(question, retrieved_answer):
    """
    Ask LLaMa to rewrite a retrieved answer so that it addresses the question
    directly and clearly; returns the reply with surrounding whitespace
    removed.
    """
    sections = [
        f"Question: {question}",
        f"Retrieved Answer: {retrieved_answer}",
        "Please refine the retrieved answer so it's direct, clear, and specifically addresses the question.",
    ]
    prompt = "\n\n".join(sections)
    refined = llama_query(
        prompt,
        "You are an assistant that refines answers to make them more relevant and natural.",
    )
    return refined.strip()
def get_answer(user_question, threshold=0.3):
    """
    Answer a blood-donation question asked in English or Swahili.

    Pipeline: detect the language, translate Swahili input to English,
    reject off-topic questions, retrieve the closest stored question, then
    either refine its stored answer (combined score above threshold) or ask
    LLaMa for a general answer. The reply is translated back to Swahili when
    the question was detected as Swahili.

    Returns a tuple (answer text, score); the score is 0.0 for off-topic
    questions and the combined retrieval score otherwise.
    """
    # 1) Detect the language and work in English internally.
    language = detect_language(user_question)
    wants_swahili = language == 'swahili'
    english_question = translate_to_english(user_question) if wants_swahili else user_question

    # 2) Reject questions that are not about blood donation.
    if not check_blood_donation_relevance(english_question):
        off_topic_message = "I'm sorry, but your question doesn't seem to be related to blood donation. Could you please ask a question about blood donation?"
        if wants_swahili:
            off_topic_message = translate_to_swahili(off_topic_message)
        return off_topic_message, 0.0

    # 3) Find the closest stored question; only the index and the combined
    #    score are used below.
    index, _cosine_sim, _ncd_value, final_score = hybrid_retrieval(
        english_question,
        df['Question'].tolist(),
        question_embeddings
    )

    # NOTE(review): final_score is built from min-max normalized signals in
    # hybrid_retrieval, so it reflects rank within the candidate set rather
    # than absolute match quality - confirm the threshold is calibrated for
    # that.
    if final_score > threshold:
        # 4) Confident match: polish the stored answer for this question.
        english_answer = refine_answer(english_question, df.iloc[index]['Answer'])
    else:
        # 5) Weak match: fall back to a general LLaMa answer.
        english_answer = llama_query(
            f"Please provide a concise, accurate answer about blood donation for the question: {english_question}",
            "You are an assistant knowledgeable about blood donation. Provide concise, accurate answers."
        ).strip()

    # 6) Reply in the language the user asked in.
    if wants_swahili:
        return translate_to_swahili(english_answer), final_score
    return english_answer, final_score
# Gradio app
def gradio_app(user_question):
    """
    Gradio callback: answer the question and format the score (two decimal
    places) and the answer as a two-line string.
    """
    answer, similarity = get_answer(user_question)
    report_lines = (f"Similarity: {similarity:.2f}", f"Answer: {answer}")
    return "\n".join(report_lines)
# Single-textbox UI: the question goes in, the formatted score and answer
# come out. fn, inputs and outputs are passed positionally.
iface = gr.Interface(
    gradio_app,
    gr.Textbox(label="Enter your question"),
    gr.Textbox(label="Answer"),
    title="Blood Donation Q&A",
    description="Ask questions about blood donation in English or Swahili. The system first checks if it's related to blood donation.",
)

# Start the web server as soon as the module is executed.
iface.launch()