Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import gzip | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import gradio as gr | |
| from together import Together | |
# Sentence Transformer model ('LaBSE') used to embed questions and queries
model = SentenceTransformer('LaBSE')

# Question/answer pairs that back the retrieval step
df = pd.read_csv('combined_questions_and_answers.csv')

# Embed every stored question once at start-up; queries are embedded per request
_stored_questions = df['Question'].tolist()
question_embeddings = model.encode(_stored_questions)

# Together API client; the key comes from the TOGETHER_API_KEY environment variable
client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
def compressed_length(s):
    """Return the byte length of the gzip-compressed UTF-8 encoding of ``s``."""
    raw_bytes = s.encode('utf-8')
    packed = gzip.compress(raw_bytes)
    return len(packed)
def ncd(x, y):
    """
    Compute the Normalized Compression Distance between strings x and y.

    The two strings are compressed separately and joined by a single space;
    the result is (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), where C is
    the compressed length.
    """
    size_x, size_y = compressed_length(x), compressed_length(y)
    size_joined = compressed_length(" ".join((x, y)))
    smaller, larger = sorted((size_x, size_y))
    return (size_joined - smaller) / larger
def normalize_scores(scores, reverse=False):
    """
    Min-max scale a sequence of scores to the range [0, 1].

    Args:
        scores: Sized sequence of numeric scores (for example a list or a
            one-dimensional NumPy array).
        reverse: When True, invert the scale so the smallest input maps to 1
            and the largest maps to 0.

    Returns:
        A list with one scaled value per input score. An empty input yields
        an empty list; an input whose values are all equal yields all zeros.
    """
    # Use len() rather than truthiness: ``if not scores`` is ambiguous for
    # NumPy arrays, and min()/max() raise ValueError on an empty sequence.
    if len(scores) == 0:
        return []

    min_score = min(scores)
    max_score = max(scores)
    if max_score == min_score:
        # No spread to scale by; avoid dividing by zero.
        return [0] * len(scores)

    span = max_score - min_score
    if reverse:
        return [(max_score - x) / span for x in scores]
    return [(x - min_score) / span for x in scores]
def hybrid_retrieval(query, passages, embeddings, alpha=0.7, beta=0.3):
    """
    Pick the passage that best matches ``query`` using a weighted blend of
    embedding cosine similarity and Normalized Compression Distance (NCD).

    Both signals are min-max normalized across the candidates for this query
    (NCD is reversed so that a smaller distance scores higher) and combined
    as ``alpha * cosine + beta * ncd``.

    Returns:
        A tuple ``(index, cosine_similarity, ncd_value, final_score)`` for the
        best candidate. The cosine and NCD entries are the raw, un-normalized
        values; ``final_score`` is the blended normalized score.
    """
    # Semantic signal: the query embedding against the precomputed embeddings
    query_vec = model.encode(query)
    cos_scores = cosine_similarity([query_vec], embeddings)[0]
    cos_scaled = normalize_scores(cos_scores)

    # Compression signal: NCD between the query text and each passage text
    ncd_scores = [ncd(query, candidate) for candidate in passages]
    ncd_scaled = normalize_scores(ncd_scores, reverse=True)

    # Weighted blend, one value per candidate
    blended = []
    for semantic, compression in zip(cos_scaled, ncd_scaled):
        blended.append(alpha * semantic + beta * compression)

    best = np.argmax(blended)
    return best, cos_scores[best], ncd_scores[best], blended[best]
def llama_query(prompt, system_content):
    """
    Run one non-streaming chat completion on the Together-hosted LLaMa model.

    Args:
        prompt: Text sent as the user message.
        system_content: Text sent as the system message.

    Returns:
        The message content of the first returned choice.
    """
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=conversation,
        max_tokens=512,
        temperature=0.7,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def check_blood_donation_relevance(question):
    """
    Ask the LLaMa model whether ``question`` is about blood donation.

    The model is instructed to reply with only 'Yes' or 'No', but a reply may
    still carry punctuation or trailing words (e.g. "Yes." or "Yes, it is").
    An exact comparison against 'yes' would treat such replies as "No", so
    only the first word is inspected, with surrounding punctuation removed.

    Args:
        question: The question text to classify.

    Returns:
        True when the first word of the model's reply is "yes"
        (case-insensitive); False otherwise, including for an empty reply.
    """
    prompt = f"Is the following question related to blood donation? Answer ONLY with 'Yes' or 'No': {question}"
    system_content = "You are an assistant that determines if a question is related to blood donation."
    response = llama_query(prompt, system_content).strip().lower()

    words = response.split()
    if not words:
        return False
    # Strip punctuation/markdown decoration around the first word ("yes.", "**yes**")
    first_word = words[0].strip(".,!?:;\"'*`")
    return first_word == 'yes'
def detect_language(text):
    """
    Ask the LLaMa model whether ``text`` is Swahili or English.

    Returns:
        'swahili' when the lower-cased reply contains the word "swahili";
        otherwise 'english', which also serves as the fallback for any
        reply that names neither language.
    """
    instruction = (
        "Detect the language of this text. If it's Swahili, return 'Swahili'. "
        "If it's English, return 'English'. Here's the text:\n\n"
    )
    prompt = f"{instruction}{text}"
    system_content = "You are a language detection assistant."
    reply = llama_query(prompt, system_content).strip().lower()
    # "swahili" is checked first; everything else resolves to English
    return "swahili" if "swahili" in reply else "english"
def translate_to_english(text):
    """
    Ask the LLaMa model to translate Swahili ``text`` into English.

    Returns the model's reply with leading and trailing whitespace removed.
    """
    system_content = "You are a translation assistant that translates from Swahili to English."
    prompt = f"Translate the following Swahili text to English: {text}"
    return llama_query(prompt, system_content).strip()
def translate_to_swahili(text):
    """
    Ask the LLaMa model to translate ``text`` into simple Swahili.

    Returns the model's reply with leading and trailing whitespace removed.
    """
    system_content = "You are a translation assistant that translates to simple Swahili."
    prompt = f"Translate the following text to simple Swahili, avoiding difficult words: {text}"
    return llama_query(prompt, system_content).strip()
def refine_answer(question, retrieved_answer):
    """
    Ask the LLaMa model to rewrite a retrieved answer so it directly
    addresses ``question``.

    Args:
        question: The question being answered.
        retrieved_answer: The answer text to be refined.

    Returns:
        The model's refined answer, stripped of surrounding whitespace.
    """
    sections = [
        f"Question: {question}\n\n",
        f"Retrieved Answer: {retrieved_answer}\n\n",
        "Please refine the retrieved answer so it's direct, clear, and specifically addresses the question.",
    ]
    prompt = "".join(sections)
    system_content = "You are an assistant that refines answers to make them more relevant and natural."
    refined = llama_query(prompt, system_content)
    return refined.strip()
def get_answer(user_question, threshold=0.3):
    """
    Answer a blood-donation question asked in English or Swahili.

    Steps:
      1. Detect the question's language and, for Swahili, translate it to
         English.
      2. Ask the LLM whether the question is about blood donation; if not,
         return an off-topic message with a score of 0.0.
      3. Retrieve the best-matching stored question via hybrid retrieval.
      4. If the retrieval score exceeds ``threshold``, refine the stored
         answer with the LLM; otherwise ask the LLM for a general answer.
      5. Translate the result to Swahili when the question was in Swahili.

    Returns:
        A tuple ``(answer_text, score)``.

    NOTE(review): the retrieval score is built from per-query min-max
    normalized components, so with hybrid_retrieval's default alpha=0.7 the
    best match appears to score at least 0.7 unless all cosine similarities
    are equal. The below-threshold branch may therefore be rarely reached
    with threshold=0.3 -- confirm this is intended.
    """
    language = detect_language(user_question)
    wants_swahili = language == 'swahili'

    def in_user_language(english_text):
        # Translate only when the user wrote in Swahili
        if wants_swahili:
            return translate_to_swahili(english_text)
        return english_text

    # Relevance checking and retrieval both operate on English text
    english_question = translate_to_english(user_question) if wants_swahili else user_question

    if not check_blood_donation_relevance(english_question):
        apology = "I'm sorry, but your question doesn't seem to be related to blood donation. Could you please ask a question about blood donation?"
        return in_user_language(apology), 0.0

    index, _cosine_sim, _ncd_value, final_score = hybrid_retrieval(
        english_question,
        df['Question'].tolist(),
        question_embeddings,
    )

    if final_score > threshold:
        # Confident match: polish the stored answer for this question
        stored_answer = df.iloc[index]['Answer']
        english_answer = refine_answer(english_question, stored_answer)
    else:
        # Weak match: let the LLM answer from its own knowledge
        english_answer = llama_query(
            f"Please provide a concise, accurate answer about blood donation for the question: {english_question}",
            "You are an assistant knowledgeable about blood donation. Provide concise, accurate answers."
        ).strip()

    return in_user_language(english_answer), final_score
# Gradio app
def gradio_app(user_question):
    """
    Gradio callback: answer ``user_question`` and format the result.

    Returns a two-line string with the score (two decimal places) followed
    by the answer text.
    """
    reply, score = get_answer(user_question)
    return f"Similarity: {score:.2f}\nAnswer: {reply}"
# Single text input and single text output wired to the gradio_app callback
question_box = gr.Textbox(label="Enter your question")
answer_box = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=gradio_app,
    inputs=question_box,
    outputs=answer_box,
    title="Blood Donation Q&A",
    description="Ask questions about blood donation in English or Swahili. The system first checks if it's related to blood donation.",
)

# Launch the interface when this module is executed
iface.launch()