Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
import gradio as gr
|
|
@@ -18,6 +19,38 @@ question_embeddings = model.encode(df['Question'].tolist())
|
|
| 18 |
# Together API setup
|
| 19 |
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def llama_query(prompt, system_content):
|
| 22 |
response = client.chat.completions.create(
|
| 23 |
model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
@@ -75,26 +108,21 @@ def get_answer(user_question, threshold=0.01):
|
|
| 75 |
else:
|
| 76 |
english_question = user_question
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
similarities = cosine_similarity([user_embedding], question_embeddings)
|
| 81 |
-
|
| 82 |
-
max_similarity = np.max(similarities)
|
| 83 |
|
| 84 |
-
if
|
| 85 |
-
|
| 86 |
-
retrieved_answer = df.iloc[similar_question_idx]['Answer']
|
| 87 |
refined_answer = refine_answer(english_question, retrieved_answer)
|
| 88 |
|
| 89 |
if language == 'swahili':
|
| 90 |
refined_answer = translate_to_swahili(refined_answer)
|
| 91 |
|
| 92 |
-
return refined_answer,
|
| 93 |
else:
|
| 94 |
default_message = "The system couldn't find a sufficient answer to your question. Do you want to learn anything else about blood donation?"
|
| 95 |
if language == 'swahili':
|
| 96 |
default_message = translate_to_swahili(default_message)
|
| 97 |
-
return default_message,
|
| 98 |
|
| 99 |
# Gradio app
|
| 100 |
def gradio_app(user_question):
|
|
|
|
| 1 |
import os
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
+
import gzip
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
import gradio as gr
|
|
|
|
| 19 |
# Together API setup
|
| 20 |
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
|
| 21 |
|
| 22 |
+
def compressed_length(s):
    """Return the size in bytes of the gzip-compressed UTF-8 encoding of *s*.

    Used as the C(.) term of the Normalized Compression Distance.
    """
    encoded = s.encode('utf-8')
    return len(gzip.compress(encoded))
|
| 24 |
+
|
| 25 |
+
def ncd(x, y):
|
| 26 |
+
Cx = compressed_length(x)
|
| 27 |
+
Cy = compressed_length(y)
|
| 28 |
+
Cxy = compressed_length(x + " " + y)
|
| 29 |
+
return (Cxy - min(Cx, Cy)) / max(Cx, Cy)
|
| 30 |
+
|
| 31 |
+
def normalize_scores(scores, reverse=False):
    """Min-max normalize *scores* into the range [0, 1].

    Args:
        scores: iterable of numeric scores.
        reverse: if True, invert the mapping so the smallest raw score maps
            to 1.0 — used for distance-like metrics (e.g. NCD) where lower
            is better.

    Returns:
        A list of floats in [0, 1]. An empty input yields []. When every
        score is identical there is no spread to normalize over, so all
        zeros are returned — the original implementation raised
        ZeroDivisionError in that case (e.g. with a single passage).
    """
    scores = list(scores)
    if not scores:
        return []
    min_score = min(scores)
    max_score = max(scores)
    span = max_score - min_score
    if span == 0:
        # Degenerate case: all scores equal; avoid division by zero.
        return [0.0] * len(scores)
    if reverse:
        return [(max_score - x) / span for x in scores]
    return [(x - min_score) / span for x in scores]
|
| 37 |
+
|
| 38 |
+
def hybrid_retrieval(query, passages, embeddings, alpha=0.7, beta=0.3):
    """Pick the passage best matching *query* using a weighted blend of
    embedding cosine similarity and gzip-based compression distance.

    Args:
        query: the user's question (already translated to English upstream).
        passages: list of candidate passage strings, aligned with *embeddings*.
        embeddings: precomputed sentence embeddings for *passages*.
        alpha: weight of the normalized cosine-similarity component.
        beta: weight of the normalized (inverted) NCD component.

    Returns:
        Tuple of (best index, raw cosine similarity at that index,
        raw NCD at that index, combined score at that index).
    """
    # Semantic component: cosine similarity of the query embedding
    # against every precomputed passage embedding.
    encoded_query = model.encode(query)
    semantic_scores = cosine_similarity([encoded_query], embeddings)[0]
    semantic_norm = normalize_scores(semantic_scores)

    # Compression component: NCD is a distance, so normalization is
    # reversed to make higher mean more similar.
    distance_scores = [ncd(query, passage) for passage in passages]
    distance_norm = normalize_scores(distance_scores, reverse=True)

    combined = [
        alpha * sem + beta * dist
        for sem, dist in zip(semantic_norm, distance_norm)
    ]

    best = np.argmax(combined)
    return best, semantic_scores[best], distance_scores[best], combined[best]
|
| 53 |
+
|
| 54 |
def llama_query(prompt, system_content):
|
| 55 |
response = client.chat.completions.create(
|
| 56 |
model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
|
|
|
| 108 |
else:
|
| 109 |
english_question = user_question
|
| 110 |
|
| 111 |
+
index, cosine_sim, ncd_value, final_score = hybrid_retrieval(english_question, df['Question'].tolist(), question_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
if final_score > threshold:
|
| 114 |
+
retrieved_answer = df.iloc[index]['Answer']
|
|
|
|
| 115 |
refined_answer = refine_answer(english_question, retrieved_answer)
|
| 116 |
|
| 117 |
if language == 'swahili':
|
| 118 |
refined_answer = translate_to_swahili(refined_answer)
|
| 119 |
|
| 120 |
+
return refined_answer, final_score
|
| 121 |
else:
|
| 122 |
default_message = "The system couldn't find a sufficient answer to your question. Do you want to learn anything else about blood donation?"
|
| 123 |
if language == 'swahili':
|
| 124 |
default_message = translate_to_swahili(default_message)
|
| 125 |
+
return default_message, final_score
|
| 126 |
|
| 127 |
# Gradio app
|
| 128 |
def gradio_app(user_question):
|