# hamza50's picture
# Update app.py
# 7773983
import gradio as gr
import pandas as pd
import tiktoken
import pandas as pd
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import nltk
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import re
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import os
# from dotenv import load_dotenv
# load_dotenv()
# print(os.getcwd())
# openai.api_key = os.environ['OPENAI_KEY']
# Load the article DataFrame from 'entire_data.pkl'. The code below reads its
# content, summary_html, embedding, title, url and keywords columns.
# NOTE(review): pandas.read_pickle can execute arbitrary code while unpickling,
# so this file must come from a trusted source.
df = pd.read_pickle('entire_data.pkl')
# Sentence-embedding model used to encode incoming search queries.
model = SentenceTransformer('all-mpnet-base-v2')
# Non-greedy match of one "<...>" span. Compiled once at import time instead
# of on every call. DOTALL lets "." cross line breaks so that a tag whose
# attributes wrap onto several lines is still removed.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The input with each shortest ``<`` ... ``>`` span replaced by the
        empty string. Text outside the tags is left untouched (entities such
        as ``&amp;`` are not decoded).
    """
    return _HTML_TAG_RE.sub('', text)
# Strip HTML markup from the two text columns once, at start-up.
df['content'] = df['content'].apply(remove_html_tags)
df['summary_html'] = df['summary_html'].apply(remove_html_tags)

# System message sent along with every chat completion request.
session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. I will use do different analysis to the articles provided to me. Stay truthful and if you weren't provided any resources give your oppinion only."""
def new_ask(user_input):
    """Ask gpt-3.5-turbo a single question and return the reply text.

    The module-level ``session_prompt`` is sent as the system message and
    ``user_input`` as the only user message, with temperature 0.

    Args:
        user_input: Text of the user message.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    conversation = [
        {'role': 'system', 'content': session_prompt},
        {'role': 'user', 'content': user_input},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
def _result_entry(row):
    """Build the dict describing one ranked article row for ``search``."""
    return {
        "title": row.title,
        "url": row.url,
        "score": str(row.similarity[0]),
        "summary": row.summary_html[:200],
        "keywords": row.keywords,
    }


def search(query, n=10):
    """Answer ``query`` from the most similar articles in ``df``.

    The query is embedded with ``model`` and compared by cosine similarity to
    every value in ``df.embedding``. The ``n`` best rows are collapsed to one
    row per (title, url, keywords, summary_html), keeping the highest
    similarity. Rows scoring above 0.5 form tier 1 and rows scoring above 0.4
    form tier 2. Tier 1 titles and summaries are passed to ``new_ask`` to
    produce the answer; if tier 2 is non-empty, its summaries are used to ask
    for related questions as well.

    Args:
        query: Free-text question from the user.
        n: Number of top-scoring rows to keep before grouping (default 10).

    Returns:
        The answer text, followed by a "Related Questions" section when
        tier 2 is non-empty.
    """
    query_embedding = model.encode(query)
    # Column vector of whatever width the model produces (previously a
    # hard-coded 768). With this shape each similarity is a one-element
    # array, which is why scores are read with [0] below.
    # NOTE(review): assumes each stored embedding is a 1-D vector of the same
    # length as the query embedding - confirm against the pickle.
    query_column = query_embedding.reshape(-1, 1)

    # Score on a copy so the shared module-level ``df`` is not modified on
    # every request.
    scored = df.assign(
        similarity=df.embedding.apply(
            lambda x: cosine_similarity(x, query_column)
        )
    )
    results = scored.sort_values("similarity", ascending=False).head(n)

    # One row per article, keeping its best similarity, best first.
    grouped = results.groupby(['title', 'url', 'keywords', 'summary_html'])
    results = pd.DataFrame(grouped.similarity.max()).reset_index()
    results = results.sort_values("similarity", ascending=False)

    tier_1 = []
    tier_2 = []
    for row in results.itertuples():
        score = row.similarity[0]
        if score > 0.5:
            tier_1.append(_result_entry(row))
        elif score > 0.4:
            tier_2.append(_result_entry(row))

    # Debug output for the server log.
    print(tier_1)
    print(tier_2)
    tier_1_titles = "\n".join(x['title'] for x in tier_1)
    prefix = f"tier 1:\n{tier_1_titles}"
    print(prefix)

    articles = "\n".join(x['title'] + ': ' + x['summary'] for x in tier_1)
    answer = new_ask(
        "Answer the following query by giving arguments from the different "
        "arguments provided below. Make sure to quote the article used if "
        "the argument corrseponds to the query: "
        f"Query: {query} Articles {articles}\n"
        "Use careful reasoning to explain your answer and give your "
        "conclusion about this."
    )

    if tier_2:
        summaries = "\n".join(
            str(i) + ' ' + x['summary'] for i, x in enumerate(tier_2)
        )
        related_questions = new_ask(
            f"Give general questions related the following articles: {summaries}"
        )
        return f"{answer}\n\nRelated Questions:\n{related_questions}"
    return f"{answer}"
def greet(query):
    """Gradio callback: return whatever ``search`` produces for ``query``."""
    return search(query)
# Sample queries offered as one-click examples in the UI.
examples = [
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["The importance of values and reflection"],
]

# gr.Textbox is used instead of gr.inputs.Textbox: the gr.inputs namespace is
# the deprecated legacy API and is not available in newer Gradio releases.
demo = gr.Interface(
    fn=greet,
    title="cicero-interactive-qa",
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="text",
    examples=examples,
)
demo.launch(share=True, debug=True)