import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from nltk.corpus import stopwords
import nltk
# Ensure NLTK stopwords are available
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
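# stop_words is NLTK's standard English stop-word list: common function words
# such as "the", "is", and "not" that carry no topical signal.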
# Additional words to remove: query/question words plus common English stop
# words (overlap with the NLTK list is harmless, since this is a set)
irrelevant_words = {
    "what", "paper", "abstract", "papers", "discuss", "find", "about", "who",
    "one", "two", "is", "are", "the", "this", "that", "which", "how", "where",
    "when", "why", "whom", "whose", "these", "those", "am", "was", "were",
    "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "all", "any", "both", "each",
    "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
    "will", "just", "don", "should", "now",
}
# Load the dataset
file_path = "processed_dataset_v6.csv"  # Path to the uploaded file
df = pd.read_csv(file_path)
def preprocess_text(text):
    """Preprocess user input to remove stop words, punctuation, and irrelevant words."""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove stop words and irrelevant words
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words and word not in irrelevant_words]
    return " ".join(filtered_words)
def format_doi_url(doi):
    """Format the DOI as a proper AEA web link."""
    return f"https://www.aeaweb.org/articles?id={doi}"
def analyze_keywords(question, threshold=0.15):
    # Check that the required columns exist
    if not all(col in df.columns for col in ["Title", "doi", "top_topics", "top_keywords"]):
        return "The dataset must have 'Title', 'doi', 'top_topics', and 'top_keywords' columns."
    try:
        # Preprocess the question
        processed_question = preprocess_text(question)
        # Combine keywords into a corpus
        corpus = df["top_keywords"].fillna("").tolist()
        corpus.append(processed_question)  # Add the processed question as the last element
        # Compute TF-IDF embeddings
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)
        # Compute similarity between the question and all keyword rows
        question_vector = tfidf_matrix[-1]  # Last row corresponds to the processed question
        similarities = cosine_similarity(tfidf_matrix[:-1], question_vector).flatten()
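        # Each entry is cos(v_i, q) = (v_i . q) / (||v_i|| * ||q||); with
        # non-negative TF-IDF weights this lies in [0, 1], so the default
        # threshold of 0.15 keeps papers sharing at least some weighted terms.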
        # Collect papers above the similarity threshold
        relevant_papers = []
        for idx, score in enumerate(similarities):
            if score >= threshold:
                relevant_papers.append({
                    "Title": df.iloc[idx]["Title"],
                    "DOI": format_doi_url(df.iloc[idx]["doi"]),  # Format DOI as a link
                    "Top Topics": df.iloc[idx]["top_topics"],
                    "Top Keywords": df.iloc[idx]["top_keywords"],
| "Score": round(score+0.5, 2) | |
                })
        # Sort papers by similarity score (descending order)
        relevant_papers = sorted(relevant_papers, key=lambda x: x["Score"], reverse=True)
        # Format the output
        if not relevant_papers:
            return "No relevant papers found."
        output = "### Relevant Papers\n\n"
        for paper in relevant_papers:
            output += f"**Title**: {paper['Title']}\n\n"
            output += f"**DOI**: [Link]({paper['DOI']})\n\n"
            output += f"**Top Topics**: {paper['Top Topics']}\n\n"
            output += f"**Top Keywords**: {paper['Top Keywords']}\n\n"
            output += f"**Score**: {paper['Score']}\n\n"
            output += "---\n\n"
        return output
    except Exception as e:
        return f"An error occurred: {e}"
# Define the Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Abstract Analyzer π")
    with gr.Row():
        question_input = gr.Textbox(
            label="Ask a question related to research papers",
            placeholder="E.g., What papers discuss innovation strategy?",
        )
        # threshold_input = gr.Slider(label="Similarity Threshold", minimum=0.1, maximum=1.0, value=0.2, step=0.1)
    with gr.Row():
        result_output = gr.Markdown(label="Results")  # Use Markdown for better rendering
    with gr.Row():
        submit_button = gr.Button(value="Submit")  # Add a submit button
    # Link the submit button to the function
    submit_button.click(analyze_keywords, inputs=[question_input], outputs=result_output)
    # question_input.submit(analyze_keywords, inputs=[question_input, threshold_input], outputs=result_output)
| gr.Markdown("Results provided by a Large Language Model π") | |
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
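# Note: on Hugging Face Spaces the platform serves the app as-is; for a local
# run behind a firewall, demo.launch(share=True) would create a temporary
# public link instead.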