# CS5260_demo / util.py
import os
import arxiv
import spacy
import numpy as np
from pyvis.network import Network
from itertools import combinations
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from sklearn.cluster import AgglomerativeClustering
from semanticscholar import SemanticScholar
from habanero import Crossref
from collections import Counter
from dotenv import load_dotenv
from openai import OpenAI
from typing import List
sch = SemanticScholar(timeout=30)
cr = Crossref(mailto="[email protected]")
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
# Load SPECTER once and share it between KeyBERT and direct embedding calls
embed_model = SentenceTransformer("sentence-transformers/allenai-specter")
kw_model = KeyBERT(model=embed_model)
load_dotenv(dotenv_path="config/.env")
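# OpenAI-compatible client pointed at DeepInfra's endpoint; assumes config/.env provides
# an API_KEY entry (any OpenAI-compatible provider would be configured the same way).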
client = OpenAI(
api_key=os.getenv("API_KEY"),
base_url="https://api.deepinfra.com/v1/openai"
)
def fetch_arxiv(query, max_results=5):
    # Scrape papers from arXiv
    # (newer releases of the `arxiv` package prefer arxiv.Client().results(search);
    # Search.results() still works but emits a deprecation warning)
    search = arxiv.Search(query=query, max_results=max_results)
return [{
"entry_id": r.entry_id.split("/")[-1],
"title": r.title,
"abstract": r.summary
} for r in search.results()]
def fetch_semantic_scholar(query, max_results=5):
# Scrape papers from Semantic Scholar
    # Note: Semantic Scholar API does not return abstracts for all papers
    # Request abstracts explicitly; they are not part of the default search fields
    paginated = sch.search_paper(query, fields=['title', 'abstract'], limit=max_results)
first_page = paginated.items
papers = []
for paper in first_page:
papers.append({
"entry_id": paper.paperId,
"title": paper.title,
"abstract": paper.abstract or ""
})
return papers
def fetch_crossref(query, max_results=5):
# Scrape papers from CrossRef
# Note: CrossRef API does not return abstracts for all papers
items = cr.works(query=query, limit=max_results)["message"]["items"]
    return [{
        "entry_id": itm.get("DOI", str(i)),
        # "title" can be missing or an empty list in Crossref records
        "title": (itm.get("title") or [""])[0],
        # Crossref abstracts, when present, are JATS-flavoured XML strings
        "abstract": itm.get("abstract", "")
    } for i, itm in enumerate(items)]
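# Illustrative helper (not part of the original pipeline): all three fetchers return the
# same {"entry_id", "title", "abstract"} schema, so their results can be pooled.
def fetch_all_sources(query, max_results=5):
    papers = []
    for fetch in (fetch_arxiv, fetch_semantic_scholar, fetch_crossref):
        try:
            papers.extend(fetch(query, max_results=max_results))
        except Exception as exc:
            # one source failing (rate limit, timeout) should not sink the whole query
            print(f"{fetch.__name__} failed: {exc}")
    return papers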
def summarize_abstract_spacy(text: str, num_sentences: int = 3) -> str:
# Summarize abstracts via spaCy's en_core_web_sm
doc = nlp(text)
freqs = {}
for tok in doc:
if tok.is_stop or tok.is_punct or not tok.is_alpha:
continue
w = tok.text.lower()
freqs[w] = freqs.get(w, 0) + 1
    if not freqs:
        return ""
    maxf = max(freqs.values())
    for w in freqs:
        freqs[w] /= maxf
sent_scores = {
sent: sum(freqs.get(tok.text.lower(),0) for tok in sent if tok.is_alpha)
for sent in doc.sents
}
# pick top sentences
best = sorted(sent_scores, key=sent_scores.get, reverse=True)[:num_sentences]
    best_sorted = sorted(best, key=lambda s: s.start)  # restore original sentence order
return " ".join(s.text.strip() for s in best_sorted)
def dedupe_by_substring(phrases):
# Remove phrases that are substrings of others. Used in keyphrase extraction.
filtered = []
for ph, sc in phrases:
# if any already-kept phrase contains this one, skip it
if any(ph in kept for kept, _ in filtered):
continue
        # if this phrase contains any already-kept shorter phrase, remove that shorter phrase
        filtered = [(k, s) for k, s in filtered if k not in ph]
filtered.append((ph, sc))
return filtered
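# Example: dedupe_by_substring([("graph neural network", 0.91), ("neural network", 0.84)])
# keeps only ("graph neural network", 0.91), regardless of input order.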
def dedupe_by_embedding(phrases, threshold: float = 0.1):
    # Remove phrases that are too similar to others (cosine distance < threshold). Used in keyphrase extraction.
    if len(phrases) < 2:
        return list(phrases)  # AgglomerativeClustering needs at least two samples
    texts = [ph for ph, _ in phrases]
    embs = embed_model.encode(texts, normalize_embeddings=True)
# Cluster by cosine distance
clustering = AgglomerativeClustering(
n_clusters=None,
metric="cosine",
linkage="average",
distance_threshold=threshold
).fit(embs)
clusters = {}
for (ph, sc), lbl in zip(phrases, clustering.labels_):
clusters.setdefault(lbl, []).append((ph, sc))
# Pick top scoring phrase per cluster
result = [max(members, key=lambda x: x[1]) for members in clusters.values()]
return sorted(result, key=lambda x: x[1], reverse=True)
def extract_entities(text: str, top_n: int = 20):
# Use Specter model via KeyBERT to extract keyphrases
raw_phrases = kw_model.extract_keywords(
text,
keyphrase_ngram_range=(1, 3),
stop_words="english",
top_n=top_n
)
# Remove duplicates and too-similar phrases
subphrases = dedupe_by_substring(raw_phrases)
deduped = dedupe_by_embedding(subphrases)
return [(ph, "KEYPHRASE") for ph, _ in deduped[:10]]
def summarize_abstracts_llm(
abstracts: List[str],
model: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
temperature: float = 0.7,
max_tokens: int = 500
) -> str:
# Cross-paper summary using Qwen model
prompt = (
f"These are the abstracts of {len(abstracts)} papers. "
"Produce a cross-paper summary that summarizes all the key points across each paper. Keep it to 5-6 sentences.\n\n"
)
for i, abs_text in enumerate(abstracts, start=1):
prompt += f"Paper {i} abstract:\n{abs_text}\n\n"
resp = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful academic research assistant."},
{"role": "user", "content": prompt}
],
temperature=temperature,
max_tokens=max_tokens,
)
return resp.choices[0].message.content.strip()
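# Illustrative usage (assumes the DeepInfra client above is configured with a valid key):
#     abstracts = [p["abstract"] for p in fetch_arxiv("graph neural networks") if p["abstract"]]
#     print(summarize_abstracts_llm(abstracts))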
def build_global_concept_map(papers):
    # Global concept map of scraped papers
    # Map each phrase to the titles of the papers it appears in (for node tooltips);
    # keep the per-paper phrase sets so the co-occurrence edges below can reuse them
    phrase_to_titles = {}
    per_paper_sets = []
    for p in papers:
        ents = extract_entities(p["abstract"])
        phrases = {e for e, _ in ents}
        per_paper_sets.append(phrases)
        for ph in phrases:
            phrase_to_titles.setdefault(ph, []).append(p["title"])
freq = Counter()
for ph, titles in phrase_to_titles.items():
freq[ph] = len(titles)
net = Network(height="600px", width="100%")
id_map = {ph: idx for idx, ph in enumerate(freq, start=1)}
for ph, count in freq.items():
titles = phrase_to_titles.get(ph, [])
tooltip = "<br>".join(titles)
net.add_node(
id_map[ph],
label=ph,
title=tooltip,
size=10 + 2 * count
)
    # Count co-occurrences using the per-paper phrase sets extracted above
    cooc = Counter()
for phrases in per_paper_sets:
for a, b in combinations(sorted(phrases), 2):
cooc[(a, b)] += 1
for (a, b), c in cooc.items():
net.add_edge(id_map[a], id_map[b], value=c)
net.set_options("""
{
"physics": {
"solver": "repulsion",
"repulsion": {
"nodeDistance": 250,
"springLength": 200,
"damping": 0.5
}
}
}
""")
return net
def build_concept_map(phrases, sim_threshold: float = 0.85) -> Network:
    # Concept map for a single paper's keyphrases; phrases are linked when their
    # cosine similarity is at least sim_threshold (default 0.85).
net = Network(height="600px", width="100%")
id_map = {}
texts = [ph for ph, _ in phrases]
for idx, (ph, lbl) in enumerate(phrases, start=1):
id_map[ph] = idx
net.add_node(idx, label=ph, title=lbl)
embeddings = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
for i, j in combinations(range(len(texts)), 2):
sim = float(np.dot(embeddings[i], embeddings[j])) # since normalized, dot=cosine
print(f"sim({texts[i]}, {texts[j]}) = {sim:.3f}")
if sim >= sim_threshold:
net.add_edge(id_map[texts[i]], id_map[texts[j]], value=sim)
net.set_options("""
{
"physics": {
"solver": "repulsion",
"repulsion": {
"nodeDistance": 200,
"springLength": 200,
"damping": 0.5
}
}
}
""")
return net
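# Illustrative end-to-end sketch (not part of the original file): fetch a few arXiv papers
# and write the global concept map to HTML. The query and filename are placeholders.
if __name__ == "__main__":
    papers = [p for p in fetch_arxiv("graph neural networks", max_results=3) if p["abstract"]]
    net = build_global_concept_map(papers)
    net.save_graph("global_concept_map.html")  # pyvis writes a standalone HTML file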