# CS5260_demo / util.py
import os
import arxiv
import spacy
import numpy as np
from pyvis.network import Network
from itertools import combinations
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from sklearn.cluster import AgglomerativeClustering
from semanticscholar import SemanticScholar
from habanero import Crossref
from collections import Counter
from dotenv import load_dotenv
from openai import OpenAI
from typing import List
sch = SemanticScholar(timeout=30)
cr = Crossref(mailto="[email protected]")
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
# Load SPECTER once and share it between KeyBERT and direct embedding calls
embed_model = SentenceTransformer("sentence-transformers/allenai-specter")
kw_model = KeyBERT(model=embed_model)
load_dotenv(dotenv_path="config/.env")
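# OpenAI-compatible client pointed at DeepInfra's endpoint; assumes config/.env provides
# an API_KEY entry (any OpenAI-compatible provider would be configured the same way).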
client = OpenAI(
api_key=os.getenv("API_KEY"),
base_url="https://api.deepinfra.com/v1/openai"
)
def fetch_arxiv(query, max_results=5):
    # Scrape papers from arXiv
    # (newer releases of the `arxiv` package prefer arxiv.Client().results(search);
    # Search.results() still works but emits a deprecation warning)
    search = arxiv.Search(query=query, max_results=max_results)
return [{
"entry_id": r.entry_id.split("/")[-1],
"title": r.title,
"abstract": r.summary
} for r in search.results()]
def fetch_semantic_scholar(query, max_results=5):
# Scrape papers from Semantic Scholar
    # Note: Semantic Scholar API does not return abstracts for all papers
    # Request abstracts explicitly; they are not part of the default search fields
    paginated = sch.search_paper(query, fields=['title', 'abstract'], limit=max_results)
first_page = paginated.items
papers = []
for paper in first_page:
papers.append({
"entry_id": paper.paperId,
"title": paper.title,
"abstract": paper.abstract or ""
})
return papers
def fetch_crossref(query, max_results=5):
# Scrape papers from CrossRef
# Note: CrossRef API does not return abstracts for all papers
items = cr.works(query=query, limit=max_results)["message"]["items"]
    return [{
        "entry_id": itm.get("DOI", str(i)),
        # "title" can be missing or an empty list in Crossref records
        "title": (itm.get("title") or [""])[0],
        # Crossref abstracts, when present, are JATS-flavoured XML strings
        "abstract": itm.get("abstract", "")
    } for i, itm in enumerate(items)]
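# Illustrative helper (not part of the original pipeline): all three fetchers return the
# same {"entry_id", "title", "abstract"} schema, so their results can be pooled.
def fetch_all_sources(query, max_results=5):
    papers = []
    for fetch in (fetch_arxiv, fetch_semantic_scholar, fetch_crossref):
        try:
            papers.extend(fetch(query, max_results=max_results))
        except Exception as exc:
            # one source failing (rate limit, timeout) should not sink the whole query
            print(f"{fetch.__name__} failed: {exc}")
    return papers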
def summarize_abstract_spacy(text: str, num_sentences: int = 3) -> str:
# Summarize abstracts via spaCy's en_core_web_sm
doc = nlp(text)
freqs = {}
for tok in doc:
if tok.is_stop or tok.is_punct or not tok.is_alpha:
continue
w = tok.text.lower()
freqs[w] = freqs.get(w, 0) + 1
    if not freqs:
        return ""
    maxf = max(freqs.values())
    for w in freqs:
        freqs[w] /= maxf
sent_scores = {
sent: sum(freqs.get(tok.text.lower(),0) for tok in sent if tok.is_alpha)
for sent in doc.sents
}
# pick top sentences
best = sorted(sent_scores, key=sent_scores.get, reverse=True)[:num_sentences]
    best_sorted = sorted(best, key=lambda s: s.start)  # restore original sentence order
return " ".join(s.text.strip() for s in best_sorted)
def dedupe_by_substring(phrases):
# Remove phrases that are substrings of others. Used in keyphrase extraction.
filtered = []
for ph, sc in phrases:
# if any already-kept phrase contains this one, skip it
if any(ph in kept for kept, _ in filtered):
continue
        # if this phrase contains any already-kept shorter phrase, remove that shorter phrase
        filtered = [(k, s) for k, s in filtered if k not in ph]
filtered.append((ph, sc))
return filtered
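# Example: dedupe_by_substring([("graph neural network", 0.91), ("neural network", 0.84)])
# keeps only ("graph neural network", 0.91), regardless of input order.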
def dedupe_by_embedding(phrases, threshold: float = 0.1):
    # Remove phrases that are too similar to others (cosine distance < threshold). Used in keyphrase extraction.
    if len(phrases) < 2:
        return list(phrases)  # AgglomerativeClustering needs at least two samples
    texts = [ph for ph, _ in phrases]
    embs = embed_model.encode(texts, normalize_embeddings=True)
# Cluster by cosine distance
clustering = AgglomerativeClustering(
n_clusters=None,
metric="cosine",
linkage="average",
distance_threshold=threshold
).fit(embs)
clusters = {}
for (ph, sc), lbl in zip(phrases, clustering.labels_):
clusters.setdefault(lbl, []).append((ph, sc))
# Pick top scoring phrase per cluster
result = [max(members, key=lambda x: x[1]) for members in clusters.values()]
return sorted(result, key=lambda x: x[1], reverse=True)
def extract_entities(text: str, top_n: int = 20):
# Use Specter model via KeyBERT to extract keyphrases
raw_phrases = kw_model.extract_keywords(
text,
keyphrase_ngram_range=(1, 3),
stop_words="english",
top_n=top_n
)
# Remove duplicates and too-similar phrases
subphrases = dedupe_by_substring(raw_phrases)
deduped = dedupe_by_embedding(subphrases)
return [(ph, "KEYPHRASE") for ph, _ in deduped[:10]]
def summarize_abstracts_llm(
abstracts: List[str],
model: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
temperature: float = 0.7,
max_tokens: int = 500
) -> str:
# Cross-paper summary using Qwen model
prompt = (
f"These are the abstracts of {len(abstracts)} papers. "
"Produce a cross-paper summary that summarizes all the key points across each paper. Keep it to 5-6 sentences.\n\n"
)
for i, abs_text in enumerate(abstracts, start=1):
prompt += f"Paper {i} abstract:\n{abs_text}\n\n"
resp = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful academic research assistant."},
{"role": "user", "content": prompt}
],
temperature=temperature,
max_tokens=max_tokens,
)
return resp.choices[0].message.content.strip()
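# Illustrative usage (assumes the DeepInfra client above is configured with a valid key):
#     abstracts = [p["abstract"] for p in fetch_arxiv("graph neural networks") if p["abstract"]]
#     print(summarize_abstracts_llm(abstracts))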
def build_global_concept_map(papers):
    # Global concept map of scraped papers
    # Map each phrase to the titles of the papers it appears in (for node tooltips);
    # keep the per-paper phrase sets so the co-occurrence edges below can reuse them
    phrase_to_titles = {}
    per_paper_sets = []
    for p in papers:
        ents = extract_entities(p["abstract"])
        phrases = {e for e, _ in ents}
        per_paper_sets.append(phrases)
        for ph in phrases:
            phrase_to_titles.setdefault(ph, []).append(p["title"])
freq = Counter()
for ph, titles in phrase_to_titles.items():
freq[ph] = len(titles)
net = Network(height="600px", width="100%")
id_map = {ph: idx for idx, ph in enumerate(freq, start=1)}
for ph, count in freq.items():
titles = phrase_to_titles.get(ph, [])
tooltip = "<br>".join(titles)
net.add_node(
id_map[ph],
label=ph,
title=tooltip,
size=10 + 2 * count
)
    # Count co-occurrences using the per-paper phrase sets extracted above
    cooc = Counter()
for phrases in per_paper_sets:
for a, b in combinations(sorted(phrases), 2):
cooc[(a, b)] += 1
for (a, b), c in cooc.items():
net.add_edge(id_map[a], id_map[b], value=c)
net.set_options("""
{
"physics": {
"solver": "repulsion",
"repulsion": {
"nodeDistance": 250,
"springLength": 200,
"damping": 0.5
}
}
}
""")
return net
def build_concept_map(phrases, sim_threshold: float = 0.85) -> Network:
    # Concept map for a single paper's keyphrases; phrases are linked when their
    # cosine similarity is at least sim_threshold (default 0.85).
net = Network(height="600px", width="100%")
id_map = {}
texts = [ph for ph, _ in phrases]
for idx, (ph, lbl) in enumerate(phrases, start=1):
id_map[ph] = idx
net.add_node(idx, label=ph, title=lbl)
embeddings = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
for i, j in combinations(range(len(texts)), 2):
sim = float(np.dot(embeddings[i], embeddings[j])) # since normalized, dot=cosine
print(f"sim({texts[i]}, {texts[j]}) = {sim:.3f}")
if sim >= sim_threshold:
net.add_edge(id_map[texts[i]], id_map[texts[j]], value=sim)
net.set_options("""
{
"physics": {
"solver": "repulsion",
"repulsion": {
"nodeDistance": 200,
"springLength": 200,
"damping": 0.5
}
}
}
""")
return net
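# Illustrative end-to-end sketch (not part of the original file): fetch a few arXiv papers
# and write the global concept map to HTML. The query and filename are placeholders.
if __name__ == "__main__":
    papers = [p for p in fetch_arxiv("graph neural networks", max_results=3) if p["abstract"]]
    net = build_global_concept_map(papers)
    net.save_graph("global_concept_map.html")  # pyvis writes a standalone HTML file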