Spaces:
Runtime error
Runtime error
| # %% | |
| import gradio as gr | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import rdflib | |
| import seaborn as sns | |
| import tensorflow as tf | |
| from adjustText import adjust_text | |
| from ampligraph.latent_features import ScoringBasedEmbeddingModel | |
| from ampligraph.utils import restore_model | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| import logging | |
| logger = logging.getLogger(__name__) | |
# Load the knowledge graph, timing the parse with TensorFlow timestamps
start_time = tf.timestamp()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g = rdflib.Graph()
g.bind("unis", unis)
g.parse("universities.ttl", format="turtle")
end_time = tf.timestamp()
graph_load_seconds = end_time - start_time
logger.info("Graph loaded in {} seconds".format(graph_load_seconds))

# Load the embedding model, timed the same way.
# (Unused alternative kept for reference: model = restore_model("model.pkl"))
start_time = tf.timestamp()
model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
model.load_metadata("model/model")
model.build_full_model()
# Resolve load_weights starting *after* ScoringBasedEmbeddingModel in the MRO,
# i.e. use the parent implementation rather than the class's own method
super(ScoringBasedEmbeddingModel, model).load_weights("model/")
end_time = tf.timestamp()
model_load_seconds = end_time - start_time
logger.info("Model loaded in {} seconds".format(model_load_seconds))
def separate_concepts(concepts):
    """
    Split a comma-separated string of concept URIs into a list of URIs.

    Each entry is stripped of surrounding whitespace. Entries that are empty
    after stripping (blank input, doubled or trailing commas) are dropped so
    that callers never receive an empty URI.
    """
    stripped = (part.strip() for part in concepts.split(","))
    return [part for part in stripped if part]
def pca(embeddings):
    """
    Fit a 2-component PCA on the embeddings and return their 2D projection
    """
    reducer = PCA(n_components=2)
    # fit() returns the fitted estimator, so the calls can be chained
    return reducer.fit(embeddings).transform(embeddings)
def cluster(embeddings, n_clusters=6, random_state=0):
    """
    Cluster the embeddings with KMeans and return one cluster label per row.

    n_clusters is the requested number of clusters (default 6); it is capped
    at the number of rows because KMeans refuses more clusters than samples.
    random_state seeds KMeans so that repeated calls give the same labels.
    """
    n_clusters = min(n_clusters, len(embeddings))
    clustering_algorithm = KMeans(
        n_clusters=n_clusters, n_init=50, max_iter=500, random_state=random_state
    )
    return clustering_algorithm.fit_predict(embeddings)
def get_concept_name(concept_uri):
    """
    Get the name of the concept from the URI

    Looks up the <urn:acmcmc:unis:name> value of the concept in the graph `g`
    and returns the first match. Raises ValueError when the graph holds no
    name for the URI.
    """
    # Bind the URI as a query variable instead of pasting it into the query
    # text: the value comes from a user-editable textbox, so interpolating it
    # would let a crafted string change the query.
    results = g.query(
        """SELECT ?name
        WHERE {
            ?concept <urn:acmcmc:unis:name> ?name .
        }""",
        initBindings={"concept": rdflib.URIRef(concept_uri)},
    )
    for row in results:
        return row[0]
    raise ValueError(f"No name found in the graph for concept <{concept_uri}>")
def get_similarities_to_node(array_of_triples, model):
    """
    Score a batch of triples with the given model

    The triples are first translated to the model's internal indexes via
    model.get_indexes, then the model is called on those indexes. Whatever
    the model call returns (one score per triple is expected) is returned
    unchanged.
    """
    triple_indexes = model.get_indexes(array_of_triples)
    return model(triple_indexes)
def process_user_input_concept(concept_chooser):
    """
    The user input is a comma-separated list of concept URIs. Score every
    institution against each concept and return the ranking.

    Returns a 3-tuple: the results table (institution id, one similarity
    column per concept, the mean similarity, and the institution name, sorted
    by mean similarity descending) followed by two updates that make the
    components wired as outputs visible.

    Raises gr.Error when no concept URI was entered.
    """
    # Drop blanks and repeated URIs while keeping the order the user typed;
    # repeats would otherwise produce duplicated columns in the table.
    chosen_concepts = list(
        dict.fromkeys(c for c in separate_concepts(concept_chooser) if c)
    )
    if not chosen_concepts:
        raise gr.Error("Please enter at least one concept URI.")
    chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]

    # ndmin=2 keeps the array two-dimensional even for a single-row file
    all_ids_institutions = np.loadtxt(
        "institutions.csv",
        delimiter=",",
        skiprows=1,
        dtype=str,
        quotechar='"',
        ndmin=2,
    )
    # Remove duplicates based on the first column
    is_duplicate = pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
    all_ids_institutions = all_ids_institutions[~is_duplicate]

    # Subject and predicate are the same for every concept; only the object changes
    s = all_ids_institutions[:, 0]
    p = np.full(len(s), "urn:acmcmc:unis:institution_related_to_concept")

    all_similarities = {}
    for concept in chosen_concepts:
        o = np.full(len(s), concept)
        array_of_triples = np.column_stack([s, p, o])
        scores = get_similarities_to_node(array_of_triples, model)
        # Flatten to a plain 1-D array so it can be used directly as a column
        all_similarities[concept] = np.asarray(scores).flatten()

    # Now, average the similarities
    mean_scores = np.mean(np.stack(list(all_similarities.values()), axis=0), axis=0)

    # Build the columns directly in display order: id, per-concept
    # similarities, mean similarity, institution name
    columns = {"institution": s}
    for concept, name in zip(chosen_concepts, chosen_concepts_names):
        columns[f"similarity_to_{name}"] = all_similarities[concept]
    columns["mean_similarity"] = mean_scores.flatten()
    columns["institution_name"] = all_ids_institutions[:, 1]
    table_df = pd.DataFrame(columns)

    # Sort by mean similarity
    table_df = table_df.sort_values(by=["mean_similarity"], ascending=False)
    return (
        table_df,
        gr.update(visible=True),
        gr.update(visible=True),
    )
def calculate_emdeddings_and_pca(table):
    """
    Build the 2D plotting data for the institutions listed in the table

    Fetches the model embeddings of every id in table["institution"], projects
    them with pca() and labels them with cluster(). Returns a dataframe with
    the columns embedding_x, embedding_y and cluster (labels such as
    "cluster0"), in the same row order as the table. Toast messages are shown
    before and after the computation.
    """
    gr.Info("Performing PCA and clustering...")

    institution_ids = np.array(table["institution"])
    embeddings_of_institutions = model.get_embeddings(entities=institution_ids)

    # 2D projection first, then clustering on the full-dimensional embeddings
    projected = pca(embeddings_of_institutions)
    labels = cluster(embeddings_of_institutions)
    cluster_names = "cluster" + pd.Series(labels).astype(str)

    plot_df = pd.DataFrame(
        {
            "embedding_x": projected[:, 0],
            "embedding_y": projected[:, 1],
            "cluster": cluster_names,
        }
    )

    # Toast message
    gr.Info("PCA and clustering done!")
    return plot_df
def click_on_institution(table, embeddings_var, evt: gr.SelectData):
    """
    Redraw the embeddings plot with the clicked institution highlighted

    table is the institutions dataframe, embeddings_var the state dict that
    holds the plotted data under "embeddings_df", and evt the selection event
    (evt.index[0] is the clicked row).

    Returns the new figure, or None when there is nothing to draw: the
    embeddings have not been computed yet, they no longer match the table, or
    the plot data could not be assembled.
    """
    embeddings_df = (embeddings_var or {}).get("embeddings_df")
    if embeddings_df is None:
        # "Plot embeddings" has not been run yet, so there is no plot to update
        return None
    if len(embeddings_df) != len(table):
        # The stored embeddings belong to a different table; rows would not line up
        return None

    # Positional lookup: evt.index[0] is a row position, not an index label
    institution_id = table["institution"].iloc[evt.index[0]]
    try:
        plot_df = pd.DataFrame(
            {
                "institution": table["institution"].values,
                "institution_name": table["institution_name"].values,
                "embedding_x": embeddings_df["embedding_x"].values,
                "embedding_y": embeddings_df["embedding_y"].values,
                "cluster": embeddings_df["cluster"].values,
            }
        )
        return plot_embeddings(plot_df, institution_id)
    except (KeyError, ValueError):
        # Highlighting is best-effort: keep the app responsive, but leave a trace
        logger.exception("Could not highlight institution %s", institution_id)
        return None
def click_on_show_plot(table):
    """
    Compute the embeddings plot for every institution in the table

    Returns the figure (with no institution highlighted) and the state dict
    {"embeddings_df": plot_df} so later clicks can redraw the plot without
    recomputing PCA and clustering.
    """
    embeddings_df = calculate_emdeddings_and_pca(table)
    plot_df = pd.DataFrame(
        {
            "institution": table["institution"].values,
            # The table column and the key read by plot_embeddings are both
            # spelled "institution_name"
            "institution_name": table["institution_name"].values,
            "embedding_x": embeddings_df["embedding_x"].values,
            "embedding_y": embeddings_df["embedding_y"].values,
            "cluster": embeddings_df["cluster"].values,
        }
    )
    fig = plot_embeddings(plot_df, None)
    return fig, {"embeddings_df": plot_df}
def plot_embeddings(plot_df, institution_id):
    """
    Draw the 2D embeddings as a scatter plot coloured by cluster

    plot_df needs the columns institution, institution_name, embedding_x,
    embedding_y and cluster. When institution_id matches a row, that point is
    marked with a black cross and labelled with the institution name; pass
    None to draw without a highlight.

    Returns the matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(
        data=plot_df,
        x="embedding_x",
        y="embedding_y",
        hue="cluster",
        ax=ax,
    )
    row_of_institution = plot_df[plot_df["institution"] == institution_id]
    if not row_of_institution.empty:
        # Pass plain scalars to matplotlib rather than one-element Series
        selected = row_of_institution.iloc[0]
        x, y = selected["embedding_x"], selected["embedding_y"]
        ax.text(
            x,
            y,
            selected["institution_name"],
            horizontalalignment="left",
            size="medium",
            color="black",
            weight="normal",
        )
        # Also draw a point for the institution
        ax.scatter(x, y, color="black", s=100, marker="x")
    # Deregister the figure from pyplot so figures do not pile up across
    # requests; the Figure object itself can still be rendered.
    plt.close(fig)
    return fig
def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
    """
    Get the authors of an institution

    For the institution in the clicked row (evt.index[0]) and every concept
    URI in concept_chooser, counts the articles each author wrote in that
    institution about that concept, then sums the counts per author.

    Returns the dataframe (columns author, name, num_articles, sorted by
    num_articles descending; empty when nothing matches) and an update that
    makes a component visible.
    """
    # Positional lookup: evt.index[0] is a row position, not an index label
    institution = institutions_table["institution"].iloc[evt.index[0]]
    concepts = [c for c in separate_concepts(concept_chooser) if c]

    # The institution and concept are bound as variables rather than pasted
    # into the query text, since the concept comes from a user-editable textbox.
    query = """SELECT ?author ?name (COUNT (?article) AS ?num_articles)
        WHERE {
            ?author a <urn:acmcmc:unis:Author> .
            ?author <urn:acmcmc:unis:name> ?name .
            ?article <urn:acmcmc:unis:written_in_institution> ?institution .
            ?article <urn:acmcmc:unis:has_author> ?author .
            ?article <urn:acmcmc:unis:related_to_concept> ?concept .
        }
        GROUP BY ?author ?name
        ORDER BY DESC(COUNT (?article))
        """
    rows = []
    for concept in concepts:
        result = g.query(
            query,
            initBindings={
                "institution": rdflib.URIRef(institution),
                "concept": rdflib.URIRef(concept),
            },
        )
        # Convert RDF terms to plain Python values so the counts sum as numbers
        rows.extend(
            (str(author), str(name), int(num_articles))
            for author, name, num_articles in result
        )

    # Passing the column names up front keeps this valid when there are no rows
    results_df = pd.DataFrame(rows, columns=["author", "name", "num_articles"])
    # Now, aggregate the results into a single dataframe by summing the number of articles
    results_df = results_df.groupby(["author", "name"], as_index=False)[
        "num_articles"
    ].sum()
    # Sort by number of articles
    results_df = results_df.sort_values(by=["num_articles"], ascending=False)
    return results_df, gr.update(visible=True)
# %%
# User interface: a concept search box, the ranked institutions table, an
# optional embeddings plot, and a per-institution authors table.
theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")
with gr.Blocks(theme=theme) as demo:
    # Holds {"embeddings_df": <dataframe>} once the embeddings have been plotted
    embeddings_df = gr.State({})

    # App title and description
    title = gr.Markdown(
        """
        # Universities Explorer
        This app allows you to explore the institutions more closely related to a concept.
        It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
        """
    )

    with gr.Group() as institution_search:
        concept_chooser = gr.Textbox(
            label="Concept URI",
            info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
        )
        concept_name_label = gr.Markdown("Concept name: ", visible=False)

        # Table for name of institution and similarity to concept
        btn_search_institutions = gr.Button("Search institutions", variant="primary")
        table = gr.Dataframe(
            interactive=False, visible=False, elem_classes="institutions", wrap=True
        )
        # Reveal the (still empty) table as soon as the search starts
        btn_search_institutions.click(
            lambda: gr.update(visible=True), outputs=[table], queue=True
        )

        btn_plot_embeddings = gr.Button(
            "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
        )
        # Description of what plot embeddings does
        plot_embeddings_info = gr.Markdown(
            """
            This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.
            Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
            """,
            visible=False,
        )
        # Fill the table and reveal the plotting controls
        btn_search_institutions.click(
            process_user_input_concept,
            inputs=[concept_chooser],
            outputs=[
                table,
                btn_plot_embeddings,
                plot_embeddings_info,
            ],
            queue=True,
        )

        plot = gr.Plot(visible=False, elem_classes="embeddings")
        btn_plot_embeddings.click(
            lambda: gr.update(visible=True), outputs=[plot], queue=True
        )
        btn_plot_embeddings.click(
            click_on_show_plot,
            inputs=[table],
            outputs=[plot, embeddings_df],
            queue=True,
        )

    # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
    with gr.Group(visible=False, elem_classes="authors") as authors:
        table_authors = gr.Dataframe(
            interactive=False, label="Authors in institution writing about concept"
        )
    # The handler returns two values: the authors table and a visibility
    # update, which is routed to the initially hidden `authors` group.
    table.select(
        get_authors_of_institution,
        inputs=[table, concept_chooser],
        outputs=[table_authors, authors],
    )
    table.select(
        click_on_institution,
        inputs=[table, embeddings_df],
        outputs=[plot],
    )

    btn_clear = gr.ClearButton(components=[table, plot, table_authors])

    # Author information
    author_info = gr.Markdown(
        """
        This demo has been built by [Aldan Creo](
        https://acmc-website.web.app/).
        """
    )

demo.queue()
demo.launch()