Spaces:
Sleeping
Sleeping
| # %% | |
| import rdflib | |
| import pandas as pd | |
| def get_graph(): | |
| # File with the graph: MGCONSO.RRF | |
| df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0) | |
| # Rename the column '#CUI' to 'CUI' | |
| df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True) | |
| # Remove the last column, it's empty | |
| df_concepts = df_concepts.iloc[:, :-1] | |
| print(df_concepts.head()) | |
| # Create a graph | |
| g = rdflib.Graph() | |
| # Bind the namespace | |
| g.bind("medgen", "http://identifiers.org/medgen/") | |
| # Iterate over the rows | |
| for i, row in df_concepts.iterrows(): | |
| if row.SUPPRESS == "Y": | |
| continue | |
| if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P": | |
| # Create the URI | |
| uri = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI}") | |
| # Add the triple | |
| g.add((uri, rdflib.RDFS.label, rdflib.Literal(row.STR))) | |
| # Now, load MGREL.RRF | |
| df_relations = pd.read_csv("MGREL.RRF", sep="|", header=0) | |
| # Rename the column '#CUI1' to 'CUI1' | |
| df_relations.rename(columns={"#CUI1": "CUI1"}, inplace=True) | |
| # Remove the last column, it's empty | |
| df_relations = df_relations.iloc[:, :-1] | |
| print(df_relations.head()) | |
| # Iterate over the rows | |
| for i, row in df_relations.iterrows(): | |
| if row.SUPPRESS == "Y": | |
| continue | |
| # Create the URI | |
| uri1 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI1}") | |
| uri2 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI2}") | |
| # Add the triple | |
| if row.REL == "RL": | |
| g.add((uri1, rdflib.URIRef("related"), uri2)) | |
| continue | |
| g.add((uri1, rdflib.URIRef(f"http://identifiers.org/medgen/{row.REL}"), uri2)) | |
| return g | |
| def apply_rules_to_graph(g): | |
| # Now, apply this rule: if two nodes have the same parent (i.e. node1 RB node2 and node3 RB node2, then node1 related node3) | |
| # Query the graph to get the parents of each node | |
| query = """ | |
| PREFIX medgen: <http://identifiers.org/medgen/> | |
| SELECT DISTINCT ?parent ?child1 ?child2 WHERE { | |
| ?parent medgen:RN ?child1 . | |
| ?parent medgen:RN ?child2 . | |
| FILTER (?child1 != ?child2) | |
| } | |
| """ | |
| res = g.query(query) | |
| for row in res: | |
| g.add((row.child1, rdflib.URIRef("related"), row.child2)) | |
| g.add((row.child2, rdflib.URIRef("related"), row.child1)) | |
| return g | |
| def get_labels_of_entities(): | |
| """ | |
| Returns a dictionary with the labels of the entities | |
| """ | |
| # File with the graph: MGCONSO.RRF | |
| df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0) | |
| # Rename the column '#CUI' to 'CUI' | |
| df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True) | |
| # Remove the last column, it's empty | |
| df_concepts = df_concepts.iloc[:, :-1] | |
| # Create a dictionary | |
| labels_of_entities = {} | |
| # Iterate over the rows | |
| for i, row in df_concepts.iterrows(): | |
| if row.SUPPRESS == "Y": | |
| continue | |
| if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P": | |
| labels_of_entities[f"http://identifiers.org/medgen/{row.CUI}"] = row.STR | |
| return labels_of_entities | |
| def generate_triples_file(graph: rdflib.Graph): | |
| with open("triples_medgen.tsv", "w") as f: | |
| # Output the triples ?s ?p ?o | |
| for s, p, o in graph.triples((None, rdflib.URIRef("related"), None)): | |
| f.write(f"{s}\t{p}\t{o}\n") | |
| for s, p, o in graph.triples( | |
| (None, rdflib.URIRef("http://identifiers.org/medgen/RN"), None) | |
| ): | |
| f.write(f"{s}\t{p}\t{o}\n") | |
| for s, p, o in graph.triples( | |
| (None, rdflib.URIRef("http://identifiers.org/medgen/RB"), None) | |
| ): | |
| f.write(f"{s}\t{p}\t{o}\n") | |
| for s, p, o in graph.triples((None, rdflib.URIRef("http://identifiers.org/medgen/PAR"), None)): | |
| f.write(f"{s}\t{p}\t{o}\n") | |
| for s, p, o in graph.triples((None, rdflib.URIRef("http://identifiers.org/medgen/CHD"), None)): | |
| f.write(f"{s}\t{p}\t{o}\n") | |
| def save_adjacency_matrix(): | |
| # Load the triples file generated | |
| df = pd.read_csv("triples_medgen.tsv", sep="\t", header=None) | |
| # Now output the adjacency matrix, where the rows are the subjects and the columns are the objects | |
| # The values are the relations (i.e. 0 if no relation and 1 if there is a relation) | |
| # Get the unique subjects and objects | |
| subjects = df[0].unique() | |
| objects = df[2].unique() | |
| # Create the adjacency matrix | |
| adj_matrix = pd.DataFrame(0, index=subjects, columns=objects) | |
| # Iterate over the rows | |
| for i, row in df.iterrows(): | |
| adj_matrix.loc[row[0], row[2]] = 1 | |
| # Save the adjacency matrix | |
| adj_matrix.to_csv("adjacency_matrix.mat", sep="\t") | |
| # %% | |
| g = get_graph() | |
| # %% | |
| g = apply_rules_to_graph(g) | |
| # %% | |
| labels_of_entities = get_labels_of_entities() | |
| # %% | |
| generate_triples_file(g) | |
| # %% | |
| from pykeen.triples import TriplesFactory | |
| from pykeen.models import TuckER, TransE, TransH | |
| from pykeen.pipeline import pipeline | |
| tf = TriplesFactory.from_path("triples_medgen.tsv") | |
| print(f"Triples count: {tf.num_triples}") | |
| training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=42, randomize_cleanup=False) | |
| result = pipeline( | |
| training=training, | |
| testing=testing, | |
| validation=validation, | |
| model=TransE, | |
| stopper="early", | |
| epochs=500, # short epochs for testing - you should go | |
| # higher, especially with early stopper enabled | |
| ) | |
| result.save_to_directory("doctests/test_unstratified_stopped_complex") | |
| # %% | |
| import torch | |
| alzheimers = "http://identifiers.org/medgen/C1843013" | |
| # What does the model predict for Alzheimer's disease? | |
| model = result.model | |
| alzheimers_id = tf.entity_to_id[alzheimers] | |
| relation_id = tf.relation_to_id["related"] | |
| batch_to_predict = torch.tensor([[alzheimers_id, relation_id]]) | |
| alzheimers_pred = model.predict_t(hr_batch=batch_to_predict) | |
| print(alzheimers_pred.shape) | |
| # Get the indices of the top 10 predictions | |
| top10 = torch.topk(alzheimers_pred, 10, largest=True) | |
| # Get the entities | |
| entities = tf.entity_id_to_label | |
| print(top10.indices) | |
| for i in top10.indices[0]: | |
| # Ask the graph, what is the label for this entity? | |
| query = f""" | |
| PREFIX medgen: <http://identifiers.org/medgen/> | |
| SELECT ?label WHERE {{ | |
| <{entities[i.item()]}> <http://www.w3.org/2000/01/rdf-schema#label> ?label | |
| }} | |
| """ | |
| res = g.query(query) | |
| for i, row in enumerate(res): | |
| print(f"{i}: {row}") | |
| # %% | |
| from pykeen.nn.representation import Embedding | |
| # Get the embeddings of all the entities | |
| entity_ids = torch.LongTensor(list(tf.entity_to_id.values())).cuda() | |
| entity_embeddings: Embedding = model.entity_representations[0]._embeddings(entity_ids) | |
| # Get the embeddings of the relations | |
| relation_ids = torch.LongTensor(list(tf.relation_to_id.values())).cuda() | |
| relation_embeddings: Embedding = model.relation_representations[0]._embeddings( | |
| relation_ids | |
| ) | |
| print(f"Entity embeddings shape: {entity_embeddings.shape}") | |
| print(f"Relation embeddings shape: {relation_embeddings.shape}") | |
| # Store the embeddings in a DataFrame | |
| df = pd.DataFrame( | |
| { | |
| "embedding": entity_embeddings.detach().cpu().tolist(), | |
| "label": [ | |
| labels_of_entities[tf.entity_id_to_label[i]] if tf.entity_id_to_label[i] in labels_of_entities else "" | |
| for i in range(len(tf.entity_id_to_label)) | |
| ], | |
| "uri": [ | |
| f"{tf.entity_id_to_label[i]}" for i in range(len(tf.entity_id_to_label)) | |
| ], | |
| }, | |
| index=range(len(entity_embeddings)), | |
| ) | |
| ## Save the DataFrame | |
| df.to_csv("entity_embeddings.csv") | |
| # Store the embeddings in a DataFrame | |
| df = pd.DataFrame( | |
| { | |
| "embedding": relation_embeddings.detach().cpu().tolist(), | |
| "label": [ | |
| tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label)) | |
| ], | |
| "uri": [ | |
| f"{tf.relation_id_to_label[i]}" for i in range(len(tf.relation_id_to_label)) | |
| ], | |
| }, | |
| index=range(len(relation_embeddings)), | |
| ) | |
| ## Save the DataFrame | |
| df.to_csv("relation_embeddings.csv") | |
| # %% | |
| import pyobo | |
| pyobo.get_name("mesh", "16793") | |
| # %% | |