# Spaces:
# Sleeping
# Sleeping
# %%
import pandas as pd

# Build the table of MedGen disease definitions from MGDEF.RRF.
# The result keeps two columns, `definition` and `uri`, with one row per
# distinct definition text and a fresh 0..n-1 index.
# The whole pipeline is a single method chain, so every step works on a new
# frame and no column is ever assigned onto a filtered slice.
df_disease_descriptions = (
    # Load the disease descriptions (pipe-delimited, header on the first line).
    pd.read_csv("MGDEF.RRF", sep="|", header=0)
    # Remove the last column, it's empty.
    .iloc[:, :-1]
    # Rename '#CUI' to 'CUI' and 'DEF' to 'definition'.
    .rename(columns={"#CUI": "CUI", "DEF": "definition"})
    # Filter out the rows where the SUPPRESS field is equal to 'Y'.
    .loc[lambda df: df["SUPPRESS"] != "Y"]
    # Some of the rows include a '\n' character, so keep only the rows whose
    # CUI starts with 'C' and contains no spaces. `na=False` makes a missing
    # CUI count as "no match" instead of putting NaN into the boolean mask,
    # which would make the `~` / indexing step raise.
    .loc[
        lambda df: df["CUI"].str.startswith("C", na=False)
        & ~df["CUI"].str.contains(" ", regex=False, na=False)
    ]
    # Remove the rows where the definition is empty.
    .dropna(subset=["definition"])
    # Derive the identifiers.org URI from the CUI (vectorised concatenation).
    .assign(uri=lambda df: "http://identifiers.org/medgen/" + df["CUI"])
    # Drop the columns that are not needed any more.
    .drop(columns=["source", "SUPPRESS", "CUI"])
    # Drop the descriptions that are duplicates (the first occurrence is kept).
    .drop_duplicates(subset=["definition"])
    # Reset the index so it is a clean 0..n-1 range again.
    .reset_index(drop=True)
)
# %%
from sentence_transformers import SentenceTransformer

# Embed every definition with the "allenai-specter" sentence-transformers model.
encoder = SentenceTransformer("allenai-specter")

# Hand the encoder a plain list of strings rather than the pandas Series, so
# the call does not rely on the Series behaving like a list (i.e. on its index
# labels happening to be 0..n-1).
vectors = encoder.encode(
    df_disease_descriptions["definition"].tolist(),
    show_progress_bar=True,
    batch_size=64,
)
# Notebook-style display of the shape of the encoded result.
vectors.shape
# %%
import numpy as np

# Attach the embeddings to the table, one plain Python list per row.
# The vectors are cast to float32 first; casting="same_kind" only permits
# casts within the same kind (e.g. float64 -> float32) and raises otherwise.
df_disease_descriptions["embeddings"] = (
    vectors.astype(dtype=np.float32, casting="same_kind").tolist()
)
# %%
# Persist the final table as a CSV file: header row included, row index omitted.
df_disease_descriptions.to_csv(
    path_or_buf="disease_descriptions_with_embeddings.csv",
    header=True,
    index=False,
)
# %%