Spaces:

klinic-hackupc
/

klinic

Sleeping

klinic / get_embeddings_of_disease_descriptions.py

acmc

First commit in HuggingFace

93e1b64 over 1 year ago

2.05 kB

	# %%
	import pandas as pd

	# Load the disease descriptions from MGDEF.RRF.
	# The separator is the regex `\|` (a literal pipe). It is written as a raw
	# string because "\|" in a normal string literal is an invalid escape sequence,
	# and the Python parser engine is requested explicitly because regex separators
	# are only supported by that engine.
	df_disease_descriptions = pd.read_csv(
	    "MGDEF.RRF", sep=r"\|", header=0, engine="python"
	)
	# Rename the columns '#CUI' -> 'CUI' and 'DEF' -> 'definition'
	df_disease_descriptions.rename(
	    columns={"#CUI": "CUI", "DEF": "definition"}, inplace=True
	)
	# Remove the last column, it's empty
	df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]
	# Filter out the rows where the SUPPRESS field is equal to 'Y'
	df_disease_descriptions = df_disease_descriptions[
	    df_disease_descriptions["SUPPRESS"] != "Y"
	]
	# Some of the rows include a \n character, so keep only the rows whose CUI
	# starts with 'C' and contains no spaces. na=False makes a missing CUI count as
	# invalid instead of putting NaN into the boolean mask.
	df_disease_descriptions = df_disease_descriptions[
	    df_disease_descriptions["CUI"].str.startswith("C", na=False)
	    & ~df_disease_descriptions["CUI"].str.contains(" ", regex=False, na=False)
	]
	# Remove the rows where the definition field is empty
	df_disease_descriptions = df_disease_descriptions[
	    df_disease_descriptions["definition"].notnull()
	]
	# Build the identifiers.org URI of every remaining CUI (vectorised string concat)
	df_disease_descriptions["uri"] = (
	    "http://identifiers.org/medgen/" + df_disease_descriptions["CUI"]
	)
	# Drop the columns that are not needed (source, SUPPRESS, CUI)
	df_disease_descriptions.drop(columns=["source", "SUPPRESS", "CUI"], inplace=True)

	# Drop the descriptions that are duplicates
	df_disease_descriptions.drop_duplicates(subset=["definition"], inplace=True)

	# Reset the index
	df_disease_descriptions.reset_index(drop=True, inplace=True)

	# %%
	from sentence_transformers import SentenceTransformer

	# Embed every definition with the "allenai-specter" sentence-transformers model
	encoder = SentenceTransformer("allenai-specter")
	# encode() is documented to take a str or a list of str, so hand it a plain
	# list; this way the call does not depend on the Series having a 0..n-1 index.
	vectors = encoder.encode(
	    df_disease_descriptions["definition"].tolist(),
	    show_progress_bar=True,
	    batch_size=64,
	)
	# Bare expression: shown as the cell output when run interactively
	vectors.shape

	# %%
	import numpy as np

	# Cast the vectors to float32 (only same-kind casts are permitted), then store
	# them in the 'embeddings' column as nested Python lists.
	# NOTE(review): assumes `vectors` is a 2-D array with one row per definition.
	embeddings_f32 = vectors.astype("float32", casting="same_kind")
	df_disease_descriptions["embeddings"] = embeddings_f32.tolist()
	# %%
	# Persist the table as a CSV file: header row written, index column omitted
	df_disease_descriptions.to_csv(
	    path_or_buf="disease_descriptions_with_embeddings.csv",
	    header=True,
	    index=False,
	)

	# %%