Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

acmc committed on May 4, 2024

Commit

93e1b64

0 Parent(s):

First commit in HuggingFace

Browse files

Files changed (26) hide show

.DS_Store +0 -0
.assets/architecture.png +0 -0
.gitattributes +2 -0
.gitignore +166 -0
MGCONSO.RRF +3 -0
MGDEF.RRF +3 -0
MGREL.RRF +3 -0
README.md +27 -0
app.py +85 -0
calculate_smilar_nodes.py +63 -0
clinical_trials_diseases.csv +3 -0
clinical_trials_embeddings.csv +3 -0
clinical_trials_embeddings.ipynb +1714 -0
database.ipynb +265 -0
disease_descriptions_with_embeddings.csv +3 -0
docker-compose.yml +11 -0
entity_embeddings.csv +3 -0
get_embeddings_of_disease_descriptions.py +58 -0
graph.py +234 -0
graph_analysis.m +49 -0
graph_visualization.mlapp +0 -0
main.ipynb +417 -0
mock_trial.json +305 -0
relation_embeddings.csv +3 -0
requirements.txt +12 -0
utils.py +107 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.assets/architecture.png ADDED Viewed

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.csv filter=lfs diff=lfs merge=lfs -text
2	+ *.RRF filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,166 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.tsv
+doctests/
+file_db/
+clinical_trials.csv

MGCONSO.RRF ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe1722c72d4e2eef72f4b14468ce1fb05bd22952b20cda7249bce4d17ef0607f
+size 74127292

MGDEF.RRF ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb47c3f4e1731560571844b9520747987352d24baf37b8907aefe817383d9e18
+size 17450055

MGREL.RRF ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ae7bd453b114dad737adbc65aa23028aaf02370d6c7d37877251629e8162ae0
+size 94519907

README.md ADDED Viewed

	@@ -0,0 +1,27 @@

+---
+title: Klìnic
+emoji: 👍🏻
+colorFrom: pink
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.34.0
+app_file: app.py
+pinned: false
+---
+# hackupc-24
+## Architecture
+![alt text](.assets/architecture.png "Architecture")
+## Setup
+1. Start the IRIS Docker container:
+    ```Shell
+    docker-compose up
+    ```
+2. Start a Jupyter Notebook
+3. Navigate to http://localhost:52773/csp/sys/UtilHome.csp to access IRIS and login with username: `demo`, password: `demo`
+    - You can execute SQL queries at 'System Explorer' → 'SQL'
+4. Run [vector_search.ipynb](./vector_search.ipynb)
+5. Run the frontend: `streamlit run app.py`

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import json
+import os
+import numpy as np
+import pandas as pd
+import streamlit as st
+from sqlalchemy import create_engine, text
+from streamlit_agraph import agraph, Node, Edge, Config
+from utils import get_all_diseases_name, get_most_similar_diseases_from_uri, get_uri_from_name
+# Connection settings for the IRIS database.
+# NOTE(review): username, password, port and namespace are hard-coded; only the
+# hostname can be overridden, through the IRIS_HOSTNAME environment variable.
+username = 'demo'
+password = 'demo'
+hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
+port = '1972'
+namespace = 'USER'
+CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
+# The SQLAlchemy engine is created at import time from the settings above.
+engine = create_engine(CONNECTION_STRING)
+def handle_click_on_analyze_button():
+    """Placeholder for the analysis pipeline; currently does nothing.
+
+    The comments below list the planned steps. None of them is implemented
+    yet: the body is a bare ``pass``.
+    """
+    # 1. Embed the textual description that the user entered using the model ()
+    # 2. Get 5 diseases with the highest cosine similarity from the DB
+    # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
+    # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
+    # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
+    # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
+    # 7. Use an LLM to get a summary of the clinical trials, in plain text format
+    # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
+    # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
+    pass
+# Page title and the free-text disease description input.
+# The entered text is stored in description_input but not consumed yet.
+st.write("# Klìnic")
+description_input = st.text_input(label="Enter the disease description 👇")
+# Placeholder: a scatter chart of random data stands in for the disease graph.
+st.write(":red[Here should be the graph]")  # TODO remove
+chart_data = pd.DataFrame(
+    np.random.randn(20, 3), columns=["a", "b", "c"]
+)  # TODO remove
+st.scatter_chart(chart_data)  # TODO remove
+# Disease overview section; the text is still a placeholder string.
+st.write("## Disease Overview")
+disease_overview = ":red[lorem ipsum]"  # TODO
+st.write(disease_overview)
+# Clinical trials section, currently fed by mock data.
+st.write("## Clinical Trials Details")
+trials = []
+# TODO replace mock data
+with open("mock_trial.json") as f:
+    d = json.load(f)
+# The same mock trial object is appended five times.
+for i in range(0, 5):
+    trials.append(d)
+# Render one expander per trial, labelled with the trial's nctId.
+for trial in trials:
+    with st.expander(f"{trial['protocolSection']['identificationModule']['nctId']}"):
+        official_title = trial["protocolSection"]["identificationModule"][
+            "officialTitle"
+        ]
+        st.write(f"##### {official_title}")
+        brief_summary = trial["protocolSection"]["descriptionModule"]["briefSummary"]
+        st.write(brief_summary)
+        # Status table: overall status and the date it was verified.
+        status_module = {
+            "Status": trial["protocolSection"]["statusModule"]["overallStatus"],
+            "Status Date": trial["protocolSection"]["statusModule"][
+                "statusVerifiedDate"
+            ],
+        }
+        st.write("###### Status")
+        st.table(status_module)
+        # Design table: study type, allocation and enrollment count.
+        design_module = {
+            "Study Type": trial["protocolSection"]["designModule"]["studyType"],
+            # "Phases": trial["protocolSection"]["designModule"]["phases"], # breaks formatting because it is an array
+            "Allocation": trial["protocolSection"]["designModule"]["designInfo"][
+                "allocation"
+            ],
+            "Participants": trial["protocolSection"]["designModule"]["enrollmentInfo"][
+                "count"
+            ],
+        }
+        st.write("###### Design")
+        st.table(design_module)
+        # TODO more modules?

calculate_smilar_nodes.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# %%
+def transe_distance(head, tail, relation, entity_embeddings, relation_embeddings):
+    """Return the TransE residual ``head + relation - tail`` for one triple.
+
+    Despite the name, the returned value is the element-wise difference, not a
+    scalar. Callers in this file call ``.norm()`` on it to get a distance.
+
+    Args:
+        head: Key of the head entity in ``entity_embeddings``.
+        tail: Key of the tail entity in ``entity_embeddings``.
+        relation: Key of the relation in ``relation_embeddings``.
+        entity_embeddings: Indexable collection of entity embeddings.
+        relation_embeddings: Indexable collection of relation embeddings.
+
+    Returns:
+        ``entity_embeddings[head] + relation_embeddings[relation]
+        - entity_embeddings[tail]``.
+    """
+    head_embedding = entity_embeddings[head]
+    tail_embedding = entity_embeddings[tail]
+    # NOTE: this rebinds the parameter name to the single selected relation embedding.
+    relation_embeddings = relation_embeddings[relation]
+    distance = head_embedding + relation_embeddings - tail_embedding
+    return distance
+def calculate_similar_nodes(node, entity_embeddings, relation_embeddings, top_n=10):
+    distances = []
+    for i in range(len(entity_embeddings)):
+        distance = transe_distance(node, i, 0, entity_embeddings, relation_embeddings)
+        distances.append((i, distance))
+    distances.sort(key=lambda x: x[1].norm().item())
+    return distances[:top_n]
+# %%
+import pandas as pd
+# Load the embeddings from the CSV files
+entity_embeddings = pd.read_csv("entity_embeddings.csv", index_col=0)
+# The embedding column is a string, convert it to a tensor
+import torch
+# NOTE(review): eval() executes whatever expression is stored in the CSV cell.
+# Presumably the cells are plain list literals, in which case ast.literal_eval
+# would be a safer replacement — confirm the file format before changing.
+entity_embeddings["embedding"] = entity_embeddings["embedding"].apply(
+    lambda x: torch.tensor(eval(x))
+)
+entity_embeddings.head()
+# Now, load the relation embeddings
+relation_embeddings = pd.read_csv("relation_embeddings.csv", index_col=0)
+# NOTE(review): same eval() caveat as for the entity embeddings above.
+relation_embeddings["embedding"] = relation_embeddings["embedding"].apply(
+    lambda x: torch.tensor(eval(x))
+)
+# NOTE(review): display is not imported in this file; presumably it relies on
+# the IPython/Jupyter builtin (the "# %%" cell markers suggest interactive use).
+display(relation_embeddings.head())
+# %%
+# Find the index of the entity with the uri "http://identifiers.org/medgen/C0002395"
+head = entity_embeddings[
+    entity_embeddings["uri"] == "http://identifiers.org/medgen/C0002395"
+].index[0]
+# Find the index of the entity with the uri "http://identifiers.org/medgen/C1843013"
+tail = entity_embeddings[
+    entity_embeddings["uri"] == "http://identifiers.org/medgen/C1843013"
+].index[0]
+relation = 0
+# The residual between the two entities via relation 0; its norm is printed below.
+distance = transe_distance(
+    head,
+    tail,
+    relation,
+    entity_embeddings["embedding"],
+    relation_embeddings["embedding"],
+)
+print(
+    f'Distance between {entity_embeddings["label"][head]} ({head}) and {entity_embeddings["label"][tail]} ({tail}) via relation {relation_embeddings["label"][relation]} is {distance.norm().item()}'
+)
+# %%
+# Calculate similar nodes to the head
+similar_nodes = calculate_similar_nodes(head, entity_embeddings["embedding"], relation_embeddings["embedding"])
+print(f"Similar nodes to {entity_embeddings['label'][head]} ({head}):")
+# Print the similar nodes
+for i, (node, distance) in enumerate(similar_nodes):
+    print(f"{i}: {entity_embeddings['label'][node]} ({node}) with distance {distance.norm().item()}")
+# %%

clinical_trials_diseases.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2efed4939576aa55aa19b07949b58b4c3fc5022629e96f0f665581431c85b4a
+size 86845185

clinical_trials_embeddings.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6f1cce06d76a6e83695fa6e1583292bcd85b7846488cde3c43ec2067409a91c
+size 2111058864

clinical_trials_embeddings.ipynb ADDED Viewed

	@@ -0,0 +1,1714 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9db3b813-22dc-4209-86d2-42e935f5f5dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "import pandas as pd\n",
+    "import langchain\n",
+    "import os\n",
+    "import openai\n",
+    "import ast\n",
+    "from langchain import OpenAI\n",
+    "from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "from langchain.document_loaders import UnstructuredURLLoader\n",
+    "from langchain.embeddings import OpenAIEmbeddings\n",
+    "from langchain.vectorstores import FAISS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fb48dccd-37e5-484a-a2fc-c482839b9ed9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# loader = CSVLoader(file_path=\"trials/brief_summaries.csv\")\n",
+    "# data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "cdcc3107-e6a8-47ca-bd89-e381fbcf9b9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df= pd.read_csv(\"trials/brief_summaries.txt\",  delimiter=\"|\")\n",
+    "# df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c6a94c14-b197-4eff-9941-1ca52069cd5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df.head(20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "95c000ff-bf0c-4489-8643-93a238db41dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from getpass import getpass\n",
+    "\n",
+    "# Never commit API keys: read the key from the environment, or prompt for it.\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e2e03936-fcce-4287-bfe6-e31f1b69f693",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n",
+      "  warn_deprecated(\n"
+     ]
+    }
+   ],
+   "source": [
+    "llm = OpenAI(temperature=0.6, max_tokens=500)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "aede3d75-2441-44f6-b3cf-2f86f050da24",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'clinical_trials.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_trials\u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclinical_trials.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_trials\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m      3\u001b[0m df_trials\u001b[38;5;241m.\u001b[39mhead()\n",
+      "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1024\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m   1011\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m   1012\u001b[0m     dialect,\n\u001b[1;32m   1013\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1020\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m   1021\u001b[0m )\n\u001b[1;32m   1022\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1024\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:618\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    615\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    617\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 618\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    620\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    621\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+      "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1618\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1615\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1618\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1878\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1876\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1877\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1878\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1879\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1880\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1881\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1882\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1883\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1884\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1885\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1886\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1887\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   
1888\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1889\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+      "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    869\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    870\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    871\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    872\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m    874\u001b[0m \u001b[43m            \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    875\u001b[0m \u001b[43m            \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    876\u001b[0m \u001b[43m            \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    877\u001b[0m \u001b[43m            \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    878\u001b[0m \u001b[43m            
\u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    879\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'clinical_trials.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "df_trials = pd.read_csv(\"clinical_trials.csv\")\n",
+    "print(df_trials.shape)\n",
+    "df_trials.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "65589fc4-4f55-4b72-9935-c3b163fde0e2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>['marijuana abuse', 'substance-related disorde...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>['marijuana abuse', 'substance-related disorde...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>['tuberculosis', 'latent tuberculosis', 'infec...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>['heart failure', 'heart diseases', 'cardiovas...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>['lymphoma', 'neoplasms by histologic type', '...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   desease_condition\n",
+       "0  ['marijuana abuse', 'substance-related disorde...\n",
+       "1  ['marijuana abuse', 'substance-related disorde...\n",
+       "2  ['tuberculosis', 'latent tuberculosis', 'infec...\n",
+       "3  ['heart failure', 'heart diseases', 'cardiovas...\n",
+       "4  ['lymphoma', 'neoplasms by histologic type', '..."
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_trials_filtered = df_trials[[\"desease_condition\"]]\n",
+    "df_trials_filtered.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "88e28056-e340-416c-a1a9-4a6c29556dc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"['marijuana abuse', 'substance-related disorders', 'chemically-induced disorders', 'mental disorders']\""
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_trials_filtered[\"desease_condition\"].iloc[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5bd8f876-0480-40a5-a32f-ca7ec137a70f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\ariji\\AppData\\Local\\Temp\\ipykernel_22340\\16068817.py:4: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df_trials_filtered['desease_condition']= df_trials_filtered['desease_condition'].apply(list_to_string)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>tuberculosis, latent tuberculosis, infections,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>heart failure, heart diseases, cardiovascular ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>lymphoma, neoplasms by histologic type, neopla...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440512</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440513</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440514</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440515</th>\n",
+       "      <td>autistic disorder, autism spectrum disorder, c...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440516</th>\n",
+       "      <td>autistic disorder, autism spectrum disorder, c...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>440517 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        desease_condition\n",
+       "0       marijuana abuse, substance-related disorders, ...\n",
+       "1       marijuana abuse, substance-related disorders, ...\n",
+       "2       tuberculosis, latent tuberculosis, infections,...\n",
+       "3       heart failure, heart diseases, cardiovascular ...\n",
+       "4       lymphoma, neoplasms by histologic type, neopla...\n",
+       "...                                                   ...\n",
+       "440512  obesity, overweight, overnutrition, nutrition ...\n",
+       "440513  obesity, overweight, overnutrition, nutrition ...\n",
+       "440514  obesity, overweight, overnutrition, nutrition ...\n",
+       "440515  autistic disorder, autism spectrum disorder, c...\n",
+       "440516  autistic disorder, autism spectrum disorder, c...\n",
+       "\n",
+       "[440517 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def list_to_string(disease_list):\n",
+    "    disease_list = ast.literal_eval(disease_list)\n",
+    "    return \", \".join(disease_list)\n",
+    "\n",
+    "\n",
+    "df_trials_filtered[\"desease_condition\"] = df_trials_filtered[\"desease_condition\"].apply(\n",
+    "    list_to_string\n",
+    ")\n",
+    "df_trials_filtered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "bbbb22e6-4883-4869-8ccc-95696bc67b1b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    marijuana abuse, substance-related disorders, ...\n",
+       "1    marijuana abuse, substance-related disorders, ...\n",
+       "2    tuberculosis, latent tuberculosis, infections,...\n",
+       "3    heart failure, heart diseases, cardiovascular ...\n",
+       "4    lymphoma, neoplasms by histologic type, neopla...\n",
+       "Name: desease_condition, dtype: object"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_trials_filtered[\"desease_condition\"].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "7f4e7ceb-8bfd-4294-a850-8935f88b6555",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_trials_filtered.to_csv(\"diseases.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "af1c5c2b-24a0-44a1-9e5d-7ee89ca4cccf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "440517"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "loader = CSVLoader(file_path=\"./diseases.csv\", encoding=\"utf-8\")\n",
+    "data = loader.load()\n",
+    "len(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cab89218-41ca-4048-886d-bc2c1c9b30bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = OpenAIEmbeddings()\n",
+    "vectorstore = FAISS.from_documents(data, embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "225ade4a-d004-44cc-a5ff-22ce2bfcac32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_path = \"vector_index.pkl\"\n",
+    "with open(file_path, \"wb\") as f:\n",
+    "    pickle.dump(vectorstore, f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "11912a93-ad02-41cb-8bce-2750c947fa74",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(440517, 2)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>['marijuana abuse', 'substance-related disorde...</td>\n",
+       "      <td>nct_id: NCT03055377\\nsummary: This is a 12-wee...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>['marijuana abuse', 'substance-related disorde...</td>\n",
+       "      <td>nct_id: NCT03055377\\nsummary: This is a 12-wee...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>['tuberculosis', 'latent tuberculosis', 'infec...</td>\n",
+       "      <td>nct_id: NCT03042754\\nsummary: Early diagnosis ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>['heart failure', 'heart diseases', 'cardiovas...</td>\n",
+       "      <td>nct_id: NCT03035123\\nsummary: The EduStra-HF s...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>['lymphoma', 'neoplasms by histologic type', '...</td>\n",
+       "      <td>nct_id: NCT02272751\\nsummary: This study will ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   desease_condition  \\\n",
+       "0  ['marijuana abuse', 'substance-related disorde...   \n",
+       "1  ['marijuana abuse', 'substance-related disorde...   \n",
+       "2  ['tuberculosis', 'latent tuberculosis', 'infec...   \n",
+       "3  ['heart failure', 'heart diseases', 'cardiovas...   \n",
+       "4  ['lymphoma', 'neoplasms by histologic type', '...   \n",
+       "\n",
+       "                                                text  \n",
+       "0  nct_id: NCT03055377\\nsummary: This is a 12-wee...  \n",
+       "1  nct_id: NCT03055377\\nsummary: This is a 12-wee...  \n",
+       "2  nct_id: NCT03042754\\nsummary: Early diagnosis ...  \n",
+       "3  nct_id: NCT03035123\\nsummary: The EduStra-HF s...  \n",
+       "4  nct_id: NCT02272751\\nsummary: This study will ...  "
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_trials = pd.read_csv(\"clinical_trials.csv\")\n",
+    "print(df_trials.shape)\n",
+    "df_trials.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25b31e55-2961-474d-92d8-5963f2c6bf84",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "918c078c-46fe-4d7b-9748-88c52a5b004a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36e83202-97ad-425d-95ae-075a1e26a34e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d705875-5dd7-4c71-8d94-99c101020ac0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f2818abe-1a43-4d7d-92a7-7562812bf43d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>tuberculosis, latent tuberculosis, infections,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>heart failure, heart diseases, cardiovascular ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>lymphoma, neoplasms by histologic type, neopla...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440512</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440513</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440514</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440515</th>\n",
+       "      <td>autistic disorder, autism spectrum disorder, c...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440516</th>\n",
+       "      <td>autistic disorder, autism spectrum disorder, c...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>440517 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        desease_condition\n",
+       "0       marijuana abuse, substance-related disorders, ...\n",
+       "1       marijuana abuse, substance-related disorders, ...\n",
+       "2       tuberculosis, latent tuberculosis, infections,...\n",
+       "3       heart failure, heart diseases, cardiovascular ...\n",
+       "4       lymphoma, neoplasms by histologic type, neopla...\n",
+       "...                                                   ...\n",
+       "440512  obesity, overweight, overnutrition, nutrition ...\n",
+       "440513  obesity, overweight, overnutrition, nutrition ...\n",
+       "440514  obesity, overweight, overnutrition, nutrition ...\n",
+       "440515  autistic disorder, autism spectrum disorder, c...\n",
+       "440516  autistic disorder, autism spectrum disorder, c...\n",
+       "\n",
+       "[440517 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_trials_filtered = pd.read_csv(\"diseases.csv\")\n",
+    "df_trials_filtered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c89e3cf6-a376-4029-9c04-0f5e664a2237",
+   "metadata": {
+    "notebookRunGroups": {
+     "groupValue": "1"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(440517, 1)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2 = df_trials_filtered  # [:100]\n",
+    "df2.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "c5012bcf-3e25-4f21-a29c-6bdbdafbb8c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "40a480bd-6754-40b6-870c-42d10ce9a960",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_embeddings(text):\n",
+    "    response = client.embeddings.create(\n",
+    "        input=text, dimensions=128, model=\"text-embedding-3-small\"\n",
+    "    )\n",
+    "    return response.data[0].embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ef6d6b62-de0b-4bc6-a6eb-847ab8e99da5",
+   "metadata": {
+    "notebookRunGroups": {
+     "groupValue": "1"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "Batches: 100%|██████████| 6884/6884 [04:32<00:00, 25.25it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 14min 6s, sys: 1min 31s, total: 15min 37s\n",
+      "Wall time: 4min 48s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(440517, 768)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "from  sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "encoder= SentenceTransformer(\"allenai-specter\")\n",
+    "vectors= encoder.encode(df2.desease_condition, show_progress_bar=True, batch_size=64)\n",
+    "vectors.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "7966d754-56d7-4555-a6c6-6a13772fb000",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 261 ms, sys: 13 ms, total: 274 ms\n",
+      "Wall time: 26.8 s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<timed exec>:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "      <th>embeddings</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "      <td>[-0.05811865255236626, -0.023393018171191216, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "      <td>[-0.0581701435148716, -0.023382455110549927, 0...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>tuberculosis, latent tuberculosis, infections,...</td>\n",
+       "      <td>[-0.03460180386900902, -0.084668830037117, 0.2...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>heart failure, heart diseases, cardiovascular ...</td>\n",
+       "      <td>[-0.08236236125230789, -0.1235777735710144, 0....</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>lymphoma, neoplasms by histologic type, neopla...</td>\n",
+       "      <td>[-0.1227850392460823, 0.07155642658472061, 0.1...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   desease_condition  \\\n",
+       "0  marijuana abuse, substance-related disorders, ...   \n",
+       "1  marijuana abuse, substance-related disorders, ...   \n",
+       "2  tuberculosis, latent tuberculosis, infections,...   \n",
+       "3  heart failure, heart diseases, cardiovascular ...   \n",
+       "4  lymphoma, neoplasms by histologic type, neopla...   \n",
+       "\n",
+       "                                          embeddings  \n",
+       "0  [-0.05811865255236626, -0.023393018171191216, ...  \n",
+       "1  [-0.0581701435148716, -0.023382455110549927, 0...  \n",
+       "2  [-0.03460180386900902, -0.084668830037117, 0.2...  \n",
+       "3  [-0.08236236125230789, -0.1235777735710144, 0....  \n",
+       "4  [-0.1227850392460823, 0.07155642658472061, 0.1...  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "df2['embeddings']= df2['desease_condition'].apply(get_embeddings)\n",
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "c2f99031",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2['embeddings'] = vectors.astype('float32',casting='same_kind').tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "952d69c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the first row for each 'nct_id' so every trial appears once\n",
+    "df2_without_duplicates = df2.drop_duplicates(subset='nct_id', keep='first')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "2711980a-d1c0-441e-ae9a-531500b7b7cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2_without_duplicates[:130077].to_csv(\n",
+    "    \"diseases_embeddings.csv\", index=False, header=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "fccd4f0e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "      <th>embeddings</th>\n",
+       "      <th>nct_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>marijuana abuse, substance-related disorders, ...</td>\n",
+       "      <td>[-0.8323991298675537, 1.47855544090271, 0.0013...</td>\n",
+       "      <td>NCT03055377</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>tuberculosis, latent tuberculosis, infections,...</td>\n",
+       "      <td>[-0.43443307280540466, 0.9625586271286011, -0....</td>\n",
+       "      <td>NCT03042754</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>heart failure, heart diseases, cardiovascular ...</td>\n",
+       "      <td>[-0.5791705250740051, 0.13008448481559753, 0.1...</td>\n",
+       "      <td>NCT03035123</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>lymphoma, neoplasms by histologic type, neopla...</td>\n",
+       "      <td>[-0.1608569175004959, 0.8489153981208801, -0.5...</td>\n",
+       "      <td>NCT02272751</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>anemia, hematologic diseases</td>\n",
+       "      <td>[0.21379394829273224, 0.17073844373226166, -0....</td>\n",
+       "      <td>NCT00931606</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440506</th>\n",
+       "      <td>scoliosis, spinal curvatures, spinal diseases,...</td>\n",
+       "      <td>[-1.20807683467865, 0.19357842206954956, 0.314...</td>\n",
+       "      <td>NCT03641469</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440507</th>\n",
+       "      <td>asphyxia neonatorum, asphyxia, death, patholog...</td>\n",
+       "      <td>[-0.7226205468177795, 1.0146900415420532, -0.2...</td>\n",
+       "      <td>NCT03621956</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440509</th>\n",
+       "      <td>tuberculosis, helminthiasis, malnutrition, myc...</td>\n",
+       "      <td>[-0.7196142673492432, 0.9588190913200378, 0.08...</td>\n",
+       "      <td>NCT03598842</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440512</th>\n",
+       "      <td>obesity, overweight, overnutrition, nutrition ...</td>\n",
+       "      <td>[-1.159234642982483, 0.5251776576042175, 0.237...</td>\n",
+       "      <td>NCT03574103</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>440515</th>\n",
+       "      <td>autistic disorder, autism spectrum disorder, c...</td>\n",
+       "      <td>[-0.8618993759155273, 0.7515497803688049, 0.08...</td>\n",
+       "      <td>NCT03570372</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>233077 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        desease_condition  \\\n",
+       "0       marijuana abuse, substance-related disorders, ...   \n",
+       "2       tuberculosis, latent tuberculosis, infections,...   \n",
+       "3       heart failure, heart diseases, cardiovascular ...   \n",
+       "4       lymphoma, neoplasms by histologic type, neopla...   \n",
+       "6                            anemia, hematologic diseases   \n",
+       "...                                                   ...   \n",
+       "440506  scoliosis, spinal curvatures, spinal diseases,...   \n",
+       "440507  asphyxia neonatorum, asphyxia, death, patholog...   \n",
+       "440509  tuberculosis, helminthiasis, malnutrition, myc...   \n",
+       "440512  obesity, overweight, overnutrition, nutrition ...   \n",
+       "440515  autistic disorder, autism spectrum disorder, c...   \n",
+       "\n",
+       "                                               embeddings       nct_id  \n",
+       "0       [-0.8323991298675537, 1.47855544090271, 0.0013...  NCT03055377  \n",
+       "2       [-0.43443307280540466, 0.9625586271286011, -0....  NCT03042754  \n",
+       "3       [-0.5791705250740051, 0.13008448481559753, 0.1...  NCT03035123  \n",
+       "4       [-0.1608569175004959, 0.8489153981208801, -0.5...  NCT02272751  \n",
+       "6       [0.21379394829273224, 0.17073844373226166, -0....  NCT00931606  \n",
+       "...                                                   ...          ...  \n",
+       "440506  [-1.20807683467865, 0.19357842206954956, 0.314...  NCT03641469  \n",
+       "440507  [-0.7226205468177795, 1.0146900415420532, -0.2...  NCT03621956  \n",
+       "440509  [-0.7196142673492432, 0.9588190913200378, 0.08...  NCT03598842  \n",
+       "440512  [-1.159234642982483, 0.5251776576042175, 0.237...  NCT03574103  \n",
+       "440515  [-0.8618993759155273, 0.7515497803688049, 0.08...  NCT03570372  \n",
+       "\n",
+       "[233077 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2_without_duplicates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "id": "8ed985f4-9402-431f-bfba-1236ba16b895",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000023493347480> >"
+      ]
+     },
+     "execution_count": 106,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import faiss\n",
+    "\n",
+    "index = faiss.IndexFlatL2(dim)\n",
+    "index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "id": "71dad860-1f19-4166-a309-c9ce15f24792",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(768,)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([-8.82369652e-03,  8.50650743e-02,  2.08267733e-03,  6.77651772e-03,\n",
+       "       -2.86661759e-02, -8.71188380e-03,  6.99447095e-02,  5.04214764e-02,\n",
+       "        3.58386151e-02,  5.29594952e-03, -1.40875215e-02,  1.99297220e-02,\n",
+       "        2.27009598e-03,  2.10810862e-02,  2.66138893e-02,  1.90623086e-02,\n",
+       "        4.44708914e-02,  2.96202525e-02,  5.42085357e-02, -2.34859088e-03,\n",
+       "       -9.87798795e-02, -5.00183590e-02, -3.42465192e-02,  2.08440255e-02,\n",
+       "        5.31156994e-02, -1.37044629e-02,  2.92537250e-02, -2.61334293e-02,\n",
+       "       -1.21854078e-04, -2.36813519e-02, -3.81283499e-02, -1.79494768e-02,\n",
+       "       -6.29265187e-03,  1.27150817e-02,  1.19849676e-06, -7.78729608e-03,\n",
+       "       -1.28973828e-04,  4.01791967e-02,  4.21229303e-02, -8.72302521e-03,\n",
+       "        7.44823692e-03,  7.68032745e-02,  6.50246907e-03,  3.40298638e-02,\n",
+       "       -1.80711355e-02, -2.71878559e-02,  5.74751608e-02,  3.67745496e-02,\n",
+       "       -3.34868580e-02,  1.05205458e-02,  2.08975170e-02,  4.36686277e-02,\n",
+       "        3.47612537e-02, -4.99080680e-02,  4.44446988e-02,  5.57280704e-03,\n",
+       "       -2.31200755e-02, -4.60692644e-02, -1.39789237e-02, -3.79957110e-02,\n",
+       "        4.67903316e-02,  1.91651955e-02, -5.12171052e-02,  2.46807020e-02,\n",
+       "       -5.52081019e-02,  3.50596346e-02, -7.01438356e-03, -3.36519890e-02,\n",
+       "       -1.41502097e-02, -1.37693482e-02,  4.11427952e-02,  6.94046309e-03,\n",
+       "       -1.30138136e-02,  5.91567121e-02,  3.37168351e-02,  3.01467292e-02,\n",
+       "       -4.59552221e-02,  1.37365120e-03,  1.00179566e-02,  6.98126853e-04,\n",
+       "        3.58139984e-02,  1.18174301e-02,  1.33722462e-02, -1.35893077e-02,\n",
+       "        4.75908853e-02,  5.48331346e-03, -6.41460950e-03, -1.23906611e-02,\n",
+       "        5.82688041e-02, -1.60842277e-02, -2.95833423e-04,  6.97355811e-03,\n",
+       "        2.48331465e-02, -2.35959496e-02, -1.24989869e-02, -1.36585534e-02,\n",
+       "        1.52637456e-02,  7.01832073e-03,  5.50601333e-02,  4.35096538e-03,\n",
+       "        2.36319732e-02, -1.38118947e-02, -7.24233836e-02,  9.39742289e-03,\n",
+       "       -2.66901590e-02,  2.96042152e-02,  1.28761679e-02,  2.23339219e-02,\n",
+       "        3.08373477e-03,  7.12765753e-02,  7.13613164e-03,  3.62721197e-02,\n",
+       "       -4.53250594e-02,  2.54001115e-02, -2.54253373e-02, -1.23151275e-03,\n",
+       "       -1.34750446e-02, -2.70653702e-02, -1.02220355e-02,  2.07683407e-02,\n",
+       "       -7.31003610e-03,  2.65329964e-02, -2.79857730e-03,  4.20840643e-02,\n",
+       "        3.20205763e-02, -1.19518824e-02, -5.77116087e-02,  9.88688134e-03,\n",
+       "        1.86814573e-02, -5.10204993e-02, -6.77110278e-04,  9.40234493e-03,\n",
+       "       -3.33383717e-02, -5.52933291e-02,  5.64148054e-02,  4.92153503e-02,\n",
+       "        3.33690383e-02, -3.92963700e-02, -6.91099390e-02,  3.79911740e-03,\n",
+       "       -1.74410697e-02, -1.60171147e-02,  4.89675067e-02,  2.67119659e-03,\n",
+       "        2.61192098e-02, -2.74193864e-02,  6.92490395e-03, -4.64810384e-03,\n",
+       "       -8.99862905e-04,  1.02159111e-02, -4.81114909e-02,  1.22787328e-02,\n",
+       "       -9.32844076e-03, -2.00431682e-02, -1.36102587e-02, -3.67914373e-03,\n",
+       "       -1.60810221e-02, -2.20200215e-02,  2.32051890e-02, -5.07331975e-02,\n",
+       "       -1.01248249e-02,  5.62567115e-02, -2.60966737e-03,  9.27545596e-03,\n",
+       "        5.32410555e-02,  4.81746234e-02, -9.83138476e-03,  1.81230865e-02,\n",
+       "       -2.12969314e-02,  9.82244611e-02, -2.47648880e-02,  7.06253499e-02,\n",
+       "        8.71159416e-03, -2.73140483e-02,  5.59884915e-03, -2.14829091e-02,\n",
+       "       -6.67077005e-02,  2.48677693e-02, -8.29503238e-02, -7.96182230e-02,\n",
+       "        3.77488993e-02, -1.37352264e-02, -2.85069812e-02,  1.81708820e-02,\n",
+       "       -4.07746173e-02, -4.71230270e-03, -1.59605164e-02, -1.25815195e-03,\n",
+       "       -6.59954594e-03, -1.51611334e-02,  7.87123516e-02, -4.09705602e-02,\n",
+       "        3.07933297e-02,  1.27626080e-02, -4.34489138e-02,  9.91576444e-03,\n",
+       "        1.25470785e-02, -8.67356583e-02,  1.26097840e-03,  3.24709825e-02,\n",
+       "       -6.92409948e-02, -4.35011238e-02, -2.79313605e-02, -3.37213017e-02,\n",
+       "       -2.35359464e-02, -2.95022167e-02,  2.88009271e-02, -3.26618887e-02,\n",
+       "        7.09307985e-03,  3.09435464e-03, -5.09097055e-02,  3.54242921e-02,\n",
+       "        5.37336655e-02,  1.55867739e-02,  2.09988486e-02, -4.38529663e-02,\n",
+       "        2.93767708e-03,  2.27999203e-02,  1.02668423e-02,  3.35033536e-02,\n",
+       "       -8.28316063e-02, -4.17127199e-02, -1.23034064e-02,  2.38543525e-02,\n",
+       "       -3.72257493e-02,  2.97443867e-02,  3.35034318e-02, -5.21336049e-02,\n",
+       "        5.74519299e-03,  2.89844945e-02, -2.21337453e-02,  2.34603398e-02,\n",
+       "        6.33142609e-03, -2.24104542e-02,  1.47326495e-02,  1.98041964e-02,\n",
+       "        3.05697713e-02, -9.37094465e-02, -6.84579164e-02,  4.63523576e-03,\n",
+       "        3.88860740e-02, -3.97440195e-02, -4.70216498e-02,  1.02172708e-02,\n",
+       "       -3.37972888e-03, -8.54947045e-03,  4.81354557e-02,  4.99849804e-02,\n",
+       "        7.11378129e-03, -2.54327375e-02, -1.14872465e-02, -3.54485810e-02,\n",
+       "        5.24284095e-02,  2.16708388e-02, -4.00698110e-02,  5.15380092e-02,\n",
+       "       -6.03203699e-02, -6.50304696e-03, -1.03860423e-02, -7.47132823e-02,\n",
+       "        3.59848235e-03, -4.68364358e-02, -4.23019789e-02, -1.86387468e-02,\n",
+       "       -2.88047381e-02, -2.81904116e-02,  1.52729014e-02, -1.55570190e-02,\n",
+       "        1.34619148e-02,  2.34364290e-02,  3.10326237e-02, -4.70464528e-02,\n",
+       "       -2.43550166e-02, -7.20657408e-03, -1.16065536e-02, -3.42444591e-02,\n",
+       "       -5.30204549e-03,  5.52049950e-02,  4.50828709e-02, -7.30262510e-03,\n",
+       "        5.56289777e-02, -9.46066808e-03, -3.37345451e-02, -1.87659152e-02,\n",
+       "        3.57284099e-02,  4.20488343e-02,  1.66770478e-03, -5.27675785e-02,\n",
+       "        2.96422077e-04,  4.22447585e-02,  4.97253910e-02,  6.03130311e-02,\n",
+       "        1.32281650e-02,  2.35939436e-02, -1.59284715e-02,  4.46444489e-02,\n",
+       "       -1.68315917e-02,  1.34740606e-01, -3.54593806e-02,  4.79029641e-02,\n",
+       "        8.99049267e-03,  4.74606343e-02,  6.70041004e-03, -1.15184486e-03,\n",
+       "        2.69540539e-03, -2.77549177e-02, -1.33260442e-02,  2.60788556e-02,\n",
+       "        4.35438640e-02, -2.55859867e-02,  2.76670083e-02,  3.37177999e-02,\n",
+       "        2.93240137e-02,  1.82274636e-03, -1.40310880e-02, -1.91633645e-02,\n",
+       "        1.18790809e-02, -4.65121269e-02, -4.19883654e-02, -2.69681774e-02,\n",
+       "       -3.23035605e-02, -6.84630498e-02,  6.26784265e-02,  1.37511576e-02,\n",
+       "       -2.55833156e-02, -5.73152229e-02,  3.30126472e-02, -7.90146552e-03,\n",
+       "       -1.08651863e-02,  1.10474667e-02,  3.03509296e-03,  1.55274626e-02,\n",
+       "        1.05599947e-02, -7.16960803e-03, -5.01419827e-02, -3.34469602e-02,\n",
+       "        3.77239436e-02,  9.44003314e-02, -4.80610691e-02,  4.73537892e-02,\n",
+       "        3.40655483e-02,  7.88806472e-03, -2.84915343e-02,  7.96849206e-02,\n",
+       "        1.57442074e-02, -4.15650755e-02,  7.51048513e-03,  3.66957486e-02,\n",
+       "       -1.72730908e-01, -8.72075930e-02,  2.86346450e-02,  2.16962174e-02,\n",
+       "       -4.80199270e-02,  6.49317261e-03,  1.67240556e-02, -2.56227311e-02,\n",
+       "        2.19670162e-02, -6.10647202e-02, -2.65449155e-02,  6.17929082e-03,\n",
+       "       -2.89566331e-02,  1.19498251e-02, -2.33849231e-02, -2.69133616e-02,\n",
+       "       -1.46602485e-02,  1.18886270e-02,  1.64973717e-02, -3.90495770e-02,\n",
+       "       -3.45575088e-03,  5.12249060e-02, -8.63745401e-04,  5.59820198e-02,\n",
+       "        2.10017413e-02,  2.74998210e-02,  3.03551817e-04, -1.15796946e-01,\n",
+       "       -4.66962112e-03, -4.80118394e-02, -3.55160870e-02, -4.72528581e-03,\n",
+       "       -4.29739058e-02, -1.07347388e-02, -1.32423071e-02, -2.34632343e-02,\n",
+       "        1.98413953e-02, -7.27679394e-03,  2.27117930e-02, -2.59338003e-02,\n",
+       "        4.31442596e-02,  1.07885078e-02, -2.47129947e-02, -4.14506458e-02,\n",
+       "        4.40958813e-02,  6.65106403e-04, -2.26945560e-02, -4.76796739e-02,\n",
+       "        1.13289580e-02, -5.57265691e-02,  1.71151303e-03, -1.24145029e-02,\n",
+       "       -3.57853901e-03, -4.86295968e-02, -5.14956787e-02,  4.79425713e-02,\n",
+       "       -3.24050151e-02,  7.39779174e-02,  2.67242044e-02,  1.16365692e-02,\n",
+       "        8.20766483e-03, -6.27530292e-02, -1.30661400e-02, -3.52081768e-02,\n",
+       "        4.83807474e-02,  9.81860235e-03,  1.14539362e-01, -1.88471414e-02,\n",
+       "        6.07751869e-02, -1.75345445e-03,  3.13236266e-02, -1.94595556e-03,\n",
+       "        2.64345529e-03,  3.07400171e-02, -4.31060083e-02, -6.19985871e-02,\n",
+       "        5.50477020e-03,  1.62547994e-02, -8.26352183e-03,  7.56437238e-03,\n",
+       "       -4.79784003e-03,  6.93615247e-03,  3.59064825e-02,  2.08517518e-02,\n",
+       "        1.41595434e-02,  5.31185642e-02,  6.78585656e-03,  6.56357184e-02,\n",
+       "       -5.06135784e-02, -3.05179805e-02,  7.06539825e-02, -3.55644710e-02,\n",
+       "       -4.92612133e-03,  9.91953164e-02,  1.00235650e-02, -2.22671125e-02,\n",
+       "       -1.86746120e-02,  2.49281265e-02, -4.92450967e-03,  1.66887734e-02,\n",
+       "        4.62210961e-02,  4.07794118e-02,  2.52511259e-02, -2.83305068e-02,\n",
+       "       -2.78001893e-02, -1.69764105e-02,  1.79186705e-02,  1.09842177e-02,\n",
+       "        1.09969089e-02,  1.69700030e-02, -8.59475043e-03,  4.70476560e-02,\n",
+       "        3.64770554e-02,  2.09835749e-02,  1.01236468e-02,  2.75151283e-02,\n",
+       "        4.33402918e-02, -4.30559181e-02, -3.53547297e-02,  7.77268112e-02,\n",
+       "       -6.10819347e-02, -2.86280159e-02,  4.68054451e-02,  1.29892454e-02,\n",
+       "       -1.71940885e-02, -2.52429228e-02,  3.86423096e-02, -1.35919163e-02,\n",
+       "       -5.27431667e-02,  6.45831088e-03,  2.96409409e-02,  5.97442053e-02,\n",
+       "        3.23252901e-02,  5.03172688e-02, -4.45654802e-02,  2.90075876e-02,\n",
+       "       -1.35373492e-02,  6.78209821e-03, -5.89249916e-02,  4.28890549e-02,\n",
+       "       -2.36034058e-02, -5.30969724e-03,  3.85405980e-02, -1.82616734e-03,\n",
+       "        1.45543357e-02,  1.07806427e-02, -6.06855676e-02, -4.95252907e-02,\n",
+       "        1.02004781e-02,  4.60227691e-02, -1.08090881e-02,  4.42408510e-02,\n",
+       "        4.15152796e-02,  1.23609398e-02,  5.11957100e-03,  1.17597533e-02,\n",
+       "       -2.70090066e-02,  2.68773828e-02, -1.97812133e-02,  2.25932393e-02,\n",
+       "       -1.33560598e-02, -1.50896851e-02, -3.14053567e-03,  1.54051669e-02,\n",
+       "        1.86488125e-02, -1.71708278e-02, -3.95283476e-03,  7.68053811e-04,\n",
+       "       -2.37891261e-04,  1.84722953e-02,  3.60381305e-02, -5.85213909e-03,\n",
+       "        4.44293395e-02, -1.11264118e-03, -4.79441285e-02,  3.46464328e-02,\n",
+       "       -2.53370814e-02, -3.26901935e-02, -2.28975322e-02, -1.96164921e-02,\n",
+       "       -4.38152434e-04,  4.08602282e-02, -2.29470823e-02, -1.89938806e-02,\n",
+       "       -1.52037974e-04,  1.05516789e-02,  2.08601039e-02, -6.98119551e-02,\n",
+       "        3.66246551e-02, -1.26779894e-03, -4.03217562e-02, -5.35424761e-02,\n",
+       "        6.51817098e-02,  4.29646857e-02,  2.56071109e-02, -3.28080021e-02,\n",
+       "        1.20534413e-02,  3.56224040e-03, -1.01593453e-02, -1.96505673e-04,\n",
+       "        4.33485657e-02, -4.25680764e-02,  9.73126665e-03,  3.76882474e-03,\n",
+       "       -1.40319867e-02, -3.63940969e-02, -3.09983976e-02, -4.19548260e-33,\n",
+       "        7.11604580e-02,  4.78382297e-02,  1.89297704e-03, -1.60731785e-02,\n",
+       "        2.53787991e-02, -3.15741785e-02, -4.27713171e-02, -7.53164338e-03,\n",
+       "        1.68679946e-03,  1.92391127e-02, -2.20667192e-04,  1.32907527e-02,\n",
+       "        5.99487219e-03,  2.75156219e-02, -5.06000873e-03, -3.58465910e-02,\n",
+       "        8.20948277e-03, -2.11624149e-02, -7.07996823e-03, -4.23992332e-03,\n",
+       "       -1.09853260e-01, -3.66037302e-02,  3.55480015e-02,  4.23291475e-02,\n",
+       "        1.48312682e-02,  5.68749309e-02,  3.57767567e-02,  1.40728084e-02,\n",
+       "       -4.00471613e-02,  1.01988176e-02,  2.83056553e-02, -1.55737845e-03,\n",
+       "        1.24238459e-02,  1.20237898e-02, -7.69484974e-03, -3.30727436e-02,\n",
+       "       -1.45808076e-02,  3.43246050e-02,  3.21143419e-02, -4.96741422e-02,\n",
+       "       -5.27968369e-02,  2.51889303e-02, -1.11904610e-02,  5.64832352e-02,\n",
+       "        2.77636852e-02,  5.90689071e-02, -2.61273161e-02, -6.95008039e-02,\n",
+       "       -3.15576978e-02, -5.62214339e-03, -7.93884136e-03, -3.62196900e-02,\n",
+       "       -8.26047733e-03,  8.05249214e-02, -4.16241921e-02, -2.01846119e-02,\n",
+       "       -2.52235290e-02, -3.88054736e-02, -2.00710595e-02,  1.50789914e-03,\n",
+       "       -5.51338419e-02, -8.35673045e-03, -1.61523875e-02, -8.79513845e-02,\n",
+       "       -5.28004877e-02, -2.88654189e-03, -1.11697149e-02,  7.10910782e-02,\n",
+       "        4.44932319e-02,  8.69598426e-03, -1.14432694e-02,  4.47212979e-02,\n",
+       "        2.70624813e-02, -3.86100151e-02, -3.07358261e-02,  2.75634117e-02,\n",
+       "        1.48464069e-02, -1.00845508e-02,  6.45884350e-02,  4.28387662e-03,\n",
+       "        8.05836394e-02, -1.69498641e-02,  4.44465503e-02, -2.09145956e-02,\n",
+       "       -3.37407738e-02,  3.85780074e-02, -7.44559616e-02,  1.17512364e-02,\n",
+       "        1.01964204e-02, -3.02421930e-03,  4.80608828e-02, -1.49494391e-02,\n",
+       "        2.54592765e-02, -1.46158040e-02,  5.46646416e-02,  1.43051194e-03,\n",
+       "        2.99116820e-02,  2.24273186e-02, -5.79927117e-03, -1.33864526e-02,\n",
+       "       -2.52460372e-02, -2.69225910e-02,  1.64003875e-02,  1.20901112e-02,\n",
+       "        3.38429734e-02, -2.11539529e-02,  7.17787817e-02, -7.78904185e-02,\n",
+       "       -4.04084288e-02,  4.90567498e-02, -2.61603445e-02,  1.97753590e-02,\n",
+       "        4.97209951e-02, -4.88655381e-02, -4.52128090e-02,  3.63065898e-02,\n",
+       "        2.68440694e-02,  3.29160057e-02, -8.24410375e-03, -1.33646047e-02,\n",
+       "       -6.22822754e-02, -1.13362661e-02, -3.79339382e-02, -6.56360280e-05,\n",
+       "       -1.08087100e-02,  2.67575700e-02,  1.33866509e-02,  5.89998253e-02,\n",
+       "       -2.54666172e-02, -3.05371322e-02, -1.53249800e-02, -9.87035502e-03,\n",
+       "        1.95337094e-07, -1.76476724e-02,  5.71432859e-02, -2.49180794e-02,\n",
+       "        5.85253723e-02,  4.49808314e-02, -5.99673577e-02, -9.97425616e-03,\n",
+       "        4.07801419e-02,  4.13940698e-02,  2.55707726e-02,  2.18985360e-02,\n",
+       "       -3.04434425e-03, -3.77355106e-02, -6.24866784e-02, -1.17468778e-02,\n",
+       "       -4.82194684e-02, -7.78659210e-02, -1.48841189e-02, -1.75396129e-02,\n",
+       "       -2.48471629e-02,  8.05181568e-04, -4.85844910e-03, -5.16015477e-03,\n",
+       "        7.53483502e-03, -9.46175400e-03, -2.39896346e-02, -3.14654633e-02,\n",
+       "        1.50111094e-02, -1.22348899e-02,  3.00448518e-02,  3.55701670e-02,\n",
+       "        3.08971256e-02,  1.72299352e-02,  5.93419448e-02, -5.74274361e-02,\n",
+       "       -8.16087723e-02, -4.80572283e-02, -2.68838424e-02, -1.96331330e-02,\n",
+       "       -9.15831141e-03,  1.07509056e-02,  2.35639680e-02, -2.62569580e-02,\n",
+       "        9.21937004e-02,  1.37132118e-02, -1.19096776e-02, -4.09874134e-02,\n",
+       "        3.37628126e-02, -4.64820908e-03, -2.50304434e-02,  6.25852346e-02,\n",
+       "       -1.24449311e-02,  3.82654071e-02, -2.35330854e-02,  8.68125912e-03,\n",
+       "        5.08641489e-02,  2.53822445e-03,  5.25634140e-02,  1.14882430e-02,\n",
+       "        5.01894541e-02, -3.55215147e-02, -3.31749097e-02, -3.02003417e-03,\n",
+       "       -5.36288768e-02, -2.80938316e-02, -7.51279444e-02, -4.71623316e-02,\n",
+       "        9.56887701e-35,  2.55127084e-02, -1.44770980e-04,  1.96710341e-02,\n",
+       "       -1.33620016e-02, -1.51910949e-02, -3.28495577e-02, -1.52465852e-03,\n",
+       "       -2.65272055e-02, -4.35708016e-02, -1.75950192e-02, -2.20594816e-02],\n",
+       "      dtype=float32)"
+      ]
+     },
+     "execution_count": 108,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "search_query = \"clinical trials related to alzheimers\"\n",
+    "vec = encoder.encode(search_query)\n",
+    "print(vec.shape)\n",
+    "vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "613fd415-4194-45e6-b9f3-9a7707845ad5",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(1, 768)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[-8.82369652e-03,  8.50650743e-02,  2.08267733e-03,\n",
+       "         6.77651772e-03, -2.86661759e-02, -8.71188380e-03,\n",
+       "         6.99447095e-02,  5.04214764e-02,  3.58386151e-02,\n",
+       "         5.29594952e-03, -1.40875215e-02,  1.99297220e-02,\n",
+       "         2.27009598e-03,  2.10810862e-02,  2.66138893e-02,\n",
+       "         1.90623086e-02,  4.44708914e-02,  2.96202525e-02,\n",
+       "         5.42085357e-02, -2.34859088e-03, -9.87798795e-02,\n",
+       "        -5.00183590e-02, -3.42465192e-02,  2.08440255e-02,\n",
+       "         5.31156994e-02, -1.37044629e-02,  2.92537250e-02,\n",
+       "        -2.61334293e-02, -1.21854078e-04, -2.36813519e-02,\n",
+       "        -3.81283499e-02, -1.79494768e-02, -6.29265187e-03,\n",
+       "         1.27150817e-02,  1.19849676e-06, -7.78729608e-03,\n",
+       "        -1.28973828e-04,  4.01791967e-02,  4.21229303e-02,\n",
+       "        -8.72302521e-03,  7.44823692e-03,  7.68032745e-02,\n",
+       "         6.50246907e-03,  3.40298638e-02, -1.80711355e-02,\n",
+       "        -2.71878559e-02,  5.74751608e-02,  3.67745496e-02,\n",
+       "        -3.34868580e-02,  1.05205458e-02,  2.08975170e-02,\n",
+       "         4.36686277e-02,  3.47612537e-02, -4.99080680e-02,\n",
+       "         4.44446988e-02,  5.57280704e-03, -2.31200755e-02,\n",
+       "        -4.60692644e-02, -1.39789237e-02, -3.79957110e-02,\n",
+       "         4.67903316e-02,  1.91651955e-02, -5.12171052e-02,\n",
+       "         2.46807020e-02, -5.52081019e-02,  3.50596346e-02,\n",
+       "        -7.01438356e-03, -3.36519890e-02, -1.41502097e-02,\n",
+       "        -1.37693482e-02,  4.11427952e-02,  6.94046309e-03,\n",
+       "        -1.30138136e-02,  5.91567121e-02,  3.37168351e-02,\n",
+       "         3.01467292e-02, -4.59552221e-02,  1.37365120e-03,\n",
+       "         1.00179566e-02,  6.98126853e-04,  3.58139984e-02,\n",
+       "         1.18174301e-02,  1.33722462e-02, -1.35893077e-02,\n",
+       "         4.75908853e-02,  5.48331346e-03, -6.41460950e-03,\n",
+       "        -1.23906611e-02,  5.82688041e-02, -1.60842277e-02,\n",
+       "        -2.95833423e-04,  6.97355811e-03,  2.48331465e-02,\n",
+       "        -2.35959496e-02, -1.24989869e-02, -1.36585534e-02,\n",
+       "         1.52637456e-02,  7.01832073e-03,  5.50601333e-02,\n",
+       "         4.35096538e-03,  2.36319732e-02, -1.38118947e-02,\n",
+       "        -7.24233836e-02,  9.39742289e-03, -2.66901590e-02,\n",
+       "         2.96042152e-02,  1.28761679e-02,  2.23339219e-02,\n",
+       "         3.08373477e-03,  7.12765753e-02,  7.13613164e-03,\n",
+       "         3.62721197e-02, -4.53250594e-02,  2.54001115e-02,\n",
+       "        -2.54253373e-02, -1.23151275e-03, -1.34750446e-02,\n",
+       "        -2.70653702e-02, -1.02220355e-02,  2.07683407e-02,\n",
+       "        -7.31003610e-03,  2.65329964e-02, -2.79857730e-03,\n",
+       "         4.20840643e-02,  3.20205763e-02, -1.19518824e-02,\n",
+       "        -5.77116087e-02,  9.88688134e-03,  1.86814573e-02,\n",
+       "        -5.10204993e-02, -6.77110278e-04,  9.40234493e-03,\n",
+       "        -3.33383717e-02, -5.52933291e-02,  5.64148054e-02,\n",
+       "         4.92153503e-02,  3.33690383e-02, -3.92963700e-02,\n",
+       "        -6.91099390e-02,  3.79911740e-03, -1.74410697e-02,\n",
+       "        -1.60171147e-02,  4.89675067e-02,  2.67119659e-03,\n",
+       "         2.61192098e-02, -2.74193864e-02,  6.92490395e-03,\n",
+       "        -4.64810384e-03, -8.99862905e-04,  1.02159111e-02,\n",
+       "        -4.81114909e-02,  1.22787328e-02, -9.32844076e-03,\n",
+       "        -2.00431682e-02, -1.36102587e-02, -3.67914373e-03,\n",
+       "        -1.60810221e-02, -2.20200215e-02,  2.32051890e-02,\n",
+       "        -5.07331975e-02, -1.01248249e-02,  5.62567115e-02,\n",
+       "        -2.60966737e-03,  9.27545596e-03,  5.32410555e-02,\n",
+       "         4.81746234e-02, -9.83138476e-03,  1.81230865e-02,\n",
+       "        -2.12969314e-02,  9.82244611e-02, -2.47648880e-02,\n",
+       "         7.06253499e-02,  8.71159416e-03, -2.73140483e-02,\n",
+       "         5.59884915e-03, -2.14829091e-02, -6.67077005e-02,\n",
+       "         2.48677693e-02, -8.29503238e-02, -7.96182230e-02,\n",
+       "         3.77488993e-02, -1.37352264e-02, -2.85069812e-02,\n",
+       "         1.81708820e-02, -4.07746173e-02, -4.71230270e-03,\n",
+       "        -1.59605164e-02, -1.25815195e-03, -6.59954594e-03,\n",
+       "        -1.51611334e-02,  7.87123516e-02, -4.09705602e-02,\n",
+       "         3.07933297e-02,  1.27626080e-02, -4.34489138e-02,\n",
+       "         9.91576444e-03,  1.25470785e-02, -8.67356583e-02,\n",
+       "         1.26097840e-03,  3.24709825e-02, -6.92409948e-02,\n",
+       "        -4.35011238e-02, -2.79313605e-02, -3.37213017e-02,\n",
+       "        -2.35359464e-02, -2.95022167e-02,  2.88009271e-02,\n",
+       "        -3.26618887e-02,  7.09307985e-03,  3.09435464e-03,\n",
+       "        -5.09097055e-02,  3.54242921e-02,  5.37336655e-02,\n",
+       "         1.55867739e-02,  2.09988486e-02, -4.38529663e-02,\n",
+       "         2.93767708e-03,  2.27999203e-02,  1.02668423e-02,\n",
+       "         3.35033536e-02, -8.28316063e-02, -4.17127199e-02,\n",
+       "        -1.23034064e-02,  2.38543525e-02, -3.72257493e-02,\n",
+       "         2.97443867e-02,  3.35034318e-02, -5.21336049e-02,\n",
+       "         5.74519299e-03,  2.89844945e-02, -2.21337453e-02,\n",
+       "         2.34603398e-02,  6.33142609e-03, -2.24104542e-02,\n",
+       "         1.47326495e-02,  1.98041964e-02,  3.05697713e-02,\n",
+       "        -9.37094465e-02, -6.84579164e-02,  4.63523576e-03,\n",
+       "         3.88860740e-02, -3.97440195e-02, -4.70216498e-02,\n",
+       "         1.02172708e-02, -3.37972888e-03, -8.54947045e-03,\n",
+       "         4.81354557e-02,  4.99849804e-02,  7.11378129e-03,\n",
+       "        -2.54327375e-02, -1.14872465e-02, -3.54485810e-02,\n",
+       "         5.24284095e-02,  2.16708388e-02, -4.00698110e-02,\n",
+       "         5.15380092e-02, -6.03203699e-02, -6.50304696e-03,\n",
+       "        -1.03860423e-02, -7.47132823e-02,  3.59848235e-03,\n",
+       "        -4.68364358e-02, -4.23019789e-02, -1.86387468e-02,\n",
+       "        -2.88047381e-02, -2.81904116e-02,  1.52729014e-02,\n",
+       "        -1.55570190e-02,  1.34619148e-02,  2.34364290e-02,\n",
+       "         3.10326237e-02, -4.70464528e-02, -2.43550166e-02,\n",
+       "        -7.20657408e-03, -1.16065536e-02, -3.42444591e-02,\n",
+       "        -5.30204549e-03,  5.52049950e-02,  4.50828709e-02,\n",
+       "        -7.30262510e-03,  5.56289777e-02, -9.46066808e-03,\n",
+       "        -3.37345451e-02, -1.87659152e-02,  3.57284099e-02,\n",
+       "         4.20488343e-02,  1.66770478e-03, -5.27675785e-02,\n",
+       "         2.96422077e-04,  4.22447585e-02,  4.97253910e-02,\n",
+       "         6.03130311e-02,  1.32281650e-02,  2.35939436e-02,\n",
+       "        -1.59284715e-02,  4.46444489e-02, -1.68315917e-02,\n",
+       "         1.34740606e-01, -3.54593806e-02,  4.79029641e-02,\n",
+       "         8.99049267e-03,  4.74606343e-02,  6.70041004e-03,\n",
+       "        -1.15184486e-03,  2.69540539e-03, -2.77549177e-02,\n",
+       "        -1.33260442e-02,  2.60788556e-02,  4.35438640e-02,\n",
+       "        -2.55859867e-02,  2.76670083e-02,  3.37177999e-02,\n",
+       "         2.93240137e-02,  1.82274636e-03, -1.40310880e-02,\n",
+       "        -1.91633645e-02,  1.18790809e-02, -4.65121269e-02,\n",
+       "        -4.19883654e-02, -2.69681774e-02, -3.23035605e-02,\n",
+       "        -6.84630498e-02,  6.26784265e-02,  1.37511576e-02,\n",
+       "        -2.55833156e-02, -5.73152229e-02,  3.30126472e-02,\n",
+       "        -7.90146552e-03, -1.08651863e-02,  1.10474667e-02,\n",
+       "         3.03509296e-03,  1.55274626e-02,  1.05599947e-02,\n",
+       "        -7.16960803e-03, -5.01419827e-02, -3.34469602e-02,\n",
+       "         3.77239436e-02,  9.44003314e-02, -4.80610691e-02,\n",
+       "         4.73537892e-02,  3.40655483e-02,  7.88806472e-03,\n",
+       "        -2.84915343e-02,  7.96849206e-02,  1.57442074e-02,\n",
+       "        -4.15650755e-02,  7.51048513e-03,  3.66957486e-02,\n",
+       "        -1.72730908e-01, -8.72075930e-02,  2.86346450e-02,\n",
+       "         2.16962174e-02, -4.80199270e-02,  6.49317261e-03,\n",
+       "         1.67240556e-02, -2.56227311e-02,  2.19670162e-02,\n",
+       "        -6.10647202e-02, -2.65449155e-02,  6.17929082e-03,\n",
+       "        -2.89566331e-02,  1.19498251e-02, -2.33849231e-02,\n",
+       "        -2.69133616e-02, -1.46602485e-02,  1.18886270e-02,\n",
+       "         1.64973717e-02, -3.90495770e-02, -3.45575088e-03,\n",
+       "         5.12249060e-02, -8.63745401e-04,  5.59820198e-02,\n",
+       "         2.10017413e-02,  2.74998210e-02,  3.03551817e-04,\n",
+       "        -1.15796946e-01, -4.66962112e-03, -4.80118394e-02,\n",
+       "        -3.55160870e-02, -4.72528581e-03, -4.29739058e-02,\n",
+       "        -1.07347388e-02, -1.32423071e-02, -2.34632343e-02,\n",
+       "         1.98413953e-02, -7.27679394e-03,  2.27117930e-02,\n",
+       "        -2.59338003e-02,  4.31442596e-02,  1.07885078e-02,\n",
+       "        -2.47129947e-02, -4.14506458e-02,  4.40958813e-02,\n",
+       "         6.65106403e-04, -2.26945560e-02, -4.76796739e-02,\n",
+       "         1.13289580e-02, -5.57265691e-02,  1.71151303e-03,\n",
+       "        -1.24145029e-02, -3.57853901e-03, -4.86295968e-02,\n",
+       "        -5.14956787e-02,  4.79425713e-02, -3.24050151e-02,\n",
+       "         7.39779174e-02,  2.67242044e-02,  1.16365692e-02,\n",
+       "         8.20766483e-03, -6.27530292e-02, -1.30661400e-02,\n",
+       "        -3.52081768e-02,  4.83807474e-02,  9.81860235e-03,\n",
+       "         1.14539362e-01, -1.88471414e-02,  6.07751869e-02,\n",
+       "        -1.75345445e-03,  3.13236266e-02, -1.94595556e-03,\n",
+       "         2.64345529e-03,  3.07400171e-02, -4.31060083e-02,\n",
+       "        -6.19985871e-02,  5.50477020e-03,  1.62547994e-02,\n",
+       "        -8.26352183e-03,  7.56437238e-03, -4.79784003e-03,\n",
+       "         6.93615247e-03,  3.59064825e-02,  2.08517518e-02,\n",
+       "         1.41595434e-02,  5.31185642e-02,  6.78585656e-03,\n",
+       "         6.56357184e-02, -5.06135784e-02, -3.05179805e-02,\n",
+       "         7.06539825e-02, -3.55644710e-02, -4.92612133e-03,\n",
+       "         9.91953164e-02,  1.00235650e-02, -2.22671125e-02,\n",
+       "        -1.86746120e-02,  2.49281265e-02, -4.92450967e-03,\n",
+       "         1.66887734e-02,  4.62210961e-02,  4.07794118e-02,\n",
+       "         2.52511259e-02, -2.83305068e-02, -2.78001893e-02,\n",
+       "        -1.69764105e-02,  1.79186705e-02,  1.09842177e-02,\n",
+       "         1.09969089e-02,  1.69700030e-02, -8.59475043e-03,\n",
+       "         4.70476560e-02,  3.64770554e-02,  2.09835749e-02,\n",
+       "         1.01236468e-02,  2.75151283e-02,  4.33402918e-02,\n",
+       "        -4.30559181e-02, -3.53547297e-02,  7.77268112e-02,\n",
+       "        -6.10819347e-02, -2.86280159e-02,  4.68054451e-02,\n",
+       "         1.29892454e-02, -1.71940885e-02, -2.52429228e-02,\n",
+       "         3.86423096e-02, -1.35919163e-02, -5.27431667e-02,\n",
+       "         6.45831088e-03,  2.96409409e-02,  5.97442053e-02,\n",
+       "         3.23252901e-02,  5.03172688e-02, -4.45654802e-02,\n",
+       "         2.90075876e-02, -1.35373492e-02,  6.78209821e-03,\n",
+       "        -5.89249916e-02,  4.28890549e-02, -2.36034058e-02,\n",
+       "        -5.30969724e-03,  3.85405980e-02, -1.82616734e-03,\n",
+       "         1.45543357e-02,  1.07806427e-02, -6.06855676e-02,\n",
+       "        -4.95252907e-02,  1.02004781e-02,  4.60227691e-02,\n",
+       "        -1.08090881e-02,  4.42408510e-02,  4.15152796e-02,\n",
+       "         1.23609398e-02,  5.11957100e-03,  1.17597533e-02,\n",
+       "        -2.70090066e-02,  2.68773828e-02, -1.97812133e-02,\n",
+       "         2.25932393e-02, -1.33560598e-02, -1.50896851e-02,\n",
+       "        -3.14053567e-03,  1.54051669e-02,  1.86488125e-02,\n",
+       "        -1.71708278e-02, -3.95283476e-03,  7.68053811e-04,\n",
+       "        -2.37891261e-04,  1.84722953e-02,  3.60381305e-02,\n",
+       "        -5.85213909e-03,  4.44293395e-02, -1.11264118e-03,\n",
+       "        -4.79441285e-02,  3.46464328e-02, -2.53370814e-02,\n",
+       "        -3.26901935e-02, -2.28975322e-02, -1.96164921e-02,\n",
+       "        -4.38152434e-04,  4.08602282e-02, -2.29470823e-02,\n",
+       "        -1.89938806e-02, -1.52037974e-04,  1.05516789e-02,\n",
+       "         2.08601039e-02, -6.98119551e-02,  3.66246551e-02,\n",
+       "        -1.26779894e-03, -4.03217562e-02, -5.35424761e-02,\n",
+       "         6.51817098e-02,  4.29646857e-02,  2.56071109e-02,\n",
+       "        -3.28080021e-02,  1.20534413e-02,  3.56224040e-03,\n",
+       "        -1.01593453e-02, -1.96505673e-04,  4.33485657e-02,\n",
+       "        -4.25680764e-02,  9.73126665e-03,  3.76882474e-03,\n",
+       "        -1.40319867e-02, -3.63940969e-02, -3.09983976e-02,\n",
+       "        -4.19548260e-33,  7.11604580e-02,  4.78382297e-02,\n",
+       "         1.89297704e-03, -1.60731785e-02,  2.53787991e-02,\n",
+       "        -3.15741785e-02, -4.27713171e-02, -7.53164338e-03,\n",
+       "         1.68679946e-03,  1.92391127e-02, -2.20667192e-04,\n",
+       "         1.32907527e-02,  5.99487219e-03,  2.75156219e-02,\n",
+       "        -5.06000873e-03, -3.58465910e-02,  8.20948277e-03,\n",
+       "        -2.11624149e-02, -7.07996823e-03, -4.23992332e-03,\n",
+       "        -1.09853260e-01, -3.66037302e-02,  3.55480015e-02,\n",
+       "         4.23291475e-02,  1.48312682e-02,  5.68749309e-02,\n",
+       "         3.57767567e-02,  1.40728084e-02, -4.00471613e-02,\n",
+       "         1.01988176e-02,  2.83056553e-02, -1.55737845e-03,\n",
+       "         1.24238459e-02,  1.20237898e-02, -7.69484974e-03,\n",
+       "        -3.30727436e-02, -1.45808076e-02,  3.43246050e-02,\n",
+       "         3.21143419e-02, -4.96741422e-02, -5.27968369e-02,\n",
+       "         2.51889303e-02, -1.11904610e-02,  5.64832352e-02,\n",
+       "         2.77636852e-02,  5.90689071e-02, -2.61273161e-02,\n",
+       "        -6.95008039e-02, -3.15576978e-02, -5.62214339e-03,\n",
+       "        -7.93884136e-03, -3.62196900e-02, -8.26047733e-03,\n",
+       "         8.05249214e-02, -4.16241921e-02, -2.01846119e-02,\n",
+       "        -2.52235290e-02, -3.88054736e-02, -2.00710595e-02,\n",
+       "         1.50789914e-03, -5.51338419e-02, -8.35673045e-03,\n",
+       "        -1.61523875e-02, -8.79513845e-02, -5.28004877e-02,\n",
+       "        -2.88654189e-03, -1.11697149e-02,  7.10910782e-02,\n",
+       "         4.44932319e-02,  8.69598426e-03, -1.14432694e-02,\n",
+       "         4.47212979e-02,  2.70624813e-02, -3.86100151e-02,\n",
+       "        -3.07358261e-02,  2.75634117e-02,  1.48464069e-02,\n",
+       "        -1.00845508e-02,  6.45884350e-02,  4.28387662e-03,\n",
+       "         8.05836394e-02, -1.69498641e-02,  4.44465503e-02,\n",
+       "        -2.09145956e-02, -3.37407738e-02,  3.85780074e-02,\n",
+       "        -7.44559616e-02,  1.17512364e-02,  1.01964204e-02,\n",
+       "        -3.02421930e-03,  4.80608828e-02, -1.49494391e-02,\n",
+       "         2.54592765e-02, -1.46158040e-02,  5.46646416e-02,\n",
+       "         1.43051194e-03,  2.99116820e-02,  2.24273186e-02,\n",
+       "        -5.79927117e-03, -1.33864526e-02, -2.52460372e-02,\n",
+       "        -2.69225910e-02,  1.64003875e-02,  1.20901112e-02,\n",
+       "         3.38429734e-02, -2.11539529e-02,  7.17787817e-02,\n",
+       "        -7.78904185e-02, -4.04084288e-02,  4.90567498e-02,\n",
+       "        -2.61603445e-02,  1.97753590e-02,  4.97209951e-02,\n",
+       "        -4.88655381e-02, -4.52128090e-02,  3.63065898e-02,\n",
+       "         2.68440694e-02,  3.29160057e-02, -8.24410375e-03,\n",
+       "        -1.33646047e-02, -6.22822754e-02, -1.13362661e-02,\n",
+       "        -3.79339382e-02, -6.56360280e-05, -1.08087100e-02,\n",
+       "         2.67575700e-02,  1.33866509e-02,  5.89998253e-02,\n",
+       "        -2.54666172e-02, -3.05371322e-02, -1.53249800e-02,\n",
+       "        -9.87035502e-03,  1.95337094e-07, -1.76476724e-02,\n",
+       "         5.71432859e-02, -2.49180794e-02,  5.85253723e-02,\n",
+       "         4.49808314e-02, -5.99673577e-02, -9.97425616e-03,\n",
+       "         4.07801419e-02,  4.13940698e-02,  2.55707726e-02,\n",
+       "         2.18985360e-02, -3.04434425e-03, -3.77355106e-02,\n",
+       "        -6.24866784e-02, -1.17468778e-02, -4.82194684e-02,\n",
+       "        -7.78659210e-02, -1.48841189e-02, -1.75396129e-02,\n",
+       "        -2.48471629e-02,  8.05181568e-04, -4.85844910e-03,\n",
+       "        -5.16015477e-03,  7.53483502e-03, -9.46175400e-03,\n",
+       "        -2.39896346e-02, -3.14654633e-02,  1.50111094e-02,\n",
+       "        -1.22348899e-02,  3.00448518e-02,  3.55701670e-02,\n",
+       "         3.08971256e-02,  1.72299352e-02,  5.93419448e-02,\n",
+       "        -5.74274361e-02, -8.16087723e-02, -4.80572283e-02,\n",
+       "        -2.68838424e-02, -1.96331330e-02, -9.15831141e-03,\n",
+       "         1.07509056e-02,  2.35639680e-02, -2.62569580e-02,\n",
+       "         9.21937004e-02,  1.37132118e-02, -1.19096776e-02,\n",
+       "        -4.09874134e-02,  3.37628126e-02, -4.64820908e-03,\n",
+       "        -2.50304434e-02,  6.25852346e-02, -1.24449311e-02,\n",
+       "         3.82654071e-02, -2.35330854e-02,  8.68125912e-03,\n",
+       "         5.08641489e-02,  2.53822445e-03,  5.25634140e-02,\n",
+       "         1.14882430e-02,  5.01894541e-02, -3.55215147e-02,\n",
+       "        -3.31749097e-02, -3.02003417e-03, -5.36288768e-02,\n",
+       "        -2.80938316e-02, -7.51279444e-02, -4.71623316e-02,\n",
+       "         9.56887701e-35,  2.55127084e-02, -1.44770980e-04,\n",
+       "         1.96710341e-02, -1.33620016e-02, -1.51910949e-02,\n",
+       "        -3.28495577e-02, -1.52465852e-03, -2.65272055e-02,\n",
+       "        -4.35708016e-02, -1.75950192e-02, -2.20594816e-02]], dtype=float32)"
+      ]
+     },
+     "execution_count": 109,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "svec = np.array(vec).reshape(1, -1)\n",
+    "print(svec.shape)\n",
+    "svec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "fef30d70-6958-4259-abb6-09f8c1870a2b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0.7731663  0.79433584]] [[330 331]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "distances, I = index.search(svec, k=2)\n",
+    "print(distances, I)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "eb00598c-9799-4697-b2a3-356bb5aae0f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>330</th>\n",
+       "      <td>['alzheimer disease', 'dementia', 'brain disea...</td>\n",
+       "      <td>nct_id: NCT02164643\\nsummary: A Multicenter na...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>331</th>\n",
+       "      <td>['alzheimer disease', 'dementia', 'brain disea...</td>\n",
+       "      <td>nct_id: NCT02164643\\nsummary: A Multicenter na...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     desease_condition  \\\n",
+       "330  ['alzheimer disease', 'dementia', 'brain disea...   \n",
+       "331  ['alzheimer disease', 'dementia', 'brain disea...   \n",
+       "\n",
+       "                                                  text  \n",
+       "330  nct_id: NCT02164643\\nsummary: A Multicenter na...  \n",
+       "331  nct_id: NCT02164643\\nsummary: A Multicenter na...  "
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2 = df.iloc[I[0]]\n",
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "id": "af5bf8e2-43b6-47af-affa-5111789371ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'nct_id: NCT02164643\\nsummary: A Multicenter national longitudinal cohort study including at least 800 individuals consecutively recruited from French Research Memory Centers and followed-up over 24 month and included in Memento.\\nintervention_type: Drug\\nintervention_name: Florbetapir (18F)\\nintervention_description: nan\\nkeywords: [\"Alzheimer\\'s disease\", \\'Mild Cognitive Impairment\\']'"
+      ]
+     },
+     "execution_count": 113,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2.iloc[1].text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3899f81-e120-475c-97ed-080cb7f46510",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

database.ipynb ADDED Viewed

	@@ -0,0 +1,265 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Vector Search "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, pandas as pd\n",
+    "from sqlalchemy import create_engine, text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "username = 'demo'\n",
+    "password = 'demo'\n",
+    "hostname = os.getenv('IRIS_HOSTNAME', 'localhost')\n",
+    "port = '1972' \n",
+    "namespace = 'USER'\n",
+    "CONNECTION_STRING = f\"iris://{username}:{password}@{hostname}:{port}/{namespace}\"\n",
+    "\n",
+    "engine = create_engine(CONNECTION_STRING)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load knowledge graph\n",
+    "entity_embeddings = pd.read_csv('./data/entity_embeddings.csv', index_col=0)\n",
+    "entity_embeddings[\"embedding\"] = entity_embeddings[\"embedding\"].apply(\n",
+    "    lambda x: x[1:-1])\n",
+    "\n",
+    "len_label = entity_embeddings['label'].str.len().max()\n",
+    "len_uri = entity_embeddings['uri'].str.len().max()\n",
+    "# TODO: set varchar length dynamically as above\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin(): \n",
+    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.EntityEmbeddings'))\n",
+    "        sql = f\"\"\"\n",
+    "                CREATE TABLE Test.EntityEmbeddings (\n",
+    "                        embedding VECTOR(DOUBLE, 50),\n",
+    "                        label VARCHAR(143),\n",
+    "                        uri VARCHAR(38)\n",
+    "                )\n",
+    "                \"\"\"\n",
+    "        result = conn.execute(text(sql))\n",
+    "\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        for index, row in entity_embeddings.iterrows():\n",
+    "            sql = text(\"\"\"\n",
+    "                INSERT INTO Test.EntityEmbeddings \n",
+    "                (embedding, label, uri) \n",
+    "                VALUES (TO_VECTOR(:embedding), :label, :uri)\n",
+    "            \"\"\")\n",
+    "            conn.execute(sql, {\n",
+    "                'embedding': str(row['embedding']),\n",
+    "                'label': row['label'], \n",
+    "                'uri': row['uri']\n",
+    "            })\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate distance between entities\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        sql = f\"\"\"\n",
+    "                SELECT TOP 10 e1.uri AS uri1, e2.uri AS uri2, e1.label AS label1, e2.label AS label2,\n",
+    "                VECTOR_COSINE(e1.embedding, e2.embedding) AS distance\n",
+    "                FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2\n",
+    "                WHERE e1.uri = 'http://identifiers.org/medgen/C0002395'\n",
+    "                ORDER BY distance DESC\n",
+    "                \"\"\"\n",
+    "        result = conn.execute(text(sql))\n",
+    "        data = result.fetchall()\n",
+    "        display(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load relation embeddings into Test.RelationEmbeddings\n",
+    "\n",
+    "relation_embeddings = pd.read_csv('./data/relation_embeddings.csv', index_col=0)\n",
+    "# Strip the first and last character of the stored embedding string\n",
+    "# (the list brackets) so TO_VECTOR receives a plain comma-separated list\n",
+    "relation_embeddings[\"embedding\"] = relation_embeddings[\"embedding\"].apply(\n",
+    "    lambda x: x[1:-1])\n",
+    "\n",
+    "len_label = relation_embeddings['label'].str.len().max()\n",
+    "len_uri = relation_embeddings['uri'].str.len().max()\n",
+    "# TODO: set varchar length dynamically as above\n",
+    "# NOTE(review): VARCHAR(10) may be too short if labels are full URIs - confirm against len_label\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():  # recreate the table from scratch\n",
+    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.RelationEmbeddings'))\n",
+    "        sql = f\"\"\"\n",
+    "                CREATE TABLE Test.RelationEmbeddings (\n",
+    "                        embedding VECTOR(DOUBLE, 50),\n",
+    "                        label VARCHAR(10),\n",
+    "                        uri VARCHAR(38)\n",
+    "                )\n",
+    "                \"\"\"\n",
+    "        result = conn.execute(text(sql))\n",
+    "\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        for index, row in relation_embeddings.iterrows():\n",
+    "            # Insert into the table created above (not Test.ClinicalTrials,\n",
+    "            # which has different columns and is created by a later cell)\n",
+    "            sql = text(\"\"\"\n",
+    "                INSERT INTO Test.RelationEmbeddings \n",
+    "                (embedding, label, uri) \n",
+    "                VALUES (TO_VECTOR(:embedding), :label, :uri)\n",
+    "            \"\"\")\n",
+    "            conn.execute(sql, {\n",
+    "                'embedding': str(row['embedding']),\n",
+    "                'label': row['label'], \n",
+    "                'uri': row['uri']\n",
+    "            })\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load clinical trials\n",
+    "clinical_trials = pd.read_csv(\"clinical_trials_embeddings.csv\")\n",
+    "clinical_trials[\"embeddings\"] = clinical_trials[\"embeddings\"].apply(lambda x: x[1:-1])\n",
+    "display(clinical_trials.head())\n",
+    "\n",
+    "# TODO: set varchar length dynamically as above\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        result = conn.execute(text(\"DROP TABLE IF EXISTS Test.ClinicalTrials\"))\n",
+    "        sql = f\"\"\"\n",
+    "                CREATE TABLE Test.ClinicalTrials (\n",
+    "                        nct_id VARCHAR(11) PRIMARY KEY,\n",
+    "                        diseases TEXT,\n",
+    "                        embedding VECTOR(DOUBLE, 768)\n",
+    "                )\n",
+    "                \"\"\"\n",
+    "        result = conn.execute(text(sql))\n",
+    "\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        for index, row in clinical_trials.iterrows():\n",
+    "\n",
+    "            sql = text(\n",
+    "                \"\"\"\n",
+    "                INSERT INTO Test.ClinicalTrials \n",
+    "                (nct_id, diseases, embedding)\n",
+    "                VALUES (:nct_id, :diseases, TO_VECTOR(:embedding))\n",
+    "            \"\"\"\n",
+    "            )\n",
+    "            conn.execute(\n",
+    "                sql,\n",
+    "                {\n",
+    "                    \"nct_id\": row[\"nct_id\"],\n",
+    "                    \"diseases\": row[\"desease_condition\"],\n",
+    "                    \"embedding\": str(row[\"embeddings\"]),\n",
+    "                },\n",
+    "            )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "import pandas as pd\n",
+    "import rdflib\n",
+    "\n",
+    "# Load the disease descriptions from MGDEF.RRF\n",
+    "df_disease_descriptions = pd.read_csv(\"MGDEF.RRF\", sep=\"|\", header=0)\n",
+    "# Rename the column '#CUI' to 'CUI'\n",
+    "df_disease_descriptions.rename(columns={\"#CUI\": \"CUI\"}, inplace=True)\n",
+    "# Remove the last column, it's empty\n",
+    "df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]\n",
+    "# Filter out the rows where the SUPPRESS field is equal to 'Y'\n",
+    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"SUPPRESS\"] != \"Y\"]\n",
+    "# Some of the rows include a \\n character, so we need to remove the rows where the CUI field contains spaces or doesn't start with 'C'\n",
+    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"CUI\"].str.startswith(\"C\") & ~df_disease_descriptions[\"CUI\"].str.contains(\" \")]\n",
+    "# Remove the rows where the DEF field is empty\n",
+    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"DEF\"].notnull()]\n",
+    "df_disease_descriptions['uri'] = df_disease_descriptions['CUI'].apply(lambda x: f'http://identifiers.org/medgen/{x}')\n",
+    "\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin(): \n",
+    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.DiseaseDescriptions'))\n",
+    "        sql = f\"\"\"\n",
+    "                CREATE TABLE Test.DiseaseDescriptions (\n",
+    "                        uri VARCHAR(50),\n",
+    "                        description TEXT\n",
+    "                )\n",
+    "                \"\"\"\n",
+    "        result = conn.execute(text(sql))\n",
+    "\n",
+    "with engine.connect() as conn:\n",
+    "    with conn.begin():\n",
+    "        for index, row in df_disease_descriptions.iterrows():\n",
+    "            print(row['DEF'])\n",
+    "            print(row['uri'])\n",
+    "            sql = text(\"\"\"\n",
+    "                INSERT INTO Test.DiseaseDescriptions \n",
+    "                (uri, description) \n",
+    "                VALUES ( :uri, :description)\n",
+    "            \"\"\")\n",
+    "            conn.execute(sql, {\n",
+    "                'uri': row['uri'],\n",
+    "                'description': row['DEF'], \n",
+    "            })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "treehacks",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

disease_descriptions_with_embeddings.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:098d7a2172812d9eeaaa5f6be94d356d88e2338ee80425d300866e59ea5008db
+size 1075089337

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,11 @@

+services:
+  iris:
+    image: intersystemsdc/iris-community:latest
+    environment:
+      IRIS_USERNAME: demo
+      IRIS_PASSWORD: demo
+    restart: always
+    hostname: iris
+    ports:
+      - 1972:1972
+      - 52773:52773

entity_embeddings.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2d933ddb9bf7777abcc3e6aaf95c8e4212c6b2d3e145e8a5508d79a6fe01818
+size 86750825

get_embeddings_of_disease_descriptions.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# %%
+import pandas as pd
+# Load the disease descriptions from MGDEF.RRF (pipe-separated, first row is the header)
+df_disease_descriptions = pd.read_csv("MGDEF.RRF", sep="|", header=0)
+# Rename the column '#CUI' to 'CUI'
+df_disease_descriptions.rename(columns={"#CUI": "CUI"}, inplace=True)
+# Rename the column 'DEF' to 'definition'
+df_disease_descriptions.rename(columns={"DEF": "definition"}, inplace=True)
+# Remove the last column, it's empty
+df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]
+# Filter out the rows where the SUPPRESS field is equal to 'Y'
+df_disease_descriptions = df_disease_descriptions[
+    df_disease_descriptions["SUPPRESS"] != "Y"
+]
+# Some definitions contain a \n character, which splits a record across lines; keep only the rows whose CUI starts with 'C' and contains no spaces
+df_disease_descriptions = df_disease_descriptions[
+    df_disease_descriptions["CUI"].str.startswith("C")
+    & ~df_disease_descriptions["CUI"].str.contains(" ")
+]
+# Remove the rows where the definition field (originally DEF) is empty
+df_disease_descriptions = df_disease_descriptions[
+    df_disease_descriptions["definition"].notnull()
+]
+df_disease_descriptions["uri"] = df_disease_descriptions["CUI"].apply(
+    lambda x: f"http://identifiers.org/medgen/{x}"  # identifiers.org URI built from the CUI
+)
+# Drop the columns that are not needed (source, SUPPRESS, CUI)
+df_disease_descriptions.drop(columns=["source", "SUPPRESS", "CUI"], inplace=True)
+# Drop rows whose definition text duplicates an earlier row (the first occurrence is kept)
+df_disease_descriptions.drop_duplicates(subset=["definition"], inplace=True)
+# Reset the index so rows are numbered 0..n-1 after the filtering above
+df_disease_descriptions.reset_index(drop=True, inplace=True)
+# %%
+from sentence_transformers import SentenceTransformer
+encoder = SentenceTransformer("allenai-specter")  # sentence-embedding model used for the definitions
+vectors = encoder.encode(
+    df_disease_descriptions.definition, show_progress_bar=True, batch_size=64
+)
+vectors.shape  # bare expression: shows the array shape when run as an interactive cell
+# %%
+import numpy as np  # NOTE(review): np is not referenced in the lines shown here
+df_disease_descriptions["embeddings"] = vectors.astype(
+    "float32", casting="same_kind"
+).tolist()  # one Python list of floats per row
+# %%
+# Write the definitions, URIs and embeddings to a CSV file
+df_disease_descriptions.to_csv(
+    "disease_descriptions_with_embeddings.csv", index=False, header=True
+)
+# %%

graph.py ADDED Viewed

	@@ -0,0 +1,234 @@

+# %%
+import rdflib
+import pandas as pd
+def get_graph():
+    # File with the graph: MGCONSO.RRF
+    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
+    # Rename the column '#CUI' to 'CUI'
+    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
+    # Remove the last column, it's empty
+    df_concepts = df_concepts.iloc[:, :-1]
+    print(df_concepts.head())
+    # Create a graph
+    g = rdflib.Graph()
+    # Bind the namespace
+    g.bind("medgen", "http://identifiers.org/medgen/")
+    # Iterate over the rows
+    for i, row in df_concepts.iterrows():
+        if row.SUPPRESS == "Y":
+            continue
+        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
+            # Create the URI
+            uri = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI}")
+            # Add the triple
+            g.add((uri, rdflib.RDFS.label, rdflib.Literal(row.STR)))
+    # Now, load MGREL.RRF
+    df_relations = pd.read_csv("MGREL.RRF", sep="|", header=0)
+    # Rename the column '#CUI1' to 'CUI1'
+    df_relations.rename(columns={"#CUI1": "CUI1"}, inplace=True)
+    # Remove the last column, it's empty
+    df_relations = df_relations.iloc[:, :-1]
+    print(df_relations.head())
+    # Iterate over the rows
+    for i, row in df_relations.iterrows():
+        if row.SUPPRESS == "Y":
+            continue
+        # Create the URI
+        uri1 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI1}")
+        uri2 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI2}")
+        # Add the triple
+        if row.REL == "RL":
+            g.add((uri1, rdflib.URIRef("related"), uri2))
+            continue
+        g.add((uri1, rdflib.URIRef(f"http://identifiers.org/medgen/{row.REL}"), uri2))
+    return g
+def apply_rules_to_graph(g):
+    # Now, apply this rule: if two nodes have the same parent (i.e. node1 RB node2 and node3 RB node2, then node1 related node3)
+    # Query the graph to get the parents of each node
+    query = """
+    PREFIX medgen: <http://identifiers.org/medgen/>
+    SELECT DISTINCT ?parent ?child1 ?child2 WHERE {
+        ?parent medgen:RN ?child1 .
+        ?parent medgen:RN ?child2 .
+        FILTER (?child1 != ?child2)
+    }
+    """
+    res = g.query(query)
+    for row in res:
+        g.add((row.child1, rdflib.URIRef("related"), row.child2))
+        g.add((row.child2, rdflib.URIRef("related"), row.child1))
+    return g
+def get_labels_of_entities():
+    """
+    Returns a dictionary with the labels of the entities
+    """
+    # File with the graph: MGCONSO.RRF
+    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
+    # Rename the column '#CUI' to 'CUI'
+    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
+    # Remove the last column, it's empty
+    df_concepts = df_concepts.iloc[:, :-1]
+    # Create a dictionary
+    labels_of_entities = {}
+    # Iterate over the rows
+    for i, row in df_concepts.iterrows():
+        if row.SUPPRESS == "Y":
+            continue
+        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
+            labels_of_entities[f"http://identifiers.org/medgen/{row.CUI}"] = row.STR
+    return labels_of_entities
+def generate_triples_file(graph: rdflib.Graph):
+    with open("triples_medgen.tsv", "w") as f:
+        # Output the triples ?s ?p ?o
+        for s, p, o in graph.triples((None, rdflib.URIRef("related"), None)):
+            f.write(f"{s}\t{p}\t{o}\n")
+        for s, p, o in graph.triples(
+            (None, rdflib.URIRef("http://identifiers.org/medgen/RN"), None)
+        ):
+            f.write(f"{s}\t{p}\t{o}\n")
+        for s, p, o in graph.triples(
+            (None, rdflib.URIRef("http://identifiers.org/medgen/RB"), None)
+        ):
+            f.write(f"{s}\t{p}\t{o}\n")
+        for s, p, o in graph.triples((None, rdflib.URIRef("http://identifiers.org/medgen/PAR"), None)):
+            f.write(f"{s}\t{p}\t{o}\n")
+        for s, p, o in graph.triples((None, rdflib.URIRef("http://identifiers.org/medgen/CHD"), None)):
+            f.write(f"{s}\t{p}\t{o}\n")
+def save_adjacency_matrix():
+    # Load the triples file generated
+    df = pd.read_csv("triples_medgen.tsv", sep="\t", header=None)
+    # Now output the adjacency matrix, where the rows are the subjects and the columns are the objects
+    # The values are the relations (i.e. 0 if no relation and 1 if there is a relation)
+    # Get the unique subjects and objects
+    subjects = df[0].unique()
+    objects = df[2].unique()
+    # Create the adjacency matrix
+    adj_matrix = pd.DataFrame(0, index=subjects, columns=objects)
+    # Iterate over the rows
+    for i, row in df.iterrows():
+        adj_matrix.loc[row[0], row[2]] = 1
+    # Save the adjacency matrix
+    adj_matrix.to_csv("adjacency_matrix.mat", sep="\t")
+# %%
+g = get_graph()
+# %%
+g = apply_rules_to_graph(g)
+# %%
+labels_of_entities = get_labels_of_entities()
+# %%
+generate_triples_file(g)
+# %%
+from pykeen.triples import TriplesFactory
+from pykeen.models import TuckER, TransE, TransH
+from pykeen.pipeline import pipeline
+tf = TriplesFactory.from_path("triples_medgen.tsv")
+print(f"Triples count: {tf.num_triples}")
+training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=42, randomize_cleanup=False)
+result = pipeline(
+    training=training,
+    testing=testing,
+    validation=validation,
+    model=TransE,
+    stopper="early",
+    epochs=500,  # short epochs for testing - you should go
+    # higher, especially with early stopper enabled
+)
+result.save_to_directory("doctests/test_unstratified_stopped_complex")
+# %%
+import torch
+alzheimers = "http://identifiers.org/medgen/C1843013"
+# What does the model predict for Alzheimer's disease?
+model = result.model
+alzheimers_id = tf.entity_to_id[alzheimers]
+relation_id = tf.relation_to_id["related"]
+batch_to_predict = torch.tensor([[alzheimers_id, relation_id]])
+alzheimers_pred = model.predict_t(hr_batch=batch_to_predict)
+print(alzheimers_pred.shape)
+# Get the indices of the top 10 predictions
+top10 = torch.topk(alzheimers_pred, 10, largest=True)
+# Get the entities
+entities = tf.entity_id_to_label
+print(top10.indices)
+for i in top10.indices[0]:
+    # Ask the graph, what is the label for this entity?
+    query = f"""
+    PREFIX medgen: <http://identifiers.org/medgen/>
+    SELECT ?label WHERE {{
+        <{entities[i.item()]}> <http://www.w3.org/2000/01/rdf-schema#label> ?label
+    }}
+    """
+    res = g.query(query)
+    for i, row in enumerate(res):
+        print(f"{i}: {row}")
+# %%
+from pykeen.nn.representation import Embedding
+# Get the embeddings of all the entities
+entity_ids = torch.LongTensor(list(tf.entity_to_id.values())).cuda()
+entity_embeddings: Embedding = model.entity_representations[0]._embeddings(entity_ids)
+# Get the embeddings of the relations
+relation_ids = torch.LongTensor(list(tf.relation_to_id.values())).cuda()
+relation_embeddings: Embedding = model.relation_representations[0]._embeddings(
+    relation_ids
+)
+print(f"Entity embeddings shape: {entity_embeddings.shape}")
+print(f"Relation embeddings shape: {relation_embeddings.shape}")
+# Store the embeddings in a DataFrame
+df = pd.DataFrame(
+    {
+        "embedding": entity_embeddings.detach().cpu().tolist(),
+        "label": [
+            labels_of_entities[tf.entity_id_to_label[i]] if tf.entity_id_to_label[i] in labels_of_entities else ""
+            for i in range(len(tf.entity_id_to_label))
+        ],
+        "uri": [
+            f"{tf.entity_id_to_label[i]}" for i in range(len(tf.entity_id_to_label))
+        ],
+    },
+    index=range(len(entity_embeddings)),
+)
+## Save the DataFrame
+df.to_csv("entity_embeddings.csv")
+# Store the embeddings in a DataFrame
+df = pd.DataFrame(
+    {
+        "embedding": relation_embeddings.detach().cpu().tolist(),
+        "label": [
+            tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
+        ],
+        "uri": [
+            f"{tf.relation_id_to_label[i]}" for i in range(len(tf.relation_id_to_label))
+        ],
+    },
+    index=range(len(relation_embeddings)),
+)
+## Save the DataFrame
+df.to_csv("relation_embeddings.csv")
+# %%
+import pyobo
+pyobo.get_name("mesh", "16793")
+# %%

graph_analysis.m ADDED Viewed

	@@ -0,0 +1,49 @@

+% Read the pipe-delimited relations file MGREL.RRF into a table
+data = readtable('MGREL.RRF', Delimiter='|', FileType='text', NumHeaderLines=0, VariableNamingRule='preserve');
+data = renamevars(data,"#CUI1","CUI1");  % drop the leading '#' from the first column name
+data = data(1:1000,:);  % work on the first 1000 rows only
+ids_1 = data.CUI1;
+for k = 1 : length(ids_1)
+    cellContents = ids_1{k};
+    % Drop the first character of the ID and stick the rest back into the cell
+    ids_1{k} = cellContents(2:end);
+end
+ids_1 = str2double(ids_1);  % convert the remaining text to numbers
+ids_2 = data.CUI2;  % NOTE(review): overwritten on the next line, so CUI2 is never used - confirm intent
+ids_2 = data.CUI1(2:end);  % CUI1 again, starting from the second row
+for k = 1 : length(ids_2)
+    cellContents = ids_2{k};
+    % Drop the first character of the ID and stick the rest back into the cell
+    ids_2{k} = cellContents(2:end);
+end
+ids_2 = str2double(ids_2);  % convert the remaining text to numbers
+ids_1 = ids_1(1:end-1);  % drop the last source id
+ids_2 = ids_2(2:end);  % NOTE(review): ids_2 was already one shorter, so it is now one element shorter than ids_1 - confirm
+% Get the number of unique nodes
+%nodes = unique([ids_1; ids_2]);
+%num_nodes = length(nodes);
+% Initialize sparse adjacency matrix
+%A = sparse(ids_1, ids_2, 1, max(ids_2), max(ids_2));
+% Display adjacency matrix
+%disp(A);
+%G = digraph(A);
+G = digraph(ids_1, ids_2);  % directed graph with edges ids_1(k) -> ids_2(k)
+[bin,binsize] = conncomp(G,'Type','weak');  % weakly connected components
+bin(1:100)  % no semicolon: prints the component index of the first 100 nodes
+size(unique(bin))  % prints the size of the list of distinct component indices
+max(binsize)  % prints the size of the largest component
+pg_ranks = centrality(G,'pagerank');
+G.Nodes.PageRank = pg_ranks;  % store the PageRank score as a node attribute
+%hub_ranks = centrality(G,'hubs');
+%auth_ranks = centrality(G,'authorities');
+%G.Nodes.Hubs = hub_ranks;
+%G.Nodes.Authorities = auth_ranks;
+G.Nodes  % no semicolon: prints the node table
+%plot(G);

graph_visualization.mlapp ADDED Viewed

Binary file (30.2 kB). View file

main.ipynb ADDED Viewed

	@@ -0,0 +1,417 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>nct_id</th>\n",
+       "      <th>mesh_term</th>\n",
+       "      <th>downcase_mesh_term</th>\n",
+       "      <th>mesh_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>336369685</td>\n",
+       "      <td>NCT04016870</td>\n",
+       "      <td>Infections</td>\n",
+       "      <td>infections</td>\n",
+       "      <td>mesh-ancestor</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>336369788</td>\n",
+       "      <td>NCT03266874</td>\n",
+       "      <td>Necrosis</td>\n",
+       "      <td>necrosis</td>\n",
+       "      <td>mesh-list</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>336369897</td>\n",
+       "      <td>NCT02743455</td>\n",
+       "      <td>Fever</td>\n",
+       "      <td>fever</td>\n",
+       "      <td>mesh-list</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>336370004</td>\n",
+       "      <td>NCT01683877</td>\n",
+       "      <td>Neoplasms</td>\n",
+       "      <td>neoplasms</td>\n",
+       "      <td>mesh-ancestor</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>336370095</td>\n",
+       "      <td>NCT01268579</td>\n",
+       "      <td>Carcinoma</td>\n",
+       "      <td>carcinoma</td>\n",
+       "      <td>mesh-list</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          id       nct_id   mesh_term downcase_mesh_term      mesh_type\n",
+       "0  336369685  NCT04016870  Infections         infections  mesh-ancestor\n",
+       "1  336369788  NCT03266874    Necrosis           necrosis      mesh-list\n",
+       "2  336369897  NCT02743455       Fever              fever      mesh-list\n",
+       "3  336370004  NCT01683877   Neoplasms          neoplasms  mesh-ancestor\n",
+       "4  336370095  NCT01268579   Carcinoma          carcinoma      mesh-list"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = pd.read_csv('file_db/browse_conditions.txt', delimiter='|')  # Use the appropriate delimiter if not tab-separated\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files_to_keep = [\"brief_summaries\", \"interventions\", \"keywords\", \"browse_conditions\"]\n",
+    "\n",
+    "# maybe \"study_references\" \"sponsors\" \"overall_officials\" \"pending_results\" \"outcome_analyses\" \"provided_documents\" \"reported_event_totals\" \"responsible_parties\"\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nct_id</th>\n",
+       "      <th>summary</th>\n",
+       "      <th>intervention_name</th>\n",
+       "      <th>intervention_type</th>\n",
+       "      <th>intervention_description</th>\n",
+       "      <th>keywords</th>\n",
+       "      <th>desease_condition</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>NCT03569293</td>\n",
+       "      <td>The objective of this study is to assess the e...</td>\n",
+       "      <td>[Placebo for Upadacitinib, Upadacitinib]</td>\n",
+       "      <td>Drug</td>\n",
+       "      <td>Tablets taken orally once a day</td>\n",
+       "      <td>[Atopic Dermatitis, Upadacitinib]</td>\n",
+       "      <td>[dermatitis, atopic, dermatitis, eczema, skin ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NCT03556839</td>\n",
+       "      <td>The study will integrate the efficacy of combi...</td>\n",
+       "      <td>[Atezolizumab, Bevacizumab, Cisplatin/Carbopla...</td>\n",
+       "      <td>Drug</td>\n",
+       "      <td>Intravenous Infusion</td>\n",
+       "      <td>[Cervix, Carcinoma, Atezolizumab]</td>\n",
+       "      <td>[carcinoma, neoplasms, glandular and epithelia...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>NCT03526874</td>\n",
+       "      <td>Migraine affects 10-28% of children and adoles...</td>\n",
+       "      <td>[Lidocaine 4% Topical Application Cream [LMX 4...</td>\n",
+       "      <td>Drug</td>\n",
+       "      <td>Run-in Step: All subjects receive 32 mg (4 cm ...</td>\n",
+       "      <td>[Episodic Migraine, Headache, Nerve Block, Pai...</td>\n",
+       "      <td>[pain, migraine disorders, headache, headache ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>NCT03526835</td>\n",
+       "      <td>This is a Phase 1/2 open-label, multi-center, ...</td>\n",
+       "      <td>[MCLA-158, MCLA-158 +Pembrolizumab]</td>\n",
+       "      <td>Drug</td>\n",
+       "      <td>full-length IgG1 bispecific antibody targeting...</td>\n",
+       "      <td>[Bispecific antibody, First-in-human, MCLA-158...</td>\n",
+       "      <td>[squamous cell carcinoma of head and neck, neo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>NCT02272751</td>\n",
+       "      <td>This study will aim to compare the effects of ...</td>\n",
+       "      <td>[Exercise, Relaxation]</td>\n",
+       "      <td>Behavioral</td>\n",
+       "      <td>The Exercise intervention will consist of aero...</td>\n",
+       "      <td>[cancer survivorship, exercise, relaxation, mi...</td>\n",
+       "      <td>[lymphoma, neoplasms by histologic type, neopl...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         nct_id                                            summary  \\\n",
+       "0   NCT03569293  The objective of this study is to assess the e...   \n",
+       "2   NCT03556839  The study will integrate the efficacy of combi...   \n",
+       "6   NCT03526874  Migraine affects 10-28% of children and adoles...   \n",
+       "9   NCT03526835  This is a Phase 1/2 open-label, multi-center, ...   \n",
+       "11  NCT02272751  This study will aim to compare the effects of ...   \n",
+       "\n",
+       "                                    intervention_name intervention_type  \\\n",
+       "0            [Placebo for Upadacitinib, Upadacitinib]              Drug   \n",
+       "2   [Atezolizumab, Bevacizumab, Cisplatin/Carbopla...              Drug   \n",
+       "6   [Lidocaine 4% Topical Application Cream [LMX 4...              Drug   \n",
+       "9                 [MCLA-158, MCLA-158 +Pembrolizumab]              Drug   \n",
+       "11                             [Exercise, Relaxation]        Behavioral   \n",
+       "\n",
+       "                             intervention_description  \\\n",
+       "0                     Tablets taken orally once a day   \n",
+       "2                                Intravenous Infusion   \n",
+       "6   Run-in Step: All subjects receive 32 mg (4 cm ...   \n",
+       "9   full-length IgG1 bispecific antibody targeting...   \n",
+       "11  The Exercise intervention will consist of aero...   \n",
+       "\n",
+       "                                             keywords  \\\n",
+       "0                   [Atopic Dermatitis, Upadacitinib]   \n",
+       "2                   [Cervix, Carcinoma, Atezolizumab]   \n",
+       "6   [Episodic Migraine, Headache, Nerve Block, Pai...   \n",
+       "9   [Bispecific antibody, First-in-human, MCLA-158...   \n",
+       "11  [cancer survivorship, exercise, relaxation, mi...   \n",
+       "\n",
+       "                                    desease_condition  \n",
+       "0   [dermatitis, atopic, dermatitis, eczema, skin ...  \n",
+       "2   [carcinoma, neoplasms, glandular and epithelia...  \n",
+       "6   [pain, migraine disorders, headache, headache ...  \n",
+       "9   [squamous cell carcinoma of head and neck, neo...  \n",
+       "11  [lymphoma, neoplasms by histologic type, neopl...  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_summary = pd.read_csv('file_db/brief_summaries.txt', delimiter='|')\n",
+    "df_summary = df_summary.rename(columns={'description': 'summary'})\n",
+    "\n",
+    "### create and merge intervention ###\n",
+    "df_intervention = pd.read_csv('file_db/interventions.txt', delimiter='|')\n",
+    "\n",
+    "intervention_grouped = df_intervention.groupby('nct_id')['name'].apply(list).reset_index()\n",
+    "intervention_grouped = intervention_grouped.rename(columns={'name': 'intervention_name'})\n",
+    "merged_df = pd.merge(\n",
+    "    df_summary[['nct_id', 'summary']], \n",
+    "    intervention_grouped[['nct_id', 'intervention_name']], \n",
+    "    on='nct_id')\n",
+    "\n",
+    "df_intervention = df_intervention.rename(columns={'description': 'intervention_description'})\n",
+    "\n",
+    "merged_df = pd.merge(\n",
+    "    merged_df,\n",
+    "    df_intervention[['nct_id', 'intervention_type', 'intervention_description']], \n",
+    "    on='nct_id')\n",
+    "\n",
+    "### create and merge keywords ###\n",
+    "df_keyword = pd.read_csv('file_db/keywords.txt', delimiter='|')\n",
+    "keywords_grouped = df_keyword.groupby('nct_id')['name'].apply(list).reset_index()\n",
+    "keywords_grouped = keywords_grouped.rename(columns={'name': 'keywords'})\n",
+    "\n",
+    "merged_df = pd.merge(\n",
+    "    merged_df,\n",
+    "    keywords_grouped,\n",
+    "    on='nct_id'\n",
+    ")\n",
+    "\n",
+    "### create and merge browse conditions\n",
+    "df_condition = pd.read_csv('file_db/browse_conditions.txt', delimiter='|')\n",
+    "conditions_grouped = df_condition.groupby('nct_id')['downcase_mesh_term'].apply(list).reset_index()\n",
+    "conditions_grouped = conditions_grouped.rename(columns={'downcase_mesh_term': 'desease_condition'})\n",
+    "\n",
+    "merged_df = pd.merge(\n",
+    "    merged_df,\n",
+    "    conditions_grouped,\n",
+    "    on='nct_id'\n",
+    ")\n",
+    "\n",
+    "merged_df = merged_df.drop_duplicates(subset='nct_id')\n",
+    "\n",
+    "merged_df.head()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>desease_condition</th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[dermatitis, atopic, dermatitis, eczema, skin ...</td>\n",
+       "      <td>nct_id: NCT03569293\\nsummary: The objective of...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[carcinoma, neoplasms, glandular and epithelia...</td>\n",
+       "      <td>nct_id: NCT03556839\\nsummary: The study will i...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>[pain, migraine disorders, headache, headache ...</td>\n",
+       "      <td>nct_id: NCT03526874\\nsummary: Migraine affects...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>[squamous cell carcinoma of head and neck, neo...</td>\n",
+       "      <td>nct_id: NCT03526835\\nsummary: This is a Phase ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>[lymphoma, neoplasms by histologic type, neopl...</td>\n",
+       "      <td>nct_id: NCT02272751\\nsummary: This study will ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                    desease_condition  \\\n",
+       "0   [dermatitis, atopic, dermatitis, eczema, skin ...   \n",
+       "2   [carcinoma, neoplasms, glandular and epithelia...   \n",
+       "6   [pain, migraine disorders, headache, headache ...   \n",
+       "9   [squamous cell carcinoma of head and neck, neo...   \n",
+       "11  [lymphoma, neoplasms by histologic type, neopl...   \n",
+       "\n",
+       "                                                 text  \n",
+       "0   nct_id: NCT03569293\\nsummary: The objective of...  \n",
+       "2   nct_id: NCT03556839\\nsummary: The study will i...  \n",
+       "6   nct_id: NCT03526874\\nsummary: Migraine affects...  \n",
+       "9   nct_id: NCT03526835\\nsummary: This is a Phase ...  \n",
+       "11  nct_id: NCT02272751\\nsummary: This study will ...  "
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Concatenate all columns into one written text\n",
+    "merged_df['text'] = merged_df.drop(columns=['desease_condition']).apply(lambda row: '\\n'.join([f\"{col}: {val}\" for col, val in row.items()]), axis=1)\n",
+    "\n",
+    "# Save the DataFrame to a new CSV file\n",
+    "merged_df = merged_df[['desease_condition', 'text']]\n",
+    "merged_df.to_csv('clinical_trials.csv', index=False)\n",
+    "\n",
+    "merged_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

mock_trial.json ADDED Viewed

	@@ -0,0 +1,305 @@

+{
+    "protocolSection": {
+        "identificationModule": {
+            "nctId": "NCT00841061",
+            "orgStudyIdInfo": {
+                "id": "B530500"
+            },
+            "secondaryIdInfos": [
+                {
+                    "id": "HD40315-01"
+                }
+            ],
+            "organization": {
+                "fullName": "Eunice Kennedy Shriver National Institute of Child Health and Human Development (NICHD)",
+                "class": "NIH"
+            },
+            "briefTitle": "Cereals as a Source of Iron for Breastfed Infants",
+            "officialTitle": "Breast Feeding and Iron: Comparison of Cereals Fortified With Different Forms of Iron",
+            "acronym": "Bfe03B"
+        },
+        "statusModule": {
+            "statusVerifiedDate": "2009-01",
+            "overallStatus": "COMPLETED",
+            "expandedAccessInfo": {
+                "hasExpandedAccess": false
+            },
+            "startDateStruct": {
+                "date": "2003-07"
+            },
+            "primaryCompletionDateStruct": {
+                "date": "2006-05",
+                "type": "ACTUAL"
+            },
+            "completionDateStruct": {
+                "date": "2006-05",
+                "type": "ACTUAL"
+            },
+            "studyFirstSubmitDate": "2009-02-09",
+            "studyFirstSubmitQcDate": "2009-02-09",
+            "studyFirstPostDateStruct": {
+                "date": "2009-02-11",
+                "type": "ESTIMATED"
+            },
+            "lastUpdateSubmitDate": "2009-02-09",
+            "lastUpdatePostDateStruct": {
+                "date": "2009-02-11",
+                "type": "ESTIMATED"
+            }
+        },
+        "sponsorCollaboratorsModule": {
+            "responsibleParty": {
+                "oldNameTitle": "Dr. Ekhard E. Ziegler",
+                "oldOrganization": "University of Iowa"
+            },
+            "leadSponsor": {
+                "name": "National Institutes of Health (NIH)",
+                "class": "NIH"
+            }
+        },
+        "oversightModule": {
+            "oversightHasDmc": false
+        },
+        "descriptionModule": {
+            "briefSummary": "The purpose of this research study is to determine whether the type of iron in infant cereals makes a differance in how well the cereal helps infants remain free of iron deficiency."
+        },
+        "conditionsModule": {
+            "conditions": [
+                "Iron Deficiency"
+            ]
+        },
+        "designModule": {
+            "studyType": "INTERVENTIONAL",
+            "phases": [
+                "NA"
+            ],
+            "designInfo": {
+                "allocation": "RANDOMIZED",
+                "interventionModel": "PARALLEL",
+                "primaryPurpose": "PREVENTION",
+                "maskingInfo": {
+                    "masking": "QUADRUPLE",
+                    "whoMasked": [
+                        "PARTICIPANT",
+                        "CARE_PROVIDER",
+                        "INVESTIGATOR",
+                        "OUTCOMES_ASSESSOR"
+                    ]
+                }
+            },
+            "enrollmentInfo": {
+                "count": 111,
+                "type": "ACTUAL"
+            }
+        },
+        "armsInterventionsModule": {
+            "armGroups": [
+                {
+                    "label": "Cereal L",
+                    "type": "ACTIVE_COMPARATOR",
+                    "description": "Rice cereal with electrolytic iron",
+                    "interventionNames": [
+                        "Dietary Supplement: electrolytic iron"
+                    ]
+                },
+                {
+                    "label": "Cereal M",
+                    "type": "ACTIVE_COMPARATOR",
+                    "description": "Rice cereal with ferrous fumarate",
+                    "interventionNames": [
+                        "Dietary Supplement: ferrous fumarate"
+                    ]
+                }
+            ],
+            "interventions": [
+                {
+                    "type": "DIETARY_SUPPLEMENT",
+                    "name": "electrolytic iron",
+                    "description": "1/4 a cup of cereal fortified with electrolytic iron per day between the ages of 112 days and 280 days of age",
+                    "armGroupLabels": [
+                        "Cereal L"
+                    ]
+                },
+                {
+                    "type": "DIETARY_SUPPLEMENT",
+                    "name": "ferrous fumarate",
+                    "description": "1/4 cup of cereal fortified with ferrous fumarate to be fed per day between ages of 112 days and 280 days of age",
+                    "armGroupLabels": [
+                        "Cereal M"
+                    ]
+                }
+            ]
+        },
+        "outcomesModule": {
+            "primaryOutcomes": [
+                {
+                    "measure": "plasma ferritin",
+                    "timeFrame": "280 days"
+                }
+            ],
+            "secondaryOutcomes": [
+                {
+                    "measure": "hemoglobin",
+                    "timeFrame": "280"
+                }
+            ]
+        },
+        "eligibilityModule": {
+            "eligibilityCriteria": "Inclusion Criteria:\n\n* exclusively breastfed\n* birth weight between 2500 and 4200g\n* gestational age \\>36 weeks\n\nExclusion Criteria:\n\n* supplementing formula\n* no iron drops",
+            "healthyVolunteers": true,
+            "sex": "ALL",
+            "minimumAge": "28 Days",
+            "maximumAge": "1 Year",
+            "stdAges": [
+                "CHILD"
+            ]
+        },
+        "contactsLocationsModule": {
+            "overallOfficials": [
+                {
+                    "name": "Ekhard E Ziegler, MD",
+                    "affiliation": "University of Iowa",
+                    "role": "PRINCIPAL_INVESTIGATOR"
+                }
+            ],
+            "locations": [
+                {
+                    "facility": "University of Iowa",
+                    "city": "Iowa City",
+                    "state": "Iowa",
+                    "zip": "52242",
+                    "country": "United States",
+                    "geoPoint": {
+                        "lat": 41.66113,
+                        "lon": -91.53017
+                    }
+                }
+            ]
+        },
+        "referencesModule": {
+            "references": [
+                {
+                    "pmid": "21178077",
+                    "type": "DERIVED",
+                    "citation": "Ziegler EE, Fomon SJ, Nelson SE, Jeter JM, Theuer RC. Dry cereals fortified with electrolytic iron or ferrous fumarate are equally effective in breast-fed infants. J Nutr. 2011 Feb;141(2):243-8. doi: 10.3945/jn.110.127266. Epub 2010 Dec 22."
+                }
+            ]
+        }
+    },
+    "derivedSection": {
+        "miscInfoModule": {
+            "versionHolder": "2024-05-03"
+        },
+        "conditionBrowseModule": {
+            "meshes": [
+                {
+                    "id": "D000090463",
+                    "term": "Iron Deficiencies"
+                }
+            ],
+            "ancestors": [
+                {
+                    "id": "D000019189",
+                    "term": "Iron Metabolism Disorders"
+                },
+                {
+                    "id": "D000008659",
+                    "term": "Metabolic Diseases"
+                }
+            ],
+            "browseLeaves": [
+                {
+                    "id": "M20857",
+                    "name": "Anemia, Iron-Deficiency",
+                    "relevance": "LOW"
+                },
+                {
+                    "id": "M2781",
+                    "name": "Iron Deficiencies",
+                    "asFound": "Iron Deficiency",
+                    "relevance": "HIGH"
+                },
+                {
+                    "id": "M11639",
+                    "name": "Metabolic Diseases",
+                    "relevance": "LOW"
+                },
+                {
+                    "id": "M21177",
+                    "name": "Iron Metabolism Disorders",
+                    "relevance": "LOW"
+                }
+            ],
+            "browseBranches": [
+                {
+                    "abbrev": "BC15",
+                    "name": "Blood and Lymph Conditions"
+                },
+                {
+                    "abbrev": "BC18",
+                    "name": "Nutritional and Metabolic Diseases"
+                },
+                {
+                    "abbrev": "All",
+                    "name": "All Conditions"
+                }
+            ]
+        },
+        "interventionBrowseModule": {
+            "meshes": [
+                {
+                    "id": "C000031621",
+                    "term": "Ferrous fumarate"
+                }
+            ],
+            "ancestors": [
+                {
+                    "id": "D000014131",
+                    "term": "Trace Elements"
+                },
+                {
+                    "id": "D000018977",
+                    "term": "Micronutrients"
+                },
+                {
+                    "id": "D000045505",
+                    "term": "Physiological Effects of Drugs"
+                }
+            ],
+            "browseLeaves": [
+                {
+                    "id": "M10533",
+                    "name": "Iron",
+                    "relevance": "LOW"
+                },
+                {
+                    "id": "M225448",
+                    "name": "Ferrous fumarate",
+                    "asFound": "Mouse",
+                    "relevance": "HIGH"
+                },
+                {
+                    "id": "M21009",
+                    "name": "Micronutrients",
+                    "relevance": "LOW"
+                },
+                {
+                    "id": "M16885",
+                    "name": "Trace Elements",
+                    "relevance": "LOW"
+                }
+            ],
+            "browseBranches": [
+                {
+                    "abbrev": "Micro",
+                    "name": "Micronutrients"
+                },
+                {
+                    "abbrev": "All",
+                    "name": "All Drugs and Chemicals"
+                }
+            ]
+        }
+    },
+    "hasResults": false
+}

relation_embeddings.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81f5c180ff1a488b185fb73bb512b3e4402d0fcc6b483d1592a768a6a376a261
+size 5747

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+sqlalchemy-iris==0.13.3
+datasets==2.19.0
+pandas==2.2.0
+pykeen==1.10.2
+rdflib==7.0.0
+scipy==1.13.0
+pyobo==0.10.11
+langchain==0.1.17
+openai==1.25.1
+sentence_transformers==2.7.0
+streamlit-agraph
+streamlit==1.34.0

utils.py ADDED Viewed

	@@ -0,0 +1,107 @@

+# %%
+from typing import List, Dict, Any
+import os
+from sqlalchemy import create_engine, text
+import requests
def get_all_diseases_name(engine) -> List[str]:
    """Return one name per row of ``Test.EntityEmbeddings``.

    Args:
        engine: SQLAlchemy engine (anything exposing ``connect()``) bound to
            the database that holds ``Test.EntityEmbeddings``.

    Returns:
        The value at column index 1 of every row, skipping rows where that
        value is the literal string ``"nan"``.
    """
    # Plain string: the statement has no placeholders, so an f-string is not needed.
    sql = """
            SELECT * FROM Test.EntityEmbeddings
            """
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(text(sql)).fetchall()
    # NOTE(review): index 1 is presumably the ``label`` column -- confirm
    # against the table schema, since ``SELECT *`` depends on column order.
    return [row[1] for row in rows if row[1] != "nan"]
def get_uri_from_name(engine, name: str) -> str:
    """Return the last path segment of the URI stored for the entity *name*.

    Args:
        engine: SQLAlchemy engine bound to the database that holds
            ``Test.EntityEmbeddings``.
        name: Exact value of the ``label`` column to look up.

    Returns:
        The text after the final ``/`` of the first matching row's ``uri``.

    Raises:
        IndexError: If no row has ``label`` equal to *name*.
    """
    # Bind the label as a parameter instead of interpolating it into the SQL:
    # labels containing a quote (e.g. "Alzheimer's disease") would otherwise
    # break the statement, and arbitrary input could inject SQL.
    stmt = text(
        """
        SELECT uri FROM Test.EntityEmbeddings
        WHERE label = :name
        """
    )
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(stmt, {"name": name}).fetchall()
    if not rows:
        # Same exception type the unguarded ``rows[0]`` used to raise, but
        # with a message that says what was not found.
        raise IndexError(f"no entity with label {name!r} in Test.EntityEmbeddings")
    return rows[0][0].split("/")[-1]
def get_most_similar_diseases_from_uri(engine, original_disease_uri: str, threshold: float = 0.8) -> List[tuple]:
    """Return up to ten entities whose embedding is closest to the given one.

    NOTE(review): this function is defined a second time further down the
    file; at import time the later definition replaces this one.

    Args:
        engine: SQLAlchemy engine bound to the database that holds
            ``Test.EntityEmbeddings``.
        original_disease_uri: Identifier appended to
            ``http://identifiers.org/medgen/`` to form the reference URI.
        threshold: Only rows whose ``VECTOR_COSINE`` value against the
            reference embedding is strictly greater than this are returned.

    Returns:
        ``(identifier, label, cosine)`` tuples ordered by descending cosine
        value, where ``identifier`` is the last path segment of the matched
        URI. Rows whose label is the literal string ``"nan"`` are skipped.
    """
    # Both user-supplied values are bound as parameters so they can never be
    # interpreted as SQL.
    stmt = text(
        """
        SELECT TOP 10 e1.uri AS uri1, e2.uri AS uri2, e1.label AS label1, e2.label AS label2,
        VECTOR_COSINE(e1.embedding, e2.embedding) AS distance
        FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
        WHERE e1.uri = :uri
        AND VECTOR_COSINE(e1.embedding, e2.embedding) > :threshold
        AND e1.uri != e2.uri
        ORDER BY distance DESC
        """
    )
    params = {
        "uri": f"http://identifiers.org/medgen/{original_disease_uri}",
        "threshold": threshold,
    }
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(stmt, params).fetchall()
    # Column order follows the SELECT list: uri1, uri2, label1, label2, distance.
    return [(row[1].split("/")[-1], row[3], row[4]) for row in rows if row[3] != "nan"]
def get_uri_from_name(engine, name: str) -> str:
    """Return the last path segment of the URI stored for the entity *name*.

    NOTE(review): a function with this name is also defined earlier in the
    file; this later definition is the one in effect after import.

    Args:
        engine: SQLAlchemy engine bound to the database that holds
            ``Test.EntityEmbeddings``.
        name: Exact value of the ``label`` column to look up.

    Returns:
        The text after the final ``/`` of the first matching row's ``uri``.

    Raises:
        IndexError: If no row has ``label`` equal to *name*.
    """
    # Bind the label as a parameter instead of interpolating it into the SQL:
    # labels containing a quote would otherwise break the statement, and
    # arbitrary input could inject SQL.
    stmt = text(
        """
        SELECT uri FROM Test.EntityEmbeddings
        WHERE label = :name
        """
    )
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(stmt, {"name": name}).fetchall()
    if not rows:
        # Same exception type the unguarded ``rows[0]`` used to raise, but
        # with a message that says what was not found.
        raise IndexError(f"no entity with label {name!r} in Test.EntityEmbeddings")
    return rows[0][0].split("/")[-1]
def get_most_similar_diseases_from_uri(engine, original_disease_uri: str, threshold: float = 0.8) -> List[tuple]:
    """Return up to ten entities whose embedding is closest to the given one.

    Args:
        engine: SQLAlchemy engine bound to the database that holds
            ``Test.EntityEmbeddings``.
        original_disease_uri: Identifier appended to
            ``http://identifiers.org/medgen/`` to form the reference URI.
        threshold: Only rows whose ``VECTOR_COSINE`` value against the
            reference embedding is strictly greater than this are returned.

    Returns:
        ``(identifier, label, cosine)`` tuples ordered by descending cosine
        value, where ``identifier`` is the last path segment of the matched
        URI. Rows whose label is the literal string ``"nan"`` are skipped.
    """
    # Both caller-supplied values are bound as parameters rather than
    # formatted into the statement, so they can never be interpreted as SQL.
    stmt = text(
        """
        SELECT TOP 10 e1.uri AS uri1, e2.uri AS uri2, e1.label AS label1, e2.label AS label2,
        VECTOR_COSINE(e1.embedding, e2.embedding) AS distance
        FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
        WHERE e1.uri = :uri
        AND VECTOR_COSINE(e1.embedding, e2.embedding) > :threshold
        AND e1.uri != e2.uri
        ORDER BY distance DESC
        """
    )
    params = {
        "uri": f"http://identifiers.org/medgen/{original_disease_uri}",
        "threshold": threshold,
    }
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(stmt, params).fetchall()
    # Column order follows the SELECT list: uri1, uri2, label1, label2, distance.
    # The alias is "distance", but the query keeps values above the threshold
    # and sorts them descending, i.e. larger means more similar here.
    return [(row[1].split("/")[-1], row[3], row[4]) for row in rows if row[3] != "nan"]
def get_clinical_record_info(clinical_record_id: str, *, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch one study from the ClinicalTrials.gov v2 API as parsed JSON.

    Args:
        clinical_record_id: Study identifier placed at the end of the
            request path, e.g. ``"NCT00841061"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # Equivalent request:
    # curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" \
    # -H "accept: application/json"
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error payload
    # (or a JSON decode failure) to the caller as if it were a study.
    response.raise_for_status()
    return response.json()
def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Fetch every listed study, one lookup per id, in the order given.

    Args:
        clinical_record_ids: Study identifiers to look up.

    Returns:
        The result of ``get_clinical_record_info`` for each id, in input order.
    """
    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]
if __name__ == "__main__":
    # Manual smoke test: query the IRIS database for neighbours of one
    # disease and for one label's URI, then fetch one study over HTTP.
    username = 'demo'
    password = 'demo'
    hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
    port = '1972'
    namespace = 'USER'
    CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
    try:
        engine = create_engine(CONNECTION_STRING)
        # The engine must be passed explicitly; omitting it raised a
        # TypeError that the handler below silently turned into a print.
        diseases = get_most_similar_diseases_from_uri(engine, 'C1843013')
        for disease in diseases:
            print(disease)
        # Kept inside the try so it never runs with ``engine`` unbound when
        # engine creation itself failed.
        print(get_uri_from_name(engine, 'Alzheimer disease 3'))
    except Exception as e:
        # Top-level boundary of a demo script: report the database problem
        # and still exercise the HTTP lookup below.
        print(e)
    clinical_record_info = get_clinical_records_by_ids(['NCT00841061'])
    print(clinical_record_info)