Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

ACMCMC committed on May 4, 2024

Commit

1f35211

1 Parent(s): 2408e3d

WIP app

Browse files

Files changed (2) hide show

app.py +15 -4
utils.py +47 -16

app.py CHANGED Viewed

@@ -3,8 +3,10 @@ from streamlit_agraph import agraph, Node, Edge, Config
 import os
 from sqlalchemy import create_engine, text
 import pandas as pd
-from utils import get_all_diseases_name, get_most_similar_diseases_from_uri, get_uri_from_name, get_diseases_related_to_a_textual_description
 import json
 username = 'demo'
@@ -15,11 +17,17 @@ namespace = 'USER'
 CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
 engine = create_engine(CONNECTION_STRING)
-def handle_click_on_analyze_button():
     # 1. Embed the textual description that the user entered using the model
-    diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(description_input)
     # 2. Get 5 diseases with the highest cosine similarity from the DB
     # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
     # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
     # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
     # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
@@ -31,7 +39,10 @@ def handle_click_on_analyze_button():
 st.write("# Klìnic")
-description_input = st.text_input(label="Enter the disease description 👇")
 st.write(":red[Here should be the graph]")  # TODO remove
 chart_data = pd.DataFrame(

 import os
 from sqlalchemy import create_engine, text
 import pandas as pd
+from utils import get_all_diseases_name, get_most_similar_diseases_from_uri, get_uri_from_name, get_diseases_related_to_a_textual_description, get_similarities_among_diseases_uris
 import json
+import numpy as np
+from sentence_transformers import SentenceTransformer
 username = 'demo'
 CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
 engine = create_engine(CONNECTION_STRING)
+def handle_click_on_analyze_button(user_text):
     # 1. Embed the textual description that the user entered using the model
     # 2. Get 5 diseases with the highest cosine similarity from the DB
+    encoder = SentenceTransformer("allenai-specter")
+    diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(user_text, encoder)
+    #for disease_label in diseases_related_to_the_user_text:
+    #    st.text(disease_label)
     # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
+    diseases_uris = [disease['uri'] for disease in diseases_related_to_the_user_text]
+    get_similarities_among_diseases_uris(diseases_uris)
+    print(diseases_related_to_the_user_text)
     # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
     # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
     # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
 st.write("# Klìnic")
+description_input = st.text_input(label="Enter the disease description 👇", placeholder='A disease that causes memory loss and other cognitive impairments.')
+if st.button("Analyze"):
+    handle_click_on_analyze_button(description_input)
+# TODO: also when user clicks enter
 st.write(":red[Here should be the graph]")  # TODO remove
 chart_data = pd.DataFrame(

utils.py CHANGED Viewed

@@ -5,6 +5,15 @@ from sqlalchemy import create_engine, text
 import requests
 from sentence_transformers import SentenceTransformer
 def get_all_diseases_name(engine) -> List[List[str]]:
     with engine.connect() as conn:
@@ -98,46 +107,48 @@ def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str
     return clinical_records
-def get_uris_of_similar_diseases(uri_list: List[str]) -> List[tuple[str, str, float]]:
-    uri_list = tuple(uri_list)
     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
                     SELECT e1.uri AS uri1, e2.uri AS uri2, VECTOR_COSINE(e1.embedding, e2.embedding) AS distance
                     FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
-                    WHERE e1.uri IN {uri_list} AND e2.uri IN {uri_list} AND e1.uri != e2.uri
                 """
             result = conn.execute(text(sql))
             data = result.fetchall()
     return data
-encoder = SentenceTransformer("allenai-specter")
-def get_embedding(string: str) -> List[float]:
     # Embed the string using sentence-transformers
     vector = encoder.encode(string, show_progress_bar=False)
     return vector
-def get_diseases_related_to_a_textual_description(description: str) -> List[str]:
     # Embed the description using sentence-transformers
-    description_embedding = get_embedding(description)
-    print(f'Size of the embedding: {len(description_embedding)}')
     string_representation = str(description_embedding.tolist())[1:-1]
-    print(f'String representation: {string_representation}')
     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
-                    SELECT TOP 5 uri, VECTOR_COSINE(e.embedding, TO_VECTOR('{string_representation}', DOUBLE)) AS distance
-                    FROM Test.DiseaseDescriptions e
                     ORDER BY distance DESC
                 """
             result = conn.execute(text(sql))
             data = result.fetchall()
-    return data
 if __name__ == "__main__":
@@ -164,9 +175,29 @@ if __name__ == "__main__":
     clinical_record_info = get_clinical_records_by_ids(["NCT00841061"])
     print(clinical_record_info)
-    textual_description = "A disease that causes memory loss and other cognitive impairments."
-    diseases = get_diseases_related_to_a_textual_description(textual_description)
     for disease in diseases:
         print(disease)
 # %%

 import requests
 from sentence_transformers import SentenceTransformer
# Connection settings for the IRIS database. Each value can be overridden
# through an environment variable; the fallbacks are the values that were
# previously hard-coded, so behaviour is unchanged when nothing is set.
username = os.getenv("IRIS_USERNAME", "demo")
password = os.getenv("IRIS_PASSWORD", "demo")
hostname = os.getenv("IRIS_HOSTNAME", "localhost")
port = os.getenv("IRIS_PORT", "1972")
namespace = os.getenv("IRIS_NAMESPACE", "USER")
# NOTE(review): the values are interpolated into the URL verbatim, so a
# username or password containing URL-reserved characters (e.g. "@", "/", ":")
# would need percent-encoding first.
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
# Module-level engine shared by the query helpers in this file.
engine = create_engine(CONNECTION_STRING)
 def get_all_diseases_name(engine) -> List[List[str]]:
     with engine.connect() as conn:
     return clinical_records
def get_similarities_among_diseases_uris(
    uri_list: List[str],
) -> List[tuple[str, str, float]]:
    """Return the pairwise cosine scores between the embeddings of the given URIs.

    Every ordered pair ``(uri1, uri2)`` with ``uri1 != uri2`` whose URIs are both
    in ``uri_list`` and present in ``Test.EntityEmbeddings`` yields one row.

    Args:
        uri_list: URIs of the entities to compare. Duplicates are ignored.

    Returns:
        The fetched rows, each shaped ``(uri1, uri2, distance)`` where
        ``distance`` is the value of ``VECTOR_COSINE`` for the two embeddings.
        An empty list is returned, without touching the database, when
        ``uri_list`` is empty.
    """
    # De-duplicate while keeping the caller's order; repeated URIs would only
    # add redundant placeholders to the IN lists.
    uris = list(dict.fromkeys(uri_list))
    if not uris:
        # "IN ()" is not valid SQL, and no pair can exist anyway.
        return []

    # Bind each URI as a named parameter instead of splicing it into the SQL
    # text, so quotes inside a URI can neither break nor alter the statement.
    params = {f"uri_{index}": uri for index, uri in enumerate(uris)}
    placeholders = ", ".join(f":{name}" for name in params)
    sql = f"""
            SELECT e1.uri AS uri1, e2.uri AS uri2, VECTOR_COSINE(e1.embedding, e2.embedding) AS distance
            FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
            WHERE e1.uri IN ({placeholders}) AND e2.uri IN ({placeholders}) AND e1.uri != e2.uri
        """

    with engine.connect() as conn:
        with conn.begin():
            result = conn.execute(text(sql), params)
            data = result.fetchall()
    return data
def get_embedding(string: str, encoder) -> List[float]:
    """Embed ``string`` with the supplied encoder.

    Args:
        string: Text to embed.
        encoder: Object exposing ``encode(text, show_progress_bar=...)``; the
            callers in this file pass a ``SentenceTransformer`` instance.

    Returns:
        Whatever ``encoder.encode`` returns, unchanged. NOTE(review): the
        caller in this file invokes ``.tolist()`` on the result, so with the
        real encoder this is presumably an array rather than a plain list.
    """
    # Progress bars are disabled because a single string is encoded per call.
    return encoder.encode(string, show_progress_bar=False)
def get_diseases_related_to_a_textual_description(
    description: str, encoder
) -> List[dict]:
    """Find the five stored disease descriptions closest to a free-text description.

    The description is embedded with ``encoder`` and compared, via
    ``VECTOR_COSINE``, with the embeddings in ``Test.DiseaseDescriptions``.

    Args:
        description: Free-text description of a disease.
        encoder: Encoder forwarded to ``get_embedding``.

    Returns:
        Up to five dicts ``{"uri": ..., "distance": ...}``, ordered by
        ``distance`` descending. ``distance`` is the ``VECTOR_COSINE`` value of
        the stored embedding against the description embedding.
    """
    description_embedding = get_embedding(description, encoder)
    # Comma-separated components, i.e. str(list) without the surrounding brackets.
    string_representation = ", ".join(
        repr(component) for component in description_embedding.tolist()
    )

    # The vector is passed as a bound parameter rather than pasted into the
    # statement text, which keeps the SQL constant across calls.
    sql = """
            SELECT TOP 5 d.uri, VECTOR_COSINE(d.embedding, TO_VECTOR(:embedding, DOUBLE)) AS distance
            FROM Test.DiseaseDescriptions d
            ORDER BY distance DESC
        """

    with engine.connect() as conn:
        with conn.begin():
            result = conn.execute(text(sql), {"embedding": string_representation})
            data = result.fetchall()

    return [{"uri": row[0], "distance": row[1]} for row in data]
 if __name__ == "__main__":
     clinical_record_info = get_clinical_records_by_ids(["NCT00841061"])
     print(clinical_record_info)
+    textual_description = (
+        "A disease that causes memory loss and other cognitive impairments."
+    )
+    encoder = SentenceTransformer("allenai-specter")
+    diseases = get_diseases_related_to_a_textual_description(
+        textual_description, encoder
+    )
     for disease in diseases:
         print(disease)
+    try:
+        similarities = get_similarities_among_diseases_uris(
+            [
+                "http://identifiers.org/medgen/C4553765",
+                "http://identifiers.org/medgen/C4553176",
+                "http://identifiers.org/medgen/C4024935",
+            ]
+        )
+        for similarity in similarities:
+            print(
+                f'{similarity[0].split("/")[-1]} and {similarity[1].split("/")[-1]} have a similarity of {similarity[2]}'
+            )
+    except Exception as e:
+        print(e)
 # %%