Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

ACMCMC committed on May 5, 2024

Commit

4bb7c94

1 Parent(s): 1fb895a

WIP

Browse files

Files changed (3) hide show

app.py +44 -10
llm_res.py +49 -36
utils.py +1 -1

app.py CHANGED Viewed

@@ -28,6 +28,7 @@ show_graph = False
 show_analyze_status = False
 show_overview = False
 show_details = False
 # IRIS connection
 username = "demo"
@@ -66,7 +67,7 @@ with st.container():
             diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(
                 description_input, encoder
             )
-            status.info(f'Found {len(diseases_related_to_the_user_text)} diseases related to the description you entered.')
             status.json(diseases_related_to_the_user_text, expanded=False)
             status.divider()
             # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
@@ -94,26 +95,37 @@ with st.container():
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
                 augmented_set_of_diseases, encoder
             )
-            status.info(f'Found {len(clinical_trials_related_to_the_diseases)} clinical trials related to the diseases.')
             status.json(clinical_trials_related_to_the_diseases, expanded=False)
             status.divider()
             status.write("Getting the details of the clinical trials...")
             json_of_clinical_trials = get_clinical_records_by_ids(
                 [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
             )
             status.json(json_of_clinical_trials, expanded=False)
             status.divider()
             # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
-            status.write("Getting a summary of the clinical trials...")
-            response = get_short_summary_out_of_json_files(json_of_clinical_trials)
-            disease_overview = response
             try:
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
                 status.write("Getting summary statistics of the clinical trials...")
                 response = tagging_insights_from_json(json_of_clinical_trials)
                 print(f'Response from LLM tagging: {response}')
-                status.write(f'Response from LLM tagging: {response}')
             except Exception as e:
                 print(f'Error while extracting numerical data from the clinical trials: {e}')
                 status.warning(f'Error while extracting numerical data from the clinical trials. This information will not be shown.')
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
@@ -170,10 +182,32 @@ $$"""
 # overview
 with st.container():
     if show_overview:
-        st.write("## Overview of Related Clinical Trials")
-        st.write(disease_overview)
-        time.sleep(2)
-        show_details = True
 # details

 show_analyze_status = False
 show_overview = False
 show_details = False
+show_metrics = False
 # IRIS connection
 username = "demo"
             diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(
                 description_input, encoder
             )
+            status.info(f'Selected {len(diseases_related_to_the_user_text)} diseases related to the description you entered.')
             status.json(diseases_related_to_the_user_text, expanded=False)
             status.divider()
             # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
             clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
                 augmented_set_of_diseases, encoder
             )
+            status.info(f'Selected {len(clinical_trials_related_to_the_diseases)} clinical trials related to the diseases.')
             status.json(clinical_trials_related_to_the_diseases, expanded=False)
             status.divider()
             status.write("Getting the details of the clinical trials...")
             json_of_clinical_trials = get_clinical_records_by_ids(
                 [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
             )
+            status.success(f'Details of the clinical trials obtained.')
             status.json(json_of_clinical_trials, expanded=False)
             status.divider()
             # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
+            try:
+                status.write("Getting a summary of the clinical trials...")
+                response = get_short_summary_out_of_json_files(json_of_clinical_trials)
+                status.success("Summary of the clinical trials obtained.")
+                disease_overview = response
+            except Exception as e:
+                print(f'Error while getting a summary of the clinical trials: {e}')
+                status.warning(f'Error while getting a summary of the clinical trials. This information will not be shown.')
             try:
             # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
                 status.write("Getting summary statistics of the clinical trials...")
                 response = tagging_insights_from_json(json_of_clinical_trials)
+                average_minimum_age = response["avg_min_age"]
+                average_maximum_age = response["avg_max_age"]
+                most_common_gender = response['most_common_gender']
                 print(f'Response from LLM tagging: {response}')
+                status.success(f'Summary statistics of the clinical trials obtained.')
             except Exception as e:
+                raise e
                 print(f'Error while extracting numerical data from the clinical trials: {e}')
                 status.warning(f'Error while extracting numerical data from the clinical trials. This information will not be shown.')
             # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
 # overview
 with st.container():
     if show_overview:
+        try:
+            st.write("## Overview of Related Clinical Trials")
+            st.write(disease_overview)
+            time.sleep(1)
+        except Exception as e:
+            print(f'Error while showing the overview of the clinical trials: {e}')
+        finally:
+            show_metrics = True
+with st.container():
+    if show_metrics:
+        try:
+            st.write("## Metrics of the Clinical Trials")
+            col1, col2, col3  = st.columns(3)
+            with col1:
+                st.metric("Average Minimum Age", average_minimum_age)
+            with col2:
+                st.metric("Average Maximum Age", average_maximum_age)
+            with col3:
+                st.metric("Most Common Gender", most_common_gender)
+            time.sleep(2)
+        except Exception as e:
+            print(f'Error while showing the metrics: {e}')
+        finally:
+            show_details = True
 # details

llm_res.py CHANGED Viewed

@@ -24,6 +24,7 @@ from langchain.chains.llm import LLMChain
 from langchain_core.prompts import PromptTemplate
 from collections import Counter
 import statistics
 load_dotenv()
@@ -134,11 +135,12 @@ def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str
 #             "eligibility": eligibility,
 #         }
 #         filtered_data.append(filtered_item)
 #     return filtered_data
 #     # for ele in filtered_data:
 #     #     print(ele)
 def process_dictionaty_with_llm_to_generate_response(json_data):
     # processed_data = process_json_data_for_llm(json_data)
     # res = tagging_chain.invoke({"input": processed_data})
@@ -217,9 +219,10 @@ def process_dictionaty_with_llm_to_generate_response(json_data):
             "eligibility": eligibility,
         }
         filtered_data.append(filtered_item)
     return filtered_data
 def get_short_summary_out_of_json_files(data_json):
     prompt_template = """You are an expert on clinicial trials and their analysis of their reports.
@@ -272,29 +275,36 @@ General summary:"""
     return result
 def analyze_data(data):
-    # Extract minimum and maximum ages
-    min_ages = [int(age.split()[0]) for age in data['minimum_age'] if age]
-    max_ages = [int(age.split()[0]) for age in data['maximum_age'] if age]
     # primary_timeframe= [int(age.split()[0]) for age in data['[primary_outcome]'] if age]
     # Calculate average minimum and maximum ages
     avg_min_age = statistics.mean(min_ages) if min_ages else None
     avg_max_age = statistics.mean(max_ages) if max_ages else None
     # Find most common gender
-    gender_counter = Counter(data['gender'])
     most_common_gender = gender_counter.most_common(1)[0][0]
     # Flatten keywords list and find common keywords
-    keywords = [keyword for sublist in data['keywords'] for keyword in sublist]
-    common_keywords = [word for word, count in Counter(keywords).most_common()]
-    return avg_min_age, avg_max_age, most_common_gender, common_keywords
 def tagging_insights_from_json(data_json):
-    processed_json= process_dictionaty_with_llm_to_generate_response(data_json)
     tagging_prompt = ChatPromptTemplate.from_template(
         """
     You are an expert on clinicial trials and analysis of their reports.
@@ -307,6 +317,7 @@ def tagging_insights_from_json(data_json):
     {input}
     """
     )
     class Classification(BaseModel):
         # description: str = Field(
         #     description="text description grouping all the clinical trials using briefDescription and detailedDescription keys"
@@ -317,25 +328,25 @@ def tagging_insights_from_json(data_json):
         # status: list = Field(
         #     description="Extract the status of all the clinical trials"
         # )
-        #keywords: list = Field(
         #   description="Extract the most relevant keywords for each clinical trials"
-        #)
         # interventions: list = Field(
         #     description="describe the interventions for each clinical trial using title, name and description"
         # )
-        #primary_outcomes: list = Field(
         #    description="get the timeframe of each clinical trial"
-        #)
-        #secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
-        #eligibility: list = Field(
         #   description="get the timeframe of each clinical trial"
-        #)
         # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
         minimum_age: list = Field(
-           description="get the minimum age from each experiment"
         )
         maximum_age: list = Field(
-           description="get the maximum age from each experiment"
         )
         gender: list = Field(description="get the gender from each experiment")
@@ -343,15 +354,15 @@ def tagging_insights_from_json(data_json):
             return {
                 # "project_title": self.project_title,
                 # "status": self.status,
-                #"keywords": self.keywords,
                 # "interventions": self.interventions,
-                #"primary_outcomes": self.primary_outcomes,
-                #"secondary_outcomes": self.secondary_outcomes,
                 # "eligibility": self.eligibility,
                 # "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
-                "gender": self.gender
             }
     # LLM
@@ -365,18 +376,20 @@ def tagging_insights_from_json(data_json):
     tagging_chain = tagging_prompt | llm
-    res= tagging_chain.invoke({"input": processed_json})
-    result_dict= res.get_dict()
-    avg_min_age, avg_max_age, most_common_gender, common_keywords= analyze_data(result_dict)
-    #stats_dict= {'Average Minimum age': avg_min_age,
     #             'Average Maximum age': avg_max_age,
     #             'Most common gender undergoing the trials': most_common_gender,
     #             'common keywords found in the trials': common_keywords}
-    print(f"Result_tagging: {result_dict}")
-    return result_dict#, stats_dict
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
@@ -386,4 +399,4 @@ def tagging_insights_from_json(data_json):
 #     json.dump(clinical_record_info, f, indent=4)
-# tagging_chain = tagging_insights_from_json(json_data)

 from langchain_core.prompts import PromptTemplate
 from collections import Counter
 import statistics
+import regex as re
 load_dotenv()
 #             "eligibility": eligibility,
 #         }
 #         filtered_data.append(filtered_item)
 #     return filtered_data
 #     # for ele in filtered_data:
 #     #     print(ele)
 def process_dictionaty_with_llm_to_generate_response(json_data):
     # processed_data = process_json_data_for_llm(json_data)
     # res = tagging_chain.invoke({"input": processed_data})
             "eligibility": eligibility,
         }
         filtered_data.append(filtered_item)
     return filtered_data
 def get_short_summary_out_of_json_files(data_json):
     prompt_template = """You are an expert on clinicial trials and their analysis of their reports.
     return result
 def analyze_data(data):
+    print(f"Data: {data}")
+    # Extract minimum and maximum ages: Turn ['18 Years', '20 Years'] into [18, 20]
+    min_ages = [int(re.search(r"\d+", age).group()) for age in data["minimum_age"] if age]
+    max_ages = [int(re.search(r"\d+", age).group()) for age in data["maximum_age"] if age]
     # primary_timeframe= [int(age.split()[0]) for age in data['[primary_outcome]'] if age]
     # Calculate average minimum and maximum ages
     avg_min_age = statistics.mean(min_ages) if min_ages else None
     avg_max_age = statistics.mean(max_ages) if max_ages else None
     # Find most common gender
+    gender_counter = Counter(data["gender"])
     most_common_gender = gender_counter.most_common(1)[0][0]
     # Flatten keywords list and find common keywords
+    #keywords = [keyword for sublist in data["keywords"] for keyword in sublist]
+    #common_keywords = [word for word, count in Counter(keywords).most_common()]
+    return {
+        "avg_min_age": avg_min_age,
+        "avg_max_age": avg_max_age,
+        "most_common_gender": most_common_gender
+    }
 def tagging_insights_from_json(data_json):
+    processed_json = process_dictionaty_with_llm_to_generate_response(data_json)
     tagging_prompt = ChatPromptTemplate.from_template(
         """
     You are an expert on clinicial trials and analysis of their reports.
     {input}
     """
     )
     class Classification(BaseModel):
         # description: str = Field(
         #     description="text description grouping all the clinical trials using briefDescription and detailedDescription keys"
         # status: list = Field(
         #     description="Extract the status of all the clinical trials"
         # )
+        # keywords: list = Field(
         #   description="Extract the most relevant keywords for each clinical trials"
+        # )
         # interventions: list = Field(
         #     description="describe the interventions for each clinical trial using title, name and description"
         # )
+        # primary_outcomes: list = Field(
         #    description="get the timeframe of each clinical trial"
+        # )
+        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
+        # eligibility: list = Field(
         #   description="get the timeframe of each clinical trial"
+        # )
         # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
         minimum_age: list = Field(
+            description="get the minimum age from each experiment"
         )
         maximum_age: list = Field(
+            description="get the maximum age from each experiment"
         )
         gender: list = Field(description="get the gender from each experiment")
             return {
                 # "project_title": self.project_title,
                 # "status": self.status,
+                # "keywords": self.keywords,
                 # "interventions": self.interventions,
+                # "primary_outcomes": self.primary_outcomes,
+                # "secondary_outcomes": self.secondary_outcomes,
                 # "eligibility": self.eligibility,
                 # "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
+                "gender": self.gender,
             }
     # LLM
     tagging_chain = tagging_prompt | llm
+    res = tagging_chain.invoke({"input": processed_json})
+    unprocessed_results_dict = res.get_dict()
+    results_dict = analyze_data(
+        unprocessed_results_dict
+    )
+    # stats_dict= {'Average Minimum age': avg_min_age,
     #             'Average Maximum age': avg_max_age,
     #             'Most common gender undergoing the trials': most_common_gender,
     #             'common keywords found in the trials': common_keywords}
+    print(f"Result_tagging: {results_dict}")
+    return results_dict
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
 #     json.dump(clinical_record_info, f, indent=4)
+# tagging_chain = tagging_insights_from_json(json_data)

utils.py CHANGED Viewed

@@ -189,7 +189,7 @@ def get_clinical_trials_related_to_diseases(
     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
-                    SELECT TOP 10 d.nct_id, VECTOR_COSINE(d.embedding, TO_VECTOR('{string_representation}', DOUBLE)) AS distance
                     FROM Test.ClinicalTrials d
                     ORDER BY distance DESC
                 """

     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
+                    SELECT TOP 15 d.nct_id, VECTOR_COSINE(d.embedding, TO_VECTOR('{string_representation}', DOUBLE)) AS distance
                     FROM Test.ClinicalTrials d
                     ORDER BY distance DESC
                 """