Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from openai import AzureOpenAI | |
| import json | |
# Load variables from a local .env file into the process environment before
# any of the settings below are read.
load_dotenv()

# Azure OpenAI connection settings, all taken from the environment.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
# Deployment name; extract_medical_entities passes it as ``model=``.
AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
# Falls back to a pinned preview API version when the variable is unset.
AZURE_OPENAI_API_VERSION = os.environ.get(
    "AZURE_OPENAI_API_VERSION", "2024-12-01-preview"
)

# Module-level client used by extract_medical_entities for its requests.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)
# Fields every result is guaranteed to contain, in the order the prompt lists them.
_ENTITY_FIELDS = (
    "exam_types",
    "specialties",
    "anatomical_regions",
    "pathologies",
    "procedures",
    "measurements",
    "medications",
    "symptoms",
)


def _empty_entities() -> dict:
    """Return a fresh result in which every expected field is an empty list."""
    return {field: [] for field in _ENTITY_FIELDS}


def _parse_entities(content) -> dict:
    """Convert the raw model reply into an entity dict, tolerating common deviations."""
    # A reply without text content (None or "") cannot be parsed at all.
    if not content:
        return _empty_entities()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # The reply may wrap the object in a Markdown fence or surround it
        # with prose; retry on the outermost {...} span before giving up.
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end <= start:
            return _empty_entities()
        try:
            parsed = json.loads(content[start:end + 1])
        except json.JSONDecodeError:
            return _empty_entities()

    # Valid JSON that is not an object (list, string, number) is unusable here.
    if not isinstance(parsed, dict):
        return _empty_entities()

    # Guarantee the full schema even when the model omitted some fields.
    for field in _ENTITY_FIELDS:
        parsed.setdefault(field, [])
    return parsed


def extract_medical_entities(text: str) -> dict:
    """Ask the Azure OpenAI deployment to extract medical entities from *text*.

    Sends a single user message containing the extraction instructions and
    the report text, then parses the reply as JSON.

    Args:
        text: The medical report text to analyze.

    Returns:
        A dict containing at least the keys ``exam_types``, ``specialties``,
        ``anatomical_regions``, ``pathologies``, ``procedures``,
        ``measurements``, ``medications`` and ``symptoms``. When the reply is
        missing, empty, not parseable as JSON, or not a JSON object, every
        key maps to an empty list.

    Raises:
        Whatever ``client.chat.completions.create`` raises; request errors
        are not handled here.
    """
    prompt = f""" You are a medical NER expert. Your task is to extract relevant entities from the given medical report text and return them in a JSON object.
Analyze the text carefully and identify the following fields:
- "exam_types": any type of medical test, examination, or diagnostic method performed on the patient.
- "specialties": the branch of medicine or medical discipline relevant to the report.
- "anatomical_regions": specific parts or regions of the body mentioned in the report.
- "pathologies": diagnosed diseases, disorders, or abnormal medical conditions noted in the report.
- "procedures": medical interventions, treatments, or actions performed on the patient.
- "measurements": numerical values or quantities recorded in the report, such as vital signs, lab results, sizes, or pressures.
- "medications": drugs, therapies, or prescribed substances mentioned in the report.
- "symptoms": patient-experienced signs or observable indications of a health issue.
Text to analyze:
\"\"\"
{text}
\"\"\"
Return ONLY a valid JSON object with all fields. If a field has no values, return an empty list.
"""
    # No sampling parameters (temperature, max_tokens) are passed, so the
    # deployment's defaults apply.
    response = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
    )

    # Guard against a reply that carries no choices before indexing into it.
    if not response.choices:
        return _empty_entities()
    return _parse_entities(response.choices[0].message.content)
| import json | |
def save_annotation(text: str, labels: dict, output_file="dataset.jsonl"):
    """Append one annotated example to *output_file* as a single JSON line.

    Args:
        text: The source text that was annotated.
        labels: The entity dict associated with *text*.
        output_file: Path of the file to append to; it is created if missing.
    """
    # Append mode keeps earlier records; each call adds exactly one line.
    with open(output_file, "a", encoding="utf-8") as sink:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        line = json.dumps({"text": text, "labels": labels}, ensure_ascii=False)
        sink.write(f"{line}\n")
def main(input_folder: str = "data_txt", output_file: str = "dataset.json") -> list:
    """Annotate every ``.txt`` file in *input_folder* and store the results.

    Each file is read, sent to ``extract_medical_entities`` and appended to
    *output_file* via ``save_annotation`` (one JSON record per line). The
    output file is emptied first, so every run produces a fresh dataset.

    Args:
        input_folder: Folder containing the ``.txt`` files to process.
        output_file: Destination file for the annotated records.

    Returns:
        The names of the files that could not be processed (empty on success).

    Raises:
        OSError: If *input_folder* cannot be listed or *output_file* cannot
            be opened for writing.
    """
    # List the inputs before truncating the output, so a missing or unreadable
    # input folder cannot wipe an existing dataset. Sorting makes the record
    # order reproducible; os.listdir returns entries in arbitrary order.
    filenames = sorted(
        name for name in os.listdir(input_folder) if name.endswith(".txt")
    )

    # Ensure the output file is empty before starting; save_annotation appends.
    with open(output_file, "w", encoding="utf-8"):
        pass

    failed = []
    for filename in filenames:
        file_path = os.path.join(input_folder, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                transcription = f.read().strip()
            print(f"\n=== Processing {filename} ===")
            entities = extract_medical_entities(transcription)
        except Exception as exc:
            # Batch boundary: report the failure and keep going so a single
            # bad file or rejected request does not discard the whole run.
            print(f"❌ Failed {filename}: {exc}")
            failed.append(filename)
            continue

        # Save results
        save_annotation(transcription, entities, output_file=output_file)
        print(f"✅ Saved {filename} → {output_file}")

    if failed:
        print(f"\n⚠️ {len(failed)} file(s) could not be processed: {', '.join(failed)}")
    return failed


if __name__ == "__main__":
    # Keep a non-zero exit status when at least one file failed.
    if main():
        raise SystemExit(1)
| """ | |
| if __name__ == "__main__": | |
| input_folder = "data_txt" # 📂 folder containing your .txt files | |
| output_file = "dataset.json" | |
| # List of files to exclude | |
| excluded_files = { | |
| "template7.txt", | |
| "template1167.txt", | |
| "template429.txt", | |
| "template401.txt", | |
| "template367.txt", | |
| "template415.txt", | |
| "template398.txt", | |
| "template1198.txt", | |
| "template159.txt", | |
| "template165.txt", | |
| "template1107.txt", | |
| "template449.txt", | |
| "template1113.txt", | |
| "template313.txt", | |
| "template475.txt", | |
| "template461.txt", | |
| "template307.txt", | |
| "template893.txt", | |
| "template139.txt", | |
| "template887.txt", | |
| "template677.txt", | |
| "template111.txt", | |
| "template105.txt", | |
| "template663.txt", | |
| "template688.txt", | |
| "template850.txt", | |
| "template844.txt", | |
| "template878.txt", | |
| "template16.txt", | |
| "template703.txt", | |
| "template717.txt", | |
| "template924.txt", | |
| "template930.txt", | |
| "template918.txt", | |
| "template1073.txt", | |
| "template529.txt", | |
| "template1067.txt", | |
| "template267.txt", | |
| "template501.txt", | |
| "template515.txt", | |
| "template273.txt", | |
| "template298.txt", | |
| "template1098.txt", | |
| "template1099.txt", | |
| "template299.txt", | |
| "template514.txt", | |
| "template272.txt", | |
| "template266.txt", | |
| "template500.txt", | |
| "template528.txt", | |
| "template1066.txt", | |
| "template1072.txt", | |
| "template919.txt", | |
| "template931.txt", | |
| "template925.txt", | |
| "template716.txt", | |
| "template702.txt", | |
| "template879.txt", | |
| "template845.txt", | |
| "template851.txt", | |
| "template689.txt", | |
| "template104.txt", | |
| "template662.txt", | |
| "template676.txt", | |
| "template110.txt", | |
| "template138.txt", | |
| "template886.txt", | |
| "template892.txt", | |
| "template460.txt", | |
| "template306.txt", | |
| "template312.txt", | |
| "template474.txt", | |
| "template1112.txt", | |
| "template1106.txt", | |
| "template448.txt", | |
| "template338.txt", | |
| "template1110.txt", | |
| "template1104.txt", | |
| "template304.txt", | |
| "template462.txt", | |
| "template476.txt", | |
| "template310.txt", | |
| "template1138.txt", | |
| "template489.txt", | |
| "template884.txt", | |
| "template890.txt", | |
| "template648.txt", | |
| "template660.txt", | |
| "template106.txt", | |
| "template112.txt", | |
| "template674.txt", | |
| "template847.txt", | |
| "template853.txt", | |
| "template728.txt", | |
| "template15.txt", | |
| "template714.txt", | |
| "template29.txt", | |
| "template700.txt", | |
| "template933.txt", | |
| "template927.txt", | |
| "template1064.txt", | |
| "template1070.txt", | |
| "template258.txt", | |
| "template1058.txt", | |
| "template270.txt", | |
| "template516.txt", | |
| "template502.txt", | |
| "template264.txt", | |
| "template503.txt", | |
| "template265.txt", | |
| "template271.txt", | |
| "template1059.txt", | |
| "template517.txt", | |
| "template259.txt", | |
| "template1071.txt", | |
| "template1065.txt", | |
| "template926.txt", | |
| "template932.txt", | |
| "template701.txt", | |
| "template715.txt", | |
| "template28.txt", | |
| "template729.txt", | |
| "template14.txt", | |
| "template852.txt", | |
| "template846.txt", | |
| "template113.txt", | |
| "template675.txt", | |
| "template661.txt", | |
| "template107.txt", | |
| "template649.txt", | |
| "template891.txt", | |
| "template885.txt", | |
| "template488.txt", | |
| "template477.txt", | |
| "template1139.txt", | |
| "template311.txt", | |
| "template305.txt", | |
| "template463.txt", | |
| "template1105.txt", | |
| "template1111.txt", | |
| "template339.txt", | |
| "template467.txt", | |
| "template1129.txt", | |
| "template301.txt", | |
| "template315.txt", | |
| "template473.txt", | |
| "template1115.txt", | |
| "template1101.txt", | |
| "template329.txt", | |
| "template498.txt", | |
| "template103.txt", | |
| "template665.txt", | |
| "template671.txt", | |
| "template117.txt", | |
| "template881.txt", | |
| "template659.txt", | |
| "template895.txt", | |
| "template842.txt", | |
| "template856.txt", | |
| "template711.txt", | |
| "template705.txt", | |
| "template38.txt", | |
| "template10.txt", | |
| "template739.txt", | |
| "template936.txt", | |
| "template922.txt", | |
| "template513.txt", | |
| "template275.txt", | |
| "template261.txt", | |
| "template1049.txt", | |
| "template507.txt", | |
| "template249.txt", | |
| "template1061.txt", | |
| "template1075.txt", | |
| "template1074.txt", | |
| "template1060.txt", | |
| "template248.txt", | |
| "template1048.txt", | |
| "template260.txt", | |
| "template506.txt", | |
| "template512.txt", | |
| "template274.txt", | |
| "template923.txt", | |
| "template937.txt", | |
| "template738.txt", | |
| "template11.txt", | |
| "template704.txt", | |
| "template710.txt", | |
| "template857.txt", | |
| "template843.txt", | |
| "template894.txt", | |
| "template658.txt", | |
| "template880.txt", | |
| "template670.txt", | |
| "template116.txt", | |
| "template102.txt", | |
| "template664.txt", | |
| "template499.txt", | |
| "template328.txt", | |
| "template1100.txt", | |
| "template1114.txt", | |
| "template314.txt", | |
| "template472.txt", | |
| "template466.txt", | |
| "template300.txt", | |
| "template1128.txt", | |
| "template470.txt", | |
| "template316.txt", | |
| "template302.txt", | |
| "template464.txt", | |
| "template1102.txt", | |
| "template1116.txt", | |
| "template458.txt", | |
| "template114.txt", | |
| "template672.txt", | |
| "template666.txt", | |
| "template100.txt", | |
| "template128.txt", | |
| "template896.txt", | |
| "template882.txt", | |
| "template869.txt", | |
| "template855.txt", | |
| "template699.txt", | |
| "template841.txt", | |
| "template706.txt", | |
| "template712.txt", | |
| "template13.txt", | |
| "template909.txt", | |
| "template921.txt", | |
| "template935.txt", | |
| "template504.txt", | |
| "template262.txt", | |
| "template276.txt", | |
| "template510.txt", | |
| "template538.txt", | |
| "template1076.txt", | |
| "template1062.txt", | |
| "template1089.txt", | |
| "template289.txt", | |
| "template288.txt", | |
| "template1088.txt", | |
| "template1063.txt", | |
| "template539.txt", | |
| "template1077.txt", | |
| "template277.txt", | |
| "template511.txt", | |
| "template505.txt", | |
| "template263.txt", | |
| "template934.txt", | |
| "template920.txt", | |
| "template908.txt", | |
| "template12.txt", | |
| "template713.txt", | |
| "template707.txt", | |
| "template840.txt", | |
| "template698.txt", | |
| "template854.txt", | |
| "template868.txt", | |
| "template883.txt", | |
| "template129.txt", | |
| "template897.txt", | |
| "template667.txt", | |
| "template101.txt", | |
| "template115.txt", | |
| "template673.txt", | |
| "template1117.txt", | |
| "template459.txt", | |
| "template1103.txt", | |
| "template303.txt", | |
| "template465.txt", | |
| "template471.txt", | |
| "template317.txt", | |
| "template4.txt", | |
| "template1164.txt", | |
| "template1170.txt", | |
| "template358.txt", | |
| "template416.txt", | |
| "template1158.txt", | |
| "template370.txt", | |
| "template364.txt", | |
| "template402.txt", | |
| "template628.txt", | |
| "template172.txt", | |
| "template614.txt", | |
| "template600.txt", | |
| "template166.txt", | |
| "template833.txt", | |
| "template827.txt", | |
| "template199.txt", | |
| "template61.txt", | |
| "template1212.txt", | |
| "template984.txt", | |
| "template748.txt", | |
| "template990.txt", | |
| "template75.txt", | |
| "template1206.txt", | |
| "template760.txt", | |
| "template774.txt", | |
| "template49.txt", | |
| "template947.txt", | |
| "template953.txt", | |
| "template238.txt", | |
| "template1010.txt", | |
| "template1004.txt", | |
| "template562.txt", | |
| "template204.txt", | |
| "template210.txt", | |
| "template1038.txt", | |
| "template576.txt", | |
| "template589.txt", | |
| "template588.txt", | |
| "template1039.txt", | |
| "template211.txt", | |
| "template577.txt", | |
| "template563.txt", | |
| "template205.txt", | |
| "template1005.txt", | |
| "template1011.txt", | |
| "template239.txt", | |
| "template952.txt", | |
| "template946.txt", | |
| "template775.txt", | |
| "template48.txt", | |
| "template761.txt", | |
| "template991.txt", | |
| "template749.txt", | |
| "template1207.txt", | |
| "template74.txt", | |
| "template1213.txt", | |
| "template60.txt", | |
| "template985.txt", | |
| "template826.txt", | |
| "template198.txt", | |
| "template832.txt", | |
| "template601.txt", | |
| "template167.txt", | |
| "template173.txt", | |
| "template615.txt", | |
| "template629.txt", | |
| "template365.txt", | |
| "template403.txt", | |
| "template417.txt", | |
| "template371.txt", | |
| "template1159.txt", | |
| "template359.txt", | |
| "template1171.txt", | |
| "template1165.txt", | |
| "template5.txt", | |
| "template1173.txt", | |
| "template373.txt" | |
| } | |
| # Ensure output file is empty before starting | |
| open(output_file, "w", encoding="utf-8").close() | |
| processed_count = 0 | |
| excluded_count = 0 | |
| for filename in os.listdir(input_folder): | |
| if filename.endswith(".txt"): | |
| # Check whether the file is in the exclusion list | |
| if filename in excluded_files: | |
| print(f"⏭️ Fichier exclu : {filename}") | |
| excluded_count += 1 | |
| continue | |
| file_path = os.path.join(input_folder, filename) | |
| with open(file_path, "r", encoding="utf-8") as f: | |
| transcription = f.read().strip() | |
| print(f"\n=== Processing {filename} ===") | |
| entities = extract_medical_entities(transcription) | |
| # Save results | |
| save_annotation(transcription, entities, output_file=output_file) | |
| print(f"✅ Saved {filename} → {output_file}") | |
| processed_count += 1 | |
| print(f"\n📊 Résumé : {processed_count} fichiers traités, {excluded_count} fichiers exclus") | |
| """ |