Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ from huggingface_hub import hf_hub_download
 import PyPDF2
 from docx import Document
 import re
+import json
 
 # ================== CLASSES ==================
 
@@ -119,7 +120,7 @@ id2label = {
 
 def split_sentences(text):
     """Split text into sentences"""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
     return [s.strip() for s in sentences if s.strip()]
 
 def extract_text_from_pdf(file_path):
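
For context on the regex change: the added (?=[A-Z]) lookahead only splits where the next character is an uppercase letter, so abbreviations followed by digits or lowercase text no longer get cut into separate sentences. A minimal sketch of the difference (the sample text here is illustrative, not taken from the app):

    import re

    text = "The court referred to S. 34 of the Act. It dismissed the appeal."

    # Old pattern: split after every '.', '!' or '?' that is followed by whitespace.
    print(re.split(r'(?<=[.!?])\s+', text))
    # ['The court referred to S.', '34 of the Act.', 'It dismissed the appeal.']

    # New pattern: additionally require the next character to be uppercase.
    print(re.split(r'(?<=[.!?])\s+(?=[A-Z])', text))
    # ['The court referred to S. 34 of the Act.', 'It dismissed the appeal.']
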
@@ -161,19 +162,19 @@ def predict(text_input, file_input):
                 with open(file_path, 'r', encoding='utf-8') as f:
                     text = f.read()
             else:
-                return "❌ Unsupported file type. Please use: PDF, DOCX, or TXT"
+                return {"error": "❌ Unsupported file type. Please use: PDF, DOCX, or TXT"}
         elif text_input:
             text = text_input
         else:
-            return "⚠️ Please provide either text or upload a file"
+            return {"error": "⚠️ Please provide either text or upload a file"}
 
         if not text or len(text.strip()) == 0:
-            return "⚠️ No text content found"
+            return {"error": "⚠️ No text content found"}
 
         # ✂️ Split text into sentences
         sentences = split_sentences(text)
         if not sentences:
-            return "⚠️ Could not split text into sentences"
+            return {"error": "⚠️ Could not split text into sentences"}
 
         # 🧠 Tokenize all sentences together (hierarchical encoding)
         encoded = tokenizer(
@@ -204,17 +205,20 @@ def predict(text_input, file_input):
         for i in range(len(predicted_labels)):
             predicted_labels[i] = i % num_labels
 
-        # ✅ Format each sentence with its predicted label
+        # ✅ Format each sentence with its predicted label as JSON
         results = []
         for sentence, label_id in zip(sentences, predicted_labels):
             label = id2label.get(label_id, "Unknown")
-            results.append(
+            results.append({
+                "label": label,
+                "sentence": sentence.strip()
+            })
 
-        # Return
-        return
+        # Return JSON format
+        return results
 
     except Exception as e:
-        return f"❌ Error during prediction: {str(e)}"
+        return {"error": f"❌ Error during prediction: {str(e)}"}
 
 # ================== GRADIO UI ==================
 
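
After this hunk, predict returns plain Python objects rather than a formatted string: a list of {"label": ..., "sentence": ...} dicts on success, and a single dict with an "error" key on every failure path. A rough sketch of the two shapes a caller sees (the sample sentences and labels below are illustrative):

    # Success: one dict per input sentence, in document order.
    [
        {"label": "Facts", "sentence": "The appellant filed the suit in 2004."},
        {"label": "Decision", "sentence": "The appeal is dismissed."}
    ]

    # Failure: a single dict carrying the error message.
    {"error": "⚠️ Please provide either text or upload a file"}
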
@@ -224,7 +228,7 @@ demo = gr.Interface(
         gr.Textbox(label="Enter Legal Text", placeholder="Paste legal text here...", lines=5),
         gr.File(label="Or Upload File (PDF, DOCX, TXT)")
     ],
-    outputs=gr.
+    outputs=gr.JSON(label="Per-Sentence Predictions"),
    title="⚖️ Legal Document Segmentation",
    description="Classify legal documents sentence-by-sentence into: Arguments (Petitioner/Respondent), Decision, Facts, Issue, None, or Reasoning",
    examples=[
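
The gr.JSON output component renders whatever Python dict or list the function returns as formatted JSON in the UI, so both the per-sentence results list and the {"error": ...} dicts from predict display directly, without any string formatting on the way out.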