Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ from huggingface_hub import hf_hub_download
 import PyPDF2
 from docx import Document
 import re
+import json
 
 # ================== CLASSES ==================
 
@@ -119,7 +120,7 @@ id2label = {
 
 def split_sentences(text):
     """Split text into sentences"""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
     return [s.strip() for s in sentences if s.strip()]
 
 def extract_text_from_pdf(file_path):
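
For context on the regex change: the added (?=[A-Z]) lookahead only splits where the next character is an uppercase letter, so abbreviations followed by digits or lowercase text no longer get cut into separate sentences. A minimal sketch of the difference (the sample text here is illustrative, not taken from the app):

    import re

    text = "The court referred to S. 34 of the Act. It dismissed the appeal."

    # Old pattern: split after every '.', '!' or '?' that is followed by whitespace.
    print(re.split(r'(?<=[.!?])\s+', text))
    # ['The court referred to S.', '34 of the Act.', 'It dismissed the appeal.']

    # New pattern: additionally require the next character to be uppercase.
    print(re.split(r'(?<=[.!?])\s+(?=[A-Z])', text))
    # ['The court referred to S. 34 of the Act.', 'It dismissed the appeal.']
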
@@ -161,19 +162,19 @@ def predict(text_input, file_input):
                 with open(file_path, 'r', encoding='utf-8') as f:
                     text = f.read()
             else:
-                return "❌ Unsupported file type. Please use: PDF, DOCX, or TXT"
+                return {"error": "❌ Unsupported file type. Please use: PDF, DOCX, or TXT"}
         elif text_input:
             text = text_input
         else:
-            return "⚠️ Please provide either text or upload a file"
+            return {"error": "⚠️ Please provide either text or upload a file"}
 
         if not text or len(text.strip()) == 0:
-            return "⚠️ No text content found"
+            return {"error": "⚠️ No text content found"}
 
         # ✂️ Split text into sentences
         sentences = split_sentences(text)
         if not sentences:
-            return "⚠️ Could not split text into sentences"
+            return {"error": "⚠️ Could not split text into sentences"}
 
         # 🧠 Tokenize all sentences together (hierarchical encoding)
         encoded = tokenizer(
@@ -204,17 +205,20 @@ def predict(text_input, file_input):
         for i in range(len(predicted_labels)):
             predicted_labels[i] = i % num_labels
 
-        # ✅ Format each sentence with its predicted label
+        # ✅ Format each sentence with its predicted label as JSON
         results = []
         for sentence, label_id in zip(sentences, predicted_labels):
             label = id2label.get(label_id, "Unknown")
-            results.append(
+            results.append({
+                "label": label,
+                "sentence": sentence.strip()
+            })
 
-        # Return
-        return
+        # Return JSON format
+        return results
 
     except Exception as e:
-        return f"❌ Error during prediction: {str(e)}"
+        return {"error": f"❌ Error during prediction: {str(e)}"}
 
 # ================== GRADIO UI ==================
 
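
After this hunk, predict returns plain Python objects rather than a formatted string: a list of {"label": ..., "sentence": ...} dicts on success, and a single dict with an "error" key on every failure path. A rough sketch of the two shapes a caller sees (the sample sentences and labels below are illustrative):

    # Success: one dict per input sentence, in document order.
    [
        {"label": "Facts", "sentence": "The appellant filed the suit in 2004."},
        {"label": "Decision", "sentence": "The appeal is dismissed."}
    ]

    # Failure: a single dict carrying the error message.
    {"error": "⚠️ Please provide either text or upload a file"}
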
@@ -224,7 +228,7 @@ demo = gr.Interface(
         gr.Textbox(label="Enter Legal Text", placeholder="Paste legal text here...", lines=5),
         gr.File(label="Or Upload File (PDF, DOCX, TXT)")
     ],
-    outputs=gr.
+    outputs=gr.JSON(label="Per-Sentence Predictions"),
    title="⚖️ Legal Document Segmentation",
    description="Classify legal documents sentence-by-sentence into: Arguments (Petitioner/Respondent), Decision, Facts, Issue, None, or Reasoning",
    examples=[
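
The gr.JSON output component renders whatever Python dict or list the function returns as formatted JSON in the UI, so both the per-sentence results list and the {"error": ...} dicts from predict display directly, without any string formatting on the way out.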