Update app.py
app.py
CHANGED
@@ -9,8 +9,11 @@ import PyPDF2
 from docx import Document
 import re
 
+# -------------------------
+# Positional Encoding
+# -------------------------
 class PositionalEncoding(nn.Module):
-    def __init__(self, d_model, dropout=0.1, max_len=5000):
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
         pe = torch.zeros(max_len, d_model)
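
Note: the hunk elides the lines that fill pe (old lines 17-22). A minimal sketch of the standard sinusoidal fill they presumably contain; the exact original code is not visible in this diff:

    import math
    import torch

    max_len, d_model = 5000, 768
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)  # even dims: sine
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dims: cosine
    pe = pe.unsqueeze(0)  # (1, max_len, d_model), matching self.pe[:, :x.size(1)] in forward()
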
@@ -23,8 +26,11 @@ class PositionalEncoding(nn.Module):
     def forward(self, x):
         return x + self.pe[:, :x.size(1)]
 
+# -------------------------
+# Vanilla Transformer
+# -------------------------
 class VanillaTransformer(nn.Module):
-    def __init__(self, d_model=768, nhead=8, num_layers=3, dim_feedforward=2048, dropout=0.1):
+    def __init__(self, d_model=768, nhead=8, num_layers=3, dim_feedforward=2048, dropout=0.1):
         super().__init__()
         self.pos_encoder = PositionalEncoding(d_model, dropout)
         encoder_layer = nn.TransformerEncoderLayer(
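
Note: the constructor body is cut off after encoder_layer = nn.TransformerEncoderLayer(. A hedged sketch of the usual completion, assuming batch_first=True (consistent with PositionalEncoding treating x.size(1) as the sequence dimension):

    import torch.nn as nn

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=768, nhead=8, dim_feedforward=2048,
        dropout=0.1, batch_first=True,  # batch_first is an assumption, not shown in the diff
    )
    transformer = nn.TransformerEncoder(encoder_layer, num_layers=3)  # assumed wiring
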
@@ -37,14 +43,17 @@ class VanillaTransformer(nn.Module):
         src = self.pos_encoder(src)
         return self.transformer(src, src_key_padding_mask=src_key_padding_mask)
 
+# -------------------------
+# Hierarchical Model
+# -------------------------
 class HierarchicalLegalSegModel(nn.Module):
-    def __init__(self, longformer_model, num_labels, hidden_dim=768, transformer_layers=3, transformer_heads=8, dropout=0.1):
+    def __init__(self, longformer_model, num_labels, hidden_dim=768, transformer_layers=3, transformer_heads=8, dropout=0.1):
         super().__init__()
         self.longformer = longformer_model
         self.hidden_dim = hidden_dim
         self.vanilla_transformer = VanillaTransformer(
             d_model=hidden_dim, nhead=transformer_heads, num_layers=transformer_layers,
-            dim_feedforward=hidden_dim*4, dropout=dropout
+            dim_feedforward=hidden_dim * 4, dropout=dropout
         )
         self.classifier = nn.Linear(hidden_dim, num_labels)
         self.crf = CRF(num_labels, batch_first=True)
@@ -64,7 +73,7 @@ class HierarchicalLegalSegModel(nn.Module):
         sentence_embeddings = self.encode_sentences(input_ids, attention_mask)
         sentence_embeddings = self.dropout(sentence_embeddings)
         transformer_output = self.vanilla_transformer(
-            sentence_embeddings,
+            sentence_embeddings,
             src_key_padding_mask=~sentence_mask if sentence_mask is not None else None
         )
         emissions = self.classifier(transformer_output)
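
Note: nn.TransformerEncoder expects src_key_padding_mask to be True at positions it should ignore, while sentence_mask uses True for real sentences; hence the ~ inversion:

    import torch

    sentence_mask = torch.tensor([[True, True, False]])  # True = real sentence
    padding_mask = ~sentence_mask                         # True = padding position to ignore
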
@@ -75,6 +84,9 @@ class HierarchicalLegalSegModel(nn.Module):
         predictions = self.crf.decode(emissions, mask=sentence_mask)
         return predictions
 
+# -------------------------
+# Load Model
+# -------------------------
 print("Loading model...")
 device = torch.device("cpu")
 
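
Note: CRF.decode from pytorch-crf runs Viterbi decoding and returns plain Python lists (one best label sequence per batch element) rather than a tensor, which is why predict() below can feed predictions[0] straight into id2label. A minimal sketch:

    import torch
    from torchcrf import CRF

    crf = CRF(7, batch_first=True)
    emissions = torch.randn(1, 4, 7)          # (batch, sentences, num_labels)
    mask = torch.ones(1, 4, dtype=torch.bool)
    best = crf.decode(emissions, mask=mask)   # e.g. [[3, 0, 6, 2]], plain Python ints
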
@@ -84,8 +96,10 @@ longformer = AutoModel.from_pretrained("lexlms/legal-longformer-base").to(device
 for param in longformer.parameters():
     param.requires_grad = False
 
-model = HierarchicalLegalSegModel(
-
+model = HierarchicalLegalSegModel(
+    longformer, num_labels=7, hidden_dim=768,
+    transformer_layers=3, transformer_heads=8, dropout=0.1
+).to(device)
 
 model_path = hf_hub_download(
     repo_id="Prateek0515/legal-document-segmentation",
@@ -93,14 +107,26 @@ model_path = hf_hub_download(
 )
 
 checkpoint = torch.load(model_path, map_location=device)
-if isinstance(checkpoint, dict)
-
+if isinstance(checkpoint, dict):
+    if 'model_state_dict' in checkpoint:
+        model.load_state_dict(checkpoint['model_state_dict'])
+    else:
+        model.load_state_dict(checkpoint)
 else:
     model.load_state_dict(checkpoint)
 
 model.eval()
 print("Model loaded successfully!")
 
+# 🔍 Debug model info
+print("\n>>> MODEL CHECK <<<")
+checkpoint = torch.load(model_path, map_location=device)
+print("Checkpoint keys:", checkpoint.keys())
+print("Tokenizer used:", tokenizer.name_or_path)
+
+# -------------------------
+# Label mapping
+# -------------------------
 id2label = {
     0: "Arguments of Petitioner",
     1: "Arguments of Respondent",
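
Note: the new branching accepts both a raw state_dict and a training checkpoint that nests one under 'model_state_dict'. A hypothetical save-side counterpart (the training script is not part of this diff):

    # Either save format is handled by the loader above:
    torch.save(model.state_dict(), "model.pt")                        # raw state_dict
    torch.save({"model_state_dict": model.state_dict()}, "model.pt")  # nested checkpoint

The debug block then reloads the checkpoint a second time just to print its keys; reusing the checkpoint already in memory would avoid the extra torch.load.
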
@@ -111,8 +137,10 @@ id2label = {
     6: "Reasoning"
 }
 
+# -------------------------
+# Helpers
+# -------------------------
 def split_sentences(text):
-    """Split text into sentences"""
     sentences = re.split(r'(?<=[.!?])\s+', text)
     return [s.strip() for s in sentences if s.strip()]
 
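
Note: the splitter breaks on whitespace that follows '.', '!' or '?':

    import re

    text = "The appeal is allowed. Costs awarded! Any objections?"
    print(re.split(r'(?<=[.!?])\s+', text))
    # ['The appeal is allowed.', 'Costs awarded!', 'Any objections?']

Abbreviations such as "No. 42" also trigger a split; acceptable for a demo, but worth knowing for legal text.
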
@@ -134,6 +162,9 @@ def extract_text_from_docx(file_path):
     except Exception as e:
         return f"Error reading DOCX: {str(e)}"
 
+# -------------------------
+# Prediction Function
+# -------------------------
 def predict(text_input, file_input):
     try:
         text = None
@@ -160,11 +191,10 @@ def predict(text_input, file_input):
             return "⚠️ No text content found"
 
         sentences = split_sentences(text)
-
         if not sentences:
             return "⚠️ Could not split text into sentences"
 
-        #
+        # Encode sentences
         encoded_sentences = []
         for sentence in sentences:
             encoded = tokenizer(
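
Note: the per-sentence tokenizer call is elided (old lines 170-179). A hypothetical shape for it; the real options and max_length are not visible in this diff:

    encoded = tokenizer(
        sentence,
        padding="max_length",  # assumed: the torch.stack below needs equal lengths
        truncation=True,
        max_length=128,        # assumed value, not shown in the diff
        return_tensors="pt",
    )

Some fixed-length padding of this kind must be present, since the torch.stack over encoded_sentences requires every sentence tensor to have the same shape.
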
@@ -180,11 +210,24 @@ def predict(text_input, file_input):
         attention_mask = torch.stack([e["attention_mask"].squeeze(0) for e in encoded_sentences]).unsqueeze(0).to(device)
         sentence_mask = torch.ones(1, len(sentences), dtype=torch.bool).to(device)
 
+        # 🧩 Debug info
+        print(">>> DEBUG INFO <<<")
+        print("input_ids:", input_ids.shape)
+        print("attention_mask:", attention_mask.shape)
+        print("sentence_mask:", sentence_mask.shape)
+
         with torch.no_grad():
-
-
+            sentence_embeddings = model.encode_sentences(input_ids, attention_mask)
+            print("sentence_embeddings:", sentence_embeddings.shape)
+            transformer_output = model.vanilla_transformer(sentence_embeddings)
+            print("transformer_output mean:", transformer_output.mean().item())
+            emissions = model.classifier(transformer_output)
+            print("emissions shape:", emissions.shape, " | mean:", emissions.mean().item())
+
+            predictions = model.crf.decode(emissions, mask=sentence_mask)
+            print("Predictions (raw):", predictions)
+
         predicted_labels = predictions[0]
-
         results = []
         for sentence, label_id in zip(sentences, predicted_labels):
             label = id2label.get(label_id, "Unknown")
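
Note: the unrolled debug path mirrors the model's own forward pass but skips model.dropout and passes no padding mask to vanilla_transformer. Both are harmless here: sentence_mask is all True for a single unpadded document, and dropout is the identity once model.eval() has been called:

    import torch

    drop = torch.nn.Dropout(p=0.1)
    drop.eval()
    x = torch.ones(3)
    assert torch.equal(drop(x), x)  # dropout is a no-op in eval mode
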
@@ -195,6 +238,9 @@ def predict(text_input, file_input):
     except Exception as e:
         return f"❌ Error during prediction: {str(e)}"
 
+# -------------------------
+# Gradio Interface
+# -------------------------
 demo = gr.Interface(
     fn=predict,
     inputs=[
@@ -202,7 +248,7 @@ demo = gr.Interface(
         gr.File(label="Or Upload File (PDF, DOCX, TXT)")
     ],
     outputs=gr.Textbox(label="Per-Sentence Predictions", lines=10),
-    title="⚖️ Legal Document Segmentation",
+    title="⚖️ Legal Document Segmentation (Debug Mode)",
     description="Classify legal documents sentence-by-sentence into: Arguments (Petitioner/Respondent), Decision, Facts, Issue, or Reasoning",
     examples=[
         ["The appellant filed a petition against the respondent. The court decides that the appellant is liable.", None],
@@ -210,5 +256,5 @@ demo = gr.Interface(
     api_name="predict"
 )
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     demo.launch()
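
Note: because the interface registers api_name="predict", the Space can also be called programmatically. A hedged sketch using gradio_client; the actual Space id is assumed, not shown in this diff:

    from gradio_client import Client

    client = Client("Prateek0515/legal-document-segmentation")  # assumed Space id
    result = client.predict(
        "The appellant filed a petition against the respondent.",
        None,                # no file upload
        api_name="/predict",
    )
    print(result)
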