Spaces:

harshildarji
/

Juristische-NER

Sleeping

App Files Files Community

harshildarji committed on May 21

Commit

da37e40

verified ·

1 Parent(s): 1b901a6

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -351

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
-import re
 import os
-import warnings
-import matplotlib.colors as mcolors
-import matplotlib.pyplot as plt
 import streamlit as st
 from charset_normalizer import detect
 from transformers import (
@@ -13,153 +12,61 @@ from transformers import (
     pipeline,
 )
-warnings.simplefilter(action="ignore", category=Warning)
 logging.set_verbosity(logging.ERROR)
-st.set_page_config(page_title="Legal NER", page_icon="⚖️", layout="wide")
 st.markdown(
     """
-    <style>
-        body {
-            font-family: 'Poppins', sans-serif;
-            background-color: #f4f4f8;
-        }
-        .header {
-            background-color: rgba(220, 219, 219, 0.25);
-            color: #000;
-            padding: 5px 0;
-            text-align: center;
-            border-radius: 7px;
-            margin-bottom: 13px;
-            border-bottom: 2px solid #333;
-        }
-        .container {
-            background-color: #fff;
-            padding: 30px;
-            border-radius: 10px;
-            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
-            width: 100%;
-            max-width: 1000px;
-            margin: 0 auto;
-            position: absolute;
-            top: 50%;
-            left: 50%;
-            transform: translate(-50%, -50%);
-        }
-        .btn-primary {
-            background-color: #5477d1;
-            border: none;
-            transition: background-color 0.3s, transform 0.2s;
-            border-radius: 25px;
-            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
-        }
-        .btn-primary:hover {
-            background-color: #4c6cbe;
-            transform: translateY(-1px);
-        }
-        h2 {
-            font-weight: 600;
-            font-size: 24px;
-            margin-bottom: 20px;
-        }
-        label {
-            font-weight: 500;
-        }
-        .tip {
-            background-color: rgba(180, 47, 109, 0.25);
-            padding: 7px;
-            border-radius: 7px;
-            display: inline-block;
-            margin-top: 15px;
-            margin-bottom: 15px;
-        }
-        .sec {
-            background-color: rgba(220, 219, 219, 0.10);
-            padding: 7px;
-            border-radius: 5px;
-            display: inline-block;
-            margin-top: 15px;
-            margin-bottom: 15px;
-        }
-        .tooltip {
-            position: relative;
-            display: inline-block;
-            cursor: pointer;
-        }
-        .tooltip .tooltiptext {
-            visibility: hidden;
-            width: 120px;
-            background-color: #6c757d;
-            color: #fff;
-            text-align: center;
-            border-radius: 3px;
-            padding: 3px;
-            position: absolute;
-            z-index: 1;
-            bottom: 125%;
-            left: 50%;
-            margin-left: -60px;
-            opacity: 0;
-            transition: opacity 0.3s;
-        }
-        .tooltip:hover .tooltiptext {
-            visibility: visible;
-            opacity: 1;
-        }
-        .anonymized {
-            background-color: #ffcccb;
-            color: #000;
-            font-weight: bold;
-            border-radius: 3px;
-            padding: 2px 4px;
-        }
-        #language-container {
-            position: fixed;
-            top: 10px;
-            right: 10px;
-            z-index: 1000;
-        }
-    </style>
 """,
     unsafe_allow_html=True,
 )
-# UI text for English and German.
-ui_text = {
-    "EN": {
-        "title": "Legal NER",
-        "upload": "Upload a .txt file",
-        "anonymize": "Anonymize",
-        "select_entities": "Entity types to anonymize:",
-        "download": "Download Anonymized Text",
-        "tip": "Tip: Hover over the colored words to see its class.",
-        "error": "An error occurred while processing the file: ",
-    },
-    "DE": {
-        "title": "Juristische NER",
-        "upload": "Lade eine .txt-Datei hoch",
-        "anonymize": "Anonymisieren",
-        "select_entities": "Entitätstypen zur Anonymisierung:",
-        "download": "Anonymisierten Text herunterladen",
-        "tip": "Tipp: Fahre mit der Maus über die farbigen Wörter, um deren Klasse zu sehen.",
-        "error": "Beim Verarbeiten der Datei ist ein Fehler aufgetreten: ",
-    },
-}
-col1, col2 = st.columns([4, 1])
-with col2:
-    lang = st.radio(
-        "Language:",
-        options=["EN", "DE"],
-        horizontal=True,
-        label_visibility="hidden",
-        key="language_selector",
-    )
-with col1:
-    st.title(ui_text[lang]["title"])
-# Initialization for German Legal NER
 tkn = os.getenv("tkn")
 tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER", use_auth_token=tkn)
 model = AutoModelForTokenClassification.from_pretrained(
@@ -167,8 +74,8 @@ model = AutoModelForTokenClassification.from_pretrained(
 )
 ner = pipeline("ner", model=model, tokenizer=tokenizer)
-# Define class labels for the model
-classes = {
     "AN": "Lawyer",
     "EUN": "European legal norm",
     "GRT": "Court",
@@ -189,223 +96,124 @@ classes = {
     "VS": "Regulation",
     "VT": "Contract",
 }
-ner_labels = list(classes.keys())
-# Generate a list of colors for visualization
-def generate_colors(num_colors):
-    cm = plt.get_cmap("tab20")
-    colors = [mcolors.rgb2hex(cm(1.0 * i / num_colors)) for i in range(num_colors)]
-    return colors
-# Color substrings based on NER results
-def color_substrings(input_string, model_output):
-    colors = generate_colors(len(ner_labels))
-    label_to_color = {
-        label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
-    }
     last_end = 0
-    html_output = ""
-    for entity in sorted(model_output, key=lambda x: x["start"]):
-        start, end, label = entity["start"], entity["end"], entity["label"]
-        html_output += input_string[last_end:start]
-        tooltip = classes.get(label, "")
-        html_output += (
-            f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">'
-            f'{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
         )
         last_end = end
-    html_output += input_string[last_end:]
-    return html_output
-# Selectively anonymize entities
-def anonymize_text(input_string, model_output, selected_entities=None):
-    merged_model_output = []
-    sorted_entities = sorted(model_output, key=lambda x: x["start"])
-    if sorted_entities:
-        current = sorted_entities[0]
-        for entity in sorted_entities[1:]:
-            if (
-                entity["label"] == current["label"]
-                and input_string[current["end"] : entity["start"]].strip() == ""
-            ):
-                current["end"] = entity["end"]
-                current["word"] = input_string[current["start"] : current["end"]]
-            else:
-                merged_model_output.append(current)
-                current = entity
-        merged_model_output.append(current)
-    else:
-        merged_model_output = sorted_entities
-    anonymized_text = ""
-    last_end = 0
-    colors = generate_colors(len(ner_labels))
-    label_to_color = {
-        label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
-    }
-    for entity in merged_model_output:
-        start, end, label = entity["start"], entity["end"], entity["label"]
-        anonymized_text += input_string[last_end:start]
-        if selected_entities is None or label in selected_entities:
-            anonymized_text += (
-                f'<span class="anonymized">[{classes.get(label, label)}]</span>'
-            )
-        else:
-            tooltip = classes.get(label, "")
-            anonymized_text += (
-                f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">'
-                f'{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
-            )
-        last_end = end
-    anonymized_text += input_string[last_end:]
-    return anonymized_text
-def merge_entities(ner_results):
-    merged_entities = []
-    current_entity = None
-    for token in ner_results:
-        tag = token["entity"]
-        entity_type = tag.split("-")[-1] if "-" in tag else tag
-        token_start, token_end = token["start"], token["end"]
-        token_word = token["word"].replace("##", "")  # Remove subword prefixes
-        if (
-            tag.startswith("B-")
-            or current_entity is None
-            or current_entity["label"] != entity_type
-        ):
-            if current_entity:
-                merged_entities.append(current_entity)
-            current_entity = {
-                "start": token_start,
-                "end": token_end,
-                "label": entity_type,
-                "word": token_word,
-            }
-        elif (
-            tag.startswith("I-")
-            and current_entity
-            and current_entity["label"] == entity_type
-        ):
-            current_entity["end"] = token_end
-            current_entity["word"] += token_word
-        else:
-            if (
-                current_entity
-                and token_start == current_entity["end"]
-                and current_entity["label"] == entity_type
-            ):
-                current_entity["end"] = token_end
-                current_entity["word"] += token_word
-            else:
-                if current_entity:
-                    merged_entities.append(current_entity)
-                current_entity = {
-                    "start": token_start,
-                    "end": token_end,
-                    "label": entity_type,
-                    "word": token_word,
-                }
-    if current_entity:
-        merged_entities.append(current_entity)
-    return merged_entities
-uploaded_file = st.file_uploader(ui_text[lang]["upload"], type="txt")
-if uploaded_file is not None:
-    try:
-        raw_content = uploaded_file.read()
-        detected = detect(raw_content)
-        encoding = detected["encoding"]
-        if encoding is None:
-            raise ValueError("Unable to detect file encoding.")
-        lines = raw_content.decode(encoding).splitlines()
-        line_results = []
-        for line in lines:
-            if line.strip():
-                results = ner(line)
-                merged_results = merge_entities(results)
-                line_results.append(merged_results)
-            else:
-                line_results.append([])
-        anonymize_mode = st.checkbox(ui_text[lang]["anonymize"])
-        selected_entities = None
-        if anonymize_mode:
-            detected_entity_tags = set()
-            for merged_results in line_results:
-                for entity in merged_results:
-                    detected_entity_tags.add(entity["label"])
-            inverse_classes = {v: k for k, v in classes.items()}
-            detected_options = sorted([classes[tag] for tag in detected_entity_tags])
-            selected_options = st.multiselect(
-                ui_text[lang]["select_entities"],
-                options=detected_options,
-                default=detected_options,
-            )
-            selected_entities = [
-                inverse_classes[options] for options in selected_options
-            ]
-        st.markdown(
-            "<hr style='margin-top: 10px; margin-bottom: 20px;'>",
-            unsafe_allow_html=True,
-        )
-        anonymized_lines = []
-        displayed_lines = []
-        for line, merged_results in zip(lines, line_results):
-            if line.strip():
-                if anonymize_mode:
-                    anonymized_text = anonymize_text(
-                        line, merged_results, selected_entities=selected_entities
-                    )
-                    displayed_lines.append(anonymized_text)
-                    plain_text = re.sub(r"<.*?>", "", anonymized_text)
-                    anonymized_lines.append(plain_text.strip())
-                else:
-                    colored_html = color_substrings(line, merged_results)
-                    st.markdown(f"{colored_html}", unsafe_allow_html=True)
-            else:
-                # displayed_lines.append("<br>")
-                anonymized_lines.append("")
-        if anonymize_mode:
-            original_file_name = uploaded_file.name
-            download_file_name = f"Anon_{original_file_name}"
-            anonymized_content = "\n".join(anonymized_lines)
-            for displayed_line in displayed_lines:
-                st.markdown(f"{displayed_line}", unsafe_allow_html=True)
-            st.markdown("<hr>", unsafe_allow_html=True)
-            st.download_button(
-                label=ui_text[lang]["download"],
-                data=anonymized_content,
-                file_name=download_file_name,
-                mime="text/plain",
-            )
-        else:
-            st.markdown("<hr>", unsafe_allow_html=True)
             st.markdown(
-                f'<div class="tip"><strong>{ui_text[lang]["tip"]}</strong></div>',
                 unsafe_allow_html=True,
-            )
-    except Exception as e:
-        st.error(f"{ui_text[lang]['error']}{e}")

 import os
+import re
+import string
+import matplotlib.cm as cm
 import streamlit as st
 from charset_normalizer import detect
 from transformers import (
     pipeline,
 )
+st.set_page_config(page_title="German Legal NER", page_icon="⚖️", layout="wide")
 logging.set_verbosity(logging.ERROR)
 st.markdown(
     """
+<style>
+.block-container {
+    padding-top: 1rem;
+    padding-bottom: 5rem;
+    padding-left: 3rem;
+    padding-right: 3rem;
+}
+header, footer {visibility: hidden;}
+.entity {
+    position: relative;
+    display: inline-block;
+    background-color: transparent;
+    font-weight: normal;
+    cursor: help;
+}
+.entity .tooltip {
+    visibility: hidden;
+    background-color: #333;
+    color: #fff;
+    text-align: center;
+    border-radius: 4px;
+    padding: 2px 6px;
+    position: absolute;
+    z-index: 1;
+    bottom: 125%;
+    left: 50%;
+    transform: translateX(-50%);
+    white-space: nowrap;
+    opacity: 0;
+    transition: opacity 0.05s;
+    font-size: 11px;
+}
+.entity:hover .tooltip {
+    visibility: visible;
+    opacity: 1;
+}
+.entity.marked {
+    background-color: rgba(255, 230, 0, 0.4);
+}
+</style>
 """,
     unsafe_allow_html=True,
 )
+# Load model
 tkn = os.getenv("tkn")
 tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER", use_auth_token=tkn)
 model = AutoModelForTokenClassification.from_pretrained(
 )
 ner = pipeline("ner", model=model, tokenizer=tokenizer)
+# Entity labels
+entity_labels = {
     "AN": "Lawyer",
     "EUN": "European legal norm",
     "GRT": "Court",
     "VS": "Regulation",
     "VT": "Contract",
 }
+# Fixed colors
def generate_fixed_colors(keys, alpha=0.25):
    """Map each key to a CSS ``rgba(...)`` string taken from the "tab20" colormap.

    The colormap is resampled to ``len(keys)`` entries and the i-th key gets
    the i-th entry, so the assignment is stable for a fixed key order.

    Args:
        keys: Sized collection of hashable labels (iterated in order).
        alpha: Opacity written into every ``rgba(...)`` value.

    Returns:
        Dict mapping each key to a string such as ``"rgba(31, 119, 180, 0.25)"``.
        An empty ``keys`` yields an empty dict.
    """
    # Local import: the module only imports ``matplotlib.cm`` at file level.
    import matplotlib

    if not keys:
        return {}

    try:
        # ``matplotlib.cm.get_cmap`` was deprecated in 3.7 and removed in 3.9;
        # the colormap registry is the supported replacement.
        cmap = matplotlib.colormaps["tab20"].resampled(len(keys))
    except AttributeError:
        # Older matplotlib without the registry / ``resampled``.
        cmap = cm.get_cmap("tab20", len(keys))

    rgba_colors = {}
    for i, key in enumerate(keys):
        r, g, b, _ = cmap(i)  # colormap alpha is ignored in favour of ``alpha``
        rgba_colors[key] = f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {alpha})"
    return rgba_colors
+ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()), alpha=0.30)
+# UI
+st.markdown("#### German Legal NER")
+uploaded_file = st.file_uploader("Upload a .txt file", type="txt")
+threshold = st.slider("Confidence threshold:", 0.0, 1.0, 0.8, 0.01)
+st.markdown("---")
+# Merge logic
def _entity_type(tag):
    """Return the class part of a tag, e.g. ``"B-GRT"`` -> ``"GRT"``."""
    return tag.split("-")[-1]


def merge_entities(entities):
    """Merge consecutive token-level NER predictions into entity spans.

    Tokens are ordered by their ``"index"``. A token is appended to the
    previous span when its index directly follows the span's last index AND
    it either is a ``##`` sub-word continuation or has the same entity class
    as the span. Previously adjacency alone triggered a merge, which fused
    neighbouring entities of different classes under the first one's label.

    Each returned span is a copy of its first token with:
      * ``"word"``  - joined text, punctuation-normalised and trimmed,
      * ``"end"`` / ``"index"`` - taken from the last merged token,
      * ``"score"`` - mean score of the merged tokens.

    Spans whose cleaned word is a single character or contains no word
    character are dropped. The input dicts are not modified.

    Args:
        entities: Iterable of dicts with at least the keys ``"index"``,
            ``"word"``, ``"entity"``, ``"score"`` and ``"end"``.

    Returns:
        List of merged span dicts (possibly empty).
    """
    if not entities:
        return []

    merged = []
    for ent in sorted(entities, key=lambda e: e["index"]):
        tok = ent["word"]
        is_subword = tok.startswith("##")
        prev = merged[-1] if merged else None
        if (
            prev is not None
            and ent["index"] == prev["index"] + 1
            and (
                is_subword
                or _entity_type(ent["entity"]) == _entity_type(prev["entity"])
            )
        ):
            prev["word"] += tok[2:] if is_subword else " " + tok
            prev["end"] = ent["end"]
            prev["index"] = ent["index"]
            prev["score_sum"] += ent["score"]
            prev["count"] += 1
        else:
            span = ent.copy()
            # Temporary accumulators; removed again before returning.
            span["score_sum"] = ent["score"]
            span["count"] = 1
            merged.append(span)

    final = []
    for span in merged:
        span["score"] = span.pop("score_sum") / span.pop("count")

        # Undo the spaces that token joining puts around punctuation.
        w = span["word"].strip()
        w = re.sub(r"\s*\.\s*", ".", w)
        w = re.sub(r"\s*,\s*", ", ", w)
        w = re.sub(r"\s*/\s*", "/", w)
        w = w.strip(string.whitespace + string.punctuation)

        if len(w) > 1 and re.search(r"\w", w):
            cleaned = span.copy()
            cleaned["word"] = w
            final.append(cleaned)
    return final
+# HTML highlighting
def highlight_entities(line, merged_entities, threshold):
    """Render one text line as HTML with confident entities highlighted.

    The line comes from an uploaded file and the result is rendered with
    ``unsafe_allow_html=True``, so every piece of text taken from the input
    (plain segments, entity words, tooltip labels) is HTML-escaped. Only the
    ``<span>`` markup generated here reaches the page unescaped.

    Args:
        line: The original text line the entity offsets refer to.
        merged_entities: Entity dicts with ``"score"``, ``"start"``, ``"end"``,
            ``"entity"`` and ``"word"`` keys, ordered by position in ``line``.
        threshold: Entities with a score below this value are left as plain
            text.

    Returns:
        HTML string in which each kept entity is wrapped in a coloured
        ``<span class="entity marked">`` that carries a tooltip with the
        entity description.
    """
    from html import escape

    parts = []
    last_end = 0
    for ent in merged_entities:
        if ent["score"] < threshold:
            continue
        start, end = ent["start"], ent["end"]
        label = ent["entity"].split("-")[-1]
        label_desc = entity_labels.get(label, label)
        color = ENTITY_COLORS.get(label, "#cccccc")

        # Untouched text between the previous entity and this one.
        parts.append(escape(line[last_end:start]))

        highlight_style = f"background-color:{color}; font-weight:600;"
        parts.append(
            f'<span class="entity marked" style="{highlight_style}">'
            f'{escape(ent["word"])}<span class="tooltip">{escape(label_desc)}</span></span>'
        )
        last_end = end

    # Remainder of the line after the last highlighted entity.
    parts.append(escape(line[last_end:]))
    return "".join(parts)
# Main flow: decode the uploaded file, run NER line by line and render each
# line with its highlighted entities.
if uploaded_file:
    raw_bytes = uploaded_file.read()
    encoding = detect(raw_bytes)["encoding"]

    text = None
    if encoding is None:
        st.error("Could not detect file encoding.")
    else:
        try:
            text = raw_bytes.decode(encoding)
        except (UnicodeDecodeError, LookupError) as exc:
            # Detection is a guess: the bytes may not decode with it, or the
            # codec name may be unknown to Python. Report instead of crashing.
            st.error(f"Could not decode file as {encoding}: {exc}")

    if text is not None:
        for line in text.splitlines():
            if not line.strip():
                # Keep blank lines so the paragraph structure stays visible.
                st.write("")
                continue
            tokens = ner(line)
            merged = merge_entities(tokens)
            html_line = highlight_entities(line, merged, threshold)
            st.markdown(
                f'<div style="margin:0;padding:0;line-height:1.4;">{html_line}</div>',
                unsafe_allow_html=True,
            )