Spaces:

harshildarji
/

Juristische-Anonymisierung

Sleeping

App Files Files Community

harshildarji committed on May 21

Commit

4694471

1 Parent(s): a3633ad

upload app

Browse files

Files changed (7) hide show

.gitignore +1 -0
.streamlit/config.toml +2 -0
README.md +4 -4
app.py +346 -0
index.html +0 -19
requirements.txt +3 -0
style.css +0 -28

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .DS_Store

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [theme]
2	+ base="light"

README.md CHANGED Viewed

@@ -3,8 +3,8 @@ title: Juristische Anonymisierung
 emoji: 👀
 colorFrom: indigo
 colorTo: purple
-sdk: static
 pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 emoji: 👀
 colorFrom: indigo
 colorTo: purple
+sdk: streamlit
+sdk_version: 1.45.1
+app_file: app.py
 pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import re
+import string
+import seaborn as sns
+import streamlit as st
+from charset_normalizer import detect
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    logging,
+    pipeline,
+)
def setup_page():
    """Configure the Streamlit page and inject the app's custom CSS.

    Sets the page title, icon and wide layout, restricts the
    ``transformers`` logger to errors, and emits a ``<style>`` block that
    adjusts the container padding, hides the header/footer and styles the
    ``.entity`` spans together with their hover tooltips.
    """
    st.set_page_config(
        page_title="Juristische Anonymisierung",
        page_icon="⚖️",
        layout="wide",
    )

    # Keep transformers quiet: only errors are reported.
    logging.set_verbosity(logging.ERROR)

    custom_css = """
    <style>
    .block-container {
        padding-top: 1rem;
        padding-bottom: 5rem;
        padding-left: 3rem;
        padding-right: 3rem;
    }
    header, footer {visibility: hidden;}
    .entity {
        position: relative;
        display: inline-block;
        background-color: transparent;
        font-weight: normal;
        cursor: help;
    }
    .entity .tooltip {
        visibility: hidden;
        background-color: #333;
        color: #fff;
        text-align: center;
        border-radius: 4px;
        padding: 2px 6px;
        position: absolute;
        z-index: 1;
        bottom: 125%;
        left: 50%;
        transform: translateX(-50%);
        white-space: nowrap;
        opacity: 0;
        transition: opacity 0.05s;
        font-size: 11px;
    }
    .entity:hover .tooltip {
        visibility: visible;
        opacity: 1;
    }
    .entity.marked {
        background-color: rgba(255, 230, 0, 0.4);
        line-height: 1.3;
        padding: 0 1px;
        border-radius: 0px;
    }
    </style>
    """
    # The stylesheet is raw HTML, so Streamlit must be told not to escape it.
    st.markdown(custom_css, unsafe_allow_html=True)
def get_constants():
    """Return the static lookup tables used by the app.

    Returns:
        A ``(entity_importance, entity_labels)`` tuple where

        * ``entity_importance`` maps an importance level (``"High"``,
          ``"Mid"``, ``"Low"``) to the list of entity codes in that level;
        * ``entity_labels`` maps each entity code to its German display
          name. Its insertion order is significant for callers that derive
          per-code values (such as colours) from the key order.
    """
    # Display names, kept in the same (alphabetical by code) order.
    labels_by_code = dict(
        AN="Rechtsbeistand",
        EUN="EUNorm",
        GRT="Gericht",
        GS="Norm",
        INN="Institution",
        LD="Land",
        LDS="Bezirk",
        LIT="Schrifttum",
        MRK="Marke",
        ORG="Organisation",
        PER="Person",
        RR="RichterIn",
        RS="Entscheidung",
        ST="Stadt",
        STR="Strasse",
        UN="Unternehmen",
        VO="Verordnung",
        VS="Richtlinie",
        VT="Vertrag",
    )

    # Entity codes grouped by how important it is to anonymize them.
    codes_by_importance = dict(
        High=["PER", "UN", "INN", "MRK"],
        Mid=["RR", "AN", "GRT", "GS", "VO", "RS", "EUN", "LIT", "VS", "VT"],
        Low=["LD", "ST", "STR", "LDS", "ORG"],
    )

    return codes_by_importance, labels_by_code
def generate_fixed_colors(keys, alpha=0.25):
    """Assign one ``rgba(...)`` CSS colour string to every key.

    Colours are taken, in order, from seaborn's ``"tab20"`` palette
    requested with ``len(keys)`` entries.

    Args:
        keys: Sequence of keys to colour; the i-th key gets the i-th colour.
        alpha: Opacity written verbatim into each ``rgba`` string.

    Returns:
        Dict mapping each key to ``"rgba(R, G, B, alpha)"`` with integer
        0-255 channel values.
    """
    palette = sns.color_palette("tab20", len(keys))
    colors = {}
    for key, channels in zip(keys, palette):
        # Palette channels are fractions; scale to the 0-255 range.
        red, green, blue = (int(channel * 255) for channel in channels)
        colors[key] = f"rgba({red}, {green}, {blue}, {alpha})"
    return colors
@st.cache_resource
def load_ner_model():
    """Build the JuraNER token-classification pipeline.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, the tokenizer and model would be re-instantiated on
    each rerun; ``st.cache_resource`` keeps a single pipeline object and
    hands it back on subsequent calls.

    Returns:
        A ``transformers`` ``"ner"`` pipeline wrapping the
        ``harshildarji/JuraNER`` model and tokenizer.
    """
    model_id = "harshildarji/JuraNER"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForTokenClassification.from_pretrained(model_id)
    return pipeline("ner", model=model, tokenizer=tokenizer)
def merge_entities(entities):
    """Merge token-level NER hits into cleaned multi-token entities.

    Tokens are ordered by their ``"index"``. A token whose index directly
    follows the previous group's last index is appended to that group
    (``##`` word pieces are glued on, other tokens are joined with a
    space); the group's ``"end"`` and ``"index"`` advance to the new token
    and its score becomes the mean of all member scores. The group keeps
    the remaining fields (including ``"entity"``) of its first token.

    Each merged word is then normalised (spacing around ``.``, ``,`` and
    ``/``; surrounding whitespace/punctuation stripped). Groups whose
    cleaned word is at most one character long or contains no word
    character are dropped.

    Args:
        entities: List of dicts with at least ``"index"``, ``"word"``,
            ``"score"`` and ``"end"`` keys. The input dicts are not
            modified.

    Returns:
        List of new dicts, one per surviving merged entity.
    """
    if not entities:
        return []

    # --- Pass 1: group tokens with consecutive indices --------------------
    groups = []
    for token in sorted(entities, key=lambda item: item["index"]):
        current = groups[-1] if groups else None
        if current is not None and token["index"] == current["index"] + 1:
            piece = token["word"]
            if piece.startswith("##"):
                current["word"] += piece[2:]
            else:
                current["word"] += " " + piece
            current["end"] = token["end"]
            current["index"] = token["index"]
            current["score_sum"] += token["score"]
            current["count"] += 1
        else:
            fresh = token.copy()
            fresh["score_sum"] = token["score"]
            fresh["count"] = 1
            groups.append(fresh)

    # --- Pass 2: replace the running totals by the mean score -------------
    for group in groups:
        total = group.pop("score_sum")
        members = group.pop("count")
        group["score"] = total / members

    # --- Pass 3: tidy up the surface form and filter out noise ------------
    spacing_rules = (
        (r"\s*\.\s*", "."),
        (r"\s*,\s*", ", "),
        (r"\s*/\s*", "/"),
    )
    edge_chars = string.whitespace + string.punctuation
    cleaned_entities = []
    for group in groups:
        text = group["word"].strip()
        for pattern, replacement in spacing_rules:
            text = re.sub(pattern, replacement, text)
        text = text.strip(edge_chars)
        if len(text) <= 1 or not re.search(r"\w", text):
            continue
        entry = group.copy()
        entry["word"] = text
        cleaned_entities.append(entry)
    return cleaned_entities
def truncate(number, decimals=2):
    """Truncate *number* toward zero to *decimals* decimal places.

    The truncation is done on the decimal representation of the value
    rather than on ``number * 10**decimals``: the binary product can land
    just below the intended integer (``0.29 * 100 == 28.999999999999996``),
    which would turn 0.29 into 0.28.

    Args:
        number: Finite real number (anything ``float()`` accepts). Intended
            for values of moderate magnitude such as confidence scores.
        decimals: Number of decimal places to keep.

    Returns:
        The truncated value as a ``float``.
    """
    from decimal import ROUND_DOWN, Decimal

    # repr() of a float is the shortest string that round-trips, so the
    # Decimal holds exactly the digits a reader would see for the value.
    exact = Decimal(repr(float(number)))
    step = Decimal(1).scaleb(-decimals)
    return float(exact.quantize(step, rounding=ROUND_DOWN))
def _letter_suffix(count):
    """Return the spreadsheet-style letter code for a zero-based counter.

    0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB", ... so the suffix stays
    alphabetic no matter how many entities share a label.
    """
    suffix = ""
    remaining = count
    while True:
        remaining, offset = divmod(remaining, 26)
        suffix = chr(ord("A") + offset) + suffix
        if remaining == 0:
            return suffix
        remaining -= 1


def highlight_entities(
    line,
    merged_entities,
    importance_levels,
    threshold,
    label_counters,
    anonymized_map,
    allowed_keys,
    entity_labels,
    entity_importance,
    ENTITY_COLORS,
):
    """Render one text line as HTML with entity spans and placeholders.

    Every entity whose score reaches ``threshold`` is wrapped in a
    ``<span class="entity">`` carrying a tooltip with its label and
    truncated score. If the entity's label belongs to one of the selected
    importance levels *and* its display key is listed in ``allowed_keys``,
    the text is replaced by a placeholder such as ``"Person A"`` and the
    span is marked; otherwise the original slice of ``line`` is kept
    verbatim (not the tokenizer-reconstructed ``word``, whose spacing and
    punctuation may differ from the source text).

    Args:
        line: The original text line the entity offsets refer to.
        merged_entities: Entity dicts with ``"score"``, ``"start"``,
            ``"end"``, ``"entity"`` and ``"word"``, ordered by position.
        importance_levels: Importance levels selected for anonymization.
        threshold: Minimum score for an entity to be considered at all.
        label_counters: Dict label -> number of suffixes handed out so far.
            Updated in place when a new suffix is assigned.
        anonymized_map: Dict ``(word.lower(), label)`` -> suffix. Updated
            in place when an entity has no suffix yet.
        allowed_keys: Display keys (``"<label name> <suffix> : <word>"``)
            that should actually be anonymized.
        entity_labels: Dict entity code -> display name.
        entity_importance: Dict importance level -> list of entity codes.
        ENTITY_COLORS: Dict entity code -> CSS colour for marked spans.

    Returns:
        The HTML fragment for the line.
    """
    # NOTE(review): line text is inserted without HTML escaping, and main()
    # recovers the plain text by stripping tags with a regex. Escaping here
    # would therefore have to be paired with unescaping in main().
    allowed = set(allowed_keys)
    # Hoisted: the set of labels to anonymize does not change per entity.
    anonymized_labels = {
        code for level in importance_levels for code in entity_importance[level]
    }

    parts = []
    last_end = 0
    for ent in merged_entities:
        if ent["score"] < threshold:
            continue
        start, end = ent["start"], ent["end"]
        label = ent["entity"].split("-")[-1]
        label_desc = entity_labels.get(label, label)
        truncated_score = truncate(ent["score"], 2)
        tooltip = f"{label_desc} ({truncated_score:.2f})"
        color = ENTITY_COLORS.get(label, "#cccccc")

        parts.append(line[last_end:start])

        # Default: show the source text exactly as it appears in the line.
        display = line[start:end]
        style = ""
        css_class = "entity"

        if label in anonymized_labels:
            key = (ent["word"].lower(), label)
            if key not in anonymized_map:
                count = label_counters.get(label, 0)
                label_counters[label] = count + 1
                anonymized_map[key] = _letter_suffix(count)
            suffix = anonymized_map[key]
            normalized_word = ent["word"].strip().lower()
            display_key = f"{label_desc} {suffix} : {normalized_word}"
            if display_key in allowed:
                display = f"{label_desc} {suffix}"
                style = f"background-color:{color}; font-weight:600;"
                css_class = "entity marked"

        parts.append(
            f'<span class="{css_class}" style="{style}">{display}<span class="tooltip">{tooltip}</span></span>'
        )
        last_end = end

    parts.append(line[last_end:])
    return "".join(parts)
def _suffix_for_index(index):
    """Map a zero-based counter to letters: 0 -> "A", 25 -> "Z", 26 -> "AA"."""
    letters = ""
    remaining = index
    while True:
        remaining, offset = divmod(remaining, 26)
        letters = chr(ord("A") + offset) + letters
        if remaining == 0:
            return letters
        remaining -= 1


def main():
    """Run the Streamlit app.

    Flow: set up the page, load the NER pipeline, let the user upload a
    ``.txt`` file and choose importance levels and a score threshold, run
    the model line by line, list every anonymizable entity as a sidebar
    checkbox, render the highlighted text, and offer the anonymized plain
    text as a download.
    """
    setup_page()
    entity_importance, entity_labels = get_constants()
    ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
    ner = load_ner_model()

    st.markdown("#### Juristische Anonymisierung")
    uploaded_file = st.file_uploader(
        "Bitte laden Sie eine .txt-Datei hoch:", type="txt"
    )

    importance_display_to_key = {"Hoch": "High", "Mittel": "Mid", "Niedrig": "Low"}
    selected_importance_display = st.multiselect(
        "Wähle Wichtigkeitsstufen zur Anonymisierung:",
        options=list(importance_display_to_key.keys()),
        default=["Hoch"],
    )
    importance_levels = [
        importance_display_to_key[i] for i in selected_importance_display
    ]

    with st.expander("Übersicht: Entitätstypen nach Wichtigkeit", expanded=False):
        for label, level in importance_display_to_key.items():
            ent_list = [entity_labels[k] for k in entity_importance[level]]
            st.markdown(f"**{label}**: {', '.join(ent_list)}")

    threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.8, 0.01)
    st.markdown("---")

    if not uploaded_file:
        return

    raw_bytes = uploaded_file.read()
    encoding = detect(raw_bytes)["encoding"]
    if encoding is None:
        st.error("Zeichenkodierung konnte nicht erkannt werden.")
        return
    text = raw_bytes.decode(encoding)

    label_counters = {}
    anonymized_map = {}
    merged_all_lines = []
    # Labels covered by the selected importance levels (loop-invariant).
    anonymized_labels = {
        code for lvl in importance_levels for code in entity_importance[lvl]
    }

    with st.spinner("Modell läuft und verarbeitet die Datei..."):
        for line in text.splitlines():
            if not line.strip():
                # Keep blank lines (without running the model) so paragraph
                # breaks survive in both the rendered view and the download.
                merged_all_lines.append((line, []))
                continue
            merged = merge_entities(ner(line))
            merged_all_lines.append((line, merged))
            for ent in merged:
                label = ent["entity"].split("-")[-1]
                if label not in anonymized_labels:
                    continue
                key = (ent["word"].lower(), label)
                if key not in anonymized_map:
                    count = label_counters.get(label, 0)
                    label_counters[label] = count + 1
                    anonymized_map[key] = _suffix_for_index(count)

    with st.sidebar:
        st.markdown("### Anonymisierte Entitäten verwalten:")
        selected_keys = []
        for label_code in sorted(
            {k[1] for k in anonymized_map},
            key=lambda x: entity_labels.get(x, x),
        ):
            group = [k for k in anonymized_map if k[1] == label_code]
            label_name = entity_labels.get(label_code, label_code)
            st.markdown(f"**{label_name}**")
            # Length first, so "AA" is listed after "Z" rather than after "A".
            for key in sorted(
                group, key=lambda k: (len(anonymized_map[k]), anonymized_map[k])
            ):
                suffix = anonymized_map[key]
                normalized_word = key[0].strip().lower()
                entity_display = f"{label_name} {suffix} : {normalized_word}"
                if st.checkbox(entity_display, value=True, key=entity_display):
                    selected_keys.append(entity_display)

    anonymized_lines = []
    for line, merged in merged_all_lines:
        if not line.strip():
            st.markdown("<br>", unsafe_allow_html=True)
            anonymized_lines.append("")
            continue
        html_line = highlight_entities(
            line,
            merged,
            importance_levels,
            threshold,
            label_counters,
            anonymized_map,
            selected_keys,
            entity_labels,
            entity_importance,
            ENTITY_COLORS,
        )
        st.markdown(
            f'<div style="margin-bottom:0.8rem; line-height:1.8;">{html_line}</div>',
            unsafe_allow_html=True,
        )
        # Plain text for the download: drop the tooltip spans with their
        # content first, then strip the remaining tags.
        # NOTE(review): the tag regex also removes any literal "<...>" that
        # was part of the uploaded text, since line text is embedded unescaped.
        cleaned = re.sub(r'<span class="tooltip">.*?</span>', "", html_line)
        text_only = re.sub(r"<[^>]+>", "", cleaned)
        anonymized_lines.append(text_only.strip())

    st.markdown("---")
    st.download_button(
        label="Anonymisierten Text herunterladen",
        data="\n".join(anonymized_lines),
        file_name=f"anonymisiert_{uploaded_file.name}",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()

index.html DELETED Viewed

@@ -1,19 +0,0 @@
-<!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
-</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+transformers
+torch
+seaborn

style.css DELETED Viewed

@@ -1,28 +0,0 @@
-body {
-	padding: 2rem;
-	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-h1 {
-	font-size: 16px;
-	margin-top: 0;
-}
-p {
-	color: rgb(107, 114, 128);
-	font-size: 15px;
-	margin-bottom: 10px;
-	margin-top: 5px;
-}
-.card {
-	max-width: 620px;
-	margin: 0 auto;
-	padding: 16px;
-	border: 1px solid lightgray;
-	border-radius: 16px;
-}
-.card p:last-child {
-	margin-bottom: 0;
-}