harshildarji committed on
Commit
4694471
·
1 Parent(s): a3633ad

upload app

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. .streamlit/config.toml +2 -0
  3. README.md +4 -4
  4. app.py +346 -0
  5. index.html +0 -19
  6. requirements.txt +3 -0
  7. style.css +0 -28
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .DS_Store
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ base="light"
README.md CHANGED
@@ -3,8 +3,8 @@ title: Juristische Anonymisierung
3
  emoji: 👀
4
  colorFrom: indigo
5
  colorTo: purple
6
- sdk: static
 
 
7
  pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
3
  emoji: 👀
4
  colorFrom: indigo
5
  colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.45.1
8
+ app_file: app.py
9
  pinned: false
10
+ ---
 
 
app.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ import seaborn as sns
5
+ import streamlit as st
6
+ from charset_normalizer import detect
7
+ from transformers import (
8
+ AutoModelForTokenClassification,
9
+ AutoTokenizer,
10
+ logging,
11
+ pipeline,
12
+ )
13
+
14
+
15
def setup_page():
    """Configure the Streamlit page and inject the app's stylesheet.

    Sets the page title, icon and wide layout, lowers the transformers
    logging verbosity to errors only, and emits a ``<style>`` block via
    ``st.markdown`` that tightens the container padding, hides the page
    header/footer and styles entity spans and their hover tooltips.
    """
    st.set_page_config(
        layout="wide",
        page_icon="⚖️",
        page_title="Juristische Anonymisierung",
    )
    logging.set_verbosity(logging.ERROR)

    # `.entity` wraps every recognised span, `.tooltip` is its hover label
    # (hidden until the entity is hovered), and `.marked` is added to spans
    # that were replaced by a pseudonym.
    stylesheet = """
        <style>
        .block-container {
            padding-top: 1rem;
            padding-bottom: 5rem;
            padding-left: 3rem;
            padding-right: 3rem;
        }
        header, footer {visibility: hidden;}
        .entity {
            position: relative;
            display: inline-block;
            background-color: transparent;
            font-weight: normal;
            cursor: help;
        }
        .entity .tooltip {
            visibility: hidden;
            background-color: #333;
            color: #fff;
            text-align: center;
            border-radius: 4px;
            padding: 2px 6px;
            position: absolute;
            z-index: 1;
            bottom: 125%;
            left: 50%;
            transform: translateX(-50%);
            white-space: nowrap;
            opacity: 0;
            transition: opacity 0.05s;
            font-size: 11px;
        }
        .entity:hover .tooltip {
            visibility: visible;
            opacity: 1;
        }
        .entity.marked {
            background-color: rgba(255, 230, 0, 0.4);
            line-height: 1.3;
            padding: 0 1px;
            border-radius: 0px;
        }
        </style>
        """
    st.markdown(stylesheet, unsafe_allow_html=True)
68
+
69
+
70
def get_constants():
    """Return the static entity tables used by the app.

    Returns:
        A ``(entity_importance, entity_labels)`` tuple.
        ``entity_importance`` maps the levels ``"High"``, ``"Mid"`` and
        ``"Low"`` to lists of entity codes; ``entity_labels`` maps each
        entity code to its German display name. Fresh objects are built
        on every call.
    """
    # Insertion order matters to callers that iterate these tables,
    # so the codes are listed in the same order as before.
    labels_by_code = dict(
        AN="Rechtsbeistand",
        EUN="EUNorm",
        GRT="Gericht",
        GS="Norm",
        INN="Institution",
        LD="Land",
        LDS="Bezirk",
        LIT="Schrifttum",
        MRK="Marke",
        ORG="Organisation",
        PER="Person",
        RR="RichterIn",
        RS="Entscheidung",
        ST="Stadt",
        STR="Strasse",
        UN="Unternehmen",
        VO="Verordnung",
        VS="Richtlinie",
        VT="Vertrag",
    )
    codes_by_level = {
        "High": "PER UN INN MRK".split(),
        "Mid": "RR AN GRT GS VO RS EUN LIT VS VT".split(),
        "Low": "LD ST STR LDS ORG".split(),
    }
    return codes_by_level, labels_by_code
98
+
99
+
100
def generate_fixed_colors(keys, alpha=0.25):
    """Assign one CSS ``rgba(...)`` colour string to each key.

    Args:
        keys: Sequence of keys to colour; ``len(keys)`` colours are
            requested from seaborn's ``"tab20"`` palette.
        alpha: Alpha component written into every colour string.

    Returns:
        Dict mapping each key to ``"rgba(R, G, B, alpha)"``, where the
        channels are the palette values scaled by 255 and truncated to int.
    """
    palette = sns.color_palette("tab20", len(keys))
    colors = {}
    for key, rgb in zip(keys, palette):
        red, green, blue = (int(channel * 255) for channel in rgb)
        colors[key] = f"rgba({red}, {green}, {blue}, {alpha})"
    return colors
107
+
108
+
109
@st.cache_resource
def load_ner_model():
    """Build the token-classification pipeline for the JuraNER model.

    Streamlit re-executes the script on every widget interaction, so the
    result is cached with ``st.cache_resource``: the tokenizer and model
    are loaded once per server process instead of once per rerun.

    Returns:
        A transformers ``"ner"`` pipeline built from the
        ``harshildarji/JuraNER`` model and tokenizer.
    """
    model_name = "harshildarji/JuraNER"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    return pipeline("ner", model=model, tokenizer=tokenizer)
113
+
114
+
115
def merge_entities(entities):
    """Merge token-level predictions with consecutive indices into spans.

    Args:
        entities: Iterable of token dicts carrying at least ``"index"``,
            ``"word"``, ``"score"`` and ``"end"``. The input dicts are
            not modified.

    Returns:
        A list of span dicts. Each span is a copy of the first token of
        its run, with ``"word"`` rebuilt from all tokens in the run,
        ``"end"``/``"index"`` taken from the last token and ``"score"``
        set to the mean token score. Spans whose cleaned word is shorter
        than two characters or contains no word character are dropped.
    """
    if not entities:
        return []

    # 1) Collect runs of tokens whose indices follow each other directly.
    runs = []
    for token in sorted(entities, key=lambda e: e["index"]):
        if runs and token["index"] == runs[-1][-1]["index"] + 1:
            runs[-1].append(token)
        else:
            runs.append([token])

    # Spacing normalisation applied to the rebuilt word, in this order.
    spacing_rules = (
        (r"\s*\.\s*", "."),
        (r"\s*,\s*", ", "),
        (r"\s*/\s*", "/"),
    )
    strip_chars = string.whitespace + string.punctuation

    # 2) Fold every run into a single span and clean up its text.
    spans = []
    for run in runs:
        head, *tail = run
        span = head.copy()
        text = head["word"]
        total = head["score"]
        for piece in tail:
            tok = piece["word"]
            # "##" marks a word-piece continuation: glue it on without a space.
            text += tok[2:] if tok.startswith("##") else " " + tok
            total += piece["score"]
        if tail:
            span["end"] = tail[-1]["end"]
            span["index"] = tail[-1]["index"]
        span["score"] = total / len(run)

        word = text.strip()
        for pattern, replacement in spacing_rules:
            word = re.sub(pattern, replacement, word)
        word = word.strip(strip_chars)

        if len(word) > 1 and re.search(r"\w", word):
            span["word"] = word
            spans.append(span)
    return spans
156
+
157
+
158
def truncate(number, decimals=2):
    """Cut ``number`` off after ``decimals`` decimal places without rounding.

    The truncation is done on the number's decimal text form, not with
    binary float multiplication. ``int(0.29 * 100) / 100`` yields ``0.28``
    because ``0.29 * 100`` evaluates to ``28.999999999999996``; working in
    decimal keeps ``0.29`` as ``0.29``.

    Args:
        number: Real number to truncate (int, float or a numeric type
            whose ``str()`` is a plain decimal literal).
        decimals: Number of decimal places to keep.

    Returns:
        The truncated value as a float; truncation is toward zero.
    """
    from decimal import ROUND_DOWN, Decimal

    # 10 ** -decimals as an exact Decimal, e.g. Decimal("0.01") for decimals=2.
    quantum = Decimal(1).scaleb(-decimals)
    return float(Decimal(str(number)).quantize(quantum, rounding=ROUND_DOWN))
161
+
162
+
163
def _letter_suffix(position):
    """Return the letter code for a zero-based position: 0 -> "A", 25 -> "Z", 26 -> "AA"."""
    letters = ""
    remaining = position + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters


def _assign_suffix(word, label, label_counters, anonymized_map):
    """Return the pseudonym suffix for ``(word, label)``, allocating the next free one on first sight."""
    key = (word.lower(), label)
    if key not in anonymized_map:
        position = label_counters.get(label, 0)
        label_counters[label] = position + 1
        anonymized_map[key] = _letter_suffix(position)
    return anonymized_map[key]


def highlight_entities(
    line,
    merged_entities,
    importance_levels,
    threshold,
    label_counters,
    anonymized_map,
    allowed_keys,
    entity_labels,
    entity_importance,
    ENTITY_COLORS,
):
    """Render one text line as HTML with entity spans and pseudonyms.

    All text taken from ``line`` is HTML-escaped, so markup characters in
    the uploaded document are shown literally instead of being interpreted
    by the browser.

    Args:
        line: Source text of the line.
        merged_entities: Span dicts for this line with ``"start"``,
            ``"end"``, ``"score"``, ``"entity"`` and ``"word"``.
        importance_levels: Importance levels whose entity types are
            replaced by a pseudonym.
        threshold: Spans with a score below this value are left as plain text.
        label_counters: Dict of how many pseudonyms were issued per entity
            code; updated in place when a new one is allocated.
        anonymized_map: Dict ``(lowercased word, code) -> suffix``; updated
            in place when a new pseudonym is allocated.
        allowed_keys: Container of ``"<label> <suffix> : <word>"`` strings;
            only entities whose key is in it are replaced.
        entity_labels: Entity code -> display name.
        entity_importance: Importance level -> list of entity codes.
        ENTITY_COLORS: Entity code -> CSS colour for replaced spans.

    Returns:
        The HTML for the line. Replaced entities show
        ``"<label> <suffix>"``; every other span shows the original text
        slice ``line[start:end]``.
    """
    # Imported locally under its own name so the `html` module is never
    # confused with an HTML string variable.
    from html import escape

    sensitive_labels = {
        code for level in importance_levels for code in entity_importance[level]
    }

    parts = []
    last_end = 0
    for ent in merged_entities:
        if ent["score"] < threshold:
            continue
        start, end = ent["start"], ent["end"]
        label = ent["entity"].split("-")[-1]
        label_desc = entity_labels.get(label, label)
        tooltip = f"{label_desc} ({truncate(ent['score'], 2):.2f})"
        color = ENTITY_COLORS.get(label, "#cccccc")

        parts.append(escape(line[last_end:start]))

        # Default: show the untouched source text. ent["word"] is rebuilt
        # from tokens (re-spaced, punctuation stripped) and would alter
        # the document if it were printed instead.
        display = line[start:end]
        style = ""
        css_class = "entity"
        if label in sensitive_labels:
            suffix = _assign_suffix(
                ent["word"], label, label_counters, anonymized_map
            )
            normalized_word = ent["word"].strip().lower()
            display_key = f"{label_desc} {suffix} : {normalized_word}"
            if display_key in allowed_keys:
                display = f"{label_desc} {suffix}"
                style = f"background-color:{color}; font-weight:600;"
                css_class = "entity marked"

        parts.append(
            f'<span class="{css_class}" style="{style}">{escape(display)}'
            f'<span class="tooltip">{escape(tooltip)}</span></span>'
        )
        last_end = end

    parts.append(escape(line[last_end:]))
    return "".join(parts)


def main():
    """Run the Streamlit app: upload a text file, detect entities, anonymize and offer a download.

    Flow: configure the page, read the importance levels and confidence
    threshold, run the NER pipeline on every non-blank line of the uploaded
    file, list the pseudonymized entities in the sidebar (each with a
    checkbox to keep or revert it), render the highlighted text and provide
    the anonymized plain text as a download. Blank lines of the input are
    kept as paragraph breaks in both the page and the download.
    """
    from html import unescape

    setup_page()
    entity_importance, entity_labels = get_constants()
    ENTITY_COLORS = generate_fixed_colors(list(entity_labels.keys()))
    ner = load_ner_model()

    st.markdown("#### Juristische Anonymisierung")
    uploaded_file = st.file_uploader(
        "Bitte laden Sie eine .txt-Datei hoch:", type="txt"
    )

    importance_display_to_key = {"Hoch": "High", "Mittel": "Mid", "Niedrig": "Low"}
    selected_importance_display = st.multiselect(
        "Wähle Wichtigkeitsstufen zur Anonymisierung:",
        options=list(importance_display_to_key.keys()),
        default=["Hoch"],
    )
    importance_levels = [
        importance_display_to_key[i] for i in selected_importance_display
    ]

    with st.expander("Übersicht: Entitätstypen nach Wichtigkeit", expanded=False):
        for display_name, level in importance_display_to_key.items():
            ent_list = [entity_labels[k] for k in entity_importance[level]]
            st.markdown(f"**{display_name}**: {', '.join(ent_list)}")

    threshold = st.slider("Schwellenwert für das Modellvertrauen:", 0.0, 1.0, 0.8, 0.01)
    st.markdown("---")

    if not uploaded_file:
        return

    raw_bytes = uploaded_file.read()
    encoding = detect(raw_bytes)["encoding"]
    if encoding is None:
        st.error("Zeichenkodierung konnte nicht erkannt werden.")
        return
    text = raw_bytes.decode(encoding)

    sensitive_labels = {
        code for level in importance_levels for code in entity_importance[level]
    }
    label_counters = {}
    anonymized_map = {}
    merged_all_lines = []

    with st.spinner("Modell läuft und verarbeitet die Datei..."):
        for line in text.splitlines():
            if not line.strip():
                # Keep blank lines (without running the model on them) so
                # the render loop below can reproduce paragraph breaks.
                merged_all_lines.append((line, []))
                continue
            merged = merge_entities(ner(line))
            merged_all_lines.append((line, merged))
            # Register pseudonyms in document order so suffixes are stable
            # regardless of which checkboxes are ticked later.
            for ent in merged:
                label = ent["entity"].split("-")[-1]
                if label in sensitive_labels:
                    _assign_suffix(
                        ent["word"], label, label_counters, anonymized_map
                    )

    with st.sidebar:
        st.markdown("### Anonymisierte Entitäten verwalten:")
        selected_keys = set()
        label_codes = sorted(
            {code for _, code in anonymized_map},
            key=lambda code: entity_labels.get(code, code),
        )
        for label_code in label_codes:
            # .get keeps this in line with every other label lookup and
            # avoids a KeyError for codes missing from entity_labels.
            label_name = entity_labels.get(label_code, label_code)
            st.markdown(f"**{label_name}**")
            group = [key for key in anonymized_map if key[1] == label_code]
            # Sort by length first so "AA" comes after "Z".
            group.sort(key=lambda k: (len(anonymized_map[k]), anonymized_map[k]))
            for key in group:
                suffix = anonymized_map[key]
                normalized_word = key[0].strip().lower()
                entity_display = f"{label_name} {suffix} : {normalized_word}"
                if st.checkbox(entity_display, value=True, key=entity_display):
                    selected_keys.add(entity_display)

    anonymized_lines = []
    for line, merged in merged_all_lines:
        if not line.strip():
            st.markdown("<br>", unsafe_allow_html=True)
            anonymized_lines.append("")
            continue

        html_line = highlight_entities(
            line,
            merged,
            importance_levels,
            threshold,
            label_counters,
            anonymized_map,
            selected_keys,
            entity_labels,
            entity_importance,
            ENTITY_COLORS,
        )
        st.markdown(
            f'<div style="margin-bottom:0.8rem; line-height:1.8;">{html_line}</div>',
            unsafe_allow_html=True,
        )
        # Derive the plain text for the download: drop tooltips, drop the
        # remaining span tags, then undo the HTML escaping of the content.
        without_tooltips = re.sub(r'<span class="tooltip">.*?</span>', "", html_line)
        text_only = unescape(re.sub(r"<[^>]+>", "", without_tooltips))
        anonymized_lines.append(text_only.strip())

    st.markdown("---")
    st.download_button(
        label="Anonymisierten Text herunterladen",
        data="\n".join(anonymized_lines),
        file_name=f"anonymisiert_{uploaded_file.name}",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ torch
3
+ seaborn
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }