Spaces:

harshildarji
/

Legal-NER-Demo

Running

App Files Files Community

harshildarji committed on Feb 16

Commit

bdcb2c9

verified ·

1 Parent(s): 4635bc5

Add selective anonymization, DE option for UI

Browse files

Files changed (1) hide show

app.py +102 -40

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import re
-import os
 import warnings
 import matplotlib.colors as mcolors
@@ -114,13 +113,53 @@ st.markdown(
             border-radius: 3px;
             padding: 2px 4px;
         }
     </style>
 """,
     unsafe_allow_html=True,
 )
 # Initialization for German Legal NER
-tkn = os.getenv("tkn")
 tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraBERT", use_auth_token=tkn)
 model = AutoModelForTokenClassification.from_pretrained(
     "harshildarji/JuraBERT", use_auth_token=tkn
@@ -152,14 +191,14 @@ classes = {
 ner_labels = list(classes.keys())
-# Function to generate a list of colors for visualization
 def generate_colors(num_colors):
     """Return ``num_colors`` hex color strings sampled from the "tab20" colormap.

     The colormap is evaluated at ``i / num_colors`` for every ``i`` in
     ``range(num_colors)``; passing 0 yields an empty list.
     """
     palette = plt.get_cmap("tab20")
     hex_codes = []
     for index in range(num_colors):
         rgba = palette(1.0 * index / num_colors)
         hex_codes.append(mcolors.rgb2hex(rgba))
     return hex_codes
-# Function to color substrings based on NER results
 def color_substrings(input_string, model_output):
     colors = generate_colors(len(ner_labels))
     label_to_color = {
@@ -173,29 +212,41 @@ def color_substrings(input_string, model_output):
         start, end, label = entity["start"], entity["end"], entity["label"]
         html_output += input_string[last_end:start]
         tooltip = classes.get(label, "")
-        html_output += f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
         last_end = end
     html_output += input_string[last_end:]
     return html_output
-# Function to anonymize entities
-def anonymize_text(input_string, model_output):
     anonymized_text = ""
     last_end = 0
     for entity in sorted(model_output, key=lambda x: x["start"]):
         start, end, label = entity["start"], entity["end"], entity["label"]
         anonymized_text += input_string[last_end:start]
-        anonymized_text += (
-            f'<span class="anonymized">[{classes.get(label, label)}]</span>'
-        )
         last_end = end
     anonymized_text += input_string[last_end:]
     return anonymized_text
@@ -209,7 +260,6 @@ def merge_entities(ner_results):
         token_start, token_end = token["start"], token["end"]
         token_word = token["word"].replace("##", "")  # Remove subword prefixes
-        # Start a new entity if necessary
         if (
             tag.startswith("B-")
             or current_entity is None
@@ -228,11 +278,9 @@ def merge_entities(ner_results):
             and current_entity
             and current_entity["label"] == entity_type
         ):
-            # Extend the current entity
             current_entity["end"] = token_end
             current_entity["word"] += token_word
         else:
-            # Handle misclassifications or gaps in tokens
             if (
                 current_entity
                 and token_start == current_entity["end"]
@@ -241,7 +289,6 @@ def merge_entities(ner_results):
                 current_entity["end"] = token_end
                 current_entity["word"] += token_word
             else:
-                # Treat it as a new entity if the above conditions aren't met
                 if current_entity:
                     merged_entities.append(current_entity)
                 current_entity = {
@@ -251,32 +298,52 @@ def merge_entities(ner_results):
                     "word": token_word,
                 }
-    # Append the last entity
     if current_entity:
         merged_entities.append(current_entity)
     return merged_entities
-st.title("Legal NER")
-st.markdown("<hr>", unsafe_allow_html=True)
-uploaded_file = st.file_uploader("Upload a .txt file", type="txt")
 if uploaded_file is not None:
     try:
         raw_content = uploaded_file.read()
         detected = detect(raw_content)
         encoding = detected["encoding"]
         if encoding is None:
             raise ValueError("Unable to detect file encoding.")
         lines = raw_content.decode(encoding).splitlines()
-        anonymize_mode = st.checkbox("Anonymize")
         st.markdown(
             "<hr style='margin-top: 10px; margin-bottom: 20px;'>",
             unsafe_allow_html=True,
@@ -285,13 +352,12 @@ if uploaded_file is not None:
         anonymized_lines = []
         displayed_lines = []
-        for line_number, line in enumerate(lines, start=1):
             if line.strip():
-                results = ner(line)
-                merged_results = merge_entities(results)
                 if anonymize_mode:
-                    anonymized_text = anonymize_text(line, merged_results)
                     displayed_lines.append(anonymized_text)
                     plain_text = re.sub(r"<.*?>", "", anonymized_text)
                     anonymized_lines.append(plain_text.strip())
@@ -299,31 +365,27 @@ if uploaded_file is not None:
                     colored_html = color_substrings(line, merged_results)
                     st.markdown(f"{colored_html}", unsafe_allow_html=True)
             else:
-                displayed_lines.append("<br>")
                 anonymized_lines.append("")
         if anonymize_mode:
             original_file_name = uploaded_file.name
             download_file_name = f"Anon_{original_file_name}"
             anonymized_content = "\n".join(anonymized_lines)
             for displayed_line in displayed_lines:
                 st.markdown(f"{displayed_line}", unsafe_allow_html=True)
             st.markdown("<hr>", unsafe_allow_html=True)
             st.download_button(
-                label="Download Anonymized Text",
                 data=anonymized_content,
                 file_name=download_file_name,
                 mime="text/plain",
             )
-        if not anonymize_mode:
             st.markdown(
-                '<div class="tip"><strong>Tip:</strong> Hover over the colored words to see its class.</div>',
                 unsafe_allow_html=True,
             )
     except Exception as e:
-        st.error(f"An error occurred while processing the file: {e}")

 import re
 import warnings
 import matplotlib.colors as mcolors
             border-radius: 3px;
             padding: 2px 4px;
         }
+        #language-container {
+            position: fixed;
+            top: 10px;
+            right: 10px;
+            z-index: 1000;
+        }
     </style>
 """,
     unsafe_allow_html=True,
 )
+# UI text for English and German.
+ui_text = {
+    "EN": {
+        "title": "Legal NER",
+        "upload": "Upload a .txt file",
+        "anonymize": "Anonymize",
+        "select_entities": "Entity types to anonymize:",
+        "download": "Download Anonymized Text",
+        "tip": "Tip: Hover over the colored words to see its class.",
+        "error": "An error occurred while processing the file: ",
+    },
+    "DE": {
+        "title": "Juristische NER",
+        "upload": "Lade eine .txt-Datei hoch",
+        "anonymize": "Anonymisieren",
+        "select_entities": "Entitätstypen zur Anonymisierung:",
+        "download": "Anonymisierten Text herunterladen",
+        "tip": "Tipp: Fahre mit der Maus über die farbigen Wörter, um deren Klasse zu sehen.",
+        "error": "Beim Verarbeiten der Datei ist ein Fehler aufgetreten: ",
+    },
+}
+col1, col2 = st.columns([4, 1])
+with col2:
+    lang = st.radio(
+        "",
+        options=["EN", "DE"],
+        horizontal=True,
+        label_visibility="collapsed",
+        key="language_selector",
+    )
+with col1:
+    st.title(ui_text[lang]["title"])
 # Initialization for German Legal NER
+tkn = open("./token").read()
 tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraBERT", use_auth_token=tkn)
 model = AutoModelForTokenClassification.from_pretrained(
     "harshildarji/JuraBERT", use_auth_token=tkn
 ner_labels = list(classes.keys())
+# Generate a list of colors for visualization
 def generate_colors(num_colors):
     """Return ``num_colors`` hex color strings sampled from the "tab20" colormap.

     The colormap is evaluated at ``i / num_colors`` for every ``i`` in
     ``range(num_colors)``; passing 0 yields an empty list.
     """
     palette = plt.get_cmap("tab20")
     hex_codes = []
     for index in range(num_colors):
         rgba = palette(1.0 * index / num_colors)
         hex_codes.append(mcolors.rgb2hex(rgba))
     return hex_codes
+# Color substrings based on NER results
 def color_substrings(input_string, model_output):
     colors = generate_colors(len(ner_labels))
     label_to_color = {
         start, end, label = entity["start"], entity["end"], entity["label"]
         html_output += input_string[last_end:start]
         tooltip = classes.get(label, "")
+        html_output += (
+            f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">'
+            f'{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
+        )
         last_end = end
     html_output += input_string[last_end:]
     return html_output
+# Selectively anonymize entities
+def anonymize_text(input_string, model_output, selected_entities=None):
     anonymized_text = ""
     last_end = 0
+    colors = generate_colors(len(ner_labels))
+    label_to_color = {
+        label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
+    }
     for entity in sorted(model_output, key=lambda x: x["start"]):
         start, end, label = entity["start"], entity["end"], entity["label"]
         anonymized_text += input_string[last_end:start]
+        if selected_entities is None or label in selected_entities:
+            anonymized_text += (
+                f'<span class="anonymized">[{classes.get(label, label)}]</span>'
+            )
+        else:
+            tooltip = classes.get(label, "")
+            anonymized_text += (
+                f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">'
+                f'{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
+            )
         last_end = end
     anonymized_text += input_string[last_end:]
     return anonymized_text
         token_start, token_end = token["start"], token["end"]
         token_word = token["word"].replace("##", "")  # Remove subword prefixes
         if (
             tag.startswith("B-")
             or current_entity is None
             and current_entity
             and current_entity["label"] == entity_type
         ):
             current_entity["end"] = token_end
             current_entity["word"] += token_word
         else:
             if (
                 current_entity
                 and token_start == current_entity["end"]
                 current_entity["end"] = token_end
                 current_entity["word"] += token_word
             else:
                 if current_entity:
                     merged_entities.append(current_entity)
                 current_entity = {
                     "word": token_word,
                 }
     if current_entity:
         merged_entities.append(current_entity)
     return merged_entities
+uploaded_file = st.file_uploader(ui_text[lang]["upload"], type="txt")
 if uploaded_file is not None:
     try:
         raw_content = uploaded_file.read()
         detected = detect(raw_content)
         encoding = detected["encoding"]
         if encoding is None:
             raise ValueError("Unable to detect file encoding.")
         lines = raw_content.decode(encoding).splitlines()
+        line_results = []
+        for line in lines:
+            if line.strip():
+                results = ner(line)
+                merged_results = merge_entities(results)
+                line_results.append(merged_results)
+            else:
+                line_results.append([])
+        anonymize_mode = st.checkbox(ui_text[lang]["anonymize"])
+        selected_entities = None
+        if anonymize_mode:
+            detected_entity_tags = set()
+            for merged_results in line_results:
+                for entity in merged_results:
+                    detected_entity_tags.add(entity["label"])
+            inverse_classes = {v: k for k, v in classes.items()}
+            detected_options = sorted([classes[tag] for tag in detected_entity_tags])
+            selected_options = st.multiselect(
+                ui_text[lang]["select_entities"],
+                options=detected_options,
+                default=detected_options,
+            )
+            selected_entities = [
+                inverse_classes[options] for options in selected_options
+            ]
         st.markdown(
             "<hr style='margin-top: 10px; margin-bottom: 20px;'>",
             unsafe_allow_html=True,
         anonymized_lines = []
         displayed_lines = []
+        for line, merged_results in zip(lines, line_results):
             if line.strip():
                 if anonymize_mode:
+                    anonymized_text = anonymize_text(
+                        line, merged_results, selected_entities=selected_entities
+                    )
                     displayed_lines.append(anonymized_text)
                     plain_text = re.sub(r"<.*?>", "", anonymized_text)
                     anonymized_lines.append(plain_text.strip())
                     colored_html = color_substrings(line, merged_results)
                     st.markdown(f"{colored_html}", unsafe_allow_html=True)
             else:
+                # displayed_lines.append("<br>")
                 anonymized_lines.append("")
         if anonymize_mode:
             original_file_name = uploaded_file.name
             download_file_name = f"Anon_{original_file_name}"
             anonymized_content = "\n".join(anonymized_lines)
             for displayed_line in displayed_lines:
                 st.markdown(f"{displayed_line}", unsafe_allow_html=True)
             st.markdown("<hr>", unsafe_allow_html=True)
             st.download_button(
+                label=ui_text[lang]["download"],
                 data=anonymized_content,
                 file_name=download_file_name,
                 mime="text/plain",
             )
+        else:
             st.markdown(
+                f'<div class="tip"><strong>{ui_text[lang]["tip"]}</strong></div>',
                 unsafe_allow_html=True,
             )
     except Exception as e:
+        st.error(f"{ui_text[lang]['error']}{e}")