Spaces:

harshildarji
/

Juristische-NER

Running on CPU Upgrade

App Files Files Community

Harshil Darji commited on Jan 13

Commit

56fe3c0

1 Parent(s): 347d235

Add app file

Browse files

Files changed (2) hide show

app.py +308 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,308 @@

+import os
+import warnings
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import streamlit as st
+from charset_normalizer import detect
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    logging,
+    pipeline,
+)
+warnings.simplefilter(action="ignore", category=Warning)
+logging.set_verbosity(logging.ERROR)
+st.set_page_config(page_title="Legal NER", page_icon="⚖️", layout="wide")
+st.markdown(
+    """
+    <style>
+        body {
+            font-family: 'Poppins', sans-serif;
+            background-color: #f4f4f8;
+        }
+        .header {
+            background-color: rgba(220, 219, 219, 0.25);
+            color: #000;
+            padding: 5px 0;
+            text-align: center;
+            border-radius: 7px;
+            margin-bottom: 13px;
+            border-bottom: 2px solid #333;
+        }
+        .container {
+            background-color: #fff;
+            padding: 30px;
+            border-radius: 10px;
+            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+            width: 100%;
+            max-width: 1000px;
+            margin: 0 auto;
+            position: absolute;
+            top: 50%;
+            left: 50%;
+            transform: translate(-50%, -50%);
+        }
+        .btn-primary {
+            background-color: #5477d1;
+            border: none;
+            transition: background-color 0.3s, transform 0.2s;
+            border-radius: 25px;
+            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
+        }
+        .btn-primary:hover {
+            background-color: #4c6cbe;
+            transform: translateY(-1px);
+        }
+        h2 {
+            font-weight: 600;
+            font-size: 24px;
+            margin-bottom: 20px;
+        }
+        label {
+            font-weight: 500;
+        }
+        .tip {
+            background-color: rgba(180, 47, 109, 0.25);
+            padding: 7px;
+            border-radius: 7px;
+            display: inline-block;
+            margin-top: 15px;
+            margin-bottom: 15px;
+        }
+        .sec {
+            background-color: rgba(220, 219, 219, 0.10);
+            padding: 7px;
+            border-radius: 5px;
+            display: inline-block;
+            margin-top: 15px;
+            margin-bottom: 15px;
+        }
+        .tooltip {
+            position: relative;
+            display: inline-block;
+            cursor: pointer;
+        }
+        .tooltip .tooltiptext {
+            visibility: hidden;
+            width: 120px;
+            background-color: #6c757d;
+            color: #fff;
+            text-align: center;
+            border-radius: 3px;
+            padding: 3px;
+            position: absolute;
+            z-index: 1;
+            bottom: 125%;
+            left: 50%;
+            margin-left: -60px;
+            opacity: 0;
+            transition: opacity 0.3s;
+        }
+        .tooltip:hover .tooltiptext {
+            visibility: visible;
+            opacity: 1;
+        }
+        .anonymized {
+            background-color: #ffcccb;
+            color: #000;
+            font-weight: bold;
+            border-radius: 3px;
+            padding: 2px 4px;
+        }
+    </style>
+""",
+    unsafe_allow_html=True,
+)
+# Initialization for German Legal NER
+tkn = os.getenv("tkn")
+tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraBERT", use_auth_token=tkn)
+model = AutoModelForTokenClassification.from_pretrained(
+    "harshildarji/JuraBERT", use_auth_token=tkn
+)
+ner = pipeline("ner", model=model, tokenizer=tokenizer)
+# Define class labels for the model
+classes = {
+    "AN": "Lawyer",
+    "EUN": "European legal norm",
+    "GRT": "Court",
+    "GS": "Law",
+    "INN": "Institution",
+    "LD": "Country",
+    "LDS": "Landscape",
+    "LIT": "Legal literature",
+    "MRK": "Brand",
+    "ORG": "Organization",
+    "PER": "Person",
+    "RR": "Judge",
+    "RS": "Court decision",
+    "ST": "City",
+    "STR": "Street",
+    "UN": "Company",
+    "VO": "Ordinance",
+    "VS": "Regulation",
+    "VT": "Contract",
+}
+ner_labels = list(classes.keys())
+# Function to generate a list of colors for visualization
+def generate_colors(num_colors):
+    cm = plt.get_cmap("tab20")
+    colors = [mcolors.rgb2hex(cm(1.0 * i / num_colors)) for i in range(num_colors)]
+    return colors
+# Function to color substrings based on NER results
+def color_substrings(input_string, model_output):
+    colors = generate_colors(len(ner_labels))
+    label_to_color = {
+        label: colors[i % len(colors)] for i, label in enumerate(ner_labels)
+    }
+    last_end = 0
+    html_output = ""
+    for entity in sorted(model_output, key=lambda x: x["start"]):
+        start, end, label = entity["start"], entity["end"], entity["label"]
+        html_output += input_string[last_end:start]
+        tooltip = classes.get(label, "")
+        html_output += f'<span class="tooltip" style="color: {label_to_color.get(label)}; font-weight: bold;">{input_string[start:end]}<span class="tooltiptext">{tooltip}</span></span>'
+        last_end = end
+    html_output += input_string[last_end:]
+    return html_output
+# Function to anonymize entities
+def anonymize_text(input_string, model_output):
+    anonymized_text = ""
+    last_end = 0
+    for entity in sorted(model_output, key=lambda x: x["start"]):
+        start, end, label = entity["start"], entity["end"], entity["label"]
+        anonymized_text += input_string[last_end:start]
+        anonymized_text += (
+            f'<span class="anonymized">[{classes.get(label, label)}]</span>'
+        )
+        last_end = end
+    anonymized_text += input_string[last_end:]
+    return anonymized_text
+def merge_entities(ner_results):
+    merged_entities = []
+    current_entity = None
+    for token in ner_results:
+        tag = token["entity"]
+        entity_type = tag.split("-")[-1] if "-" in tag else tag
+        token_start, token_end = token["start"], token["end"]
+        token_word = token["word"].replace("##", "")  # Remove subword prefixes
+        # Start a new entity if necessary
+        if (
+            tag.startswith("B-")
+            or current_entity is None
+            or current_entity["label"] != entity_type
+        ):
+            if current_entity:
+                merged_entities.append(current_entity)
+            current_entity = {
+                "start": token_start,
+                "end": token_end,
+                "label": entity_type,
+                "word": token_word,
+            }
+        elif (
+            tag.startswith("I-")
+            and current_entity
+            and current_entity["label"] == entity_type
+        ):
+            # Extend the current entity
+            current_entity["end"] = token_end
+            current_entity["word"] += token_word
+        else:
+            # Handle misclassifications or gaps in tokens
+            if (
+                current_entity
+                and token_start == current_entity["end"]
+                and current_entity["label"] == entity_type
+            ):
+                current_entity["end"] = token_end
+                current_entity["word"] += token_word
+            else:
+                # Treat it as a new entity if the above conditions aren't met
+                if current_entity:
+                    merged_entities.append(current_entity)
+                current_entity = {
+                    "start": token_start,
+                    "end": token_end,
+                    "label": entity_type,
+                    "word": token_word,
+                }
+    # Append the last entity
+    if current_entity:
+        merged_entities.append(current_entity)
+    return merged_entities
+st.title("Legal NER")
+st.markdown("<hr>", unsafe_allow_html=True)
+uploaded_file = st.file_uploader("Upload a .txt file", type="txt")
+if uploaded_file is not None:
+    try:
+        # Read raw content of the file
+        raw_content = uploaded_file.read()
+        # Dynamically detect encoding
+        detected = detect(raw_content)
+        encoding = detected["encoding"]
+        if encoding is None:
+            raise ValueError("Unable to detect file encoding.")
+        # Decode file content with the detected encoding
+        lines = raw_content.decode(encoding).splitlines()
+        anonymize_mode = st.checkbox("Anonymize")
+        st.markdown(
+            "<hr style='margin-top: 10px; margin-bottom: 20px;'>",
+            unsafe_allow_html=True,
+        )
+        for line_number, line in enumerate(lines, start=1):
+            if line.strip():
+                results = ner(line)
+                merged_results = merge_entities(results)
+                if anonymize_mode:
+                    anonymized_text = anonymize_text(line, merged_results)
+                    st.markdown(f"{anonymized_text}", unsafe_allow_html=True)
+                else:
+                    colored_html = color_substrings(line, merged_results)
+                    st.markdown(f"{colored_html}", unsafe_allow_html=True)
+            else:
+                st.markdown("<br>", unsafe_allow_html=True)
+        if not anonymize_mode:
+            st.markdown(
+                '<div class="tip"><strong>Tip:</strong> Hover over the colored words to see its class.</div>',
+                unsafe_allow_html=True,
+            )
+    except Exception as e:
+        st.error(f"An error occurred while processing the file: {e}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+transformers
+torch
+matplotlib