umarigan's picture
Update app.py
1c9f94a verified
raw
history blame
2.82 kB
import streamlit as st
import pandas as pd
import spacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import PyPDF2
import docx
import io
import re
# ... [Previous functions remain unchanged] ...
def create_mask_dict(entities, excluded_groups=('CARDINAL', 'EVENT')):
    """Map each distinct entity surface form to a numbered placeholder.

    Placeholders have the form ``<entity_group>_<n>``, where ``n`` counts the
    distinct words seen so far within that group (starting at 1), in the
    order the entities are supplied.

    Args:
        entities: Iterable of dicts, each providing an ``'entity_group'`` and
            a ``'word'`` key.
        excluded_groups: Iterable of entity-group names that are never
            masked. Defaults to ``('CARDINAL', 'EVENT')``, the groups that
            were previously hard-coded.

    Returns:
        A dict mapping every masked word to its placeholder. A word keeps the
        placeholder assigned at its first occurrence, even if it shows up
        again later under a different entity group.
    """
    excluded = frozenset(excluded_groups)
    mask_dict = {}
    entity_counters = {}
    for entity in entities:
        group = entity['entity_group']
        if group in excluded:
            continue
        word = entity['word']
        if word in mask_dict:
            # Repeated mention: reuse the existing placeholder and do not
            # advance the per-group counter.
            continue
        entity_counters[group] = entity_counters.get(group, 0) + 1
        mask_dict[word] = f"{group}_{entity_counters[group]}"
    return mask_dict
def create_masked_text(input_text, mask_dict):
    """Return ``input_text`` with every key of ``mask_dict`` replaced by its mask.

    Only whole-token occurrences are replaced: a match must not be directly
    preceded or followed by a word character. Lookarounds are used instead of
    ``\\b`` because ``\\b`` never matches next to an entity that itself starts
    or ends with a non-word character (e.g. ``$100``, ``50%``, ``U.S.``),
    which would leave such entities unmasked.

    Args:
        input_text: The text to mask.
        mask_dict: Mapping of surface form -> placeholder. Empty keys are
            ignored.

    Returns:
        The masked text.
    """
    masked_text = input_text
    # Longest keys first, so a multi-word entity is masked before any shorter
    # entity contained inside it gets a chance to split it up.
    ordered = sorted(mask_dict.items(), key=lambda item: len(item[0]), reverse=True)
    for word, mask in ordered:
        if not word:
            # An empty pattern would match between characters all over the
            # text and scatter the mask everywhere.
            continue
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # A callable replacement inserts the mask literally; a plain string
        # would be parsed as a template (backslashes, group references).
        masked_text = re.sub(pattern, lambda _match, _mask=mask: _mask, masked_text)
    return masked_text
# Streamlit entry point: when "Run" is clicked and there is input text, run
# NER over the text chunk by chunk, build the masking dictionary, and render
# the masked text, the dictionary, a table and a highlighted view.
Run_Button = st.button("Run")
if Run_Button and input_text:
    ner_pipeline = setModel(model_checkpoint, aggregation)

    # Chunk the input text
    chunks = chunk_text(input_text)

    # Process each chunk, shifting entity offsets so they index into the
    # full input text rather than into the individual chunk.
    # NOTE(review): the "+ 1" assumes consecutive chunks are separated by
    # exactly one character in input_text (i.e. ' '.join(chunks) lines up
    # with the original text) -- confirm against chunk_text.
    all_outputs = []
    offset = 0  # running length of all previous chunks plus separators
    for chunk in chunks:
        output = ner_pipeline(chunk)
        if offset:
            for entity in output:
                entity['start'] += offset
                entity['end'] += offset
        all_outputs.extend(output)
        # Same value as len(' '.join(chunks[:i + 1])) + 1, without
        # re-joining every previous chunk on each iteration.
        offset += len(chunk) + 1

    # Combine entities
    output_comb = entity_comb(all_outputs)

    # Create mask dictionary
    mask_dict = create_mask_dict(output_comb)

    # Create masked text
    masked_text = create_masked_text(input_text, mask_dict)

    st.subheader("Masked Text")
    st.text(masked_text)

    st.subheader("Masking Dictionary")
    st.json(mask_dict)

    # Create a DataFrame for display
    df = pd.DataFrame(list(mask_dict.items()), columns=['Original', 'Masked'])
    st.subheader("Masking Table")
    st.dataframe(df)

    # Optional: Display original text with highlights
    st.subheader("Original Text with Highlights")
    # Entity groups left out of the highlights; this is the same literal
    # list the script has always used here.
    unmasked_groups = ['CARDINAL', 'EVENT']
    spacy_display = {"ents": [], "text": input_text, "title": None}
    for entity in output_comb:
        if entity['entity_group'] in unmasked_groups:
            continue
        # Label each highlight with the placeholder that replaces it.
        label = mask_dict[entity['word']]
        spacy_display["ents"].append(
            {"start": entity["start"], "end": entity["end"], "label": label}
        )

    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
    st.write(html, unsafe_allow_html=True)