Spaces:
Runtime error
Runtime error
File size: 2,821 Bytes
e6bfe5c d25abcf ae36be9 1c9f94a e6bfe5c 1c9f94a ae36be9 8c045a9 1c9f94a 8bd5af2 1c9f94a 8bd5af2 81805e8 e6bfe5c 81805e8 1b711d9 c480c1f 8bb7ed4 c480c1f 8c045a9 3547909 1c9f94a e6bfe5c 1c9f94a 20bef1c 1c9f94a 81805e8 4874aa0 8c045a9 1c9f94a c480c1f 8ab4c34 1c9f94a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import streamlit as st
import pandas as pd
import spacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import PyPDF2
import docx
import io
import re
# ... [Previous functions remain unchanged] ...
def create_mask_dict(entities, excluded_groups=('CARDINAL', 'EVENT')):
    """Build a mapping from each distinct entity text to a placeholder label.

    Placeholders have the form ``"<entity_group>_<n>"``, where ``n`` counts
    the distinct words seen so far for that entity group, starting at 1.
    A word that appears more than once keeps the placeholder it was given
    on its first occurrence.

    Args:
        entities: Iterable of dicts, each providing at least the keys
            ``'entity_group'`` and ``'word'``.
        excluded_groups: Entity groups that must not be masked. Defaults to
            ``('CARDINAL', 'EVENT')``.

    Returns:
        dict mapping original word -> placeholder, in first-seen order.
    """
    mask_dict = {}
    entity_counters = {}
    for entity in entities:
        group = entity['entity_group']
        word = entity['word']
        # Skip unmasked groups and words that already have a placeholder,
        # so repeated mentions share one label and do not bump the counter.
        if group in excluded_groups or word in mask_dict:
            continue
        entity_counters[group] = entity_counters.get(group, 0) + 1
        mask_dict[word] = f"{group}_{entity_counters[group]}"
    return mask_dict
def create_masked_text(input_text, mask_dict):
    """Return ``input_text`` with every key of ``mask_dict`` replaced by its mask.

    Keys are processed longest first, so a multi-word entity is replaced
    before any shorter entity contained in it. A key only matches as a whole
    token: where its first or last character is a word character, the
    neighbouring character in the text must not be one.

    Args:
        input_text: The text to mask.
        mask_dict: Mapping of original text -> placeholder string.

    Returns:
        The masked text. Empty keys are ignored.
    """
    masked_text = input_text
    ordered = sorted(mask_dict.items(), key=lambda item: len(item[0]), reverse=True)
    for word, mask in ordered:
        if not word:
            # An empty key would match at every position in the text.
            continue
        # Only anchor an edge that is a word character. A plain ``\b`` next
        # to punctuation (e.g. the trailing dot of "U.S.") would demand an
        # adjacent word character and miss the entity in normal text.
        prefix = r'(?<!\w)' if re.match(r'\w', word[0]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', word[-1]) else ''
        pattern = prefix + re.escape(word) + suffix
        # A callable replacement inserts the mask literally; a string would
        # have its backslash escapes and group references interpreted.
        masked_text = re.sub(pattern, lambda _match, _mask=mask: _mask, masked_text)
    return masked_text
# Main UI flow: on "Run", detect entities in the input text, mask them, and
# show the masked text, the mask mapping, and a highlighted original.
Run_Button = st.button("Run")
if Run_Button and input_text:
    ner_pipeline = setModel(model_checkpoint, aggregation)

    # Chunk the input text and run NER on each chunk separately.
    chunks = chunk_text(input_text)

    all_outputs = []
    # Running character offset of the current chunk within the full text.
    # NOTE(review): this assumes the chunks, joined by single spaces,
    # reproduce input_text — confirm against chunk_text.
    offset = 0
    for chunk in chunks:
        output = ner_pipeline(chunk)
        # Entities in chunks after the first carry chunk-relative positions;
        # shift them so they index into the full text.
        if offset:
            for entity in output:
                entity['start'] += offset
                entity['end'] += offset
        all_outputs.extend(output)
        # +1 accounts for the separating space before the next chunk.
        offset += len(chunk) + 1

    # Combine entities
    output_comb = entity_comb(all_outputs)

    # Create mask dictionary and masked text
    mask_dict = create_mask_dict(output_comb)
    masked_text = create_masked_text(input_text, mask_dict)

    st.subheader("Masked Text")
    st.text(masked_text)

    st.subheader("Masking Dictionary")
    st.json(mask_dict)

    # Create a DataFrame for display
    df = pd.DataFrame(list(mask_dict.items()), columns=['Original', 'Masked'])
    st.subheader("Masking Table")
    st.dataframe(df)

    # Optional: Display original text with highlights
    st.subheader("Original Text with Highlights")
    highlighted = [
        {"start": entity["start"], "end": entity["end"], "label": mask_dict[entity['word']]}
        for entity in output_comb
        if entity['entity_group'] not in ['CARDINAL', 'EVENT']
    ]
    # displaCy's manual mode expects entities ordered by start offset.
    highlighted.sort(key=lambda ent: ent["start"])
    spacy_display = {"ents": highlighted, "text": input_text, "title": None}
    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
    st.write(html, unsafe_allow_html=True)