File size: 2,821 Bytes
e6bfe5c
 
d25abcf
ae36be9
 
 
 
1c9f94a
e6bfe5c
1c9f94a
ae36be9
8c045a9
 
 
 
 
 
 
 
 
 
 
 
1c9f94a
 
8bd5af2
1c9f94a
 
8bd5af2
 
81805e8
e6bfe5c
81805e8
1b711d9
 
c480c1f
 
 
 
 
 
8bb7ed4
c480c1f
 
 
 
 
 
 
 
 
 
 
 
 
8c045a9
 
3547909
1c9f94a
 
e6bfe5c
1c9f94a
 
20bef1c
1c9f94a
 
 
 
 
 
 
 
 
 
81805e8
4874aa0
8c045a9
1c9f94a
 
c480c1f
8ab4c34
1c9f94a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import streamlit as st
import pandas as pd
import spacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import PyPDF2
import docx
import io
import re

# ... [Previous functions remain unchanged] ...

def create_mask_dict(entities):
    """Build a mapping from entity surface form to an anonymized label.

    Entities in the groups 'CARDINAL' and 'EVENT' are skipped, and each
    distinct word is assigned exactly one label, in first-seen order.

    Args:
        entities: NER pipeline output — a list of dicts, each with at
            least the keys 'entity_group' and 'word'.

    Returns:
        dict mapping each entity word to a label such as 'PER_1' or
        'ORG_2'; the numeric suffix is a per-group running counter.
    """
    mask_dict = {}
    entity_counters = {}
    for entity in entities:
        group = entity['entity_group']
        word = entity['word']
        # Guard clause: skip non-identifying groups and words that
        # already received a mask (keeps labels stable per word).
        if group in ('CARDINAL', 'EVENT') or word in mask_dict:
            continue
        # Per-group running counter: PER_1, PER_2, ORG_1, ...
        entity_counters[group] = entity_counters.get(group, 0) + 1
        mask_dict[word] = f"{group}_{entity_counters[group]}"
    return mask_dict

def create_masked_text(input_text, mask_dict):
    """Return *input_text* with every masked word replaced by its label.

    Entries are substituted longest-first so that an entity which is a
    substring of a longer one (e.g. 'Alice' inside 'Alice Smith') cannot
    clobber the longer match; \\b anchors restrict matches to whole words.
    """
    result = input_text
    ordered = sorted(mask_dict.items(), key=lambda item: len(item[0]), reverse=True)
    for original, label in ordered:
        pattern = r'\b' + re.escape(original) + r'\b'
        result = re.sub(pattern, label, result)
    return result

Run_Button = st.button("Run")

# Main action: run NER over the user's text, anonymize recognized entities,
# and render the results.  Relies on helpers defined elsewhere in this file
# (setModel, chunk_text, entity_comb) and on widgets that bind
# model_checkpoint, aggregation and input_text.
if Run_Button and input_text:
    ner_pipeline = setModel(model_checkpoint, aggregation)
    
    # Chunk the input text — presumably to respect the model's maximum
    # sequence length; see chunk_text for the actual splitting policy.
    chunks = chunk_text(input_text)
    
    # Process each chunk independently through the NER pipeline.
    all_outputs = []
    for i, chunk in enumerate(chunks):
        output = ner_pipeline(chunk)
        
        # Adjust start and end positions for entities in chunks after the
        # first so that offsets refer to the full input text.
        # NOTE(review): the "+ 1" assumes chunks were split on single
        # spaces and re-join with ' ' — confirm against chunk_text.
        if i > 0:
            offset = len(' '.join(chunks[:i])) + 1
            for entity in output:
                entity['start'] += offset
                entity['end'] += offset
        
        all_outputs.extend(output)
    
    # Combine entities (entity_comb is defined elsewhere; presumably merges
    # adjacent/duplicate spans across chunk boundaries — verify there).
    output_comb = entity_comb(all_outputs)
    
    # Create mask dictionary: entity word -> anonymized label (e.g. PER_1).
    mask_dict = create_mask_dict(output_comb)

    # Create masked text by substituting each entity word with its label.
    masked_text = create_masked_text(input_text, mask_dict)
    
    st.subheader("Masked Text")
    st.text(masked_text)

    st.subheader("Masking Dictionary")
    st.json(mask_dict)

    # Create a DataFrame showing the original -> masked mapping as a table.
    df = pd.DataFrame([(word, mask) for word, mask in mask_dict.items()], columns=['Original', 'Masked'])
    st.subheader("Masking Table")
    st.dataframe(df)

    # Optional: Display original text with entity highlights using
    # displaCy's manual mode (entities passed as dicts, not a spaCy Doc).
    st.subheader("Original Text with Highlights")
    spacy_display = {"ents": [], "text": input_text, "title": None}
    for entity in output_comb:
        # Same filter as create_mask_dict, so every word here has a label.
        if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
            label = mask_dict[entity['word']]
            spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": label})
    
    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
    st.write(html, unsafe_allow_html=True)