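"""Streamlit app that redacts personal information from uploaded files.

Two seq2seq redaction models are used: a small one for short sentences and a
larger one for longer passages. Launch with `streamlit run <this file>` (the
script's filename depends on how the repo names it).
"""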
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
import streamlit as st
import fitz  # PyMuPDF
from docx import Document
import re
import nltk
nltk.download('punkt')  # newer NLTK releases may additionally require 'punkt_tab'

def sentence_tokenize(text):
    """Split text into sentences so each chunk fits the models' input limits."""
    return nltk.sent_tokenize(text)

@st.cache_resource
def load_model(model_dir):
    """Load a tokenizer/model pair once and cache it across Streamlit reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return tokenizer, model

model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large, model_large = load_model(model_dir_large)

model_dir_small = 'edithram23/Redaction'
tokenizer_small, model_small = load_model(model_dir_small)

def small(text, model=model_small, tokenizer=tokenizer_small):
    """Redact a short passage (under 200 characters) with the small model."""
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=256, truncation=True, return_tensors="pt")
    # max_length counts tokens; the input's character count is a loose upper bound.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Normalize every bracketed mask the model emits to a single [redacted] tag.
    redacted_text = re.sub(r'\[.*?\]', '[redacted]', predicted_title)
    return redacted_text


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    """Redact a sentence, delegating short inputs to the small model."""
    if len(text) < 200:
        return small(text + '.')
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # max_length counts tokens; the input's character count is a loose upper bound.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Normalize every bracketed mask the model emits to a single [redacted] tag.
    redacted_text = re.sub(r'\[.*?\]', '[redacted]', predicted_title)
    return redacted_text
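# Illustrative only -- actual output depends on the models and on sampling:
#   mask_generation("John Doe lives at 12 Baker Street.")
#   -> "[redacted] lives at [redacted]."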



def read_pdf(file):
    """Extract the plain text of every page in an uploaded PDF."""
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page in pdf_document:  # fitz documents are iterable over their pages
        text += page.get_text()
    return text

def read_docx(file):
    """Join the text of all paragraphs in an uploaded .docx file."""
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs)

def read_txt(file):
    """Decode an uploaded plain-text file as UTF-8."""
    return file.read().decode("utf-8")

def process_file(file):
    """Dispatch on the uploaded file's MIME type and return its text."""
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."

st.title("Redaction")
# user = st.text_input("Input Text to Redact")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
# if(user != ''):
#     token = sentence_tokenize(user)
#     final=''
#     for i in range(0, len(token)):
#         final+=mask_generation(token[i])+'\n'
#     st.text_area("OUTPUT",final,height=400)
if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)
    # Redact sentence by sentence so each chunk stays within the model's input window.
    processed_text = '\n'.join(mask_generation(sentence) for sentence in sentences)
    st.text_area("OUTPUT", processed_text, height=400)

    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )