import re

import fitz  # PyMuPDF: used for PDF text extraction
import nltk
import streamlit as st
from docx import Document  # python-docx
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Fetch the sentence-tokenizer data once; quiet=True avoids re-printing
# download progress on every Streamlit rerun.
nltk.download('punkt', quiet=True)
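# Assumed dependencies for this script: streamlit, transformers (with a
# torch backend, since tensors are requested as "pt"), PyMuPDF (imported
# as fitz), python-docx, and nltk.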
def sentence_tokenize(text):
    return nltk.sent_tokenize(text)

model_dir_large = 'edithram23/Redaction_Personal_info_v1'

@st.cache_resource
def load_model(model_dir):
    # Cache the tokenizer and model across Streamlit reruns so they are
    # downloaded and loaded into memory only once per server process.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return tokenizer, model

tokenizer_large, model_large = load_model(model_dir_large)

def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    # Very short fragments tend to confuse the model, so pad them.
    if len(text) < 30:
        text = text + '.'
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # The character count of the input serves as a loose upper bound on the
    # number of generated tokens.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Collapse whatever bracketed mask the model emits (e.g. [NAME]) into a
    # single uniform [redacted] marker.
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text

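# Illustrative usage (hypothetical output; the exact masks depend on the
# model's training):
#   mask_generation("My name is John and I live in Chennai.")
#   -> "my name is [redacted] and i live in [redacted]."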
def read_pdf(file):
    # Open the uploaded file from its in-memory bytes rather than a path.
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page in pdf_document:
        text += page.get_text()
    return text

def read_docx(file):
    doc = Document(file)
    # Body paragraphs only; tables, headers, and footers are not extracted.
    return "\n".join(para.text for para in doc.paragraphs)

def read_txt(file):
    # Assumes the uploaded text file is UTF-8 encoded.
    return file.read().decode("utf-8")

def process_file(file):
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."

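# Note: file.type is the MIME type reported by the browser for the upload;
# the long "application/vnd..." string is the standard MIME type for .docx.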
def redact_text(text):
    # Redact sentence by sentence so each chunk stays within the model's
    # input limits, then rejoin the redacted sentences line by line.
    sentences = sentence_tokenize(text)
    return '\n'.join(mask_generation(sentence) for sentence in sentences)

st.title("File Reader")
user = st.text_input("Input Text to Redact")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if user:
    st.text_area("OUTPUT", redact_text(user), height=400)

if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    processed_text = redact_text(file_contents)
    st.text_area("OUTPUT", processed_text, height=400)

    # The download button stays inside this branch: processed_text only
    # exists once a file has been uploaded and processed.
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )