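"""Streamlit app that redacts personal information from uploaded PDF, DOCX, and TXT files.

PDF pages are redacted in place (black boxes over detected spans); DOCX/TXT content is
rewritten sentence by sentence with a mask-generation model. Detection combines a
Presidio analyzer with a custom address pattern and a BERT token classifier.
"""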
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
import fitz  # PyMuPDF
from docx import Document
import re
import nltk
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

nltk.download('punkt')


def sentence_tokenize(text):
    """Split text into sentences with NLTK."""
    sentences = nltk.sent_tokenize(text)
    return sentences


# Seq2seq model used to rewrite sentences with personal details masked.
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)

# BERT token-classification pipeline used to flag personal-information spans.
pipe1 = pipeline("token-classification", model="edithram23/new-bert-v2")
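# Note: the tokenizer, model, and pipeline above are fetched from the Hugging Face Hub
# on first run and loaded from the local cache afterwards.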

# Presidio analyzer with a custom pattern recognizer for street addresses.
analyzer = AnalyzerEngine()

address_pattern = Pattern(
    name="address",
    regex=r"\d+\s\w+\s(?:street|st|road|rd|avenue|ave|lane|ln|drive|dr|blvd|boulevard)\s*\w*",
    score=0.5,
)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])
analyzer.registry.add_recognizer(address_recognizer)
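# The custom pattern targets simple numeric street addresses such as
# "10 downing street" or "42 park ave"; other address formats are not covered.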


def combine_words(entities):
    """Merge word-piece tokens from the token classifier back into whole words."""
    combined_entities = []
    current_entity = None

    for entity in entities:
        if current_entity:
            if current_entity['end'] == entity['start']:
                # Same word split into word pieces: glue without a space.
                current_entity['word'] += entity['word'].replace('##', '')
                current_entity['end'] = entity['end']
            elif current_entity['end'] + 1 == entity['start']:
                # Neighbouring word separated by one character: join with a space.
                current_entity['word'] += ' ' + entity['word'].replace('##', '')
                current_entity['end'] = entity['end']
            else:
                # Gap between spans: close the current entity and start a new one.
                combined_entities.append(current_entity)
                current_entity = entity.copy()
                current_entity['word'] = current_entity['word'].replace('##', '')
        else:
            current_entity = entity.copy()
            current_entity['word'] = current_entity['word'].replace('##', '')

    if current_entity:
        combined_entities.append(current_entity)

    return combined_entities
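
# Illustrative example: word pieces such as {'word': 'Ra', 'end': 2} followed by
# {'word': '##hul', 'start': 2} are merged into a single entity whose word is 'Rahul'.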


def words_red_bert(text):
    """Collect the words the BERT token classifier labels as personal information."""
    final = []
    sentences = sentence_tokenize(text)
    for sentence in sentences:
        merged = combine_words(pipe1(sentence))
        for entity in merged:
            if entity['entity'] != 'none' and len(entity['word']) > 1 and entity['word'] != ', ':
                final.append(entity['word'])
    return final


def extract_entities(text):
    """Run the Presidio analyzer and collect the detected entity strings."""
    entities = {
        "NAME": [],
        "PHONE_NUMBER": [],
        "EMAIL": [],
        "ADDRESS": [],
        "LOCATION": [],
        "IN_AADHAAR": [],
    }
    output = []

    results = analyzer.analyze(text=text, language='en')

    for result in results:
        if result.entity_type == "PERSON":
            entities["NAME"].append(text[result.start:result.end])
            output += [text[result.start:result.end]]
        elif result.entity_type == "PHONE_NUMBER":
            entities["PHONE_NUMBER"].append(text[result.start:result.end])
            output += [text[result.start:result.end]]
        elif result.entity_type == "EMAIL_ADDRESS":
            entities["EMAIL"].append(text[result.start:result.end])
            output += [text[result.start:result.end]]
        elif result.entity_type == "ADDRESS":
            entities["ADDRESS"].append(text[result.start:result.end])
            output += [text[result.start:result.end]]
        elif result.entity_type == 'LOCATION':
            entities['LOCATION'].append(text[result.start:result.end])
            output += [text[result.start:result.end]]
        elif result.entity_type == 'IN_AADHAAR':
            entities['IN_AADHAAR'].append(text[result.start:result.end])
            output += [text[result.start:result.end]]

    return entities, output
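
# PERSON, PHONE_NUMBER, EMAIL_ADDRESS, LOCATION, and IN_AADHAAR come from Presidio's
# built-in recognizers (IN_AADHAAR assumes a Presidio release that includes the
# India-specific recognizers); ADDRESS comes from the custom pattern registered above.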


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    """Rewrite a sentence with the seq2seq model, masking personal details as [redacted]."""
    if len(text) < 90:
        # Pad very short inputs with an extra period before masking.
        text = text + '.'

    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Normalise anything the model wraps in square brackets to a single [redacted] tag.
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text
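
# Illustrative expectation (depends on the fine-tuned model): a sentence such as
# "my name is john and my number is 9876543210" should come back with the personal
# details replaced by [redacted].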


def redact_text(page, text):
    """Black out every occurrence of `text` on a PDF page."""
    text_instances = page.search_for(text)
    for inst in text_instances:
        page.add_redact_annot(inst, fill=(0, 0, 0))
    # apply_redactions() permanently removes the underlying text, not just its appearance.
    page.apply_redactions()


def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text, pdf_document


def read_docx(file):
    doc = Document(file)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text


def read_txt(file):
    text = file.read().decode("utf-8")
    return text


def process_file(file):
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file), None
    elif file.type == "text/plain":
        return read_txt(file), None
    else:
        return "Unsupported file type.", None

st.title("Redaction")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file is not None:
    file_contents, pdf_document = process_file(uploaded_file)
    if pdf_document:
        # PDF: detect entities per sentence and black them out in place.
        for pg in pdf_document:
            text = pg.get_text('text')
            st.text_area(f"Page {pg.number + 1}", text)
            sentences = sentence_tokenize(text)
            for sent in sentences:
                entities, words_out = extract_entities(sent)
                bert_words = words_red_bert(sent)
                # Split multi-line matches and pool Presidio + BERT detections.
                new = []
                for w in words_out:
                    new += w.split('\n')
                new += bert_words
                words_out = [i for i in new if len(i) > 2]
                # Redact longer strings first so shorter substrings don't pre-empt them.
                words_out = sorted(words_out, key=len, reverse=True)
                print(words_out)
                for word in words_out:
                    redact_text(pg, word)

        output_pdf = "output_redacted.pdf"
        pdf_document.save(output_pdf)

        with open(output_pdf, "rb") as file:
            st.download_button(
                label="Download Processed PDF",
                data=file,
                file_name="processed_file.pdf",
                mime="application/pdf",
            )
    else:
        # DOCX/TXT: rewrite the text sentence by sentence with the mask-generation model.
        sentences = sentence_tokenize(file_contents)
        processed_text = ''
        for sent in sentences:
            processed_text += mask_generation(sent) + '\n'
        st.text_area("OUTPUT", processed_text, height=400)
        st.download_button(
            label="Download Processed File",
            data=processed_text,
            file_name="processed_file.txt",
            mime="text/plain",
        )
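
# Assumed usage: save this script as e.g. app.py and launch it with `streamlit run app.py`.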