|
from typing import List
import time

import pandas as pd
from PIL import Image
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
from pikepdf import Pdf, Dictionary, Name
import gradio as gr
from gradio import Progress

from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
from tools.helper_functions import get_file_path_end
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
|
|
def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Run the chosen redaction method ("Image analysis" or "Text analysis") over the supplied file
    and return an output message together with the paths of the redacted output files.
    '''

    tic = time.perf_counter()

    out_message = ''
    out_file_paths = []

    # Flatten the allow list, which arrives as a list of lists, into a single list of terms
    in_allow_list_flat = []
    if in_allow_list:
        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

    if file_path:
        file_path_without_ext = get_file_path_end(file_path)
    else:
        out_message = "No file selected"
        print(out_message)
        # Return the same number of outputs as the successful path so the outputs stay consistent
        return out_message, out_file_paths, out_file_paths
|
|
|
    if in_redact_method == "Image analysis":
        # Redact each page as an image, then combine the redacted page images into a single PDF
        pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
        pdf_images[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."
|
|
|
    elif in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", None, None

        # Redact the text layer directly and save the annotated PDF
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)

        out_message = "Text-based PDF successfully redacted and saved to file."
|
|
|
    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths, out_file_paths
|
|
|
    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths, out_file_paths
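
# A minimal usage sketch for choose_and_run_redactor. The file name, entity names and allow-list
# term below are illustrative assumptions, not values defined elsewhere in this module:
#
#   message, file_paths, _ = choose_and_run_redactor(
#       "example_doc.pdf", [], "en",
#       ["PERSON", "EMAIL_ADDRESS"],
#       "Text analysis",
#       in_allow_list=[["Example Ltd"]],
#   )
#   print(message)     # success message plus "Time taken: ..."
#   print(file_paths)  # e.g. ["output/example_doc_result_as_text.pdf"]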
|
|
|
|
|
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Take a path to a document and a list of its page images, run each page through the Presidio
    ImageAnalyzerEngine/ImageRedactorEngine, and return the redacted page images.
    '''

    if not image_paths:
        out_message = "PDF does not exist as images. Converting pages to image"
        print(out_message)
        progress(0, desc=out_message)

        image_paths = process_file(file_path)
|
|
|
|
|
|
|
|
|
    images = []
    number_of_pages = len(image_paths)

    out_message = "Redacting pages"
    print(out_message)
    progress(0.1, desc=out_message)

    # Create the analyser and redactor engines once and reuse them for every page
    image_analyser = ImageAnalyzerEngine(nlp_analyser)
    engine = ImageRedactorEngine(image_analyser)

    # The OCR step expects a Tesseract-style three-letter language code, so map English explicitly
    if language == 'en':
        ocr_lang = 'eng'
    else:
        ocr_lang = language

    for i in progress.tqdm(range(0, number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
        print("Redacting page ", str(i + 1))

        image = image_paths[i]
|
|
|
|
|
|
|
        # Redact the page image, blacking out any detected entities above the score threshold
        redacted_image = engine.redact(image,
                                       fill=(0, 0, 0),
                                       ocr_kwargs={"lang": ocr_lang},
                                       allow_list=allow_list,
                                       ad_hoc_recognizers=None,
                                       **{
                                           "language": language,
                                           "entities": chosen_redact_entities,
                                           "score_threshold": score_threshold
                                       },
                                       )

        images.append(redacted_image)

    return images
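
# A minimal sketch of calling redact_image_pdf directly and saving the redacted page images as a
# single PDF. The input file name is an illustrative assumption:
#
#   pages = redact_image_pdf("example_scan.pdf", [], "en", ["PERSON", "PHONE_NUMBER"])
#   pages[0].save("output/example_scan_redacted.pdf", "PDF", resolution=100.0, save_all=True, append_images=pages[1:])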
|
|
|
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Redact chosen entities from a text-based PDF (one whose pages are not images) by adding
    black highlight annotations over the matching text on each page.
    '''

    combined_analyzer_results = []
    analyser_explanations = []
    annotations_all_pages = []
    analyzed_bounding_boxes_df = pd.DataFrame()

    pdf = Pdf.open(filename)

    page_num = 0
|
|
|
    for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
        print("Page number is: ", page_num)

        annotations_on_page = []
        analyzed_bounding_boxes = []

        # Extract the text layout for just this page
        for page_layout in extract_pages(filename, page_numbers=[page_num], maxpages=1):
            analyzer_results = []
|
|
|
            for text_container in page_layout:
                if isinstance(text_container, LTTextContainer):
                    text_to_analyze = text_container.get_text()

                    analyzer_results = []
                    characters = []

                    # Run the Presidio analyser over the text of this container
                    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
                                                            language=language,
                                                            entities=chosen_redact_entities,
                                                            score_threshold=score_threshold,
                                                            return_decision_process=False,
                                                            allow_list=allow_list)

                    # Collect the individual characters so analyser offsets can be mapped to positions on the page
                    characters = [char
                                  for line in text_container
                                  if isinstance(line, LTTextLine)
                                  for char in line]

                    print(analyzer_results)

                    if len(analyzer_results) > 0 and len(characters) > 0:
                        # Record one bounding box per character covered by each analyser result
                        analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
                        combined_analyzer_results.extend(analyzer_results)
|
|
|
            if len(analyzer_results) > 0:
                # Build a summary dataframe of the annotations to be made, parsing each analyser
                # result's string form into separate type/start/end/score columns
                analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
                analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",", expand=True).replace(".*: ", "", regex=True)
                analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
                analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis=1)
                analyzed_bounding_boxes_df_new['page'] = page_num + 1
                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis=0)
|
|
|
            # Add a black highlight annotation over each character box identified by the analyser
            for analyzed_bounding_box in analyzed_bounding_boxes:
                bounding_box = analyzed_bounding_box["boundingBox"]
                annotation = Dictionary(
                    Type=Name.Annot,
                    Subtype=Name.Highlight,
                    QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
                    Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
                    C=[0, 0, 0],
                    CA=1,
                    T=analyzed_bounding_box["result"].entity_type
                )
                annotations_on_page.append(annotation)

            annotations_all_pages.append(annotations_on_page)

            print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
            page.Annots = pdf.make_indirect(annotations_on_page)
|
|
|
        page_num += 1

    # Save a summary of the annotations made across all pages
    analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
|
|
|
return pdf |
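

if __name__ == "__main__":
    # A minimal sketch of wiring choose_and_run_redactor into a Gradio Blocks app, so that the
    # gr.Progress tracking in the functions above is active. The component layout, labels and
    # entity choices here are illustrative assumptions, not this repository's actual app definition.
    with gr.Blocks() as demo:
        in_file = gr.File(label="Document to redact", type="filepath")
        images_state = gr.State([])   # page images are generated on demand by redact_image_pdf
        lang_state = gr.State("en")
        in_entities = gr.Dropdown(["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"], multiselect=True, value=["PERSON"], label="Entities to redact")
        in_method = gr.Radio(["Image analysis", "Text analysis"], value="Text analysis", label="Redaction method")
        in_allow = gr.Dataframe(headers=["Allowed terms"], datatype="str", type="array", label="Allow list")
        out_text = gr.Textbox(label="Result")
        out_files = gr.File(label="Redacted files", file_count="multiple")
        files_state = gr.State([])

        gr.Button("Redact").click(
            fn=choose_and_run_redactor,
            inputs=[in_file, images_state, lang_state, in_entities, in_method, in_allow],
            outputs=[out_text, out_files, files_state],
        )

    demo.launch()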