# document_rag_preparation/tools/file_redaction.py
# Author: seanpedrickcase - "Initial commit" (0b2c988), 10.1 kB
from PIL import Image
from typing import List
import pandas as pd
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
from pdfminer.high_level import extract_pages
from tools.file_conversion import process_file
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
from pikepdf import Pdf, Dictionary, Name
from gradio import Progress
import time
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
from tools.helper_functions import get_file_path_end
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
import gradio as gr
def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Redact a document with the chosen method and save the result under "output/".

    Parameters:
        file_path: path of the file to redact. An empty value aborts with a message.
        image_paths: page images handed on to redact_image_pdf ("Image analysis" only).
        language: language code passed to the redaction functions.
        chosen_redact_entities: entity types to redact.
        in_redact_method: "Image analysis" or "Text analysis"; anything else aborts.
        in_allow_list: optional list of rows (each a list of terms) that should not
            be redacted. It is flattened into a single list before use.
        progress: gradio progress tracker.

    Returns:
        A 3-item tuple (message, output file paths, output file paths). Every exit
        path returns the same shape; on failure the path lists are empty.
    '''
    tic = time.perf_counter()

    out_file_paths = []

    # Always bind the flattened allow list: without this the name is undefined
    # whenever no allow list is supplied, and the redactor calls below fail.
    in_allow_list_flat = None
    if in_allow_list:
        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths, out_file_paths

    # NOTE(review): presumably the file name without directory/extension - confirm
    # against tools.helper_functions.get_file_path_end.
    file_path_without_ext = get_file_path_end(file_path)

    if in_redact_method == "Image analysis":
        # Analyse and redact image-based pdf or image
        pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)

        # Nothing to save if no page images came back (pdf_images[0] would raise).
        if not pdf_images:
            out_message = "No pages were found to redact."
            print(out_message)
            return out_message, out_file_paths, out_file_paths

        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
        # First image is the base document; the remaining pages are appended to it.
        pdf_images[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."

    elif in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", out_file_paths, out_file_paths

        # Analyse text-based pdf
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)
        out_message = "Text-based PDF successfully redacted and saved to file."

    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths, out_file_paths
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Run each page image of a document through the Presidio image redactor and
    return the list of redacted page images, in page order.

    Parameters:
        file_path: path of the source document; only used to create page images
            (via process_file) when image_paths is empty.
        image_paths: one entry per page. Each entry is passed directly to the
            redactor - NOTE(review): despite the name these look like already
            loaded images rather than paths; confirm against process_file.
        language: analysis language. 'en' is mapped to the OCR language 'eng';
            any other value is passed to the OCR step unchanged.
        chosen_redact_entities: entity types to redact.
        allow_list: optional terms that should not be redacted.
        progress: gradio progress tracker.

    Returns:
        List of redacted images, one per page.
    '''
    if not image_paths:
        out_message = "PDF does not exist as images. Converting pages to image"
        print(out_message)
        progress(0, desc=out_message)

        image_paths = process_file(file_path)

    number_of_pages = len(image_paths)

    out_message = "Redacting pages"
    print(out_message)
    progress(0.1, desc=out_message)

    # The engines and OCR language do not depend on the page, so build them once
    # instead of on every iteration.
    image_analyser = ImageAnalyzerEngine(nlp_analyser)
    engine = ImageRedactorEngine(image_analyser)
    ocr_lang = 'eng' if language == 'en' else language

    images = []

    for i in progress.tqdm(range(0, number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):

        print("Redacting page ", str(i + 1))

        image = image_paths[i]

        # Cover detected entities with black boxes
        redacted_image = engine.redact(
            image,
            fill=(0, 0, 0),
            ocr_kwargs={"lang": ocr_lang},
            allow_list=allow_list,
            ad_hoc_recognizers=None,
            language=language,
            entities=chosen_redact_entities,
            score_threshold=score_threshold,
        )

        images.append(redacted_image)

    return images
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Redact chosen entities from a pdf that is made up of multiple pages that are not images.

    For every page, the text of each text container is analysed, the bounding box
    of every character inside a detected entity is collected, and one black
    highlight annotation per character is attached to the page. A summary of all
    annotations is written to "output/annotations_made.csv".

    NOTE(review): this only adds highlight annotations on top of the page and
    replaces the page's existing annotation list; nothing here removes the
    underlying text from the page content.

    Parameters:
        filename: path of the PDF to redact.
        language: language code passed to the analyser.
        chosen_redact_entities: entity types to redact.
        allow_list: optional terms that should not be redacted.
        progress: gradio progress tracker.

    Returns:
        The opened pikepdf Pdf with annotations added; the caller saves it.
    '''
    pdf = Pdf.open(filename)

    # One summary frame per page with findings; concatenated once at the end.
    summary_frames = []

    for page_num, page in enumerate(progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages")):

        print("Page number is: ", page_num)

        analyzed_bounding_boxes = []

        for page_layout in extract_pages(filename, page_numbers=[page_num], maxpages=1):
            for text_container in page_layout:
                if not isinstance(text_container, LTTextContainer):
                    continue

                text_to_analyze = text_container.get_text()

                analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
                                                        language=language,
                                                        entities=chosen_redact_entities,
                                                        score_threshold=score_threshold,
                                                        return_decision_process=False,
                                                        allow_list=allow_list)

                # Keep every item of every text line (not only LTChar) so that list
                # positions line up with the result.start/result.end offsets used to
                # slice it below - presumably offsets into get_text(); confirm.
                characters = [char
                              for line in text_container
                              if isinstance(line, LTTextLine)
                              for char in line]

                print(analyzer_results)

                if analyzer_results and characters:
                    # Only real characters (LTChar) carry a bounding box to highlight.
                    analyzed_bounding_boxes.extend(
                        {"boundingBox": char.bbox, "result": result}
                        for result in analyzer_results
                        for char in characters[result.start:result.end]
                        if isinstance(char, LTChar)
                    )

        # Summarise this page's annotations. Gate on the boxes actually collected
        # rather than on the last container's results, so pages whose final
        # container had no findings are still recorded and an empty box list never
        # reaches the DataFrame column lookup.
        if analyzed_bounding_boxes:
            summary_frames.append(pd.DataFrame([
                {
                    "boundingBox": box["boundingBox"],
                    "result": box["result"],
                    "type": box["result"].entity_type,
                    "start": box["result"].start,
                    "end": box["result"].end,
                    "score": box["result"].score,
                    "page": page_num + 1,
                }
                for box in analyzed_bounding_boxes
            ]))

        annotations_on_page = [
            _highlight_annotation(box["boundingBox"], box["result"].entity_type)
            for box in analyzed_bounding_boxes
        ]

        print("For page number: ", page_num, " there are ", len(annotations_on_page), " annotations")
        page.Annots = pdf.make_indirect(annotations_on_page)

    if summary_frames:
        analyzed_bounding_boxes_df = pd.concat(summary_frames, axis=0)
    else:
        analyzed_bounding_boxes_df = pd.DataFrame()

    analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")

    return pdf


def _highlight_annotation(bounding_box, entity_type):
    '''
    Build a black, fully opaque highlight annotation covering one bounding box,
    labelled with the detected entity type.
    '''
    x0, y0, x1, y1 = bounding_box

    return Dictionary(
        Type=Name.Annot,
        Subtype=Name.Highlight,
        # Corner order: top-left, top-right, bottom-left, bottom-right
        QuadPoints=[x0, y1, x1, y1, x0, y0, x1, y0],
        Rect=[x0, y0, x1, y1],
        C=[0, 0, 0],
        CA=1, # Transparency
        T=entity_type
    )