|
from pdf2image import convert_from_path, pdfinfo_from_path |
|
from tools.helper_functions import get_file_path_end |
|
from PIL import Image |
|
import os |
|
from gradio import Progress |
|
from typing import List |
|
|
|
def is_pdf_or_image(filename):
    """
    Report whether a file name looks like a PDF or a supported image file.

    The check is case-insensitive and only inspects the end of the name.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or ".png", False otherwise.
    """
    accepted_suffixes = (".pdf", ".jpg", ".jpeg", ".png")
    return filename.lower().endswith(accepted_suffixes)
|
|
|
def is_pdf(filename):
    """
    Report whether a file name has a PDF extension (case-insensitive).

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", False otherwise.
    """
    lowered_name = filename.lower()
    return lowered_name.endswith(".pdf")
|
|
|
|
|
|
|
|
|
def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
    """
    Convert a PDF into a list of page images, converting one page per call.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        progress: Gradio Progress tracker; its tqdm wrapper drives the page loop.

    Returns:
        list: The images returned by convert_from_path, in page order. The loop
        stops early at the first page for which no image is returned.
    """
    total_pages = pdfinfo_from_path(pdf_path)['Pages']
    print("Number of pages in PDF: ", str(total_pages))

    page_images = []
    page_indices = range(total_pages)

    for page_index in progress.tqdm(page_indices, total=total_pages, unit="pages", desc="Converting pages"):
        print("Current page: ", str(page_index))

        # The loop index is zero-based; convert_from_path is given the 1-based page number.
        page_number = page_index + 1
        rendered = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)

        # Nothing came back for this page, so stop converting.
        if not rendered:
            break

        page_images += rendered

    print("PDF has been converted to images.")

    return page_images
|
|
|
|
|
|
|
def process_file(file_path):
    """
    Load a file as a list of images, chosen by its (case-insensitive) extension.

    Args:
        file_path (str): Path to an image file (.jpg, .jpeg, .png) or a PDF file.

    Returns:
        list: A one-element list holding the opened image for image files, the
        result of convert_pdf_to_images for a PDF, or [''] when the extension
        is not supported.
    """
    _, raw_extension = os.path.splitext(file_path)
    extension = raw_extension.lower()

    # PDFs are expanded into their page images.
    if extension == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    # Image files are opened directly and wrapped in a list.
    if extension in ('.jpg', '.jpeg', '.png'):
        print(f"{file_path} is an image file.")
        return [Image.open(file_path)]

    print(f"{file_path} is not an image or PDF file.")
    return ['']
|
|
|
def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
    """
    Validate an uploaded file against the chosen redaction method and prepare it for redaction.

    Args:
        file_path (str): Path of the uploaded file. An empty or None value means
            that no file was selected.
        in_redact_method (str): Either "Image analysis" or "Text analysis".
        in_allow_list (List[List[str]], optional): Accepted for interface
            compatibility only; this function does not use it, so None is fine.

    Returns:
        tuple: (message, prepared_input), where:
            - on success, message is '' and prepared_input is the list returned
              by process_file (image analysis) or the unchanged file path
              (text analysis);
            - when no file is selected, message is "No file selected" and
              prepared_input is an empty list;
            - when the file type or the redaction method is not acceptable,
              message describes the problem and prepared_input is None.
    """
    # Check for a missing file first so this path never depends on the other arguments.
    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, []

    if in_redact_method == "Image analysis":
        if not is_pdf_or_image(file_path):
            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
        return '', process_file(file_path)

    if in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", None
        return '', file_path

    # An unrecognised method has nothing to prepare; report it in the same
    # (message, None) form as the other validation failures instead of crashing.
    return f"Unknown redaction method: {in_redact_method}", None
|
|
|
|
|
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    """
    Save an image-based PDF version of an already-produced output file.

    The first path in out_text_file_path is passed to process_file, and the
    resulting images are written as one multi-page PDF inside the "output/" folder.

    Args:
        in_file_path (str): Path of the original input file. The value returned
            by get_file_path_end(in_file_path) becomes the start of the output
            file name.
        out_text_file_path (List[str]): Paths of files produced so far; the
            first entry is the one converted to images. This list is not modified.

    Returns:
        tuple: (message, paths), where paths is a new list containing the
        entries of out_text_file_path followed by the path of the image-based PDF.
    """
    file_path_without_ext = get_file_path_end(in_file_path)

    # Work on a copy so the caller's list is not extended as a side effect.
    out_file_paths = list(out_text_file_path)

    print("Creating image version of results")
    pdf_text_image_paths = process_file(out_text_file_path[0])

    # Make sure the target folder exists before saving into it.
    os.makedirs("output", exist_ok=True)
    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"

    # The first image drives the save; the remaining pages are appended to the same PDF.
    pdf_text_image_paths[0].save(
        out_text_image_file_path,
        "PDF",
        resolution=100.0,
        save_all=True,
        append_images=pdf_text_image_paths[1:],
    )

    out_file_paths.append(out_text_image_file_path)

    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    return out_message, out_file_paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|