File size: 4,387 Bytes
0b2c988 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
from pdf2image import convert_from_path, pdfinfo_from_path
from tools.helper_functions import get_file_path_end
from PIL import Image
import os
from gradio import Progress
from typing import List
def is_pdf_or_image(filename):
    """
    Check if a file name is a PDF or an image file.

    The check is case-insensitive and only looks at how the name ends; the
    file itself is never opened.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or ".png", False otherwise.
    """
    # str.endswith accepts a tuple of suffixes, so the name is lower-cased once
    # and tested against every accepted extension in a single call.
    return filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png"))
def is_pdf(filename):
    """
    Tell whether a file name carries a PDF extension (case-insensitive).

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True when the lower-cased name ends with ".pdf", False otherwise.
    """
    lowered_name = filename.lower()
    has_pdf_suffix = lowered_name.endswith(".pdf")
    return has_pdf_suffix
# %%
## Convert pdf to image if necessary
def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
    """
    Render a PDF one page at a time and collect the results in a list.

    Pages are converted individually (rather than in one call) so that the
    progress tracker advances once per page.

    Args:
        pdf_path (str): Path to the PDF file to convert.
        progress (Progress): Gradio progress tracker used to wrap the page loop.

    Returns:
        list: Everything returned by convert_from_path for each page, in page
        order (presumably PIL images - confirm against pdf2image).
    """
    # Ask pdf2image how many pages there are before rendering anything
    total_pages = pdfinfo_from_path(pdf_path)['Pages']
    print("Number of pages in PDF: ", str(total_pages))

    rendered_pages = []
    page_indices = progress.tqdm(range(0,total_pages), total=total_pages, unit="pages", desc="Converting pages")

    for page_index in page_indices:
        print("Current page: ", str(page_index))

        # pdf2image numbers pages from 1, the loop index starts at 0
        page_number = page_index + 1
        single_page = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)

        # Nothing came back for this page: stop converting
        if not single_page:
            break

        rendered_pages.extend(single_page)

    print("PDF has been converted to images.")

    return rendered_pages
# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
def process_file(file_path):
    """
    Turn a file path into a list of images, chosen by the file extension.

    Args:
        file_path (str): Path to an image (.jpg, .jpeg, .png) or a PDF file.

    Returns:
        list: A one-element list holding the opened image for image files, the
        result of convert_pdf_to_images for PDFs, or [''] for any other
        extension.
    """
    # Only the extension (lower-cased) decides how the file is handled
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    if extension in ('.jpg', '.jpeg', '.png'):
        print(f"{file_path} is an image file.")
        return [Image.open(file_path)]

    if extension == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    # Unsupported type: report it and hand back a placeholder list
    print(f"{file_path} is not an image or PDF file.")
    return ['']
def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
    """
    Validate an uploaded file against the chosen redaction method and prepare it.

    Args:
        file_path (str): Path of the file to prepare. A falsy value means no file was selected.
        in_redact_method (str): "Image analysis" or "Text analysis".
        in_allow_list (List[List[str]], optional): Accepted for interface
            compatibility; this function does not use it. Defaults to None.

    Returns:
        tuple: (out_message, prepared) where out_message is '' on success or a
        user-facing explanation otherwise, and prepared is:
            - [] when no file was selected,
            - None when the file type does not suit the method, or the method
              is not recognised,
            - the output of process_file for "Image analysis",
            - file_path unchanged for "Text analysis".
    """
    out_message = ''

    # The allow list is deliberately not touched here: it was previously
    # flattened (and the result discarded), which raised TypeError whenever the
    # default of None was used.

    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, []

    if in_redact_method == "Image analysis":
        # Analyse and redact image-based pdf or image
        if not is_pdf_or_image(file_path):
            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
        return out_message, process_file(file_path)

    if in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", None
        return out_message, file_path

    # Any other method used to fall through to an unbound local variable and
    # crash; report it in the same (message, None) form as the other failures.
    out_message = f"Unknown redaction method: {in_redact_method}"
    print(out_message)
    return out_message, None
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    """
    Re-render an annotated text PDF as an image-only PDF and record its path.

    The first entry of out_text_file_path is passed to process_file, and the
    resulting pages are saved together as a single PDF under "output/".

    Args:
        in_file_path (str): Path of the original input file; its name (via
            get_file_path_end) is used to build the output file name.
        out_text_file_path (List[str]): Paths of the files produced so far. The
            first entry is the file that gets converted. This list is modified
            in place: the new image-PDF path is appended to it.

    Returns:
        tuple: (out_message, out_file_paths) where out_file_paths is the same
        list object as out_text_file_path, now including the new path.
    """
    file_path_without_ext = get_file_path_end(in_file_path)

    # Same list object as the argument, so the append below is visible to the
    # caller (kept as-is for backward compatibility).
    out_file_paths = out_text_file_path

    # Convert annotated text pdf back to image to give genuine redactions
    print("Creating image version of results")
    pdf_text_image_paths = process_file(out_text_file_path[0])

    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"

    # Saving fails if the target directory is missing, so create it first.
    os.makedirs(os.path.dirname(out_text_image_file_path), exist_ok=True)

    # The first page drives the save; the remaining pages are appended to it.
    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

    out_file_paths.append(out_text_image_file_path)

    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    return out_message, out_file_paths
|