Commit 10f46e9
Parent(s): 69c2af9

Corrected a couple of bugs. Textract whole-document API call outputs now also load the input PDF into the app.

Files changed:
- README.md +2 -5
- app.py +30 -24
- pyproject.toml +1 -1
- tools/config.py +3 -0
- tools/file_conversion.py +25 -11
- tools/file_redaction.py +19 -6
- tools/find_duplicate_pages.py +1 -1
- tools/textract_batch_call.py +84 -13
README.md
CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
 
-version: 0.6.
+version: 0.6.3
 
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
@@ -22,10 +22,7 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
 
 # USER GUIDE
 
-##
-You can test out many of the features described in this user guide at the [public test version of the app](https://huggingface.co/spaces/seanpedrickcase/document_redaction), which is free. AWS functions (e.g. Textract, Comprehend) are not enabled (unless you have valid API keys).
-
-## Chat over this user guide
+## Chat with this user guide
 You can now [speak with a chat bot about this user guide](https://huggingface.co/spaces/seanpedrickcase/Light-PDF-Web-QA-Chatbot) (beta!)
 
 ## Table of contents
app.py
CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, AWS_REGION
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
-from tools.textract_batch_call import analyse_document_with_textract_api,
+from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
 
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
@@ -55,8 +55,6 @@ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCE
 if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
 if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
 
-print
-
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
 
@@ -170,16 +168,16 @@ with app:
 s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
 
-
-
-
+s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
+s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
 successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
 no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
 textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
 
-
-
-
+load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
 
 s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
 default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
@@ -214,10 +212,10 @@ with app:
 
 # Textract API call placeholders in case option not selected in config
 
-job_name_textbox = gr.Textbox(value="", label="
+job_name_textbox = gr.Textbox(value="", label="whole_document Textract call", visible=False)
 send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
 
-job_id_textbox = gr.Textbox(label = "Latest job ID for
+job_id_textbox = gr.Textbox(label = "Latest job ID for whole_document document analysis", value='', visible=False)
 check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
 job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
 job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
@@ -225,6 +223,7 @@ with app:
 selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
 is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
 job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
 
 textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
 convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
@@ -251,7 +250,7 @@ with app:
 with gr.Accordion("Redact document", open = True):
 in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
 
-text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without.
+text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
@@ -260,7 +259,7 @@ with app:
 pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 if SHOW_COSTS == "True":
-with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
+with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
 with gr.Row(equal_height=True):
 with gr.Column(scale=1):
 textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
@@ -269,10 +268,11 @@ with app:
 with gr.Row(equal_height=True):
 total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
 estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
-estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
+estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
 
 if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
-with gr.Accordion("
+with gr.Accordion("Assign task to cost code", open = True, visible=True):
+gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
 with gr.Row():
 cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
 with gr.Column():
@@ -555,18 +555,24 @@ with app:
 all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
 # Send whole document to Textract for text extraction
-send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state,
-
+send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number]).\
+success(check_for_provided_job_id, inputs=[job_id_textbox]).\
+success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
+success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
+
 check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
-success(
+success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
 
 textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
 
 
-convert_textract_outputs_to_ocr_results.click(
+convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
+success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
+success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
+success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
 success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
-success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
+success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
 success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
 outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
 
@@ -693,10 +699,10 @@ with app:
 # Get connection details on app load
 
 if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
-app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox,
-success(load_in_textract_job_details, inputs=[
+app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder]).\
+success(load_in_textract_job_details, inputs=[load_s3_whole_document_textract_logs_bool, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[textract_job_detail_df])
 else:
-app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox,
+app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
 
 
 # If relevant environment variable is set, load in the Textract job details
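Note: the event wiring above relies on Gradio's chained listeners, where `.success()` registers a follow-up step that only runs if the previous handler finished without raising. A minimal, self-contained sketch of the pattern (the component and function names here are illustrative, not taken from the app):

import gradio as gr

def start_job(doc):
    if doc is None:
        # Raising stops the chain: none of the .success() steps below will run
        raise gr.Error("No document provided")
    return "job-123"

def poll_job(job_id):
    return f"Job {job_id}: SUCCEEDED"

with gr.Blocks() as demo:
    doc = gr.File()
    job_id = gr.Textbox(label="Job ID")
    status = gr.Textbox(label="Status")
    run_btn = gr.Button("Analyse")

    # Each .success() only fires if the previous step completed without error,
    # mirroring how the commit chains the Textract call, polling, and file checks
    run_btn.click(start_job, inputs=[doc], outputs=[job_id]).\
        success(poll_job, inputs=[job_id], outputs=[status])

demo.launch()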
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc_redaction"
-version = "0.6.
+version = "0.6.3"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
tools/config.py
CHANGED
@@ -279,4 +279,7 @@ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_J
 
 TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
 
+TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
+
+
 TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
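Note: `get_or_create_env_var` is defined earlier in tools/config.py and is not shown in this diff. A plausible minimal implementation of such a helper (an assumption for illustration, not the repository's actual code) would be:

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if already set; otherwise set it to the
    # default and return that, so later reads of os.environ stay consistent.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value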
tools/file_conversion.py
CHANGED
@@ -620,12 +620,12 @@ def prepare_image_or_pdf(
 
 elif file_extension in ['.csv']:
 if '_review_file' in file_path_without_ext:
-review_file_csv = read_file(file_path)
+review_file_csv = read_file(file_path)
 all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
 json_from_csv = True
 #print("Converted CSV review file to image annotation object")
 elif '_ocr_output' in file_path_without_ext:
-all_line_level_ocr_results_df = read_file(file_path)
+all_line_level_ocr_results_df = read_file(file_path)
 json_from_csv = False
 
 # NEW IF STATEMENT
@@ -987,7 +987,7 @@ def divide_coordinates_by_page_sizes(
 if "image_width" in df_abs.columns and "mediabox_width" in df_abs.columns:
 # Check if image_width mostly missing - use .isna().all() or check percentage
 if df_abs["image_width"].isna().all():
-print("Falling back to mediabox dimensions as image_width is entirely missing.")
+#print("Falling back to mediabox dimensions as image_width is entirely missing.")
 df_abs["image_width"] = df_abs["image_width"].fillna(df_abs["mediabox_width"])
 df_abs["image_height"] = df_abs["image_height"].fillna(df_abs["mediabox_height"])
 else:
@@ -1469,7 +1469,6 @@ def convert_annotation_json_to_review_df(
 id_col_exists_in_review = 'id' in review_file_df.columns and not review_file_df['id'].isnull().all() and not (review_file_df['id'] == '').all()
 id_col_exists_in_redaction = 'id' in redaction_decision_output.columns and not redaction_decision_output['id'].isnull().all() and not (redaction_decision_output['id'] == '').all()
 
-
 if id_col_exists_in_review and id_col_exists_in_redaction:
 #print("Attempting to join data based on 'id' column.")
 try:
@@ -1530,7 +1529,7 @@ def convert_annotation_json_to_review_df(
 
 # Only attempt proximity match if text wasn't added by ID join and proximity is requested
 if not text_added_successfully and do_proximity_match:
-print("Attempting proximity match to add text data.")
+#print("Attempting proximity match to add text data.")
 
 # Ensure 'page' columns are numeric before coordinate division and proximity match
 # (Assuming divide_coordinates_by_page_sizes and do_proximity_match_all_pages_for_text need this)
@@ -1559,7 +1558,7 @@ def convert_annotation_json_to_review_df(
 # Assuming do_proximity_match_all_pages_for_text adds/updates the 'text' column
 if 'text' in review_file_df.columns:
 text_added_successfully = True
-print("Proximity match completed.")
+#print("Proximity match completed.")
 except Exception as e:
 print(f"Error during proximity match: {e}. Text data may not be added.")
 
@@ -1611,7 +1610,13 @@ def convert_annotation_json_to_review_df(
 print(f"Warning: Could not sort DataFrame due to type error in sort columns: {e}")
 # Proceed without sorting
 
-
+base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+for col in base_cols:
+    if col not in review_file_df.columns:
+        review_file_df[col] = pd.NA
+
+review_file_df = review_file_df.dropna(subset=base_cols, how="all")
 
 return review_file_df
 
@@ -1721,7 +1726,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # --- Ensure Column Exists ---
 original_dtype = None
 if column_name not in df.columns:
-print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
+#print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
 # Initialize with None (which Pandas often treats as NaN but allows object dtype)
 df[column_name] = None
 # Set original_dtype to object so it likely becomes string later
@@ -1757,7 +1762,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # print(f"No missing or empty values found requiring IDs in column '{column_name}'.")
 return df
 
-print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
+#print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
 
 # --- Get Existing IDs to Ensure Uniqueness ---
 # Consider only rows that are *not* missing/empty
@@ -1809,7 +1814,8 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # Use the previously identified index to assign the new IDs correctly
 # Assigning string IDs might change the column's dtype to 'object'
 if not pd.api.types.is_object_dtype(original_dtype) and not pd.api.types.is_string_dtype(original_dtype):
-
+df['id'] = df['id'].astype(str, errors="ignore")
+#warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
 
 df.loc[rows_to_fill_index, column_name] = new_ids_list
 print(f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'.")
@@ -1842,7 +1848,13 @@ def convert_review_df_to_annotation_json(
 Returns:
 List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
 """
-
+base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+for col in base_cols:
+    if col not in review_file_df.columns:
+        review_file_df[col] = pd.NA
+
+review_file_df = review_file_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how='all')
 
 if not page_sizes:
 raise ValueError("page_sizes argument is required and cannot be empty.")
@@ -1865,6 +1877,8 @@ def convert_review_df_to_annotation_json(
 raise ValueError(f"Error processing page_sizes: {e}") from e
 
 
+
+
 # Handle empty input DataFrame gracefully
 if review_file_df.empty:
 print("Input review_file_df is empty. Proceeding to generate JSON structure with empty boxes.")
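Note: the `base_cols` block added in two places above first guarantees that every expected annotation column exists, then drops rows that are empty across all of them. In isolation the pattern behaves like this sketch (the data here is made up):

import pandas as pd

base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]

df = pd.DataFrame({"xmin": [0.1, None], "text": ["name", None]})

# Ensure every expected column exists so later code can index them safely
for col in base_cols:
    if col not in df.columns:
        df[col] = pd.NA

# Drop only rows that are empty across ALL base columns (how="all");
# the first row survives because it has xmin and text values
df = df.dropna(subset=base_cols, how="all")
print(df)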
tools/file_redaction.py
CHANGED
@@ -923,6 +923,22 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
 
 return img_annotation_box, rect
 
+def set_cropbox_safely(page, original_cropbox):
+    """
+    Sets the cropbox of a page, ensuring it's not larger than the mediabox.
+    If the original cropbox is larger, the mediabox is used instead.
+
+    Args:
+        page: The PyMuPdf page object.
+        original_cropbox: The fitz.Rect representing the desired cropbox.
+    """
+    mediabox = page.mediabox
+    if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
+        print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
+        page.set_cropbox(mediabox)
+    else:
+        page.set_cropbox(original_cropbox)
+
 def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
 
 rect_height = page.rect.height
@@ -979,9 +995,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
 
 for annot in page_annotations:
 
-
-
-
 # Check if an Image recogniser result, or a Gradio annotation object
 if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
 
@@ -1053,7 +1066,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
 }
 
 page.apply_redactions(images=0, graphics=0)
-page
+set_cropbox_safely(page, original_cropbox)
+#page.set_cropbox(original_cropbox)
+# Set CropBox to original size
 page.clean_contents()
 
 return page, out_annotation_boxes
@@ -1547,8 +1562,6 @@ def redact_image_pdf(file_path:str,
 
 page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
 
-print("page_image_annotations at box drawing:", page_image_annotations)
-
 redacted_image = image.copy()
 #redacted_image.save("test_out_image.png")
 
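Note: PyMuPDF's `page.set_cropbox()` raises an error when the requested cropbox does not fit the page, which is what the new `set_cropbox_safely` helper guards against. A short usage sketch (the file name is a placeholder, and the helper is assumed to be importable from tools/file_redaction):

import fitz  # PyMuPDF

doc = fitz.open("example.pdf")
page = doc[0]

# A cropbox wider than the mediabox would raise in page.set_cropbox();
# set_cropbox_safely falls back to the mediabox instead of raising
oversized = fitz.Rect(0, 0, page.mediabox.width + 100, page.mediabox.height)
set_cropbox_safely(page, oversized)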
tools/find_duplicate_pages.py
CHANGED
@@ -136,7 +136,7 @@ def process_data(df:pd.DataFrame, column:str):
 def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
 output_paths = []
 
-progress(0.1, desc="Cleaning input
+progress(0.1, desc="Cleaning input text")
 
 # Load and clean data
 df, output_files = combine_ocr_output_text(input_files)
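Note: the corrected call uses Gradio's progress API, where a `gr.Progress` default argument is injected by Gradio at event time and can be called with a completion fraction and a description. A minimal sketch of the pattern (the function and component names are illustrative):

import gradio as gr

def clean_and_compare(files, progress=gr.Progress(track_tqdm=True)):
    progress(0.1, desc="Cleaning input text")  # fraction complete plus a label shown in the UI
    # ... cleaning work ...
    progress(0.6, desc="Comparing pages")
    # ... similarity work ...
    return files

with gr.Blocks() as demo:
    files_in = gr.File(file_count="multiple")
    files_out = gr.File(file_count="multiple")
    gr.Button("Find duplicates").click(clean_and_compare, inputs=[files_in], outputs=[files_out])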
tools/textract_batch_call.py
CHANGED
@@ -5,13 +5,16 @@ import pandas as pd
 import json
 import logging
 import datetime
+import gradio as gr
+from gradio import FileData
 from typing import List
 from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
-
-from tools.
-
+from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
+from tools.aws_functions import download_file_from_s3
+from tools.file_conversion import get_input_file_names
+from tools.helper_functions import get_file_name_without_type
 
 def analyse_document_with_textract_api(
 local_pdf_path: str,
@@ -209,7 +212,8 @@ def return_job_status(job_id:str,
 logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
 if job_status == 'IN_PROGRESS':
-
+pass
+#time.sleep(poll_interval_seconds)
 elif job_status == 'SUCCEEDED':
 logging.info("Textract job succeeded.")
 elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
@@ -237,6 +241,8 @@ def download_textract_job_files(s3_client:str,
 Download and combine selected job files from the AWS Textract service.
 '''
 
+#print("s3_output_key_prefix at download:", s3_output_key_prefix)
+
 list_response = s3_client.list_objects_v2(
 Bucket=s3_bucket_name,
 Prefix=s3_output_key_prefix
@@ -322,7 +328,67 @@ def check_for_provided_job_id(job_id:str):
 raise Exception("Please provide a job ID.")
 return
 
-def poll_bulk_textract_analysis_progress_and_download(
+def load_pdf_job_file_from_s3(
+    load_s3_jobs_input_loc,
+    pdf_filename,
+    local_output_dir,
+    s3_bucket_name,
+    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
+
+    try:
+        print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)
+        pdf_file_location = ''
+        doc_file_name_no_extension_textbox = ''
+
+        s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
+        s3_input_key_prefix = s3_input_key_prefix + ".pdf"
+        print("s3_input_key_prefix:", s3_input_key_prefix)
+
+        local_input_file_path = os.path.join(local_output_dir, pdf_filename)
+        local_input_file_path = local_input_file_path + ".pdf"
+
+        print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)
+
+        download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
+
+        pdf_file_location = [local_input_file_path]
+        doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
+    except Exception as e:
+        print("Could not download PDF job file from S3 due to:", e)
+
+    return pdf_file_location, doc_file_name_no_extension_textbox
+
+def replace_existing_pdf_input_for_whole_document_outputs(
+    load_s3_jobs_input_loc:str,
+    pdf_filename:str,
+    local_output_dir:str,
+    s3_bucket_name:str,
+    in_doc_files:FileData=[],
+    input_folder:str=INPUT_FOLDER,
+    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+    progress = gr.Progress(track_tqdm=True)):
+
+    progress(0.1, "Loading PDF from s3")
+
+    if in_doc_files:
+        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(in_doc_files)
+
+        if pdf_filename == doc_file_name_no_extension_textbox:
+            print("Existing loaded PDF file has same name as file from S3")
+            doc_file_name_no_extension_textbox = pdf_filename
+            downloaded_pdf_file_location = in_doc_files
+        else:
+            downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
+
+            doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
+    else:
+        downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
+
+        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
+
+    return downloaded_pdf_file_location, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count
+
+def poll_whole_document_textract_analysis_progress_and_download(
 job_id:str,
 job_type_dropdown:str,
 s3_output_prefix: str,
@@ -333,14 +399,17 @@ def poll_bulk_textract_analysis_progress_and_download(
 load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
 load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
 aws_region: str = AWS_REGION, # Optional: specify region if not default
-load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
 poll_interval_seconds: int = 1,
-max_polling_attempts: int = 1 # ~10 minutes total wait time):
+max_polling_attempts: int = 1, # ~10 minutes total wait time):
+progress = gr.Progress(track_tqdm=True)
 ):
 '''
 Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
 '''
 
+progress(0.1, "Querying AWS Textract for status of document analysis job")
+
 if job_id:
 # Initialize boto3 clients
 session = boto3.Session(region_name=aws_region)
@@ -365,7 +434,7 @@
 print(f"Failed to update job details dataframe: {e}")
 #raise
 
-while job_status == 'IN_PROGRESS' and attempts
+while job_status == 'IN_PROGRESS' and attempts <= max_polling_attempts:
 attempts += 1
 try:
 if job_type_dropdown=="document_analysis":
@@ -394,7 +463,9 @@
 downloaded_file_path = None
 if job_status == 'SUCCEEDED':
 #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
-# 3b - Replace PDF file name if it exists in the job dataframe
+# 3b - Replace PDF file name if it exists in the job dataframe
+
+progress(0.5, "Document analysis task outputs found. Downloading from S3")
 
 # If job_df is not empty
 if not job_df.empty:
@@ -410,13 +481,11 @@
 else:
 pdf_filename = "unknown_file"
 
-
 # --- 4. Download Output JSON from S3 ---
 # Textract typically creates output under s3_output_prefix/job_id/
 # There might be multiple JSON files if pagination occurred during writing.
 # Usually, for smaller docs, there's one file, often named '1'.
-# For robust handling, list objects and find the JSON(s).
-
+# For robust handling, list objects and find the JSON(s).
 
 s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
 logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
@@ -436,8 +505,10 @@
 
 else:
 raise Exception("No Job ID provided.")
+
+output_pdf_filename = get_file_name_without_type(pdf_filename)
 
-return downloaded_file_path, job_status, job_df
+return downloaded_file_path, job_status, job_df, output_pdf_filename
 
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
 load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
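Note: stripped of the app's state handling, the asynchronous whole-document flow this module wraps comes down to three Textract steps via boto3: start a job on a PDF already in S3, poll its status, then collect the JSON that Textract writes back to S3. A minimal sketch (the bucket, key, and region are placeholders):

import time
import boto3

textract = boto3.client("textract", region_name="us-east-1")

# 1. Start an asynchronous text-detection job on a PDF stored in S3
response = textract.start_document_text_detection(
    DocumentLocation={"S3Object": {"Bucket": "my-bucket", "Name": "input/example.pdf"}},
    OutputConfig={"S3Bucket": "my-bucket", "S3Prefix": "output"},
)
job_id = response["JobId"]

# 2. Poll until the job leaves IN_PROGRESS
while True:
    status = textract.get_document_text_detection(JobId=job_id)["JobStatus"]
    if status != "IN_PROGRESS":
        break
    time.sleep(5)

# 3. On success, Textract has written paginated JSON under output/<job_id>/ in S3,
#    which the app then lists, downloads, and combines into one local file
print(job_id, status)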