seanpedrickcase committed on
Commit
10f46e9
·
1 Parent(s): 69c2af9

Corrected a couple of bugs. Now Textract whole-document API call outputs will also load the input PDF into the app

Browse files
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 0.6.2
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
@@ -22,10 +22,7 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
22
 
23
  # USER GUIDE
24
 
25
- ## Experiment with the test (public) version of the app
26
- You can test out many of the features described in this user guide at the [public test version of the app](https://huggingface.co/spaces/seanpedrickcase/document_redaction), which is free. AWS functions (e.g. Textract, Comprehend) are not enabled (unless you have valid API keys).
27
-
28
- ## Chat over this user guide
29
  You can now [speak with a chat bot about this user guide](https://huggingface.co/spaces/seanpedrickcase/Light-PDF-Web-QA-Chatbot) (beta!)
30
 
31
  ## Table of contents
 
10
  ---
11
  # Document redaction
12
 
13
+ version: 0.6.3
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
 
22
 
23
  # USER GUIDE
24
 
25
+ ## Chat with this user guide
 
 
 
26
  You can now [speak with a chat bot about this user guide](https://huggingface.co/spaces/seanpedrickcase/Light-PDF-Web-QA-Chatbot) (beta!)
27
 
28
  ## Table of contents
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
8
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
9
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
10
  from tools.file_redaction import choose_and_run_redactor
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
- from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
@@ -55,8 +55,6 @@ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCE
55
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
56
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
57
 
58
- print
59
-
60
  # Create the gradio interface
61
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
62
 
@@ -170,16 +168,16 @@ with app:
170
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
171
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
172
 
173
- s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
174
- s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
175
- s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
176
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
177
  no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
178
  textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
179
 
180
- load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
181
- s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
182
- local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
183
 
184
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
185
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
@@ -214,10 +212,10 @@ with app:
214
 
215
  # Textract API call placeholders in case option not selected in config
216
 
217
- job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
218
  send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
219
 
220
- job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
221
  check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
222
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
223
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
@@ -225,6 +223,7 @@ with app:
225
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
226
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
227
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
 
228
 
229
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
230
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
@@ -251,7 +250,7 @@ with app:
251
  with gr.Accordion("Redact document", open = True):
252
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
253
 
254
- text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
255
 
256
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
257
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
@@ -260,7 +259,7 @@ with app:
260
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
261
 
262
  if SHOW_COSTS == "True":
263
- with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
264
  with gr.Row(equal_height=True):
265
  with gr.Column(scale=1):
266
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
@@ -269,10 +268,11 @@ with app:
269
  with gr.Row(equal_height=True):
270
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
271
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
272
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
273
 
274
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
275
- with gr.Accordion("Apply cost code", open = True, visible=True):
 
276
  with gr.Row():
277
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
278
  with gr.Column():
@@ -555,18 +555,24 @@ with app:
555
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
556
 
557
  # Send whole document to Textract for text extraction
558
- send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number])
559
-
 
 
 
560
  check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
561
- success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
562
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
563
 
564
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
565
 
566
 
567
- convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
 
 
 
568
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
569
- success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
570
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
571
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
572
 
@@ -693,10 +699,10 @@ with app:
693
  # Get connection details on app load
694
 
695
  if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
696
- app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
697
- success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
698
  else:
699
- app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder])
700
 
701
 
702
  # If relevant environment variable is set, load in the Textract job details
 
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, AWS_REGION
8
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
9
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
10
  from tools.file_redaction import choose_and_run_redactor
 
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
 
55
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
56
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
57
 
 
 
58
  # Create the gradio interface
59
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
60
 
 
168
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
169
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
170
 
171
+ s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
172
+ s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
173
+ s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
174
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
175
  no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
176
  textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
177
 
178
+ load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
179
+ s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
180
+ local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
181
 
182
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
183
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
 
212
 
213
  # Textract API call placeholders in case option not selected in config
214
 
215
+ job_name_textbox = gr.Textbox(value="", label="whole_document Textract call", visible=False)
216
  send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
217
 
218
+ job_id_textbox = gr.Textbox(label = "Latest job ID for whole_document document analysis", value='', visible=False)
219
  check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
220
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
221
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
 
223
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
224
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
225
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
226
+ job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
227
 
228
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
229
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
 
250
  with gr.Accordion("Redact document", open = True):
251
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
252
 
253
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
254
 
255
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
256
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
 
259
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
260
 
261
  if SHOW_COSTS == "True":
262
+ with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
263
  with gr.Row(equal_height=True):
264
  with gr.Column(scale=1):
265
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
 
268
  with gr.Row(equal_height=True):
269
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
270
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
271
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
272
 
273
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
274
+ with gr.Accordion("Assign task to cost code", open = True, visible=True):
275
+ gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
276
  with gr.Row():
277
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
278
  with gr.Column():
 
555
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
556
 
557
  # Send whole document to Textract for text extraction
558
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number]).\
559
+ success(check_for_provided_job_id, inputs=[job_id_textbox]).\
560
+ success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
561
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
562
+
563
  check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
564
+ success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
565
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
566
 
567
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
568
 
569
 
570
+ convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
571
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
572
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
573
+ success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
574
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
575
+ success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
576
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
577
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
578
 
 
699
  # Get connection details on app load
700
 
701
  if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
702
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder]).\
703
+ success(load_in_textract_job_details, inputs=[load_s3_whole_document_textract_logs_bool, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[textract_job_detail_df])
704
  else:
705
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
706
 
707
 
708
  # If relevant environment variable is set, load in the Textract job details
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.2"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.6.3"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
tools/config.py CHANGED
@@ -279,4 +279,7 @@ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_J
279
 
280
  TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
281
 
 
 
 
282
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
 
279
 
280
  TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
281
 
282
+ TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
283
+
284
+
285
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
tools/file_conversion.py CHANGED
@@ -620,12 +620,12 @@ def prepare_image_or_pdf(
620
 
621
  elif file_extension in ['.csv']:
622
  if '_review_file' in file_path_without_ext:
623
- review_file_csv = read_file(file_path)
624
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
625
  json_from_csv = True
626
  #print("Converted CSV review file to image annotation object")
627
  elif '_ocr_output' in file_path_without_ext:
628
- all_line_level_ocr_results_df = read_file(file_path)
629
  json_from_csv = False
630
 
631
  # NEW IF STATEMENT
@@ -987,7 +987,7 @@ def divide_coordinates_by_page_sizes(
987
  if "image_width" in df_abs.columns and "mediabox_width" in df_abs.columns:
988
  # Check if image_width mostly missing - use .isna().all() or check percentage
989
  if df_abs["image_width"].isna().all():
990
- print("Falling back to mediabox dimensions as image_width is entirely missing.")
991
  df_abs["image_width"] = df_abs["image_width"].fillna(df_abs["mediabox_width"])
992
  df_abs["image_height"] = df_abs["image_height"].fillna(df_abs["mediabox_height"])
993
  else:
@@ -1469,7 +1469,6 @@ def convert_annotation_json_to_review_df(
1469
  id_col_exists_in_review = 'id' in review_file_df.columns and not review_file_df['id'].isnull().all() and not (review_file_df['id'] == '').all()
1470
  id_col_exists_in_redaction = 'id' in redaction_decision_output.columns and not redaction_decision_output['id'].isnull().all() and not (redaction_decision_output['id'] == '').all()
1471
 
1472
-
1473
  if id_col_exists_in_review and id_col_exists_in_redaction:
1474
  #print("Attempting to join data based on 'id' column.")
1475
  try:
@@ -1530,7 +1529,7 @@ def convert_annotation_json_to_review_df(
1530
 
1531
  # Only attempt proximity match if text wasn't added by ID join and proximity is requested
1532
  if not text_added_successfully and do_proximity_match:
1533
- print("Attempting proximity match to add text data.")
1534
 
1535
  # Ensure 'page' columns are numeric before coordinate division and proximity match
1536
  # (Assuming divide_coordinates_by_page_sizes and do_proximity_match_all_pages_for_text need this)
@@ -1559,7 +1558,7 @@ def convert_annotation_json_to_review_df(
1559
  # Assuming do_proximity_match_all_pages_for_text adds/updates the 'text' column
1560
  if 'text' in review_file_df.columns:
1561
  text_added_successfully = True
1562
- print("Proximity match completed.")
1563
  except Exception as e:
1564
  print(f"Error during proximity match: {e}. Text data may not be added.")
1565
 
@@ -1611,7 +1610,13 @@ def convert_annotation_json_to_review_df(
1611
  print(f"Warning: Could not sort DataFrame due to type error in sort columns: {e}")
1612
  # Proceed without sorting
1613
 
1614
- review_file_df = review_file_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"])
 
 
 
 
 
 
1615
 
1616
  return review_file_df
1617
 
@@ -1721,7 +1726,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
1721
  # --- Ensure Column Exists ---
1722
  original_dtype = None
1723
  if column_name not in df.columns:
1724
- print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
1725
  # Initialize with None (which Pandas often treats as NaN but allows object dtype)
1726
  df[column_name] = None
1727
  # Set original_dtype to object so it likely becomes string later
@@ -1757,7 +1762,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
1757
  # print(f"No missing or empty values found requiring IDs in column '{column_name}'.")
1758
  return df
1759
 
1760
- print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
1761
 
1762
  # --- Get Existing IDs to Ensure Uniqueness ---
1763
  # Consider only rows that are *not* missing/empty
@@ -1809,7 +1814,8 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
1809
  # Use the previously identified index to assign the new IDs correctly
1810
  # Assigning string IDs might change the column's dtype to 'object'
1811
  if not pd.api.types.is_object_dtype(original_dtype) and not pd.api.types.is_string_dtype(original_dtype):
1812
- warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
 
1813
 
1814
  df.loc[rows_to_fill_index, column_name] = new_ids_list
1815
  print(f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'.")
@@ -1842,7 +1848,13 @@ def convert_review_df_to_annotation_json(
1842
  Returns:
1843
  List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
1844
  """
1845
- review_file_df = review_file_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"])
 
 
 
 
 
 
1846
 
1847
  if not page_sizes:
1848
  raise ValueError("page_sizes argument is required and cannot be empty.")
@@ -1865,6 +1877,8 @@ def convert_review_df_to_annotation_json(
1865
  raise ValueError(f"Error processing page_sizes: {e}") from e
1866
 
1867
 
 
 
1868
  # Handle empty input DataFrame gracefully
1869
  if review_file_df.empty:
1870
  print("Input review_file_df is empty. Proceeding to generate JSON structure with empty boxes.")
 
620
 
621
  elif file_extension in ['.csv']:
622
  if '_review_file' in file_path_without_ext:
623
+ review_file_csv = read_file(file_path)
624
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
625
  json_from_csv = True
626
  #print("Converted CSV review file to image annotation object")
627
  elif '_ocr_output' in file_path_without_ext:
628
+ all_line_level_ocr_results_df = read_file(file_path)
629
  json_from_csv = False
630
 
631
  # NEW IF STATEMENT
 
987
  if "image_width" in df_abs.columns and "mediabox_width" in df_abs.columns:
988
  # Check if image_width mostly missing - use .isna().all() or check percentage
989
  if df_abs["image_width"].isna().all():
990
+ #print("Falling back to mediabox dimensions as image_width is entirely missing.")
991
  df_abs["image_width"] = df_abs["image_width"].fillna(df_abs["mediabox_width"])
992
  df_abs["image_height"] = df_abs["image_height"].fillna(df_abs["mediabox_height"])
993
  else:
 
1469
  id_col_exists_in_review = 'id' in review_file_df.columns and not review_file_df['id'].isnull().all() and not (review_file_df['id'] == '').all()
1470
  id_col_exists_in_redaction = 'id' in redaction_decision_output.columns and not redaction_decision_output['id'].isnull().all() and not (redaction_decision_output['id'] == '').all()
1471
 
 
1472
  if id_col_exists_in_review and id_col_exists_in_redaction:
1473
  #print("Attempting to join data based on 'id' column.")
1474
  try:
 
1529
 
1530
  # Only attempt proximity match if text wasn't added by ID join and proximity is requested
1531
  if not text_added_successfully and do_proximity_match:
1532
+ #print("Attempting proximity match to add text data.")
1533
 
1534
  # Ensure 'page' columns are numeric before coordinate division and proximity match
1535
  # (Assuming divide_coordinates_by_page_sizes and do_proximity_match_all_pages_for_text need this)
 
1558
  # Assuming do_proximity_match_all_pages_for_text adds/updates the 'text' column
1559
  if 'text' in review_file_df.columns:
1560
  text_added_successfully = True
1561
+ #print("Proximity match completed.")
1562
  except Exception as e:
1563
  print(f"Error during proximity match: {e}. Text data may not be added.")
1564
 
 
1610
  print(f"Warning: Could not sort DataFrame due to type error in sort columns: {e}")
1611
  # Proceed without sorting
1612
 
1613
+ base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
1614
+
1615
+ for col in base_cols:
1616
+ if col not in review_file_df.columns:
1617
+ review_file_df[col] = pd.NA
1618
+
1619
+ review_file_df = review_file_df.dropna(subset=base_cols, how="all")
1620
 
1621
  return review_file_df
1622
 
 
1726
  # --- Ensure Column Exists ---
1727
  original_dtype = None
1728
  if column_name not in df.columns:
1729
+ #print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
1730
  # Initialize with None (which Pandas often treats as NaN but allows object dtype)
1731
  df[column_name] = None
1732
  # Set original_dtype to object so it likely becomes string later
 
1762
  # print(f"No missing or empty values found requiring IDs in column '{column_name}'.")
1763
  return df
1764
 
1765
+ #print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
1766
 
1767
  # --- Get Existing IDs to Ensure Uniqueness ---
1768
  # Consider only rows that are *not* missing/empty
 
1814
  # Use the previously identified index to assign the new IDs correctly
1815
  # Assigning string IDs might change the column's dtype to 'object'
1816
  if not pd.api.types.is_object_dtype(original_dtype) and not pd.api.types.is_string_dtype(original_dtype):
1817
+ df['id'] = df['id'].astype(str, errors="ignore")
1818
+ #warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
1819
 
1820
  df.loc[rows_to_fill_index, column_name] = new_ids_list
1821
  print(f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'.")
 
1848
  Returns:
1849
  List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
1850
  """
1851
+ base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
1852
+
1853
+ for col in base_cols:
1854
+ if col not in review_file_df.columns:
1855
+ review_file_df[col] = pd.NA
1856
+
1857
+ review_file_df = review_file_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how='all')
1858
 
1859
  if not page_sizes:
1860
  raise ValueError("page_sizes argument is required and cannot be empty.")
 
1877
  raise ValueError(f"Error processing page_sizes: {e}") from e
1878
 
1879
 
1880
+
1881
+
1882
  # Handle empty input DataFrame gracefully
1883
  if review_file_df.empty:
1884
  print("Input review_file_df is empty. Proceeding to generate JSON structure with empty boxes.")
tools/file_redaction.py CHANGED
@@ -923,6 +923,22 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
923
 
924
  return img_annotation_box, rect
925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
927
 
928
  rect_height = page.rect.height
@@ -979,9 +995,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
979
 
980
  for annot in page_annotations:
981
 
982
-
983
-
984
-
985
  # Check if an Image recogniser result, or a Gradio annotation object
986
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
987
 
@@ -1053,7 +1066,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1053
  }
1054
 
1055
  page.apply_redactions(images=0, graphics=0)
1056
- page.set_cropbox(original_cropbox) # Set CropBox to original size
 
 
1057
  page.clean_contents()
1058
 
1059
  return page, out_annotation_boxes
@@ -1547,8 +1562,6 @@ def redact_image_pdf(file_path:str,
1547
 
1548
  page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1549
 
1550
- print("page_image_annotations at box drawing:", page_image_annotations)
1551
-
1552
  redacted_image = image.copy()
1553
  #redacted_image.save("test_out_image.png")
1554
 
 
923
 
924
  return img_annotation_box, rect
925
 
926
def set_cropbox_safely(page, original_cropbox):
    """
    Set a page's cropbox, falling back to the mediabox when the request is invalid.

    A cropbox that is larger than the mediabox cannot be applied; additionally,
    PyMuPDF's set_cropbox raises ValueError when the cropbox fits by size but is
    not fully contained in the mediabox (e.g. an offset cropbox), so that case
    is caught and the mediabox is used instead.

    Args:
        page: The PyMuPDF page object.
        original_cropbox: The fitz.Rect representing the desired cropbox.
    """
    mediabox = page.mediabox

    if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
        print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
        page.set_cropbox(mediabox)
    else:
        try:
            page.set_cropbox(original_cropbox)
        except ValueError:
            # Fits dimensionally but lies (partly) outside the mediabox
            print("Warning: Requested cropbox lies outside the mediabox. Using mediabox instead.")
            page.set_cropbox(mediabox)
941
+
942
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
943
 
944
  rect_height = page.rect.height
 
995
 
996
  for annot in page_annotations:
997
 
 
 
 
998
  # Check if an Image recogniser result, or a Gradio annotation object
999
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
1000
 
 
1066
  }
1067
 
1068
  page.apply_redactions(images=0, graphics=0)
1069
+ set_cropbox_safely(page, original_cropbox)
1070
+ #page.set_cropbox(original_cropbox)
1071
+ # Set CropBox to original size
1072
  page.clean_contents()
1073
 
1074
  return page, out_annotation_boxes
 
1562
 
1563
  page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1564
 
 
 
1565
  redacted_image = image.copy()
1566
  #redacted_image.save("test_out_image.png")
1567
 
tools/find_duplicate_pages.py CHANGED
@@ -136,7 +136,7 @@ def process_data(df:pd.DataFrame, column:str):
136
  def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
  output_paths = []
138
 
139
- progress(0.1, desc="Cleaning input texts")
140
 
141
  # Load and clean data
142
  df, output_files = combine_ocr_output_text(input_files)
 
136
  def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
  output_paths = []
138
 
139
+ progress(0.1, desc="Cleaning input text")
140
 
141
  # Load and clean data
142
  df, output_files = combine_ocr_output_text(input_files)
tools/textract_batch_call.py CHANGED
@@ -5,13 +5,16 @@ import pandas as pd
5
  import json
6
  import logging
7
  import datetime
 
 
8
  from typing import List
9
  from io import StringIO
10
  from urllib.parse import urlparse
11
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
-
13
- from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
14
- #from tools.aws_textract import json_to_ocrresult
 
15
 
16
  def analyse_document_with_textract_api(
17
  local_pdf_path: str,
@@ -209,7 +212,8 @@ def return_job_status(job_id:str,
209
  logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
210
 
211
  if job_status == 'IN_PROGRESS':
212
- time.sleep(poll_interval_seconds)
 
213
  elif job_status == 'SUCCEEDED':
214
  logging.info("Textract job succeeded.")
215
  elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
@@ -237,6 +241,8 @@ def download_textract_job_files(s3_client:str,
237
  Download and combine selected job files from the AWS Textract service.
238
  '''
239
 
 
 
240
  list_response = s3_client.list_objects_v2(
241
  Bucket=s3_bucket_name,
242
  Prefix=s3_output_key_prefix
@@ -322,7 +328,67 @@ def check_for_provided_job_id(job_id:str):
322
  raise Exception("Please provide a job ID.")
323
  return
324
 
325
- def poll_bulk_textract_analysis_progress_and_download(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  job_id:str,
327
  job_type_dropdown:str,
328
  s3_output_prefix: str,
@@ -333,14 +399,17 @@ def poll_bulk_textract_analysis_progress_and_download(
333
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
334
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
335
  aws_region: str = AWS_REGION, # Optional: specify region if not default
336
- load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
337
  poll_interval_seconds: int = 1,
338
- max_polling_attempts: int = 1 # ~10 minutes total wait time):
 
339
  ):
340
  '''
341
  Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
342
  '''
343
 
 
 
344
  if job_id:
345
  # Initialize boto3 clients
346
  session = boto3.Session(region_name=aws_region)
@@ -365,7 +434,7 @@ def poll_bulk_textract_analysis_progress_and_download(
365
  print(f"Failed to update job details dataframe: {e}")
366
  #raise
367
 
368
- while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
369
  attempts += 1
370
  try:
371
  if job_type_dropdown=="document_analysis":
@@ -394,7 +463,9 @@ def poll_bulk_textract_analysis_progress_and_download(
394
  downloaded_file_path = None
395
  if job_status == 'SUCCEEDED':
396
  #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
397
- # 3b - Replace PDF file name if it exists in the job dataframe
 
 
398
 
399
  # If job_df is not empty
400
  if not job_df.empty:
@@ -410,13 +481,11 @@ def poll_bulk_textract_analysis_progress_and_download(
410
  else:
411
  pdf_filename = "unknown_file"
412
 
413
-
414
  # --- 4. Download Output JSON from S3 ---
415
  # Textract typically creates output under s3_output_prefix/job_id/
416
  # There might be multiple JSON files if pagination occurred during writing.
417
  # Usually, for smaller docs, there's one file, often named '1'.
418
- # For robust handling, list objects and find the JSON(s).
419
-
420
 
421
  s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
422
  logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
@@ -436,8 +505,10 @@ def poll_bulk_textract_analysis_progress_and_download(
436
 
437
  else:
438
  raise Exception("No Job ID provided.")
 
 
439
 
440
- return downloaded_file_path, job_status, job_df
441
 
442
  def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
443
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
 
5
  import json
6
  import logging
7
  import datetime
8
+ import gradio as gr
9
+ from gradio import FileData
10
  from typing import List
11
  from io import StringIO
12
  from urllib.parse import urlparse
13
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
+ from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
15
+ from tools.aws_functions import download_file_from_s3
16
+ from tools.file_conversion import get_input_file_names
17
+ from tools.helper_functions import get_file_name_without_type
18
 
19
  def analyse_document_with_textract_api(
20
  local_pdf_path: str,
 
212
  logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
213
 
214
  if job_status == 'IN_PROGRESS':
215
+ pass
216
+ #time.sleep(poll_interval_seconds)
217
  elif job_status == 'SUCCEEDED':
218
  logging.info("Textract job succeeded.")
219
  elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
 
241
  Download and combine selected job files from the AWS Textract service.
242
  '''
243
 
244
+ #print("s3_output_key_prefix at download:", s3_output_key_prefix)
245
+
246
  list_response = s3_client.list_objects_v2(
247
  Bucket=s3_bucket_name,
248
  Prefix=s3_output_key_prefix
 
328
  raise Exception("Please provide a job ID.")
329
  return
330
 
331
def load_pdf_job_file_from_s3(
    load_s3_jobs_input_loc,
    pdf_filename,
    local_output_dir,
    s3_bucket_name,
    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
    """
    Download the original input PDF for a whole-document Textract job from S3.

    Args:
        load_s3_jobs_input_loc: S3 subfolder (prefix) that holds the job input PDFs.
        pdf_filename: File name of the PDF without its extension.
        local_output_dir: Local folder to download the PDF into.
        s3_bucket_name: Name of the S3 bucket holding the file.
        RUN_AWS_FUNCTIONS: Flag passed through to the S3 download helper.

    Returns:
        Tuple of (pdf_file_location, doc_file_name_no_extension_textbox):
        a one-element list with the local PDF path and the file name without
        extension on success; an empty list and empty string on failure.
    """
    # Use an empty list (not '') on failure so callers always receive a list,
    # matching the success path and what get_input_file_names expects.
    pdf_file_location = []
    doc_file_name_no_extension_textbox = ''

    try:
        print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)

        # The stored job name has no extension, so append ".pdf" to both the
        # S3 key and the local target path.
        s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/") + ".pdf"
        print("s3_input_key_prefix:", s3_input_key_prefix)

        local_input_file_path = os.path.join(local_output_dir, pdf_filename) + ".pdf"

        print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)

        download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)

        pdf_file_location = [local_input_file_path]
        doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
    except Exception as e:
        # Best-effort: report the failure and return the empty defaults.
        print("Could not download PDF job file from S3 due to:", e)

    return pdf_file_location, doc_file_name_no_extension_textbox
360
+
361
def replace_existing_pdf_input_for_whole_document_outputs(
    load_s3_jobs_input_loc:str,
    pdf_filename:str,
    local_output_dir:str,
    s3_bucket_name:str,
    in_doc_files:FileData=[],
    input_folder:str=INPUT_FOLDER,
    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
    progress = gr.Progress(track_tqdm=True)):
    """
    Ensure the PDF matching a finished whole-document Textract job is loaded in the app.

    If a document with the same (extension-less) name is already loaded in the
    app, reuse it; otherwise download the job's original input PDF from S3.

    Args:
        load_s3_jobs_input_loc: S3 subfolder (prefix) that holds the job input PDFs.
        pdf_filename: File name of the job's PDF, without extension.
        local_output_dir: Local folder to download the PDF into if needed.
        s3_bucket_name: Name of the S3 bucket holding the file.
        in_doc_files: Documents currently loaded into the app, if any.
        input_folder: App input folder (kept for interface compatibility).
        RUN_AWS_FUNCTIONS: Flag passed through to the S3 download helper.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (file location, name without extension, name with extension,
        full file name, file name list, total PDF page count) as produced by
        get_input_file_names.
    """
    progress(0.1, "Loading PDF from s3")

    downloaded_pdf_file_location = None

    if in_doc_files:
        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(in_doc_files)

        if pdf_filename == doc_file_name_no_extension_textbox:
            # Already loaded — reuse the in-app file and its derived names.
            print("Existing loaded PDF file has same name as file from S3")
            downloaded_pdf_file_location = in_doc_files

    if downloaded_pdf_file_location is None:
        # No matching file already loaded: fetch the job's input PDF from S3
        # and derive the file-name outputs from the downloaded file.
        downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)

        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)

    return downloaded_pdf_file_location, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count
390
+
391
+ def poll_whole_document_textract_analysis_progress_and_download(
392
  job_id:str,
393
  job_type_dropdown:str,
394
  s3_output_prefix: str,
 
399
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
400
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
401
  aws_region: str = AWS_REGION, # Optional: specify region if not default
402
+ load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
403
  poll_interval_seconds: int = 1,
404
+ max_polling_attempts: int = 1, # Single polling attempt per call; the user can re-poll from the UI
405
+ progress = gr.Progress(track_tqdm=True)
406
  ):
407
  '''
408
  Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
409
  '''
410
 
411
+ progress(0.1, "Querying AWS Textract for status of document analysis job")
412
+
413
  if job_id:
414
  # Initialize boto3 clients
415
  session = boto3.Session(region_name=aws_region)
 
434
  print(f"Failed to update job details dataframe: {e}")
435
  #raise
436
 
437
+ while job_status == 'IN_PROGRESS' and attempts <= max_polling_attempts:
438
  attempts += 1
439
  try:
440
  if job_type_dropdown=="document_analysis":
 
463
  downloaded_file_path = None
464
  if job_status == 'SUCCEEDED':
465
  #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
466
+ # 3b - Replace PDF file name if it exists in the job dataframe
467
+
468
+ progress(0.5, "Document analysis task outputs found. Downloading from S3")
469
 
470
  # If job_df is not empty
471
  if not job_df.empty:
 
481
  else:
482
  pdf_filename = "unknown_file"
483
 
 
484
  # --- 4. Download Output JSON from S3 ---
485
  # Textract typically creates output under s3_output_prefix/job_id/
486
  # There might be multiple JSON files if pagination occurred during writing.
487
  # Usually, for smaller docs, there's one file, often named '1'.
488
+ # For robust handling, list objects and find the JSON(s).
 
489
 
490
  s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
491
  logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
 
505
 
506
  else:
507
  raise Exception("No Job ID provided.")
508
+
509
+ output_pdf_filename = get_file_name_without_type(pdf_filename)
510
 
511
+ return downloaded_file_path, job_status, job_df, output_pdf_filename
512
 
513
  def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
514
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,