Commit
·
ef4000e
1
Parent(s):
c8ffcd4
Local text redaction now produces ocr results with words json and can make dataframe format
Browse files- app.py +23 -22
- tools/file_conversion.py +17 -14
- tools/file_redaction.py +245 -41
- tools/helper_functions.py +19 -5
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import pandas as pd
|
|
| 3 |
import gradio as gr
|
| 4 |
from gradio_image_annotation import image_annotator
|
| 5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
| 6 |
-
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select,
|
| 7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
| 8 |
from tools.file_redaction import choose_and_run_redactor
|
| 9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
|
@@ -63,7 +63,7 @@ with app:
|
|
| 63 |
###
|
| 64 |
|
| 65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
| 66 |
-
pdf_doc_state = gr.State([])
|
| 67 |
all_image_annotations_state = gr.State([])
|
| 68 |
|
| 69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
|
@@ -211,7 +211,7 @@ with app:
|
|
| 211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
| 212 |
|
| 213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
| 214 |
-
|
| 215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
| 216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
| 217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
|
@@ -274,7 +274,7 @@ with app:
|
|
| 274 |
with gr.Row(equal_height=True):
|
| 275 |
with gr.Column(scale=1):
|
| 276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
| 277 |
-
|
| 278 |
with gr.Column(scale=4):
|
| 279 |
with gr.Row(equal_height=True):
|
| 280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
|
@@ -576,7 +576,8 @@ with app:
|
|
| 576 |
if SHOW_COSTS == 'True':
|
| 577 |
# Calculate costs
|
| 578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 579 |
-
text_extract_method_radio.change(
|
|
|
|
| 580 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 581 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 582 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
|
@@ -584,14 +585,14 @@ with app:
|
|
| 584 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 585 |
|
| 586 |
# Calculate time taken
|
| 587 |
-
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 588 |
-
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 589 |
-
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 590 |
-
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 591 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 592 |
-
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 593 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
| 594 |
-
|
| 595 |
|
| 596 |
# Allow user to select items from cost code dataframe for cost code
|
| 597 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
|
@@ -601,9 +602,9 @@ with app:
|
|
| 601 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
| 602 |
|
| 603 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 604 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
| 605 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 606 |
-
success(fn=
|
| 607 |
|
| 608 |
# Run redaction function
|
| 609 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
|
@@ -620,7 +621,7 @@ with app:
|
|
| 620 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
| 621 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
| 622 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 623 |
-
success(fn=
|
| 624 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
| 625 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
| 626 |
|
|
@@ -640,9 +641,9 @@ with app:
|
|
| 640 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
| 641 |
|
| 642 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 643 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
| 644 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 645 |
-
success(fn=
|
| 646 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
| 647 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
| 648 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
|
@@ -657,7 +658,7 @@ with app:
|
|
| 657 |
# Upload previous files for modifying redactions
|
| 658 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 659 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 660 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
| 661 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
| 662 |
|
| 663 |
# Manual updates to review di
|
|
@@ -753,12 +754,12 @@ with app:
|
|
| 753 |
|
| 754 |
# Convert review file to xfdf Adobe format
|
| 755 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 756 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
| 757 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
| 758 |
|
| 759 |
# Convert xfdf Adobe file back to review_file.csv
|
| 760 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 761 |
-
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
| 762 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
| 763 |
|
| 764 |
###
|
|
@@ -779,7 +780,7 @@ with app:
|
|
| 779 |
###
|
| 780 |
# IDENTIFY DUPLICATE PAGES
|
| 781 |
###
|
| 782 |
-
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
| 783 |
|
| 784 |
find_duplicate_pages_btn.click(
|
| 785 |
fn=run_duplicate_analysis,
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
from gradio_image_annotation import image_annotator
|
| 5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
| 6 |
+
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
|
| 7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
| 8 |
from tools.file_redaction import choose_and_run_redactor
|
| 9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
|
|
|
| 63 |
###
|
| 64 |
|
| 65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
| 66 |
+
pdf_doc_state = gr.State([])
|
| 67 |
all_image_annotations_state = gr.State([])
|
| 68 |
|
| 69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
|
|
|
| 211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
| 212 |
|
| 213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
| 214 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=False)
|
| 215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
| 216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
| 217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
|
|
|
| 274 |
with gr.Row(equal_height=True):
|
| 275 |
with gr.Column(scale=1):
|
| 276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
| 277 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
|
| 278 |
with gr.Column(scale=4):
|
| 279 |
with gr.Row(equal_height=True):
|
| 280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
|
|
|
| 576 |
if SHOW_COSTS == 'True':
|
| 577 |
# Calculate costs
|
| 578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 579 |
+
text_extract_method_radio.change(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
| 580 |
+
success(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 581 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 582 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 583 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
|
|
|
| 585 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
| 586 |
|
| 587 |
# Calculate time taken
|
| 588 |
+
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 589 |
+
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 590 |
+
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 591 |
+
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 592 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 593 |
+
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 594 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 595 |
+
relevant_ocr_output_with_words_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
| 596 |
|
| 597 |
# Allow user to select items from cost code dataframe for cost code
|
| 598 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
|
|
|
| 602 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
| 603 |
|
| 604 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 605 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
| 606 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 607 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
|
| 608 |
|
| 609 |
# Run redaction function
|
| 610 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
|
|
|
| 621 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
| 622 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
| 623 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 624 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
| 625 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
| 626 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
| 627 |
|
|
|
|
| 641 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
| 642 |
|
| 643 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 644 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
| 645 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
| 646 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
| 647 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
| 648 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
| 649 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
|
|
|
| 658 |
# Upload previous files for modifying redactions
|
| 659 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 660 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 661 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox], api_name="prepare_doc").\
|
| 662 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
| 663 |
|
| 664 |
# Manual updates to review di
|
|
|
|
| 754 |
|
| 755 |
# Convert review file to xfdf Adobe format
|
| 756 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 757 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
| 758 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
| 759 |
|
| 760 |
# Convert xfdf Adobe file back to review_file.csv
|
| 761 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
| 762 |
+
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
| 763 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
| 764 |
|
| 765 |
###
|
|
|
|
| 780 |
###
|
| 781 |
# IDENTIFY DUPLICATE PAGES
|
| 782 |
###
|
| 783 |
+
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox])
|
| 784 |
|
| 785 |
find_duplicate_pages_btn.click(
|
| 786 |
fn=run_duplicate_analysis,
|
tools/file_conversion.py
CHANGED
|
@@ -454,7 +454,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
| 454 |
|
| 455 |
def prepare_image_or_pdf(
|
| 456 |
file_paths: List[str],
|
| 457 |
-
|
| 458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
| 459 |
latest_file_completed: int = 0,
|
| 460 |
out_message: List[str] = [],
|
|
@@ -468,7 +468,7 @@ def prepare_image_or_pdf(
|
|
| 468 |
prepare_images:bool=True,
|
| 469 |
page_sizes:list[dict]=[],
|
| 470 |
textract_output_found:bool = False,
|
| 471 |
-
|
| 472 |
progress: Progress = Progress(track_tqdm=True)
|
| 473 |
) -> tuple[List[str], List[str]]:
|
| 474 |
"""
|
|
@@ -479,7 +479,7 @@ def prepare_image_or_pdf(
|
|
| 479 |
|
| 480 |
Args:
|
| 481 |
file_paths (List[str]): List of file paths to process.
|
| 482 |
-
|
| 483 |
latest_file_completed (optional, int): Index of the last completed file.
|
| 484 |
out_message (optional, List[str]): List to store output messages.
|
| 485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
|
@@ -491,7 +491,7 @@ def prepare_image_or_pdf(
|
|
| 491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
| 492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
| 493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
| 494 |
-
|
| 495 |
progress (optional, Progress): Progress tracker for the operation
|
| 496 |
|
| 497 |
|
|
@@ -542,7 +542,7 @@ def prepare_image_or_pdf(
|
|
| 542 |
final_out_message = '\n'.join(out_message)
|
| 543 |
else:
|
| 544 |
final_out_message = out_message
|
| 545 |
-
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
| 546 |
|
| 547 |
progress(0.1, desc='Preparing file')
|
| 548 |
|
|
@@ -599,8 +599,8 @@ def prepare_image_or_pdf(
|
|
| 599 |
|
| 600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
| 601 |
# Check if the file is an image type and the user selected text ocr option
|
| 602 |
-
if file_extension in ['.jpg', '.jpeg', '.png'] and
|
| 603 |
-
|
| 604 |
|
| 605 |
# Convert image to a pymupdf document
|
| 606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
|
@@ -663,15 +663,18 @@ def prepare_image_or_pdf(
|
|
| 663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
| 664 |
print("Saving local OCR output")
|
| 665 |
# Copy it to the output folder so it can be used later.
|
| 666 |
-
output_ocr_results_with_words_json_file_name = file_path_without_ext
|
| 667 |
-
if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
| 668 |
-
else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
| 669 |
|
| 670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
| 671 |
|
| 672 |
# Use shutil to copy the file directly
|
| 673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
| 674 |
-
|
|
|
|
|
|
|
|
|
|
| 675 |
continue
|
| 676 |
|
| 677 |
# NEW IF STATEMENT
|
|
@@ -768,13 +771,13 @@ def prepare_image_or_pdf(
|
|
| 768 |
|
| 769 |
# Must be something else, return with error message
|
| 770 |
else:
|
| 771 |
-
if
|
| 772 |
if is_pdf_or_image(file_path) == False:
|
| 773 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
| 774 |
print(out_message)
|
| 775 |
raise Exception(out_message)
|
| 776 |
|
| 777 |
-
elif
|
| 778 |
if is_pdf(file_path) == False:
|
| 779 |
out_message = "Please upload a PDF file for text analysis."
|
| 780 |
print(out_message)
|
|
@@ -793,7 +796,7 @@ def prepare_image_or_pdf(
|
|
| 793 |
|
| 794 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
| 795 |
|
| 796 |
-
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
| 797 |
|
| 798 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
| 799 |
"""
|
|
|
|
| 454 |
|
| 455 |
def prepare_image_or_pdf(
|
| 456 |
file_paths: List[str],
|
| 457 |
+
text_extract_method: str,
|
| 458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
| 459 |
latest_file_completed: int = 0,
|
| 460 |
out_message: List[str] = [],
|
|
|
|
| 468 |
prepare_images:bool=True,
|
| 469 |
page_sizes:list[dict]=[],
|
| 470 |
textract_output_found:bool = False,
|
| 471 |
+
relevant_ocr_output_with_words_found:bool = False,
|
| 472 |
progress: Progress = Progress(track_tqdm=True)
|
| 473 |
) -> tuple[List[str], List[str]]:
|
| 474 |
"""
|
|
|
|
| 479 |
|
| 480 |
Args:
|
| 481 |
file_paths (List[str]): List of file paths to process.
|
| 482 |
+
text_extract_method (str): The redaction method to use.
|
| 483 |
latest_file_completed (optional, int): Index of the last completed file.
|
| 484 |
out_message (optional, List[str]): List to store output messages.
|
| 485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
|
|
|
| 491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
| 492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
| 493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
| 494 |
+
relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
|
| 495 |
progress (optional, Progress): Progress tracker for the operation
|
| 496 |
|
| 497 |
|
|
|
|
| 542 |
final_out_message = '\n'.join(out_message)
|
| 543 |
else:
|
| 544 |
final_out_message = out_message
|
| 545 |
+
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
| 546 |
|
| 547 |
progress(0.1, desc='Preparing file')
|
| 548 |
|
|
|
|
| 599 |
|
| 600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
| 601 |
# Check if the file is an image type and the user selected text ocr option
|
| 602 |
+
if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
| 603 |
+
text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
|
| 604 |
|
| 605 |
# Convert image to a pymupdf document
|
| 606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
|
|
|
| 663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
| 664 |
print("Saving local OCR output")
|
| 665 |
# Copy it to the output folder so it can be used later.
|
| 666 |
+
output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
| 667 |
+
# if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
| 668 |
+
# else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
| 669 |
|
| 670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
| 671 |
|
| 672 |
# Use shutil to copy the file directly
|
| 673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
| 674 |
+
|
| 675 |
+
if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
|
| 676 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
|
| 677 |
+
if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
|
| 678 |
continue
|
| 679 |
|
| 680 |
# NEW IF STATEMENT
|
|
|
|
| 771 |
|
| 772 |
# Must be something else, return with error message
|
| 773 |
else:
|
| 774 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 775 |
if is_pdf_or_image(file_path) == False:
|
| 776 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
| 777 |
print(out_message)
|
| 778 |
raise Exception(out_message)
|
| 779 |
|
| 780 |
+
elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
| 781 |
if is_pdf(file_path) == False:
|
| 782 |
out_message = "Please upload a PDF file for text analysis."
|
| 783 |
print(out_message)
|
|
|
|
| 796 |
|
| 797 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
| 798 |
|
| 799 |
+
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
| 800 |
|
| 801 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
| 802 |
"""
|
tools/file_redaction.py
CHANGED
|
@@ -8,7 +8,7 @@ import copy
|
|
| 8 |
|
| 9 |
from tqdm import tqdm
|
| 10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
| 11 |
-
from typing import List, Dict, Tuple, Optional
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
from pdfminer.high_level import extract_pages
|
|
@@ -59,6 +59,49 @@ def sum_numbers_before_seconds(string:str):
|
|
| 59 |
|
| 60 |
return sum_of_numbers
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
def choose_and_run_redactor(file_paths:List[str],
|
| 63 |
prepared_pdf_file_paths:List[str],
|
| 64 |
pdf_image_file_paths:List[str],
|
|
@@ -499,7 +542,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 499 |
# Analyse text-based pdf
|
| 500 |
print('Redacting file as text-based PDF')
|
| 501 |
|
| 502 |
-
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(
|
| 503 |
file_path,
|
| 504 |
language,
|
| 505 |
chosen_redact_entities,
|
|
@@ -513,6 +556,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 513 |
all_line_level_ocr_results_df,
|
| 514 |
all_pages_decision_process_table,
|
| 515 |
pymupdf_doc,
|
|
|
|
| 516 |
pii_identification_method,
|
| 517 |
comprehend_query_number,
|
| 518 |
comprehend_client,
|
|
@@ -522,7 +566,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 522 |
match_fuzzy_whole_phrase_bool,
|
| 523 |
page_sizes_df,
|
| 524 |
document_cropboxes,
|
| 525 |
-
text_extraction_only
|
|
|
|
| 526 |
else:
|
| 527 |
out_message = "No redaction method selected"
|
| 528 |
print(out_message)
|
|
@@ -536,9 +581,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 536 |
current_loop_page = 999
|
| 537 |
|
| 538 |
if latest_file_completed != len(file_paths_list):
|
| 539 |
-
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
| 540 |
-
|
| 541 |
-
|
| 542 |
|
| 543 |
# Save redacted file
|
| 544 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
@@ -572,6 +615,30 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 572 |
|
| 573 |
duplication_file_path_outputs.append(ocr_file_path)
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
# Convert the gradio annotation boxes to relative coordinates
|
| 576 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
| 577 |
progress(0.93, "Creating review file output")
|
|
@@ -1343,7 +1410,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1343 |
|
| 1344 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
| 1345 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 1346 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "
|
| 1347 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
| 1348 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
| 1349 |
|
|
@@ -1662,32 +1729,37 @@ def redact_image_pdf(file_path:str,
|
|
| 1662 |
# Append new annotation if it doesn't exist
|
| 1663 |
annotations_all_pages.append(page_image_annotations)
|
| 1664 |
|
| 1665 |
-
|
| 1666 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 1667 |
if original_textract_data != textract_data:
|
| 1668 |
# Write the updated existing textract data back to the JSON file
|
| 1669 |
with open(textract_json_file_path, 'w') as json_file:
|
| 1670 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
| 1671 |
|
| 1672 |
-
|
| 1673 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1674 |
|
| 1675 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 1676 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
| 1677 |
# Write the updated existing textract data back to the JSON file
|
|
|
|
| 1678 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
| 1679 |
-
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
| 1680 |
|
| 1681 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
| 1682 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
| 1683 |
|
| 1684 |
-
#all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
| 1685 |
-
#all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
| 1686 |
-
|
| 1687 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
| 1688 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
| 1689 |
|
| 1690 |
-
|
| 1691 |
current_loop_page += 1
|
| 1692 |
|
| 1693 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
|
@@ -1784,22 +1856,21 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
| 1784 |
return characters
|
| 1785 |
return []
|
| 1786 |
|
| 1787 |
-
def
|
| 1788 |
'''
|
| 1789 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
| 1790 |
'''
|
| 1791 |
|
| 1792 |
line_level_results_out = []
|
| 1793 |
line_level_characters_out = []
|
| 1794 |
-
|
| 1795 |
-
character_objects_out = []
|
| 1796 |
-
# character_text_objects_out = []
|
| 1797 |
|
| 1798 |
# Initialize variables
|
| 1799 |
full_text = ""
|
| 1800 |
added_text = ""
|
| 1801 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
| 1802 |
-
|
| 1803 |
|
| 1804 |
# Iterate through the character objects
|
| 1805 |
current_word = ""
|
|
@@ -1813,7 +1884,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
| 1813 |
# character_text_objects_out.append(character_text)
|
| 1814 |
|
| 1815 |
if isinstance(char, LTAnno):
|
| 1816 |
-
|
| 1817 |
added_text = char.get_text()
|
| 1818 |
|
| 1819 |
# Handle double quotes
|
|
@@ -1822,17 +1892,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
| 1822 |
# Handle space separately by finalizing the word
|
| 1823 |
full_text += added_text # Adds space or newline
|
| 1824 |
|
| 1825 |
-
if current_word: # Only
|
| 1826 |
-
|
| 1827 |
current_word = ""
|
| 1828 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
| 1829 |
|
| 1830 |
# Check for line break (assuming a new line is indicated by a specific character)
|
| 1831 |
if '\n' in added_text:
|
| 1832 |
|
| 1833 |
-
#
|
| 1834 |
if current_word:
|
| 1835 |
-
|
| 1836 |
# Create an OCRResult for the current line
|
| 1837 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
| 1838 |
line_level_characters_out.append(character_objects_out)
|
|
@@ -1872,23 +1942,138 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
| 1872 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
| 1873 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
| 1874 |
|
| 1875 |
-
#
|
| 1876 |
if current_word:
|
| 1877 |
-
|
| 1878 |
|
| 1879 |
if full_text:
|
|
|
|
| 1880 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
| 1881 |
# Convert special characters to a human-readable format
|
| 1882 |
|
| 1883 |
full_text = clean_unicode_text(full_text)
|
| 1884 |
full_text = full_text.strip()
|
| 1885 |
|
|
|
|
| 1886 |
|
| 1887 |
-
|
| 1888 |
|
| 1889 |
-
|
| 1890 |
|
| 1891 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1892 |
|
| 1893 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
| 1894 |
decision_process_table = pd.DataFrame()
|
|
@@ -1938,7 +2123,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
| 1938 |
return pikepdf_redaction_annotations_on_page
|
| 1939 |
|
| 1940 |
def redact_text_pdf(
|
| 1941 |
-
|
| 1942 |
language: str, # Language of the PDF content
|
| 1943 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
| 1944 |
chosen_redact_comprehend_entities: List[str],
|
|
@@ -1951,6 +2136,7 @@ def redact_text_pdf(
|
|
| 1951 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
| 1952 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
| 1953 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
|
|
|
| 1954 |
pii_identification_method: str = "Local",
|
| 1955 |
comprehend_query_number:int = 0,
|
| 1956 |
comprehend_client="",
|
|
@@ -1961,6 +2147,7 @@ def redact_text_pdf(
|
|
| 1961 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
| 1962 |
original_cropboxes:List[dict]=[],
|
| 1963 |
text_extraction_only:bool=False,
|
|
|
|
| 1964 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
| 1965 |
max_time: int = int(MAX_TIME_VALUE),
|
| 1966 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
@@ -1970,7 +2157,7 @@ def redact_text_pdf(
|
|
| 1970 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
| 1971 |
|
| 1972 |
Input Variables:
|
| 1973 |
-
-
|
| 1974 |
- language: Language of the PDF content
|
| 1975 |
- chosen_redact_entities: List of entities to be redacted
|
| 1976 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
|
@@ -1994,6 +2181,7 @@ def redact_text_pdf(
|
|
| 1994 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
| 1995 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
| 1996 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
|
|
|
| 1997 |
- page_break_val: Value for page break
|
| 1998 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 1999 |
- progress: Progress tracking object
|
|
@@ -2023,8 +2211,13 @@ def redact_text_pdf(
|
|
| 2023 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
| 2024 |
|
| 2025 |
# Open with Pikepdf to get text lines
|
| 2026 |
-
pikepdf_pdf = Pdf.open(
|
| 2027 |
-
number_of_pages = len(pikepdf_pdf.pages)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2028 |
|
| 2029 |
# Check that page_min and page_max are within expected ranges
|
| 2030 |
if page_max > number_of_pages or page_max == 0:
|
|
@@ -2056,7 +2249,7 @@ def redact_text_pdf(
|
|
| 2056 |
|
| 2057 |
if page_min <= page_no < page_max:
|
| 2058 |
# Go page by page
|
| 2059 |
-
for page_layout in extract_pages(
|
| 2060 |
|
| 2061 |
all_page_line_text_extraction_characters = []
|
| 2062 |
all_page_line_level_text_extraction_results_list = []
|
|
@@ -2068,14 +2261,18 @@ def redact_text_pdf(
|
|
| 2068 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
| 2069 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
| 2070 |
|
|
|
|
| 2071 |
for n, text_container in enumerate(page_layout):
|
| 2072 |
characters = []
|
| 2073 |
|
| 2074 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
| 2075 |
characters = get_text_container_characters(text_container)
|
|
|
|
| 2076 |
|
| 2077 |
# Create dataframe for all the text on the page
|
| 2078 |
-
line_level_text_results_list, line_characters =
|
|
|
|
|
|
|
| 2079 |
|
| 2080 |
### Create page_text_ocr_outputs (OCR format outputs)
|
| 2081 |
if line_level_text_results_list:
|
|
@@ -2093,6 +2290,7 @@ def redact_text_pdf(
|
|
| 2093 |
|
| 2094 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
| 2095 |
all_page_line_text_extraction_characters.extend(line_characters)
|
|
|
|
| 2096 |
|
| 2097 |
### REDACTION
|
| 2098 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
@@ -2143,9 +2341,9 @@ def redact_text_pdf(
|
|
| 2143 |
|
| 2144 |
# Join extracted text outputs for all lines together
|
| 2145 |
if not page_text_ocr_outputs.empty:
|
| 2146 |
-
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
| 2147 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
| 2148 |
-
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
| 2149 |
|
| 2150 |
toc = time.perf_counter()
|
| 2151 |
|
|
@@ -2174,7 +2372,7 @@ def redact_text_pdf(
|
|
| 2174 |
|
| 2175 |
current_loop_page += 1
|
| 2176 |
|
| 2177 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
| 2178 |
|
| 2179 |
# Check if the image already exists in annotations_all_pages
|
| 2180 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
|
@@ -2195,7 +2393,7 @@ def redact_text_pdf(
|
|
| 2195 |
# Write logs
|
| 2196 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
| 2197 |
|
| 2198 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
| 2199 |
|
| 2200 |
# Write all page outputs
|
| 2201 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
|
@@ -2222,5 +2420,11 @@ def redact_text_pdf(
|
|
| 2222 |
if not all_line_level_ocr_results_df.empty:
|
| 2223 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
| 2224 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2225 |
|
| 2226 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
|
|
|
| 8 |
|
| 9 |
from tqdm import tqdm
|
| 10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
| 11 |
+
from typing import List, Dict, Tuple, Optional, Any
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
from pdfminer.high_level import extract_pages
|
|
|
|
| 59 |
|
| 60 |
return sum_of_numbers
|
| 61 |
|
| 62 |
+
def merge_page_results(data):
    """
    Collapse a list of per-page OCR result dictionaries into one entry per page.

    Each element of ``data`` is expected to look like
    ``{"page": <page>, "results": {...}}``. Elements sharing the same page
    value have their ``"results"`` dicts merged together, so the output
    contains exactly one dictionary per distinct page, in first-seen order.

    Args:
        data: Iterable of dicts with a "page" key and an optional "results" dict.

    Returns:
        List of merged page dictionaries, one per distinct page.
    """
    merged = {}

    for entry in data:
        page_key = entry["page"]
        # Create the accumulator for this page on first sight; subsequent
        # entries for the same page only contribute their line-level results.
        page_record = merged.setdefault(page_key, {"page": page_key, "results": {}})
        page_record["results"].update(entry.get("results", {}))

    return list(merged.values())
|
| 78 |
+
|
| 79 |
+
def word_level_ocr_output_to_dataframe(ocr_result: list) -> pd.DataFrame:
    """
    Flatten word-level OCR output into a tidy DataFrame, one row per word.

    Fix: the previous implementation took only ``ocr_result[0]`` and therefore
    silently dropped every page after the first, even though callers pass the
    merged multi-page list produced by ``merge_page_results``. All pages are
    now converted.

    Args:
        ocr_result: List of per-page dicts shaped like
            ``{"page": <page>, "results": {"text_line_n": {"line": n,
            "text": ..., "bounding_box": [x0, y0, x1, y1],
            "words": [{"text": ..., "bounding_box": [x0, y0, x1, y1]}, ...]}}}``.

    Returns:
        DataFrame with columns: page, line, word_text, word_x0/y0/x1/y1,
        line_text, line_x0/y0/x1/y1. Empty DataFrame if there are no words.
    """
    rows = []

    for page_result in ocr_result:
        page_number = int(page_result['page'])

        for line_key, line_data in page_result['results'].items():
            line_number = int(line_data['line'])
            line_text = line_data['text']
            line_bbox = line_data['bounding_box']

            for word in line_data['words']:
                word_bbox = word['bounding_box']
                rows.append({
                    'page': page_number,
                    'line': line_number,
                    'word_text': word['text'],
                    'word_x0': word_bbox[0],
                    'word_y0': word_bbox[1],
                    'word_x1': word_bbox[2],
                    'word_y1': word_bbox[3],
                    'line_text': line_text,
                    'line_x0': line_bbox[0],
                    'line_y0': line_bbox[1],
                    'line_x1': line_bbox[2],
                    'line_y1': line_bbox[3],
                })

    return pd.DataFrame(rows)
|
| 104 |
+
|
| 105 |
def choose_and_run_redactor(file_paths:List[str],
|
| 106 |
prepared_pdf_file_paths:List[str],
|
| 107 |
pdf_image_file_paths:List[str],
|
|
|
|
| 542 |
# Analyse text-based pdf
|
| 543 |
print('Redacting file as text-based PDF')
|
| 544 |
|
| 545 |
+
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
|
| 546 |
file_path,
|
| 547 |
language,
|
| 548 |
chosen_redact_entities,
|
|
|
|
| 556 |
all_line_level_ocr_results_df,
|
| 557 |
all_pages_decision_process_table,
|
| 558 |
pymupdf_doc,
|
| 559 |
+
[], # All line level ocr results with words
|
| 560 |
pii_identification_method,
|
| 561 |
comprehend_query_number,
|
| 562 |
comprehend_client,
|
|
|
|
| 566 |
match_fuzzy_whole_phrase_bool,
|
| 567 |
page_sizes_df,
|
| 568 |
document_cropboxes,
|
| 569 |
+
text_extraction_only,
|
| 570 |
+
output_folder=output_folder)
|
| 571 |
else:
|
| 572 |
out_message = "No redaction method selected"
|
| 573 |
print(out_message)
|
|
|
|
| 581 |
current_loop_page = 999
|
| 582 |
|
| 583 |
if latest_file_completed != len(file_paths_list):
|
| 584 |
+
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
|
|
|
|
|
|
| 585 |
|
| 586 |
# Save redacted file
|
| 587 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
|
| 615 |
|
| 616 |
duplication_file_path_outputs.append(ocr_file_path)
|
| 617 |
|
| 618 |
+
if all_page_line_level_ocr_results_with_words:
|
| 619 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
| 620 |
+
|
| 621 |
+
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
| 622 |
+
|
| 623 |
+
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
| 624 |
+
|
| 625 |
+
file_name = get_file_name_without_type(file_path)
|
| 626 |
+
|
| 627 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
|
| 628 |
+
|
| 629 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
| 630 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
| 631 |
+
|
| 632 |
+
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
| 633 |
+
|
| 634 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
| 635 |
+
|
| 636 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
| 637 |
+
|
| 638 |
+
all_page_line_level_ocr_results_with_words_df_file_path = output_folder + file_name + "_ocr_results_with_words.csv"
|
| 639 |
+
|
| 640 |
+
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path)
|
| 641 |
+
|
| 642 |
# Convert the gradio annotation boxes to relative coordinates
|
| 643 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
| 644 |
progress(0.93, "Creating review file output")
|
|
|
|
| 1410 |
|
| 1411 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
| 1412 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 1413 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
| 1414 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
| 1415 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
| 1416 |
|
|
|
|
| 1729 |
# Append new annotation if it doesn't exist
|
| 1730 |
annotations_all_pages.append(page_image_annotations)
|
| 1731 |
|
| 1732 |
+
# Save word level options
|
| 1733 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 1734 |
if original_textract_data != textract_data:
|
| 1735 |
# Write the updated existing textract data back to the JSON file
|
| 1736 |
with open(textract_json_file_path, 'w') as json_file:
|
| 1737 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
| 1738 |
|
| 1739 |
+
if textract_json_file_path not in log_files_output_paths:
|
| 1740 |
+
log_files_output_paths.append(textract_json_file_path)
|
| 1741 |
+
|
| 1742 |
+
all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
|
| 1743 |
+
|
| 1744 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
|
| 1745 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
| 1746 |
+
|
| 1747 |
+
if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
|
| 1748 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
|
| 1749 |
|
| 1750 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 1751 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
| 1752 |
# Write the updated existing textract data back to the JSON file
|
| 1753 |
+
|
| 1754 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
| 1755 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
| 1756 |
|
| 1757 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
| 1758 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
| 1759 |
|
|
|
|
|
|
|
|
|
|
| 1760 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
| 1761 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
| 1762 |
|
|
|
|
| 1763 |
current_loop_page += 1
|
| 1764 |
|
| 1765 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
|
|
|
| 1856 |
return characters
|
| 1857 |
return []
|
| 1858 |
|
| 1859 |
+
def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
| 1860 |
'''
|
| 1861 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
| 1862 |
'''
|
| 1863 |
|
| 1864 |
line_level_results_out = []
|
| 1865 |
line_level_characters_out = []
|
| 1866 |
+
line_level_words_out = {}
|
| 1867 |
+
character_objects_out = []
|
|
|
|
| 1868 |
|
| 1869 |
# Initialize variables
|
| 1870 |
full_text = ""
|
| 1871 |
added_text = ""
|
| 1872 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
| 1873 |
+
line_bboxes = []
|
| 1874 |
|
| 1875 |
# Iterate through the character objects
|
| 1876 |
current_word = ""
|
|
|
|
| 1884 |
# character_text_objects_out.append(character_text)
|
| 1885 |
|
| 1886 |
if isinstance(char, LTAnno):
|
|
|
|
| 1887 |
added_text = char.get_text()
|
| 1888 |
|
| 1889 |
# Handle double quotes
|
|
|
|
| 1892 |
# Handle space separately by finalizing the word
|
| 1893 |
full_text += added_text # Adds space or newline
|
| 1894 |
|
| 1895 |
+
if current_word: # Only finalise if there is a current word
|
| 1896 |
+
line_bboxes.append((current_word, current_word_bbox))
|
| 1897 |
current_word = ""
|
| 1898 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
| 1899 |
|
| 1900 |
# Check for line break (assuming a new line is indicated by a specific character)
|
| 1901 |
if '\n' in added_text:
|
| 1902 |
|
| 1903 |
+
# finalise the current line
|
| 1904 |
if current_word:
|
| 1905 |
+
line_bboxes.append((current_word, current_word_bbox))
|
| 1906 |
# Create an OCRResult for the current line
|
| 1907 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
| 1908 |
line_level_characters_out.append(character_objects_out)
|
|
|
|
| 1942 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
| 1943 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
| 1944 |
|
| 1945 |
+
# Finalise the last word if any
|
| 1946 |
if current_word:
|
| 1947 |
+
line_bboxes.append((current_word, current_word_bbox))
|
| 1948 |
|
| 1949 |
if full_text:
|
| 1950 |
+
print("full_text found")
|
| 1951 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
| 1952 |
# Convert special characters to a human-readable format
|
| 1953 |
|
| 1954 |
full_text = clean_unicode_text(full_text)
|
| 1955 |
full_text = full_text.strip()
|
| 1956 |
|
| 1957 |
+
line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
|
| 1958 |
|
| 1959 |
+
line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
|
| 1960 |
|
| 1961 |
+
line_level_results_out.append(line_ocr_result)
|
| 1962 |
|
| 1963 |
+
else:
|
| 1964 |
+
line_ocr_result_bbox = []
|
| 1965 |
+
|
| 1966 |
+
# if line_ocr_result_bbox:
|
| 1967 |
+
# line_level_words_out["page"] = 1
|
| 1968 |
+
# line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
|
| 1969 |
+
# else:
|
| 1970 |
+
# line_level_words_out = {}
|
| 1971 |
+
|
| 1972 |
+
|
| 1973 |
+
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
| 1974 |
+
|
| 1975 |
+
def generate_word_level_ocr(char_objects: List, page_number: int, text_line_number: int) -> Dict[str, Any]:
    """
    Generate a dictionary with line- and word-level OCR results from pdfminer.six objects.

    Handles real-world pdfminer.six output by:
    1. Filtering out non-character (LTAnno) objects, which carry no coordinate data.
    2. Sorting the remaining LTChar objects into reading order (top-to-bottom,
       then left-to-right).
    3. Using adaptive, font-size-based thresholds to detect word gaps and line breaks.

    Args:
        char_objects: A mixed list of pdfminer.six LTChar and LTAnno objects
            from a single page.
        page_number: The page number the characters belong to (stored as a string
            in the output).
        text_line_number: Starting line number for the first detected line;
            incremented for each subsequent non-empty line.

    Returns:
        A dict of the form {"page": "<n>", "results": {"text_line_<k>":
        {"line": k, "text": ..., "bounding_box": [x0, y0, x1, y1],
        "words": [{"text": ..., "bounding_box": [...]}, ...]}}}.
    """
    # LTAnno objects lack '.bbox' and are not needed for layout analysis,
    # so drop them before any coordinate work.
    text_chars = [c for c in char_objects if isinstance(c, LTChar)]

    if not text_chars:
        # Nothing to lay out on this page/container.
        return {"page": str(page_number), "results": {}}

    # Sort into reading order: pdfminer's y-axis grows upward, so negate the
    # top coordinate to get top-of-page first, then left-to-right.
    text_chars.sort(key=lambda c: (-c.bbox[3], c.bbox[0]))

    page_data = {"page": str(page_number), "results": {}}
    line_number = text_line_number

    # Accumulator state for the line and word currently being built.
    # Bboxes start at (inf, inf, -1, -1) so min/max updates work from any char.
    line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
    current_word_text, current_word_bbox = "", [float('inf'), float('inf'), -1, -1]
    prev_char = None

    def finalize_word():
        # Flush the in-progress word (if non-empty) into line_words and reset
        # the word accumulators.
        nonlocal current_word_text, current_word_bbox
        word_text = current_word_text.strip()
        if word_text:
            line_words.append({
                "text": word_text,
                "bounding_box": [round(b, 2) for b in current_word_bbox]
            })
        current_word_text = ""
        current_word_bbox = [float('inf'), float('inf'), -1, -1]

    def finalize_line():
        # Flush the in-progress line (and any pending word) into page_data,
        # then reset all line-level accumulators.
        nonlocal line_text, line_bbox, line_words, line_number, prev_char
        finalize_word()
        if line_text.strip():
            page_data["results"][f"text_line_{line_number}"] = {
                "line": line_number,
                "text": line_text.strip(),
                "bounding_box": [round(b, 2) for b in line_bbox],
                "words": line_words
            }
            line_number += 1
        line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
        prev_char = None

    for char in text_chars:
        char_text = clean_unicode_text(char.get_text())

        if prev_char:
            char_height = char.bbox[3] - char.bbox[1]
            vertical_gap = abs(char.bbox[1] - prev_char.bbox[1])

            # Line break detection: a vertical jump larger than ~70% of the
            # character height is treated as a new line.
            if vertical_gap > char_height * 0.7:
                finalize_line()
            else:
                # Word gap detection: a horizontal gap wider than half the font
                # size (but at least 1.0 unit) splits words.
                space_threshold = char.size * 0.5
                gap = char.bbox[0] - prev_char.bbox[2]
                if gap > max(space_threshold, 1.0):
                    finalize_word()
                    line_text += " "

        # An explicit space character always ends the current word.
        if char_text == " ":
            finalize_word()
            line_text += " "
            prev_char = char
            continue

        current_word_text += char_text
        line_text += char_text

        # Grow the word and line bounding boxes to include this character.
        current_word_bbox[0] = min(current_word_bbox[0], char.bbox[0])
        current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
        current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
        current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])

        line_bbox[0] = min(line_bbox[0], char.bbox[0])
        line_bbox[1] = min(line_bbox[1], char.bbox[1])
        line_bbox[2] = max(line_bbox[2], char.bbox[2])
        line_bbox[3] = max(line_bbox[3], char.bbox[3])

        prev_char = char

    # Flush whatever remains after the last character.
    finalize_line()

    return page_data
|
| 2077 |
|
| 2078 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
| 2079 |
decision_process_table = pd.DataFrame()
|
|
|
|
| 2123 |
return pikepdf_redaction_annotations_on_page
|
| 2124 |
|
| 2125 |
def redact_text_pdf(
|
| 2126 |
+
file_path: str, # Path to the PDF file to be redacted
|
| 2127 |
language: str, # Language of the PDF content
|
| 2128 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
| 2129 |
chosen_redact_comprehend_entities: List[str],
|
|
|
|
| 2136 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
| 2137 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
| 2138 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
| 2139 |
+
all_page_line_level_ocr_results_with_words: List = [],
|
| 2140 |
pii_identification_method: str = "Local",
|
| 2141 |
comprehend_query_number:int = 0,
|
| 2142 |
comprehend_client="",
|
|
|
|
| 2147 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
| 2148 |
original_cropboxes:List[dict]=[],
|
| 2149 |
text_extraction_only:bool=False,
|
| 2150 |
+
output_folder:str=OUTPUT_FOLDER,
|
| 2151 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
| 2152 |
max_time: int = int(MAX_TIME_VALUE),
|
| 2153 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
|
| 2157 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
| 2158 |
|
| 2159 |
Input Variables:
|
| 2160 |
+
- file_path: Path to the PDF file to be redacted
|
| 2161 |
- language: Language of the PDF content
|
| 2162 |
- chosen_redact_entities: List of entities to be redacted
|
| 2163 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
|
|
|
| 2181 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
| 2182 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
| 2183 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 2184 |
+
- output_folder (str, optional): The output folder for the function
|
| 2185 |
- page_break_val: Value for page break
|
| 2186 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 2187 |
- progress: Progress tracking object
|
|
|
|
| 2211 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
| 2212 |
|
| 2213 |
# Open with Pikepdf to get text lines
|
| 2214 |
+
pikepdf_pdf = Pdf.open(file_path)
|
| 2215 |
+
number_of_pages = len(pikepdf_pdf.pages)
|
| 2216 |
+
|
| 2217 |
+
file_name = get_file_name_without_type(file_path)
|
| 2218 |
+
|
| 2219 |
+
if not all_page_line_level_ocr_results_with_words:
|
| 2220 |
+
all_page_line_level_ocr_results_with_words = []
|
| 2221 |
|
| 2222 |
# Check that page_min and page_max are within expected ranges
|
| 2223 |
if page_max > number_of_pages or page_max == 0:
|
|
|
|
| 2249 |
|
| 2250 |
if page_min <= page_no < page_max:
|
| 2251 |
# Go page by page
|
| 2252 |
+
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
| 2253 |
|
| 2254 |
all_page_line_text_extraction_characters = []
|
| 2255 |
all_page_line_level_text_extraction_results_list = []
|
|
|
|
| 2261 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
| 2262 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
| 2263 |
|
| 2264 |
+
text_line_no = 0
|
| 2265 |
for n, text_container in enumerate(page_layout):
|
| 2266 |
characters = []
|
| 2267 |
|
| 2268 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
| 2269 |
characters = get_text_container_characters(text_container)
|
| 2270 |
+
text_line_no += 1
|
| 2271 |
|
| 2272 |
# Create dataframe for all the text on the page
|
| 2273 |
+
line_level_text_results_list, line_characters, = create_line_level_ocr_results_from_characters(characters)
|
| 2274 |
+
|
| 2275 |
+
line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
|
| 2276 |
|
| 2277 |
### Create page_text_ocr_outputs (OCR format outputs)
|
| 2278 |
if line_level_text_results_list:
|
|
|
|
| 2290 |
|
| 2291 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
| 2292 |
all_page_line_text_extraction_characters.extend(line_characters)
|
| 2293 |
+
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
| 2294 |
|
| 2295 |
### REDACTION
|
| 2296 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
|
| 2341 |
|
| 2342 |
# Join extracted text outputs for all lines together
|
| 2343 |
if not page_text_ocr_outputs.empty:
|
| 2344 |
+
#page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
| 2345 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
| 2346 |
+
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
| 2347 |
|
| 2348 |
toc = time.perf_counter()
|
| 2349 |
|
|
|
|
| 2372 |
|
| 2373 |
current_loop_page += 1
|
| 2374 |
|
| 2375 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
| 2376 |
|
| 2377 |
# Check if the image already exists in annotations_all_pages
|
| 2378 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
|
|
|
| 2393 |
# Write logs
|
| 2394 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
| 2395 |
|
| 2396 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
| 2397 |
|
| 2398 |
# Write all page outputs
|
| 2399 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
|
|
|
| 2420 |
if not all_line_level_ocr_results_df.empty:
|
| 2421 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
| 2422 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
| 2423 |
+
|
| 2424 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
| 2425 |
+
|
| 2426 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
| 2427 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
| 2428 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
| 2429 |
|
| 2430 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
tools/helper_functions.py
CHANGED
|
@@ -244,13 +244,27 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
|
|
| 244 |
else:
|
| 245 |
return False
|
| 246 |
|
| 247 |
-
def
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
if os.path.exists(local_ocr_output_path):
|
| 251 |
-
print("Existing
|
| 252 |
-
return True
|
| 253 |
-
|
| 254 |
else:
|
| 255 |
return False
|
| 256 |
|
|
|
|
| 244 |
else:
|
| 245 |
return False
|
| 246 |
|
| 247 |
+
def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox: str, text_extraction_method: str, output_folder: str = OUTPUT_FOLDER) -> bool:
    """
    Check whether a word-level OCR results JSON file already exists for a document.

    The expected file-name suffix depends on which text extraction method
    produced the results (selectable text, local Tesseract OCR, or AWS Textract).

    Args:
        doc_file_name_no_extension_textbox: Document file name without extension.
        text_extraction_method: One of the supported text extraction options.
        output_folder: Folder in which the OCR output file would have been written.

    Returns:
        True if a matching "_ocr_results_with_words*.json" file exists,
        False otherwise (including for unrecognised extraction methods).
    """
    # Map each extraction method to the suffix its OCR-with-words output uses.
    file_endings = {
        SELECTABLE_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_text.json",
        TESSERACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_ocr.json",
        TEXTRACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_textract.json",
    }

    file_ending = file_endings.get(text_extraction_method)
    if file_ending is None:
        print("No valid text extraction method found. Returning False")
        return False

    local_ocr_output_path = os.path.join(output_folder, doc_file_name_no_extension_textbox + file_ending)

    if os.path.exists(local_ocr_output_path):
        print("Existing OCR with words analysis output file found.")
        return True
    return False
|
| 270 |
|