Commit 10f46e9
Parent(s): 69c2af9

Corrected a couple of bugs. Textract whole-document API call outputs now also load the input PDF into the app.

Files changed:
- README.md +2 -5
- app.py +30 -24
- pyproject.toml +1 -1
- tools/config.py +3 -0
- tools/file_conversion.py +25 -11
- tools/file_redaction.py +19 -6
- tools/find_duplicate_pages.py +1 -1
- tools/textract_batch_call.py +84 -13
README.md
CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
 
-version: 0.6.
+version: 0.6.3
 
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
@@ -22,10 +22,7 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
 
 # USER GUIDE
 
-##
-You can test out many of the features described in this user guide at the [public test version of the app](https://huggingface.co/spaces/seanpedrickcase/document_redaction), which is free. AWS functions (e.g. Textract, Comprehend) are not enabled (unless you have valid API keys).
-
-## Chat over this user guide
+## Chat with this user guide
 You can now [speak with a chat bot about this user guide](https://huggingface.co/spaces/seanpedrickcase/Light-PDF-Web-QA-Chatbot) (beta!)
 
 ## Table of contents
app.py
CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, AWS_REGION
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
-from tools.textract_batch_call import analyse_document_with_textract_api,
+from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
 
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
@@ -55,8 +55,6 @@ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCE
 if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
 if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
 
-print
-
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
 
@@ -170,16 +168,16 @@ with app:
 s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
 
-
-
-
+s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
+s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
 successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
 no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
 textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
 
-
-
-
+load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
 
 s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
 default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
@@ -214,10 +212,10 @@ with app:
 
 # Textract API call placeholders in case option not selected in config
 
-job_name_textbox = gr.Textbox(value="", label="
+job_name_textbox = gr.Textbox(value="", label="whole_document Textract call", visible=False)
 send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
 
-job_id_textbox = gr.Textbox(label = "Latest job ID for
+job_id_textbox = gr.Textbox(label = "Latest job ID for whole_document document analysis", value='', visible=False)
 check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
 job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
 job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
@@ -225,6 +223,7 @@ with app:
 selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
 is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
 job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
 
 textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
 convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
@@ -251,7 +250,7 @@ with app:
 with gr.Accordion("Redact document", open = True):
 in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
 
-text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without.
+text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
@@ -260,7 +259,7 @@ with app:
 pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 if SHOW_COSTS == "True":
-with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
+with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
 with gr.Row(equal_height=True):
 with gr.Column(scale=1):
 textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
@@ -269,10 +268,11 @@ with app:
 with gr.Row(equal_height=True):
 total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
 estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
-estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
+estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
 
 if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
-with gr.Accordion("
+with gr.Accordion("Assign task to cost code", open = True, visible=True):
+gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
 with gr.Row():
 cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
 with gr.Column():
@@ -555,18 +555,24 @@ with app:
 all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
 # Send whole document to Textract for text extraction
-send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state,
-
+send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number]).\
+success(check_for_provided_job_id, inputs=[job_id_textbox]).\
+success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
+success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
+
 check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
-success(
+success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
 
 textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
 
 
-convert_textract_outputs_to_ocr_results.click(
+convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
+success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
+success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
+success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
 success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
-success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
+success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
 success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
 outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
 
@@ -693,10 +699,10 @@ with app:
 # Get connection details on app load
 
 if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
-app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox,
-success(load_in_textract_job_details, inputs=[
+app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder]).\
+success(load_in_textract_job_details, inputs=[load_s3_whole_document_textract_logs_bool, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[textract_job_detail_df])
 else:
-app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox,
+app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
 
 
 # If relevant environment variable is set, load in the Textract job details
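Note: the event wiring above relies on Gradio's chained listeners, where `.success()` registers a follow-up step that only runs if the previous handler finished without raising. A minimal, self-contained sketch of the pattern (the component and function names here are illustrative, not taken from the app):

import gradio as gr

def start_job(doc):
    if doc is None:
        # Raising stops the chain: none of the .success() steps below will run
        raise gr.Error("No document provided")
    return "job-123"

def poll_job(job_id):
    return f"Job {job_id}: SUCCEEDED"

with gr.Blocks() as demo:
    doc = gr.File()
    job_id = gr.Textbox(label="Job ID")
    status = gr.Textbox(label="Status")
    run_btn = gr.Button("Analyse")

    # Each .success() only fires if the previous step completed without error,
    # mirroring how the commit chains the Textract call, polling, and file checks
    run_btn.click(start_job, inputs=[doc], outputs=[job_id]).\
        success(poll_job, inputs=[job_id], outputs=[status])

demo.launch()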
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc_redaction"
-version = "0.6.
+version = "0.6.3"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
tools/config.py
CHANGED
@@ -279,4 +279,7 @@ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_J
 
 TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
 
+TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
+
+
 TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
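Note: `get_or_create_env_var` is defined earlier in tools/config.py and is not shown in this diff. A plausible minimal implementation of such a helper (an assumption for illustration, not the repository's actual code) would be:

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if already set; otherwise set it to the
    # default and return that, so later reads of os.environ stay consistent.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value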
tools/file_conversion.py
CHANGED
@@ -620,12 +620,12 @@ def prepare_image_or_pdf(
 
 elif file_extension in ['.csv']:
 if '_review_file' in file_path_without_ext:
-review_file_csv = read_file(file_path)
+review_file_csv = read_file(file_path)
 all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
 json_from_csv = True
 #print("Converted CSV review file to image annotation object")
 elif '_ocr_output' in file_path_without_ext:
-all_line_level_ocr_results_df = read_file(file_path)
+all_line_level_ocr_results_df = read_file(file_path)
 json_from_csv = False
 
 # NEW IF STATEMENT
@@ -987,7 +987,7 @@ def divide_coordinates_by_page_sizes(
 if "image_width" in df_abs.columns and "mediabox_width" in df_abs.columns:
 # Check if image_width mostly missing - use .isna().all() or check percentage
 if df_abs["image_width"].isna().all():
-print("Falling back to mediabox dimensions as image_width is entirely missing.")
+#print("Falling back to mediabox dimensions as image_width is entirely missing.")
 df_abs["image_width"] = df_abs["image_width"].fillna(df_abs["mediabox_width"])
 df_abs["image_height"] = df_abs["image_height"].fillna(df_abs["mediabox_height"])
 else:
@@ -1469,7 +1469,6 @@ def convert_annotation_json_to_review_df(
 id_col_exists_in_review = 'id' in review_file_df.columns and not review_file_df['id'].isnull().all() and not (review_file_df['id'] == '').all()
 id_col_exists_in_redaction = 'id' in redaction_decision_output.columns and not redaction_decision_output['id'].isnull().all() and not (redaction_decision_output['id'] == '').all()
 
-
 if id_col_exists_in_review and id_col_exists_in_redaction:
 #print("Attempting to join data based on 'id' column.")
 try:
@@ -1530,7 +1529,7 @@ def convert_annotation_json_to_review_df(
 
 # Only attempt proximity match if text wasn't added by ID join and proximity is requested
 if not text_added_successfully and do_proximity_match:
-print("Attempting proximity match to add text data.")
+#print("Attempting proximity match to add text data.")
 
 # Ensure 'page' columns are numeric before coordinate division and proximity match
 # (Assuming divide_coordinates_by_page_sizes and do_proximity_match_all_pages_for_text need this)
@@ -1559,7 +1558,7 @@ def convert_annotation_json_to_review_df(
 # Assuming do_proximity_match_all_pages_for_text adds/updates the 'text' column
 if 'text' in review_file_df.columns:
 text_added_successfully = True
-print("Proximity match completed.")
+#print("Proximity match completed.")
 except Exception as e:
 print(f"Error during proximity match: {e}. Text data may not be added.")
 
@@ -1611,7 +1610,13 @@ def convert_annotation_json_to_review_df(
 print(f"Warning: Could not sort DataFrame due to type error in sort columns: {e}")
 # Proceed without sorting
 
-
+base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+for col in base_cols:
+    if col not in review_file_df.columns:
+        review_file_df[col] = pd.NA
+
+review_file_df = review_file_df.dropna(subset=base_cols, how="all")
 
 return review_file_df
 
@@ -1721,7 +1726,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # --- Ensure Column Exists ---
 original_dtype = None
 if column_name not in df.columns:
-print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
+#print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
 # Initialize with None (which Pandas often treats as NaN but allows object dtype)
 df[column_name] = None
 # Set original_dtype to object so it likely becomes string later
@@ -1757,7 +1762,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # print(f"No missing or empty values found requiring IDs in column '{column_name}'.")
 return df
 
-print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
+#print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
 
 # --- Get Existing IDs to Ensure Uniqueness ---
 # Consider only rows that are *not* missing/empty
@@ -1809,7 +1814,8 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
 # Use the previously identified index to assign the new IDs correctly
 # Assigning string IDs might change the column's dtype to 'object'
 if not pd.api.types.is_object_dtype(original_dtype) and not pd.api.types.is_string_dtype(original_dtype):
-
+df['id'] = df['id'].astype(str, errors="ignore")
+#warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
 
 df.loc[rows_to_fill_index, column_name] = new_ids_list
 print(f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'.")
@@ -1842,7 +1848,13 @@ def convert_review_df_to_annotation_json(
 Returns:
 List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
 """
-
+base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+for col in base_cols:
+    if col not in review_file_df.columns:
+        review_file_df[col] = pd.NA
+
+review_file_df = review_file_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how='all')
 
 if not page_sizes:
 raise ValueError("page_sizes argument is required and cannot be empty.")
@@ -1865,6 +1877,8 @@ def convert_review_df_to_annotation_json(
 raise ValueError(f"Error processing page_sizes: {e}") from e
 
 
+
+
 # Handle empty input DataFrame gracefully
 if review_file_df.empty:
 print("Input review_file_df is empty. Proceeding to generate JSON structure with empty boxes.")
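Note: the `base_cols` block added in two places above first guarantees that every expected annotation column exists, then drops rows that are empty across all of them. In isolation the pattern behaves like this sketch (the data here is made up):

import pandas as pd

base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]

df = pd.DataFrame({"xmin": [0.1, None], "text": ["name", None]})

# Ensure every expected column exists so later code can index them safely
for col in base_cols:
    if col not in df.columns:
        df[col] = pd.NA

# Drop only rows that are empty across ALL base columns (how="all");
# the first row survives because it has xmin and text values
df = df.dropna(subset=base_cols, how="all")
print(df)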
tools/file_redaction.py
CHANGED
@@ -923,6 +923,22 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
 
 return img_annotation_box, rect
 
+def set_cropbox_safely(page, original_cropbox):
+    """
+    Sets the cropbox of a page, ensuring it's not larger than the mediabox.
+    If the original cropbox is larger, the mediabox is used instead.
+
+    Args:
+        page: The PyMuPdf page object.
+        original_cropbox: The fitz.Rect representing the desired cropbox.
+    """
+    mediabox = page.mediabox
+    if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
+        print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
+        page.set_cropbox(mediabox)
+    else:
+        page.set_cropbox(original_cropbox)
+
 def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
 
 rect_height = page.rect.height
@@ -979,9 +995,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
 
 for annot in page_annotations:
 
-
-
-
 # Check if an Image recogniser result, or a Gradio annotation object
 if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
 
@@ -1053,7 +1066,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
 }
 
 page.apply_redactions(images=0, graphics=0)
-page
+set_cropbox_safely(page, original_cropbox)
+#page.set_cropbox(original_cropbox)
+# Set CropBox to original size
 page.clean_contents()
 
 return page, out_annotation_boxes
@@ -1547,8 +1562,6 @@ def redact_image_pdf(file_path:str,
 
 page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
 
-print("page_image_annotations at box drawing:", page_image_annotations)
-
 redacted_image = image.copy()
 #redacted_image.save("test_out_image.png")
 
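Note: PyMuPDF's `page.set_cropbox()` raises an error when the requested cropbox does not fit the page, which is what the new `set_cropbox_safely` helper guards against. A short usage sketch (the file name is a placeholder, and the helper is assumed to be importable from tools/file_redaction):

import fitz  # PyMuPDF

doc = fitz.open("example.pdf")
page = doc[0]

# A cropbox wider than the mediabox would raise in page.set_cropbox();
# set_cropbox_safely falls back to the mediabox instead of raising
oversized = fitz.Rect(0, 0, page.mediabox.width + 100, page.mediabox.height)
set_cropbox_safely(page, oversized)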
tools/find_duplicate_pages.py
CHANGED
@@ -136,7 +136,7 @@ def process_data(df:pd.DataFrame, column:str):
 def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
 output_paths = []
 
-progress(0.1, desc="Cleaning input
+progress(0.1, desc="Cleaning input text")
 
 # Load and clean data
 df, output_files = combine_ocr_output_text(input_files)
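Note: the corrected call uses Gradio's progress API, where a `gr.Progress` default argument is injected by Gradio at event time and can be called with a completion fraction and a description. A minimal sketch of the pattern (the function and component names are illustrative):

import gradio as gr

def clean_and_compare(files, progress=gr.Progress(track_tqdm=True)):
    progress(0.1, desc="Cleaning input text")  # fraction complete plus a label shown in the UI
    # ... cleaning work ...
    progress(0.6, desc="Comparing pages")
    # ... similarity work ...
    return files

with gr.Blocks() as demo:
    files_in = gr.File(file_count="multiple")
    files_out = gr.File(file_count="multiple")
    gr.Button("Find duplicates").click(clean_and_compare, inputs=[files_in], outputs=[files_out])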
tools/textract_batch_call.py
CHANGED
@@ -5,13 +5,16 @@ import pandas as pd
 import json
 import logging
 import datetime
+import gradio as gr
+from gradio import FileData
 from typing import List
 from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
-
-from tools.
-
+from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
+from tools.aws_functions import download_file_from_s3
+from tools.file_conversion import get_input_file_names
+from tools.helper_functions import get_file_name_without_type
 
 def analyse_document_with_textract_api(
 local_pdf_path: str,
@@ -209,7 +212,8 @@ def return_job_status(job_id:str,
 logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
 if job_status == 'IN_PROGRESS':
-
+pass
+#time.sleep(poll_interval_seconds)
 elif job_status == 'SUCCEEDED':
 logging.info("Textract job succeeded.")
 elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
@@ -237,6 +241,8 @@ def download_textract_job_files(s3_client:str,
 Download and combine selected job files from the AWS Textract service.
 '''
 
+#print("s3_output_key_prefix at download:", s3_output_key_prefix)
+
 list_response = s3_client.list_objects_v2(
 Bucket=s3_bucket_name,
 Prefix=s3_output_key_prefix
@@ -322,7 +328,67 @@ def check_for_provided_job_id(job_id:str):
 raise Exception("Please provide a job ID.")
 return
 
-def poll_bulk_textract_analysis_progress_and_download(
+def load_pdf_job_file_from_s3(
+    load_s3_jobs_input_loc,
+    pdf_filename,
+    local_output_dir,
+    s3_bucket_name,
+    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
+
+    try:
+        print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)
+        pdf_file_location = ''
+        doc_file_name_no_extension_textbox = ''
+
+        s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
+        s3_input_key_prefix = s3_input_key_prefix + ".pdf"
+        print("s3_input_key_prefix:", s3_input_key_prefix)
+
+        local_input_file_path = os.path.join(local_output_dir, pdf_filename)
+        local_input_file_path = local_input_file_path + ".pdf"
+
+        print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)
+
+        download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
+
+        pdf_file_location = [local_input_file_path]
+        doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
+    except Exception as e:
+        print("Could not download PDF job file from S3 due to:", e)
+
+    return pdf_file_location, doc_file_name_no_extension_textbox
+
+def replace_existing_pdf_input_for_whole_document_outputs(
+    load_s3_jobs_input_loc:str,
+    pdf_filename:str,
+    local_output_dir:str,
+    s3_bucket_name:str,
+    in_doc_files:FileData=[],
+    input_folder:str=INPUT_FOLDER,
+    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+    progress = gr.Progress(track_tqdm=True)):
+
+    progress(0.1, "Loading PDF from s3")
+
+    if in_doc_files:
+        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(in_doc_files)
+
+        if pdf_filename == doc_file_name_no_extension_textbox:
+            print("Existing loaded PDF file has same name as file from S3")
+            doc_file_name_no_extension_textbox = pdf_filename
+            downloaded_pdf_file_location = in_doc_files
+        else:
+            downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
+
+            doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
+    else:
+        downloaded_pdf_file_location, doc_file_name_no_extension_textbox = load_pdf_job_file_from_s3(load_s3_jobs_input_loc, pdf_filename, local_output_dir, s3_bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
+
+        doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count = get_input_file_names(downloaded_pdf_file_location)
+
+    return downloaded_pdf_file_location, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count
+
+def poll_whole_document_textract_analysis_progress_and_download(
 job_id:str,
 job_type_dropdown:str,
 s3_output_prefix: str,
@@ -333,14 +399,17 @@ def poll_bulk_textract_analysis_progress_and_download(
 load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
 load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
 aws_region: str = AWS_REGION, # Optional: specify region if not default
-load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
 poll_interval_seconds: int = 1,
-max_polling_attempts: int = 1 # ~10 minutes total wait time):
+max_polling_attempts: int = 1, # ~10 minutes total wait time):
+progress = gr.Progress(track_tqdm=True)
 ):
 '''
 Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
 '''
 
+progress(0.1, "Querying AWS Textract for status of document analysis job")
+
 if job_id:
 # Initialize boto3 clients
 session = boto3.Session(region_name=aws_region)
@@ -365,7 +434,7 @@
 print(f"Failed to update job details dataframe: {e}")
 #raise
 
-while job_status == 'IN_PROGRESS' and attempts
+while job_status == 'IN_PROGRESS' and attempts <= max_polling_attempts:
 attempts += 1
 try:
 if job_type_dropdown=="document_analysis":
@@ -394,7 +463,9 @@
 downloaded_file_path = None
 if job_status == 'SUCCEEDED':
 #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
-# 3b - Replace PDF file name if it exists in the job dataframe
+# 3b - Replace PDF file name if it exists in the job dataframe
+
+progress(0.5, "Document analysis task outputs found. Downloading from S3")
 
 # If job_df is not empty
 if not job_df.empty:
@@ -410,13 +481,11 @@
 else:
 pdf_filename = "unknown_file"
 
-
 # --- 4. Download Output JSON from S3 ---
 # Textract typically creates output under s3_output_prefix/job_id/
 # There might be multiple JSON files if pagination occurred during writing.
 # Usually, for smaller docs, there's one file, often named '1'.
-# For robust handling, list objects and find the JSON(s).
-
+# For robust handling, list objects and find the JSON(s).
 
 s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
 logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
@@ -436,8 +505,10 @@
 
 else:
 raise Exception("No Job ID provided.")
+
+output_pdf_filename = get_file_name_without_type(pdf_filename)
 
-return downloaded_file_path, job_status, job_df
+return downloaded_file_path, job_status, job_df, output_pdf_filename
 
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
 load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
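Note: stripped of the app's state handling, the asynchronous whole-document flow this module wraps comes down to three Textract steps via boto3: start a job on a PDF already in S3, poll its status, then collect the JSON that Textract writes back to S3. A minimal sketch (the bucket, key, and region are placeholders):

import time
import boto3

textract = boto3.client("textract", region_name="us-east-1")

# 1. Start an asynchronous text-detection job on a PDF stored in S3
response = textract.start_document_text_detection(
    DocumentLocation={"S3Object": {"Bucket": "my-bucket", "Name": "input/example.pdf"}},
    OutputConfig={"S3Bucket": "my-bucket", "S3Prefix": "output"},
)
job_id = response["JobId"]

# 2. Poll until the job leaves IN_PROGRESS
while True:
    status = textract.get_document_text_detection(JobId=job_id)["JobStatus"]
    if status != "IN_PROGRESS":
        break
    time.sleep(5)

# 3. On success, Textract has written paginated JSON under output/<job_id>/ in S3,
#    which the app then lists, downloads, and combines into one local file
print(job_id, status)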