seanpedrickcase committed
Commit 0b2c988 (0 parents)

Initial commit

.dockerignore ADDED
@@ -0,0 +1,16 @@
1
+ *.csv
2
+ *.pdf
3
+ *.url
4
+ *.jpg
5
+ *.png
6
+ *.ipynb
7
+ examples/*
8
+ processing/*
9
+ output/*
10
+ tools/__pycache__/*
11
+ old_code/*
12
+ tesseract/*
13
+ poppler/*
14
+ build/*
15
+ dist/*
16
+ build_deps/*
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
1
+ name: Check file size
2
+ on: # or directly `on: [push]` to run the action on every push on any branch
3
+ pull_request:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Check large files
14
+ uses: ActionsDesk/lfs-warning@v2.0
15
+ with:
16
+ filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/document_rag_preparation main
.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ *.csv
2
+ *.pdf
3
+ *.url
4
+ *.jpg
5
+ *.png
6
+ *.ipynb
7
+ examples/*
8
+ processing/*
9
+ output/*
10
+ tools/__pycache__/*
11
+ old_code/*
12
+ tesseract/*
13
+ poppler/*
14
+ build/*
15
+ dist/*
16
+ build_deps/*
Dockerfile ADDED
@@ -0,0 +1,58 @@
1
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
+
3
+ # Install system dependencies. Need to specify -y for poppler to get it to install
4
+ RUN apt-get update \
5
+ && apt-get install -y \
6
+ tesseract-ocr \
7
+ poppler-utils \
8
+ libgl1-mesa-glx \
9
+ libglib2.0-0 \
10
+ && apt-get clean \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ WORKDIR /src
14
+
15
+ COPY requirements.txt .
16
+
17
+ RUN pip install --no-cache-dir -r requirements.txt
18
+
19
+ RUN pip install --no-cache-dir gradio==4.31.5
20
+
21
+ # Set up a new user named "user" with user ID 1000
22
+ RUN useradd -m -u 1000 user
23
+
24
+ # Change ownership of /home/user directory
25
+ #RUN chown -R user:user /home/user
26
+
27
+ # Make output folder
28
+ RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
29
+ RUN mkdir -p /home/user/app/tld && chown -R user:user /home/user/app/tld
30
+
31
+ # Switch to the "user" user
32
+ USER user
33
+
34
+ # Set environmental variables
35
+ ENV HOME=/home/user \
36
+ PATH=/home/user/.local/bin:$PATH \
37
+ PYTHONPATH=$HOME/app \
38
+ PYTHONUNBUFFERED=1 \
39
+ GRADIO_ALLOW_FLAGGING=never \
40
+ GRADIO_NUM_PORTS=1 \
41
+ GRADIO_SERVER_NAME=0.0.0.0 \
42
+ GRADIO_SERVER_PORT=7860 \
43
+ GRADIO_THEME=huggingface \
44
+ TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
45
+ #GRADIO_TEMP_DIR=$HOME/tmp \
46
+ #GRADIO_ROOT_PATH=/address-match \
47
+ # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
48
+ KEEP_ALIVE=60 \
49
+ SYSTEM=spaces
50
+
51
+ # Set the working directory to the user's home directory
52
+ WORKDIR $HOME/app
53
+
54
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
55
+ COPY --chown=user . $HOME/app
56
+ #COPY . $HOME/app
57
+
58
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: Document RAG preparation
3
+ emoji: 📖
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: true
9
+ license: apache-2.0
10
+ ---
11
+
12
+ # Document RAG preparation
13
+
14
+ Extract text from documents and convert it into tabular format using the Unstructured package. The outputs can then be used downstream, e.g. for RAG or other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), and code files (.py, .js, etc.). Outputs CSVs and files in a 'Document' format commonly used as input to vector databases (e.g. ChromaDB) or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
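
A minimal sketch of what happens under the hood (not the app's own code; the file name and options are illustrative, and this assumes the open-source Unstructured API):

```python
# Partition a document into Unstructured elements and save them as a table.
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dataframe

elements = partition(filename="example.pdf", strategy="fast")  # "fast", "ocr_only" or "hi_res"
df = convert_to_dataframe(elements)  # one row per element: type, text and metadata columns
df.to_csv("example_elements.csv", index=False)
```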
app.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+
3
+ # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
+ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
+
6
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, custom_regex_load
7
+ from tools.unstructured_funcs import partition_file, clean_elements, export_elements_as_table_to_file, filter_elements_and_metadata, chunk_all_elements, minimum_chunk_length, start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks, multipage_sections, overlap_all
8
+ #from tools.aws_functions import load_data_from_aws
9
+ from tools.clean_funcs import pre_clean, full_entity_list, chosen_redact_entities
10
+ import gradio as gr
11
+ import pandas as pd
12
+ import numpy as np
13
+ from typing import Type, List
14
+ from unstructured.documents.elements import Element
15
+
16
+ # Creating an alias for pandas DataFrame using Type
17
+ PandasDataFrame = Type[pd.DataFrame]
18
+
19
+ add_folder_to_path("_internal/tesseract/")
20
+ add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")
21
+
22
+ ensure_output_folder_exists()
23
+
24
+ language = 'en'
25
+ default_meta_keys_to_filter=["file_directory", "filetype"]
26
+ default_element_types_to_filter = ['UncategorizedText', 'Header']
27
+
28
+
29
+ def get_element_metadata(elements, prefix=""):
30
+ """Recursively retrieves element names and metadata in the desired format."""
31
+ result = []
32
+
33
+ for element in elements:
34
+ # print("Element metadata: ", element.metadata)
35
+ # print("Element metadata dict: ", element.metadata.__dict__)
36
+
37
+ if hasattr(element, 'metadata') and isinstance(element.metadata.__dict__, dict):
38
+ for key, value in element.metadata.__dict__.items(): # Iterate over key-value pairs in metadata dictionary
39
+ new_prefix = f"{prefix}." if prefix else ""
40
+ if isinstance(value, dict): # Nested metadata
41
+ result.extend(get_element_metadata([value], new_prefix)) # Recurse with the nested dictionary as a single-item list
42
+ else: # Leaf element
43
+ meta_element_to_add = f"{new_prefix}{key}"
44
+ if meta_element_to_add not in result:
45
+ result.append(meta_element_to_add)
46
+ else:
47
+ print(f"Warning: Element {element} does not have a metadata dictionary.") # Handle elements without metadata gracefully
48
+
49
+ return result
50
+
51
+ def update_filter_dropdowns(elements_table:PandasDataFrame, elements:List[Element]):
52
+ if 'text' in elements_table.columns:
53
+ elements_table_filt = elements_table.drop('text', axis=1)
54
+ else:
55
+ elements_table_filt = elements_table
56
+
57
+ # Error handling for missing 'type' column
58
+ if 'type' not in elements_table_filt.columns:
59
+ print("Warning: 'type' column not found in the DataFrame.")
60
+ return gr.Dropdown(label="Element types (not available)"), gr.Dropdown(label="Metadata properties (not available)")
61
+
62
+ element_types_to_filter = elements_table_filt['type'].unique().tolist()
63
+ meta_keys_to_filter = get_element_metadata(elements)
64
+
65
+ #print("Element types:", element_types_to_filter)
66
+ #print("Meta keys:", meta_keys_to_filter)
67
+
68
+ element_types_to_filter_shortlist = [x for x in default_element_types_to_filter if x in element_types_to_filter]
69
+ meta_keys_to_filter_shortlist = [x for x in default_meta_keys_to_filter if x in meta_keys_to_filter]
70
+
71
+ return gr.Dropdown(
72
+ value=element_types_to_filter_shortlist, choices=element_types_to_filter, multiselect=True, interactive=True, label="Choose element types to exclude from element list"
73
+ ), gr.Dropdown(
74
+ value=meta_keys_to_filter_shortlist, choices=meta_keys_to_filter, multiselect=True, interactive=True, label="Choose metadata keys to filter out"
75
+ )
76
+
77
+ # Create the gradio interface
78
+
79
+ block = gr.Blocks(theme = gr.themes.Base())
80
+
81
+ with block:
82
+
83
+ elements_state = gr.State([])
84
+ elements_table_state = gr.State(pd.DataFrame())
85
+ metadata_keys_state = gr.State([])
86
+ output_image_files_state = gr.State([])
87
+ output_file_list_state = gr.State([])
88
+ in_colnames_state = gr.State("text")
89
+
90
+ data_state = gr.State(pd.DataFrame())
91
+ embeddings_state = gr.State(np.array([]))
92
+ embeddings_type_state = gr.State("")
93
+ topic_model_state = gr.State()
94
+ assigned_topics_state = gr.State([])
95
+ custom_regex_state = gr.State(pd.DataFrame())
96
+ docs_state = gr.State()
97
+ data_file_name_no_ext_state = gr.State()
98
+ label_list_state = gr.State(pd.DataFrame())
99
+ output_name_state = gr.State("")
100
+
101
+ gr.Markdown(
102
+ """
103
+ # Document RAG preparation
104
+ Extract text from documents and convert it into tabular format using the Unstructured package. The outputs can then be used downstream, e.g. for RAG or other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), and code files (.py, .js, etc.). Outputs CSVs and files in a 'Document' format commonly used as input to vector databases (e.g. ChromaDB) or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
105
+ """)
106
+
107
+ with gr.Tab("Partition document"):
108
+
109
+ with gr.Accordion("Upload files - accepts .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.)", open = True):
110
+ in_file = gr.File(label="Choose file", file_count= "multiple", height=100)
111
+ in_pdf_partition_strategy = gr.Radio(label="PDF partition strategy", value = "fast", choices=["fast", "ocr_only", "hi_res"])
112
+
113
+ partition_btn = gr.Button("Partition documents (outputs appear below)", variant='primary')
114
+
115
+ with gr.Accordion("Clean, anonymise, or filter text elements", open = False):
116
+ with gr.Accordion("Filter element types from text and information from metadata", open = False):
117
+ element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
118
+ meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")
119
+
120
+ filter_meta_btn = gr.Button("Filter elements/metadata")
121
+
122
+ with gr.Accordion("Clean/anonymise text", open = False):
123
+ with gr.Row():
124
+ clean_options = gr.Dropdown(choices = ["Convert bytes to string","Replace quotes","Clean non ASCII","Clean ordered list", "Group paragraphs",
125
+ "Remove trailing punctuation", "Remove all punctuation","Clean text","Remove extra whitespace", "Remove dashes","Remove bullets",
126
+ "Make lowercase"],
127
+ value=["Clean ordered list", "Group paragraphs", "Clean non ASCII", "Remove extra whitespace", "Remove dashes", "Remove bullets"],
128
+ label="Clean options", multiselect=True, interactive=True)
129
+
130
+ with gr.Accordion("Clean with custom regex", open = False):
131
+ gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
132
+ clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove custom regex.")
133
+ with gr.Row():
134
+ custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
135
+ custom_regex_text = gr.Textbox(label="Custom regex load status")
136
+
137
+ with gr.Accordion("Anonymise text", open = False):
138
+ anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data. Personal details are redacted - not 100% effective. Please check results afterwards!")
139
+ with gr.Row():
140
+ anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
141
+ anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")
142
+
143
+ unstructured_clean_btn = gr.Button("Clean data")
144
+
145
+ with gr.Accordion("Chunk text", open = False):
146
+ with gr.Row():
147
+ chunking_method_rad = gr.Radio(value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
148
+ multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
149
+ overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
150
+ with gr.Row():
151
+ minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=10000, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
152
+ start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=10000, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
153
+ hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=10000, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)
154
+
155
+ chunk_btn = gr.Button("Chunk document")
156
+
157
+ # Save chunked data to file
158
+ with gr.Accordion("File outputs", open = True):
159
+ with gr.Row():
160
+ output_summary = gr.Textbox(label="Output summary")
161
+ output_file = gr.File(label="Output file")
162
+
163
+ # AWS functions not yet implemented in this app
164
+ # with gr.Tab(label="AWS data load"):
165
+ # with gr.Accordion(label = "AWS data access", open = True):
166
+ # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
167
+ # with gr.Row():
168
+ # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
169
+ # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
170
+
171
+ # aws_log_box = gr.Textbox(label="AWS data load status")
172
+
173
+ # Partition data, then Update filter dropdowns from loaded data
174
+ partition_btn.click(fn = partition_file, inputs=[in_file, in_pdf_partition_strategy],
175
+ outputs=[output_summary, elements_state, output_file, output_name_state, elements_table_state], api_name="partition").\
176
+ then(fn = update_filter_dropdowns, inputs=[elements_table_state, elements_state], outputs=[element_types_to_filter, meta_keys_to_filter])
177
+
178
+ # Clean data
179
+ ## Filter metadata
180
+
181
+ filter_meta_btn.click(fn=filter_elements_and_metadata, inputs=[elements_state, element_types_to_filter, meta_keys_to_filter], outputs=[elements_state]).\
182
+ then(fn=export_elements_as_table_to_file, inputs=[elements_state, output_name_state], outputs=[output_summary, output_file])
183
+
184
+ ## General text clean and anonymisation
185
+
186
+ ### Custom regex load
187
+ custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
188
+
189
+ unstructured_clean_btn.click(fn=clean_elements, inputs=[elements_state, clean_options, output_name_state], outputs=[elements_state, output_summary, output_file, output_name_state]).\
190
+ then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])
191
+
192
+ ## Chunk data
193
+ chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop], outputs=[output_summary, output_file, output_name_state])
194
+
195
+ # Loading AWS data - not yet implemented in this app
196
+ # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
197
+
198
+ # Simple run
199
+ block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
how_to_create_exe_dist.txt ADDED
@@ -0,0 +1,38 @@
1
+ 1. Create a minimal conda environment to run the app in, e.g. 'conda create --name new_env'
2
+
3
+ 2. Activate the environment 'conda activate new_env'
4
+
5
+ 3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'
6
+
7
+ NOTE: to ensure that spaCy models referenced in requirements.txt are loaded into the program correctly, follow this guide: https://spacy.io/usage/models#models-download
8
+
9
+ 4. If necessary, create hook files to tell PyInstaller to include specific packages in the exe build. Examples are provided for en_core_web_sm (a spaCy model); see the example hook sketch at the end of this file. Put these in the build_deps\ subfolder.
10
+
11
+ 5. pip install pyinstaller
12
+
13
+ 6. In the command line, cd to the folder that contains app.py.
14
+
15
+ 7. Run the following to build the app (this helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
+
17
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRagPrepApp_0.1 app.py
18
+
19
+ # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
+
21
+
22
+ b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
23
+
24
+ a = Analysis(
25
+ ...
26
+ module_collection_mode={
27
+ 'gradio': 'py', # Collect gradio package as source .py files
28
+ }
29
+ )
30
+
31
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRagPrepApp_0.1.spec
32
+
33
+
34
+ 8. A 'dist' folder will be created containing the executable and all of its dependencies ('dist\<app_name>').
35
+
36
+ 9. In 'dist\<app_name>', try double-clicking the .exe file. After a short delay, the command prompt should show the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
37
+
38
+ 10. In an internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
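
Example hook file (a minimal sketch assuming PyInstaller's standard hook utilities; the model name is the one this app uses). Save it as build_deps\hook-en_core_web_sm.py, as referenced in step 4 above:

    # hook-en_core_web_sm.py
    # Tell PyInstaller to bundle the spaCy model's data files and package metadata.
    from PyInstaller.utils.hooks import collect_data_files, copy_metadata

    datas = collect_data_files("en_core_web_sm")
    datas += copy_metadata("en_core_web_sm")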
requirements.txt ADDED
@@ -0,0 +1,20 @@
1
+ pandas==2.2.2
2
+ spacy # Not specified as latest versions create a conflict with latest versions of gradio
3
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
4
+ gradio # Not specified as latest versions create a conflict with latest versions of spacy
5
+ boto3==1.34.103
6
+ unstructured
7
+ unstructured[pdf]
8
+ unstructured[docx]
9
+ unstructured[pptx]
10
+ unstructured[html]
11
+ unstructured[text]
12
+ unstructured[xlsx]
13
+ unstructured[odt]
14
+ unstructured[jpg]
15
+ unstructured[msg]
16
+ Faker==22.2.0
17
+ presidio_analyzer==2.2.351
18
+ presidio_anonymizer==2.2.351
19
+ polars==0.20.6
20
+
tld/.tld_set_snapshot ADDED
The diff for this file is too large to render. See raw diff
 
tools/__init__.py ADDED
File without changes
tools/anonymiser.py ADDED
@@ -0,0 +1,296 @@
1
+ from spacy.cli import download
2
+ import spacy
3
+ from tools.presidio_analyzer_custom import analyze_dict
4
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser
5
+ from typing import List
6
+ from unstructured.documents.elements import Element
7
+
8
+ spacy.prefer_gpu()
9
+
10
+ def spacy_model_installed(model_name):
11
+ try:
12
+ import en_core_web_sm
13
+ en_core_web_sm.load()
14
+ print("Successfully imported spaCy model")
15
+ #nlp = spacy.load("en_core_web_sm")
16
+ #print(nlp._path)
17
+ except:
18
+ download(model_name)
19
+ spacy.load(model_name)
20
+ print("Successfully imported spaCy model")
21
+ #print(nlp._path)
22
+
23
+
24
+ #if not is_model_installed(model_name):
25
+ # os.system(f"python -m spacy download {model_name}")
26
+ model_name = "en_core_web_sm"
27
+ spacy_model_installed(model_name)
28
+
29
+ #spacy.load(model_name)
30
+ # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
31
+ #os.system("pip uninstall -y gradio")
32
+ #os.system("pip install gradio==3.50.0")
33
+ #os.system("python -m spacy download en_core_web_lg")
34
+
35
+ import re
36
+ import secrets
37
+ import base64
38
+ import time
39
+
40
+ import pandas as pd
41
+
42
+ from faker import Faker
43
+
44
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecognizer
45
+ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
46
+ from presidio_anonymizer.entities import OperatorConfig
47
+
48
+
49
+
50
+ def anon_consistent_names(df):
51
+ # ## Pick out common names and replace them with the same person value
52
+ df_dict = df.to_dict(orient="list")
53
+
54
+ analyzer = AnalyzerEngine()
55
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
56
+
57
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
58
+ analyzer_results = list(analyzer_results)
59
+
60
+ # + tags=[]
61
+ text = analyzer_results[3].value
62
+
63
+ # + tags=[]
64
+ recognizer_result = str(analyzer_results[3].recognizer_results)
65
+
66
+ # + tags=[]
67
+ recognizer_result
68
+
69
+ # + tags=[]
70
+ data_str = recognizer_result # abbreviated for brevity
71
+
72
+ # Adjusting the parse_dict function to handle trailing ']'
73
+ # Splitting the main data string into individual list strings
74
+ list_strs = data_str[1:-1].split('], [')
75
+
76
+ def parse_dict(s):
77
+ s = s.strip('[]') # Removing any surrounding brackets
78
+ items = s.split(', ')
79
+ d = {}
80
+ for item in items:
81
+ key, value = item.split(': ')
82
+ if key == 'score':
83
+ d[key] = float(value)
84
+ elif key in ['start', 'end']:
85
+ d[key] = int(value)
86
+ else:
87
+ d[key] = value
88
+ return d
89
+
90
+ # Re-running the improved processing code
91
+
92
+ result = []
93
+
94
+ for lst_str in list_strs:
95
+ # Splitting each list string into individual dictionary strings
96
+ dict_strs = lst_str.split(', type: ')
97
+ dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
98
+
99
+ # Parsing each dictionary string
100
+ dicts = [parse_dict(d) for d in dict_strs]
101
+ result.append(dicts)
102
+
103
+ #result
104
+
105
+ # + tags=[]
106
+ names = []
107
+
108
+ for idx, paragraph in enumerate(text):
109
+ paragraph_texts = []
110
+ for dictionary in result[idx]:
111
+ if dictionary['type'] == 'PERSON':
112
+ paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
113
+ names.append(paragraph_texts)
114
+
115
+ # + tags=[]
116
+ # Flatten the list of lists and extract unique names
117
+ unique_names = list(set(name for sublist in names for name in sublist))
118
+
119
+ # + tags=[]
120
+ fake_names = pd.Series(unique_names).apply(fake_first_name)
121
+
122
+ # + tags=[]
123
+ mapping_df = pd.DataFrame(data={"Unique names":unique_names,
124
+ "Fake names": fake_names})
125
+
126
+ # + tags=[]
127
+ # Convert mapping dataframe to dictionary
128
+ # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
129
+ name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
130
+
131
+ # + tags=[]
132
+ name_map
133
+
134
+ # + tags=[]
135
+ scrubbed_df_consistent_names = df.replace(name_map, regex = True)
136
+
137
+ # + tags=[]
138
+ scrubbed_df_consistent_names
139
+
140
+ return scrubbed_df_consistent_names
141
+
142
+ def detect_file_type(filename):
143
+ """Detect the file type based on its extension."""
144
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
145
+ return 'csv'
146
+ elif filename.endswith('.xlsx'):
147
+ return 'xlsx'
148
+ elif filename.endswith('.parquet'):
149
+ return 'parquet'
150
+ else:
151
+ raise ValueError("Unsupported file type.")
152
+
153
+ def read_file(filename):
154
+ """Read the file based on its detected type."""
155
+ file_type = detect_file_type(filename)
156
+
157
+ if file_type == 'csv':
158
+ return pd.read_csv(filename, low_memory=False)
159
+ elif file_type == 'xlsx':
160
+ return pd.read_excel(filename)
161
+ elif file_type == 'parquet':
162
+ return pd.read_parquet(filename)
163
+
164
+ def anonymise_script(text_list:List[str], anon_strat:str, nlp_analyser=None):
165
+
166
+ #print(df.shape)
167
+
168
+ #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "")
169
+ #print("Length of input series blank at start is: ", df_chosen_col_mask.value_counts())
170
+
171
+ # DataFrame to dict
172
+ df_dict = pd.DataFrame(data={"text":text_list}).to_dict(orient="list")
173
+
174
+ if nlp_analyser:
175
+ analyzer = nlp_analyser
176
+ else:
177
+ analyzer = AnalyzerEngine()
178
+
179
+ # Add titles to analyzer list
180
+ titles_recognizer = PatternRecognizer(supported_entity="TITLE",
181
+ deny_list=["Mr","Mrs","Miss", "Ms", "mr", "mrs", "miss", "ms"])
182
+
183
+ analyzer.registry.add_recognizer(titles_recognizer)
184
+
185
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
186
+
187
+ anonymizer = AnonymizerEngine()
188
+
189
+ batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
190
+
191
+ print("Identifying personal data")
192
+ analyse_tic = time.perf_counter()
193
+ #analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
194
+ analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
195
+ #print(analyzer_results)
196
+ analyzer_results = list(analyzer_results)
197
+
198
+ analyse_toc = time.perf_counter()
199
+ analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
200
+ print(analyse_time_out)
201
+
202
+ # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
203
+ key = secrets.token_bytes(16) # 128 bits = 16 bytes
204
+ key_string = base64.b64encode(key).decode('utf-8')
205
+
206
+ # Create faker function (note that it has to receive a value)
207
+
208
+ fake = Faker("en_UK")
209
+
210
+ def fake_first_name(x):
211
+ return fake.first_name()
212
+
213
+ # Set up the anonymization configuration WITHOUT DATE_TIME
214
+ replace_config = {"DEFAULT": OperatorConfig("replace")}
215
+ redact_config = {"DEFAULT": OperatorConfig("redact")}
216
+ hash_config = {"DEFAULT": OperatorConfig("hash")}
217
+ mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True})}
218
+ people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})} # The encryption uses an AES cipher in CBC mode and requires a cryptographic key for both encryption and decryption.
219
+ fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}
220
+
221
+
222
+ if anon_strat == "replace": chosen_mask_config = replace_config
223
+ if anon_strat == "redact": chosen_mask_config = redact_config
224
+ if anon_strat == "hash": chosen_mask_config = hash_config
225
+ if anon_strat == "mask": chosen_mask_config = mask_config
226
+ if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
227
+ elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
228
+
229
+ # I think in general people will want to keep date / times - NOT FOR TOPIC MODELLING
230
+ #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
231
+
232
+ #combined_config = {**chosen_mask_config, **keep_date_config}
233
+ combined_config = {**chosen_mask_config}#, **keep_date_config}
234
+ combined_config
235
+
236
+ print("Anonymising personal data")
237
+ anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
238
+
239
+ #print(anonymizer_results)
240
+
241
+ scrubbed_df = pd.DataFrame(data={"text":anonymizer_results["text"]})
242
+
243
+ scrubbed_series = scrubbed_df["text"]
244
+
245
+ #print(scrubbed_series[0:6])
246
+
247
+ #print("Length of output series is: ", len(scrubbed_series))
248
+ #print("Length of input series at end is: ", len(df[chosen_col]))
249
+
250
+
251
+ #scrubbed_values_mask = (scrubbed_series.isnull()) | (scrubbed_series.str.strip() == "")
252
+ #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "")
253
+
254
+ #print("Length of input series blank at end is: ", df_chosen_col_mask.value_counts())
255
+ #print("Length of output series blank is: ", scrubbed_values_mask.value_counts())
256
+
257
+
258
+ # Create reporting message
259
+ out_message = "Successfully anonymised"
260
+
261
+ if anon_strat == "encrypt":
262
+ out_message = out_message + ". Your decryption key is " + key_string + "."
263
+
264
+ return scrubbed_series, out_message
265
+
266
+ def do_anonymise(in_file:str, anon_strat:str, chosen_cols:List[str]):
267
+
268
+ # Load file
269
+
270
+ anon_df = pd.DataFrame()
271
+
272
+ if in_file:
273
+ for match_file in in_file:
274
+ match_temp_file = pd.read_csv(match_file.name, delimiter = ",", low_memory=False)#, encoding='cp1252')
275
+ anon_df = pd.concat([anon_df, match_temp_file])
276
+
277
+ # Split dataframe to keep only selected columns
278
+ all_cols_original_order = list(anon_df.columns)
279
+ anon_df_part = anon_df[chosen_cols]
280
+ anon_df_remain = anon_df.drop(chosen_cols, axis = 1)
281
+
282
+ # Anonymise the selected columns
283
+ anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat)
284
+
285
+ # Rejoin the dataframe together
286
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
287
+ anon_df_out = anon_df_out[all_cols_original_order]
288
+
289
+ # Export file
290
+ out_file_part = re.sub(r'\.csv', '', match_file.name)
291
+
292
+ anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"
293
+
294
+ anon_df_out.to_csv(anon_export_file_name, index = None)
295
+
296
+ return out_message, anon_export_file_name
tools/aws_functions.py ADDED
@@ -0,0 +1,164 @@
1
+ from typing import Type
2
+ import pandas as pd
3
+ import boto3
4
+ import tempfile
5
+ import os
6
+
7
+ PandasDataFrame = Type[pd.DataFrame]
8
+ bucket_name = os.environ.get('DOCUMENT_REDACTION_BUCKET', '') # Avoid a KeyError at import time if the variable is not set
9
+
10
+ try:
11
+ session = boto3.Session() # profile_name="default"
12
+ except Exception as e:
13
+ print(e)
14
+
15
+ # sts = session.client("sts")
16
+ # Create a Session with the IAM role ARN
17
+ # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
18
+ # response = sts.assume_role(
19
+ # RoleArn=aws_role,
20
+ # RoleSessionName="ecs-test-session"
21
+ # )
22
+ # print(response)
23
+
24
+
25
+ def get_assumed_role_info():
26
+ sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
27
+ response = sts.get_caller_identity()
28
+
29
+ # Extract ARN of the assumed role
30
+ assumed_role_arn = response['Arn']
31
+
32
+ # Extract the name of the assumed role from the ARN
33
+ assumed_role_name = assumed_role_arn.split('/')[-1]
34
+
35
+ return assumed_role_arn, assumed_role_name
36
+
37
+ try:
38
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
39
+
40
+ print("Assumed Role ARN:", assumed_role_arn)
41
+ print("Assumed Role Name:", assumed_role_name)
42
+ except Exception as e:
43
+ print(e)
44
+
45
+ # Download direct from S3 - requires login credentials
46
+ def download_file_from_s3(bucket_name, key, local_file_path):
47
+
48
+ s3 = boto3.client('s3')
49
+ s3.download_file(bucket_name, key, local_file_path)
50
+ print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
51
+
52
+ #download_file_from_s3(bucket_name, object_key, local_file_loc)
53
+
54
+ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
55
+ """
56
+ Download all files from an S3 folder to a local folder.
57
+ """
58
+ s3 = boto3.client('s3')
59
+
60
+ # List objects in the specified S3 folder
61
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
62
+
63
+ # Download each object
64
+ for obj in response.get('Contents', []):
65
+ # Extract object key and construct local file path
66
+ object_key = obj['Key']
67
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
68
+
69
+ # Create directories if necessary
70
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
71
+
72
+ # Download the object
73
+ try:
74
+ s3.download_file(bucket_name, object_key, local_file_path)
75
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
76
+ except Exception as e:
77
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
78
+
79
+
80
+ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
81
+ """
82
+ Download specific files from an S3 folder to a local folder.
83
+ """
84
+ s3 = boto3.client('s3')
85
+
86
+ print("Trying to download file: ", filenames)
87
+
88
+ if filenames == '*':
89
+ # List all objects in the S3 folder
90
+ print("Trying to download all files in AWS folder: ", s3_folder)
91
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
92
+
93
+ print("Found files in AWS folder: ", response.get('Contents', []))
94
+
95
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
96
+
97
+ print("Found filenames in AWS folder: ", filenames)
98
+
99
+ for filename in filenames:
100
+ object_key = os.path.join(s3_folder, filename)
101
+ local_file_path = os.path.join(local_folder, filename)
102
+
103
+ # Create directories if necessary
104
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
105
+
106
+ # Download the object
107
+ try:
108
+ s3.download_file(bucket_name, object_key, local_file_path)
109
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
110
+ except Exception as e:
111
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
112
+
113
+
114
+
115
+ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
116
+
117
+ temp_dir = tempfile.mkdtemp()
118
+ local_address_stub = temp_dir + '/doc-redaction/'
119
+ files = []
120
+
121
+ if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
122
+ out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
123
+ return files, out_message
124
+
125
+ if aws_password:
126
+ if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:
127
+
128
+ s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'
129
+
130
+ local_folder_path = local_address_stub
131
+
132
+ # Check if folder exists
133
+ if not os.path.exists(local_folder_path):
134
+ print(f"Folder {local_folder_path} does not exist! Making folder.")
135
+
136
+ os.mkdir(local_folder_path)
137
+
138
+ # Check if folder is empty
139
+ if len(os.listdir(local_folder_path)) == 0:
140
+ print(f"Folder {local_folder_path} is empty")
141
+ # Download data
142
+ download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
143
+
144
+ print("AWS data downloaded")
145
+
146
+ else:
147
+ print(f"Folder {local_folder_path} is not empty")
148
+
149
+ #files = os.listdir(local_folder_stub)
150
+ #print(files)
151
+
152
+ files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
153
+
154
+ out_message = "Data successfully loaded from AWS"
155
+ print(out_message)
156
+
157
+ else:
158
+ out_message = "Data not loaded from AWS"
159
+ print(out_message)
160
+ else:
161
+ out_message = "No password provided. Please ask the data team for access if you need this."
162
+ print(out_message)
163
+
164
+ return files, out_message
tools/clean_funcs.py ADDED
@@ -0,0 +1,194 @@
1
+ import re
2
+ import string
3
+ import polars as pl
4
+ import gradio as gr
5
+ import time
6
+ from datetime import datetime
7
+ import tools.anonymiser as anon
8
+ from unstructured.staging.base import convert_to_dataframe
9
+
10
+ from typing import List
11
+ from unstructured.documents.elements import Element
12
+
13
+ from tools.unstructured_funcs import export_elements_as_table_to_file
14
+
15
+ today_rev = datetime.now().strftime("%Y%m%d")
16
+
17
+ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
18
+ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
19
+
20
+ # Adding custom words to the stopwords
21
+ custom_words = []
22
+ my_stop_words = custom_words
23
+
24
+ # #### Some of my cleaning functions
25
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
26
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
27
+ email_pattern_regex = r'\S*@\S*\s?'
28
+ num_pattern_regex = r'[0-9]+'
29
+ nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
30
+ postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
31
+ multiple_spaces_regex = r'\s{2,}'
32
+
33
+ def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean_text:str, data_file_name_no_ext:str="combined_elements", anonymise_drop:List[str]="No", anon_strat:str = "redact", anon_entities:List[str]=chosen_redact_entities, progress=gr.Progress(track_tqdm=True)):
34
+ '''
35
+ Clean open text in tabular format with custom regex or anonymisation.
36
+ '''
37
+
38
+ output_text = ""
39
+ output_list = []
40
+
41
+ progress(0, desc = "Cleaning data")
42
+
43
+ if not in_colnames:
44
+ error_message = "Please enter one column name to use for cleaning and finding topics."
45
+ print(error_message)
46
+ return error_message, None, data, data_file_name_no_ext # Match the number and order of the return values on the success path
47
+
48
+ all_tic = time.perf_counter()
49
+
50
+ output_list = []
51
+ #file_list = [string.name for string in in_files]
52
+
53
+ in_colnames_list_first = in_colnames[0]
54
+
55
+ if clean_text == "Yes":
56
+ clean_tic = time.perf_counter()
57
+ print("Starting data clean.")
58
+
59
+ for element in data:
60
+ if not custom_regex.empty:
61
+ cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
62
+ else:
63
+ cleaned_data = initial_clean([element.text], [])
64
+
65
+ element.text = cleaned_data[0]
66
+ print(element.text)
67
+
68
+ clean_toc = time.perf_counter()
69
+ clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
70
+ print(clean_time_out)
71
+
72
+ if anonymise_drop == "Yes":
73
+ progress(0.6, desc= "Anonymising data")
74
+
75
+ data_file_name_no_ext = data_file_name_no_ext + "_anon"
76
+
77
+ anon_tic = time.perf_counter()
78
+
79
+ data_list = []
80
+
81
+ for element in data:
82
+ data_list.append(element.text)
83
+
84
+ data_anon_col, anonymisation_success = anon.anonymise_script(data_list, anon_strat=anon_strat)
85
+
86
+ for i, element in enumerate(data):
87
+ element.text = data_anon_col[i]
88
+
89
+ print(anonymisation_success)
90
+
91
+ anon_toc = time.perf_counter()
92
+ time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
93
+
94
+ alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(data, data_file_name_no_ext, file_name_suffix="_clean")
95
+
96
+ all_toc = time.perf_counter()
97
+ time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
98
+ print(time_out)
99
+
100
+ output_text = "Data clean completed."
101
+
102
+ return output_text, out_files, data, output_file_base
103
+
104
+
105
+ def initial_clean(texts, custom_regex, progress=gr.Progress()):
106
+ #texts = pl.Series(texts).str.strip_chars()
107
+ #text = texts.str.replace_all(html_pattern_regex, ' ')
108
+ #text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
109
+ #text = text.str.replace_all(email_pattern_regex, ' ')
110
+ #text = text.str.replace_all(nums_two_more_regex, ' ')
111
+ #text = text.str.replace_all(postcode_pattern_regex, ' ')
112
+
113
+ text = pl.Series(texts)
114
+
115
+ # Allow for custom regex patterns to be removed
116
+ if len(custom_regex) > 0:
117
+ for pattern in custom_regex:
118
+ raw_string_pattern = rf"{pattern}" # Use the pattern as provided (prefix with (?i) in the pattern itself for case-insensitive matching)
119
+ #print(f"Removing regex pattern: {raw_string_pattern}")
120
+ text = text.str.replace_all(raw_string_pattern, " ")
121
+ #print("Text without pattern: ", text[0])
122
+
123
+
124
+ #text = text.str.replace_all(multiple_spaces_regex, ' ')
125
+
126
+ text = text.to_list()
127
+
128
+ return text
129
+
130
+ def remove_hyphens(text_text):
131
+ return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
132
+
133
+
134
+ def remove_characters_after_tokenization(tokens):
135
+ pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
136
+ filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
137
+ return filtered_tokens
138
+
139
+ def convert_to_lowercase(tokens):
140
+ return [token.lower() for token in tokens if token.isalpha()]
141
+
142
+ def remove_short_tokens(tokens):
143
+ return [token for token in tokens if len(token) > 3]
144
+
145
+
146
+ def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
147
+ # Identify duplicates in the data: https://stackoverflow.com/questions/44191465/efficiently-identify-duplicates-in-large-list-500-000
148
+ # Only identifies the second duplicate
149
+
150
+ seen = set()
151
+ dups = []
152
+
153
+ for i, doi in enumerate(data_samples_ready):
154
+ if doi not in seen:
155
+ seen.add(doi)
156
+ else:
157
+ dups.append(i)
158
+ #data_samples_ready[dupes[0:]]
159
+
160
+ # To see a specific duplicated value you know the position of
161
+ #matching = [s for s in data_samples_ready if data_samples_ready[83] in s]
162
+ #matching
163
+
164
+ # Remove duplicates only (keep first instance)
165
+ #data_samples_ready = list( dict.fromkeys(data_samples_ready) ) # This way would keep one version of the duplicates
166
+
167
+ ### Remove all duplicates including original instance
168
+
169
+ # Identify ALL duplicates including initial values
170
+ # https://stackoverflow.com/questions/11236006/identify-duplicate-values-in-a-list-in-python
171
+
172
+ from collections import defaultdict
173
+ D = defaultdict(list)
174
+ for i,item in enumerate(data_samples_ready):
175
+ D[item].append(i)
176
+ D = {k:v for k,v in D.items() if len(v)>1}
177
+
178
+ # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
179
+ L = list(D.values())
180
+ flat_list_dups = [item for sublist in L for item in sublist]
181
+
182
+ # https://stackoverflow.com/questions/11303225/how-to-remove-multiple-indexes-from-a-list-at-the-same-time
183
+ for index in sorted(flat_list_dups, reverse=True):
184
+ del data_samples_ready[index]
185
+ del data_samples_clean[index]
186
+ del data_samples[index]
187
+
188
+ # Remove blanks
189
+ data_samples_ready = [i for i in data_samples_ready if i]
190
+ data_samples_clean = [i for i in data_samples_clean if i]
191
+ data_samples = [i for i in data_samples if i]
192
+
193
+ return data_samples_ready, data_samples_clean, flat_list_dups, data_samples
194
+
tools/file_conversion.py ADDED
@@ -0,0 +1,140 @@
1
+ from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_path_end
3
+ from PIL import Image
4
+ import os
5
+ from gradio import Progress
6
+ from typing import List
7
+
8
+ def is_pdf_or_image(filename):
9
+ """
10
+ Check if a file name is a PDF or an image file.
11
+
12
+ Args:
13
+ filename (str): The name of the file.
14
+
15
+ Returns:
16
+ bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
17
+ """
18
+ if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
19
+ output = True
20
+ else:
21
+ output = False
22
+ return output
23
+
24
+ def is_pdf(filename):
25
+ """
26
+ Check if a file name is a PDF.
27
+
28
+ Args:
29
+ filename (str): The name of the file.
30
+
31
+ Returns:
32
+ bool: True if the file name ends with ".pdf", False otherwise.
33
+ """
34
+ return filename.lower().endswith(".pdf")
35
+
36
+ # %%
37
+ ## Convert pdf to image if necessary
38
+
39
+ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
40
+
41
+ # Get the number of pages in the PDF
42
+ page_count = pdfinfo_from_path(pdf_path)['Pages']
43
+ print("Number of pages in PDF: ", str(page_count))
44
+
45
+ images = []
46
+
47
+ # Open the PDF file
48
+ for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
+
50
+ print("Current page: ", str(page_num))
51
+
52
+ # Convert one page to image
53
+ image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
54
+
55
+ # If no images are returned, break the loop
56
+ if not image:
57
+ break
58
+
59
+ images.extend(image)
60
+
61
+ print("PDF has been converted to images.")
62
+
63
+ return images
64
+
65
+
66
+ # %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
67
+ def process_file(file_path):
68
+ # Get the file extension
69
+ file_extension = os.path.splitext(file_path)[1].lower()
70
+
71
+ # Check if the file is an image type
72
+ if file_extension in ['.jpg', '.jpeg', '.png']:
73
+ print(f"{file_path} is an image file.")
74
+ # Perform image processing here
75
+ out_path = [Image.open(file_path)]
76
+
77
+ # Check if the file is a PDF
78
+ elif file_extension == '.pdf':
79
+ print(f"{file_path} is a PDF file. Converting to image set")
80
+ # Run your function for processing PDF files here
81
+ out_path = convert_pdf_to_images(file_path)
82
+
83
+ else:
84
+ print(f"{file_path} is not an image or PDF file.")
85
+ out_path = ['']
86
+
87
+ return out_path
88
+
89
+ def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
90
+
91
+ out_message = ''
92
+ out_file_paths = []
93
+
94
+ in_allow_list_flat = [item for sublist in in_allow_list for item in sublist] if in_allow_list else []
95
+
96
+ if file_path:
97
+ file_path_without_ext = get_file_path_end(file_path)
98
+ else:
99
+ out_message = "No file selected"
100
+ print(out_message)
101
+ return out_message, out_file_paths
102
+
103
+ if in_redact_method == "Image analysis":
104
+ # Analyse and redact image-based pdf or image
105
+ if is_pdf_or_image(file_path) == False:
106
+ return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
107
+
108
+ out_file_path = process_file(file_path)
109
+
110
+ elif in_redact_method == "Text analysis":
111
+ if is_pdf(file_path) == False:
112
+ return "Please upload a PDF file for text analysis.", None
113
+
114
+ out_file_path = file_path
115
+
116
+ return out_message, out_file_path
117
+
118
+
119
+ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
120
+ file_path_without_ext = get_file_path_end(in_file_path)
121
+
122
+ out_file_paths = out_text_file_path
123
+
124
+ # Convert annotated text pdf back to image to give genuine redactions
125
+ print("Creating image version of results")
126
+ pdf_text_image_paths = process_file(out_text_file_path[0])
127
+ out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
+ pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
+
130
+ out_file_paths.append(out_text_image_file_path)
131
+
132
+ out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
133
+
134
+ return out_message, out_file_paths
135
+
136
+
137
+
138
+
139
+
140
+
tools/file_redaction.py ADDED
@@ -0,0 +1,236 @@
1
+ from PIL import Image
2
+ from typing import List
3
+ import pandas as pd
4
+ from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
+ from pdfminer.high_level import extract_pages
6
+ from tools.file_conversion import process_file
7
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
+ from pikepdf import Pdf, Dictionary, Name
9
+ from gradio import Progress
10
+ import time
11
+
12
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
13
+ from tools.helper_functions import get_file_path_end
14
+ from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
15
+ import gradio as gr
16
+
17
+ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
18
+
19
+ tic = time.perf_counter()
20
+
21
+ out_message = ''
22
+ out_file_paths = []
23
+
24
+ if in_allow_list:
25
+ in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
26
+
27
+ if file_path:
28
+ file_path_without_ext = get_file_path_end(file_path)
29
+ else:
30
+ out_message = "No file selected"
31
+ print(out_message)
32
+ return out_message, out_file_paths
33
+
34
+ if in_redact_method == "Image analysis":
35
+ # Analyse and redact image-based pdf or image
36
+ # if is_pdf_or_image(file_path) == False:
37
+ # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
38
+
39
+ pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
40
+ out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
41
+ pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
42
+
43
+ out_file_paths.append(out_image_file_path)
44
+ out_message = "Image-based PDF successfully redacted and saved to file."
45
+
46
+ elif in_redact_method == "Text analysis":
47
+ if is_pdf(file_path) == False:
48
+ return "Please upload a PDF file for text analysis.", None
49
+
50
+ # Analyse text-based pdf
51
+ pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
52
+ out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
53
+ pdf_text.save(out_text_file_path)
54
+
55
+ out_file_paths.append(out_text_file_path)
56
+
57
+ out_message = "Text-based PDF successfully redacted and saved to file."
58
+
59
+ else:
60
+ out_message = "No redaction method selected"
61
+ print(out_message)
62
+ return out_message, out_file_paths
63
+
64
+ toc = time.perf_counter()
65
+ out_time = f"Time taken: {toc - tic:0.1f} seconds."
66
+ print(out_time)
67
+
68
+ out_message = out_message + "\n\n" + out_time
69
+
70
+ return out_message, out_file_paths, out_file_paths
71
+
72
+
73
+ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
+ '''
75
+ take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
76
+ '''
77
+
78
+ if not image_paths:
79
+
80
+ out_message = "PDF does not exist as images. Converting pages to image"
81
+ print(out_message)
82
+ progress(0, desc=out_message)
83
+
84
+ image_paths = process_file(file_path)
85
+
86
+ # Create a new PDF
87
+ #pdf = pikepdf.new()
88
+
89
+ images = []
90
+ number_of_pages = len(image_paths)
91
+
92
+ out_message = "Redacting pages"
93
+ print(out_message)
94
+ progress(0.1, desc=out_message)
95
+
96
+ for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
97
+
98
+ print("Redacting page ", str(i + 1))
99
+
100
+ # Get the image to redact using PIL lib (pillow)
101
+ image = image_paths[i] #Image.open(image_paths[i])
102
+
103
+ # %%
104
+ image_analyser = ImageAnalyzerEngine(nlp_analyser)
105
+ engine = ImageRedactorEngine(image_analyser)
106
+
107
+ if language == 'en':
108
+ ocr_lang = 'eng'
109
+ else: ocr_lang = language
110
+
111
+ # %%
112
+ # Redact the image with pink color
113
+ redacted_image = engine.redact(image,
114
+ fill=(0, 0, 0),
115
+ ocr_kwargs={"lang": ocr_lang},
116
+ allow_list=allow_list,
117
+ ad_hoc_recognizers= None,
118
+ **{
119
+ "language": language,
120
+ "entities": chosen_redact_entities,
121
+ "score_threshold": score_threshold
122
+ },
123
+ )
124
+
125
+ images.append(redacted_image)
126
+
127
+ return images
128
+
129
+ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
130
+ '''
131
+ Redact chosen entities from a text-based PDF, i.e. one whose pages contain extractable text rather than images.
132
+ '''
133
+
134
+ combined_analyzer_results = []
135
+ analyser_explanations = []
136
+ annotations_all_pages = []
137
+ analyzed_bounding_boxes_df = pd.DataFrame()
138
+
139
+ pdf = Pdf.open(filename)
140
+
141
+ page_num = 0
142
+
143
+ for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
144
+
145
+
146
+ print("Page number is: ", page_num)
147
+
148
+ annotations_on_page = []
149
+ analyzed_bounding_boxes = []
150
+
151
+ for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
152
+ analyzer_results = []
153
+
154
+ for text_container in page_layout:
155
+ if isinstance(text_container, LTTextContainer):
156
+ text_to_analyze = text_container.get_text()
157
+
158
+ analyzer_results = []
159
+ characters = []
160
+
161
+ analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
162
+ language=language,
163
+ entities=chosen_redact_entities,
164
+ score_threshold=score_threshold,
165
+ return_decision_process=False,
166
+ allow_list=allow_list)
167
+
168
+ #if analyzer_results:
169
+ # pass
170
+ #explanation = analyzer_results[0].analysis_explanation.to_dict()
171
+ #analyser_explanations.append(explanation)
172
+ characters = [char # This is what we want to include in the list
173
+ for line in text_container # Loop through each line in text_container
174
+ if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
175
+ for char in line] # Loop through each character in the line
176
+ #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
177
+
178
+ # If any results found
179
+ print(analyzer_results)
180
+
181
+ if len(analyzer_results) > 0 and len(characters) > 0:
182
+ analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
183
+ combined_analyzer_results.extend(analyzer_results)
184
+
185
+ if len(analyzer_results) > 0:
186
+ # Create summary df of annotations to be made
187
+ analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
188
+ analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
189
+ analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
190
+ analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
191
+ analyzed_bounding_boxes_df_new['page'] = page_num + 1
192
+ analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
193
+
194
+ for analyzed_bounding_box in analyzed_bounding_boxes:
195
+ bounding_box = analyzed_bounding_box["boundingBox"]
196
+ annotation = Dictionary(
197
+ Type=Name.Annot,
198
+ Subtype=Name.Highlight,
199
+ QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
200
+ Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
201
+ C=[0, 0, 0],
202
+ CA=1, # Opacity (1 = fully opaque)
203
+ T=analyzed_bounding_box["result"].entity_type
204
+ )
205
+ annotations_on_page.append(annotation)
206
+
207
+ annotations_all_pages.extend([annotations_on_page])
208
+
209
+ print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
210
+ page.Annots = pdf.make_indirect(annotations_on_page)
211
+
212
+ page_num += 1
213
+
214
+ # Extracting data from dictionaries
215
+ # extracted_data = []
216
+ # for item in annotations_all_pages:
217
+ # temp_dict = {}
218
+ # #print(item)
219
+ # for key, value in item.items():
220
+ # if isinstance(value, Decimal):
221
+ # temp_dict[key] = float(value)
222
+ # elif isinstance(value, list):
223
+ # temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
224
+ # else:
225
+ # temp_dict[key] = value
226
+ # extracted_data.append(temp_dict)
227
+
228
+ # Creating DataFrame
229
+ # annotations_out = pd.DataFrame(extracted_data)
230
+ #print(df)
231
+
232
+ #annotations_out.to_csv("examples/annotations.csv")
233
+
234
+ analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
235
+
236
+ return pdf
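The function above returns the open pikepdf object rather than writing it to disk; a short sketch of how a caller might persist it, mirroring what choose_and_run_redactor does (the file names here are hypothetical):

# Hypothetical follow-up: save the annotated PDF returned by redact_text_pdf
redacted_pdf = redact_text_pdf("examples/letter.pdf", "en", ["PERSON", "EMAIL_ADDRESS"])
redacted_pdf.save("output/letter_result_as_text.pdf")
redacted_pdf.close()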
tools/helper_functions.py ADDED
@@ -0,0 +1,126 @@
1
+ import os
2
+ import pandas as pd
3
+ import gzip
4
+ import pickle
5
+ import numpy as np
6
+
7
+ def get_file_path_end(file_path):
8
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
9
+ basename = os.path.basename(file_path)
10
+
11
+ # Then, split the basename and its extension and return only the basename without the extension
12
+ filename_without_extension, _ = os.path.splitext(basename)
13
+
14
+ #print(filename_without_extension)
15
+
16
+ return filename_without_extension
17
+
18
+ def get_file_path_end_with_ext(file_path):
19
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
20
+ basename = os.path.basename(file_path)
21
+
22
+ return basename
23
+
24
+ def ensure_output_folder_exists():
25
+ """Checks if the 'output/' folder exists, creates it if not."""
26
+
27
+ folder_name = "output/"
28
+
29
+ if not os.path.exists(folder_name):
30
+ # Create the folder if it doesn't exist
31
+ os.makedirs(folder_name)
32
+ print(f"Created the 'output/' folder.")
33
+ else:
34
+ print(f"The 'output/' folder already exists.")
35
+
36
+ def detect_file_type(filename):
37
+ """Detect the file type based on its extension."""
38
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
39
+ return 'csv'
40
+ elif filename.endswith('.xlsx'):
41
+ return 'xlsx'
42
+ elif filename.endswith('.parquet'):
43
+ return 'parquet'
44
+ elif filename.endswith('.pkl.gz'):
45
+ return 'pkl.gz'
46
+ elif filename.endswith('.pkl'):
47
+ return 'pkl'
48
+ elif filename.endswith('.npz'):
49
+ return 'npz'
50
+ else:
51
+ raise ValueError("Unsupported file type.")
52
+
53
+
54
+ def read_file(filename, headers=0):
55
+ """Read the file based on its detected type."""
56
+ file_type = detect_file_type(filename)
57
+
58
+ print("Loading in file")
59
+
60
+ if file_type == 'csv':
61
+ file = pd.read_csv(filename, low_memory=False, header=headers)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
62
+ elif file_type == 'xlsx':
63
+ file = pd.read_excel(filename, header=headers)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
64
+ elif file_type == 'parquet':
65
+ file = pd.read_parquet(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
66
+ elif file_type == 'pkl.gz':
67
+ with gzip.open(filename, 'rb') as file:
68
+ file = pickle.load(file)
69
+ #file = pd.read_pickle(filename)
+ elif file_type == 'pkl':
+ file = pd.read_pickle(filename)
70
+ elif file_type == 'npz':
71
+ file = np.load(filename)['arr_0']
72
+
73
+ # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
74
+ if "compress" in filename:
75
+ file /= 100
76
+
77
+ print("File load complete")
78
+
79
+ return file
80
+
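A quick sketch of how detect_file_type and read_file fit together; the file names are made up for illustration.

# Hypothetical examples of the helpers above
print(detect_file_type("allow_list.csv"))             # -> 'csv'
print(detect_file_type("embeddings_compress.npz"))    # -> 'npz' (values are divided by 100 on load)
allow_list_df = read_file("allow_list.csv")           # returns a pandas DataFrame for tabular types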
81
+ # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
82
+ def add_folder_to_path(folder_path: str):
83
+ '''
84
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
85
+ '''
86
+
87
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
88
+ print(folder_path, "folder exists.")
89
+
90
+ # Resolve relative path to absolute path
91
+ absolute_path = os.path.abspath(folder_path)
92
+
93
+ current_path = os.environ['PATH']
94
+ if absolute_path not in current_path.split(os.pathsep):
95
+ full_path_extension = absolute_path + os.pathsep + current_path
96
+ os.environ['PATH'] = full_path_extension
97
+ print(f"Updated PATH with: ", full_path_extension)
98
+ else:
99
+ print(f"Directory {folder_path} already exists in PATH.")
100
+ else:
101
+ print(f"Folder not found at {folder_path} - not added to PATH")
102
+
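For the pyinstaller case described in the comment above, the helper would typically be called once at start-up with the bundled folders. The folder paths below are assumptions based on the tesseract and poppler folders mentioned in that comment, not calls shown in this commit.

# Hypothetical start-up calls for a locally built executable
add_folder_to_path("tesseract/")    # bundled Tesseract OCR binaries
add_folder_to_path("poppler/bin/")  # bundled Poppler utilities for PDF-to-image conversion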
103
+ def custom_regex_load(in_file, headers = None):
104
+ '''
105
+ When a file is uploaded, read the first CSV file found and return it along with a status message.
106
+ '''
107
+
108
+ custom_regex = pd.DataFrame()
109
+
110
+ file_list = [string.name for string in in_file]
111
+
112
+ regex_file_names = [string for string in file_list if "csv" in string.lower()]
113
+ if regex_file_names:
114
+ regex_file_name = regex_file_names[0]
115
+ custom_regex = read_file(regex_file_name, headers)
116
+ #regex_file_name_no_ext = get_file_path_end(regex_file_name)
117
+
118
+ output_text = "Data file loaded."
119
+ print(output_text)
120
+ else:
121
+ error = "No regex file provided."
122
+ print(error)
123
+ output_text = error
124
+ return error, custom_regex
125
+
126
+ return output_text, custom_regex
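custom_regex_load expects Gradio upload objects, i.e. anything exposing a .name attribute that points at the uploaded file. A minimal sketch using a stand-in object; the CSV name is hypothetical.

from types import SimpleNamespace

# Hypothetical stand-in for a Gradio upload: only the .name attribute is used
uploaded = [SimpleNamespace(name="allow_list.csv")]
status, allow_list_df = custom_regex_load(uploaded)
print(status)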
tools/load_spacy_model_custom_recognisers.py ADDED
@@ -0,0 +1,168 @@
1
+ # %%
2
+ from typing import List
3
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
+ import spacy
6
+ spacy.prefer_gpu()
7
+ from spacy.cli.download import download
8
+ import re
9
+
10
+ # %%
11
+ model_name = "en_core_web_sm" #"en_core_web_trf"
12
+ score_threshold = 0.001
13
+
14
+ # %% [markdown]
15
+ # #### Custom recognisers
16
+
17
+ # %%
18
+ # Custom title recogniser
19
+ titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
+ titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + ' \\b'
22
+ titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
23
+ titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
24
+
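A small sketch for exercising the titles recogniser on its own, in the same spirit as the commented postcode test further down; the sentence is made up.

# Sketch: test the titles recogniser in isolation
example_text = "Dr Smith and Mrs Jones attended the meeting."
titles_results = titles_recogniser.analyze(text=example_text, entities=["TITLES"])
print(titles_results)  # should flag the spans covering "Dr " and "Mrs "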
25
+ # %%
26
+ # Custom postcode recogniser
27
+
28
+ # Define the regex pattern in a Presidio `Pattern` object:
29
+ ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
30
+
31
+ # Define the recognizer with one or more patterns
32
+ ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
33
+
34
+ # %%
35
+ # Examples for testing
36
+
37
+ #text = "I live in 510 Broad st SE5 9NG ."
38
+
39
+ #numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
40
+ #print("Result:")
41
+ #print(numbers_result)
42
+
43
+ # %%
44
+ def extract_street_name(text:str) -> str:
45
+ """
46
+ Extracts the street name and preceding word (that should contain at least one number) from the given text.
47
+
48
+ """
49
+
50
+ street_types = [
51
+ 'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
52
+ 'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
53
+ 'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
54
+ 'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
55
+ 'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
56
+ 'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
57
+ 'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
58
+ 'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
59
+ ]
60
+
61
+ # Dynamically construct the regex pattern with all possible street types
62
+ street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
63
+
64
+ # The overall regex pattern to capture the street name and preceding word(s)
65
+
66
+ pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
67
+ pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
68
+
69
+ # Find all matches in text
70
+ matches = re.finditer(pattern, text, re.IGNORECASE)
71
+
72
+ start_positions = []
73
+ end_positions = []
74
+
75
+ for match in matches:
76
+ preceding_word = match.group('preceding_word').strip()
77
+ street_name = match.group('street_name').strip()
78
+ start_pos = match.start()
79
+ end_pos = match.end()
80
+ print(f"Start: {start_pos}, End: {end_pos}")
81
+ print(f"Preceding words: {preceding_word}")
82
+ print(f"Street name: {street_name}")
83
+ print()
84
+
85
+ start_positions.append(start_pos)
86
+ end_positions.append(end_pos)
87
+
88
+ return start_positions, end_positions
89
+
90
+
91
+ # %%
92
+ # Some examples for testing
93
+
94
+ #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
95
+ #text = "Roberto lives in Five 10 Broad st in Oregon"
96
+ #text = "Roberto lives in 55 Oregon Square"
97
+ #text = "There is 51a no way I will do that"
98
+ #text = "I am writing to apply for"
99
+
100
+ #extract_street_name(text)
101
+
102
+ # %%
103
+ class StreetNameRecognizer(EntityRecognizer):
104
+
105
+ def load(self) -> None:
106
+ """No loading is required."""
107
+ pass
108
+
109
+ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
110
+ """
111
+ Logic for detecting a specific PII
112
+ """
113
+
114
+ start_pos, end_pos = extract_street_name(text)
115
+
116
+ results = []
117
+
118
+ for i in range(0, len(start_pos)):
119
+
120
+ result = RecognizerResult(
121
+ entity_type="STREETNAME",
122
+ start = start_pos[i],
123
+ end = end_pos[i],
124
+ score= 1
125
+ )
126
+
127
+ results.append(result)
128
+
129
+ return results
130
+
131
+ street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
132
+
133
+ # %%
134
+ # Create a class inheriting from SpacyNlpEngine
135
+ class LoadedSpacyNlpEngine(SpacyNlpEngine):
136
+ def __init__(self, loaded_spacy_model):
137
+ super().__init__()
138
+ self.nlp = {"en": loaded_spacy_model}
139
+
140
+ # %%
141
+ # Load spacy model
142
+ try:
143
+ import en_core_web_lg
144
+ nlp = en_core_web_lg.load()
145
+ print("Successfully imported spaCy model")
146
+
147
+ except ImportError:
148
+ download("en_core_web_lg")
149
+ nlp = spacy.load("en_core_web_lg")
150
+ print("Successfully downloaded and imported spaCy model")
151
+
152
+ # Pass the loaded model to the new LoadedSpacyNlpEngine
153
+ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
154
+
155
+
156
+
157
+ # %%
158
+ nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
159
+ default_score_threshold=score_threshold,
160
+ supported_languages=["en"],
161
+ log_decision_process=True,
162
+ )
163
+
164
+ # %%
165
+ nlp_analyser.registry.add_recognizer(street_recogniser)
166
+ nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
167
+ nlp_analyser.registry.add_recognizer(titles_recogniser)
168
+
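With the three custom recognisers registered, the combined analyser can be exercised directly. A brief sketch; the sentence is invented and the exact matches will depend on the spaCy model and the score threshold.

# Sketch: run the combined analyser over a made-up sentence
sample = "Mr Roberts lives at 12 Acacia Avenue, London SW2 1AA."
results = nlp_analyser.analyze(text=sample,
                               language="en",
                               entities=["TITLES", "STREETNAME", "UKPOSTCODE", "PERSON"],
                               score_threshold=score_threshold)
for result in results:
    print(result.entity_type, sample[result.start:result.end], result.score)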
tools/presidio_analyzer_custom.py ADDED
@@ -0,0 +1,114 @@
1
+ import gradio as gr
2
+ from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+ from tqdm import tqdm
4
+
5
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine
6
+ from presidio_analyzer.nlp_engine import NlpArtifacts
7
+
8
+ def analyze_iterator_custom(
9
+ self,
10
+ texts: Iterable[Union[str, bool, float, int]],
11
+ language: str,
12
+ list_length:int,
13
+ progress=gr.Progress(),
14
+ **kwargs,
15
+ ) -> List[List[RecognizerResult]]:
16
+ """
17
+ Analyze an iterable of strings.
18
+
19
+ :param texts: A list containing strings to be analyzed.
20
+ :param language: Input language
21
+ :param list_length: Length of the input list.
22
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
23
+ """
24
+
25
+ # validate types
26
+ texts = self._validate_types(texts)
27
+
28
+ # Process the texts as batch for improved performance
29
+ nlp_artifacts_batch: Iterator[
30
+ Tuple[str, NlpArtifacts]
31
+ ] = self.analyzer_engine.nlp_engine.process_batch(
32
+ texts=texts, language=language
33
+ )
34
+
35
+
36
+
37
+ list_results = []
38
+ for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
39
+ results = self.analyzer_engine.analyze(
40
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
41
+ )
42
+
43
+ list_results.append(results)
44
+
45
+ return list_results
46
+
47
+ def analyze_dict(
48
+ self,
49
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
50
+ language: str,
51
+ keys_to_skip: Optional[List[str]] = None,
52
+ **kwargs,
53
+ ) -> Iterator[DictAnalyzerResult]:
54
+ """
55
+ Analyze a dictionary of keys (strings) and values/iterable of values.
56
+
57
+ Non-string values are returned as is.
58
+
59
+ :param input_dict: The input dictionary for analysis
60
+ :param language: Input language
61
+ :param keys_to_skip: Keys to ignore during analysis
62
+ :param kwargs: Additional keyword arguments
63
+ for the `AnalyzerEngine.analyze` method.
64
+ Use this to pass arguments to the analyze method,
65
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
66
+ See `AnalyzerEngine.analyze` for the full list.
67
+ """
68
+
69
+ context = []
70
+ if "context" in kwargs:
71
+ context = kwargs["context"]
72
+ del kwargs["context"]
73
+
74
+ if not keys_to_skip:
75
+ keys_to_skip = []
76
+
77
+
78
+ for key, value in input_dict.items():
79
+ if not value or key in keys_to_skip:
80
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
81
+ continue # skip this key as requested
82
+
83
+ # Add the key as an additional context
84
+ specific_context = context[:]
85
+ specific_context.append(key)
86
+
87
+ if type(value) in (str, int, bool, float):
88
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
89
+ text=str(value), language=language, context=[key], **kwargs
90
+ )
91
+ elif isinstance(value, dict):
92
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
93
+ results = self.analyze_dict(
94
+ input_dict=value,
95
+ language=language,
96
+ context=specific_context,
97
+ keys_to_skip=new_keys_to_skip,
98
+ **kwargs,
99
+ )
100
+ elif isinstance(value, Iterable):
101
+ # Recursively iterate nested dicts
102
+ list_length = len(value)
103
+
104
+ results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
105
+ texts=value,
106
+ language=language,
107
+ context=specific_context,
108
+ list_length=list_length,
109
+ **kwargs,
110
+ )
111
+ else:
112
+ raise ValueError(f"type {type(value)} is unsupported.")
113
+
114
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
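Both functions above take self as their first argument, so they appear intended to stand in for the corresponding methods of Presidio's BatchAnalyzerEngine (which exposes an analyzer_engine attribute). The wiring below is an assumption rather than something shown in this commit, and because analyze_iterator_custom uses gr.Progress, it is best run from inside a Gradio event.

from presidio_analyzer import BatchAnalyzerEngine
from tools.load_spacy_model_custom_recognisers import nlp_analyser

# Assumed wiring: pass a BatchAnalyzerEngine built around the custom analyser as 'self'
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
results_iter = analyze_dict(batch_analyzer,
                            input_dict={"notes": ["Call Mr Smith tomorrow", "No personal data here"]},
                            language="en")
for dict_result in results_iter:
    print(dict_result.key, dict_result.recognizer_results)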
tools/unstructured_funcs.py ADDED
@@ -0,0 +1,884 @@
1
+ from unstructured.partition.auto import partition
2
+ from unstructured.chunking.title import chunk_by_title
3
+ from unstructured.chunking.basic import chunk_elements
4
+ from unstructured.documents.elements import Element, Title, CompositeElement
5
+ from unstructured.staging.base import convert_to_dataframe
6
+ from typing import Type, List, Literal, Tuple
7
+
8
+ from unstructured.cleaners.core import replace_unicode_quotes, clean_non_ascii_chars, clean_ordered_bullets, group_broken_paragraphs, clean, clean_trailing_punctuation, remove_punctuation, bytes_string_to_string
9
+ import gradio as gr
10
+ import time
11
+ import pandas as pd
12
+ import re
13
+ import gzip
14
+ import pickle
15
+ from pydantic import BaseModel, Field
16
+
17
+ from tools.helper_functions import get_file_path_end, get_file_path_end_with_ext
18
+
19
+ # Creating an alias for pandas DataFrame using Type
20
+ PandasDataFrame = Type[pd.DataFrame]
21
+
22
+ # %%
23
+ # pdf partitioning strategy vars
24
+ pdf_partition_strat = "ocr_only" # ["fast", "ocr_only", "hi_res"]
25
+
26
+ # %%
27
+ # Element metadata modification vars
28
+ meta_keys_to_filter = ["file_directory", "filetype"]
29
+ element_types_to_filter = ['UncategorizedText', 'Header']
30
+
31
+ # %%
32
+ # Clean function vars
33
+
34
+ bytes_to_string=False
35
+ replace_quotes=True
36
+ clean_non_ascii=False
37
+ clean_ordered_list=True
38
+ group_paragraphs=True
39
+ trailing_punctuation=False
40
+ all_punctuation=False
41
+ clean_text=True
42
+ extra_whitespace=True
43
+ dashes=True
44
+ bullets=True
45
+ lowercase=False
46
+
47
+ # %%
48
+ # Chunking vars
49
+
50
+ minimum_chunk_length = 2000
51
+ start_new_chunk_after_end_of_this_element_length = 2000
52
+ hard_max_character_length_chunks = 3000
53
+ multipage_sections=True
54
+ overlap_all=True
55
+ include_orig_elements=True
56
+
57
+ # %%
58
+ class Document(BaseModel):
59
+ """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
60
+
61
+ page_content: str
62
+ """String text."""
63
+ metadata: dict = Field(default_factory=dict)
64
+ """Arbitrary metadata about the page content (e.g., source, relationships to other
65
+ documents, etc.).
66
+ """
67
+ type: Literal["Document"] = "Document"
68
+
69
+ # %%
70
+ def create_title_id_dict(elements:List[Element]):
71
+
72
+ # Assuming the object is stored in a variable named 'elements_list'
73
+ titles = [item.text for item in elements if isinstance(item, Title)]
74
+
75
+ #### Get all elements under these titles
76
+ chapter_ids = {}
77
+ for element in elements:
78
+ for chapter in titles:
79
+ if element.text == chapter and element.category == "Title":
80
+ chapter_ids[element._element_id] = chapter
81
+ break
82
+
83
+ chapter_to_id = {v: k for k, v in chapter_ids.items()}
84
+
85
+ return chapter_ids, chapter_to_id
86
+
87
+ # %%
88
+ def filter_elements(elements:List[Element], excluded_elements: List[str] = ['']):
89
+ """
90
+ Filter out elements from a list based on their categories.
91
+
92
+ Args:
93
+ elements: The list of elements to filter.
94
+ excluded_elements: A list of element categories to exclude.
95
+
96
+ Returns:
97
+ A new list containing the filtered elements.
98
+ """
99
+ filtered_elements = []
100
+ for element in elements:
101
+ if element.category not in excluded_elements:
102
+ filtered_elements.append(element)
103
+ return filtered_elements
104
+
105
+ # %%
106
+ def remove_keys_from_meta(
107
+ elements: List[Element],
108
+ meta_remove_keys: List[str],
109
+ excluded_element_types: List[str] = []
110
+ ) -> List[Element]:
111
+ '''
112
+ Remove specified metadata keys from an Unstructured Element object
113
+ '''
114
+
115
+ for element in elements:
116
+ if element.category not in excluded_element_types:
117
+ for key in meta_remove_keys:
118
+ try:
119
+ del element.metadata.__dict__[key] # Directly modify metadata
120
+ except KeyError:
121
+ print(f"Key '{key}' not found in element metadata.")
122
+
123
+ return elements
124
+
125
+ def filter_elements_and_metadata(
126
+ elements: List[Element],
127
+ excluded_categories: List[str] = [],
128
+ meta_remove_keys: List[str] = [],
129
+ ) -> List[Element]:
130
+ """
131
+ Filters elements based on categories and removes specified metadata keys.
132
+
133
+ Args:
134
+ elements: The list of elements to process.
135
+ excluded_categories: A list of element categories to exclude.
136
+ meta_remove_keys: A list of metadata keys to remove.
137
+
138
+ Returns:
139
+ A new list containing the processed elements.
140
+ """
141
+
142
+ filtered_elements = []
143
+ for element in elements:
144
+ if element.category not in excluded_categories:
145
+ for key in meta_remove_keys:
146
+ try:
147
+ del element.metadata.__dict__[key]
148
+ except KeyError:
149
+ # Better logging/error handling instead of just printing
150
+ # Use a proper logger or raise a warning/exception
151
+ pass
152
+ filtered_elements.append(element)
153
+
154
+ return filtered_elements
155
+
156
+ # %%
157
+ def add_parent_title_to_meta(elements:List[Element], chapter_ids:List[str], excluded_element_types:List[str]=['']) -> List[Element]:
158
+ '''
159
+ Add parent title to Unstructured metadata elements
160
+
161
+ '''
162
+ for element in elements:
163
+ if element.category in excluded_element_types:
164
+ pass
165
+
166
+ else:
167
+ meta = element.metadata.to_dict()
168
+
169
+ if "parent_id" in meta and meta["parent_id"] in chapter_ids and "title_name" not in meta:
170
+ title_name = chapter_ids[meta["parent_id"]]
171
+ # Directly modify the existing element metadata object
172
+ element.metadata.title_name = title_name
173
+
174
+ return elements
175
+
176
+
177
+ def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking", minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, include_orig_elements:bool=include_orig_elements):
178
+
179
+ '''
180
+ Use Unstructured.io functions to chunk an Element object by Title or across all elements.
181
+ '''
182
+ output_files = []
183
+ output_summary = ""
184
+
185
+ chapter_ids, chapter_to_id = create_title_id_dict(elements)
186
+
187
+ ### Break text down into chunks
188
+
189
+ try:
190
+
191
+ if chunk_type == "Chunk within title":
192
+ chunks = chunk_by_title(
193
+ elements,
194
+ include_orig_elements=include_orig_elements,
195
+ combine_text_under_n_chars=minimum_chunk_length,
196
+ new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
197
+ max_characters=hard_max_character_length_chunks,
198
+ multipage_sections=multipage_sections,
199
+ overlap_all=overlap_all
200
+ )
201
+
202
+ else:
203
+ chunks = chunk_elements(
204
+ elements,
205
+ include_orig_elements=include_orig_elements,
206
+ new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
207
+ max_characters=hard_max_character_length_chunks,
208
+ overlap_all=overlap_all
209
+ )
210
+
211
+ except Exception as e:
+ output_summary = f"Chunking failed: {e}"
+ print(output_summary)
+ return output_summary, output_files, file_name_base
214
+
215
+ chunk_sections, chunk_df, chunks_out = element_chunks_to_document(chunks, chapter_ids)
216
+
217
+ file_name_suffix = "_chunk"
218
+
219
+ # The new file name does not overwrite the old file name as the 'chunked' elements are only used as an output, and not an input to other functions
220
+ output_summary, output_files, file_name_base_new = export_elements_as_table_to_file(chunks_out, file_name_base, file_name_suffix, chunk_sections)
221
+
222
+ return output_summary, output_files, file_name_base
223
+
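A sketch of calling the chunking entry point above, assuming elements holds the output of partition_file and using the module-level chunk-length defaults; the output name is illustrative.

# Hypothetical call: chunk previously partitioned elements by title
summary, chunk_files, base_name = chunk_all_elements(
    elements,
    file_name_base="example_doc_elements",
    chunk_type="Chunk within title",
)
print(summary)      # "File successfully exported." on success
print(chunk_files)  # a CSV of chunks plus a gzipped pickle of Document objects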
224
+ # %%
225
+ def element_chunks_to_document(chunks:CompositeElement, chapter_ids:List[str]) -> Tuple[List[Document], PandasDataFrame, List[str]]:
226
+ '''
227
+ Take an Unstructured.io chunk_by_title output with the original parsed document elements and turn it into a Document format commonly used by vector databases, and a Pandas dataframe.
228
+ '''
229
+ chunk_sections = []
230
+ current_title_id = ''
231
+ current_title = ''
232
+ last_page = ''
233
+ chunk_df_list = []
234
+
235
+ for chunk in chunks:
236
+ chunk_meta = chunk.metadata.to_dict()
237
+ true_element_ids = []
238
+ element_categories = []
239
+ titles = []
240
+ titles_id = []
241
+
242
+ if "page_number" in chunk_meta:
243
+ last_page = chunk_meta["page_number"]
244
+
245
+ chunk_text = chunk.text
246
+ #chunk_page_number = chunk.metadata.to_dict()["page_number"]
247
+
248
+ # Record the element ids and categories of the original elements contained in this chunk (requires include_orig_elements=True)
249
+ for element in chunk.metadata.orig_elements:
250
+
251
+ #element_text = element.text
252
+ element_id = element._element_id
253
+ element_category = element.category
254
+ element_meta = element.metadata.to_dict()
255
+
256
+ if "page_number" in element_meta:
257
+ element_page_number = element_meta["page_number"]
258
+ last_page = element_page_number
259
+
260
+ true_element_ids.append(element_id)
261
+ element_categories.append(element_category)
262
+
263
+
264
+ # Set new metadata for chunk
265
+ if "page_number" in element_meta:
266
+ chunk_meta["last_page_number"] = last_page
267
+
268
+ chunk_meta["true_element_ids"] = true_element_ids
269
+
270
+ for loop_id in chunk_meta['true_element_ids']:
271
+ if loop_id in chapter_ids:
272
+ current_title = chapter_ids[loop_id]
273
+ current_title_id = loop_id
274
+
275
+ titles.append(current_title)
276
+ titles_id.append(current_title_id)
277
+
278
+ chunk_meta['titles'] = titles
279
+ chunk_meta['titles_id'] = titles_id
280
+
281
+ # Remove original elements data for documents
282
+ chunk_meta.pop('orig_elements')
283
+
284
+ chunk_dict_for_df = chunk_meta.copy()
285
+ chunk_dict_for_df['text'] = chunk.text
286
+
287
+ chunk_df_list.append(chunk_dict_for_df)
288
+
289
+
290
+ chunk_doc = [Document(page_content=chunk_text, metadata=chunk_meta)]
291
+ chunk_sections.extend(chunk_doc)
292
+
293
+ ## Write metadata back to elements
294
+ chunk.metadata.__dict__ = chunk_meta
295
+
296
+ chunk_df = pd.DataFrame(chunk_df_list)
297
+
298
+ # print("Doc format: ", chunk_sections)
299
+
300
+ return chunk_sections, chunk_df, chunks
301
+
302
+ # %%
303
+ def write_elements_to_documents(elements:List[Element]):
304
+ '''
305
+ Take Unstructured.io parsed elements and write it into a 'Document' format commonly used by vector databases
306
+ '''
307
+
308
+ doc_sections = []
309
+
310
+ for element in elements:
311
+ meta = element.metadata.to_dict()
312
+
313
+ meta["type"] = element.category
314
+ meta["element_id"] = element._element_id
315
+
316
+ element_doc = [Document(page_content=element.text, metadata= meta)]
317
+ doc_sections.extend(element_doc)
318
+
319
+ #print("Doc format: ", doc_sections)
320
+
321
+
322
+ return doc_sections
323
+
324
+ # %%
325
+ def clean_elements(elements:List[Element], dropdown_options: List[str] = [''],
326
+ output_name:str = "combined_elements",
327
+ bytes_to_string:bool=False,
328
+ replace_quotes:bool=True,
329
+ clean_non_ascii:bool=False,
330
+ clean_ordered_list:bool=True,
331
+ group_paragraphs:bool=True,
332
+ trailing_punctuation:bool=False,
333
+ all_punctuation:bool=False,
334
+ clean_text:bool=True,
335
+ extra_whitespace:bool=True,
336
+ dashes:bool=True,
337
+ bullets:bool=True,
338
+ lowercase:bool=False) -> List[Element]:
339
+
340
+ '''
341
+ Apply Unstructured cleaning processes to a list of parsed elements.
342
+ '''
343
+
344
+ out_files = []
345
+ output_summary = ""
346
+
347
+ # Set variables to True based on dropdown selections
348
+ for option in dropdown_options:
349
+ if option == "Convert bytes to string":
350
+ bytes_to_string = True
351
+ elif option == "Replace quotes":
352
+ replace_quotes = True
353
+ elif option == "Clean non ASCII":
354
+ clean_non_ascii = True
355
+ elif option == "Clean ordered list":
356
+ clean_ordered_list = True
357
+ elif option == "Group paragraphs":
358
+ group_paragraphs = True
359
+ elif option == "Remove trailing punctuation":
360
+ trailing_punctuation = True
361
+ elif option == "Remove all punctuation":
362
+ all_punctuation = True
363
+ elif option == "Clean text":
364
+ clean_text = True
365
+ elif option == "Remove extra whitespace":
366
+ extra_whitespace = True
367
+ elif option == "Remove dashes":
368
+ dashes = True
369
+ elif option == "Remove bullets":
370
+ bullets = True
371
+ elif option == "Make lowercase":
372
+ lowercase = True
373
+
374
+
375
+ cleaned_elements = elements.copy()
376
+
377
+ for element in cleaned_elements:
378
+
379
+ try:
380
+ if element: # Check if element is not None or empty
381
+ if bytes_to_string:
382
+ element.apply(bytes_string_to_string)
383
+ if replace_quotes:
384
+ element.apply(replace_unicode_quotes)
385
+ if clean_non_ascii:
386
+ element.apply(clean_non_ascii_chars)
387
+ if clean_ordered_list:
388
+ element.apply(clean_ordered_bullets)
389
+ if group_paragraphs:
390
+ element.apply(group_broken_paragraphs)
391
+ if trailing_punctuation:
392
+ element.apply(clean_trailing_punctuation)
393
+ if all_punctuation:
394
+ element.apply(remove_punctuation)
395
397
+ if clean_text:
398
+ element.apply(lambda x: clean(x, extra_whitespace=extra_whitespace, dashes=dashes, bullets=bullets, lowercase=lowercase))
399
+ except Exception as e:
+ # If cleaning fails for this element, leave it unchanged
+ print(e)
402
+
403
+ alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(cleaned_elements, output_name, file_name_suffix="_clean")
404
+
405
+ output_summary = "Text elements successfully cleaned."
406
+ print(output_summary)
407
+
408
+ return cleaned_elements, output_summary, out_files, output_file_base
409
+
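A sketch of the cleaning step with dropdown-style options, using the option strings handled in the function above; elements is assumed to come from a previous partition_file run.

# Hypothetical call: clean previously partitioned elements
cleaned_elements_out, summary, out_files, out_base = clean_elements(
    elements,
    dropdown_options=["Replace quotes", "Group paragraphs", "Clean text", "Remove extra whitespace"],
    output_name="example_doc_elements",
)
print(summary)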
410
+ # %% [markdown]
411
+ def export_elements_as_table_to_file(elements:List[Element], file_name_base:str, file_name_suffix:str="", chunk_documents:List[Document]=[]):
412
+ '''
413
+ Export elements to a CSV table and to a gzipped pickle of Document objects.
414
+ '''
415
+ output_summary = ""
416
+ out_files = []
417
+
418
+ # Convert to dataframe format
419
+ out_table = convert_to_dataframe(elements)
420
+
421
+ # If the file suffix already exists in the output file name, don't add it again.
422
+ if file_name_suffix not in file_name_base:
423
+ out_file_name_base = file_name_base + file_name_suffix
424
+
425
+ else:
426
+ out_file_name_base = file_name_base
427
+
428
+ out_file_name = "output/" + out_file_name_base + ".csv"
429
+
430
+ out_table.to_csv(out_file_name)
431
+ out_files.append(out_file_name)
432
+
433
+ # Convert to document format
434
+ if chunk_documents:
435
+ out_documents = chunk_documents
436
+ else:
437
+ out_documents = write_elements_to_documents(elements)
438
+
439
+
440
+
441
+ out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
442
+ with gzip.open(out_file_name_docs, 'wb') as file:
443
+ pickle.dump(out_documents, file)
444
+
445
+ out_files.append(out_file_name_docs)
446
+
447
+ output_summary = "File successfully exported."
448
+
449
+ return output_summary, out_files, out_file_name_base
450
+
451
+ # # Partition PDF
452
+
453
+ def get_file_type(filename):
454
+ pattern = r"\.(\w+)$" # Match a dot followed by one or more word characters at the end of the string
455
+
456
+ match = re.search(pattern, filename)
457
+ if match:
458
+ file_type = match.group(1) # Extract the captured file type (without the dot)
+ print(file_type)
+ else:
+ file_type = None
+ print("No file type found.")
462
+
463
+ return file_type
464
+
465
+ # %%
466
+ def partition_file(filenames:List[str], pdf_partition_strat:str = pdf_partition_strat, progress = gr.Progress()):
467
+ '''
468
+ Partition document files into text elements using the Unstructured package. Currently supports PDF, docx, pptx, html, several image file types, text document types, email messages, code files.
469
+ '''
470
+
471
+ out_message = ""
472
+ combined_elements = []
473
+ out_files = []
474
+
475
+ for file in progress.tqdm(filenames, desc="Partitioning files", unit="files"):
476
+
477
+ try:
478
+
479
+ tic = time.perf_counter()
480
+ print(file)
481
+
482
+ file_name = get_file_path_end_with_ext(file)
483
+ file_name_base = get_file_path_end(file)
484
+ file_type = get_file_type(file_name)
485
+
486
+ image_file_type_list = ["jpg", "jpeg", "png", "heic"]
487
+
488
+ if file_type in image_file_type_list:
489
+ print("File is an image. Using OCR method to partition.")
490
+ file_elements = partition(file, strategy="ocr_only")
491
+ else:
492
+ file_elements = partition(file, strategy=pdf_partition_strat)
493
+
494
+ toc = time.perf_counter()
495
+
496
+
497
+ new_out_message = f"Successfully partitioned file: {file_name} in {toc - tic:0.1f} seconds\n"
498
+ print(new_out_message)
499
+
500
+ out_message = out_message + new_out_message
501
+ combined_elements.extend(file_elements)
502
+
503
+ except Exception as e:
504
+ new_out_message = f"Failed to partition file: {file_name} due to {e}. Partitioning halted."
505
+ print(new_out_message)
506
+ out_message = out_message + new_out_message
507
+ break
508
+
509
+ out_table = convert_to_dataframe(combined_elements)
510
+
511
+ # If multiple files, overwrite default file name for outputs
512
+ if len(filenames) > 1:
513
+ file_name_base = "combined_files"
514
+
515
+ alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(combined_elements, file_name_base, file_name_suffix="_elements")
516
+
517
+ return out_message, combined_elements, out_files, output_file_base, out_table
518
+
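A sketch of the partitioning entry point; the file names are illustrative and mirror the commented examples further down. As with the other Gradio-facing functions, the Progress wrapper assumes it is running inside a Gradio event.

# Hypothetical call: partition a PDF and an image with the OCR strategy
message, elements, out_files, base_name, elements_table = partition_file(
    filenames=["examples/report.pdf", "examples/scanned_letter.jpg"],
    pdf_partition_strat="ocr_only",
)
print(message)
print(elements_table.head())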
519
+ # %%
520
+ def modify_metadata_elements(elements_out_cleaned:List[Element], meta_keys_to_filter:List[str]=meta_keys_to_filter, element_types_to_filter:List[str]=element_types_to_filter) -> List[Element]:
521
+
522
+ '''
523
+ Take an element object, add parent title names to metadata. Remove specified metadata keys or element types from element list.
524
+ '''
525
+
526
+ chapter_ids, chapter_to_id = create_title_id_dict(elements_out_cleaned.copy())
527
+ elements_out_meta_mod = add_parent_title_to_meta(elements_out_cleaned.copy(), chapter_ids)
528
+ elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
529
+ elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
530
+
531
+ return elements_out_filtered_meta_mod
532
+ # %%
533
+ # file_stub = "C:/Users/SPedrickCase/OneDrive - Lambeth Council/Apps/doc_rag_prep/examples/"
534
+ # filenames = []
535
+ # pdf_filename = [file_stub + "Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf"]
536
+ # filenames.extend(pdf_filename)
537
+
538
+ # html_filename = [file_stub + "transport-strategy.html"]
539
+ # filenames.extend(html_filename)
540
+
541
+ # docx_filename = [file_stub + "FINAL Policy and Procedure for Writing Housing Policies.docx"]
542
+ # filenames.extend(docx_filename)
543
+
544
+ # out_message, elements_parse = partition_file(filenames=filenames, pdf_partition_strat="ocr_only")
545
+
546
+ # for element in elements_parse[:10]:
547
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
548
+ # elements_out = elements_parse.copy()
549
+
550
+ # %% [markdown]
551
+ # ### Process with document layout detection - fast strategy
552
+ #
553
+ # The "fast" strategy will extract the text using pdfminer and process the raw text with partition_text. If the PDF text is not extractable, partition_pdf will fall back to "ocr_only". We recommend using the "fast" strategy in most cases where the PDF has extractable text.
554
+ # elements_out_parse = partition_pdf(filename=filename, strategy="fast")
555
+ # for element in elements_out_parse[:10]:
556
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
557
+ # elements_out = elements_out_parse.copy()
558
+ # ### OCR only
559
+ #
560
+ # The "ocr_only" strategy runs the document through Tesseract for OCR and then runs the raw text through partition_text. Currently, "hi_res" has difficulty ordering elements for documents with multiple columns. If you have a document with multiple columns that does not have extractable text, we recommend using the "ocr_only" strategy. "ocr_only" falls back to "fast" if Tesseract is not available and the document has extractable text.
561
+ # elements_out_parse = partition_pdf(filename=filename, strategy="ocr_only")
562
+ # for element in elements_out_parse[:10]:
563
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
564
+ # elements_out = elements_out_parse.copy()
565
+ # ### Hi-res partitioning
566
+ #
567
+ # The "hi_res" strategy will identify the layout of the document using detectron2. The advantage of “hi_res” is that it uses the document layout to gain additional information about document elements. We recommend using this strategy if your use case is highly sensitive to correct classifications for document elements. If detectron2 is not available, the "hi_res" strategy will fall back to the "ocr_only" strategy.
568
+ # elements_out = partition_pdf(filename=filename, strategy="hi_res")
569
+ # for element in elements_out[:10]:
570
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
571
+
572
+ # %% [markdown]
573
+ # ## Clean data
574
+
575
+ # %%
576
+ # elements_out_cleaned = clean_elements(elements_out.copy(), bytes_to_string=False,
577
+ # replace_quotes=True ,
578
+ # clean_non_ascii=False,
579
+ # clean_ordered_list=True ,
580
+ # group_paragraphs=True,
581
+ # trailing_punctuation=False,
582
+ # all_punctuation=False,
583
+ # clean_text=True ,
584
+ # extra_whitespace=True,
585
+ # dashes=True ,
586
+ # bullets=True ,
587
+ # lowercase=False)
588
+
589
+ # %% [markdown]
590
+ # ## Add/remove elements to/from metadata
591
+
592
+
593
+
594
+ # %% [markdown]
595
+ # ### Write to table, dictionary, document format
596
+
597
+ # %%
598
+ ### Dataframe format
599
+
600
+ # elements_out_filtered_df = convert_to_dataframe(elements_out_filtered_meta_mod)
601
+
602
+ # elements_out_filtered_df.to_csv("table.csv")
603
+ # elements_out_filtered_df.head(6)
604
+
605
+ # # %%
606
+ # ### Dictionary format
607
+
608
+ # elements_out_filtered_dict = convert_to_dict(elements_out_filtered_meta_mod)
609
+ # elements_out_filtered_dict[20]
610
+
611
+ # # %% [markdown]
612
+ # # ### Document format for embeddings
613
+
614
+ # # %%
615
+ # doc_sections = write_elements_to_documents(elements_out_filtered_meta_mod, element_types_to_filter)
616
+
617
+ # doc_sections[0:10]
618
+
619
+ # # %% [markdown]
620
+ # # ### Break text down into chunks
621
+
622
+ # # %%
623
+ # chunks_by_title = chunk_by_title(
624
+ # elements_out_filtered_meta_mod,
625
+ # include_orig_elements=True,
626
+ # combine_text_under_n_chars=minimum_chunk_length,
627
+ # new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
628
+ # max_characters=hard_max_character_length_chunks,
629
+ # multipage_sections=True,
630
+ # overlap_all=True
631
+ # )
632
+
633
+ # chunk_sections, chunk_df = element_chunks_to_document(chunks_by_title, chapter_ids)
634
+ # chunk_df.to_csv("chunked_df.csv")
635
+ # print(chunk_sections[2])
636
+
637
+ # # %%
638
+ # chunks_basic = chunk_elements(
639
+ # elements_out_filtered_meta_mod,
640
+ # include_orig_elements=True,
641
+ # new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
642
+ # max_characters=hard_max_character_length_chunks,
643
+ # overlap_all=True
644
+ # )
645
+
646
+ # chunk_basic_sections, chunk_basic_df = element_chunks_to_document(chunks_basic, chapter_ids)
647
+ # chunk_basic_df.to_csv("chunked_basic_df.csv")
648
+
649
+ # %% [markdown]
650
+ # # Partition Word document
651
+ #
652
+ # You cannot get location metadata for bounding boxes from word documents
653
+
654
+ # %%
655
+ # word_filename = "../examples/FINAL Policy and Procedure for Writing Housing Policies.docx"
656
+
657
+ # # %%
658
+ # docx_elements = partition(filename=word_filename)
659
+ # for element in docx_elements:
660
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
661
+
662
+ # # %%
663
+ # docx_elements[5].text
664
+
665
+ # # %%
666
+ # docx_elements[5].category
667
+
668
+ # # %%
669
+ # docx_elements[5].metadata.to_dict()
670
+
671
+ # # %% [markdown]
672
+ # # ## Find elements associated with chapters
673
+
674
+ # # %%
675
+ # chapter_ids, chapter_to_id = create_title_id_dict(docx_elements)
676
+
677
+ # chapter_ids
678
+
679
+ # # %%
680
+ # doc_sections = write_elements_to_documents(docx_elements.copy(), chapter_ids)
681
+
682
+ # # %%
683
+ # doc_sections
684
+
685
+ # # %% [markdown]
686
+ # # ### Chunk documents
687
+
688
+ # # %%
689
+ # chunks = chunk_by_title(
690
+ # docx_elements,
691
+ # include_orig_elements=False,
692
+ # combine_text_under_n_chars=0,
693
+ # new_after_n_chars=500,
694
+ # max_characters=1000,
695
+ # multipage_sections=True,
696
+ # overlap_all=True
697
+ # )
698
+
699
+ # # %%
700
+ # print(chunks)
701
+
702
+ # # %%
703
+ # chunk_sections = element_chunks_to_document(chunks.copy(), docx_elements.copy(), chapter_ids)
704
+
705
+ # # %%
706
+ # chunk_sections[5].page_content
707
+
708
+ # # %%
709
+ # chunk_sections[5].metadata["true_element_ids"]
710
+
711
+ # # %%
712
+ # for element in docx_elements:
713
+ # if element._element_id in chunk_sections[5].metadata["true_element_ids"]:
714
+ # print(element.text)
715
+
716
+ # # %% [markdown]
717
+ # # # Partition PPTX document
718
+
719
+ # # %%
720
+ # pptx_filename = "../examples/LOTI presentation Jan 2024.pptx"
721
+
722
+ # # %%
723
+ # pptx_elements = partition(filename=pptx_filename)
724
+ # for element in pptx_elements[:10]:
725
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
726
+
727
+ # # %%
728
+ # chapter_ids, chapter_to_id = create_title_id_dict(pptx_elements)
729
+ # chapter_ids
730
+
731
+ # # %%
732
+ # pptx_sections = write_elements_to_documents(pptx_elements.copy(), chapter_ids)
733
+
734
+ # # %%
735
+ # pptx_sections
736
+
737
+ # # %%
738
+ # pptx_chunks = chunk_by_title(
739
+ # pptx_elements,
740
+ # include_orig_elements=False,
741
+ # combine_text_under_n_chars=0,
742
+ # new_after_n_chars=500,
743
+ # max_characters=1000,
744
+ # multipage_sections=True,
745
+ # overlap_all=True
746
+ # )
747
+
748
+ # # %%
749
+ # pptx_chunk_sections = element_chunks_to_document(pptx_chunks.copy(), pptx_elements.copy(), chapter_ids)
750
+
751
+ # # %% [markdown]
752
+ # # ### Load documents into a vectorDB (Not necessary)
753
+
754
+ # # %%
755
+ # import chromadb
756
+
757
+ # # %%
758
+ # client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
759
+ # client.reset()
760
+
761
+ # # %%
762
+ # collection = client.create_collection(
763
+ # name="policy_statements",
764
+ # metadata={"hnsw:space": "cosine"}
765
+ # )
766
+
767
+ # # %%
768
+ # chapter_ids
769
+
770
+ # # %%
771
+ # for element in docx_elements:
772
+ # parent_id = element.metadata.parent_id
773
+ # #print(element.text)
774
+ # #print(parent_id)
775
+ # #print(element.metadata.to_dict())
776
+ # if parent_id:
777
+ # try:
778
+ # print(parent_id)
779
+ # chapter = chapter_ids[parent_id]
780
+ # print(chapter)
781
+ # except KeyError:
782
+ # chapter = "None"
783
+ # else:
784
+ # chapter = "None"
785
+ # collection.add(
786
+ # documents=[element.text],
787
+ # ids=[element._element_id],
788
+ # metadatas=[{"chapter": chapter}]
789
+ # )
790
+
791
+ # # %% [markdown]
792
+ # # #### See the elements in the VectorDB and perform hybrid search
793
+
794
+ # # %%
795
+ # results = collection.peek()
796
+ # print(results["documents"])
797
+
798
+ # # %%
799
+ # print(collection.metadata)
800
+
801
+ # # %%
802
+ # import json
803
+
804
+ # result = collection.query(
805
+ # query_texts=["What should policies do?"],
806
+ # n_results=2,
807
+ # where={"chapter": '3.0 Policy Statements'},
808
+ # )
809
+ # print(json.dumps(result, indent=2))
810
+
811
+ # # %%
812
+ # collection = client.create_collection(
813
+ # name="policy_statements_chunk",
814
+ # metadata={"hnsw:space": "cosine"}
815
+ # )
816
+
817
+ # # %%
818
+ # for element in chunks:
819
+ # parent_id = element.metadata.parent_id
820
+ # #print(element.text)
821
+ # #print(parent_id)
822
+ # #print(element.metadata.to_dict())
823
+ # if parent_id:
824
+ # try:
825
+ # print(parent_id)
826
+ # chapter = chapter_ids[parent_id]
827
+ # print(chapter)
828
+ # except KeyError:
829
+ # chapter = "None"
830
+ # else:
831
+ # chapter = "None"
832
+
833
+ # print(element._element_id)
834
+ # collection.add(
835
+ # documents=[element.text],
836
+ # ids=[element.orig_elements],
837
+ # metadatas=[{"chapter": chapter}]
838
+ # )
839
+
840
+ # # %% [markdown]
841
+ # # # Partition HTML
842
+
843
+ # # %%
844
+ # html_filename = "../examples/transport-strategy.html"
845
+
846
+ # # %%
847
+ # html_elements = partition(filename=html_filename)
848
+ # for element in html_elements[:10]:
849
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
850
+
851
+ # # %% [markdown]
852
+ # # # Partition image
853
+
854
+ # # %%
855
+ # img_filename = "../examples/example_complaint_letter.jpg"
856
+
857
+ # # %%
858
+ # img_elements = partition(filename=img_filename)
859
+ # for element in img_elements[:10]:
860
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
861
+
862
+ # # %% [markdown]
863
+ # # # Partition XLSX
864
+
865
+ # # %%
866
+ # xlsx_filename = "../examples/fuel-poverty-sub-regional-tables-2020-2018-data.xlsx"
867
+
868
+ # # %%
869
+ # xlsx_elements = partition(filename=xlsx_filename)
870
+ # for element in xlsx_elements[:10]:
871
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
872
+
873
+ # # %% [markdown]
874
+ # # # Partition .py
875
+
876
+ # # %%
877
+ # py_filename = "../examples/app.py"
878
+
879
+ # # %%
880
+ # py_elements = partition(filename=py_filename)
881
+ # for element in py_elements[:10]:
882
+ # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
883
+
884
+