seanpedrickcase committed
Commit 1cb0304 · 1 Parent(s): 28347d9

Can now chunk within files (without overlap). Removed unnecessary code files.

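Note on the headline change: the sketch below (illustrative only, not the repository's code) shows what "chunking within files" means in practice using unstructured's public API: each file is partitioned and chunked on its own, so no chunk mixes text from two documents. The file names and character limits are invented for the example; the actual implementation is in group_by_filename and chunk_all_elements in tools/unstructured_funcs.py further down.

```python
# Hedged sketch of per-file ("within document") chunking. File names and
# size limits below are placeholders, not values used by the app.
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.auto import partition

filenames = ["report_a.pdf", "report_b.docx"]  # hypothetical inputs

all_chunks = []
for f in filenames:
    # Partition and chunk one file at a time, so chunks never span files.
    file_elements = partition(filename=f)
    all_chunks.extend(
        chunk_by_title(
            file_elements,
            combine_text_under_n_chars=100,
            new_after_n_chars=2000,
            max_characters=4000,
            multipage_sections=True,
        )
    )

# The previous behaviour (chunking across all files at once) would instead be:
# elements = [el for f in filenames for el in partition(filename=f)]
# all_chunks = chunk_by_title(elements, combine_text_under_n_chars=100, ...)
```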
app.py CHANGED
@@ -25,6 +25,8 @@ language = 'en'
25
  default_meta_keys_to_filter=["file_directory", "filetype"]
26
  default_element_types_to_filter = ['UncategorizedText', 'Header']
27
 
 
 
28
 
29
  def get_element_metadata(elements, prefix=""):
30
  """Recursively retrieves element names and metadata in the desired format."""
@@ -117,7 +119,7 @@ with block:
117
  element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
118
  meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")
119
 
120
- filter_meta_btn = gr.Button("Filter elements/metadata")
121
 
122
  with gr.Accordion("Clean/anonymise text", open = False):
123
  with gr.Row():
@@ -140,19 +142,20 @@ with block:
140
  anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
141
  anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")
142
 
143
- unstructured_clean_btn = gr.Button("Clean data")
144
 
145
  with gr.Accordion("Chunk text", open = False):
146
  with gr.Row():
147
- chunking_method_rad = gr.Radio(value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
 
148
  multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
149
  overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
150
  with gr.Row():
151
- minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=10000, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
152
- start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=10000, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
153
- hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=10000, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)
154
 
155
- chunk_btn = gr.Button("Chunk document")
156
 
157
  # Save chunked data to file
158
  with gr.Accordion("File outputs", open = True):
@@ -190,10 +193,11 @@ with block:
190
  then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])
191
 
192
  ## Chunk data
193
- chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop], outputs=[output_summary, output_file, output_name_state])
194
 
195
  # Loading AWS data - not yet implemented in this app
196
  # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
197
 
198
  # Simple run
199
- block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
 
 
25
  default_meta_keys_to_filter=["file_directory", "filetype"]
26
  default_element_types_to_filter = ['UncategorizedText', 'Header']
27
 
28
+ max_chunk_length = 25000 # characters
29
+
30
 
31
  def get_element_metadata(elements, prefix=""):
32
  """Recursively retrieves element names and metadata in the desired format."""
 
119
  element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
120
  meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")
121
 
122
+ filter_meta_btn = gr.Button("Filter elements/metadata", variant='primary')
123
 
124
  with gr.Accordion("Clean/anonymise text", open = False):
125
  with gr.Row():
 
142
  anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
143
  anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")
144
 
145
+ unstructured_clean_btn = gr.Button("Clean data", variant='primary')
146
 
147
  with gr.Accordion("Chunk text", open = False):
148
  with gr.Row():
149
+ chunk_within_docs_rad = gr.Radio(label="Chunk within documents", value = "No", choices = ["Yes", "No"], interactive=True)
150
+ chunking_method_rad = gr.Radio(label="Basic chunking or by title", value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
151
  multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
152
  overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
153
  with gr.Row():
154
+ minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=max_chunk_length, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
155
+ start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=max_chunk_length, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
156
+ hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=max_chunk_length, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)
157
 
158
+ chunk_btn = gr.Button("Chunk document(s)", variant='primary')
159
 
160
  # Save chunked data to file
161
  with gr.Accordion("File outputs", open = True):
 
193
  then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])
194
 
195
  ## Chunk data
196
+ chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop, chunk_within_docs_rad], outputs=[output_summary, output_file, output_name_state])
197
 
198
  # Loading AWS data - not yet implemented in this app
199
  # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
200
 
201
  # Simple run
202
+ if __name__ == "__main__":
203
+ block.queue().launch(show_error=True, inbrowser=True)
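Aside on the app.py wiring above: the change reduces to the sketch below, in which the new Radio component's value is forwarded to the click handler as an extra input, and launch() sits behind an if __name__ == "__main__" guard so that importing app.py no longer starts the server. The stub handler and the reduced component set are placeholders, not the app's real ones.

```python
# Hedged, reduced sketch of the Gradio pattern used in app.py.
import gradio as gr

def chunk_stub(text: str, chunk_within_docs: str) -> str:
    # Placeholder for chunk_all_elements: just report the chosen option.
    return f"chunk_within_docs={chunk_within_docs!r}, received {len(text)} characters"

with gr.Blocks() as block:
    text_in = gr.Textbox(label="Text to chunk")
    chunk_within_docs_rad = gr.Radio(
        label="Chunk within documents", value="No", choices=["Yes", "No"], interactive=True
    )
    chunk_btn = gr.Button("Chunk document(s)", variant="primary")
    output_summary = gr.Textbox(label="Summary")

    # The Radio's current value arrives as an extra argument to the handler.
    chunk_btn.click(fn=chunk_stub, inputs=[text_in, chunk_within_docs_rad], outputs=[output_summary])

if __name__ == "__main__":
    # Guarded launch: importing this module does not start the server.
    block.queue().launch(show_error=True, inbrowser=True)
```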
tools/anonymiser.py CHANGED
@@ -1,7 +1,7 @@
1
  from spacy.cli import download
2
  import spacy
3
  from tools.presidio_analyzer_custom import analyze_dict
4
- from tools.load_spacy_model_custom_recognisers import nlp_analyser
5
  from typing import List
6
  from unstructured.documents.elements import Element
7
 
 
1
  from spacy.cli import download
2
  import spacy
3
  from tools.presidio_analyzer_custom import analyze_dict
4
+ #from tools.load_spacy_model_custom_recognisers import nlp_analyser
5
  from typing import List
6
  from unstructured.documents.elements import Element
7
 
tools/file_conversion.py DELETED
@@ -1,140 +0,0 @@
1
- from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end
3
- from PIL import Image
4
- import os
5
- from gradio import Progress
6
- from typing import List
7
-
8
- def is_pdf_or_image(filename):
9
- """
10
- Check if a file name is a PDF or an image file.
11
-
12
- Args:
13
- filename (str): The name of the file.
14
-
15
- Returns:
16
- bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
17
- """
18
- if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
19
- output = True
20
- else:
21
- output = False
22
- return output
23
-
24
- def is_pdf(filename):
25
- """
26
- Check if a file name is a PDF.
27
-
28
- Args:
29
- filename (str): The name of the file.
30
-
31
- Returns:
32
- bool: True if the file name ends with ".pdf", False otherwise.
33
- """
34
- return filename.lower().endswith(".pdf")
35
-
36
- # %%
37
- ## Convert pdf to image if necessary
38
-
39
- def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
40
-
41
- # Get the number of pages in the PDF
42
- page_count = pdfinfo_from_path(pdf_path)['Pages']
43
- print("Number of pages in PDF: ", str(page_count))
44
-
45
- images = []
46
-
47
- # Open the PDF file
48
- for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
-
50
- print("Current page: ", str(page_num))
51
-
52
- # Convert one page to image
53
- image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
54
-
55
- # If no images are returned, break the loop
56
- if not image:
57
- break
58
-
59
- images.extend(image)
60
-
61
- print("PDF has been converted to images.")
62
-
63
- return images
64
-
65
-
66
- # %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
67
- def process_file(file_path):
68
- # Get the file extension
69
- file_extension = os.path.splitext(file_path)[1].lower()
70
-
71
- # Check if the file is an image type
72
- if file_extension in ['.jpg', '.jpeg', '.png']:
73
- print(f"{file_path} is an image file.")
74
- # Perform image processing here
75
- out_path = [Image.open(file_path)]
76
-
77
- # Check if the file is a PDF
78
- elif file_extension == '.pdf':
79
- print(f"{file_path} is a PDF file. Converting to image set")
80
- # Run your function for processing PDF files here
81
- out_path = convert_pdf_to_images(file_path)
82
-
83
- else:
84
- print(f"{file_path} is not an image or PDF file.")
85
- out_path = ['']
86
-
87
- return out_path
88
-
89
- def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
90
-
91
- out_message = ''
92
- out_file_paths = []
93
-
94
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
95
-
96
- if file_path:
97
- file_path_without_ext = get_file_path_end(file_path)
98
- else:
99
- out_message = "No file selected"
100
- print(out_message)
101
- return out_message, out_file_paths
102
-
103
- if in_redact_method == "Image analysis":
104
- # Analyse and redact image-based pdf or image
105
- if is_pdf_or_image(file_path) == False:
106
- return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
107
-
108
- out_file_path = process_file(file_path)
109
-
110
- elif in_redact_method == "Text analysis":
111
- if is_pdf(file_path) == False:
112
- return "Please upload a PDF file for text analysis.", None
113
-
114
- out_file_path = file_path
115
-
116
- return out_message, out_file_path
117
-
118
-
119
- def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
120
- file_path_without_ext = get_file_path_end(in_file_path)
121
-
122
- out_file_paths = out_text_file_path
123
-
124
- # Convert annotated text pdf back to image to give genuine redactions
125
- print("Creating image version of results")
126
- pdf_text_image_paths = process_file(out_text_file_path[0])
127
- out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
- pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
-
130
- out_file_paths.append(out_text_image_file_path)
131
-
132
- out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
133
-
134
- return out_message, out_file_paths
135
-
136
-
137
-
138
-
139
-
140
-
 
tools/file_redaction.py DELETED
@@ -1,236 +0,0 @@
1
- from PIL import Image
2
- from typing import List
3
- import pandas as pd
4
- from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
- from pdfminer.high_level import extract_pages
6
- from tools.file_conversion import process_file
7
- from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
- from pikepdf import Pdf, Dictionary, Name
9
- from gradio import Progress
10
- import time
11
-
12
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
13
- from tools.helper_functions import get_file_path_end
14
- from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
15
- import gradio as gr
16
-
17
- def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
18
-
19
- tic = time.perf_counter()
20
-
21
- out_message = ''
22
- out_file_paths = []
23
-
24
- if in_allow_list:
25
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
26
-
27
- if file_path:
28
- file_path_without_ext = get_file_path_end(file_path)
29
- else:
30
- out_message = "No file selected"
31
- print(out_message)
32
- return out_message, out_file_paths
33
-
34
- if in_redact_method == "Image analysis":
35
- # Analyse and redact image-based pdf or image
36
- # if is_pdf_or_image(file_path) == False:
37
- # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
38
-
39
- pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
40
- out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
41
- pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
42
-
43
- out_file_paths.append(out_image_file_path)
44
- out_message = "Image-based PDF successfully redacted and saved to file."
45
-
46
- elif in_redact_method == "Text analysis":
47
- if is_pdf(file_path) == False:
48
- return "Please upload a PDF file for text analysis.", None
49
-
50
- # Analyse text-based pdf
51
- pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
52
- out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
53
- pdf_text.save(out_text_file_path)
54
-
55
- out_file_paths.append(out_text_file_path)
56
-
57
- out_message = "Text-based PDF successfully redacted and saved to file."
58
-
59
- else:
60
- out_message = "No redaction method selected"
61
- print(out_message)
62
- return out_message, out_file_paths
63
-
64
- toc = time.perf_counter()
65
- out_time = f"Time taken: {toc - tic:0.1f} seconds."
66
- print(out_time)
67
-
68
- out_message = out_message + "\n\n" + out_time
69
-
70
- return out_message, out_file_paths, out_file_paths
71
-
72
-
73
- def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
- '''
75
- take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
76
- '''
77
-
78
- if not image_paths:
79
-
80
- out_message = "PDF does not exist as images. Converting pages to image"
81
- print(out_message)
82
- progress(0, desc=out_message)
83
-
84
- image_paths = process_file(file_path)
85
-
86
- # Create a new PDF
87
- #pdf = pikepdf.new()
88
-
89
- images = []
90
- number_of_pages = len(image_paths)
91
-
92
- out_message = "Redacting pages"
93
- print(out_message)
94
- progress(0.1, desc=out_message)
95
-
96
- for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
97
-
98
- print("Redacting page ", str(i + 1))
99
-
100
- # Get the image to redact using PIL lib (pillow)
101
- image = image_paths[i] #Image.open(image_paths[i])
102
-
103
- # %%
104
- image_analyser = ImageAnalyzerEngine(nlp_analyser)
105
- engine = ImageRedactorEngine(image_analyser)
106
-
107
- if language == 'en':
108
- ocr_lang = 'eng'
109
- else: ocr_lang = language
110
-
111
- # %%
112
- # Redact the image with pink color
113
- redacted_image = engine.redact(image,
114
- fill=(0, 0, 0),
115
- ocr_kwargs={"lang": ocr_lang},
116
- allow_list=allow_list,
117
- ad_hoc_recognizers= None,
118
- **{
119
- "language": language,
120
- "entities": chosen_redact_entities,
121
- "score_threshold": score_threshold
122
- },
123
- )
124
-
125
- images.append(redacted_image)
126
-
127
- return images
128
-
129
- def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
130
- '''
131
- Redact chosen entities from a pdf that is made up of multiple pages that are not images.
132
- '''
133
-
134
- combined_analyzer_results = []
135
- analyser_explanations = []
136
- annotations_all_pages = []
137
- analyzed_bounding_boxes_df = pd.DataFrame()
138
-
139
- pdf = Pdf.open(filename)
140
-
141
- page_num = 0
142
-
143
- for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
144
-
145
-
146
- print("Page number is: ", page_num)
147
-
148
- annotations_on_page = []
149
- analyzed_bounding_boxes = []
150
-
151
- for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
152
- analyzer_results = []
153
-
154
- for text_container in page_layout:
155
- if isinstance(text_container, LTTextContainer):
156
- text_to_analyze = text_container.get_text()
157
-
158
- analyzer_results = []
159
- characters = []
160
-
161
- analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
162
- language=language,
163
- entities=chosen_redact_entities,
164
- score_threshold=score_threshold,
165
- return_decision_process=False,
166
- allow_list=allow_list)
167
-
168
- #if analyzer_results:
169
- # pass
170
- #explanation = analyzer_results[0].analysis_explanation.to_dict()
171
- #analyser_explanations.append(explanation)
172
- characters = [char # This is what we want to include in the list
173
- for line in text_container # Loop through each line in text_container
174
- if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
175
- for char in line] # Loop through each character in the line
176
- #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
177
-
178
- # If any results found
179
- print(analyzer_results)
180
-
181
- if len(analyzer_results) > 0 and len(characters) > 0:
182
- analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
183
- combined_analyzer_results.extend(analyzer_results)
184
-
185
- if len(analyzer_results) > 0:
186
- # Create summary df of annotations to be made
187
- analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
188
- analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
189
- analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
190
- analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
191
- analyzed_bounding_boxes_df_new['page'] = page_num + 1
192
- analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
193
-
194
- for analyzed_bounding_box in analyzed_bounding_boxes:
195
- bounding_box = analyzed_bounding_box["boundingBox"]
196
- annotation = Dictionary(
197
- Type=Name.Annot,
198
- Subtype=Name.Highlight,
199
- QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
200
- Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
201
- C=[0, 0, 0],
202
- CA=1, # Transparency
203
- T=analyzed_bounding_box["result"].entity_type
204
- )
205
- annotations_on_page.append(annotation)
206
-
207
- annotations_all_pages.extend([annotations_on_page])
208
-
209
- print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
210
- page.Annots = pdf.make_indirect(annotations_on_page)
211
-
212
- page_num += 1
213
-
214
- # Extracting data from dictionaries
215
- # extracted_data = []
216
- # for item in annotations_all_pages:
217
- # temp_dict = {}
218
- # #print(item)
219
- # for key, value in item.items():
220
- # if isinstance(value, Decimal):
221
- # temp_dict[key] = float(value)
222
- # elif isinstance(value, list):
223
- # temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
224
- # else:
225
- # temp_dict[key] = value
226
- # extracted_data.append(temp_dict)
227
-
228
- # Creating DataFrame
229
- # annotations_out = pd.DataFrame(extracted_data)
230
- #print(df)
231
-
232
- #annotations_out.to_csv("examples/annotations.csv")
233
-
234
- analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
235
-
236
- return pdf
 
tools/load_spacy_model_custom_recognisers.py DELETED
@@ -1,168 +0,0 @@
1
- # %%
2
- from typing import List
3
- from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
- from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
- import spacy
6
- spacy.prefer_gpu()
7
- from spacy.cli.download import download
8
- import re
9
-
10
- # %%
11
- model_name = "en_core_web_lg" #"en_core_web_trf"
12
- score_threshold = 0.001
13
-
14
- # %% [markdown]
15
- # #### Custom recognisers
16
-
17
- # %%
18
- # Custom title recogniser
19
- import re
20
- titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
21
- titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
22
- titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
23
- titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
24
-
25
- # %%
26
- # Custom postcode recogniser
27
-
28
- # Define the regex pattern in a Presidio `Pattern` object:
29
- ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
30
-
31
- # Define the recognizer with one or more patterns
32
- ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
33
-
34
- # %%
35
- # Examples for testing
36
-
37
- #text = "I live in 510 Broad st SE5 9NG ."
38
-
39
- #numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
40
- #print("Result:")
41
- #print(numbers_result)
42
-
43
- # %%
44
- def extract_street_name(text:str) -> str:
45
- """
46
- Extracts the street name and preceding word (that should contain at least one number) from the given text.
47
-
48
- """
49
-
50
- street_types = [
51
- 'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
52
- 'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
53
- 'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
54
- 'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
55
- 'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
56
- 'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
57
- 'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
58
- 'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
59
- ]
60
-
61
- # Dynamically construct the regex pattern with all possible street types
62
- street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
63
-
64
- # The overall regex pattern to capture the street name and preceding word(s)
65
-
66
- pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
67
- pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
68
-
69
- # Find all matches in text
70
- matches = re.finditer(pattern, text, re.IGNORECASE)
71
-
72
- start_positions = []
73
- end_positions = []
74
-
75
- for match in matches:
76
- preceding_word = match.group('preceding_word').strip()
77
- street_name = match.group('street_name').strip()
78
- start_pos = match.start()
79
- end_pos = match.end()
80
- print(f"Start: {start_pos}, End: {end_pos}")
81
- print(f"Preceding words: {preceding_word}")
82
- print(f"Street name: {street_name}")
83
- print()
84
-
85
- start_positions.append(start_pos)
86
- end_positions.append(end_pos)
87
-
88
- return start_positions, end_positions
89
-
90
-
91
- # %%
92
- # Some examples for testing
93
-
94
- #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
95
- #text = "Roberto lives in Five 10 Broad st in Oregon"
96
- #text = "Roberto lives in 55 Oregon Square"
97
- #text = "There is 51a no way I will do that"
98
- #text = "I am writing to apply for"
99
-
100
- #extract_street_name(text)
101
-
102
- # %%
103
- class StreetNameRecognizer(EntityRecognizer):
104
-
105
- def load(self) -> None:
106
- """No loading is required."""
107
- pass
108
-
109
- def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
110
- """
111
- Logic for detecting a specific PII
112
- """
113
-
114
- start_pos, end_pos = extract_street_name(text)
115
-
116
- results = []
117
-
118
- for i in range(0, len(start_pos)):
119
-
120
- result = RecognizerResult(
121
- entity_type="STREETNAME",
122
- start = start_pos[i],
123
- end = end_pos[i],
124
- score= 1
125
- )
126
-
127
- results.append(result)
128
-
129
- return results
130
-
131
- street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
132
-
133
- # %%
134
- # Create a class inheriting from SpacyNlpEngine
135
- class LoadedSpacyNlpEngine(SpacyNlpEngine):
136
- def __init__(self, loaded_spacy_model):
137
- super().__init__()
138
- self.nlp = {"en": loaded_spacy_model}
139
-
140
- # %%
141
- # Load spacy model
142
- try:
143
- import en_core_web_lg
144
- nlp = en_core_web_lg.load()
145
- print("Successfully imported spaCy model")
146
-
147
- except:
148
- download("en_core_web_lg")
149
- nlp = spacy.load("en_core_web_lg")
150
- print("Successfully downloaded and imported spaCy model")
151
-
152
- # Pass the loaded model to the new LoadedSpacyNlpEngine
153
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
154
-
155
-
156
-
157
- # %%
158
- nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
159
- default_score_threshold=score_threshold,
160
- supported_languages=["en"],
161
- log_decision_process=True,
162
- )
163
-
164
- # %%
165
- nlp_analyser.registry.add_recognizer(street_recogniser)
166
- nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
167
- nlp_analyser.registry.add_recognizer(titles_recogniser)
168
-
 
tools/unstructured_funcs.py CHANGED
@@ -173,8 +173,29 @@ def add_parent_title_to_meta(elements:List[Element], chapter_ids:List[str], excl
173
 
174
  return elements
175
 
 
 
 
 
 
 
 
 
 
176
 
177
- def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking", minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, include_orig_elements:bool=include_orig_elements):
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  '''
180
  Use Unstructured.io functions to chunk an Element object by Title or across all elements.
@@ -186,33 +207,44 @@ def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:st
186
 
187
  ### Break text down into chunks
188
 
189
- try:
190
 
191
- if chunk_type == "Chunk within title":
192
- chunks = chunk_by_title(
193
- elements,
194
- include_orig_elements=include_orig_elements,
195
- combine_text_under_n_chars=minimum_chunk_length,
196
- new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
197
- max_characters=hard_max_character_length_chunks,
198
- multipage_sections=multipage_sections,
199
- overlap_all=overlap_all
200
- )
201
 
202
- else:
203
- chunks = chunk_elements(
204
- elements,
205
- include_orig_elements=include_orig_elements,
206
- new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
207
- max_characters=hard_max_character_length_chunks,
208
- overlap_all=overlap_all
209
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  except Exception as output_summary:
212
  print(output_summary)
213
  return output_summary, output_files, file_name_base
 
 
214
 
215
- chunk_sections, chunk_df, chunks_out = element_chunks_to_document(chunks, chapter_ids)
216
 
217
  file_name_suffix = "_chunk"
218
 
@@ -316,9 +348,6 @@ def write_elements_to_documents(elements:List[Element]):
316
  element_doc = [Document(page_content=element.text, metadata= meta)]
317
  doc_sections.extend(element_doc)
318
 
319
- #print("Doc format: ", doc_sections)
320
-
321
-
322
  return doc_sections
323
 
324
  # %%
@@ -434,9 +463,7 @@ def export_elements_as_table_to_file(elements:List[Element], file_name_base:str,
434
  if chunk_documents:
435
  out_documents = chunk_documents
436
  else:
437
- out_documents = write_elements_to_documents(elements)
438
-
439
-
440
 
441
  out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
442
  with gzip.open(out_file_name_docs, 'wb') as file:
@@ -528,357 +555,4 @@ def modify_metadata_elements(elements_out_cleaned:List[Element], meta_keys_to_fi
528
  elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
529
  elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
530
 
531
- return elements_out_filtered_meta_mod
532
- # %%
533
- # file_stub = "C:/Users/SPedrickCase/OneDrive - Lambeth Council/Apps/doc_rag_prep/examples/"
534
- # filenames = []
535
- # pdf_filename = [file_stub + "Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf"]
536
- # filenames.extend(pdf_filename)
537
-
538
- # html_filename = [file_stub + "transport-strategy.html"]
539
- # filenames.extend(html_filename)
540
-
541
- # docx_filename = [file_stub + "FINAL Policy and Procedure for Writing Housing Policies.docx"]
542
- # filenames.extend(docx_filename)
543
-
544
- # out_message, elements_parse = partition_file(filenames=filenames, pdf_partition_strat="ocr_only")
545
-
546
- # for element in elements_parse[:10]:
547
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
548
- # elements_out = elements_parse.copy()
549
-
550
- # %% [markdown]
551
- # ### Process with document layout detection - fast strategy
552
- #
553
- # The "fast" strategy will extract the text using pdfminer and process the raw text with partition_text. If the PDF text is not extractable, partition_pdf will fall back to "ocr_only". We recommend using the "fast" strategy in most cases where the PDF has extractable text.
554
- # elements_out_parse = partition_pdf(filename=filename, strategy="fast")
555
- # for element in elements_out_parse[:10]:
556
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
557
- # elements_out = elements_out_parse.copy()
558
- # ### OCR only
559
- #
560
- # The "ocr_only" strategy runs the document through Tesseract for OCR and then runs the raw text through partition_text. Currently, "hi_res" has difficulty ordering elements for documents with multiple columns. If you have a document with multiple columns that does not have extractable text, we recommend using the "ocr_only" strategy. "ocr_only" falls back to "fast" if Tesseract is not available and the document has extractable text.
561
- # elements_out_parse = partition_pdf(filename=filename, strategy="ocr_only")
562
- # for element in elements_out_parse[:10]:
563
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
564
- # elements_out = elements_out_parse.copy()
565
- # ### Hi-res partitioning
566
- #
567
- # The "hi_res" strategy will identify the layout of the document using detectron2. The advantage of “hi_res” is that it uses the document layout to gain additional information about document elements. We recommend using this strategy if your use case is highly sensitive to correct classifications for document elements. If detectron2 is not available, the "hi_res" strategy will fall back to the "ocr_only" strategy.
568
- # elements_out = partition_pdf(filename=filename, strategy="hi_res")
569
- # for element in elements_out[:10]:
570
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
571
-
572
- # %% [markdown]
573
- # ## Clean data
574
-
575
- # %%
576
- # elements_out_cleaned = clean_elements(elements_out.copy(), bytes_to_string=False,
577
- # replace_quotes=True ,
578
- # clean_non_ascii=False,
579
- # clean_ordered_list=True ,
580
- # group_paragraphs=True,
581
- # trailing_punctuation=False,
582
- # all_punctuation=False,
583
- # clean_text=True ,
584
- # extra_whitespace=True,
585
- # dashes=True ,
586
- # bullets=True ,
587
- # lowercase=False)
588
-
589
- # %% [markdown]
590
- # ## Add/remove elements to/from metadata
591
-
592
-
593
-
594
- # %% [markdown]
595
- # ### Write to table, dictionary, document format
596
-
597
- # %%
598
- ### Dataframe format
599
-
600
- # elements_out_filtered_df = convert_to_dataframe(elements_out_filtered_meta_mod)
601
-
602
- # elements_out_filtered_df.to_csv("table.csv")
603
- # elements_out_filtered_df.head(6)
604
-
605
- # # %%
606
- # ### Dictionary format
607
-
608
- # elements_out_filtered_dict = convert_to_dict(elements_out_filtered_meta_mod)
609
- # elements_out_filtered_dict[20]
610
-
611
- # # %% [markdown]
612
- # # ### Document format for embeddings
613
-
614
- # # %%
615
- # doc_sections = write_elements_to_documents(elements_out_filtered_meta_mod, element_types_to_filter)
616
-
617
- # doc_sections[0:10]
618
-
619
- # # %% [markdown]
620
- # # ### Break text down into chunks
621
-
622
- # # %%
623
- # chunks_by_title = chunk_by_title(
624
- # elements_out_filtered_meta_mod,
625
- # include_orig_elements=True,
626
- # combine_text_under_n_chars=minimum_chunk_length,
627
- # new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
628
- # max_characters=hard_max_character_length_chunks,
629
- # multipage_sections=True,
630
- # overlap_all=True
631
- # )
632
-
633
- # chunk_sections, chunk_df = element_chunks_to_document(chunks_by_title, chapter_ids)
634
- # chunk_df.to_csv("chunked_df.csv")
635
- # print(chunk_sections[2])
636
-
637
- # # %%
638
- # chunks_basic = chunk_elements(
639
- # elements_out_filtered_meta_mod,
640
- # include_orig_elements=True,
641
- # new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
642
- # max_characters=hard_max_character_length_chunks,
643
- # overlap_all=True
644
- # )
645
-
646
- # chunk_basic_sections, chunk_basic_df = element_chunks_to_document(chunks_basic, chapter_ids)
647
- # chunk_basic_df.to_csv("chunked_basic_df.csv")
648
-
649
- # %% [markdown]
650
- # # Partition Word document
651
- #
652
- # You cannot get location metadata for bounding boxes from word documents
653
-
654
- # %%
655
- # word_filename = "../examples/FINAL Policy and Procedure for Writing Housing Policies.docx"
656
-
657
- # # %%
658
- # docx_elements = partition(filename=word_filename)
659
- # for element in docx_elements:
660
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
661
-
662
- # # %%
663
- # docx_elements[5].text
664
-
665
- # # %%
666
- # docx_elements[5].category
667
-
668
- # # %%
669
- # docx_elements[5].metadata.to_dict()
670
-
671
- # # %% [markdown]
672
- # # ## Find elements associated with chapters
673
-
674
- # # %%
675
- # chapter_ids, chapter_to_id = create_title_id_dict(docx_elements)
676
-
677
- # chapter_ids
678
-
679
- # # %%
680
- # doc_sections = write_elements_to_documents(docx_elements.copy(), chapter_ids)
681
-
682
- # # %%
683
- # doc_sections
684
-
685
- # # %% [markdown]
686
- # # ### Chunk documents
687
-
688
- # # %%
689
- # chunks = chunk_by_title(
690
- # docx_elements,
691
- # include_orig_elements=False,
692
- # combine_text_under_n_chars=0,
693
- # new_after_n_chars=500,
694
- # max_characters=1000,
695
- # multipage_sections=True,
696
- # overlap_all=True
697
- # )
698
-
699
- # # %%
700
- # print(chunks)
701
-
702
- # # %%
703
- # chunk_sections = element_chunks_to_document(chunks.copy(), docx_elements.copy(), chapter_ids)
704
-
705
- # # %%
706
- # chunk_sections[5].page_content
707
-
708
- # # %%
709
- # chunk_sections[5].metadata["true_element_ids"]
710
-
711
- # # %%
712
- # for element in docx_elements:
713
- # if element._element_id in chunk_sections[5].metadata["true_element_ids"]:
714
- # print(element.text)
715
-
716
- # # %% [markdown]
717
- # # # Partition PPTX document
718
-
719
- # # %%
720
- # pptx_filename = "../examples/LOTI presentation Jan 2024.pptx"
721
-
722
- # # %%
723
- # pptx_elements = partition(filename=pptx_filename)
724
- # for element in pptx_elements[:10]:
725
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
726
-
727
- # # %%
728
- # chapter_ids, chapter_to_id = create_title_id_dict(pptx_elements)
729
- # chapter_ids
730
-
731
- # # %%
732
- # pptx_sections = write_elements_to_documents(pptx_elements.copy(), chapter_ids)
733
-
734
- # # %%
735
- # pptx_sections
736
-
737
- # # %%
738
- # pptx_chunks = chunk_by_title(
739
- # pptx_elements,
740
- # include_orig_elements=False,
741
- # combine_text_under_n_chars=0,
742
- # new_after_n_chars=500,
743
- # max_characters=1000,
744
- # multipage_sections=True,
745
- # overlap_all=True
746
- # )
747
-
748
- # # %%
749
- # pptx_chunk_sections = element_chunks_to_document(pptx_chunks.copy(), pptx_elements.copy(), chapter_ids)
750
-
751
- # # %% [markdown]
752
- # # ### Load documents into a vectorDB (Not necessary)
753
-
754
- # # %%
755
- # import chromadb
756
-
757
- # # %%
758
- # client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
759
- # client.reset()
760
-
761
- # # %%
762
- # collection = client.create_collection(
763
- # name="policy_statements",
764
- # metadata={"hnsw:space": "cosine"}
765
- # )
766
-
767
- # # %%
768
- # chapter_ids
769
-
770
- # # %%
771
- # for element in docx_elements:
772
- # parent_id = element.metadata.parent_id
773
- # #print(element.text)
774
- # #print(parent_id)
775
- # #print(element.metadata.to_dict())
776
- # if parent_id:
777
- # try:
778
- # print(parent_id)
779
- # chapter = chapter_ids[parent_id]
780
- # print(chapter)
781
- # except KeyError:
782
- # chapter = "None"
783
- # else:
784
- # chapter = "None"
785
- # collection.add(
786
- # documents=[element.text],
787
- # ids=[element._element_id],
788
- # metadatas=[{"chapter": chapter}]
789
- # )
790
-
791
- # # %% [markdown]
792
- # # #### See the elements in the VectorDB and perform hybrid search
793
-
794
- # # %%
795
- # results = collection.peek()
796
- # print(results["documents"])
797
-
798
- # # %%
799
- # print(collection.metadata)
800
-
801
- # # %%
802
- # import json
803
-
804
- # result = collection.query(
805
- # query_texts=["What should policies do?"],
806
- # n_results=2,
807
- # where={"chapter": '3.0 Policy Statements'},
808
- # )
809
- # print(json.dumps(result, indent=2))
810
-
811
- # # %%
812
- # collection = client.create_collection(
813
- # name="policy_statements_chunk",
814
- # metadata={"hnsw:space": "cosine"}
815
- # )
816
-
817
- # # %%
818
- # for element in chunks:
819
- # parent_id = element.metadata.parent_id
820
- # #print(element.text)
821
- # #print(parent_id)
822
- # #print(element.metadata.to_dict())
823
- # if parent_id:
824
- # try:
825
- # print(parent_id)
826
- # chapter = chapter_ids[parent_id]
827
- # print(chapter)
828
- # except KeyError:
829
- # chapter = "None"
830
- # else:
831
- # chapter = "None"
832
-
833
- # print(element._element_id)
834
- # collection.add(
835
- # documents=[element.text],
836
- # ids=[element.orig_elements],
837
- # metadatas=[{"chapter": chapter}]
838
- # )
839
-
840
- # # %% [markdown]
841
- # # # Partition HTML
842
-
843
- # # %%
844
- # html_filename = "../examples/transport-strategy.html"
845
-
846
- # # %%
847
- # html_elements = partition(filename=html_filename)
848
- # for element in html_elements[:10]:
849
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
850
-
851
- # # %% [markdown]
852
- # # # Partition image
853
-
854
- # # %%
855
- # img_filename = "../examples/example_complaint_letter.jpg"
856
-
857
- # # %%
858
- # img_elements = partition(filename=img_filename)
859
- # for element in img_elements[:10]:
860
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
861
-
862
- # # %% [markdown]
863
- # # # Partition XLSX
864
-
865
- # # %%
866
- # xlsx_filename = "../examples/fuel-poverty-sub-regional-tables-2020-2018-data.xlsx"
867
-
868
- # # %%
869
- # xlsx_elements = partition(filename=xlsx_filename)
870
- # for element in xlsx_elements[:10]:
871
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
872
-
873
- # # %% [markdown]
874
- # # # Partition .py
875
-
876
- # # %%
877
- # py_filename = "../examples/app.py"
878
-
879
- # # %%
880
- # py_elements = partition(filename=py_filename)
881
- # for element in py_elements[:10]:
882
- # print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
883
-
884
-
 
173
 
174
  return elements
175
 
176
+ # %%
177
+ def group_by_filename(
178
+ elements: List[Element],
179
+ meta_keys: List[str] = ['filename']
180
+ ) -> List[List[Element]]:
181
+ '''
182
+ Identify elements with the same filename and return them
183
+ '''
184
+ grouped_elements = {} # Dictionary to hold lists of elements by filename
185
 
186
+ for element in elements:
187
+ for key in meta_keys:
188
+ try:
189
+ current_file = element.metadata.__dict__[key] # Get the filename
190
+ if current_file not in grouped_elements:
191
+ grouped_elements[current_file] = [] # Initialize list for this filename
192
+ grouped_elements[current_file].append(element) # Add element to the list
193
+ except KeyError:
194
+ print(f"Key '{key}' not found in element metadata.")
195
+
196
+ return list(grouped_elements.values()) # Return the grouped elements as a list of lists
197
+
198
+ def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking", minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, chunk_within_docs:str="Yes", include_orig_elements:bool=include_orig_elements):
199
 
200
  '''
201
  Use Unstructured.io functions to chunk an Element object by Title or across all elements.
 
207
 
208
  ### Break text down into chunks
209
 
210
+ all_chunks = []
211
 
212
+ #### If chunking within docs, then provide a list of list of elements, with each sublist being a separate document. Else, provide a list of lists of length 1
 
 
 
 
 
 
 
 
 
213
 
214
+ if chunk_within_docs == "No": elements = [elements]
215
+ else: elements = group_by_filename(elements)
216
+
217
+ try:
218
+ for element_group in elements:
219
+ if chunk_type == "Chunk within title":
220
+ chunks = chunk_by_title(
221
+ element_group,
222
+ include_orig_elements=include_orig_elements,
223
+ combine_text_under_n_chars=minimum_chunk_length,
224
+ new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
225
+ max_characters=hard_max_character_length_chunks,
226
+ multipage_sections=multipage_sections,
227
+ overlap_all=overlap_all
228
+ )
229
+
230
+ elif chunk_type == "Basic chunking":
231
+ chunks = chunk_elements(
232
+ element_group,
233
+ include_orig_elements=include_orig_elements,
234
+ new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
235
+ max_characters=hard_max_character_length_chunks,
236
+ overlap_all=overlap_all
237
+ )
238
+
239
+ all_chunks.extend(chunks)
240
 
241
  except Exception as output_summary:
242
  print(output_summary)
243
  return output_summary, output_files, file_name_base
244
+
245
+ # print("all_chunks:", all_chunks)
246
 
247
+ chunk_sections, chunk_df, chunks_out = element_chunks_to_document(all_chunks, chapter_ids)
248
 
249
  file_name_suffix = "_chunk"
250
 
 
348
  element_doc = [Document(page_content=element.text, metadata= meta)]
349
  doc_sections.extend(element_doc)
350
 
 
 
 
351
  return doc_sections
352
 
353
  # %%
 
463
  if chunk_documents:
464
  out_documents = chunk_documents
465
  else:
466
+ out_documents = write_elements_to_documents(elements)
 
 
467
 
468
  out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
469
  with gzip.open(out_file_name_docs, 'wb') as file:
 
555
  elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
556
  elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
557
 
558
+ return elements_out_filtered_meta_mod
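Note on the new group_by_filename helper above: it splits one flat element list back into per-file sub-lists, which chunk_all_elements then chunks one at a time when "Chunk within documents" is set to "Yes". A hedged illustration of that grouping behaviour, using invented Text elements and file names and a simplified dictionary-based grouping rather than the helper itself:

```python
# Illustration only: grouping unstructured elements by metadata filename.
from unstructured.documents.elements import ElementMetadata, Text

elements = [
    Text(text="Intro from file A", metadata=ElementMetadata(filename="a.pdf")),
    Text(text="More from file A", metadata=ElementMetadata(filename="a.pdf")),
    Text(text="Intro from file B", metadata=ElementMetadata(filename="b.docx")),
]

groups = {}
for el in elements:
    # Elements that share a filename end up in the same sub-list.
    groups.setdefault(el.metadata.filename, []).append(el)

grouped = list(groups.values())
print(len(grouped))                    # 2, one group per source file
print([el.text for el in grouped[0]])  # ['Intro from file A', 'More from file A']
```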