Spaces:

seanpedrickcase
/

document_rag_preparation

Running

App Files Files Community

seanpedrickcase committed on Jan 6

Commit

1cb0304

1 Parent(s): 28347d9

Can now chunk within files (without overlap). Removed unnecessary code files

Browse files

Files changed (6) hide show

app.py +13 -9
tools/anonymiser.py +1 -1
tools/file_conversion.py +0 -140
tools/file_redaction.py +0 -236
tools/load_spacy_model_custom_recognisers.py +0 -168
tools/unstructured_funcs.py +55 -381

app.py CHANGED Viewed

@@ -25,6 +25,8 @@ language = 'en'
 default_meta_keys_to_filter=["file_directory", "filetype"]
 default_element_types_to_filter = ['UncategorizedText', 'Header']
 def get_element_metadata(elements, prefix=""):
     """Recursively retrieves element names and metadata in the desired format."""
@@ -117,7 +119,7 @@ with block:
                 element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
                 meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")
-                filter_meta_btn = gr.Button("Filter elements/metadata")
             with gr.Accordion("Clean/anonymise text", open = False):
                 with gr.Row():
@@ -140,19 +142,20 @@ with block:
                         anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
                         anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")
-                unstructured_clean_btn = gr.Button("Clean data")
         with gr.Accordion("Chunk text", open = False):
             with gr.Row():
-                chunking_method_rad = gr.Radio(value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
                 multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
                 overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
             with gr.Row():
-                minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=10000, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
-                start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=10000, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
-                hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=10000, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)
-            chunk_btn = gr.Button("Chunk document")
         # Save chunked data to file
         with gr.Accordion("File outputs", open = True):
@@ -190,10 +193,11 @@ with block:
     then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])
     ## Chunk data
-    chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop], outputs=[output_summary, output_file, output_name_state])
     # Loading AWS data - not yet implemented in this app
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
 # Simple run
-block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861

 default_meta_keys_to_filter=["file_directory", "filetype"]
 default_element_types_to_filter = ['UncategorizedText', 'Header']
+max_chunk_length = 25000 # characters
 def get_element_metadata(elements, prefix=""):
     """Recursively retrieves element names and metadata in the desired format."""
                 element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
                 meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")
+                filter_meta_btn = gr.Button("Filter elements/metadata", variant='primary')
             with gr.Accordion("Clean/anonymise text", open = False):
                 with gr.Row():
                         anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
                         anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")
+                unstructured_clean_btn = gr.Button("Clean data", variant='primary')
         with gr.Accordion("Chunk text", open = False):
             with gr.Row():
+                chunk_within_docs_rad = gr.Radio(label="Chunk within documents", value = "No", choices = ["Yes", "No"], interactive=True)
+                chunking_method_rad = gr.Radio(label="Basic chunking or by title", value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
                 multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
                 overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
             with gr.Row():
+                minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=max_chunk_length, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
+                start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=max_chunk_length, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
+                hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=max_chunk_length, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)
+            chunk_btn = gr.Button("Chunk document(s)", variant='primary')
         # Save chunked data to file
         with gr.Accordion("File outputs", open = True):
     then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])
     ## Chunk data
+    chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop, chunk_within_docs_rad], outputs=[output_summary, output_file, output_name_state])
     # Loading AWS data - not yet implemented in this app
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
 # Simple run
+if __name__ == "__main__":
+    block.queue().launch(show_error=True, inbrowser=True)

tools/anonymiser.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from spacy.cli import download
 import spacy
 from tools.presidio_analyzer_custom import analyze_dict
-from tools.load_spacy_model_custom_recognisers import nlp_analyser
 from typing import List
 from unstructured.documents.elements import Element

 from spacy.cli import download
 import spacy
 from tools.presidio_analyzer_custom import analyze_dict
+#from tools.load_spacy_model_custom_recognisers import nlp_analyser
 from typing import List
 from unstructured.documents.elements import Element

tools/file_conversion.py DELETED Viewed

@@ -1,140 +0,0 @@
-from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end
-from PIL import Image
-import os
-from gradio import Progress
-from typing import List
-def is_pdf_or_image(filename):
-    """
-    Check if a file name is a PDF or an image file.
-    Args:
-        filename (str): The name of the file.
-    Returns:
-        bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
-    """
-    if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
-        output = True
-    else:
-        output = False
-    return output
-def is_pdf(filename):
-    """
-    Check if a file name is a PDF.
-    Args:
-        filename (str): The name of the file.
-    Returns:
-        bool: True if the file name ends with ".pdf", False otherwise.
-    """
-    return filename.lower().endswith(".pdf")
-# %%
-## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
-    # Get the number of pages in the PDF
-    page_count = pdfinfo_from_path(pdf_path)['Pages']
-    print("Number of pages in PDF: ", str(page_count))
-    images = []
-    # Open the PDF file
-    for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
-        print("Current page: ", str(page_num))
-        # Convert one page to image
-        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
-        # If no images are returned, break the loop
-        if not image:
-            break
-        images.extend(image)
-    print("PDF has been converted to images.")
-    return images
-# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
-def process_file(file_path):
-    # Get the file extension
-    file_extension = os.path.splitext(file_path)[1].lower()
-    # Check if the file is an image type
-    if file_extension in ['.jpg', '.jpeg', '.png']:
-        print(f"{file_path} is an image file.")
-        # Perform image processing here
-        out_path = [Image.open(file_path)]
-    # Check if the file is a PDF
-    elif file_extension == '.pdf':
-        print(f"{file_path} is a PDF file. Converting to image set")
-        # Run your function for processing PDF files here
-        out_path = convert_pdf_to_images(file_path)
-    else:
-        print(f"{file_path} is not an image or PDF file.")
-        out_path = ['']
-    return out_path
-def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
-    out_message = ''
-    out_file_paths = []
-    in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
-    if file_path:
-        file_path_without_ext = get_file_path_end(file_path)
-    else:
-        out_message = "No file selected"
-        print(out_message)
-        return out_message, out_file_paths
-    if in_redact_method == "Image analysis":
-        # Analyse and redact image-based pdf or image
-        if is_pdf_or_image(file_path) == False:
-            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
-        out_file_path = process_file(file_path)
-    elif in_redact_method == "Text analysis":
-        if is_pdf(file_path) == False:
-            return "Please upload a PDF file for text analysis.", None
-        out_file_path = file_path
-    return out_message, out_file_path
-def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
-    file_path_without_ext = get_file_path_end(in_file_path)
-    out_file_paths = out_text_file_path
-    # Convert annotated text pdf back to image to give genuine redactions
-    print("Creating image version of results")
-    pdf_text_image_paths = process_file(out_text_file_path[0])
-    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
-    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
-    out_file_paths.append(out_text_image_file_path)
-    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
-    return out_message, out_file_paths

tools/file_redaction.py DELETED Viewed

@@ -1,236 +0,0 @@
-from PIL import Image
-from typing import List
-import pandas as pd
-from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
-from pdfminer.high_level import extract_pages
-from tools.file_conversion import process_file
-from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
-from pikepdf import Pdf, Dictionary, Name
-from gradio import Progress
-import time
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
-from tools.helper_functions import get_file_path_end
-from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
-import gradio as gr
-def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
-    tic = time.perf_counter()
-    out_message = ''
-    out_file_paths = []
-    if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
-    if file_path:
-         file_path_without_ext = get_file_path_end(file_path)
-    else:
-         out_message = "No file selected"
-         print(out_message)
-         return out_message, out_file_paths
-    if in_redact_method == "Image analysis":
-        # Analyse and redact image-based pdf or image
-        # if is_pdf_or_image(file_path) == False:
-        #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
-        pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
-        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
-        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
-        out_file_paths.append(out_image_file_path)
-        out_message = "Image-based PDF successfully redacted and saved to file."
-    elif in_redact_method == "Text analysis":
-        if is_pdf(file_path) == False:
-            return "Please upload a PDF file for text analysis.", None
-        # Analyse text-based pdf
-        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
-        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
-        pdf_text.save(out_text_file_path)
-        out_file_paths.append(out_text_file_path)
-        out_message = "Text-based PDF successfully redacted and saved to file."
-    else:
-        out_message = "No redaction method selected"
-        print(out_message)
-        return out_message, out_file_paths
-    toc = time.perf_counter()
-    out_time = f"Time taken: {toc - tic:0.1f} seconds."
-    print(out_time)
-    out_message = out_message + "\n\n" + out_time
-    return out_message, out_file_paths, out_file_paths
-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
-    '''
-    take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
-    '''
-    if not image_paths:
-        out_message = "PDF does not exist as images. Converting pages to image"
-        print(out_message)
-        progress(0, desc=out_message)
-        image_paths = process_file(file_path)
-    # Create a new PDF
-    #pdf = pikepdf.new()
-    images = []
-    number_of_pages = len(image_paths)
-    out_message = "Redacting pages"
-    print(out_message)
-    progress(0.1, desc=out_message)
-    for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
-        print("Redacting page ", str(i + 1))
-        # Get the image to redact using PIL lib (pillow)
-        image = image_paths[i] #Image.open(image_paths[i])
-        # %%
-        image_analyser = ImageAnalyzerEngine(nlp_analyser)
-        engine = ImageRedactorEngine(image_analyser)
-        if language == 'en':
-            ocr_lang = 'eng'
-        else: ocr_lang = language
-        # %%
-        # Redact the image with pink color
-        redacted_image = engine.redact(image,
-            fill=(0, 0, 0),
-            ocr_kwargs={"lang": ocr_lang},
-            allow_list=allow_list,
-            ad_hoc_recognizers= None,
-            **{
-                "language": language,
-                "entities": chosen_redact_entities,
-                "score_threshold": score_threshold
-            },
-            )
-        images.append(redacted_image)
-    return images
-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
-    '''
-    Redact chosen entities from a pdf that is made up of multiple pages that are not images.
-    '''
-    combined_analyzer_results = []
-    analyser_explanations = []
-    annotations_all_pages = []
-    analyzed_bounding_boxes_df = pd.DataFrame()
-    pdf = Pdf.open(filename)
-    page_num = 0
-    for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
-        print("Page number is: ", page_num)
-        annotations_on_page = []
-        analyzed_bounding_boxes = []
-        for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
-            analyzer_results = []
-            for text_container in page_layout:
-                if isinstance(text_container, LTTextContainer):
-                    text_to_analyze = text_container.get_text()
-                    analyzer_results = []
-                    characters = []
-                    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
-                                                            language=language,
-                                                            entities=chosen_redact_entities,
-                                                            score_threshold=score_threshold,
-                                                            return_decision_process=False,
-                                                            allow_list=allow_list)
-                        #if analyzer_results:
-                        #    pass
-                        #explanation = analyzer_results[0].analysis_explanation.to_dict()
-                        #analyser_explanations.append(explanation)
-                    characters = [char                    # This is what we want to include in the list
-                            for line in text_container          # Loop through each line in text_container
-                            if isinstance(line, LTTextLine)    # Check if the line is an instance of LTTextLine
-                            for char in line]                   # Loop through each character in the line
-                            #if isinstance(char, LTChar)]  # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
-                    # If any results found
-                    print(analyzer_results)
-                    if len(analyzer_results) > 0 and len(characters) > 0:
-                        analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
-                        combined_analyzer_results.extend(analyzer_results)
-            if len(analyzer_results) > 0:
-                # Create summary df of annotations to be made
-                analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
-                analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
-                analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
-                analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
-                analyzed_bounding_boxes_df_new['page'] = page_num + 1
-                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
-            for analyzed_bounding_box in analyzed_bounding_boxes:
-                bounding_box = analyzed_bounding_box["boundingBox"]
-                annotation = Dictionary(
-                    Type=Name.Annot,
-                    Subtype=Name.Highlight,
-                    QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
-                    Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
-                    C=[0, 0, 0],
-                    CA=1, # Transparency
-                    T=analyzed_bounding_box["result"].entity_type
-                )
-                annotations_on_page.append(annotation)
-            annotations_all_pages.extend([annotations_on_page])
-            print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
-            page.Annots = pdf.make_indirect(annotations_on_page)
-            page_num += 1
-        # Extracting data from dictionaries
-        # extracted_data = []
-        # for item in annotations_all_pages:
-        #     temp_dict = {}
-        #     #print(item)
-        #     for key, value in item.items():
-        #         if isinstance(value, Decimal):
-        #             temp_dict[key] = float(value)
-        #         elif isinstance(value, list):
-        #             temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
-        #         else:
-        #             temp_dict[key] = value
-        #     extracted_data.append(temp_dict)
-        # Creating DataFrame
-        # annotations_out = pd.DataFrame(extracted_data)
-        #print(df)
-        #annotations_out.to_csv("examples/annotations.csv")
-    analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
-    return pdf

tools/load_spacy_model_custom_recognisers.py DELETED Viewed

@@ -1,168 +0,0 @@
-# %%
-from typing import List
-from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
-from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
-import spacy
-spacy.prefer_gpu()
-from spacy.cli.download import download
-import re
-# %%
-model_name = "en_core_web_lg" #"en_core_web_trf"
-score_threshold = 0.001
-# %% [markdown]
-# #### Custom recognisers
-# %%
-# Custom title recogniser
-import re
-titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
-titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
-titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
-titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
-# %%
-# Custom postcode recogniser
-# Define the regex pattern in a Presidio `Pattern` object:
-ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
-# Define the recognizer with one or more patterns
-ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
-# %%
-# Examples for testing
-#text = "I live in 510 Broad st SE5 9NG ."
-#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
-#print("Result:")
-#print(numbers_result)
-# %%
-def extract_street_name(text:str) -> str:
-    """
-    Extracts the street name and preceding word (that should contain at least one number) from the given text.
-    """
-    street_types = [
-    'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
-    'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
-    'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
-    'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
-    'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
-    'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
-    'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
-    'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
-    ]
-    # Dynamically construct the regex pattern with all possible street types
-    street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
-    # The overall regex pattern to capture the street name and preceding word(s)
-    pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
-    pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
-    # Find all matches in text
-    matches = re.finditer(pattern, text, re.IGNORECASE)
-    start_positions = []
-    end_positions = []
-    for match in matches:
-        preceding_word = match.group('preceding_word').strip()
-        street_name = match.group('street_name').strip()
-        start_pos = match.start()
-        end_pos = match.end()
-        print(f"Start: {start_pos}, End: {end_pos}")
-        print(f"Preceding words: {preceding_word}")
-        print(f"Street name: {street_name}")
-        print()
-        start_positions.append(start_pos)
-        end_positions.append(end_pos)
-    return start_positions, end_positions
-# %%
-# Some examples for testing
-#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
-#text = "Roberto lives in Five 10 Broad st in Oregon"
-#text = "Roberto lives in 55 Oregon Square"
-#text = "There is 51a no way I will do that"
-#text = "I am writing to apply for"
-#extract_street_name(text)
-# %%
-class StreetNameRecognizer(EntityRecognizer):
-    def load(self) -> None:
-        """No loading is required."""
-        pass
-    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
-        """
-        Logic for detecting a specific PII
-        """
-        start_pos, end_pos = extract_street_name(text)
-        results = []
-        for i in range(0, len(start_pos)):
-            result = RecognizerResult(
-                        entity_type="STREETNAME",
-                        start = start_pos[i],
-                        end = end_pos[i],
-                        score= 1
-                    )
-            results.append(result)
-        return results
-street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
-# %%
-# Create a class inheriting from SpacyNlpEngine
-class LoadedSpacyNlpEngine(SpacyNlpEngine):
-    def __init__(self, loaded_spacy_model):
-        super().__init__()
-        self.nlp = {"en": loaded_spacy_model}
-# %%
-# Load spacy model
-try:
-	import en_core_web_lg
-	nlp = en_core_web_lg.load()
-	print("Successfully imported spaCy model")
-except:
-	download("en_core_web_lg")
-	nlp = spacy.load("en_core_web_lg")
-	print("Successfully downloaded and imported spaCy model")
-# Pass the loaded model to the new LoadedSpacyNlpEngine
-loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
-# %%
-nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
-                default_score_threshold=score_threshold,
-                supported_languages=["en"],
-                log_decision_process=True,
-                )
-# %%
-nlp_analyser.registry.add_recognizer(street_recogniser)
-nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
-nlp_analyser.registry.add_recognizer(titles_recogniser)

tools/unstructured_funcs.py CHANGED Viewed

@@ -173,8 +173,29 @@ def add_parent_title_to_meta(elements:List[Element], chapter_ids:List[str], excl
     return elements
-def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking",  minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, include_orig_elements:bool=include_orig_elements):
     '''
     Use Unstructured.io functions to chunk an Element object by Title or across all elements.
@@ -186,33 +207,44 @@ def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:st
     ### Break text down into chunks
-    try:
-        if chunk_type == "Chunk within title":
-            chunks = chunk_by_title(
-                elements,
-                include_orig_elements=include_orig_elements,
-                combine_text_under_n_chars=minimum_chunk_length,
-                new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
-                max_characters=hard_max_character_length_chunks,
-                multipage_sections=multipage_sections,
-                overlap_all=overlap_all
-            )
-        else:
-            chunks = chunk_elements(
-                elements,
-                include_orig_elements=include_orig_elements,
-                new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
-                max_characters=hard_max_character_length_chunks,
-                overlap_all=overlap_all
-            )
     except Exception as output_summary:
         print(output_summary)
         return output_summary, output_files, file_name_base
-    chunk_sections, chunk_df, chunks_out = element_chunks_to_document(chunks, chapter_ids)
     file_name_suffix = "_chunk"
@@ -316,9 +348,6 @@ def write_elements_to_documents(elements:List[Element]):
         element_doc = [Document(page_content=element.text, metadata= meta)]
         doc_sections.extend(element_doc)
-        #print("Doc format: ", doc_sections)
     return doc_sections
 # %%
@@ -434,9 +463,7 @@ def export_elements_as_table_to_file(elements:List[Element], file_name_base:str,
     if chunk_documents:
         out_documents = chunk_documents
     else:
-        out_documents = write_elements_to_documents(elements)
     out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
     with gzip.open(out_file_name_docs, 'wb') as file:
@@ -528,357 +555,4 @@ def modify_metadata_elements(elements_out_cleaned:List[Element], meta_keys_to_fi
     elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
     elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
-    return elements_out_filtered_meta_mod
-# %%
-# file_stub = "C:/Users/SPedrickCase/OneDrive - Lambeth Council/Apps/doc_rag_prep/examples/"
-# filenames = []
-# pdf_filename = [file_stub + "Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf"]
-# filenames.extend(pdf_filename)
-# html_filename = [file_stub + "transport-strategy.html"]
-# filenames.extend(html_filename)
-# docx_filename = [file_stub + "FINAL Policy and Procedure for Writing Housing Policies.docx"]
-# filenames.extend(docx_filename)
-# out_message, elements_parse = partition_file(filenames=filenames, pdf_partition_strat="ocr_only")
-# for element in elements_parse[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-#     elements_out = elements_parse.copy()
-# %% [markdown]
-# ###  Process with document layout detection - fast strategy
-#
-# The "fast" strategy will extract the text using pdfminer and process the raw text with partition_text. If the PDF text is not extractable, partition_pdf will fall back to "ocr_only". We recommend using the "fast" strategy in most cases where the PDF has extractable text.
-# elements_out_parse = partition_pdf(filename=filename, strategy="fast")
-# for element in elements_out_parse[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-#  elements_out = elements_out_parse.copy()
-# ### OCR only
-#
-# The "ocr_only" strategy runs the document through Tesseract for OCR and then runs the raw text through partition_text. Currently, "hi_res" has difficulty ordering elements for documents with multiple columns. If you have a document with multiple columns that does not have extractable text, we recommend using the "ocr_only" strategy. "ocr_only" falls back to "fast" if Tesseract is not available and the document has extractable text.
-#  elements_out_parse = partition_pdf(filename=filename, strategy="ocr_only")
-#  for element in elements_out_parse[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-#     elements_out = elements_out_parse.copy()
-# ### Hi-res partitioning
-#
-# The "hi_res" strategy will identify the layout of the document using detectron2. The advantage of “hi_res” is that it uses the document layout to gain additional information about document elements. We recommend using this strategy if your use case is highly sensitive to correct classifications for document elements. If detectron2 is not available, the "hi_res" strategy will fall back to the "ocr_only" strategy.
-# elements_out = partition_pdf(filename=filename, strategy="hi_res")
-# for element in elements_out[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# %% [markdown]
-# ## Clean data
-# %%
-# elements_out_cleaned = clean_elements(elements_out.copy(), bytes_to_string=False,
-# replace_quotes=True ,
-# clean_non_ascii=False,
-# clean_ordered_list=True ,
-# group_paragraphs=True,
-# trailing_punctuation=False,
-# all_punctuation=False,
-# clean_text=True ,
-# extra_whitespace=True,
-# dashes=True ,
-# bullets=True ,
-# lowercase=False)
-# %% [markdown]
-# ## Add/remove elements to/from metadata
-# %% [markdown]
-# ### Write to table, dictionary, document format
-# %%
-### Dataframe format
-# elements_out_filtered_df = convert_to_dataframe(elements_out_filtered_meta_mod)
-# elements_out_filtered_df.to_csv("table.csv")
-# elements_out_filtered_df.head(6)
-# # %%
-# ### Dictionary format
-# elements_out_filtered_dict = convert_to_dict(elements_out_filtered_meta_mod)
-# elements_out_filtered_dict[20]
-# # %% [markdown]
-# # ### Document format for embeddings
-# # %%
-# doc_sections = write_elements_to_documents(elements_out_filtered_meta_mod, element_types_to_filter)
-# doc_sections[0:10]
-# # %% [markdown]
-# # ### Break text down into chunks
-# # %%
-# chunks_by_title = chunk_by_title(
-#     elements_out_filtered_meta_mod,
-#     include_orig_elements=True,
-#     combine_text_under_n_chars=minimum_chunk_length,
-#     new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
-#     max_characters=hard_max_character_length_chunks,
-#     multipage_sections=True,
-#     overlap_all=True
-# )
-# chunk_sections, chunk_df = element_chunks_to_document(chunks_by_title, chapter_ids)
-# chunk_df.to_csv("chunked_df.csv")
-# print(chunk_sections[2])
-# # %%
-# chunks_basic = chunk_elements(
-#     elements_out_filtered_meta_mod,
-#     include_orig_elements=True,
-#     new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
-#     max_characters=hard_max_character_length_chunks,
-#     overlap_all=True
-# )
-# chunk_basic_sections, chunk_basic_df = element_chunks_to_document(chunks_basic, chapter_ids)
-# chunk_basic_df.to_csv("chunked_basic_df.csv")
-# %% [markdown]
-# # Partition Word document
-#
-# You cannot get location metadata for bounding boxes from word documents
-# %%
-# word_filename = "../examples/FINAL Policy and Procedure for Writing Housing Policies.docx"
-# # %%
-# docx_elements = partition(filename=word_filename)
-# for element in docx_elements:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# # %%
-# docx_elements[5].text
-# # %%
-# docx_elements[5].category
-# # %%
-# docx_elements[5].metadata.to_dict()
-# # %% [markdown]
-# # ## Find elements associated with chapters
-# # %%
-# chapter_ids, chapter_to_id = create_title_id_dict(docx_elements)
-# chapter_ids
-# # %%
-# doc_sections = write_elements_to_documents(docx_elements.copy(), chapter_ids)
-# # %%
-# doc_sections
-# # %% [markdown]
-# # ### Chunk documents
-# # %%
-# chunks = chunk_by_title(
-#     docx_elements,
-#     include_orig_elements=False,
-#     combine_text_under_n_chars=0,
-#     new_after_n_chars=500,
-#     max_characters=1000,
-#     multipage_sections=True,
-#     overlap_all=True
-# )
-# # %%
-# print(chunks)
-# # %%
-# chunk_sections = element_chunks_to_document(chunks.copy(), docx_elements.copy(), chapter_ids)
-# # %%
-# chunk_sections[5].page_content
-# # %%
-# chunk_sections[5].metadata["true_element_ids"]
-# # %%
-# for element in docx_elements:
-#     if element._element_id in chunk_sections[5].metadata["true_element_ids"]:
-#         print(element.text)
-# # %% [markdown]
-# # # Partition PPTX document
-# # %%
-# pptx_filename = "../examples/LOTI presentation Jan 2024.pptx"
-# # %%
-# pptx_elements = partition(filename=pptx_filename)
-# for element in pptx_elements[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# # %%
-# chapter_ids, chapter_to_id = create_title_id_dict(pptx_elements)
-# chapter_ids
-# # %%
-# pptx_sections = write_elements_to_documents(pptx_elements.copy(), chapter_ids)
-# # %%
-# pptx_sections
-# # %%
-# pptx_chunks = chunk_by_title(
-#     pptx_elements,
-#     include_orig_elements=False,
-#     combine_text_under_n_chars=0,
-#     new_after_n_chars=500,
-#     max_characters=1000,
-#     multipage_sections=True,
-#     overlap_all=True
-# )
-# # %%
-# pptx_chunk_sections = element_chunks_to_document(pptx_chunks.copy(), pptx_elements.copy(), chapter_ids)
-# # %% [markdown]
-# # ### Load documents into a vectorDB (Not necessary)
-# # %%
-# import chromadb
-# # %%
-# client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
-# client.reset()
-# # %%
-# collection = client.create_collection(
-#     name="policy_statements",
-#     metadata={"hnsw:space": "cosine"}
-# )
-# # %%
-# chapter_ids
-# # %%
-# for element in docx_elements:
-#     parent_id = element.metadata.parent_id
-#     #print(element.text)
-#     #print(parent_id)
-#     #print(element.metadata.to_dict())
-#     if parent_id:
-#         try:
-#             print(parent_id)
-#             chapter = chapter_ids[parent_id]
-#             print(chapter)
-#         except KeyError:
-#             chapter = "None"
-#     else:
-#         chapter = "None"
-#     collection.add(
-#         documents=[element.text],
-#         ids=[element._element_id],
-#         metadatas=[{"chapter": chapter}]
-#     )
-# # %% [markdown]
-# # #### See the elements in the VectorDB and perform hybrid search
-# # %%
-# results = collection.peek()
-# print(results["documents"])
-# # %%
-# print(collection.metadata)
-# # %%
-# import json
-# result = collection.query(
-#     query_texts=["What should policies do?"],
-#     n_results=2,
-#     where={"chapter": '3.0  Policy Statements'},
-# )
-# print(json.dumps(result, indent=2))
-# # %%
-# collection = client.create_collection(
-#     name="policy_statements_chunk",
-#     metadata={"hnsw:space": "cosine"}
-# )
-# # %%
-# for element in chunks:
-#     parent_id = element.metadata.parent_id
-#     #print(element.text)
-#     #print(parent_id)
-#     #print(element.metadata.to_dict())
-#     if parent_id:
-#         try:
-#             print(parent_id)
-#             chapter = chapter_ids[parent_id]
-#             print(chapter)
-#         except KeyError:
-#             chapter = "None"
-#     else:
-#         chapter = "None"
-#     print(element._element_id)
-#     collection.add(
-#         documents=[element.text],
-#         ids=[element.orig_elements],
-#         metadatas=[{"chapter": chapter}]
-#     )
-# # %% [markdown]
-# # # Partition HTML
-# # %%
-# html_filename = "../examples/transport-strategy.html"
-# # %%
-# html_elements = partition(filename=html_filename)
-# for element in html_elements[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# # %% [markdown]
-# # # Partition image
-# # %%
-# img_filename = "../examples/example_complaint_letter.jpg"
-# # %%
-# img_elements = partition(filename=img_filename)
-# for element in img_elements[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# # %% [markdown]
-# # # Partition XLSX
-# # %%
-# xlsx_filename = "../examples/fuel-poverty-sub-regional-tables-2020-2018-data.xlsx"
-# # %%
-# xlsx_elements = partition(filename=xlsx_filename)
-# for element in xlsx_elements[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
-# # %% [markdown]
-# # # Partition .py
-# # %%
-# py_filename = "../examples/app.py"
-# # %%
-# py_elements = partition(filename=py_filename)
-# for element in py_elements[:10]:
-#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

     return elements
+# %%
def group_by_filename(
    elements: List[Element],
    meta_keys: List[str] = ['filename']
) -> List[List[Element]]:
    '''
    Group elements that share the same metadata values (by default, the same filename).

    Every input element is placed in exactly one group. The group is identified by the
    combination of values found under each key in meta_keys, so passing several keys
    groups by all of them together rather than adding the element once per key.

    Elements whose metadata does not contain a key are not discarded: the missing value
    is treated as None, so such elements are collected together in their own group.
    A message is printed the first time each key is found to be missing.

    Args:
        elements: Elements to group. Each must expose a `metadata` object whose
            instance attributes hold the metadata values.
        meta_keys: Names of the metadata attributes to group on. This default list
            is only read, never modified.

    Returns:
        A list of groups (each a list of elements). Groups appear in the order in
        which their first element was encountered, and elements keep their original
        relative order within a group.
    '''
    grouped_elements = {}  # Maps a tuple of metadata values -> elements sharing them
    reported_missing = set()  # Keys already reported as missing, to avoid flooding the log

    for element in elements:
        # Read from the instance dictionary (as opposed to getattr) so that only
        # values actually stored on this metadata object are considered.
        metadata_values = vars(element.metadata)

        group_key = []
        for key in meta_keys:
            if key not in metadata_values and key not in reported_missing:
                print(f"Key '{key}' not found in element metadata.")
                reported_missing.add(key)
            group_key.append(metadata_values.get(key))

        # Dictionaries preserve insertion order, so groups come out in first-seen order
        grouped_elements.setdefault(tuple(group_key), []).append(element)

    return list(grouped_elements.values())  # Return the grouped elements as a list of lists
+def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking",  minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, chunk_within_docs:str="Yes", include_orig_elements:bool=include_orig_elements):
     '''
     Use Unstructured.io functions to chunk an Element object by Title or across all elements.
     ### Break text down into chunks
+    all_chunks = []
+    #### If chunking within docs, then provide a list of list of elements, with each sublist being a separate document. Else, provide a list of lists of length 1
+    if chunk_within_docs == "No": elements = [elements]
+    else: elements = group_by_filename(elements)
+    try:
+        for element_group in elements:
+            if chunk_type == "Chunk within title":
+                chunks = chunk_by_title(
+                    element_group,
+                    include_orig_elements=include_orig_elements,
+                    combine_text_under_n_chars=minimum_chunk_length,
+                    new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
+                    max_characters=hard_max_character_length_chunks,
+                    multipage_sections=multipage_sections,
+                    overlap_all=overlap_all
+                )
+            elif chunk_type == "Basic chunking":
+                chunks = chunk_elements(
+                    element_group,
+                    include_orig_elements=include_orig_elements,
+                    new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
+                    max_characters=hard_max_character_length_chunks,
+                    overlap_all=overlap_all
+                )
+            all_chunks.extend(chunks)
     except Exception as output_summary:
         print(output_summary)
         return output_summary, output_files, file_name_base
+    # print("all_chunks:", all_chunks)
+    chunk_sections, chunk_df, chunks_out = element_chunks_to_document(all_chunks, chapter_ids)
     file_name_suffix = "_chunk"
         element_doc = [Document(page_content=element.text, metadata= meta)]
         doc_sections.extend(element_doc)
     return doc_sections
 # %%
     if chunk_documents:
         out_documents = chunk_documents
     else:
+        out_documents = write_elements_to_documents(elements)
     out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
     with gzip.open(out_file_name_docs, 'wb') as file:
     elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
     elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
+    return elements_out_filtered_meta_mod