from unstructured.partition.auto import partition from unstructured.chunking.title import chunk_by_title from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import Element, Title, CompositeElement from unstructured.staging.base import convert_to_dataframe from typing import Type, List, Literal, Tuple from unstructured.cleaners.core import replace_unicode_quotes, clean_non_ascii_chars, clean_ordered_bullets, group_broken_paragraphs, replace_unicode_quotes, clean, clean_trailing_punctuation, remove_punctuation, bytes_string_to_string import gradio as gr import time import pandas as pd import re import gzip import pickle from pydantic import BaseModel, Field from tools.helper_functions import get_file_path_end, get_file_path_end_with_ext # Creating an alias for pandas DataFrame using Type PandasDataFrame = Type[pd.DataFrame] # %% # pdf partitioning strategy vars pdf_partition_strat = "ocr_only" # ["fast", "ocr_only", "hi_res"] # %% # Element metadata modification vars meta_keys_to_filter = ["file_directory", "filetype"] element_types_to_filter = ['UncategorizedText', 'Header'] # %% # Clean function vars bytes_to_string=False replace_quotes=True clean_non_ascii=False clean_ordered_list=True group_paragraphs=True trailing_punctuation=False all_punctuation=False clean_text=True extra_whitespace=True dashes=True bullets=True lowercase=False # %% # Chunking vars minimum_chunk_length = 2000 start_new_chunk_after_end_of_this_element_length = 2000 hard_max_character_length_chunks = 3000 multipage_sections=True overlap_all=True include_orig_elements=True # %% class Document(BaseModel): """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py""" page_content: str """String text.""" metadata: dict = Field(default_factory=dict) """Arbitrary metadata about the page content (e.g., source, relationships to other documents, etc.). """ type: Literal["Document"] = "Document" # %% def create_title_id_dict(elements:List[Element]): # Assuming the object is stored in a variable named 'elements_list' titles = [item.text for item in elements if isinstance(item, Title)] #### Get all elements under these titles chapter_ids = {} for element in elements: for chapter in titles: if element.text == chapter and element.category == "Title": chapter_ids[element._element_id] = chapter break chapter_to_id = {v: k for k, v in chapter_ids.items()} return chapter_ids, chapter_to_id # %% def filter_elements(elements:List[Element], excluded_elements: List[str] = ['']): """ Filter out elements from a list based on their categories. Args: elements: The list of elements to filter. excluded_elements: A list of element categories to exclude. Returns: A new list containing the filtered elements. """ filtered_elements = [] for element in elements: if element.category not in excluded_elements: filtered_elements.append(element) return filtered_elements # %% def remove_keys_from_meta( elements: List[Element], meta_remove_keys: List[str], excluded_element_types: List[str] = [] ) -> List[Element]: ''' Remove specified metadata keys from an Unstructured Element object ''' for element in elements: if element.category not in excluded_element_types: for key in meta_remove_keys: try: del element.metadata.__dict__[key] # Directly modify metadata except KeyError: print(f"Key '{key}' not found in element metadata.") return elements def filter_elements_and_metadata( elements: List[Element], excluded_categories: List[str] = [], meta_remove_keys: List[str] = [], ) -> List[Element]: """ Filters elements based on categories and removes specified metadata keys. Args: elements: The list of elements to process. excluded_categories: A list of element categories to exclude. meta_remove_keys: A list of metadata keys to remove. Returns: A new list containing the processed elements. """ filtered_elements = [] for element in elements: if element.category not in excluded_categories: for key in meta_remove_keys: try: del element.metadata.__dict__[key] except KeyError: # Better logging/error handling instead of just printing # Use a proper logger or raise a warning/exception pass filtered_elements.append(element) return filtered_elements # %% def add_parent_title_to_meta(elements:List[Element], chapter_ids:List[str], excluded_element_types:List[str]=['']) -> List[Element]: ''' Add parent title to Unstructured metadata elements ''' for element in elements: if element.category in excluded_element_types: pass else: meta = element.metadata.to_dict() if "parent_id" in meta and meta["parent_id"] in chapter_ids and "title_name" not in meta: title_name = chapter_ids[meta["parent_id"]] # Directly modify the existing element metadata object element.metadata.title_name = title_name return elements # %% def group_by_filename( elements: List[Element], meta_keys: List[str] = ['filename'] ) -> List[List[Element]]: ''' Identify elements with the same filename and return them ''' grouped_elements = {} # Dictionary to hold lists of elements by filename for element in elements: for key in meta_keys: try: current_file = element.metadata.__dict__[key] # Get the filename if current_file not in grouped_elements: grouped_elements[current_file] = [] # Initialize list for this filename grouped_elements[current_file].append(element) # Add element to the list except KeyError: print(f"Key '{key}' not found in element metadata.") return list(grouped_elements.values()) # Return the grouped elements as a list of lists def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking", minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, chunk_within_docs:str="Yes", include_orig_elements:bool=include_orig_elements): ''' Use Unstructured.io functions to chunk an Element object by Title or across all elements. ''' output_files = [] output_summary = "" chapter_ids, chapter_to_id = create_title_id_dict(elements) ### Break text down into chunks all_chunks = [] #### If chunking within docs, then provide a list of list of elements, with each sublist being a separate document. Else, provide a list of lists of length 1 if chunk_within_docs == "No": elements = [elements] else: elements = group_by_filename(elements) try: for element_group in elements: if chunk_type == "Chunk within title": chunks = chunk_by_title( element_group, include_orig_elements=include_orig_elements, combine_text_under_n_chars=minimum_chunk_length, new_after_n_chars=start_new_chunk_after_end_of_this_element_length, max_characters=hard_max_character_length_chunks, multipage_sections=multipage_sections, overlap_all=overlap_all ) elif chunk_type == "Basic chunking": chunks = chunk_elements( element_group, include_orig_elements=include_orig_elements, new_after_n_chars=start_new_chunk_after_end_of_this_element_length, max_characters=hard_max_character_length_chunks, overlap_all=overlap_all ) all_chunks.extend(chunks) except Exception as output_summary: print(output_summary) return output_summary, output_files, file_name_base # print("all_chunks:", all_chunks) chunk_sections, chunk_df, chunks_out = element_chunks_to_document(all_chunks, chapter_ids) file_name_suffix = "_chunk" # The new file name does not overwrite the old file name as the 'chunked' elements are only used as an output, and not an input to other functions output_summary, output_files, file_name_base_new = export_elements_as_table_to_file(chunks_out, file_name_base, file_name_suffix, chunk_sections) return output_summary, output_files, file_name_base # %% def element_chunks_to_document(chunks:CompositeElement, chapter_ids:List[str]) -> Tuple[List[Document], PandasDataFrame, List[str]]: ''' Take an Unstructured.io chunk_by_title output with the original parsed document elements and turn it into a Document format commonly used by vector databases, and a Pandas dataframe. ''' chunk_sections = [] current_title_id = '' current_title = '' last_page = '' chunk_df_list = [] for chunk in chunks: chunk_meta = chunk.metadata.to_dict() true_element_ids = [] element_categories = [] titles = [] titles_id = [] if "page_number" in chunk_meta: last_page = chunk_meta["page_number"] chunk_text = chunk.text #chunk_page_number = chunk.metadata.to_dict()["page_number"] # If the same element text is found, add the element_id to the chunk (NOT PERFECT. THIS WILL FAIL IF THE SAME TEXT IS SEEN MULTIPL TIMES) for element in chunk.metadata.orig_elements: #element_text = element.text element_id = element._element_id element_category = element.category element_meta = element.metadata.to_dict() if "page_number" in element_meta: element_page_number = element_meta["page_number"] last_page = element_page_number true_element_ids.append(element_id) element_categories.append(element_category) # Set new metadata for chunk if "page_number" in element_meta: chunk_meta["last_page_number"] = last_page chunk_meta["true_element_ids"] = true_element_ids for loop_id in chunk_meta['true_element_ids']: if loop_id in chapter_ids: current_title = chapter_ids[loop_id] current_title_id = loop_id titles.append(current_title) titles_id.append(current_title_id) chunk_meta['titles'] = titles chunk_meta['titles_id'] = titles_id # Remove original elements data for documents chunk_meta.pop('orig_elements') chunk_dict_for_df = chunk_meta.copy() chunk_dict_for_df['text'] = chunk.text chunk_df_list.append(chunk_dict_for_df) chunk_doc = [Document(page_content=chunk_text, metadata=chunk_meta)] chunk_sections.extend(chunk_doc) ## Write metadata back to elements chunk.metadata.__dict__ = chunk_meta chunk_df = pd.DataFrame(chunk_df_list) # print("Doc format: ", chunk_sections) return chunk_sections, chunk_df, chunks # %% def write_elements_to_documents(elements:List[Element]): ''' Take Unstructured.io parsed elements and write it into a 'Document' format commonly used by vector databases ''' doc_sections = [] for element in elements: meta = element.metadata.to_dict() meta["type"] = element.category meta["element_id"] = element._element_id element_doc = [Document(page_content=element.text, metadata= meta)] doc_sections.extend(element_doc) return doc_sections # %% def clean_elements(elements:List[Element], dropdown_options: List[str] = [''], output_name:str = "combined_elements", bytes_to_string:bool=False, replace_quotes:bool=True, clean_non_ascii:bool=False, clean_ordered_list:bool=True, group_paragraphs:bool=True, trailing_punctuation:bool=False, all_punctuation:bool=False, clean_text:bool=True, extra_whitespace:bool=True, dashes:bool=True, bullets:bool=True, lowercase:bool=False) -> List[Element]: ''' Apply Unstructured cleaning processes to a list of parse elements. ''' out_files = [] output_summary = "" # Set variables to True based on dropdown selections for option in dropdown_options: if option == "Convert bytes to string": bytes_to_string = True elif option == "Replace quotes": replace_quotes = True elif option == "Clean non ASCII": clean_non_ascii = True elif option == "Clean ordered list": clean_ordered_list = True elif option == "Group paragraphs": group_paragraphs = True elif option == "Remove trailing punctuation": trailing_punctuation = True elif option == "Remove all punctuation": all_punctuation = True elif option == "Clean text": clean_text = True elif option == "Remove extra whitespace": extra_whitespace = True elif option == "Remove dashes": dashes = True elif option == "Remove bullets": bullets = True elif option == "Make lowercase": lowercase = True cleaned_elements = elements.copy() for element in cleaned_elements: try: if element: # Check if element is not None or empty if bytes_to_string: element.apply(bytes_string_to_string) if replace_quotes: element.apply(replace_unicode_quotes) if clean_non_ascii: element.apply(clean_non_ascii_chars) if clean_ordered_list: element.apply(clean_ordered_bullets) if group_paragraphs: element.apply(group_broken_paragraphs) if trailing_punctuation: element.apply(clean_trailing_punctuation) if all_punctuation: element.apply(remove_punctuation) if group_paragraphs: element.apply(group_broken_paragraphs) if clean_text: element.apply(lambda x: clean(x, extra_whitespace=extra_whitespace, dashes=dashes, bullets=bullets, lowercase=lowercase)) except Exception as e: print(e) element = element alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(cleaned_elements, output_name, file_name_suffix="_clean") output_summary = "Text elements successfully cleaned." print(output_summary) return cleaned_elements, output_summary, out_files, output_file_base # %% [markdown] def export_elements_as_table_to_file(elements:List[Element], file_name_base:str, file_name_suffix:str="", chunk_documents:List[Document]=[]): ''' Export elements as as a table. ''' output_summary = "" out_files = [] # Convert to dataframe format out_table = convert_to_dataframe(elements) # If the file suffix already exists in the output file name, don't add it again. if file_name_suffix not in file_name_base: out_file_name_base = file_name_base + file_name_suffix else: out_file_name_base = file_name_base out_file_name = "output/" + out_file_name_base + ".csv" out_table.to_csv(out_file_name) out_files.append(out_file_name) # Convert to document format if chunk_documents: out_documents = chunk_documents else: out_documents = write_elements_to_documents(elements) out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz" with gzip.open(out_file_name_docs, 'wb') as file: pickle.dump(out_documents, file) out_files.append(out_file_name_docs) output_summary = "File successfully exported." return output_summary, out_files, out_file_name_base # # Partition PDF def get_file_type(filename): pattern = r"\.(\w+)$" # Match a dot followed by one or more word characters at the end of the string match = re.search(pattern, filename) if match: file_type = match.group(1) # Extract the captured file type (without the dot) print(file_type) # Output: "png" else: print("No file type found.") return file_type # %% def partition_file(filenames:List[str], pdf_partition_strat:str = pdf_partition_strat, progress = gr.Progress()): ''' Partition document files into text elements using the Unstructured package. Currently supports PDF, docx, pptx, html, several image file types, text document types, email messages, code files. ''' out_message = "" combined_elements = [] out_files = [] for file in progress.tqdm(filenames, desc="Partitioning files", unit="files"): try: tic = time.perf_counter() print(file) file_name = get_file_path_end_with_ext(file) file_name_base = get_file_path_end(file) file_type = get_file_type(file_name) image_file_type_list = ["jpg", "jpeg", "png", "heic"] if file_type in image_file_type_list: print("File is an image. Using OCR method to partition.") file_elements = partition(file, strategy="ocr_only") else: file_elements = partition(file, strategy=pdf_partition_strat) toc = time.perf_counter() new_out_message = f"Successfully partitioned file: {file_name} in {toc - tic:0.1f} seconds\n" print(new_out_message) out_message = out_message + new_out_message combined_elements.extend(file_elements) except Exception as e: new_out_message = f"Failed to partition file: {file_name} due to {e}. Partitioning halted." print(new_out_message) out_message = out_message + new_out_message break out_table = convert_to_dataframe(combined_elements) # If multiple files, overwrite default file name for outputs if len(filenames) > 1: file_name_base = "combined_files" alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(combined_elements, file_name_base, file_name_suffix="_elements") return out_message, combined_elements, out_files, output_file_base, out_table # %% def modify_metadata_elements(elements_out_cleaned:List[Element], meta_keys_to_filter:List[str]=meta_keys_to_filter, element_types_to_filter:List[str]=element_types_to_filter) -> List[Element]: ''' Take an element object, add parent title names to metadata. Remove specified metadata keys or element types from element list. ''' chapter_ids, chapter_to_id = create_title_id_dict(elements_out_cleaned.copy()) elements_out_meta_mod = add_parent_title_to_meta(elements_out_cleaned.copy(), chapter_ids) elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter) elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter) return elements_out_filtered_meta_mod