Spaces:

seanpedrickcase
/

document_rag_preparation

Running

App Files Files Community

document_rag_preparation / app.py

seanpedrickcase

Can now chunk within files (without overlap). Removed unnecessary code files

1cb0304 6 months ago

raw

history blame

13.6 kB

	import os

	# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
	os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

	from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, custom_regex_load
	from tools.unstructured_funcs import partition_file, clean_elements, export_elements_as_table_to_file, filter_elements_and_metadata, chunk_all_elements, minimum_chunk_length, start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks, multipage_sections, overlap_all
	#from tools.aws_functions import load_data_from_aws
	from tools.clean_funcs import pre_clean, full_entity_list, chosen_redact_entities
	import gradio as gr
	import pandas as pd
	import numpy as np
	from typing import Type, List
	from unstructured.documents.elements import Element

	# Creating an alias for pandas DataFrame using Type
	# NOTE(review): typing.Type[X] denotes the class X itself rather than an instance of X.
	# The alias is only used as a parameter annotation in this file, so there is no runtime effect.
	PandasDataFrame = Type[pd.DataFrame]

	# Presumably puts the bundled Tesseract and Poppler folders on the search path so that
	# OCR / PDF tooling can find them - TODO confirm against tools.helper_functions.
	add_folder_to_path("_internal/tesseract/")
	add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")

	# Presumably creates the output folder when it is missing - TODO confirm against tools.helper_functions.
	ensure_output_folder_exists()

	# NOTE(review): 'language' is not referenced anywhere else in the visible part of this file.
	language = 'en'
	# Values pre-selected in the two filter dropdowns (element types / metadata keys).
	default_meta_keys_to_filter=["file_directory", "filetype"]
	default_element_types_to_filter = ['UncategorizedText', 'Header']

	# Upper bound shared by the three chunk-length sliders in the "Chunk text" section.
	max_chunk_length = 25000 # characters


	def get_element_metadata(elements, prefix=""):
	    """
	    Collect the distinct metadata key names found on a list of elements.

	    Each element is expected to expose a ``metadata`` object whose ``__dict__``
	    holds the metadata fields. Dictionary-valued fields are walked recursively
	    and reported as dotted names (e.g. ``data_source.url``); every other field
	    is reported under its own name.

	    Args:
	        elements: Iterable of objects with a ``metadata`` attribute.
	        prefix: Optional name prepended (followed by a dot) to every key.

	    Returns:
	        List of unique key names, in first-seen order.
	    """
	    result = []
	    seen = set()  # O(1) duplicate check; ``result`` keeps the first-seen order

	    def _collect(mapping, path):
	        # Walk one level of a metadata mapping, descending into nested dictionaries.
	        for key, value in mapping.items():
	            name = f"{path}.{key}" if path else f"{key}"
	            if isinstance(value, dict):  # Nested metadata
	                _collect(value, name)
	            elif name not in seen:  # Leaf element
	                seen.add(name)
	                result.append(name)

	    for element in elements:
	        # getattr with defaults so that a missing/None ``metadata`` is reported instead of raising
	        metadata_fields = getattr(getattr(element, "metadata", None), "__dict__", None)

	        if isinstance(metadata_fields, dict):
	            _collect(metadata_fields, prefix)
	        else:
	            print(f"Warning: Element {element} does not have a metadata dictionary.") # Handle elements without metadata gracefully

	    return result

	def update_filter_dropdowns(elements_table:PandasDataFrame, elements:List[Element]):
	    """
	    Build the two filter dropdowns from freshly partitioned data.

	    Args:
	        elements_table: Table of elements; its 'type' column supplies the element type choices.
	        elements: Elements whose metadata key names supply the metadata choices.

	    Returns:
	        Tuple of two gr.Dropdown objects: (element types, metadata keys). When the table has
	        no 'type' column, two placeholder dropdowns labelled "not available" are returned.
	    """
	    # The 'text' column, when present, is dropped before the table is inspected
	    table_no_text = elements_table.drop('text', axis=1) if 'text' in elements_table.columns else elements_table

	    # Guard clause: without a 'type' column there is nothing to build the choices from
	    if 'type' not in table_no_text.columns:
	        print("Warning: 'type' column not found in the DataFrame.")
	        types_placeholder = gr.Dropdown(label="Element types (not available)")
	        meta_placeholder = gr.Dropdown(label="Metadata properties (not available)")
	        return types_placeholder, meta_placeholder

	    found_types = table_no_text['type'].unique().tolist()
	    found_meta_keys = get_element_metadata(elements)

	    # Pre-select only those defaults that actually occur in the loaded data
	    preselected_types = [default_type for default_type in default_element_types_to_filter if default_type in found_types]
	    preselected_meta_keys = [default_key for default_key in default_meta_keys_to_filter if default_key in found_meta_keys]

	    types_dropdown = gr.Dropdown(
	        value=preselected_types,
	        choices=found_types,
	        multiselect=True,
	        interactive=True,
	        label="Choose element types to exclude from element list",
	    )
	    meta_dropdown = gr.Dropdown(
	        value=preselected_meta_keys,
	        choices=found_meta_keys,
	        multiselect=True,
	        interactive=True,
	        label="Choose metadata keys to filter out",
	    )

	    return types_dropdown, meta_dropdown

	# Create the gradio interface

	# Top-level Gradio container; all components and event handlers below are declared inside it.
	block = gr.Blocks(theme = gr.themes.Base())

	with block:

	# Session state holders. elements_state, elements_table_state, in_colnames_state, custom_regex_state
	# and output_name_state are wired into the event handlers at the bottom of this block.
	# NOTE(review): the other states declared here are not referenced by any handler in this block -
	# possibly leftovers from another app; confirm before removing.
	elements_state = gr.State([])
	elements_table_state = gr.State(pd.DataFrame())
	metadata_keys_state = gr.State([])
	output_image_files_state = gr.State([])
	output_file_list_state = gr.State([])
	in_colnames_state = gr.State("text")

	data_state = gr.State(pd.DataFrame())
	embeddings_state = gr.State(np.array([]))
	embeddings_type_state = gr.State("")
	topic_model_state = gr.State()
	assigned_topics_state = gr.State([])
	custom_regex_state = gr.State(pd.DataFrame())
	docs_state = gr.State()
	data_file_name_no_ext_state = gr.State()
	label_list_state = gr.State(pd.DataFrame())
	output_name_state = gr.State("")

	# Introductory text shown at the top of the app
	gr.Markdown(
	"""
	# Document RAG preparation
	Extract text from documents and convert into tabular format using the Unstructured package. The outputs can then be used downstream for e.g. RAG/other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.). Outputs csvs and files in a 'Document' format commonly used as input to vector databases e.g. ChromaDB, or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
	""")

	# Single tab holding the whole workflow: upload/partition, filter/clean/anonymise, chunk, file outputs
	with gr.Tab("Partition document"):

	# Step 1: file upload and choice of PDF partition strategy
	with gr.Accordion("Upload files - accepts .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.)", open = True):
	in_file = gr.File(label="Choose file", file_count= "multiple", height=100)
	in_pdf_partition_strategy = gr.Radio(label="PDF partition strategy", value = "fast", choices=["fast", "ocr_only", "hi_res"])

	partition_btn = gr.Button("Partition documents (outputs appear below)", variant='primary')

	# Step 2 (optional): filtering, cleaning and anonymisation controls
	with gr.Accordion("Clean, anonymise, or filter text elements", open = False):
	with gr.Accordion("Filter element types from text and information from metadata", open = False):
	# Both dropdowns start with the module-level defaults; their choices are replaced after partitioning
	element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
	meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")

	filter_meta_btn = gr.Button("Filter elements/metadata", variant='primary')

	with gr.Accordion("Clean/anonymise text", open = False):
	with gr.Row():
	clean_options = gr.Dropdown(choices = ["Convert bytes to string","Replace quotes","Clean non ASCII","Clean ordered list", "Group paragraphs",
	"Remove trailing punctuation", "Remove all punctuation","Clean text","Remove extra whitespace", "Remove dashes","Remove bullets",
	"Make lowercase"],
	value=["Clean ordered list", "Group paragraphs", "Clean non ASCII", "Remove extra whitespace", "Remove dashes", "Remove bullets"],
	label="Clean options", multiselect=True, interactive=True)

	with gr.Accordion("Clean with custom regex", open = False):
	gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
	clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove custom regex.")
	with gr.Row():
	custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
	custom_regex_text = gr.Textbox(label="Custom regex load status")

	with gr.Accordion("Anonymise text", open = False):
	anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data. Personal details are redacted - not 100% effective. Please check results afterwards!")
	with gr.Row():
	anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
	# Default selection and full choice list both come from tools.clean_funcs
	anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")

	unstructured_clean_btn = gr.Button("Clean data", variant='primary')

	# Step 3 (optional): chunking controls
	with gr.Accordion("Chunk text", open = False):
	with gr.Row():
	chunk_within_docs_rad = gr.Radio(label="Chunk within documents", value = "No", choices = ["Yes", "No"], interactive=True)
	chunking_method_rad = gr.Radio(label="Basic chunking or by title", value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
	multipage_sections_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks.", interactive=True)
	overlap_all_drop =gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap over adjacent element text if needed.", interactive=True)
	with gr.Row():
	# Slider defaults are imported from tools.unstructured_funcs; all three sliders share max_chunk_length as upper bound
	minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=max_chunk_length, step = 100, label= "Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
	start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=max_chunk_length, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
	hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=max_chunk_length, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)

	chunk_btn = gr.Button("Chunk document(s)", variant='primary')

	# Save chunked data to file
	# The summary textbox and file component below are the shared output targets of every handler wired up further down
	with gr.Accordion("File outputs", open = True):
	with gr.Row():
	output_summary = gr.Textbox(label="Output summary")
	output_file = gr.File(label="Output file")

	# AWS functions not yet implemented in this app
	# with gr.Tab(label="AWS data load"):
	# with gr.Accordion(label = "AWS data access", open = True):
	# aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
	# with gr.Row():
	# in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
	# load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")

	# aws_log_box = gr.Textbox(label="AWS data load status")

	# Partition data, then Update filter dropdowns from loaded data
	# The second step reads the table and element states written by partition_file and replaces both filter dropdowns.
	partition_btn.click(fn = partition_file, inputs=[in_file, in_pdf_partition_strategy],
	outputs=[output_summary, elements_state, output_file, output_name_state, elements_table_state], api_name="partition").\
	then(fn = update_filter_dropdowns, inputs=[elements_table_state, elements_state], outputs=[element_types_to_filter, meta_keys_to_filter])

	# Clean data
	## Filter metadata
	# The filtered elements overwrite elements_state, then are exported to refresh the summary and output file.

	filter_meta_btn.click(fn=filter_elements_and_metadata, inputs=[elements_state, element_types_to_filter, meta_keys_to_filter], outputs=[elements_state]).\
	then(fn=export_elements_as_table_to_file, inputs=[elements_state, output_name_state], outputs=[output_summary, output_file])

	## General text clean and anonymisation

	### Custom regex load
	# Uploading a regex file fills the status textbox and custom_regex_state (later passed to pre_clean).
	custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])

	# Two chained steps: clean_elements receives the selected clean options; pre_clean then receives the
	# updated elements together with the custom-regex and anonymisation settings.
	unstructured_clean_btn.click(fn=clean_elements, inputs=[elements_state, clean_options, output_name_state], outputs=[elements_state, output_summary, output_file, output_name_state]).\
	then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])

	## Chunk data
	# Note: elements_state is not among the outputs, so chunking does not overwrite the stored elements.
	chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop, chunk_within_docs_rad], outputs=[output_summary, output_file, output_name_state])

	# Loading AWS data - not yet implemented in this app
	# load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])

	# Simple run
	if __name__ == "__main__":
	    # Enable the request queue first, then start the server with errors surfaced in the UI
	    # and a browser window opened automatically.
	    queued_app = block.queue()
	    queued_app.launch(show_error=True, inbrowser=True)