|
import os |
|
|
|
|
|
# Point TLDEXTRACT_CACHE at a snapshot path relative to the working directory.
# NOTE(review): this is set ahead of the project/third-party imports that
# follow, presumably so tldextract reads it when first imported - confirm
# before moving this line.
os.environ["TLDEXTRACT_CACHE"] = "tld/.tld_set_snapshot"
|
|
|
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, custom_regex_load |
|
from tools.unstructured_funcs import partition_file, clean_elements, export_elements_as_table_to_file, filter_elements_and_metadata, chunk_all_elements, minimum_chunk_length, start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks, multipage_sections, overlap_all |
|
|
|
from tools.clean_funcs import pre_clean, full_entity_list, chosen_redact_entities |
|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
from typing import Type, List |
|
from unstructured.documents.elements import Element |
|
|
|
|
|
# Alias used to annotate pandas DataFrame *instances* in function signatures.
# It points straight at the class: `Type[pd.DataFrame]` would instead describe
# the DataFrame class object itself, which is not what parameters annotated
# with this alias (e.g. `elements_table`) actually receive.
PandasDataFrame = pd.DataFrame
|
|
|
# Register the bundled Tesseract and Poppler folders through the project helper.
# NOTE(review): presumably this extends PATH so the packaged binaries can be
# located at runtime - confirm in tools.helper_functions.
add_folder_to_path("_internal/tesseract/")
add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")

# Project helper, called once at import time before the UI is built.
# NOTE(review): presumably creates the output folder if missing - confirm.
ensure_output_folder_exists()
|
|
|
# Language code; not referenced anywhere else in the visible part of this file.
language = "en"

# Initial selections (and initial choices) for the two filter dropdowns.
default_meta_keys_to_filter = ["file_directory", "filetype"]
default_element_types_to_filter = ["UncategorizedText", "Header"]

# Upper bound shared by the three chunk-length sliders.
max_chunk_length = 25_000
|
|
|
|
|
def get_element_metadata(elements, prefix=""):
    """Collect the unique metadata field names found across ``elements``.

    Each element's ``metadata.__dict__`` is walked in order and every key is
    reported once, in first-seen order. When a value is itself a dict, the key
    is reported and its nested keys follow as dotted paths (``key.subkey``),
    so dict-valued fields are no longer silently dropped.

    Args:
        elements: Iterable of objects exposing a ``metadata`` attribute whose
            ``__dict__`` holds the metadata fields.
        prefix: Optional path prepended (with a ``.`` separator) to every
            returned name.

    Returns:
        List of metadata key names without duplicates. Elements that lack a
        usable ``metadata.__dict__`` contribute nothing and trigger a printed
        warning.
    """
    # A dict doubles as an insertion-ordered set: O(1) membership while
    # preserving the order in which names were first encountered.
    seen = {}

    def _collect(mapping, path):
        """Record every key of ``mapping`` under ``path``, descending into dicts."""
        for key, value in mapping.items():
            name = f"{path}.{key}" if path else f"{key}"
            seen.setdefault(name, None)
            if isinstance(value, dict):
                _collect(value, name)

    for element in elements:
        # getattr with defaults avoids an AttributeError when `metadata` is
        # missing, None, or an object without an instance `__dict__`.
        metadata = getattr(element, "metadata", None)
        fields = getattr(metadata, "__dict__", None)
        if isinstance(fields, dict):
            _collect(fields, prefix)
        else:
            print(f"Warning: Element {element} does not have a metadata dictionary.")

    return list(seen)
|
|
|
def update_filter_dropdowns(elements_table:PandasDataFrame, elements:List[Element]):
    """Build refreshed element-type and metadata-key filter dropdowns.

    Choices come from the distinct values of the table's 'type' column and
    from the metadata keys found on ``elements``. The pre-selected values are
    the module-level defaults that actually occur among those choices.

    Args:
        elements_table: Table of elements; a 'text' column, if present, is
            dropped before inspection.
        elements: Elements whose metadata keys populate the second dropdown.

    Returns:
        A pair of ``gr.Dropdown`` components (element types, metadata keys).
        If the table has no 'type' column, a warning is printed and two
        label-only placeholder dropdowns are returned.
    """
    has_text_column = 'text' in elements_table.columns
    table_without_text = elements_table.drop(columns='text') if has_text_column else elements_table

    # Guard clause: without a 'type' column there is nothing to offer.
    if 'type' not in table_without_text.columns:
        print("Warning: 'type' column not found in the DataFrame.")
        types_placeholder = gr.Dropdown(label="Element types (not available)")
        keys_placeholder = gr.Dropdown(label="Metadata properties (not available)")
        return types_placeholder, keys_placeholder

    available_types = table_without_text['type'].unique().tolist()
    available_meta_keys = get_element_metadata(elements)

    # Pre-select only those defaults that are present in the current data.
    preselected_types = [name for name in default_element_types_to_filter if name in available_types]
    preselected_meta_keys = [name for name in default_meta_keys_to_filter if name in available_meta_keys]

    types_dropdown = gr.Dropdown(
        value=preselected_types,
        choices=available_types,
        multiselect=True,
        interactive=True,
        label="Choose element types to exclude from element list",
    )
    meta_keys_dropdown = gr.Dropdown(
        value=preselected_meta_keys,
        choices=available_meta_keys,
        multiselect=True,
        interactive=True,
        label="Choose metadata keys to filter out",
    )
    return types_dropdown, meta_keys_dropdown
|
|
|
|
|
|
|
# Top-level Gradio container for the whole app.
block = gr.Blocks(theme=gr.themes.Base())

# NOTE(review): the nesting below was reconstructed from syntax and blank-line
# grouping because the original indentation was not available - confirm the
# layout (especially which accordion each button sits in) against the source.
with block:

    # Per-session state shared between the event handlers wired up below.
    elements_state = gr.State([])
    elements_table_state = gr.State(pd.DataFrame())
    metadata_keys_state = gr.State([])
    output_image_files_state = gr.State([])
    output_file_list_state = gr.State([])
    in_colnames_state = gr.State("text")

    # NOTE(review): apart from custom_regex_state and output_name_state, none
    # of the states in this group are used by the event wiring visible here.
    data_state = gr.State(pd.DataFrame())
    embeddings_state = gr.State(np.array([]))
    embeddings_type_state = gr.State("")
    topic_model_state = gr.State()
    assigned_topics_state = gr.State([])
    custom_regex_state = gr.State(pd.DataFrame())
    docs_state = gr.State()
    data_file_name_no_ext_state = gr.State()
    label_list_state = gr.State(pd.DataFrame())
    output_name_state = gr.State("")

    gr.Markdown(
    """
    # Document RAG preparation
    Extract text from documents and convert into tabular format using the Unstructured package. The outputs can then be used downstream for e.g. RAG/other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.). Outputs csvs and files in a 'Document' format commonly used as input to vector databases e.g. ChromaDB, or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
    """)

    with gr.Tab("Partition document"):

        # --- Upload and partition ---
        with gr.Accordion("Upload files - accepts .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.)", open=True):
            in_file = gr.File(label="Choose file", file_count="multiple", height=100)
            in_pdf_partition_strategy = gr.Radio(
                label="PDF partition strategy",
                value="fast",
                choices=["fast", "ocr_only", "hi_res"],
            )

        partition_btn = gr.Button("Partition documents (outputs appear below)", variant='primary')

        # --- Filter / clean / anonymise ---
        with gr.Accordion("Clean, anonymise, or filter text elements", open=False):
            with gr.Accordion("Filter element types from text and information from metadata", open=False):
                element_types_to_filter = gr.Dropdown(
                    value=default_element_types_to_filter,
                    choices=default_element_types_to_filter,
                    multiselect=True,
                    interactive=True,
                    label="Choose element types to exclude from element list",
                )
                meta_keys_to_filter = gr.Dropdown(
                    value=default_meta_keys_to_filter,
                    choices=default_meta_keys_to_filter,
                    multiselect=True,
                    interactive=True,
                    label="Choose metadata keys to filter out",
                )

                filter_meta_btn = gr.Button("Filter elements/metadata", variant='primary')

            with gr.Accordion("Clean/anonymise text", open=False):
                with gr.Row():
                    clean_options = gr.Dropdown(
                        choices=[
                            "Convert bytes to string", "Replace quotes", "Clean non ASCII", "Clean ordered list", "Group paragraphs",
                            "Remove trailing punctuation", "Remove all punctuation", "Clean text", "Remove extra whitespace", "Remove dashes", "Remove bullets",
                            "Make lowercase",
                        ],
                        value=["Clean ordered list", "Group paragraphs", "Clean non ASCII", "Remove extra whitespace", "Remove dashes", "Remove bullets"],
                        label="Clean options",
                        multiselect=True,
                        interactive=True,
                    )

                with gr.Accordion("Clean with custom regex", open=False):
                    gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
                    clean_text = gr.Dropdown(value="No", choices=["Yes", "No"], multiselect=False, label="Remove custom regex.")
                    with gr.Row():
                        custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
                        custom_regex_text = gr.Textbox(label="Custom regex load status")

                with gr.Accordion("Anonymise text", open=False):
                    anonymise_drop = gr.Dropdown(
                        value="No",
                        choices=["Yes", "No"],
                        multiselect=False,
                        label="Anonymise data. Personal details are redacted - not 100% effective. Please check results afterwards!",
                    )
                    with gr.Row():
                        anon_strat = gr.Dropdown(
                            value="redact",
                            choices=["redact", "replace"],
                            multiselect=False,
                            label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)",
                        )
                        anon_entities_drop = gr.Dropdown(
                            value=chosen_redact_entities,
                            choices=full_entity_list,
                            multiselect=True,
                            label="Choose entities to find and anonymise in your open text",
                        )

                unstructured_clean_btn = gr.Button("Clean data", variant='primary')

        # --- Chunking ---
        with gr.Accordion("Chunk text", open=False):
            with gr.Row():
                chunk_within_docs_rad = gr.Radio(
                    label="Chunk within documents",
                    value="No",
                    choices=["Yes", "No"],
                    interactive=True,
                )
                chunking_method_rad = gr.Radio(
                    label="Basic chunking or by title",
                    value="Chunk within title",
                    choices=["Chunk within title", "Basic chunking"],
                    interactive=True,
                )
                multipage_sections_drop = gr.Dropdown(
                    choices=["Yes", "No"],
                    value="Yes",
                    label="Continue chunk over page breaks.",
                    interactive=True,
                )
                overlap_all_drop = gr.Dropdown(
                    choices=["Yes", "No"],
                    value="Yes",
                    label="Overlap over adjacent element text if needed.",
                    interactive=True,
                )
            with gr.Row():
                # Slider defaults come from tools.unstructured_funcs; all three
                # share the 100..max_chunk_length range in steps of 100.
                minimum_chunk_length_slide = gr.Slider(
                    value=minimum_chunk_length,
                    minimum=100,
                    maximum=max_chunk_length,
                    step=100,
                    label="Minimum chunk character length. Chunk will overlap next title if character limit not reached.",
                    interactive=True,
                )
                start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(
                    value=start_new_chunk_after_end_of_this_element_length,
                    minimum=100,
                    maximum=max_chunk_length,
                    step=100,
                    label="'Soft' maximum chunk character length - chunk will continue until end of current element when length reached",
                )
                hard_max_character_length_chunks_slide = gr.Slider(
                    value=hard_max_character_length_chunks,
                    minimum=100,
                    maximum=max_chunk_length,
                    step=100,
                    label="'Hard' maximum chunk character length. Chunk will not be longer than this.",
                    interactive=True,
                )

            chunk_btn = gr.Button("Chunk document(s)", variant='primary')

        # --- Outputs shared by every action above ---
        with gr.Accordion("File outputs", open=True):
            with gr.Row():
                output_summary = gr.Textbox(label="Output summary")
                output_file = gr.File(label="Output file")

    # ---- Event wiring ----

    # Partition the uploaded files, then refresh both filter dropdowns from
    # the resulting elements table and elements.
    partition_btn.click(
        fn=partition_file,
        inputs=[in_file, in_pdf_partition_strategy],
        outputs=[output_summary, elements_state, output_file, output_name_state, elements_table_state],
        api_name="partition",
    ).then(
        fn=update_filter_dropdowns,
        inputs=[elements_table_state, elements_state],
        outputs=[element_types_to_filter, meta_keys_to_filter],
    )

    # Filter elements/metadata in place in the state, then export the result.
    filter_meta_btn.click(
        fn=filter_elements_and_metadata,
        inputs=[elements_state, element_types_to_filter, meta_keys_to_filter],
        outputs=[elements_state],
    ).then(
        fn=export_elements_as_table_to_file,
        inputs=[elements_state, output_name_state],
        outputs=[output_summary, output_file],
    )

    # Load an uploaded custom-regex file into state and report its status.
    custom_regex.upload(
        fn=custom_regex_load,
        inputs=[custom_regex],
        outputs=[custom_regex_text, custom_regex_state],
    )

    # Two-stage clean: clean_elements first, then pre_clean with the custom
    # regex and anonymisation settings.
    unstructured_clean_btn.click(
        fn=clean_elements,
        inputs=[elements_state, clean_options, output_name_state],
        outputs=[elements_state, output_summary, output_file, output_name_state],
    ).then(
        fn=pre_clean,
        inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop],
        outputs=[output_summary, output_file, elements_state, output_name_state],
    )

    # Chunk the current elements using the settings from the chunking panel.
    chunk_btn.click(
        fn=chunk_all_elements,
        inputs=[
            elements_state,
            output_name_state,
            chunking_method_rad,
            minimum_chunk_length_slide,
            start_new_chunk_after_end_of_this_element_length_slide,
            hard_max_character_length_chunks_slide,
            multipage_sections_drop,
            overlap_all_drop,
            chunk_within_docs_rad,
        ],
        outputs=[output_summary, output_file, output_name_state],
    )
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Only start the app when this file is executed as a script: enable the
    # request queue, surface handler errors in the UI, and open a browser tab.
    block.queue().launch(
        show_error=True,
        inbrowser=True,
    )
|
|