import os

# By default, tldextract will try to pull its public suffix list from the internet. I have instead downloaded this file locally so that an internet connection is not required.
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, custom_regex_load
from tools.unstructured_funcs import partition_file, clean_elements, export_elements_as_table_to_file, filter_elements_and_metadata, chunk_all_elements, minimum_chunk_length, start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks, multipage_sections, overlap_all
#from tools.aws_functions import load_data_from_aws
from tools.clean_funcs import pre_clean, full_entity_list, chosen_redact_entities
import gradio as gr
import pandas as pd
import numpy as np
from typing import Type, List
from unstructured.documents.elements import Element

# Creating an alias for pandas DataFrame using Type
PandasDataFrame = Type[pd.DataFrame]

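# Add the bundled Tesseract (OCR) and Poppler (PDF rendering) binaries to the PATH so that the
# 'ocr_only' and 'hi_res' PDF partition strategies can run without a system-wide install.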
add_folder_to_path("_internal/tesseract/")
add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")

ensure_output_folder_exists()

language = 'en'
default_meta_keys_to_filter=["file_directory", "filetype"]
default_element_types_to_filter = ['UncategorizedText', 'Header']

max_chunk_length = 25000 # characters


def get_element_metadata(elements, prefix=""):
    """Recursively retrieves metadata key names from elements as dotted paths (e.g. 'coordinates.points')."""
    result = []

    for element in elements:
        # Elements carry their metadata on a .metadata object; recursion passes nested dicts back in directly
        if hasattr(element, 'metadata') and isinstance(element.metadata.__dict__, dict):
            metadata = element.metadata.__dict__
        elif isinstance(element, dict):
            metadata = element
        else:
            print(f"Warning: Element {element} does not have a metadata dictionary.")  # Handle elements without metadata gracefully
            continue

        for key, value in metadata.items():  # Iterate over key-value pairs in the metadata dictionary
            if isinstance(value, dict):  # Nested metadata: recurse with the key appended to the prefix
                for nested_key in get_element_metadata([value], f"{prefix}{key}."):
                    if nested_key not in result:
                        result.append(nested_key)
            else:  # Leaf value: record its dotted key path, avoiding duplicates
                meta_element_to_add = f"{prefix}{key}"
                if meta_element_to_add not in result:
                    result.append(meta_element_to_add)

    return result
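# For example, an element whose metadata __dict__ is
# {'filename': 'report.pdf', 'coordinates': {'points': [...]}} (hypothetical values)
# would yield ['filename', 'coordinates.points'].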

def update_filter_dropdowns(elements_table:PandasDataFrame, elements:List[Element]):
    """Populate the element type and metadata key filter dropdowns from the partitioned elements."""
    if 'text' in elements_table.columns:
        elements_table_filt = elements_table.drop('text', axis=1)
    else:
        elements_table_filt = elements_table

    # Error handling for missing 'type' column
    if 'type' not in elements_table_filt.columns:
        print("Warning: 'type' column not found in the DataFrame.")
        return gr.Dropdown(label="Element types (not available)"), gr.Dropdown(label="Metadata properties (not available)")

    element_types_to_filter = elements_table_filt['type'].unique().tolist()
    meta_keys_to_filter = get_element_metadata(elements)

    #print("Element types:", element_types_to_filter)
    #print("Meta keys:", meta_keys_to_filter)

    element_types_to_filter_shortlist = [x for x in default_element_types_to_filter if x in element_types_to_filter]
    meta_keys_to_filter_shortlist = [x for x in default_meta_keys_to_filter if x in meta_keys_to_filter]

    return gr.Dropdown(
        value=element_types_to_filter_shortlist, choices=element_types_to_filter, multiselect=True, interactive=True, label="Choose element types to exclude from element list"
    ), gr.Dropdown(
        value=meta_keys_to_filter_shortlist, choices=meta_keys_to_filter, multiselect=True, interactive=True, label="Choose metadata keys to filter out"
    )
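# Returning fresh gr.Dropdown(...) objects from an event handler is the standard gradio
# pattern for updating a component's choices and value after an event fires.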

# Create the gradio interface

block = gr.Blocks(theme = gr.themes.Base())

with block:

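    # gr.State components hold per-session values (parsed elements, tables, output file names)
    # that are passed between the event handlers wired up at the bottom of this script.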
    elements_state = gr.State([])
    elements_table_state = gr.State(pd.DataFrame())
    metadata_keys_state = gr.State([])
    output_image_files_state = gr.State([])
    output_file_list_state = gr.State([])
    in_colnames_state = gr.State("text")

    data_state = gr.State(pd.DataFrame())
    embeddings_state = gr.State(np.array([]))
    embeddings_type_state = gr.State("")
    topic_model_state = gr.State()
    assigned_topics_state = gr.State([])
    custom_regex_state = gr.State(pd.DataFrame())
    docs_state = gr.State()
    data_file_name_no_ext_state = gr.State()
    label_list_state = gr.State(pd.DataFrame())
    output_name_state = gr.State("")

    gr.Markdown(
    """
    # Document RAG preparation
    Extract text from documents and convert it into tabular format using the Unstructured package. The outputs can then be used downstream, e.g. in RAG or other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), and code files (.py, .js, etc.). Outputs CSVs, and files in a 'Document' format commonly used as input to vector databases (e.g. ChromaDB) or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
    """)

    with gr.Tab("Partition document"):
    
        with gr.Accordion("Upload files - accepts .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md., .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx),  or code files (.py, .js, etc.)", open = True):
            in_file = gr.File(label="Choose file", file_count= "multiple", height=100)
            in_pdf_partition_strategy = gr.Radio(label="PDF partition strategy", value = "fast", choices=["fast", "ocr_only", "hi_res"])
        
        partition_btn = gr.Button("Partition documents (outputs appear below)", variant='primary')

        with gr.Accordion("Clean, anonymise, or filter text elements", open = False):
            with gr.Accordion("Filter element types from text and information from metadata", open = False):
                element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label = "Choose element types to exclude from element list")
                meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label = "Choose metadata keys to filter out")                

                filter_meta_btn = gr.Button("Filter elements/metadata", variant='primary')

            with gr.Accordion("Clean/anonymise text", open = False):
                with gr.Row():
                    clean_options = gr.Dropdown(choices=["Convert bytes to string", "Replace quotes", "Clean non ASCII", "Clean ordered list", "Group paragraphs",
                        "Remove trailing punctuation", "Remove all punctuation", "Clean text", "Remove extra whitespace", "Remove dashes", "Remove bullets",
                        "Make lowercase"],
                        value=["Clean ordered list", "Group paragraphs", "Clean non ASCII", "Remove extra whitespace", "Remove dashes", "Remove bullets"],
                        label="Clean options", multiselect=True, interactive=True)

                with gr.Accordion("Clean with custom regex", open = False):
                    gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
                    clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove custom regex.")
                    with gr.Row():
                        custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
                        custom_regex_text = gr.Textbox(label="Custom regex load status")

                with gr.Accordion("Anonymise text", open = False):
                    anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data. Personal details are redacted - not 100% effective. Please check results afterwards!")
                    with gr.Row():
                        anon_strat = gr.Dropdown(value = "redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
                        anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")                        

                unstructured_clean_btn = gr.Button("Clean data", variant='primary')       
                
        with gr.Accordion("Chunk text", open = False):
            with gr.Row():
                chunk_within_docs_rad = gr.Radio(label="Chunk within documents", value = "No", choices = ["Yes", "No"], interactive=True)
                chunking_method_rad = gr.Radio(label="Basic chunking or by title", value = "Chunk within title", choices = ["Chunk within title", "Basic chunking"], interactive=True)
                multipage_sections_drop = gr.Dropdown(choices=["Yes", "No"], value = "Yes", label = "Continue chunk over page breaks", interactive=True)
                overlap_all_drop = gr.Dropdown(choices=["Yes", "No"], value = "Yes", label="Overlap with adjacent element text if needed", interactive=True)
            with gr.Row():
                minimum_chunk_length_slide = gr.Slider(value = minimum_chunk_length, minimum=100, maximum=max_chunk_length, step = 100, label= "Minimum chunk character length. A chunk below this length will continue past the next title.", interactive=True)
                start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value = start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=max_chunk_length, step = 100, label = "'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
                hard_max_character_length_chunks_slide = gr.Slider(value = hard_max_character_length_chunks, minimum=100, maximum=max_chunk_length, step = 100, label = "'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)

            chunk_btn = gr.Button("Chunk document(s)", variant='primary')

        # Save chunked data to file
        with gr.Accordion("File outputs", open = True):
            with gr.Row():
                output_summary = gr.Textbox(label="Output summary")
                output_file = gr.File(label="Output file")

    # AWS functions not yet implemented in this app
    # with gr.Tab(label="AWS data load"):
    #     with gr.Accordion(label = "AWS data access", open = True):
    #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
    #         with gr.Row():
    #             in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
    #             load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
                
    #         aws_log_box = gr.Textbox(label="AWS data load status")
    
    # Partition data, then Update filter dropdowns from loaded data
    partition_btn.click(fn = partition_file, inputs=[in_file, in_pdf_partition_strategy],
                    outputs=[output_summary, elements_state, output_file, output_name_state, elements_table_state], api_name="partition").\
                    then(fn = update_filter_dropdowns, inputs=[elements_table_state, elements_state], outputs=[element_types_to_filter, meta_keys_to_filter])

    # Clean data
    ## Filter metadata
    
    filter_meta_btn.click(fn=filter_elements_and_metadata, inputs=[elements_state, element_types_to_filter, meta_keys_to_filter], outputs=[elements_state]).\
    then(fn=export_elements_as_table_to_file, inputs=[elements_state, output_name_state], outputs=[output_summary, output_file])

    ## General text clean and anonymisation

    ### Custom regex load
    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])

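    ### Clean elements, then apply custom regex removal and optional anonymisation via pre_clean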
    unstructured_clean_btn.click(fn=clean_elements, inputs=[elements_state, clean_options, output_name_state], outputs=[elements_state, output_summary, output_file, output_name_state]).\
    then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])

    ## Chunk data
    chunk_btn.click(fn = chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop, chunk_within_docs_rad], outputs=[output_summary, output_file, output_name_state])
    
    # Loading AWS data - not yet implemented in this app
    # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
    
# Simple run
if __name__ == "__main__":
    block.queue().launch(show_error=True, inbrowser=True)
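    # If hosting rather than running locally, gradio's standard launch parameters should work here,
    # e.g. block.queue().launch(server_name="0.0.0.0", server_port=7860) to bind a fixed address and port.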