Sean Pedrick-Case committed
Commit a21f6e5 · unverified · Parents: fd8dddc, b9301bd

Merge pull request #2 from seanpedrick-case/dev


Various improvements to zero-shot topic modelling/categorisation, varied sentiment and summary options, and more resilient LLM calls.

app.py CHANGED
@@ -3,7 +3,7 @@ import socket
 import spaces
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
-from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
+from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 #from tools.aws_functions import load_data_from_aws

@@ -44,14 +44,20 @@ with app:
 ###
 
 text_output_file_list_state = gr.State([])
+text_output_modify_file_list_state = gr.State([])
 log_files_output_list_state = gr.State([])
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)
+modified_unique_table_change_bool = gr.State(True) # This boolean is used to flag whether a file upload should change just the modified unique table object on the second tab
 
-file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas")
+master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas")
+master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas")
+master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas")
+
+master_modify_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_unique_topics_df_state", visible=False, type="pandas")
+master_modify_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_reference_df_state", visible=False, type="pandas")
+
 
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()

@@ -67,14 +73,15 @@ with app:
 feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
 
 # Summary state objects
-summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
-master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas")
+master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas")
+master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
 summarised_references_markdown = gr.Markdown("", visible=False)
 summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
 latest_summary_completed_num = gr.Number(0, visible=False)
 
-unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
+reference_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
+unique_topics_table_file_name_textbox = gr.Textbox(label="Unique topics data file name textbox", visible=False)
 
 ###
 # UI LAYOUT

@@ -105,17 +112,20 @@ with app:
 in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
 in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
-in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
+in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
 
 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
+candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
+force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
 
 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
 
-extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
+sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
+
+extract_topics_btn = gr.Button("Extract topics", variant="primary")
 
-text_output_summary = gr.Markdown(value="### Language model response will appear here")
-text_output_file = gr.File(height=file_input_height, label="Output files")
+topic_extraction_output_files = gr.File(height=file_input_height, label="Output files")
+display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
 latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)

@@ -129,14 +139,28 @@ with app:
 with gr.Row():
 s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
-with gr.Tab(label="Deduplicate and summarise topics"):
+with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
 gr.Markdown(
 """
-### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
+Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
 """)
+
+with gr.Accordion("Modify existing topics", open = False):
+modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
+
+save_modified_files_button = gr.Button(value="Save modified topic names")
+
 with gr.Accordion("Upload reference data file and unique data files", open = True):
-summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
-summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
+
+### DEDUPLICATION
+deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
 
 with gr.Row():
 merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])

@@ -144,17 +168,21 @@ with app:
 deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
 
 deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
-
-duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+### SUMMARISATION
+summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
 
 summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
 summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
-summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
+summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
 
-with gr.Tab(label="Continue previous topic extraction"):
+with gr.Tab(label="Continue unfinished topic extraction"):
 gr.Markdown(
 """
-### Load in data files from a previous attempt at extracting topics to continue it.
+### Load in output files from a previous topic extraction process and continue topic extraction with new data.
 """)
 
 with gr.Accordion("Upload reference data file and unique data files", open = True):

@@ -170,7 +198,7 @@ with app:
 """)
 
 in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
-view_table_markdown = gr.Markdown(value = "", label="View table")
+view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
 
 with gr.Tab(label="Topic extraction settings"):
 gr.Markdown(

@@ -183,7 +211,7 @@ with app:
 random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
 
 with gr.Accordion("Prompt settings", open = True):
-number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3)
+number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
 system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
 initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
 prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)

@@ -196,7 +224,7 @@ with app:
 
 # Invisible text box to hold the session hash/username just for logging purposes
 session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
-data_file_names_textbox = gr.Textbox(label = "Data file name", value="", visible=False)
+
 estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
 total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)

@@ -220,40 +248,46 @@ with app:
 ###
 
 # Tabular data upload
-in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
-
-extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
-then(load_in_data_file,
-inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
-fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
+
+extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
+success(load_in_data_file,
+inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
+success(fn=extract_topics,
+inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
+outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
 
-# return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
 
 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
-latest_batch_completed.change(fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
-then(fn = reveal_feedback_buttons,
-outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
+# latest_batch_completed.change(fn=extract_topics,
+# inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
+# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+# success(fn = reveal_feedback_buttons,
+# outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
+
+# If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
+modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
+
+# Modify output table with custom topic names
+save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
 
 # When button pressed, deduplicate data
-deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files, log_files_output])
+deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
+success(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, in_excel_sheets, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, summarisation_input_files, log_files_output, summarised_output_markdown], scroll_to_output=True)
 
 # When button pressed, summarise previous data
 summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
-then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
-then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
+success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
+success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
+success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
 
-latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
+latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output], scroll_to_output=True)
 
 # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
 continue_previous_data_files_btn.click(
-load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches]).\
-then(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, data_file_names_textbox])
+load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
 
 ###
 # LOGGING AND ON APP LOAD FUNCTIONS

@@ -264,21 +298,21 @@ with app:
 access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
 access_callback.setup([session_hash_textbox], access_logs_data_folder)
 session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # Log usage when making a query
 usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-usage_callback.setup([session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
+usage_callback.setup([session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
 
-conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
+success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # User submitted feedback
 feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-feedback_callback.setup([data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], feedback_data_folder)
+feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], feedback_data_folder)
 
-data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
+data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], None, preprocess=False).\
+success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 
 in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
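The headline change in the event wiring above is the move from .then() to .success() on Gradio event chains, so each downstream step runs only if the previous step finished without raising an error. A minimal sketch of the pattern (component and function names here are illustrative, not taken from the app):

import gradio as gr

def load_data(path):
    if not path:
        raise gr.Error("No file supplied")  # raising stops a .success() chain
    return f"Loaded {path}"

def run_model(status):
    return status + " -> model finished"

with gr.Blocks() as demo:
    path_box = gr.Textbox(label="File path")
    status_box = gr.Textbox(label="Status")
    result_box = gr.Textbox(label="Result")
    go_btn = gr.Button("Go")

    # .success() fires only when the previous step raised no error,
    # whereas .then() would run the model even after a failed load.
    go_btn.click(load_data, inputs=path_box, outputs=status_box).\
        success(run_model, inputs=status_box, outputs=result_box)

demo.launch()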
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_aws.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
tools/helper_functions.py CHANGED
@@ -15,8 +15,11 @@ def empty_output_vars_extract_topics():
 log_files_output_list_state = []
 conversation_metadata_textbox = ""
 estimated_time_taken_number = 0
+file_data_state = pd.DataFrame()
+reference_data_file_name_textbox = ""
+display_topic_table_markdown = ""
 
-return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number
+return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown
 
 def empty_output_vars_summarise():
 # Empty output objects before summarising files

@@ -106,22 +109,31 @@ def detect_file_type(filename):
 else:
 raise ValueError("Unsupported file type.")
 
-def read_file(filename):
+def read_file(filename:str, sheet:str=""):
 """Read the file based on its detected type."""
 file_type = detect_file_type(filename)
 
 if file_type == 'csv':
 return pd.read_csv(filename, low_memory=False)
 elif file_type == 'xlsx':
-return pd.read_excel(filename)
+if sheet:
+return pd.read_excel(filename, sheet_name=sheet)
+else:
+return pd.read_excel(filename)
 elif file_type == 'parquet':
 return pd.read_parquet(filename)
 
 # Wrap text in each column to the specified max width, including whole words
-def wrap_text(text, max_width=60):
+def wrap_text(text:str, max_width=60, max_text_length=None):
 if not isinstance(text, str):
 return text
 
+# If max_text_length is set, truncate the text and add ellipsis
+if max_text_length and len(text) > max_text_length:
+text = text[:max_text_length] + '...'
+
+text = text.replace('\r\n', '<br>').replace('\n', '<br>')
+
 words = text.split()
 if not words:
 return text
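For reference, hypothetical calls against the two updated helper signatures shown above (the file names are placeholders):

from tools.helper_functions import read_file, wrap_text

df_sheet = read_file("responses.xlsx", sheet="Sheet2")  # read a named Excel sheet
df_first = read_file("responses.xlsx")                  # empty sheet falls back to the default sheet
df_csv = read_file("responses.csv")                     # non-Excel types ignore the sheet argument

# Truncate anything over 200 characters, then wrap; newlines become <br>
# so the text renders inside markdown/HTML tables
wrapped = wrap_text("First line\nSecond line of a long response", max_width=60, max_text_length=200)
print(wrapped)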
tools/llm_api_call.py CHANGED
The diff for this file is too large to render. See raw diff
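The commit message promises "more resilient LLM calls", but since this file's diff is not rendered the exact mechanism is not visible here. Resilience in this context usually means retrying failed API calls with a backoff delay; the sketch below illustrates the general shape only, and none of its names are taken from the repository:

import time

def call_llm_with_retries(call_fn, prompt, max_retries=3, base_wait=5.0):
    """Call call_fn(prompt), retrying with a growing wait on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            return call_fn(prompt)
        except Exception as err:
            if attempt == max_retries:
                raise  # give up after the final attempt
            wait = base_wait * attempt
            print(f"LLM call failed ({err}); retry {attempt}/{max_retries} in {wait}s")
            time.sleep(wait)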
 
tools/prompts.py CHANGED
@@ -5,14 +5,16 @@ initial_table_prompt = """The open text data is shown in the following table tha
 
 Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
 In the first column identify general topics relevant to responses. Create as many general topics as you can.
-In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
-In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
+In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
+{sentiment_choices}.
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column.
-In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
-Do not add any other columns. Do not repeat Subtopics with the same Sentiment. Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
+In the fifth column, write a short summary of the subtopic based on relevant responses - highlight specific issues that appear.
+Do not add any other columns. Do not add any other text to your response.
 
 New table:"""
 
+# Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
+
 prompt2 = ""
 
 prompt3 = ""

@@ -21,6 +23,12 @@ prompt3 = ""
 
 add_existing_topics_system_prompt = system_prompt
 
+force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response references', and 'Summary'.
+In the first column, write 'Not assessed'. In the second column, assign Subtopics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
+
+allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
+In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
+
 add_existing_topics_prompt = """Responses are shown in the following Response table:
 {response_table}
 

@@ -28,16 +36,16 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
 {topics}
 
 Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
-Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
-In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible.
-In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
+{topic_assignment}
+{sentiment_choices}.
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column.
-In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
-Do not add any other columns. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
-Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
+In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
+Do not add any other columns. Do not add any other text to your response.
 
 New table:"""
 
+# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
+
 
 summarise_topic_descriptions_system_prompt = system_prompt
 

@@ -45,11 +53,13 @@ summarise_topic_descriptions_prompt = """Below is a table with number of paragra
 
 '{summaries}'
 
-Your task is to make a consolidated summary of the above text. Return a summary up to two paragraphs long that includes as much detail as possible from the original text. Return only the summary and no other text.
+Your task is to make a consolidated summary of the above text. {summary_format}. Return only the summary and no other text.
 
 Summary:"""
 
 
+## The following didn't work well in testing and so is not currently used
+
 create_general_topics_system_prompt = system_prompt
 
 create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:

@@ -60,6 +70,10 @@ Your task is to create a General Topic name for each Subtopic. The new Topics ta
 New Topics table:"""
 
 
+
+
+
+
 # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
 # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
 # Summarise the following text in less than {length} words: "{text}"\n
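The prompts above now carry {sentiment_choices}, {topic_assignment}, and {summary_format} placeholders that the UI options in app.py (sentiment_checkbox, force_zero_shot_radio, summarise_format_radio) presumably feed at call time. The filling logic lives in tools/llm_api_call.py, whose diff is too large to render, so the sketch below is an assumption about how the formatting might work rather than the app's actual code:

# Assumed wiring between the UI options and the prompt placeholders.
# The dictionary and variable names below are illustrative only.
from tools.prompts import add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt

# Map the sentiment_checkbox value to an instruction for the third column
sentiment_instructions = {
    "Negative, Neutral, or Positive": "In the third column write the sentiment of the Subtopic: Negative, Neutral, or Positive",
    "Negative or Positive": "In the third column write the sentiment of the Subtopic: Negative or Positive",
    "Do not assess sentiment": "In the third column write 'Not assessed'",
}

# force_zero_shot_radio would choose between the two topic-assignment fragments
topic_assignment = force_existing_topics_prompt   # "Yes": only use supplied topics
# topic_assignment = allow_new_topics_prompt      # "No": allow new topics as well

formatted_prompt = add_existing_topics_prompt.format(
    response_table="| Reference | Response |\n| 1 | Example response text |",
    topics="| General Topic | Subtopic |\n| Housing | Parking |",
    topic_assignment=topic_assignment,
    sentiment_choices=sentiment_instructions["Negative, Neutral, or Positive"],
)
print(formatted_prompt)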