Commit 854a758
Parent(s): a6d1841

Topic deduplication/merging now separated from summarisation. Gradio upgrade

Browse files
- Dockerfile +1 -1
- app.py +40 -23
- requirements.txt +1 -1
- requirements_aws.txt +1 -1
- requirements_cpu.txt +1 -1
- tools/chatfuncs.py +1 -1
- tools/llm_api_call.py +192 -76
Dockerfile
CHANGED
@@ -26,7 +26,7 @@ RUN rm requirements_aws.txt
 # Stage 2: Final runtime image
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

-# Install system dependencies.
+# Install system dependencies.
 RUN apt-get update \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

app.py
CHANGED
@@ -3,7 +3,7 @@ import socket
 import spaces
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
-from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
+from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 #from tools.aws_functions import load_data_from_aws

@@ -21,6 +21,7 @@ print("host_name is:", host_name)
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
+file_input_height = 150

 print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)

@@ -47,10 +48,10 @@ with app:
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)

-file_data_state = gr.State(pd.DataFrame())
-master_topic_df_state = gr.State(pd.DataFrame())
-master_reference_df_state = gr.State(pd.DataFrame())
-master_unique_topics_df_state = gr.State(pd.DataFrame())
+file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())

 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()

@@ -66,13 +67,15 @@ with app:
 feedback_s3_logs_loc_state = gr.State(feedback_data_folder)

 # Summary state objects
-summary_reference_table_sample_state = gr.State(pd.DataFrame())
-master_reference_df_revised_summaries_state = gr.State(pd.DataFrame())
-master_unique_topics_df_revised_summaries_state = gr.State(pd.DataFrame())
+summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
+master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
 summarised_references_markdown = gr.Markdown("", visible=False)
 summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
 latest_summary_completed_num = gr.Number(0, visible=False)

+unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
+
 ###
 # UI LAYOUT
 ###

@@ -99,20 +102,20 @@ with app:
 in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

 with gr.Accordion("Upload xlsx or csv file", open = True):
-in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

 in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
 in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)

 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
+candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")

 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

 extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")

 text_output_summary = gr.Markdown(value="### Language model response will appear here")
-text_output_file = gr.File(label="Output files")
+text_output_file = gr.File(height=file_input_height, label="Output files")
 latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)

@@ -126,16 +129,26 @@ with app:
 with gr.Row():
 s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)

-with gr.Tab(label="
+with gr.Tab(label="Deduplicate and summarise topics"):
 gr.Markdown(
 """
 ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
 """)
 with gr.Accordion("Upload reference data file and unique data files", open = True):
-summarisation_in_previous_data_files = gr.File(label="
+summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
+
+with gr.Row():
+merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
+merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
+deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
+
+deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
+
+duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
 summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
-summary_output_files = gr.File(label="Summarised output files", interactive=False)
+summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
 summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")

 with gr.Tab(label="Continue previous topic extraction"):

@@ -145,28 +158,28 @@ with app:
 """)

 with gr.Accordion("Upload reference data file and unique data files", open = True):
-in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
 continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")


-with gr.Tab(label="
+with gr.Tab(label="Topic table viewer"):
 gr.Markdown(
 """
 ### View a 'unique_topic_table' csv file in markdown format.
 """)

-in_view_table = gr.File(label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
+in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
 view_table_markdown = gr.Markdown(value = "", label="View table")

-with gr.Tab(label="
+with gr.Tab(label="Topic extraction settings"):
 gr.Markdown(
 """
 Define settings that affect large language model output.
 """)
 with gr.Accordion("Settings for LLM generation", open = True):
 temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
-batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0)
+batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
 random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

 with gr.Accordion("Prompt settings", open = True):

@@ -178,7 +191,7 @@ with app:
 add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
 add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)

-log_files_output = gr.File(label="Log file output", interactive=False)
+log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
 conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)

 # Invisible text box to hold the session hash/username just for logging purposes

@@ -214,18 +227,22 @@ with app:
 inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
 fn=extract_topics,
 inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
+outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")

 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
 latest_batch_completed.change(fn=extract_topics,
 inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files]).\
+outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
 then(fn = reveal_feedback_buttons,
 outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

+# When button pressed, deduplicate data
+deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
+then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files])
+
 # When button pressed, summarise previous data
 summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
-then(load_in_previous_data_files, inputs=[
+then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
 then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
 then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])

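The event wiring above relies on Gradio's chained events: each .then() step starts only after the previous step finishes, which is how the new "Deduplicate topics" button can load the uploaded files first and deduplicate them second. Below is a minimal sketch of the pattern, not the app's actual code (load_files and deduplicate are hypothetical stand-ins); it also mirrors the commit's trick of using an invisible gr.Dataframe in place of gr.State.

import gradio as gr
import pandas as pd

def load_files(files):
    # Hypothetical stand-in for load_in_previous_data_files
    return pd.DataFrame({"Subtopic": ["Noise", "noise", "Parking"]})

def deduplicate(df):
    # Hypothetical stand-in for deduplicate_topics: case-normalise, then drop duplicates
    df["Subtopic"] = df["Subtopic"].str.lower().str.capitalize()
    return df.drop_duplicates("Subtopic")

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    # Invisible Dataframe used like gr.State, as in the state objects above
    df_state = gr.Dataframe(value=pd.DataFrame(), visible=False, type="pandas")
    out_table = gr.Dataframe()
    btn = gr.Button("Deduplicate topics")
    # Each .then() runs only after the previous step completes
    btn.click(load_files, inputs=[in_files], outputs=[df_state]).\
        then(deduplicate, inputs=[df_state], outputs=[out_table])

demo.launch()
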
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

requirements_aws.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

requirements_cpu.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

tools/chatfuncs.py
CHANGED
@@ -50,7 +50,7 @@ reset: bool = True
 stream: bool = False
 threads: int = threads
 batch_size:int = 256
-context_length:int =
+context_length:int = 16384
 sample = True

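For context, context_length is the local model's context window. A hedged sketch of how a value like 16384 is typically passed through, assuming llama-cpp-python as the local backend (the actual loader in tools/chatfuncs.py is outside this diff):

from llama_cpp import Llama  # assumption: llama-cpp-python backend

context_length: int = 16384  # value set by this commit

llm = Llama(
    model_path="model.gguf",  # hypothetical model path
    n_ctx=context_length,     # context window, in tokens
    n_threads=8,              # analogous to the threads setting above
    n_batch=256,              # matches the batch_size default above
)
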
tools/llm_api_call.py
CHANGED
@@ -34,6 +34,12 @@ timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
 number_of_api_retry_attempts = 5
 max_time_for_loop = 99999
 batch_size_default = 5
+deduplication_threshold = 90
+
+MAX_COMMENT_CHARS = get_or_create_env_var('MAX_COMMENT_CHARS', '14000')
+print(f'The value of MAX_COMMENT_CHARS is {MAX_COMMENT_CHARS}')
+
+max_comment_character_length = int(MAX_COMMENT_CHARS)

 AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
 print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')

@@ -104,7 +110,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
 if 'reference_table' in file.name:
 try:
 reference_file_data, reference_file_name = load_in_file(file)
-print("reference_file_data:", reference_file_data.head(2))
+#print("reference_file_data:", reference_file_data.head(2))
 out_message = out_message + " Reference file load successful"
 except Exception as e:
 out_message = "Could not load reference file data:" + str(e)

@@ -113,7 +119,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
 if 'unique_topics' in file.name:
 try:
 unique_file_data, unique_file_name = load_in_file(file)
-print("unique_topics_file:", unique_file_data.head(2))
+#print("unique_topics_file:", unique_file_data.head(2))
 out_message = out_message + " Unique table file load successful"
 except Exception as e:
 out_message = "Could not load unique table file data:" + str(e)

@@ -132,7 +138,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):

 print(out_message)

-return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name
+return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name

 def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
 """

@@ -188,7 +194,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
 simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
 simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
 simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
-simple_file["Response"] = simple_file["Response"].str.slice(0,
+simple_file["Response"] = simple_file["Response"].str.slice(0, max_comment_character_length) # Truncate responses to the maximum comment character length

 # Remove blank and extremely short responses
 simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\

@@ -988,7 +994,7 @@ def extract_topics(in_data_file,
 # Check if files and text exist
 out_message = "Please enter a data file to summarise."
 print(out_message)
-return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
+return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message

 #model_choice_clean = replace_punctuation_with_underscore(model_choice)

@@ -1087,7 +1093,7 @@ def extract_topics(in_data_file,
 print("summary_out_file_paths:", summary_out_file_paths)

 #final_out_message = '\n'.join(out_message)
-return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time,
+return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths

 if num_batches > 0:

@@ -1108,7 +1114,7 @@ def extract_topics(in_data_file,
 if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
 out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
 print(out_message)
-return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
+return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message

 topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
 topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")

@@ -1440,74 +1446,125 @@ def extract_topics(in_data_file,

 print(final_message_out)

-return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
+return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths

 # SUMMARISATION FUNCTIONS

-def deduplicate_categories(category_series: pd.Series, join_series:pd.Series, threshold: float =
+def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
 """
-Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold
+Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold,
+merging smaller topics into larger topics.

 Parameters:
 category_series (pd.Series): Series containing category names to deduplicate.
-join_series (pd.Series): Additional series used for joining back to original results
+join_series (pd.Series): Additional series used for joining back to original results.
+reference_df (pd.DataFrame): DataFrame containing the reference data to count occurrences.
 threshold (float): Similarity threshold for considering two strings as duplicates.

 Returns:
 pd.DataFrame: DataFrame with columns ['old_category', 'deduplicated_category'].
 """
+# Count occurrences of each category in the reference_df
+category_counts = reference_df['Subtopic'].value_counts().to_dict()
+
 # Initialize the result dictionary
 deduplication_map = {}
-
-#
+
+# First pass: Handle exact matches
+for category in category_series.unique():
+if category in deduplication_map:
+continue
+
+# Find all exact matches
+exact_matches = category_series[category_series.str.lower() == category.lower()].index.tolist()
+if len(exact_matches) > 1:
+# Find the variant with the highest count
+match_counts = {match: category_counts.get(category_series[match], 0) for match in exact_matches}
+most_common = max(match_counts.items(), key=lambda x: x[1])[0]
+most_common_category = category_series[most_common]
+
+# Map all exact matches to the most common variant
+for match in exact_matches:
+deduplication_map[category_series[match]] = most_common_category
+
+# Second pass: Handle fuzzy matches for remaining categories
 for category in category_series.unique():
 # Skip if the category is already processed
 if category in deduplication_map:
 continue

 # Find close matches to the current category, excluding the current category itself
-matches = process.extract(category,
-
-
+matches = process.extract(category,
+[cat for cat in category_series.unique() if cat != category],
+scorer=fuzz.token_set_ratio,
+score_cutoff=threshold)
+
 if matches: # Check if there are any matches
 best_match = max(matches, key=lambda x: x[1]) # Get the match with the highest score
 match, score, _ = best_match # Unpack the best match
-
-
-
+
+# Compare counts to ensure smaller topics merge into larger ones
+if category_counts.get(category, 0) < category_counts.get(match, 0):
+deduplication_map[category] = match # Map the smaller category to the larger one
+else:
+deduplication_map[match] = category # Map the larger category to the smaller one
+else:
+deduplication_map[category] = category # No match found, keep the category as is
+
 # Create the result DataFrame
-
-
-
-
-
+if merge_sentiment == "Yes":
+result_df = pd.DataFrame({
+'old_category': category_series + " | " + join_series,
+'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
+})
+else:
+result_df = pd.DataFrame({
+'old_category': category_series + " | " + join_series,
+'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
+})
+
 return result_df

-def
-
-
-
-
-
+def deduplicate_topics(reference_df,
+unique_topics_df,
+reference_table_file_name:str,
+unique_topics_table_file_name:str,
+merge_sentiment:str= "No",
+merge_general_topics:str="No",
+score_threshold:int=deduplication_threshold,
+deduplicate_topics:str="Yes"):
+'''
+Deduplicate topics based on a reference and unique topics table
+'''
+output_files = []
+
+reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
+unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)

-#
+# Run through this x times to try to get all duplicate topics
 if deduplicate_topics == "Yes":
+for i in range(0, 5):
+#print("Deduplication run:", i)

-
-for i in range(0, 3):
-print("Run:", i)
-# First, combine duplicate topics in reference_df
-reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+#reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)

-
-
-
-deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
-lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], threshold=80)
-).reset_index(drop=True) # Reset index after groupby
+if merge_sentiment == "No":
+# First, combine duplicate topics in reference_df
+reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+reference_df_unique = reference_df.drop_duplicates("old_category")
+
+# Deduplicate categories within each sentiment group
+deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
+lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], reference_df, threshold=score_threshold)
+).reset_index(drop=True) # Reset index after groupby
+else:
+# Deduplicate categories by subtopic name only
+# First, combine duplicate topics in reference_df
+reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+reference_df_unique = reference_df.drop_duplicates("old_category")
+
+deduplicated_topic_map_df = deduplicate_categories(reference_df_unique["Subtopic"], reference_df_unique["Sentiment"], reference_df, merge_sentiment=merge_sentiment, threshold=score_threshold).reset_index(drop=True)

 if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
 # Check if 'deduplicated_category' contains any values

@@ -1515,10 +1572,11 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,

 else:
 # Join deduplicated columns back to original df
+deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
 # Remove rows where 'deduplicated_category' is blank or NaN
-deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()),
+deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]

-
+deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)

 reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")

@@ -1541,9 +1599,65 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
 reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
 reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()

+if merge_general_topics == "Yes":
+# Replace General topic names for each Subtopic with that for the Subtopic with the most responses
+# Step 1: Count the number of occurrences for each General Topic and Subtopic combination
+count_df = reference_df.groupby(['Subtopic', 'General Topic']).size().reset_index(name='Count')
+
+# Step 2: Find the General Topic with the maximum count for each Subtopic
+max_general_topic = count_df.loc[count_df.groupby('Subtopic')['Count'].idxmax()]
+
+# Step 3: Map the General Topic back to the original DataFrame
+reference_df = reference_df.merge(max_general_topic[['Subtopic', 'General Topic']], on='Subtopic', suffixes=('', '_max'), how='left')
+
+reference_df['General Topic'] = reference_df["General Topic_max"].combine_first(reference_df["General Topic"])
+
+if merge_sentiment == "Yes":
+# Step 1: Count the number of occurrences for each General Topic and Subtopic combination
+count_df = reference_df.groupby(['Subtopic', 'Sentiment']).size().reset_index(name='Count')
+
+# Step 2: Determine the number of unique Sentiment values for each Subtopic
+unique_sentiments = count_df.groupby('Subtopic')['Sentiment'].nunique().reset_index(name='UniqueCount')
+
+# Step 3: Update Sentiment to 'Mixed' where there is more than one unique sentiment
+reference_df = reference_df.merge(unique_sentiments, on='Subtopic', how='left')
+reference_df['Sentiment'] = reference_df.apply(
+lambda row: 'Mixed' if row['UniqueCount'] > 1 else row['Sentiment'],
+axis=1
+)
+
+# Clean up the DataFrame by dropping the UniqueCount column
+reference_df.drop(columns=['UniqueCount'], inplace=True)
+
+reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
+
 # Remake unique_topics_df based on new reference_df
 unique_topics_df = create_unique_table_df_from_reference_table(reference_df)

+reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
+unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
+
+reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
+unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
+reference_df.to_csv(reference_file_path, index = None)
+unique_topics_df.to_csv(unique_topics_file_path, index=None)
+
+output_files.append(reference_file_path)
+output_files.append(unique_topics_file_path)
+
+return reference_df, unique_topics_df, output_files
+
+def sample_reference_table_summaries(reference_df:pd.DataFrame,
+unique_topics_df:pd.DataFrame,
+random_seed:int,
+no_of_sampled_summaries:int=150):
+
+'''
+Sample x number of summaries from which to produce summaries, so that the input token length is not too long.
+'''
+
+all_summaries = pd.DataFrame()
+output_files = []

 reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])

@@ -1629,6 +1743,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 out_metadata_str:str = "",
 output_files:list = [],
 summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
+do_summaries="Yes",
 progress=gr.Progress(track_tqdm=True)):
 '''
 Create better summaries of the raw batch-level summaries created in the first run of the model.

@@ -1711,39 +1826,40 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
 summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")

-summarised_outputs.append(summarised_output)
-out_metadata.extend(metadata)
-out_metadata_str = '. '.join(out_metadata)
-
-latest_summary_completed += 1
-
-# Check if beyond max time allowed for processing and break if necessary
-toc = time.perf_counter()
-time_taken = tic - toc
+if do_summaries == "Yes":
+for summary_no in summary_loop:
+
+print("Current summary number is:", summary_no)
+
+summary_text = all_summaries[summary_no]
+#print("summary_text:", summary_text)
+formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
+
+try:
+response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
+summarised_output = response
+summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
+summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
+summarised_output = summarised_output.strip()
+except Exception as e:
+print(e)
+summarised_output = ""
+
+summarised_outputs.append(summarised_output)
+out_metadata.extend(metadata)
+out_metadata_str = '. '.join(out_metadata)
+
+latest_summary_completed += 1
+
+# Check if beyond max time allowed for processing and break if necessary
+toc = time.perf_counter()
+time_taken = tic - toc
+
+if time_taken > max_time_for_loop:
+print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
+summary_loop.close()
+tqdm._instances.clear()
+break

 # If all summaries completed
 if latest_summary_completed >= length_all_summaries: