seanpedrickcase committed
Commit 854a758 · 1 parent: a6d1841

Topic deduplication/merging is now separated from summarisation. Gradio upgraded to 5.12.0.

Dockerfile CHANGED
@@ -26,7 +26,7 @@ RUN rm requirements_aws.txt
26
  # Stage 2: Final runtime image
27
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
28
 
29
- # Install system dependencies. Need to specify -y for poppler to get it to install
30
  RUN apt-get update \
31
  && apt-get clean \
32
  && rm -rf /var/lib/apt/lists/*
 
26
  # Stage 2: Final runtime image
27
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
28
 
29
+ # Install system dependencies.
30
  RUN apt-get update \
31
  && apt-get clean \
32
  && rm -rf /var/lib/apt/lists/*
app.py CHANGED
@@ -3,7 +3,7 @@ import socket
3
  import spaces
4
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
5
  from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
- from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
  #from tools.aws_functions import load_data_from_aws
@@ -21,6 +21,7 @@ print("host_name is:", host_name)
21
  access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
22
  feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
23
  usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
 
24
 
25
  print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)
26
 
@@ -47,10 +48,10 @@ with app:
47
  first_loop_state = gr.State(True)
48
  second_loop_state = gr.State(False)
49
 
50
- file_data_state = gr.State(pd.DataFrame())
51
- master_topic_df_state = gr.State(pd.DataFrame())
52
- master_reference_df_state = gr.State(pd.DataFrame())
53
- master_unique_topics_df_state = gr.State(pd.DataFrame())
54
 
55
  session_hash_state = gr.State()
56
  s3_output_folder_state = gr.State()
@@ -66,13 +67,15 @@ with app:
66
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
67
 
68
  # Summary state objects
69
- summary_reference_table_sample_state = gr.State(pd.DataFrame())
70
- master_reference_df_revised_summaries_state = gr.State(pd.DataFrame())
71
- master_unique_topics_df_revised_summaries_state = gr.State(pd.DataFrame())
72
  summarised_references_markdown = gr.Markdown("", visible=False)
73
  summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
74
  latest_summary_completed_num = gr.Number(0, visible=False)
75
 
 
 
76
  ###
77
  # UI LAYOUT
78
  ###
@@ -99,20 +102,20 @@ with app:
99
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
100
 
101
  with gr.Accordion("Upload xlsx or csv file", open = True):
102
- in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
103
 
104
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
105
  in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
106
 
107
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
108
- candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
109
 
110
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
111
 
112
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
113
 
114
  text_output_summary = gr.Markdown(value="### Language model response will appear here")
115
- text_output_file = gr.File(label="Output files")
116
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
117
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
118
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
@@ -126,16 +129,26 @@ with app:
126
  with gr.Row():
127
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
128
 
129
- with gr.Tab(label="Summarise topic outputs"):
130
  gr.Markdown(
131
  """
132
  ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
133
  """)
134
  with gr.Accordion("Upload reference data file and unique data files", open = True):
135
- summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
136
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
137
  summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
138
- summary_output_files = gr.File(label="Summarised output files", interactive=False)
139
  summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
140
 
141
  with gr.Tab(label="Continue previous topic extraction"):
@@ -145,28 +158,28 @@ with app:
145
  """)
146
 
147
  with gr.Accordion("Upload reference data file and unique data files", open = True):
148
- in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
149
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
150
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
151
 
152
 
153
- with gr.Tab(label="View output topics table"):
154
  gr.Markdown(
155
  """
156
  ### View a 'unique_topic_table' csv file in markdown format.
157
  """)
158
 
159
- in_view_table = gr.File(label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
160
  view_table_markdown = gr.Markdown(value = "", label="View table")
161
 
162
- with gr.Tab(label="LLM settings"):
163
  gr.Markdown(
164
  """
165
  Define settings that affect large language model output.
166
  """)
167
  with gr.Accordion("Settings for LLM generation", open = True):
168
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
169
- batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0)
170
  random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
171
 
172
  with gr.Accordion("Prompt settings", open = True):
@@ -178,7 +191,7 @@ with app:
178
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
179
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
180
 
181
- log_files_output = gr.File(label="Log file output", interactive=False)
182
  conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)
183
 
184
  # Invisible text box to hold the session hash/username just for logging purposes
@@ -214,18 +227,22 @@ with app:
214
  inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
215
  fn=extract_topics,
216
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
217
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
218
 
219
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
220
  latest_batch_completed.change(fn=extract_topics,
221
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
222
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files]).\
223
  then(fn = reveal_feedback_buttons,
224
  outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
225
 
 
 
 
 
226
  # When button pressed, summarise previous data
227
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
228
- then(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox]).\
229
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
230
  then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
231
 
 
3
  import spaces
4
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
5
  from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
+ from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
  #from tools.aws_functions import load_data_from_aws
 
21
  access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
22
  feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
23
  usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
24
+ file_input_height = 150
25
 
26
  print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)
27
 
 
48
  first_loop_state = gr.State(True)
49
  second_loop_state = gr.State(False)
50
 
51
+ file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
52
+ master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
53
+ master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
54
+ master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
55
 
56
  session_hash_state = gr.State()
57
  s3_output_folder_state = gr.State()
 
67
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
68
 
69
  # Summary state objects
70
+ summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
71
+ master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
72
+ master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  summarised_references_markdown = gr.Markdown("", visible=False)
74
  summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
75
  latest_summary_completed_num = gr.Number(0, visible=False)
76
 
77
+ unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
78
+
79
  ###
80
  # UI LAYOUT
81
  ###
 
102
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
103
 
104
  with gr.Accordion("Upload xlsx or csv file", open = True):
105
+ in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
106
 
107
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
108
  in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
109
 
110
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
111
+ candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
112
 
113
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
114
 
115
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
116
 
117
  text_output_summary = gr.Markdown(value="### Language model response will appear here")
118
+ text_output_file = gr.File(height=file_input_height, label="Output files")
119
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
120
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
121
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 
129
  with gr.Row():
130
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
131
 
132
+ with gr.Tab(label="Deduplicate and summarise topics"):
133
  gr.Markdown(
134
  """
135
  ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
136
  """)
137
  with gr.Accordion("Upload reference data file and unique data files", open = True):
138
+ summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
139
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
140
+
141
+ with gr.Row():
142
+ merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
143
+ merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
144
+ deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
145
+
146
+ deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
147
+
148
+ duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
149
+
150
  summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
151
+ summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
152
  summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
153
 
154
  with gr.Tab(label="Continue previous topic extraction"):
 
158
  """)
159
 
160
  with gr.Accordion("Upload reference data file and unique data files", open = True):
161
+ in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
162
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
163
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
164
 
165
 
166
+ with gr.Tab(label="Topic table viewer"):
167
  gr.Markdown(
168
  """
169
  ### View a 'unique_topic_table' csv file in markdown format.
170
  """)
171
 
172
+ in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
173
  view_table_markdown = gr.Markdown(value = "", label="View table")
174
 
175
+ with gr.Tab(label="Topic extraction settings"):
176
  gr.Markdown(
177
  """
178
  Define settings that affect large language model output.
179
  """)
180
  with gr.Accordion("Settings for LLM generation", open = True):
181
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
182
+ batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
183
  random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
184
 
185
  with gr.Accordion("Prompt settings", open = True):
 
191
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
192
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
193
 
194
+ log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
195
  conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)
196
 
197
  # Invisible text box to hold the session hash/username just for logging purposes
 
227
  inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
228
  fn=extract_topics,
229
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
230
+ outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")
231
 
232
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
233
  latest_batch_completed.change(fn=extract_topics,
234
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
235
+ outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
236
  then(fn = reveal_feedback_buttons,
237
  outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
238
 
239
+ # When button pressed, deduplicate data
240
+ deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
241
+ then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files])
242
+
243
  # When button pressed, summarise previous data
244
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
245
+ then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
246
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
247
  then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
248
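
The app.py changes above replace the `gr.State(pd.DataFrame())` holders with hidden `gr.Dataframe` components and wire a new "Deduplicate topics" button ahead of summarisation via chained `.click(...).then(...)` events. Below is a minimal sketch of that pattern, not the app itself: the callbacks are stand-ins and the component names are illustrative.

```python
import gradio as gr
import pandas as pd

def load_files(_files):
    # Stand-in for load_in_previous_data_files: return a pandas DataFrame
    return pd.DataFrame({"Subtopic": ["Parking", "parking", "Noise"]})

def deduplicate(df: pd.DataFrame):
    # Stand-in for deduplicate_topics: drop case-insensitive duplicate subtopics
    return df.loc[~df["Subtopic"].str.lower().duplicated()]

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", height=150, label="Upload files to deduplicate topics")
    # Hidden Dataframe used as state, mirroring the commit's visible=False, type="pandas" components
    df_state = gr.Dataframe(value=pd.DataFrame(), visible=False, type="pandas")
    dedup_btn = gr.Button("Deduplicate topics", variant="primary")
    out_table = gr.Dataframe(label="Deduplicated topics")

    # Chain the steps: load first, then deduplicate, as the commit does with .click(...).then(...)
    dedup_btn.click(load_files, inputs=[in_files], outputs=[df_state]).\
        then(deduplicate, inputs=[df_state], outputs=[out_table])

demo.launch()
```

Using a hidden `gr.Dataframe` instead of `gr.State` keeps the intermediate tables addressable as named components, which is what lets the deduplication and summarisation chains share `master_reference_df_state` and `master_unique_topics_df_state`.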
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.8.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
1
  pandas==2.2.3
2
+ gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
requirements_aws.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.8.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
1
  pandas==2.2.3
2
+ gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
requirements_cpu.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.6.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
1
  pandas==2.2.3
2
+ gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
tools/chatfuncs.py CHANGED
@@ -50,7 +50,7 @@ reset: bool = True
50
  stream: bool = False
51
  threads: int = threads
52
  batch_size:int = 256
53
- context_length:int = 12288
54
  sample = True
55
 
56
 
 
50
  stream: bool = False
51
  threads: int = threads
52
  batch_size:int = 256
53
+ context_length:int = 16384
54
  sample = True
55
 
56
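
The local-model context window in chatfuncs.py rises from 12288 to 16384 tokens, and llm_api_call.py below caps each response at `MAX_COMMENT_CHARS` (default 14,000 characters). As a rough, hedged illustration of how those two limits interact when batching responses (the 4-characters-per-token ratio is an approximation, not something the repo asserts):

```python
context_length = 16384          # new value in this commit (was 12288)
approx_chars_per_token = 4      # rough heuristic for English text

def fits_in_context(prompt: str, reserved_output_tokens: int = 1024) -> bool:
    """Crude check that a prompt leaves room for the model's reply."""
    est_prompt_tokens = len(prompt) // approx_chars_per_token
    return est_prompt_tokens + reserved_output_tokens <= context_length

# Example: a batch of 5 responses, each at the 14,000-character cap
batch = ["x" * 14_000] * 5
print(fits_in_context("\n".join(batch)))  # False: even the larger window would overflow
```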
 
tools/llm_api_call.py CHANGED
@@ -34,6 +34,12 @@ timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API c
34
  number_of_api_retry_attempts = 5
35
  max_time_for_loop = 99999
36
  batch_size_default = 5
37
 
38
  AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
39
  print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
@@ -104,7 +110,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
104
  if 'reference_table' in file.name:
105
  try:
106
  reference_file_data, reference_file_name = load_in_file(file)
107
- print("reference_file_data:", reference_file_data.head(2))
108
  out_message = out_message + " Reference file load successful"
109
  except Exception as e:
110
  out_message = "Could not load reference file data:" + str(e)
@@ -113,7 +119,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
113
  if 'unique_topics' in file.name:
114
  try:
115
  unique_file_data, unique_file_name = load_in_file(file)
116
- print("unique_topics_file:", unique_file_data.head(2))
117
  out_message = out_message + " Unique table file load successful"
118
  except Exception as e:
119
  out_message = "Could not load unique table file data:" + str(e)
@@ -132,7 +138,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
132
 
133
  print(out_message)
134
 
135
- return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name
136
 
137
  def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
138
  """
@@ -188,7 +194,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
188
  simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
189
  simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
190
  simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
191
- simple_file["Response"] = simple_file["Response"].str.slice(0, 2500) # Maximum 1,500 character responses
192
 
193
  # Remove blank and extremely short responses
194
  simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\
@@ -988,7 +994,7 @@ def extract_topics(in_data_file,
988
  # Check if files and text exist
989
  out_message = "Please enter a data file to summarise."
990
  print(out_message)
991
- return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
992
 
993
 
994
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
@@ -1087,7 +1093,7 @@ def extract_topics(in_data_file,
1087
  print("summary_out_file_paths:", summary_out_file_paths)
1088
 
1089
  #final_out_message = '\n'.join(out_message)
1090
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, summary_out_file_paths
1091
 
1092
 
1093
  if num_batches > 0:
@@ -1108,7 +1114,7 @@ def extract_topics(in_data_file,
1108
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1109
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1110
  print(out_message)
1111
- return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
1112
 
1113
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1114
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
@@ -1440,74 +1446,125 @@ def extract_topics(in_data_file,
1440
 
1441
  print(final_message_out)
1442
 
1443
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
1444
 
1445
  # SUMMARISATION FUNCTIONS
1446
 
1447
- def deduplicate_categories(category_series: pd.Series, join_series:pd.Series, threshold: float = 80) -> pd.DataFrame:
1448
  """
1449
- Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold.
1450
-
 
1451
  Parameters:
1452
  category_series (pd.Series): Series containing category names to deduplicate.
1453
- join_series (pd.Series): Additional series used for joining back to original results
 
1454
  threshold (float): Similarity threshold for considering two strings as duplicates.
1455
-
1456
  Returns:
1457
  pd.DataFrame: DataFrame with columns ['old_category', 'deduplicated_category'].
1458
  """
 
 
 
1459
  # Initialize the result dictionary
1460
  deduplication_map = {}
1461
-
1462
- # Iterate through each category in the series
1463
  for category in category_series.unique():
1464
  # Skip if the category is already processed
1465
  if category in deduplication_map:
1466
  continue
1467
 
1468
  # Find close matches to the current category, excluding the current category itself
1469
- matches = process.extract(category, [cat for cat in category_series.unique() if cat != category], scorer=fuzz.token_set_ratio, score_cutoff=threshold)
1470
-
1471
- # Select the match with the highest score
 
 
1472
  if matches: # Check if there are any matches
1473
  best_match = max(matches, key=lambda x: x[1]) # Get the match with the highest score
1474
  match, score, _ = best_match # Unpack the best match
1475
- #print("Best match:", match, "score:", score)
1476
- deduplication_map[match] = category # Map the best match to the current category
1477
-
 
 
 
 
 
 
1478
  # Create the result DataFrame
1479
- result_df = pd.DataFrame({
1480
- 'old_category': category_series + " | " + join_series,
1481
- 'deduplicated_category': category_series.map(deduplication_map)
1482
- })
1483
-
 
 
 
 
 
 
1484
  return result_df
1485
 
1486
- def sample_reference_table_summaries(reference_df:pd.DataFrame,
1487
- unique_topics_df:pd.DataFrame,
1488
- random_seed:int,
1489
- deduplicate_topics:str="Yes",
1490
- no_of_sampled_summaries:int=150):
1491
-
1492
- all_summaries = pd.DataFrame()
1493
 
1494
- # Remove duplicate topics
1495
  if deduplicate_topics == "Yes":
 
 
 
1496
 
1497
- # Run through this three times to try to get all duplicate topics
1498
- for i in range(0, 3):
1499
- print("Run:", i)
1500
- # First, combine duplicate topics in reference_df
1501
- reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1502
 
1503
- reference_df_unique = reference_df.drop_duplicates("old_category")
 
 
 
1504
 
1505
- #reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
1506
 
1507
- # Deduplicate categories within each sentiment group
1508
- deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
1509
- lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], threshold=80)
1510
- ).reset_index(drop=True) # Reset index after groupby
1511
 
1512
  if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
1513
  # Check if 'deduplicated_category' contains any values
@@ -1515,10 +1572,11 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1515
 
1516
  else:
1517
  # Join deduplicated columns back to original df
 
1518
  # Remove rows where 'deduplicated_category' is blank or NaN
1519
- deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), :]
1520
 
1521
- #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
1522
 
1523
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
1524
 
@@ -1541,9 +1599,65 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1541
  reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
1542
  reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
1543
1544
  # Remake unique_topics_df based on new reference_df
1545
  unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
1546
1547
 
1548
  reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
1549
 
@@ -1629,6 +1743,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1629
  out_metadata_str:str = "",
1630
  output_files:list = [],
1631
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
 
1632
  progress=gr.Progress(track_tqdm=True)):
1633
  '''
1634
  Create better summaries of the raw batch-level summaries created in the first run of the model.
@@ -1711,39 +1826,40 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1711
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
1712
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
1713
 
1714
- for summary_no in summary_loop:
 
1715
 
1716
- print("Current summary number is:", summary_no)
1717
 
1718
- summary_text = all_summaries[summary_no]
1719
- #print("summary_text:", summary_text)
1720
- formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
1721
 
1722
- try:
1723
- response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
1724
- summarised_output = response
1725
- summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
1726
- summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
1727
- summarised_output = summarised_output.strip()
1728
- except Exception as e:
1729
- print(e)
1730
- summarised_output = ""
1731
-
1732
- summarised_outputs.append(summarised_output)
1733
- out_metadata.extend(metadata)
1734
- out_metadata_str = '. '.join(out_metadata)
1735
-
1736
- latest_summary_completed += 1
1737
-
1738
- # Check if beyond max time allowed for processing and break if necessary
1739
- toc = time.perf_counter()
1740
- time_taken = tic - toc
1741
 
1742
- if time_taken > max_time_for_loop:
1743
- print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
1744
- summary_loop.close()
1745
- tqdm._instances.clear()
1746
- break
1747
 
1748
  # If all summaries completed
1749
  if latest_summary_completed >= length_all_summaries:
 
34
  number_of_api_retry_attempts = 5
35
  max_time_for_loop = 99999
36
  batch_size_default = 5
37
+ deduplication_threshold = 90
38
+
39
+ MAX_COMMENT_CHARS = get_or_create_env_var('MAX_COMMENT_CHARS', '14000')
40
+ print(f'The value of MAX_COMMENT_CHARS is {MAX_COMMENT_CHARS}')
41
+
42
+ max_comment_character_length = int(MAX_COMMENT_CHARS)
43
 
44
  AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
45
  print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
 
110
  if 'reference_table' in file.name:
111
  try:
112
  reference_file_data, reference_file_name = load_in_file(file)
113
+ #print("reference_file_data:", reference_file_data.head(2))
114
  out_message = out_message + " Reference file load successful"
115
  except Exception as e:
116
  out_message = "Could not load reference file data:" + str(e)
 
119
  if 'unique_topics' in file.name:
120
  try:
121
  unique_file_data, unique_file_name = load_in_file(file)
122
+ #print("unique_topics_file:", unique_file_data.head(2))
123
  out_message = out_message + " Unique table file load successful"
124
  except Exception as e:
125
  out_message = "Could not load unique table file data:" + str(e)
 
138
 
139
  print(out_message)
140
 
141
+ return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name
142
 
143
  def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
144
  """
 
194
  simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
195
  simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
196
  simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
197
+ simple_file["Response"] = simple_file["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
198
 
199
  # Remove blank and extremely short responses
200
  simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\
 
994
  # Check if files and text exist
995
  out_message = "Please enter a data file to summarise."
996
  print(out_message)
997
+ return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
998
 
999
 
1000
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
 
1093
  print("summary_out_file_paths:", summary_out_file_paths)
1094
 
1095
  #final_out_message = '\n'.join(out_message)
1096
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
1097
 
1098
 
1099
  if num_batches > 0:
 
1114
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1115
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1116
  print(out_message)
1117
+ return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1118
 
1119
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1120
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
 
1446
 
1447
  print(final_message_out)
1448
 
1449
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
1450
 
1451
  # SUMMARISATION FUNCTIONS
1452
 
1453
+ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
1454
  """
1455
+ Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold,
1456
+ merging smaller topics into larger topics.
1457
+
1458
  Parameters:
1459
  category_series (pd.Series): Series containing category names to deduplicate.
1460
+ join_series (pd.Series): Additional series used for joining back to original results.
1461
+ reference_df (pd.DataFrame): DataFrame containing the reference data to count occurrences.
1462
  threshold (float): Similarity threshold for considering two strings as duplicates.
1463
+
1464
  Returns:
1465
  pd.DataFrame: DataFrame with columns ['old_category', 'deduplicated_category'].
1466
  """
1467
+ # Count occurrences of each category in the reference_df
1468
+ category_counts = reference_df['Subtopic'].value_counts().to_dict()
1469
+
1470
  # Initialize the result dictionary
1471
  deduplication_map = {}
1472
+
1473
+ # First pass: Handle exact matches
1474
+ for category in category_series.unique():
1475
+ if category in deduplication_map:
1476
+ continue
1477
+
1478
+ # Find all exact matches
1479
+ exact_matches = category_series[category_series.str.lower() == category.lower()].index.tolist()
1480
+ if len(exact_matches) > 1:
1481
+ # Find the variant with the highest count
1482
+ match_counts = {match: category_counts.get(category_series[match], 0) for match in exact_matches}
1483
+ most_common = max(match_counts.items(), key=lambda x: x[1])[0]
1484
+ most_common_category = category_series[most_common]
1485
+
1486
+ # Map all exact matches to the most common variant
1487
+ for match in exact_matches:
1488
+ deduplication_map[category_series[match]] = most_common_category
1489
+
1490
+ # Second pass: Handle fuzzy matches for remaining categories
1491
  for category in category_series.unique():
1492
  # Skip if the category is already processed
1493
  if category in deduplication_map:
1494
  continue
1495
 
1496
  # Find close matches to the current category, excluding the current category itself
1497
+ matches = process.extract(category,
1498
+ [cat for cat in category_series.unique() if cat != category],
1499
+ scorer=fuzz.token_set_ratio,
1500
+ score_cutoff=threshold)
1501
+
1502
  if matches: # Check if there are any matches
1503
  best_match = max(matches, key=lambda x: x[1]) # Get the match with the highest score
1504
  match, score, _ = best_match # Unpack the best match
1505
+
1506
+ # Compare counts to ensure smaller topics merge into larger ones
1507
+ if category_counts.get(category, 0) < category_counts.get(match, 0):
1508
+ deduplication_map[category] = match # Map the smaller category to the larger one
1509
+ else:
1510
+ deduplication_map[match] = category # Map the larger category to the smaller one
1511
+ else:
1512
+ deduplication_map[category] = category # No match found, keep the category as is
1513
+
1514
  # Create the result DataFrame
1515
+ if merge_sentiment == "Yes":
1516
+ result_df = pd.DataFrame({
1517
+ 'old_category': category_series + " | " + join_series,
1518
+ 'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
1519
+ })
1520
+ else:
1521
+ result_df = pd.DataFrame({
1522
+ 'old_category': category_series + " | " + join_series,
1523
+ 'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
1524
+ })
1525
+
1526
  return result_df
1527
 
1528
+ def deduplicate_topics(reference_df,
1529
+ unique_topics_df,
1530
+ reference_table_file_name:str,
1531
+ unique_topics_table_file_name:str,
1532
+ merge_sentiment:str= "No",
1533
+ merge_general_topics:str="No",
1534
+ score_threshold:int=deduplication_threshold,
1535
+ deduplicate_topics:str="Yes"):
1536
+ '''
1537
+ Deduplicate topics based on a reference and unique topics table
1538
+ '''
1539
+ output_files = []
1540
+
1541
+ reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
1542
+ unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
1543
 
1544
+ # Run through this x times to try to get all duplicate topics
1545
  if deduplicate_topics == "Yes":
1546
+ for i in range(0, 5):
1547
+ #print("Deduplication run:", i)
1548
+
1549
 
1550
+ #reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
 
 
 
 
1551
 
1552
+ if merge_sentiment == "No":
1553
+ # First, combine duplicate topics in reference_df
1554
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1555
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1556
 
1557
+ # Deduplicate categories within each sentiment group
1558
+ deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
1559
+ lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], reference_df, threshold=score_threshold)
1560
+ ).reset_index(drop=True) # Reset index after groupby
1561
+ else:
1562
+ # Deduplicate categories by subtopic name only
1563
+ # First, combine duplicate topics in reference_df
1564
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1565
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1566
 
1567
+ deduplicated_topic_map_df = deduplicate_categories(reference_df_unique["Subtopic"], reference_df_unique["Sentiment"], reference_df, merge_sentiment=merge_sentiment, threshold=score_threshold).reset_index(drop=True)
 
 
 
1568
 
1569
  if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
1570
  # Check if 'deduplicated_category' contains any values
 
1572
 
1573
  else:
1574
  # Join deduplicated columns back to original df
1575
+ deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
1576
  # Remove rows where 'deduplicated_category' is blank or NaN
1577
+ deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]
1578
 
1579
+ deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
1580
 
1581
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
1582
 
 
1599
  reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
1600
  reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
1601
 
1602
+ if merge_general_topics == "Yes":
1603
+ # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
1604
+ # Step 1: Count the number of occurrences for each General Topic and Subtopic combination
1605
+ count_df = reference_df.groupby(['Subtopic', 'General Topic']).size().reset_index(name='Count')
1606
+
1607
+ # Step 2: Find the General Topic with the maximum count for each Subtopic
1608
+ max_general_topic = count_df.loc[count_df.groupby('Subtopic')['Count'].idxmax()]
1609
+
1610
+ # Step 3: Map the General Topic back to the original DataFrame
1611
+ reference_df = reference_df.merge(max_general_topic[['Subtopic', 'General Topic']], on='Subtopic', suffixes=('', '_max'), how='left')
1612
+
1613
+ reference_df['General Topic'] = reference_df["General Topic_max"].combine_first(reference_df["General Topic"])
1614
+
1615
+ if merge_sentiment == "Yes":
1616
+ # Step 1: Count the number of occurrences for each General Topic and Subtopic combination
1617
+ count_df = reference_df.groupby(['Subtopic', 'Sentiment']).size().reset_index(name='Count')
1618
+
1619
+ # Step 2: Determine the number of unique Sentiment values for each Subtopic
1620
+ unique_sentiments = count_df.groupby('Subtopic')['Sentiment'].nunique().reset_index(name='UniqueCount')
1621
+
1622
+ # Step 3: Update Sentiment to 'Mixed' where there is more than one unique sentiment
1623
+ reference_df = reference_df.merge(unique_sentiments, on='Subtopic', how='left')
1624
+ reference_df['Sentiment'] = reference_df.apply(
1625
+ lambda row: 'Mixed' if row['UniqueCount'] > 1 else row['Sentiment'],
1626
+ axis=1
1627
+ )
1628
+
1629
+ # Clean up the DataFrame by dropping the UniqueCount column
1630
+ reference_df.drop(columns=['UniqueCount'], inplace=True)
1631
+
1632
+ reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
1633
+
1634
  # Remake unique_topics_df based on new reference_df
1635
  unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
1636
 
1637
+ reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
1638
+ unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
1639
+
1640
+ reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
1641
+ unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
1642
+ reference_df.to_csv(reference_file_path, index = None)
1643
+ unique_topics_df.to_csv(unique_topics_file_path, index=None)
1644
+
1645
+ output_files.append(reference_file_path)
1646
+ output_files.append(unique_topics_file_path)
1647
+
1648
+ return reference_df, unique_topics_df, output_files
1649
+
1650
+ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1651
+ unique_topics_df:pd.DataFrame,
1652
+ random_seed:int,
1653
+ no_of_sampled_summaries:int=150):
1654
+
1655
+ '''
1656
+ Sample a limited number of reference-table summaries to send for further summarisation, so that the input token length is not too long.
1657
+ '''
1658
+
1659
+ all_summaries = pd.DataFrame()
1660
+ output_files = []
1661
 
1662
  reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
1663
 
 
1743
  out_metadata_str:str = "",
1744
  output_files:list = [],
1745
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
1746
+ do_summaries="Yes",
1747
  progress=gr.Progress(track_tqdm=True)):
1748
  '''
1749
  Create better summaries of the raw batch-level summaries created in the first run of the model.
 
1826
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
1827
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
1828
 
1829
+ if do_summaries == "Yes":
1830
+ for summary_no in summary_loop:
1831
 
1832
+ print("Current summary number is:", summary_no)
1833
 
1834
+ summary_text = all_summaries[summary_no]
1835
+ #print("summary_text:", summary_text)
1836
+ formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
1837
 
1838
+ try:
1839
+ response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
1840
+ summarised_output = response
1841
+ summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
1842
+ summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
1843
+ summarised_output = summarised_output.strip()
1844
+ except Exception as e:
1845
+ print(e)
1846
+ summarised_output = ""
1847
 
1848
+ summarised_outputs.append(summarised_output)
1849
+ out_metadata.extend(metadata)
1850
+ out_metadata_str = '. '.join(out_metadata)
1851
+
1852
+ latest_summary_completed += 1
1853
+
1854
+ # Check if beyond max time allowed for processing and break if necessary
1855
+ toc = time.perf_counter()
1856
+ time_taken = tic - toc
1857
+
1858
+ if time_taken > max_time_for_loop:
1859
+ print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
1860
+ summary_loop.close()
1861
+ tqdm._instances.clear()
1862
+ break
1863
 
1864
  # If all summaries completed
1865
  if latest_summary_completed >= length_all_summaries:
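
The new `deduplicate_categories` in llm_api_call.py fuzzy-matches subtopic names with `process.extract` and `fuzz.token_set_ratio` (a rapidfuzz-style API; the import is not shown in this diff, so the library is an assumption) and folds the less frequent spelling into the more frequent one. A minimal, self-contained sketch of that core idea, with invented data and the commit's 90 threshold:

```python
import pandas as pd
from rapidfuzz import fuzz, process  # assumed backend for process.extract / token_set_ratio

reference_df = pd.DataFrame(
    {"Subtopic": ["Road safety", "Road safety", "Road safty", "Litter", "Litter"]}
)
threshold = 90  # matches deduplication_threshold in the commit
counts = reference_df["Subtopic"].value_counts().to_dict()

dedup_map = {}
for category in reference_df["Subtopic"].unique():
    if category in dedup_map:
        continue
    others = [c for c in reference_df["Subtopic"].unique() if c != category]
    matches = process.extract(category, others, scorer=fuzz.token_set_ratio, score_cutoff=threshold)
    if matches:
        best, score, _ = max(matches, key=lambda m: m[1])
        # Merge the rarer variant into the more common one, as the commit does
        if counts.get(category, 0) < counts.get(best, 0):
            dedup_map[category] = best
        else:
            dedup_map[best] = category

# The misspelled "Road safty" folds into "Road safety"; "Litter" is left alone
reference_df["Subtopic"] = reference_df["Subtopic"].map(lambda c: dedup_map.get(c, c))
print(dedup_map)
```

In the commit this mapping is rebuilt over several passes, optionally grouped by Sentiment, and the deduplicated reference table is then written out as `*_dedup.csv` files that feed the separate summarisation step.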