seanpedrickcase committed
Commit 75d1651 · 1 Parent(s): b50798a

Allowed manual modification of the output topic table. Fixed some issues with deduplication and Excel file input. Allowed General Topic specification in zero-shot topics.

Files changed (4):
  1. app.py +79 -50
  2. tools/helper_functions.py +12 -3
  3. tools/llm_api_call.py +497 -183
  4. tools/prompts.py +8 -2
app.py CHANGED
@@ -3,7 +3,7 @@ import socket
3
  import spaces
4
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
5
  from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
- from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
  #from tools.aws_functions import load_data_from_aws
@@ -44,14 +44,20 @@ with app:
44
  ###
45
 
46
  text_output_file_list_state = gr.State([])
 
47
  log_files_output_list_state = gr.State([])
48
  first_loop_state = gr.State(True)
49
  second_loop_state = gr.State(False)
 
50
 
51
- file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
52
- master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
53
- master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
54
- master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
55
 
56
  session_hash_state = gr.State()
57
  s3_output_folder_state = gr.State()
@@ -67,14 +73,15 @@ with app:
67
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
68
 
69
  # Summary state objects
70
- summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
71
- master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
72
- master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  summarised_references_markdown = gr.Markdown("", visible=False)
74
  summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
75
  latest_summary_completed_num = gr.Number(0, visible=False)
76
 
77
- unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
 
78
 
79
  ###
80
  # UI LAYOUT
@@ -108,15 +115,15 @@ with app:
108
  in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
109
 
110
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
111
- candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
112
 
113
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
114
 
115
  sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
116
 
117
- extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
118
 
119
- text_output_summary = gr.Markdown(value="### Language model response will appear here")
120
  text_output_file = gr.File(height=file_input_height, label="Output files")
121
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
122
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
@@ -131,14 +138,28 @@ with app:
131
  with gr.Row():
132
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
133
 
134
- with gr.Tab(label="Deduplicate and summarise topics"):
135
  gr.Markdown(
136
  """
137
- ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
138
  """)
139
  with gr.Accordion("Upload reference data file and unique data files", open = True):
140
- summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
141
- summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
142
 
143
  with gr.Row():
144
  merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
@@ -146,19 +167,21 @@ with app:
146
  deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
147
 
148
  deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
149
-
150
- duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
 
151
 
152
  summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
153
 
154
  summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
155
  summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
156
- summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
157
 
158
- with gr.Tab(label="Continue previous topic extraction"):
159
  gr.Markdown(
160
  """
161
- ### Load in data files from a previous attempt at extracting topics to continue it.
162
  """)
163
 
164
  with gr.Accordion("Upload reference data file and unique data files", open = True):
@@ -174,7 +197,7 @@ with app:
174
  """)
175
 
176
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
177
- view_table_markdown = gr.Markdown(value = "", label="View table")
178
 
179
  with gr.Tab(label="Topic extraction settings"):
180
  gr.Markdown(
@@ -187,7 +210,7 @@ with app:
187
  random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
188
 
189
  with gr.Accordion("Prompt settings", open = True):
190
- number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3)
191
  system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
192
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
193
  prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)
@@ -200,7 +223,7 @@ with app:
200
 
201
  # Invisible text box to hold the session hash/username just for logging purposes
202
  session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
203
- data_file_names_textbox = gr.Textbox(label = "Data file name", value="", visible=False)
204
  estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
205
  total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
206
 
@@ -224,40 +247,46 @@ with app:
224
  ###
225
 
226
  # Tabular data upload
227
- in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
228
 
229
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
230
- then(load_in_data_file,
231
- inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
232
- fn=extract_topics,
233
- inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
234
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")
235
 
236
- # return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
237
 
238
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
239
  latest_batch_completed.change(fn=extract_topics,
240
- inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
241
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
242
- then(fn = reveal_feedback_buttons,
243
- outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
244
 
245
  # When button pressed, deduplicate data
246
- deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
247
- then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files, log_files_output])
248
 
249
  # When button pressed, summarise previous data
250
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
251
- then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
252
- then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
253
- then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
254
 
255
- latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
256
 
257
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
258
  continue_previous_data_files_btn.click(
259
- load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches]).\
260
- then(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, data_file_names_textbox])
261
 
262
  ###
263
  # LOGGING AND ON APP LOAD FUNCTIONS
@@ -268,21 +297,21 @@ with app:
268
  access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
269
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
270
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
271
- then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
272
 
273
  # Log usage usage when making a query
274
  usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
275
- usage_callback.setup([session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
276
 
277
- conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
278
- then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
279
 
280
  # User submitted feedback
281
  feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
282
- feedback_callback.setup([data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], feedback_data_folder)
283
 
284
- data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], None, preprocess=False).\
285
- then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
286
 
287
  in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
288
 
 
3
  import spaces
4
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
5
  from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
+ from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
  #from tools.aws_functions import load_data_from_aws
 
44
  ###
45
 
46
  text_output_file_list_state = gr.State([])
47
+ text_output_modify_file_list_state = gr.State([])
48
  log_files_output_list_state = gr.State([])
49
  first_loop_state = gr.State(True)
50
  second_loop_state = gr.State(False)
51
+ modified_unique_table_change_bool = gr.State(True) # This boolean is used to flag whether a file upload should change just the modified unique table object on the second tab
52
 
53
+ file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas")
54
+ master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas")
55
+ master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas")
56
+ master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas")
57
+
58
+ master_modify_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_unique_topics_df_state", visible=False, type="pandas")
59
+ master_modify_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_reference_df_state", visible=False, type="pandas")
60
+
61
 
62
  session_hash_state = gr.State()
63
  s3_output_folder_state = gr.State()
 
73
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
74
 
75
  # Summary state objects
76
+ summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas")
77
+ master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas")
78
+ master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
79
  summarised_references_markdown = gr.Markdown("", visible=False)
80
  summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
81
  latest_summary_completed_num = gr.Number(0, visible=False)
82
 
83
+ reference_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
84
+ unique_topics_table_file_name_textbox = gr.Textbox(label="Unique topics data file name textbox", visible=False)
85
 
86
  ###
87
  # UI LAYOUT
 
115
  in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
116
 
117
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
118
+ candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic titles below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for either or both of these lists to be specified.")
119
 
120
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
121
 
122
  sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
123
 
124
+ extract_topics_btn = gr.Button("Extract topics", variant="primary")
125
 
126
+ text_output_summary = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
127
  text_output_file = gr.File(height=file_input_height, label="Output files")
128
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
129
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 
138
  with gr.Row():
139
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
140
 
141
+ with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
142
  gr.Markdown(
143
  """
144
+ Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
145
  """)
146
+
147
+
148
+
149
+ with gr.Accordion("Modify existing topics", open = False):
150
+ modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
151
+
152
+ modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
153
+
154
+ save_modified_files_button = gr.Button(value="Save modified topic names")
155
+
156
+
157
  with gr.Accordion("Upload reference data file and unique data files", open = True):
158
+
159
+
160
+ ### DEDUPLICATION
161
+ deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
162
+ deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
163
 
164
  with gr.Row():
165
  merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
 
167
  deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
168
 
169
  deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
170
+
171
+
172
+ ### SUMMARISATION
173
+ summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
174
 
175
  summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
176
 
177
  summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
178
  summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
179
+ summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
180
 
181
+ with gr.Tab(label="Continue unfinished topic extraction"):
182
  gr.Markdown(
183
  """
184
+ ### Load in output files from a previous topic extraction process and continue topic extraction with new data.
185
  """)
186
 
187
  with gr.Accordion("Upload reference data file and unique data files", open = True):
 
197
  """)
198
 
199
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
200
+ view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
201
 
202
  with gr.Tab(label="Topic extraction settings"):
203
  gr.Markdown(
 
210
  random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
211
 
212
  with gr.Accordion("Prompt settings", open = True):
213
+ number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
214
  system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
215
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
216
  prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)
 
223
 
224
  # Invisible text box to hold the session hash/username just for logging purposes
225
  session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
226
+
227
  estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
228
  total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
229
 
 
247
  ###
248
 
249
  # Tabular data upload
250
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
251
 
252
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
253
+ success(load_in_data_file,
254
+ inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
255
+ success(fn=extract_topics,
256
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
257
+ outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
258
 
 
259
 
260
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
261
  latest_batch_completed.change(fn=extract_topics,
262
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
263
+ outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
264
+ success(fn = reveal_feedback_buttons,
265
+ outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
266
+
267
+ # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
268
+ modification_input_files.upload(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
269
+
270
+
271
+ # Modify output table with custom topic names
272
+ save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
273
 
274
  # When button pressed, deduplicate data
275
+ deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
276
+ success(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, in_excel_sheets, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, summarisation_input_files, log_files_output, summarised_output_markdown], scroll_to_output=True)
277
 
278
  # When button pressed, summarise previous data
279
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
280
+ success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
281
+ success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
282
+ success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
283
 
284
+ latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output], scroll_to_output=True)
285
 
286
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
287
  continue_previous_data_files_btn.click(
288
+ load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
289
+ success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
290
 
291
  ###
292
  # LOGGING AND ON APP LOAD FUNCTIONS
 
297
  access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
298
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
299
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
300
+ success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
301
 
302
  # Log usage usage when making a query
303
  usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
304
+ usage_callback.setup([session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
305
 
306
+ conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
307
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
308
 
309
  # User submitted feedback
310
  feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
311
+ feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], feedback_data_folder)
312
 
313
+ data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], None, preprocess=False).\
314
+ success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
315
 
316
  in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
317
 
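Note on the zero-shot topics input changed above: the new candidate_topics label says the csv needs at least one header column, and that using the headers 'General Topic' and/or 'Subtopic' lets either or both lists be supplied. A minimal sketch of such a file built with pandas (the topic values here are invented for illustration, not taken from the commit):

```python
import pandas as pd

# Hypothetical candidate topics for zero-shot extraction; either column
# can also be supplied on its own with just its header.
candidate_topics = pd.DataFrame({
    "General Topic": ["Housing", "Housing", "Transport"],
    "Subtopic": ["Building height", "Affordable housing", "Parking provision"],
})

# Save as csv and upload it under 'I have my own list of topics (zero shot topic modelling)'.
candidate_topics.to_csv("candidate_topics.csv", index=False)
```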
tools/helper_functions.py CHANGED
@@ -106,22 +106,31 @@ def detect_file_type(filename):
106
  else:
107
  raise ValueError("Unsupported file type.")
108
 
109
- def read_file(filename):
110
  """Read the file based on its detected type."""
111
  file_type = detect_file_type(filename)
112
 
113
  if file_type == 'csv':
114
  return pd.read_csv(filename, low_memory=False)
115
  elif file_type == 'xlsx':
116
- return pd.read_excel(filename)
117
  elif file_type == 'parquet':
118
  return pd.read_parquet(filename)
119
 
120
  # Wrap text in each column to the specified max width, including whole words
121
- def wrap_text(text, max_width=60):
122
  if not isinstance(text, str):
123
  return text
124
 
125
  words = text.split()
126
  if not words:
127
  return text
 
106
  else:
107
  raise ValueError("Unsupported file type.")
108
 
109
+ def read_file(filename:str, sheet:str=""):
110
  """Read the file based on its detected type."""
111
  file_type = detect_file_type(filename)
112
 
113
  if file_type == 'csv':
114
  return pd.read_csv(filename, low_memory=False)
115
  elif file_type == 'xlsx':
116
+ if sheet:
117
+ return pd.read_excel(filename, sheet_name=sheet)
118
+ else:
119
+ return pd.read_excel(filename)
120
  elif file_type == 'parquet':
121
  return pd.read_parquet(filename)
122
 
123
  # Wrap text in each column to the specified max width, including whole words
124
+ def wrap_text(text:str, max_width=60, max_text_length=None):
125
  if not isinstance(text, str):
126
  return text
127
 
128
+ # If max_text_length is set, truncate the text and add ellipsis
129
+ if max_text_length and len(text) > max_text_length:
130
+ return text[:max_text_length] + '...'
131
+
132
+ text = text.replace('\r\n', '<br>').replace('\n', '<br>')
133
+
134
  words = text.split()
135
  if not words:
136
  return text
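Based on the signatures added above, read_file() now accepts an optional Excel sheet name and wrap_text() can truncate long cells before wrapping. A rough usage sketch (file and sheet names are placeholders):

```python
from tools.helper_functions import read_file, wrap_text

# Placeholder file and sheet names for illustration only.
excel_df = read_file("responses.xlsx", sheet="Survey responses")  # read a named sheet
first_sheet_df = read_file("responses.xlsx")                      # empty sheet argument reads the first sheet
csv_df = read_file("responses.csv")                               # csv and parquet are read as before

# Truncate a long response to 200 characters (adding '...') before any wrapping is applied.
preview = wrap_text(excel_df.iloc[0, 0], max_width=60, max_text_length=200)
```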
tools/llm_api_call.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import google.generativeai as ai
3
  import pandas as pd
4
  import numpy as np
@@ -57,7 +58,7 @@ def normalise_string(text):
57
 
58
  return text
59
 
60
- def load_in_file(file_path: str, colname:str=""):
61
  """
62
  Loads in a tabular data file and returns data and file name.
63
 
@@ -68,7 +69,7 @@ def load_in_file(file_path: str, colname:str=""):
68
  #print("File type is:", file_type)
69
 
70
  file_name = get_file_name_no_ext(file_path)
71
- file_data = read_file(file_path)
72
 
73
  if colname:
74
  file_data[colname] = file_data[colname].fillna("")
@@ -79,11 +80,11 @@ def load_in_file(file_path: str, colname:str=""):
79
 
80
  return file_data, file_name
81
 
82
- def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50):
83
  '''Load in data table, work out how many batches needed.'''
84
 
85
  try:
86
- file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
87
  num_batches = math.ceil(len(file_data) / batch_size)
88
  print("Total number of batches:", num_batches)
89
 
@@ -95,7 +96,7 @@ def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:in
95
 
96
  return file_data, file_name, num_batches
97
 
98
- def load_in_previous_data_files(file_paths_partial_output:List[str]):
99
  '''Load in data table from a partially completed consultation summary to continue it.'''
100
 
101
  reference_file_data = pd.DataFrame()
@@ -106,24 +107,25 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
106
  latest_batch = 0
107
 
108
  for file in file_paths_partial_output:
 
109
  # If reference table
110
  if 'reference_table' in file.name:
111
  try:
112
  reference_file_data, reference_file_name = load_in_file(file)
113
  #print("reference_file_data:", reference_file_data.head(2))
114
- out_message = out_message + " Reference file load successful"
115
  except Exception as e:
116
  out_message = "Could not load reference file data:" + str(e)
117
- print("Could not load reference file data:", e)
118
  # If unique table
119
  if 'unique_topics' in file.name:
120
  try:
121
  unique_file_data, unique_file_name = load_in_file(file)
122
  #print("unique_topics_file:", unique_file_data.head(2))
123
- out_message = out_message + " Unique table file load successful"
124
  except Exception as e:
125
  out_message = "Could not load unique table file data:" + str(e)
126
- print("Could not load unique table file data:", e)
127
  if 'batch_' in file.name:
128
  latest_batch = re.search(r'batch_(\d+)', file.name).group(1)
129
  print("latest batch:", latest_batch)
@@ -133,12 +135,37 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
133
  out_message = out_message + " Latest batch number not found."
134
  if reference_file_data.empty:
135
  out_message = out_message + " No reference data table provided."
 
136
  if unique_file_data.empty:
137
  out_message = out_message + " No unique data table provided."
138
 
139
  print(out_message)
140
-
141
- return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name
142
 
143
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str]) -> pd.DataFrame:
144
  basic_response_data = file_data[[chosen_cols]].reset_index(names="Reference")
@@ -190,7 +217,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
190
  batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
191
 
192
  # Now replace the reference numbers with numbers starting from 1
193
- batch_basic_response_data["Reference"] = batch_basic_response_data["Reference"] - start_row
194
 
195
  #print("batch_basic_response_data:", batch_basic_response_data)
196
 
@@ -612,30 +639,43 @@ def clean_column_name(column_name, max_length=20):
612
  return column_name[:max_length]
613
 
614
  def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
615
- new_unique_topics_df = reference_df[["General Topic", "Subtopic", "Sentiment"]]
616
 
617
- new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
618
 
619
- # Join existing and new unique topics
620
- out_unique_topics_df = new_unique_topics_df
621
 
622
- out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
623
 
624
- #print("out_unique_topics_df:", out_unique_topics_df)
625
 
626
- out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
627
- drop(["Response References", "Summary"], axis = 1, errors="ignore")
628
 
629
- # Get count of rows that refer to particular topics
630
- reference_counts = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
631
- 'Response References': 'size', # Count the number of references
632
- 'Summary': lambda x: '<br>'.join(
633
- sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
634
- )
635
- }).reset_index()
636
 
637
- # Join the counts to existing_unique_topics_df
638
- out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
639
 
640
  return out_unique_topics_df
641
 
@@ -762,13 +802,11 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
762
  batch_file_path_details = f"{file_name}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
763
  row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
764
 
765
- print("batch_file_path_details:", batch_file_path_details)
766
-
767
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
768
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
769
 
770
- #with open(whole_conversation_path, "w", encoding='utf-8', errors='replace') as f:
771
- # f.write(whole_conversation_str)
772
 
773
  with open(whole_conversation_path_meta, "w", encoding='utf-8', errors='replace') as f:
774
  f.write(whole_conversation_metadata_str)
@@ -799,9 +837,9 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
799
  topic_with_response_df["Response References"] = topic_with_response_df["Response References"].astype(str).str.replace(".0", "", regex=False)
800
 
801
  # Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
802
- topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].str.strip().str.lower().str.capitalize()
803
- topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].str.strip().str.lower().str.capitalize()
804
- topic_with_response_df["Sentiment"] = topic_with_response_df["Sentiment"].str.strip().str.lower().str.capitalize()
805
 
806
  topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
807
 
@@ -860,7 +898,10 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
860
  except Exception as e:
861
  print("Could not convert Response References column to integer due to", e)
862
 
863
- out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
864
 
865
  # Save the new DataFrame to CSV
866
  reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
@@ -884,9 +925,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
884
  # Get count of rows that refer to particular topics
885
  reference_counts = out_reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
886
  'Response References': 'size', # Count the number of references
887
- 'Summary': lambda x: '<br>'.join(
888
- sorted(set(x), key=lambda summary: out_reference_df.loc[out_reference_df['Summary'] == summary, 'Start row of group'].min())
889
- )
890
  }).reset_index()
891
 
892
  # Join the counts to existing_unique_topics_df
@@ -1006,7 +1045,8 @@ def extract_topics(in_data_file,
1006
  # Check if files and text exist
1007
  out_message = "Please enter a data file to summarise."
1008
  print(out_message)
1009
- return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
 
1010
 
1011
 
1012
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
@@ -1014,20 +1054,20 @@ def extract_topics(in_data_file,
1014
 
1015
  # If this is the first time around, set variables to 0/blank
1016
  if first_loop_state==True:
1017
- print("This is the first time through the loop")
1018
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1019
  latest_batch_completed = 0
1020
  out_message = []
1021
  out_file_paths = []
1022
- print("model_choice_clean:", model_choice_clean)
1023
 
1024
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1025
  progress(0.1, "Loading in Gemma 2b model")
1026
  local_model, tokenizer = load_model()
1027
  print("Local model loaded:", local_model)
1028
 
1029
- print("latest_batch_completed at start of function:", str(latest_batch_completed))
1030
- print("total number of batches:", str(num_batches))
1031
 
1032
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1033
  if latest_batch_completed >= num_batches:
@@ -1071,8 +1111,11 @@ def extract_topics(in_data_file,
1071
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1072
  out_file_paths.append(reference_table_out_path)
1073
 
1074
  ## Unique topic list
1075
- existing_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1076
  out_file_paths.append(unique_topics_df_out_path)
1077
 
1078
  # Ensure that we are only returning the final results to outputs
@@ -1107,7 +1150,7 @@ def extract_topics(in_data_file,
1107
 
1108
  # Step 3: Populate the new DataFrame
1109
  missing_df['Response References'] = missing_references['Reference']
1110
- missing_df = missing_df.fillna(np.nan) # Fill other columns with NA
1111
 
1112
  # Display the new DataFrame
1113
  #print("missing_df:", missing_df)
@@ -1120,10 +1163,12 @@ def extract_topics(in_data_file,
1120
  log_files_output_paths = list(set(log_files_output_paths))
1121
 
1122
  summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1123
- print("summary_out_file_paths:", summary_out_file_paths)
 
 
1124
 
1125
  #final_out_message = '\n'.join(out_message)
1126
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
1127
 
1128
 
1129
  if num_batches > 0:
@@ -1144,7 +1189,8 @@ def extract_topics(in_data_file,
1144
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1145
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1146
  print(out_message)
1147
- return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
 
1148
 
1149
 
1150
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
@@ -1155,9 +1201,6 @@ def extract_topics(in_data_file,
1155
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1156
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1157
 
1158
-
1159
-
1160
-
1161
  for i in topics_loop:
1162
  #for latest_batch_completed in range(num_batches):
1163
  reported_batch_no = latest_batch_completed + 1
@@ -1167,8 +1210,6 @@ def extract_topics(in_data_file,
1167
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
1168
  #log_files_output_paths.append(simplified_csv_table_path)
1169
 
1170
-
1171
-
1172
  # Conversation history
1173
  conversation_history = []
1174
 
@@ -1177,7 +1218,7 @@ def extract_topics(in_data_file,
1177
  # If the latest batch of responses contains at least one instance of text
1178
  if not batch_basic_response_df.empty:
1179
 
1180
- print("latest_batch_completed:", latest_batch_completed)
1181
 
1182
  #print("candidate_topics:", candidate_topics)
1183
 
@@ -1198,26 +1239,58 @@ def extract_topics(in_data_file,
1198
  # Preparing candidate topics
1199
  if candidate_topics and existing_unique_topics_df.empty:
1200
  progress(0.1, "Creating revised zero shot topics table")
 
1201
  # 'Zero shot topics' are those supplied by the user
1202
  max_topic_no = 120
1203
-
1204
  zero_shot_topics = read_file(candidate_topics.name)
1205
 
1206
- if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1207
- zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1208
- # Max 120 topics allowed
1209
- if len(zero_shot_topics_series) > max_topic_no:
1210
- print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1211
- zero_shot_topics_series = zero_shot_topics_series.iloc[:max_topic_no]
1212
-
1213
- zero_shot_topics_list = list(zero_shot_topics_series)
1214
 
1215
- print("Zero shot topics are:", zero_shot_topics_list)
1216
 
1217
  if create_revised_general_topics == True:
1218
  # Create the most up to date list of topics and subtopics.
1219
  # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1220
- unique_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1221
  unique_topics_markdown = unique_topics_df.to_markdown()
1222
 
1223
  print("unique_topics_markdown:", unique_topics_markdown)
@@ -1244,16 +1317,24 @@ def extract_topics(in_data_file,
1244
  zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1245
  zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1246
  out_file_paths.append(zero_shot_revised_path)
 
1247
  except Exception as e:
1248
- print("Error in parsing markdown table from response text:", e)
1249
- print("Not adding revised General Topics to table")
1250
- zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
 
1251
 
1252
  if zero_shot_topics_df.empty:
1253
  print("Creation of revised general topics df failed, reverting to original list")
1254
- zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1255
  else:
1256
- zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1257
 
1258
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1259
  if not existing_unique_topics_df.empty:
@@ -1261,18 +1342,6 @@ def extract_topics(in_data_file,
1261
  else:
1262
  existing_unique_topics_df = zero_shot_topics_df
1263
 
1264
- # If your zero shot column file already contains General Topic and Subtopic columns
1265
- if set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1266
- # Max 120 topics allowed
1267
- if zero_shot_topics.shape[0] > max_topic_no:
1268
- print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1269
- zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1270
-
1271
- if existing_unique_topics_df.empty:
1272
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1]})
1273
-
1274
- zero_shot_topics_df = zero_shot_topics
1275
-
1276
  if candidate_topics and not zero_shot_topics_df.empty:
1277
  # If you have already created revised zero shot topics, concat to the current
1278
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
@@ -1282,7 +1351,7 @@ def extract_topics(in_data_file,
1282
  #all_topic_tables_df_merged = existing_unique_topics_df
1283
  existing_unique_topics_df["Response References"] = ""
1284
 
1285
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1286
 
1287
  #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1288
 
@@ -1317,17 +1386,36 @@ def extract_topics(in_data_file,
1317
  summary_whole_conversation = []
1318
 
1319
  # Process requests to large language model
1320
- master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1321
 
1322
- # print("master_summary_response:", master_summary_response[-1].text)
1323
  # print("Whole conversation metadata:", whole_conversation_metadata)
1324
 
1325
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(master_summary_response, whole_summary_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1326
 
1327
  # If error in table parsing, leave function
1328
  if is_error == True:
1329
  final_message_out = "Could not complete summary, error in LLM output."
1330
- display_table, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
 
1331
 
1332
  # Write outputs to csv
1333
  ## Topics with references
@@ -1347,7 +1435,7 @@ def extract_topics(in_data_file,
1347
  #all_topic_tables_df.append(new_topic_df)
1348
  #all_markdown_topic_tables.append(new_markdown_table)
1349
 
1350
- #display_table = master_summary_response[-1].text
1351
 
1352
  # Show unique topics alongside document counts as output
1353
  display_table = new_unique_topics_df.to_markdown(index=False)
@@ -1372,6 +1460,8 @@ def extract_topics(in_data_file,
1372
  if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1373
  print("Using Gemini model:", model_choice)
1374
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
 
 
1375
  else:
1376
  print("Using AWS Bedrock model:", model_choice)
1377
 
@@ -1449,7 +1539,7 @@ def extract_topics(in_data_file,
1449
  log_files_output_paths.append(final_table_output_path)
1450
 
1451
  except Exception as e:
1452
- print(e)
1453
 
1454
  new_topic_df = topic_table_df
1455
  new_reference_df = reference_df
@@ -1476,6 +1566,9 @@ def extract_topics(in_data_file,
1476
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1477
  existing_topics_table = new_topic_df.dropna(how='all')
1478
 
 
 
 
1479
  out_time = f"{final_time:0.1f} seconds."
1480
 
1481
  out_message.append('All queries successfully completed in')
@@ -1485,10 +1578,8 @@ def extract_topics(in_data_file,
1485
 
1486
  print(final_message_out)
1487
 
1488
- #print("out_file_paths:", out_file_paths)
1489
- #print("log_files_output_paths:", log_files_output_paths)
1490
 
1491
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
1492
 
1493
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1494
 
@@ -1521,9 +1612,115 @@ def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:
1521
  return pivot_table
1522
 
1523
 
1524
- # SUMMARISATION FUNCTIONS
1525
 
1526
- def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
1527
  """
1528
  Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold,
1529
  merging smaller topics into larger topics.
@@ -1540,8 +1737,9 @@ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, r
1540
  # Count occurrences of each category in the reference_df
1541
  category_counts = reference_df['Subtopic'].value_counts().to_dict()
1542
 
1543
- # Initialize the result dictionary
1544
  deduplication_map = {}
 
1545
 
1546
  # First pass: Handle exact matches
1547
  for category in category_series.unique():
@@ -1556,45 +1754,63 @@ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, r
1556
  most_common = max(match_counts.items(), key=lambda x: x[1])[0]
1557
  most_common_category = category_series[most_common]
1558
 
1559
- # Map all exact matches to the most common variant
1560
  for match in exact_matches:
1561
  deduplication_map[category_series[match]] = most_common_category
 
1562
 
1563
  # Second pass: Handle fuzzy matches for remaining categories
1564
- for category in category_series.unique():
1565
- # Skip if the category is already processed
1566
  if category in deduplication_map:
1567
  continue
1568
 
1569
- # Find close matches to the current category, excluding the current category itself
1570
  matches = process.extract(category,
1571
- [cat for cat in category_series.unique() if cat != category],
1572
- scorer=fuzz.token_set_ratio,
1573
  score_cutoff=threshold)
1574
 
1575
- if matches: # Check if there are any matches
1576
- best_match = max(matches, key=lambda x: x[1]) # Get the match with the highest score
1577
- match, score, _ = best_match # Unpack the best match
1578
 
1579
- # Compare counts to ensure smaller topics merge into larger ones
1580
  if category_counts.get(category, 0) < category_counts.get(match, 0):
1581
- deduplication_map[category] = match # Map the smaller category to the larger one
 
1582
  else:
1583
- deduplication_map[match] = category # Map the larger category to the smaller one
 
1584
  else:
1585
- deduplication_map[category] = category # No match found, keep the category as is
1586
-
1587
- # Create the result DataFrame
1588
- if merge_sentiment == "Yes":
1589
- result_df = pd.DataFrame({
1590
- 'old_category': category_series + " | " + join_series,
1591
- 'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
1592
- })
1593
- else:
1594
- result_df = pd.DataFrame({
1595
- 'old_category': category_series + " | " + join_series,
1596
- 'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
1597
- })
1598
 
1599
  return result_df
1600
 
@@ -1602,10 +1818,11 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1602
  unique_topics_df:pd.DataFrame,
1603
  reference_table_file_name:str,
1604
  unique_topics_table_file_name:str,
 
1605
  merge_sentiment:str= "No",
1606
  merge_general_topics:str="No",
1607
- score_threshold:int=deduplication_threshold,
1608
- in_data_files=[],
1609
  chosen_cols:List[str]="",
1610
  deduplicate_topics:str="Yes"
1611
  ):
@@ -1614,37 +1831,95 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1614
  '''
1615
  output_files = []
1616
  log_output_files = []
 
1617
 
1618
- reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
1619
- unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
1620
 
1621
- if in_data_files and chosen_cols:
1622
- file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
1623
 
1624
- # Run through this x times to try to get all duplicate topics
1625
- if deduplicate_topics == "Yes":
1626
- for i in range(0, 5):
1627
- #print("Deduplication run:", i)
1628
-
1629
 
1630
- #reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
 
 
 
 
 
1631
 
1632
- if merge_sentiment == "No":
1633
- # First, combine duplicate topics in reference_df
1634
- reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1635
- reference_df_unique = reference_df.drop_duplicates("old_category")
 
 
1636
 
1637
- # Deduplicate categories within each sentiment group
1638
- deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
1639
- lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], reference_df, threshold=score_threshold)
1640
- ).reset_index(drop=True) # Reset index after groupby
1641
  else:
1642
- # Deduplicate categories by subtopic name only
1643
- # First, combine duplicate topics in reference_df
1644
- reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1645
- reference_df_unique = reference_df.drop_duplicates("old_category")
1646
-
1647
- deduplicated_topic_map_df = deduplicate_categories(reference_df_unique["Subtopic"], reference_df_unique["Sentiment"], reference_df, merge_sentiment=merge_sentiment, threshold=score_threshold).reset_index(drop=True)
1648
 
1649
  if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
1650
  # Check if 'deduplicated_category' contains any values
@@ -1653,8 +1928,9 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1653
  else:
1654
  # Join deduplicated columns back to original df
1655
  #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
 
1656
  # Remove rows where 'deduplicated_category' is blank or NaN
1657
- deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]
1658
 
1659
  deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
1660
 
@@ -1675,9 +1951,9 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1675
 
1676
  reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
1677
 
1678
- reference_df["General Topic"] = reference_df["General Topic"].str.lower().str.capitalize()
1679
- reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
1680
- reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
1681
 
1682
  if merge_general_topics == "Yes":
1683
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
@@ -1711,15 +1987,42 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1711
 
1712
  reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
1713
1714
  # Remake unique_topics_df based on new reference_df
1715
  unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
1716
 
1717
- basic_response_data = get_basic_response_data(file_data, chosen_cols)
1718
 
1719
- reference_df_pivot = convert_reference_table_to_pivot_table(reference_df, basic_response_data)
1720
 
1721
- reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
1722
- unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
1723
 
1724
  reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
1725
  unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
@@ -1727,14 +2030,14 @@ def deduplicate_topics(reference_df:pd.DataFrame,
1727
  unique_topics_df.to_csv(unique_topics_file_path, index=None)
1728
 
1729
  output_files.append(reference_file_path)
1730
- output_files.append(unique_topics_file_path)
1731
 
1732
- reference_pivot_file_path = output_folder + reference_table_file_name_no_ext + "_pivot_dedup.csv"
1733
- reference_df_pivot.to_csv(reference_pivot_file_path, index=None)
1734
 
1735
- log_output_files.append(reference_pivot_file_path)
1736
 
1737
- return reference_df, unique_topics_df, output_files, log_output_files
1738
 
1739
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
1740
  unique_topics_df:pd.DataFrame,
@@ -1750,6 +2053,11 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1750
 
1751
  reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
1752
1753
  for group_keys, reference_df_group in reference_df_grouped:
1754
  #print(f"Group: {group_keys}")
1755
  #print(f"Data: {reference_df_group}")
@@ -1831,6 +2139,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1831
  latest_summary_completed:int = 0,
1832
  out_metadata_str:str = "",
1833
  in_data_files:List[str]=[],
 
1834
  chosen_cols:List[str]=[],
1835
  log_output_files:list[str]=[],
1836
  summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
@@ -1845,11 +2154,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1845
  local_model = []
1846
  summarised_output_markdown = ""
1847
 
1848
- print("In summarise_output_topics function.")
1849
-
1850
- all_summaries = summarised_references["Summary"].tolist()
1851
-
1852
- length_all_summaries = len(all_summaries)
1853
 
1854
  # Check for data for summarisations
1855
  if not unique_table_df.empty and not reference_table_df.empty:
@@ -1857,17 +2161,24 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1857
  else:
1858
  out_message = "Please upload a unique topic table and reference table file to continue with summarisation."
1859
  print(out_message)
1860
- raise(out_message)
1861
- return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1862
-
1863
  # Load in data file and chosen columns if exists to create pivot table later
1864
  if in_data_files and chosen_cols:
1865
- file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
1866
  else:
1867
- out_message = "No file data found, please load a data file on the first tab and select a column."
1868
  print(out_message)
1869
- raise(out_message)
1870
- return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1871
 
1872
  # If all summaries completed, make final outputs
1873
  if latest_summary_completed >= length_all_summaries:
@@ -1908,9 +2219,18 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1908
  unique_table_df_revised = unique_table_df_revised.loc[unique_table_df_revised["Sentiment"] != "Not Mentioned", :]
1909
  reference_table_df_revised = reference_table_df_revised.loc[reference_table_df_revised["Sentiment"] != "Not Mentioned", :]
1910
 
1911
- basic_response_data = get_basic_response_data(file_data, chosen_cols)
 
1912
 
1913
- reference_table_df_revised_pivot = convert_reference_table_to_pivot_table(reference_table_df_revised, basic_response_data)
1914
 
1915
  # Save to file
1916
  unique_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_unique_topic_table_" + model_choice_clean + ".csv"
@@ -1919,16 +2239,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1919
  reference_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_reference_table_" + model_choice_clean + ".csv"
1920
  reference_table_df_revised.to_csv(reference_table_df_revised_path, index = None)
1921
 
1922
- output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])
1923
-
1924
- ### Save pivot file to log area
1925
- reference_table_df_revised_pivot_path = output_folder + batch_file_path_details + "_summarised_reference_table_pivot_" + model_choice_clean + ".csv"
1926
- reference_table_df_revised_pivot.to_csv(reference_table_df_revised_pivot_path, index=None)
1927
-
1928
- log_output_files.append(reference_table_df_revised_pivot_path)
1929
 
1930
  ###
1931
- unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(wrap_text))
1932
 
1933
  summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
1934
 
 
1
  import os
2
+ import os
3
  import google.generativeai as ai
4
  import pandas as pd
5
  import numpy as np
 
58
 
59
  return text
60
 
61
+ def load_in_file(file_path: str, colname:str="", excel_sheet:str=""):
62
  """
63
  Loads in a tabular data file and returns data and file name.
64
 
 
69
  #print("File type is:", file_type)
70
 
71
  file_name = get_file_name_no_ext(file_path)
72
+ file_data = read_file(file_path, excel_sheet)
73
 
74
  if colname:
75
  file_data[colname] = file_data[colname].fillna("")
 
80
 
81
  return file_data, file_name
82
 
83
+ def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
84
  '''Load in data table, work out how many batches needed.'''
85
 
86
  try:
87
+ file_data, file_name = load_in_file(file_paths[0], colname=in_colnames, excel_sheet=in_excel_sheets)
88
  num_batches = math.ceil(len(file_data) / batch_size)
89
  print("Total number of batches:", num_batches)
90
 
 
96
 
97
  return file_data, file_name, num_batches
98
 
99
+ def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
100
  '''Load in data table from a partially completed consultation summary to continue it.'''
101
 
102
  reference_file_data = pd.DataFrame()
 
107
  latest_batch = 0
108
 
109
  for file in file_paths_partial_output:
110
+
111
  # If reference table
112
  if 'reference_table' in file.name:
113
  try:
114
  reference_file_data, reference_file_name = load_in_file(file)
115
  #print("reference_file_data:", reference_file_data.head(2))
116
+ out_message = out_message + " Reference file load successful."
117
  except Exception as e:
118
  out_message = "Could not load reference file data:" + str(e)
119
+ raise Exception("Could not load reference file data:", e)
120
  # If unique table
121
  if 'unique_topics' in file.name:
122
  try:
123
  unique_file_data, unique_file_name = load_in_file(file)
124
  #print("unique_topics_file:", unique_file_data.head(2))
125
+ out_message = out_message + " Unique table file load successful."
126
  except Exception as e:
127
  out_message = "Could not load unique table file data:" + str(e)
128
+ raise Exception("Could not load unique table file data:", e)
129
  if 'batch_' in file.name:
130
  latest_batch = re.search(r'batch_(\d+)', file.name).group(1)
131
  print("latest batch:", latest_batch)
 
135
  out_message = out_message + " Latest batch number not found."
136
  if reference_file_data.empty:
137
  out_message = out_message + " No reference data table provided."
138
+ raise Exception(out_message)
139
  if unique_file_data.empty:
140
  out_message = out_message + " No unique data table provided."
141
 
142
  print(out_message)
143
+
144
+ # Return all data if using for deduplication task. Return just modified unique table if using just for table modification
145
+ if for_modified_table == False:
146
+ return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name
147
+ else:
148
+
149
+ reference_file_data.drop("Topic_number", axis=1, inplace=True, errors="ignore")
150
+
151
+ unique_file_data = create_unique_table_df_from_reference_table(reference_file_data)
152
+
153
+ unique_file_data.drop("Summary",axis=1, inplace=True)
154
+
155
+ # Then merge the topic numbers back to the original dataframe
156
+ reference_file_data = reference_file_data.merge(
157
+ unique_file_data[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
158
+ on=['General Topic', 'Subtopic', 'Sentiment'],
159
+ how='left'
160
+ )
161
+
162
+ out_file_names = [reference_file_name + ".csv"]
163
+ out_file_names.append(unique_file_name + ".csv")
164
+
165
+ print("reference_file_name:", reference_file_name)
166
+ print("unique_file_name:", unique_file_name)
167
+
168
+ return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names
169
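A minimal, self-contained sketch of the pandas pattern the `for_modified_table=True` branch relies on: number each unique topic, then merge `Topic_number` back onto the long-form reference table. Toy data; only the column names are taken from the app.

```python
import pandas as pd

reference_df = pd.DataFrame({
    "Response References": [1, 2, 3],
    "General Topic": ["Transport", "Transport", "Housing"],
    "Subtopic": ["Buses", "Buses", "Rents"],
    "Sentiment": ["Negative", "Negative", "Positive"],
})

# One row per topic/sentiment combination, numbered 1..n
unique_df = (reference_df
             .drop_duplicates(["General Topic", "Subtopic", "Sentiment"])
             .reset_index(drop=True))
unique_df["Topic_number"] = range(1, len(unique_df) + 1)

# Carry the topic numbers back onto every row of the long-form table
reference_df = reference_df.merge(
    unique_df[["General Topic", "Subtopic", "Sentiment", "Topic_number"]],
    on=["General Topic", "Subtopic", "Sentiment"],
    how="left",
)
print(reference_df[["Response References", "Subtopic", "Topic_number"]])
```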
 
170
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str]) -> pd.DataFrame:
171
  basic_response_data = file_data[[chosen_cols]].reset_index(names="Reference")
 
217
  batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
218
 
219
  # Now replace the reference numbers with numbers starting from 1
220
+ batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
221
 
222
  #print("batch_basic_response_data:", batch_basic_response_data)
223
 
 
639
  return column_name[:max_length]
640
 
641
  def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
 
642
 
643
+ out_unique_topics_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
644
+ .agg({
645
+ 'Response References': 'size', # Count the number of references
646
+ 'Summary': lambda x: '<br>'.join(
647
+ sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
648
+ )
649
+ })
650
+ .reset_index()
651
+ .sort_values('Response References', ascending=False) # Sort by size, biggest first
652
+ .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
653
+ )
654
+
655
+ # new_unique_topics_df = reference_df[["General Topic", "Subtopic", "Sentiment"]]
656
+
657
+ # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
658
 
659
+ # # Join existing and new unique topics
660
+ # out_unique_topics_df = new_unique_topics_df
661
 
662
+ # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
663
 
664
+ # #print("out_unique_topics_df:", out_unique_topics_df)
665
 
666
+ # out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
667
+ # drop(["Response References", "Summary"], axis = 1, errors="ignore")
668
 
669
+ # # Get count of rows that refer to particular topics
670
+ # reference_counts = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
671
+ # 'Response References': 'size', # Count the number of references
672
+ # 'Summary': lambda x: '<br>'.join(
673
+ # sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
674
+ # )
675
+ # }).reset_index()
676
 
677
+ # # Join the counts to existing_unique_topics_df
678
+ # out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
679
 
680
  return out_unique_topics_df
681
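A runnable sketch of the aggregation used in `create_unique_table_df_from_reference_table`: count responses per topic, join the distinct summaries ordered by first appearance, then number topics by descending count. Toy data; column names match the tables above.

```python
import numpy as np
import pandas as pd

reference_df = pd.DataFrame({
    "General Topic": ["Transport", "Transport", "Housing"],
    "Subtopic": ["Buses", "Buses", "Rents"],
    "Sentiment": ["Negative", "Negative", "Positive"],
    "Response References": [1, 2, 3],
    "Summary": ["Late buses", "Crowded buses", "Rents too high"],
    "Start row of group": [0, 0, 50],
})

unique_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
             .agg({
                 "Response References": "size",   # number of responses per topic
                 "Summary": lambda x: "<br>".join(
                     sorted(set(x), key=lambda s: reference_df.loc[
                         reference_df["Summary"] == s, "Start row of group"].min())
                 ),
             })
             .reset_index()
             .sort_values("Response References", ascending=False)
             .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)))
print(unique_df)
```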
 
 
802
  batch_file_path_details = f"{file_name}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
803
  row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
804
 
 
 
805
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
806
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
807
 
808
+ with open(whole_conversation_path, "w", encoding='utf-8', errors='replace') as f:
809
+ f.write(whole_conversation_str)
810
 
811
  with open(whole_conversation_path_meta, "w", encoding='utf-8', errors='replace') as f:
812
  f.write(whole_conversation_metadata_str)
 
837
  topic_with_response_df["Response References"] = topic_with_response_df["Response References"].astype(str).str.replace(".0", "", regex=False)
838
 
839
  # Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
840
+ topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].astype(str).str.strip().str.lower().str.capitalize()
841
+ topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].astype(str).str.strip().str.lower().str.capitalize()
842
+ topic_with_response_df["Sentiment"] = topic_with_response_df["Sentiment"].astype(str).str.strip().str.lower().str.capitalize()
843
 
844
  topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
845
 
 
898
  except Exception as e:
899
  print("Could not convert Response References column to integer due to", e)
900
 
901
+ out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
902
+
903
+ # Each topic should only be associated with each individual response once
904
+ out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
905
 
906
  # Save the new DataFrame to CSV
907
  reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
 
925
  # Get count of rows that refer to particular topics
926
  reference_counts = out_reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
927
  'Response References': 'size', # Count the number of references
928
+ 'Summary': ' <br> '.join
 
 
929
  }).reset_index()
930
 
931
  # Join the counts to existing_unique_topics_df
 
1045
  # Check if files and text exist
1046
  out_message = "Please enter a data file to summarise."
1047
  print(out_message)
1048
+ raise Exception(out_message)
1049
+ #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1050
 
1051
 
1052
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
 
1054
 
1055
  # If this is the first time around, set variables to 0/blank
1056
  if first_loop_state==True:
1057
+ #print("This is the first time through the loop")
1058
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1059
  latest_batch_completed = 0
1060
  out_message = []
1061
  out_file_paths = []
1062
+ #print("model_choice_clean:", model_choice_clean)
1063
 
1064
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1065
  progress(0.1, "Loading in Gemma 2b model")
1066
  local_model, tokenizer = load_model()
1067
  print("Local model loaded:", local_model)
1068
 
1069
+ #print("latest_batch_completed at start of function:", str(latest_batch_completed))
1070
+ #print("total number of batches:", str(num_batches))
1071
 
1072
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1073
  if latest_batch_completed >= num_batches:
 
1111
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1112
  out_file_paths.append(reference_table_out_path)
1113
 
1114
+ # Create final unique topics table from reference table to ensure consistent numbers
1115
+ final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
1116
+
1117
  ## Unique topic list
1118
+ final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1119
  out_file_paths.append(unique_topics_df_out_path)
1120
 
1121
  # Ensure that we are only returning the final results to outputs
 
1150
 
1151
  # Step 3: Populate the new DataFrame
1152
  missing_df['Response References'] = missing_references['Reference']
1153
+ missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
1154
 
1155
  # Display the new DataFrame
1156
  #print("missing_df:", missing_df)
 
1163
  log_files_output_paths = list(set(log_files_output_paths))
1164
 
1165
  summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1166
+
1167
+ # The topic table that can be modified does not need the summary column
1168
+ modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
1169
 
1170
  #final_out_message = '\n'.join(out_message)
1171
+ return display_table, existing_topics_table, final_out_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1172
 
1173
 
1174
  if num_batches > 0:
 
1189
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1190
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1191
  print(out_message)
1192
+ raise Exception(out_message)
1193
+ #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1194
 
1195
 
1196
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
 
1201
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1202
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1203
1204
  for i in topics_loop:
1205
  #for latest_batch_completed in range(num_batches):
1206
  reported_batch_no = latest_batch_completed + 1
 
1210
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
1211
  #log_files_output_paths.append(simplified_csv_table_path)
1212
1213
  # Conversation history
1214
  conversation_history = []
1215
 
 
1218
  # If the latest batch of responses contains at least one instance of text
1219
  if not batch_basic_response_df.empty:
1220
 
1221
+ #print("latest_batch_completed:", latest_batch_completed)
1222
 
1223
  #print("candidate_topics:", candidate_topics)
1224
 
 
1239
  # Preparing candidate topics
1240
  if candidate_topics and existing_unique_topics_df.empty:
1241
  progress(0.1, "Creating revised zero shot topics table")
1242
+
1243
  # 'Zero shot topics' are those supplied by the user
1244
  max_topic_no = 120
 
1245
  zero_shot_topics = read_file(candidate_topics.name)
1246
 
1247
+ # Max 120 topics allowed
1248
+ if zero_shot_topics.shape[0] > max_topic_no:
1249
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1250
+ zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1251
+
1252
+ # Forward slashes in the topic names seem to confuse the model
1253
+ if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1254
+ for x in zero_shot_topics.columns:
1255
+ zero_shot_topics.loc[:, x] = (
1256
+ zero_shot_topics.loc[:, x]
1257
+ .str.strip()
1258
+ .str.replace('\n', ' ')
1259
+ .str.replace('\r', ' ')
1260
+ .str.replace('/', ' or ')
1261
+ .str.lower()
1262
+ .str.capitalize())
1263
+
1264
+ # If number of columns is 1, keep only subtopics
1265
+ if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
1266
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1267
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1268
+ # Allow for possibility that the user only wants to set general topics and not subtopics
1269
+ elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
1270
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1271
+ zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1272
+ # If general topic and subtopic are specified
1273
+ elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1274
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1275
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1276
+ # If number of columns is 2, keep general topics and subtopics
1277
+ elif zero_shot_topics.shape[1] == 2:
1278
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1279
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1280
+ else:
1281
+ # If there are more columns, just assume that the first column was meant to be a subtopic
1282
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1283
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1284
 
1285
+
1286
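For illustration, hypothetical candidate-topic files and how the branches above treat them (values are also stripped, slashes replaced with "or", and sentence-cased before use):

```python
import pandas as pd

# One unnamed column: the values become Subtopics, General Topic is left blank.
pd.DataFrame({"Topics": ["Bus services", "Cycle lanes"]})

# A single "General Topic" column: only general topics are fixed, Subtopics are left blank.
pd.DataFrame({"General Topic": ["Transport", "Housing"]})

# "General Topic" and "Subtopic" columns: both are used as supplied.
pd.DataFrame({"General Topic": ["Transport"], "Subtopic": ["Bus services"]})
```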
 
1287
  if create_revised_general_topics == True:
1288
  # Create the most up to date list of topics and subtopics.
1289
  # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1290
+ unique_topics_df = pd.DataFrame(data={
1291
+ "General Topic":zero_shot_topics_gen_topics_list,
1292
+ "Subtopic":zero_shot_topics_subtopics_list
1293
+ })
1294
  unique_topics_markdown = unique_topics_df.to_markdown()
1295
 
1296
  print("unique_topics_markdown:", unique_topics_markdown)
 
1317
  zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1318
  zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1319
  out_file_paths.append(zero_shot_revised_path)
1320
+
1321
  except Exception as e:
1322
+ print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1323
+ zero_shot_topics_df = pd.DataFrame(data={
1324
+ "General Topic":zero_shot_topics_gen_topics_list,
1325
+ "Subtopic":zero_shot_topics_subtopics_list})
1326
 
1327
  if zero_shot_topics_df.empty:
1328
  print("Creation of revised general topics df failed, reverting to original list")
1329
+ zero_shot_topics_df = pd.DataFrame(data={
1330
+ "General Topic":zero_shot_topics_gen_topics_list,
1331
+ "Subtopic":zero_shot_topics_subtopics_list})
1332
  else:
1333
+ zero_shot_topics_df = pd.DataFrame(data={
1334
+ "General Topic":zero_shot_topics_gen_topics_list,
1335
+ "Subtopic":zero_shot_topics_subtopics_list})
1336
+
1337
+ print("Zero shot topics are:", zero_shot_topics_df)
1338
 
1339
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1340
  if not existing_unique_topics_df.empty:
 
1342
  else:
1343
  existing_unique_topics_df = zero_shot_topics_df
1344
 
 
 
 
 
 
 
 
 
 
 
 
 
1345
  if candidate_topics and not zero_shot_topics_df.empty:
1346
  # If you have already created revised zero shot topics, concat to the current
1347
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
 
1351
  #all_topic_tables_df_merged = existing_unique_topics_df
1352
  existing_unique_topics_df["Response References"] = ""
1353
 
1354
+ unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1355
 
1356
  #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1357
 
 
1386
  summary_whole_conversation = []
1387
 
1388
  # Process requests to large language model
1389
+ responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1390
 
1391
+ # print("responses:", responses[-1].text)
1392
  # print("Whole conversation metadata:", whole_conversation_metadata)
1393
 
1394
+ topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1395
+
1396
+ # Write final output to text file for logging purposes
1397
+ try:
1398
+ final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1399
+
1400
+ if isinstance(responses[-1], ResponseObject):
1401
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1402
+ f.write(responses[-1].text)
1403
+ elif "choices" in responses[-1]:
1404
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1405
+ f.write(responses[-1]["choices"][0]['text'])
1406
+ else:
1407
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1408
+ f.write(responses[-1].text)
1409
+
1410
+ except Exception as e:
1411
+ print("Error in returning model response:", e)
1412
+
1413
 
1414
  # If error in table parsing, leave function
1415
  if is_error == True:
1416
  final_message_out = "Could not complete summary, error in LLM output."
1417
+ raise Exception(final_message_out)
1418
+ #return display_table, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1419
 
1420
  # Write outputs to csv
1421
  ## Topics with references
 
1435
  #all_topic_tables_df.append(new_topic_df)
1436
  #all_markdown_topic_tables.append(new_markdown_table)
1437
 
1438
+ #display_table = responses[-1].text
1439
 
1440
  # Show unique topics alongside document counts as output
1441
  display_table = new_unique_topics_df.to_markdown(index=False)
 
1460
  if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1461
  print("Using Gemini model:", model_choice)
1462
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1463
+ elif model_choice in ["gemma_2b_it_local"]:
1464
+ print("Using local Gemma 2b model")
1465
  else:
1466
  print("Using AWS Bedrock model:", model_choice)
1467
 
 
1539
  log_files_output_paths.append(final_table_output_path)
1540
 
1541
  except Exception as e:
1542
+ print("Error in returning model response:", e)
1543
 
1544
  new_topic_df = topic_table_df
1545
  new_reference_df = reference_df
 
1566
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1567
  existing_topics_table = new_topic_df.dropna(how='all')
1568
 
1569
+ # The topic table that can be modified does not need the summary column
1570
+ modifiable_unique_topics_df = existing_unique_topics_df.drop("Summary", axis=1)
1571
+
1572
  out_time = f"{final_time:0.1f} seconds."
1573
 
1574
  out_message.append('All queries successfully completed in')
 
1578
 
1579
  print(final_message_out)
1580
 
 
 
1581
 
1582
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1583
 
1584
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1585
 
 
1612
  return pivot_table
1613
 
1614
 
1615
+ def join_modified_topic_names_to_ref_table(modified_unique_topics_df:pd.DataFrame, original_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame):
1616
+ '''
1617
+ Take a unique topic table that has been modified by the user, and apply the topic name changes to the long-form reference table.
1618
+ '''
1619
+
1620
+ # Drop rows where Response References is either NA or null
1621
+ modified_unique_topics_df = modified_unique_topics_df[~modified_unique_topics_df["Response References"].isnull()]
1622
+ modified_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment", "Topic_number"], inplace=True)
1623
+
1624
+ # First, join the modified topics to the original topics dataframe based on index to have the modified names alongside the original names
1625
+ original_unique_topics_df_m = original_unique_topics_df.merge(modified_unique_topics_df[["General Topic", "Subtopic", "Sentiment", "Topic_number"]], on="Topic_number", how="left", suffixes=("", "_mod"))
1626
+
1627
+ original_unique_topics_df_m.to_csv(output_folder + "original_unique_topics_df_m.csv")
1628
+
1629
+ original_unique_topics_df_m.drop_duplicates(["General Topic", "Subtopic", "Sentiment", "Topic_number"], inplace=True)
1630
+
1631
+ reference_df.to_csv(output_folder + "before_join_reference_df.csv")
1632
+
1633
+
1634
+ # Then, join these new topic names onto the reference_df, merge based on the original names
1635
+ modified_reference_df = reference_df.merge(original_unique_topics_df_m[["Topic_number", "General Topic_mod", "Subtopic_mod", "Sentiment_mod"]], on=["Topic_number"], how="left")
1636
+
1637
+ modified_reference_df.to_csv(output_folder + "modified_reference_df.csv")
1638
+
1639
+ # Replace old topic names with new topic names in reference_df
1640
+ # modified_reference_df.rename(columns={"General Topic":"General Topic_old",
1641
+ # "Subtopic":"Subtopic_old",
1642
+ # "Sentiment":"Sentiment_old"}, inplace=True)
1643
+
1644
+ modified_reference_df.drop(["General Topic", "Subtopic", "Sentiment"], axis=1, inplace=True, errors="ignore")
1645
+
1646
+ modified_reference_df.rename(columns={"General Topic_mod":"General Topic",
1647
+ "Subtopic_mod":"Subtopic",
1648
+ "Sentiment_mod":"Sentiment"}, inplace=True)
1649
+
1650
+ modified_reference_df.drop(["General Topic_mod", "Subtopic_mod", "Sentiment_mod"], inplace=True, errors="ignore")
1651
+
1652
+
1653
+ #modified_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
1654
+
1655
+ modified_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
1656
+
1657
+ modified_reference_df.to_csv(output_folder + "test_out_ref_df.csv")
1658
+
1659
+ modified_reference_df = modified_reference_df.loc[:, ["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Topic_number"]]
1660
+
1661
+ # Drop rows where Response References is either NA or null
1662
+ modified_reference_df = modified_reference_df[~modified_reference_df["Response References"].isnull()]
1663
+
1664
+ return modified_reference_df
1665
+
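The join above rests on a single merge with suffixes: the user-edited unique table is matched to the original on `Topic_number`, so each old name sits next to its edited replacement before the `_mod` columns are renamed back. A toy sketch of that step, assuming the same column names:

```python
import pandas as pd

original = pd.DataFrame({
    "General Topic": ["Transport"], "Subtopic": ["Buses"],
    "Sentiment": ["Negative"], "Topic_number": [1],
})
edited = pd.DataFrame({
    "General Topic": ["Transport"], "Subtopic": ["Bus services"],  # user renamed the subtopic
    "Sentiment": ["Negative"], "Topic_number": [1],
})

# "_mod" columns hold the user's edited names alongside the originals
joined = original.merge(edited, on="Topic_number", how="left", suffixes=("", "_mod"))
print(joined[["Subtopic", "Subtopic_mod"]])
```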
1666
+ # MODIFY EXISTING TABLE
1667
+ def modify_existing_output_tables(original_unique_topics_df:pd.DataFrame, modifiable_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame, text_output_file_list_state:List[str]) -> Tuple:
1668
+ '''
1669
+ Take a unique_topics table that has been modified, apply these new topic names to the long-form reference_df, and save both tables to file.
1670
+ '''
1671
+
1672
+ reference_file_path = os.path.basename([x for x in text_output_file_list_state if 'reference' in x][0])
1673
+ unique_table_file_path = os.path.basename([x for x in text_output_file_list_state if 'unique' in x][0])
1674
+
1675
+ print("reference_file_path:", reference_file_path)
1676
+
1677
+ output_file_list = []
1678
+
1679
+ if reference_file_path and unique_table_file_path:
1680
+
1681
+ reference_df = join_modified_topic_names_to_ref_table(modifiable_unique_topics_df, original_unique_topics_df, reference_df)
1682
+
1683
+ ## Reference table mapping response numbers to topics
1684
+ reference_table_file_name = reference_file_path.replace(".csv", "_mod")
1685
+ new_reference_df_file_path = output_folder + reference_table_file_name + ".csv"
1686
+ reference_df.to_csv(new_reference_df_file_path, index=None)
1687
+ output_file_list.append(new_reference_df_file_path)
1688
+
1689
+ # Drop rows where Response References is NA or null
1690
+ modifiable_unique_topics_df = modifiable_unique_topics_df[~modifiable_unique_topics_df["Response References"].isnull()]
1691
+
1692
+ # Convert 'Response References' to numeric (forcing errors to NaN if conversion fails)
1693
+ modifiable_unique_topics_df["Response References"] = pd.to_numeric(
1694
+ modifiable_unique_topics_df["Response References"], errors='coerce'
1695
+ )
1696
 
1697
+ # Drop any rows where conversion failed (original non-numeric values)
1698
+ modifiable_unique_topics_df.dropna(subset=["Response References"], inplace=True)
1699
+
1700
+ # Sort values
1701
+ modifiable_unique_topics_df.sort_values(["Response References"], ascending=False, inplace=True)
1702
+
1703
+ unique_table_file_name = unique_table_file_path.replace(".csv", "_mod")
1704
+ modified_unique_table_file_path = output_folder + unique_table_file_name + ".csv"
1705
+ modifiable_unique_topics_df.to_csv(modified_unique_table_file_path, index=None)
1706
+ output_file_list.append(modified_unique_table_file_path)
1707
+
1708
+ else:
1709
+ output_file_list = text_output_file_list_state
1710
+ reference_table_file_name = reference_file_path
1711
+ unique_table_file_name = unique_table_file_path
1712
+ raise Exception("Reference and unique topic tables not found.")
1713
+
1714
+ # Outputs for markdown table output
1715
+ unique_table_df_revised_display = modifiable_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1716
+ deduplicated_unique_table_markdown = unique_table_df_revised_display.to_markdown(index=False)
1717
+
1718
+
1719
+ return modifiable_unique_topics_df, reference_df, output_file_list, output_file_list, output_file_list, output_file_list, reference_table_file_name, unique_table_file_name, deduplicated_unique_table_markdown
1720
+
1721
+
1722
+ # DEDUPLICATION/SUMMARISATION FUNCTIONS
1723
+ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, general_topic_series: pd.Series = None, merge_general_topics = "No", merge_sentiment:str="No", threshold: float = 90) -> pd.DataFrame:
1724
  """
1725
  Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold,
1726
  merging smaller topics into larger topics.
 
1737
  # Count occurrences of each category in the reference_df
1738
  category_counts = reference_df['Subtopic'].value_counts().to_dict()
1739
 
1740
+ # Initialize dictionaries for both category mapping and scores
1741
  deduplication_map = {}
1742
+ match_scores = {} # New dictionary to store match scores
1743
 
1744
  # First pass: Handle exact matches
1745
  for category in category_series.unique():
 
1754
  most_common = max(match_counts.items(), key=lambda x: x[1])[0]
1755
  most_common_category = category_series[most_common]
1756
 
1757
+ # Map all exact matches to the most common variant and store score
1758
  for match in exact_matches:
1759
  deduplication_map[category_series[match]] = most_common_category
1760
+ match_scores[category_series[match]] = 100 # Exact matches get score of 100
1761
 
1762
  # Second pass: Handle fuzzy matches for remaining categories
1763
+ # Create a DataFrame to maintain the relationship between categories and general topics
1764
+ categories_df = pd.DataFrame({
1765
+ 'category': category_series,
1766
+ 'general_topic': general_topic_series
1767
+ }).drop_duplicates()
1768
+
1769
+ for _, row in categories_df.iterrows():
1770
+ category = row['category']
1771
  if category in deduplication_map:
1772
  continue
1773
 
1774
+ current_general_topic = row['general_topic']
1775
+
1776
+ # Filter potential matches to only those within the same General Topic if relevant
1777
+ if merge_general_topics == "No":
1778
+ potential_matches = categories_df[
1779
+ (categories_df['category'] != category) &
1780
+ (categories_df['general_topic'] == current_general_topic)
1781
+ ]['category'].tolist()
1782
+ else:
1783
+ potential_matches = categories_df[
1784
+ (categories_df['category'] != category)
1785
+ ]['category'].tolist()
1786
+
1787
  matches = process.extract(category,
1788
+ potential_matches,
1789
+ scorer=fuzz.WRatio,
1790
  score_cutoff=threshold)
1791
 
1792
+ if matches:
1793
+ best_match = max(matches, key=lambda x: x[1])
1794
+ match, score, _ = best_match
1795
 
 
1796
  if category_counts.get(category, 0) < category_counts.get(match, 0):
1797
+ deduplication_map[category] = match
1798
+ match_scores[category] = score
1799
  else:
1800
+ deduplication_map[match] = category
1801
+ match_scores[match] = score
1802
  else:
1803
+ deduplication_map[category] = category
1804
+ match_scores[category] = 100
1805
+
1806
+ # Create the result DataFrame with scores
1807
+ result_df = pd.DataFrame({
1808
+ 'old_category': category_series + " | " + join_series,
1809
+ 'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x)),
1810
+ 'match_score': category_series.map(lambda x: match_scores.get(x, 100)) # Add scores column
1811
+ })
1812
+
1813
+ #print(result_df)
 
 
1814
 
1815
  return result_df
1816
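A minimal, self-contained illustration of the fuzzy step used in `deduplicate_categories`, assuming the `rapidfuzz` library provides the `process` and `fuzz` objects used in this module (made-up categories and counts):

```python
from rapidfuzz import fuzz, process

categories = ["Bus services", "Bus service", "Cycle lanes"]
counts = {"Bus services": 12, "Bus service": 3, "Cycle lanes": 7}

category = "Bus service"
candidates = [c for c in categories if c != category]

# (choice, score, index) tuples above the cutoff, best first
matches = process.extract(category, candidates, scorer=fuzz.WRatio, score_cutoff=90)
if matches:
    match, score, _ = max(matches, key=lambda x: x[1])
    # Smaller topics are merged into larger ones, mirroring the logic above
    merged_into = match if counts[category] < counts[match] else category
    print(category, "->", merged_into, f"(score {score:.0f})")
```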
 
 
1818
  unique_topics_df:pd.DataFrame,
1819
  reference_table_file_name:str,
1820
  unique_topics_table_file_name:str,
1821
+ in_excel_sheets:str="",
1822
  merge_sentiment:str= "No",
1823
  merge_general_topics:str="No",
1824
+ score_threshold:int=90,
1825
+ in_data_files:List[str]=[],
1826
  chosen_cols:List[str]="",
1827
  deduplicate_topics:str="Yes"
1828
  ):
 
1831
  '''
1832
  output_files = []
1833
  log_output_files = []
1834
+ file_data = pd.DataFrame()
1835
 
1836
+ reference_table_file_name_no_ext = reference_table_file_name #get_file_name_no_ext(reference_table_file_name)
1837
+ unique_topics_table_file_name_no_ext = unique_topics_table_file_name #get_file_name_no_ext(unique_topics_table_file_name)
1838
 
1839
+ # For checking that data is not lost during the process
1840
+ initial_unique_references = len(reference_df["Response References"].unique())
1841
 
1842
+ if unique_topics_df.empty:
1843
+ unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
 
 
 
1844
 
1845
+ # Then merge the topic numbers back to the original dataframe
1846
+ reference_df = reference_df.merge(
1847
+ unique_topics_df[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
1848
+ on=['General Topic', 'Subtopic', 'Sentiment'],
1849
+ how='left'
1850
+ )
1851
 
1852
+ if in_data_files and chosen_cols:
1853
+ file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets)
1854
+ else:
1855
+ out_message = "No file data found, pivot table output will not be created."
1856
+ print(out_message)
1857
+ #raise Exception(out_message)
1858
 
1859
+ # Run through this x times to try to get all duplicate topics
1860
+ if deduplicate_topics == "Yes":
1861
+ for i in range(0, 8):
1862
+ if merge_sentiment == "No":
1863
+ if merge_general_topics == "No":
1864
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1865
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1866
+
1867
+ deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment"]).apply(
1868
+ lambda group: deduplicate_categories(
1869
+ group["Subtopic"],
1870
+ group["Sentiment"],
1871
+ reference_df,
1872
+ general_topic_series=group["General Topic"],
1873
+ merge_general_topics="No",
1874
+ threshold=score_threshold
1875
+ )
1876
+ ).reset_index(drop=True)
1877
+ else:
1878
+ # This case should allow cross-topic matching but is still grouping by Sentiment
1879
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1880
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1881
+
1882
+ deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
1883
+ lambda group: deduplicate_categories(
1884
+ group["Subtopic"],
1885
+ group["Sentiment"],
1886
+ reference_df,
1887
+ general_topic_series=None, # Set to None to allow cross-topic matching
1888
+ merge_general_topics="Yes",
1889
+ threshold=score_threshold
1890
+ )
1891
+ ).reset_index(drop=True)
1892
  else:
1893
+ if merge_general_topics == "No":
1894
+ # Update this case to maintain general topic boundaries
1895
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1896
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1897
+
1898
+ deduplicated_topic_map_df = reference_df_unique.groupby("General Topic").apply(
1899
+ lambda group: deduplicate_categories(
1900
+ group["Subtopic"],
1901
+ group["Sentiment"],
1902
+ reference_df,
1903
+ general_topic_series=group["General Topic"],
1904
+ merge_general_topics="No",
1905
+ merge_sentiment=merge_sentiment,
1906
+ threshold=score_threshold
1907
+ )
1908
+ ).reset_index(drop=True)
1909
+ else:
1910
+ # For complete merging across all categories
1911
+ reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
1912
+ reference_df_unique = reference_df.drop_duplicates("old_category")
1913
+
1914
+ deduplicated_topic_map_df = deduplicate_categories(
1915
+ reference_df_unique["Subtopic"],
1916
+ reference_df_unique["Sentiment"],
1917
+ reference_df,
1918
+ general_topic_series=None, # Set to None to allow cross-topic matching
1919
+ merge_general_topics="Yes",
1920
+ merge_sentiment=merge_sentiment,
1921
+ threshold=score_threshold
1922
+ ).reset_index(drop=True)
1923
 
1924
  if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
1925
  # Check if 'deduplicated_category' contains any values
 
1928
  else:
1929
  # Join deduplicated columns back to original df
1930
  #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
1931
+
1932
  # Remove rows where 'deduplicated_category' is blank or NaN
1933
+ deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
1934
 
1935
  deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
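The merge-back of this topic map onto reference_df is collapsed out of the diff at this point. A hypothetical version of that step, assuming the "Subtopic | Sentiment" naming used for old_category above, might look like:

```python
# Hypothetical merge-back step (the actual lines are folded out of this diff view).
reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")

# Where a deduplicated name was found, split "Subtopic | Sentiment" back into its columns
has_match = reference_df["deduplicated_category"].notna() & \
            (reference_df["deduplicated_category"].str.strip() != "")
if has_match.any():
    split_cols = reference_df.loc[has_match, "deduplicated_category"].str.split(" | ", regex=False, expand=True)
    reference_df.loc[has_match, "Subtopic"] = split_cols[0]
    reference_df.loc[has_match, "Sentiment"] = split_cols[1]
```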
1936
 
 
1951
 
1952
  reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
1953
 
1954
+ #reference_df["General Topic"] = reference_df["General Topic"].str.lower().str.capitalize()
1955
+ #reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
1956
+ #reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
1957
 
1958
  if merge_general_topics == "Yes":
1959
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
 
1987
 
1988
  reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
1989
 
1990
+ # Update reference summary column with all summaries
1991
+ reference_df["Summary"] = reference_df.groupby(
1992
+ ["Response References", "General Topic", "Subtopic", "Sentiment"]
1993
+ )["Summary"].transform(' <br> '.join)
1994
+
1995
+ # Check that we have not inadvertently removed any data during the above process
1996
+ end_unique_references = len(reference_df["Response References"].unique())
1997
+
1998
+ if initial_unique_references != end_unique_references:
1999
+ raise Exception(f"Number of unique references changed during processing: Initial={initial_unique_references}, Final={end_unique_references}")
2000
+
2001
+ # Drop duplicates in the reference table - each comment should only have the same topic referred to once
2002
+ reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
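A tiny worked example of the concatenate-then-dedupe pattern used above, with toy data rather than the project's:

```python
# Toy data illustrating the concatenate-then-dedupe pattern above
import pandas as pd

df = pd.DataFrame({
    "Response References": [1, 1, 2],
    "General Topic": ["Transport", "Transport", "Parks"],
    "Subtopic": ["Buses", "Buses", "Playgrounds"],
    "Sentiment": ["Negative", "Negative", "Positive"],
    "Summary": ["Too infrequent", "Often late", "Well maintained"],
})

# Every row in a group gets the same joined summary...
df["Summary"] = df.groupby(
    ["Response References", "General Topic", "Subtopic", "Sentiment"]
)["Summary"].transform(' <br> '.join)

# ...so dropping duplicates leaves one row per response/topic with the combined text
df = df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"])
print(df["Summary"].tolist())  # ['Too infrequent <br> Often late', 'Well maintained']
```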
2003
+
2004
+
2005
  # Remake unique_topics_df based on new reference_df
2006
  unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
2007
 
2008
+ # Then merge the topic numbers back to the original dataframe
2009
+ reference_df = reference_df.merge(
2010
+ unique_topics_df[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
2011
+ on=['General Topic', 'Subtopic', 'Sentiment'],
2012
+ how='left'
2013
+ )
2014
+
2015
+ if not file_data.empty:
2016
+ basic_response_data = get_basic_response_data(file_data, chosen_cols)
2017
+ reference_df_pivot = convert_reference_table_to_pivot_table(reference_df, basic_response_data)
2018
+
2019
+ reference_pivot_file_path = output_folder + reference_table_file_name_no_ext + "_pivot_dedup.csv"
2020
+ reference_df_pivot.to_csv(reference_pivot_file_path, index=None)
2021
 
2022
+ log_output_files.append(reference_pivot_file_path)
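convert_reference_table_to_pivot_table is defined further up in this file; as a simplified sketch of the idea (not the project's implementation), the long reference table is reshaped so each response becomes one row with an indicator per topic:

```python
# Simplified sketch of a long-to-wide pivot over the reference table.
# Not the project's convert_reference_table_to_pivot_table.
import pandas as pd

def reference_to_pivot_sketch(reference_df: pd.DataFrame) -> pd.DataFrame:
    wide = pd.crosstab(
        index=reference_df["Response References"],
        columns=reference_df["General Topic"] + " - " + reference_df["Subtopic"],
    )
    # Cap counts at 1 so each cell just flags whether the topic was raised in that response
    return wide.clip(upper=1).reset_index()
```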
2023
 
2024
+ #reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
2025
+ #unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
2026
 
2027
  reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
2028
  unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
 
2030
  unique_topics_df.to_csv(unique_topics_file_path, index=None)
2031
 
2032
  output_files.append(reference_file_path)
2033
+ output_files.append(unique_topics_file_path)
2034
 
2035
+ # Outputs for markdown table output
2036
+ unique_table_df_revised_display = unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
2037
 
2038
+ deduplicated_unique_table_markdown = unique_table_df_revised_display.to_markdown(index=False)
2039
 
2040
+ return reference_df, unique_topics_df, output_files, log_output_files, deduplicated_unique_table_markdown
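wrap_text comes from the project's helper modules; the display step above amounts to wrapping long cells before rendering the table as markdown for the Gradio UI. A rough stand-in (the real wrap_text may behave differently):

```python
# Rough stand-in for the display step; the real wrap_text helper may differ.
import textwrap
import pandas as pd

def wrap_text_sketch(value, max_text_length: int = 500) -> str:
    text = str(value)[:max_text_length]
    return "<br>".join(textwrap.wrap(text, width=60))

def df_to_display_markdown(df: pd.DataFrame) -> str:
    display_df = df.apply(lambda col: col.map(lambda x: wrap_text_sketch(x, max_text_length=500)))
    # DataFrame.to_markdown needs the 'tabulate' package installed
    return display_df.to_markdown(index=False)
```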
2041
 
2042
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
2043
  unique_topics_df:pd.DataFrame,
 
2053
 
2054
  reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
2055
 
2056
+ if 'Revised summary' in reference_df.columns:
2057
+ out_message = "Summary has already been created for this file"
2058
+ print(out_message)
2059
+ raise Exception(out_message)
2060
+
2061
  for group_keys, reference_df_group in reference_df_grouped:
2062
  #print(f"Group: {group_keys}")
2063
  #print(f"Data: {reference_df_group}")
 
2139
  latest_summary_completed:int = 0,
2140
  out_metadata_str:str = "",
2141
  in_data_files:List[str]=[],
2142
+ in_excel_sheets:str="",
2143
  chosen_cols:List[str]=[],
2144
  log_output_files:list[str]=[],
2145
  summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
 
2154
  local_model = []
2155
  summarised_output_markdown = ""
2156
 
 
 
 
 
 
2157
 
2158
  # Check for data for summarisations
2159
  if not unique_table_df.empty and not reference_table_df.empty:
 
2161
  else:
2162
  out_message = "Please upload a unique topic table and reference table file to continue with summarisation."
2163
  print(out_message)
2164
+ raise Exception(out_message)
2165
+
2166
+ if 'Revised summary' in reference_table_df.columns:
2167
+ out_message = "Summary has already been created for this file"
2168
+ print(out_message)
2169
+ raise Exception(out_message)
2170
+
2171
  # Load in data file and chosen columns if exists to create pivot table later
2172
  if in_data_files and chosen_cols:
2173
+ file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=in_excel_sheets)
2174
  else:
2175
+ out_message = "No file data found, pivot table output will not be created."
2176
  print(out_message)
2177
+ raise Exception(out_message)
2178
+
2179
+
2180
+ all_summaries = summarised_references["Summary"].tolist()
2181
+ length_all_summaries = len(all_summaries)
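latest_summary_completed is the resume point carried in the UI state; the summarisation loop (largely unchanged, so not shown in this diff) appears to work through all_summaries from that index onwards. A toy illustration of the pattern:

```python
# Toy illustration of the resume-by-index pattern (not the app's actual loop).
all_summaries = ["summary one", "summary two", "summary three"]  # toy data
latest_summary_completed = 1  # e.g. one summary was finished before a timeout

def condense(text: str) -> str:
    return text.upper()  # stand-in for the real LLM summarisation call

for idx in range(latest_summary_completed, len(all_summaries)):
    print(condense(all_summaries[idx]))
    latest_summary_completed = idx + 1  # progress marker so a rerun can pick up here
```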
2182
 
2183
  # If all summaries completed, make final outputs
2184
  if latest_summary_completed >= length_all_summaries:
 
2219
  unique_table_df_revised = unique_table_df_revised.loc[unique_table_df_revised["Sentiment"] != "Not Mentioned", :]
2220
  reference_table_df_revised = reference_table_df_revised.loc[reference_table_df_revised["Sentiment"] != "Not Mentioned", :]
2221
 
2222
+
2223
+
2224
 
2225
+ if not file_data.empty:
2226
+ basic_response_data = get_basic_response_data(file_data, chosen_cols)
2227
+ reference_table_df_revised_pivot = convert_reference_table_to_pivot_table(reference_table_df_revised, basic_response_data)
2228
+
2229
+ ### Save pivot file to log area
2230
+ reference_table_df_revised_pivot_path = output_folder + batch_file_path_details + "_summarised_reference_table_pivot_" + model_choice_clean + ".csv"
2231
+ reference_table_df_revised_pivot.to_csv(reference_table_df_revised_pivot_path, index=None)
2232
+
2233
+ log_output_files.append(reference_table_df_revised_pivot_path)
2234
 
2235
  # Save to file
2236
  unique_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_unique_topic_table_" + model_choice_clean + ".csv"
 
2239
  reference_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_reference_table_" + model_choice_clean + ".csv"
2240
  reference_table_df_revised.to_csv(reference_table_df_revised_path, index = None)
2241
 
2242
+ output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])
 
 
 
 
 
 
2243
 
2244
  ###
2245
+ unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
2246
 
2247
  summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
2248
 
tools/prompts.py CHANGED
@@ -5,7 +5,7 @@ initial_table_prompt = """The open text data is shown in the following table tha
5

6
  Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
7
  In the first column identify general topics relevant to responses. Create as many general topics as you can.
8
- In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
8
+ In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
9
  In the third column write the sentiment of the subtopic: {sentiment_choices}.
10
  In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
11
  In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
@@ -29,7 +29,7 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
29

30
  Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
31
  Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
32
- In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible.
32
+ In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty.
33
  In the third column, write the sentiment of the Subtopic: {sentiment_choices}.
34
  In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
35
  In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
@@ -50,6 +50,8 @@ Your task is to make a consolidated summary of the above text. {summary_format}.
50
  Summary:"""
51

52

53
+ ## The following didn't work well in testing and so is not currently used
54
+
55
  create_general_topics_system_prompt = system_prompt
56

57
  create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
@@ -60,6 +62,10 @@ Your task is to create a General Topic name for each Subtopic. The new Topics ta
62
  New Topics table:"""
63

64

65
+
66
+
67
+
68
+
69
  # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
70
  # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
71
  # Summarise the following text in less than {length} words: "{text}"\n
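These prompt strings are plain Python format templates; placeholders such as {sentiment_choices} and {summary_format} are substituted at the call sites in tools/llm_api_call.py. A minimal, illustrative fill (the choice wording here is an assumption, not necessarily the app's exact value):

```python
# Illustrative only - the real prompt assembly happens in tools/llm_api_call.py.
prompt_line = "In the third column write the sentiment of the subtopic: {sentiment_choices}."
print(prompt_line.format(sentiment_choices="Negative, Neutral or Positive"))  # assumed wording
```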