seanpedrickcase committed on
Commit b9301bd · 1 Parent(s): 71fcefe

Upgraded Gradio. More resilient to cases where LLM calls do not return valid markdown tables (will retry with a different temperature). Minor fixes.

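The retry behaviour mentioned above boils down to: call the model, check that the response looks like a markdown table, and if it does not, call again with a slightly higher temperature, up to a fixed number of attempts. A condensed sketch of that pattern (call_model is a placeholder, not the repository's API; the actual implementation is the new call_llm_with_markdown_table_checks function in tools/llm_api_call.py below):

def call_with_table_retries(prompt: str, temperature: float, max_attempts: int = 3) -> str:
    # Retry the LLM call, nudging the temperature up until the output contains a markdown table.
    response_text = ""
    for attempt in range(max_attempts):
        response_text = call_model(prompt, temperature=temperature + 0.1 * attempt)  # placeholder call
        if len(response_text.strip()) > 120 and "|" in response_text:  # crude markdown-table check
            return response_text
    print(f"No valid markdown table after {max_attempts} attempts, returning last response")
    return response_text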
app.py CHANGED
@@ -124,8 +124,8 @@ with app:
124
 
125
  extract_topics_btn = gr.Button("Extract topics", variant="primary")
126
 
127
- display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
128
- text_output_file = gr.File(height=file_input_height, label="Output files")
129
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
130
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
131
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
@@ -250,27 +250,27 @@ with app:
250
  # Tabular data upload
251
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
252
 
253
- extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
254
  success(load_in_data_file,
255
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
256
  success(fn=extract_topics,
257
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
258
- outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
259
 
260
 
261
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
262
- latest_batch_completed.change(fn=extract_topics,
263
- inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
264
- outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
265
- success(fn = reveal_feedback_buttons,
266
- outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
267
 
268
  # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
269
- modification_input_files.upload(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
270
 
271
 
272
  # Modify output table with custom topic names
273
- save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
274
 
275
  # When button pressed, deduplicate data
276
  deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
 
124
 
125
  extract_topics_btn = gr.Button("Extract topics", variant="primary")
126
 
127
+ topic_extraction_output_files = gr.File(height=file_input_height, label="Output files")
128
+ display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
129
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
130
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
131
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 
250
  # Tabular data upload
251
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
252
 
253
+ extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
254
  success(load_in_data_file,
255
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
256
  success(fn=extract_topics,
257
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
258
+ outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
259
 
260
 
261
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
262
+ # latest_batch_completed.change(fn=extract_topics,
263
+ # inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
264
+ # outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
265
+ # success(fn = reveal_feedback_buttons,
266
+ # outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
267
 
268
  # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
269
+ modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
270
 
271
 
272
  # Modify output table with custom topic names
273
+ save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
274
 
275
  # When button pressed, deduplicate data
276
  deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
requirements.txt CHANGED
@@ -1,5 +1,5 @@
  pandas==2.2.3
- gradio==5.18.0
+ gradio==5.20.1
  spaces==0.31.0
  boto3==1.35.71
  pyarrow==18.1.0
requirements_aws.txt CHANGED
@@ -1,5 +1,5 @@
  pandas==2.2.3
- gradio==5.18.0
+ gradio==5.20.1
  spaces==0.31.0
  boto3==1.35.71
  pyarrow==18.1.0
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
  pandas==2.2.3
- gradio==5.18.0
+ gradio==5.20.1
  spaces==0.31.0
  boto3==1.35.71
  pyarrow==18.1.0
tools/helper_functions.py CHANGED
@@ -15,8 +15,11 @@ def empty_output_vars_extract_topics():
      log_files_output_list_state = []
      conversation_metadata_textbox = ""
      estimated_time_taken_number = 0
+     file_data_state = pd.DataFrame()
+     reference_data_file_name_textbox = ""
+     display_topic_table_markdown = ""
 
-     return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number
+     return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown
 
  def empty_output_vars_summarise():
      # Empty output objects before summarising files
@@ -127,7 +130,7 @@ def wrap_text(text:str, max_width=60, max_text_length=None):
 
      # If max_text_length is set, truncate the text and add ellipsis
      if max_text_length and len(text) > max_text_length:
-         return text[:max_text_length] + '...'
+         text = text[:max_text_length] + '...'
 
      text = text.replace('\r\n', '<br>').replace('\n', '<br>')
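With the wrap_text change above, truncation no longer returns early, so the newline-to-<br> conversion still runs on the truncated text. A minimal before/after sketch of just that step (the rest of the function is unchanged and not shown in this diff):

text = "first line\nsecond line"
max_text_length = 14

if max_text_length and len(text) > max_text_length:
    text = text[:max_text_length] + '...'  # new behaviour: keep processing the truncated string

text = text.replace('\r\n', '<br>').replace('\n', '<br>')
print(text)  # "first line<br>sec..." ; the old early return would have left the raw '\n' in place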
tools/llm_api_call.py CHANGED
@@ -30,9 +30,11 @@ class ResponseObject:
30
  self.text = text
31
  self.usage_metadata = usage_metadata
32
 
33
- max_tokens = 4096
34
  timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
35
  number_of_api_retry_attempts = 5
 
 
36
  max_time_for_loop = 99999
37
  batch_size_default = 5
38
  deduplication_threshold = 90
@@ -392,11 +394,6 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
392
  # Clear any existing progress bars
393
  tqdm._instances.clear()
394
 
395
- # Print the full prompt for debugging purposes
396
- #print("full_prompt:", full_prompt)
397
-
398
- #progress_bar = tqdm(range(0,number_of_api_retry_attempts), desc="Calling API with " + str(timeout_wait) + " seconds per retry.", unit="attempts")
399
-
400
  progress_bar = range(0,number_of_api_retry_attempts)
401
 
402
  # Generate the model's response
@@ -473,8 +470,6 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
473
  # Update the conversation history with the new prompt and response
474
  conversation_history.append({'role': 'user', 'parts': [prompt]})
475
 
476
- # output_str = output['choices'][0]['text']
477
-
478
  # Check if is a LLama.cpp model response
479
  # Check if the response is a ResponseObject
480
  if isinstance(response, ResponseObject):
@@ -739,6 +734,70 @@ def convert_response_text_to_markdown_table(response_text:str, table_type:str =
739
 
740
  return out_df, is_error
741
 
 
 
 
742
 
743
  def write_llm_output_and_logs(responses: List[ResponseObject],
744
  whole_conversation: List[str],
@@ -884,7 +943,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
884
  # Create a new DataFrame from the reference data
885
  new_reference_df = pd.DataFrame(reference_data)
886
 
887
- print("new_reference_df:", new_reference_df)
888
 
889
  # Append on old reference data
890
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
@@ -1035,18 +1094,15 @@ def extract_topics(in_data_file,
1035
  llama_cpp_prefix = "<start_of_turn>user\n"
1036
  llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
1037
 
1038
- # Reset output files on each run:
1039
- # out_file_paths = []
1040
-
1041
  # If you have a file input but no file data it hasn't yet been loaded. Load it here.
1042
  if file_data.empty:
1043
  print("No data table found, loading from file")
1044
  try:
1045
- print("in_data_file:", in_data_file)
1046
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
1047
- print("in_colnames:", in_colnames_drop)
1048
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
1049
- print("file_data loaded in:", file_data)
1050
  except:
1051
  # Check if files and text exist
1052
  out_message = "Please enter a data file to summarise."
@@ -1060,7 +1116,7 @@ def extract_topics(in_data_file,
1060
 
1061
  # If this is the first time around, set variables to 0/blank
1062
  if first_loop_state==True:
1063
- #print("This is the first time through the loop")
1064
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1065
  latest_batch_completed = 0
1066
  out_message = []
@@ -1072,527 +1128,511 @@ def extract_topics(in_data_file,
1072
  local_model, tokenizer = load_model()
1073
  print("Local model loaded:", local_model)
1074
 
1075
- #print("latest_batch_completed at start of function:", str(latest_batch_completed))
1076
- #print("total number of batches:", str(num_batches))
 
 
 
 
 
 
1077
 
1078
- # If we have already redacted the last file, return the input out_message and file list to the relevant components
1079
- if latest_batch_completed >= num_batches:
1080
- print("Last batch reached, returning batch:", str(latest_batch_completed))
1081
- # Set to a very high number so as not to mess with subsequent file processing by the user
1082
- #latest_batch_completed = 999
1083
 
1084
- toc = time.perf_counter()
1085
- final_time = (toc - tic) + time_taken
1086
- out_time = f"Everything finished in {final_time} seconds."
1087
- print(out_time)
1088
 
1089
- print("All summaries completed. Creating outputs.")
 
 
 
 
1090
 
1091
- model_choice_clean = model_name_map[model_choice]
1092
- # Example usage
1093
- in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
 
1094
 
1095
- # Need to reduce output file names as full length files may be too long
1096
- file_name = clean_column_name(file_name, max_length=30)
 
1097
 
1098
- # Save outputs for each batch. If master file created, label file as master
1099
- file_path_details = f"{file_name}_col_{in_column_cleaned}"
1100
 
1101
- # Create a pivoted reference table
1102
- existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
1103
 
1104
- # Save the new DataFrame to CSV
1105
- #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1106
- reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1107
- reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1108
- unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1109
- basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1110
 
1111
- # Write outputs to csv
1112
- ## Topics with references
1113
- #new_topic_df.to_csv(topic_table_out_path, index=None)
1114
- #log_files_output_paths.append(topic_table_out_path)
1115
 
1116
- ## Reference table mapping response numbers to topics
1117
- existing_reference_df.to_csv(reference_table_out_path, index=None)
1118
- out_file_paths.append(reference_table_out_path)
 
 
 
 
1119
 
1120
- # Create final unique topics table from reference table to ensure consistent numbers
1121
- final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
 
1122
 
1123
- ## Unique topic list
1124
- final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1125
- out_file_paths.append(unique_topics_df_out_path)
1126
 
1127
- # Ensure that we are only returning the final results to outputs
1128
- out_file_paths = [x for x in out_file_paths if '_final_' in x]
1129
 
1130
- ## Reference table mapping response numbers to topics
1131
- existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
1132
- log_files_output_paths.append(reference_table_out_pivot_path)
1133
 
1134
- ## Create a dataframe for missing response references:
1135
- # Assuming existing_reference_df and file_data are already defined
1136
-
1137
- # Simplify table to just responses column and the Response reference number
1138
-
1139
 
1140
- basic_response_data = get_basic_response_data(file_data, chosen_cols)
1141
 
1142
- #print("basic_response_data:", basic_response_data)
 
 
 
1143
 
1144
- # Save simplified file data to log outputs
1145
- pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
1146
- log_files_output_paths.append(basic_response_data_out_path)
1147
 
 
 
 
 
 
1148
 
1149
- # Step 1: Identify missing references
1150
- #print("basic_response_data:", basic_response_data)
 
 
 
 
1151
 
1152
- missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
 
 
 
1153
 
1154
- # Step 2: Create a new DataFrame with the same columns as existing_reference_df
1155
- missing_df = pd.DataFrame(columns=existing_reference_df.columns)
 
 
 
 
 
1156
 
1157
- # Step 3: Populate the new DataFrame
1158
- missing_df['Response References'] = missing_references['Reference']
1159
- missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
1160
 
1161
- # Display the new DataFrame
1162
- #print("missing_df:", missing_df)
 
 
 
 
1163
 
1164
- missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1165
- missing_df.to_csv(missing_df_out_path, index=None)
1166
- log_files_output_paths.append(missing_df_out_path)
1167
 
1168
- out_file_paths = list(set(out_file_paths))
1169
- log_files_output_paths = list(set(log_files_output_paths))
1170
 
1171
- summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1172
-
1173
- # The topic table that can be modified does not need the summary column
1174
- modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
1175
 
1176
- #final_out_message = '\n'.join(out_message)
1177
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1178
-
1179
-
1180
- if num_batches > 0:
1181
- progress_measure = round(latest_batch_completed / num_batches, 1)
1182
- progress(progress_measure, desc="Querying large language model")
1183
- else:
1184
- progress(0.1, desc="Querying large language model")
1185
 
1186
- # Load file
1187
- # If out message or out_file_paths are blank, change to a list so it can be appended to
1188
- if isinstance(out_message, str):
1189
- out_message = [out_message]
1190
 
1191
- if not out_file_paths:
1192
- out_file_paths = []
1193
-
1194
-
1195
- if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1196
- out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1197
- print(out_message)
1198
- raise Exception(out_message)
1199
- #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1200
-
1201
-
1202
- if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1203
- elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
1204
- elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
1205
- else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1206
-
1207
- topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1208
- topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1209
 
1210
- for i in topics_loop:
1211
- #for latest_batch_completed in range(num_batches):
1212
- reported_batch_no = latest_batch_completed + 1
1213
- print("Running query batch", str(reported_batch_no))
1214
 
1215
- # Call the function to prepare the input table
1216
- simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
1217
- #log_files_output_paths.append(simplified_csv_table_path)
1218
 
1219
- # Conversation history
1220
- conversation_history = []
1221
-
1222
- #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
1223
-
1224
- # If the latest batch of responses contains at least one instance of text
1225
- if not batch_basic_response_df.empty:
1226
-
1227
- #print("latest_batch_completed:", latest_batch_completed)
1228
 
1229
- #print("candidate_topics:", candidate_topics)
 
 
1230
 
1231
- # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
1232
- if latest_batch_completed >= 1 or candidate_topics is not None:
 
 
 
1233
 
1234
- #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
 
 
 
1235
 
1236
- # Prepare Gemini models before query
1237
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1238
- print("Using Gemini model:", model_choice)
1239
- model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1240
- elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
1241
- print("Using AWS Bedrock model:", model_choice)
1242
- else:
1243
- print("Using local model:", model_choice)
1244
-
1245
- # Preparing candidate topics if no topics currently exist
1246
- if candidate_topics and existing_unique_topics_df.empty:
1247
- progress(0.1, "Creating revised zero shot topics table")
1248
-
1249
- # 'Zero shot topics' are those supplied by the user
1250
- max_topic_no = 120
1251
- zero_shot_topics = read_file(candidate_topics.name)
1252
-
1253
- # Max 120 topics allowed
1254
- if zero_shot_topics.shape[0] > max_topic_no:
1255
- print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1256
- zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1257
-
1258
- # Forward slashes in the topic names seems to confuse the model
1259
- if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1260
- for x in zero_shot_topics.columns:
1261
- zero_shot_topics.loc[:, x] = (
1262
- zero_shot_topics.loc[:, x]
1263
- .str.strip()
1264
- .str.replace('\n', ' ')
1265
- .str.replace('\r', ' ')
1266
- .str.replace('/', ' or ')
1267
- .str.lower()
1268
- .str.capitalize())
1269
-
1270
- # If number of columns is 1, keep only subtopics
1271
- if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
1272
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1273
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1274
- # Allow for possibility that the user only wants to set general topics and not subtopics
1275
- elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
1276
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1277
- zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1278
- # If general topic and subtopic are specified
1279
- elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1280
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1281
- zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1282
- # If number of columns is 2, keep general topics and subtopics
1283
- elif zero_shot_topics.shape[1] == 2:
1284
- zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1285
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1286
- else:
1287
- # If there are more columns, just assume that the first column was meant to be a subtopic
1288
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1289
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1290
 
1291
- # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1292
- if force_zero_shot_radio == "Yes":
1293
- zero_shot_topics_gen_topics_list.append("")
1294
- zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
1295
 
1296
- if create_revised_general_topics == True:
1297
- # Create the most up to date list of topics and subtopics.
1298
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1299
- unique_topics_df = pd.DataFrame(data={
1300
- "General Topic":zero_shot_topics_gen_topics_list,
1301
- "Subtopic":zero_shot_topics_subtopics_list
1302
- })
1303
- unique_topics_markdown = unique_topics_df.to_markdown()
1304
 
1305
- print("unique_topics_markdown:", unique_topics_markdown)
1306
-
1307
- formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1308
 
1309
- # Format the general_topics prompt with the topics
1310
- formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1311
 
1312
- if model_choice == "gemma_2b_it_local":
1313
- formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1314
 
1315
- formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1316
 
1317
- whole_conversation = []
 
 
 
 
1318
 
1319
- general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = process_requests(formatted_general_topics_prompt_list, formatted_general_topics_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1320
 
1321
- # Convert response text to a markdown table
1322
- try:
1323
- zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1324
- print("Output revised zero shot topics table is:", zero_shot_topics_df)
1325
 
1326
- zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1327
- zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1328
- out_file_paths.append(zero_shot_revised_path)
 
 
1329
 
1330
- except Exception as e:
1331
- print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1332
- zero_shot_topics_df = pd.DataFrame(data={
1333
- "General Topic":zero_shot_topics_gen_topics_list,
1334
- "Subtopic":zero_shot_topics_subtopics_list})
1335
 
1336
- if zero_shot_topics_df.empty:
1337
- print("Creation of revised general topics df failed, reverting to original list")
1338
- zero_shot_topics_df = pd.DataFrame(data={
1339
- "General Topic":zero_shot_topics_gen_topics_list,
1340
- "Subtopic":zero_shot_topics_subtopics_list})
1341
- else:
1342
- zero_shot_topics_df = pd.DataFrame(data={
1343
- "General Topic":zero_shot_topics_gen_topics_list,
1344
- "Subtopic":zero_shot_topics_subtopics_list})
1345
-
1346
- #print("Zero shot topics are:", zero_shot_topics_df)
1347
-
1348
- # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1349
- if not existing_unique_topics_df.empty:
1350
- existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1351
- else:
1352
- existing_unique_topics_df = zero_shot_topics_df
1353
 
1354
- if candidate_topics and not zero_shot_topics_df.empty:
1355
- # If you have already created revised zero shot topics, concat to the current
1356
- existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
1357
 
1358
- #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1359
 
1360
- #all_topic_tables_df_merged = existing_unique_topics_df
1361
- existing_unique_topics_df["Response References"] = ""
1362
- existing_unique_topics_df.fillna("", inplace=True)
1363
- existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1364
- existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1365
 
1366
- # print("existing_unique_topics_df:", existing_unique_topics_df)
1367
 
1368
- # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1369
- if force_zero_shot_radio == "Yes":
1370
- unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1371
- topic_assignment_prompt = force_existing_topics_prompt
1372
- else:
1373
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1374
- topic_assignment_prompt = allow_new_topics_prompt
1375
-
1376
-
1377
- # Format the summary prompt with the response table and topics
1378
- formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1379
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
1380
-
1381
 
1382
- if model_choice == "gemma_2b_it_local":
1383
- formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1384
- full_prompt = formatted_summary_prompt
1385
- else:
1386
- full_prompt = formatted_system_prompt + formatted_summary_prompt
1387
 
1388
- #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1389
-
1390
- # Define the output file path for the formatted prompt
1391
- formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1392
-
1393
- # Write the formatted prompt to the specified file
1394
- try:
1395
- with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1396
- f.write(full_prompt)
1397
- except Exception as e:
1398
- print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1399
 
1400
- summary_prompt_list = [formatted_summary_prompt]
 
1401
 
1402
- # print("master_summary_prompt_list:", summary_prompt_list[0])
 
1403
 
1404
- summary_conversation_history = []
1405
- summary_whole_conversation = []
1406
 
1407
- # Process requests to large language model
1408
- responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1409
 
1410
- # print("responses:", responses[-1].text)
1411
- # print("Whole conversation metadata:", whole_conversation_metadata)
 
 
1412
 
1413
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
 
 
 
 
1414
 
1415
- # Write final output to text file for logging purposes
1416
- try:
1417
- final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1418
 
1419
- if isinstance(responses[-1], ResponseObject):
1420
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1421
- f.write(responses[-1].text)
1422
- elif "choices" in responses[-1]:
1423
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1424
- f.write(responses[-1]["choices"][0]['text'])
1425
- else:
1426
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1427
- f.write(responses[-1].text)
1428
 
1429
- except Exception as e:
1430
- print("Error in returning model response:", e)
1431
-
1432
 
1433
- # If error in table parsing, leave function
1434
- if is_error == True:
1435
- final_message_out = "Could not complete summary, error in LLM output."
1436
- raise Exception(final_message_out)
1437
- #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1438
 
1439
- # Write outputs to csv
1440
- ## Topics with references
1441
- new_topic_df.to_csv(topic_table_out_path, index=None)
1442
- log_files_output_paths.append(topic_table_out_path)
1443
 
1444
- ## Reference table mapping response numbers to topics
1445
- new_reference_df.to_csv(reference_table_out_path, index=None)
1446
- out_file_paths.append(reference_table_out_path)
 
 
1447
 
1448
- ## Unique topic list
1449
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
 
 
1450
 
1451
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1452
- out_file_paths.append(unique_topics_df_out_path)
1453
-
1454
- # Outputs for markdown table output
1455
- unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1456
- unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
1457
 
1458
- #whole_conversation_metadata.append(whole_conversation_metadata_str)
1459
- whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
1460
-
1461
 
1462
- #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
1463
- #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
1464
 
1465
- out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
1466
- log_files_output_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
1467
 
1468
- #print("out_file_paths at end of loop:", out_file_paths)
 
 
 
 
1469
 
1470
- # If this is the first batch, run this
1471
- else:
1472
- #system_prompt = system_prompt + normalised_simple_markdown_table
1473
-
1474
- # Prepare Gemini models before query
1475
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1476
- print("Using Gemini model:", model_choice)
1477
- model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1478
- elif model_choice in ["gemma_2b_it_local"]:
1479
- print("Using local Gemma 2b model")
1480
- else:
1481
- print("Using AWS Bedrock model:", model_choice)
1482
 
1483
- formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1484
 
1485
- formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
 
 
1486
 
1487
- if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1488
- else: formatted_prompt2 = prompt2
1489
-
1490
- if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1491
- else: formatted_prompt3 = prompt3
1492
 
1493
- if model_choice == "gemma_2b_it_local":
1494
- formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1495
- formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1496
- formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
1497
 
1498
- batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1499
-
1500
- whole_conversation = [formatted_initial_table_system_prompt]
1501
 
1502
- # Process requests to large language model
1503
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
1504
-
1505
- # print("Whole conversation metadata before:", whole_conversation_metadata)
 
 
1506
 
1507
- # print("responses:", responses[-1].text)
1508
- # print("Whole conversation metadata:", whole_conversation_metadata)
 
1509
 
1510
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=True)
 
1511
 
1512
- # If error in table parsing, leave function
1513
- if is_error == True:
1514
- raise Exception("Error in output table parsing")
1515
- # unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1516
-
1517
-
1518
- #all_topic_tables_df.append(topic_table_df)
1519
 
1520
- topic_table_df.to_csv(topic_table_out_path, index=None)
1521
- out_file_paths.append(topic_table_out_path)
1522
 
1523
- reference_df.to_csv(reference_table_out_path, index=None)
1524
- out_file_paths.append(reference_table_out_path)
 
1525
 
1526
- ## Unique topic list
 
 
1527
 
1528
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
1529
 
1530
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1531
- out_file_paths.append(unique_topics_df_out_path)
1532
-
1533
- #all_markdown_topic_tables.append(markdown_table)
1534
 
1535
- whole_conversation_metadata.append(whole_conversation_metadata_str)
1536
- whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
1537
-
1538
- # Write final output to text file also
1539
- try:
1540
- final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1541
-
1542
- if isinstance(responses[-1], ResponseObject):
1543
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1544
- f.write(responses[-1].text)
1545
- unique_table_df_display_table_markdown = responses[-1].text
1546
- elif "choices" in responses[-1]:
1547
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1548
- f.write(responses[-1]["choices"][0]['text'])
1549
- unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1550
- else:
1551
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1552
- f.write(responses[-1].text)
1553
- unique_table_df_display_table_markdown = responses[-1].text
1554
 
1555
- log_files_output_paths.append(final_table_output_path)
1556
 
1557
- except Exception as e:
1558
- print("Error in returning model response:", e)
1559
-
1560
- new_topic_df = topic_table_df
1561
- new_reference_df = reference_df
1562
 
1563
- else:
1564
- print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
1565
 
1566
- # Increase latest file completed count unless we are over the last batch number
1567
- if latest_batch_completed <= num_batches:
1568
- print("Completed batch number:", str(reported_batch_no))
1569
- latest_batch_completed += 1
1570
 
1571
- toc = time.perf_counter()
1572
- final_time = toc - tic
1573
 
1574
- if final_time > max_time_for_loop:
1575
- print("Max time reached, breaking loop.")
1576
- topics_loop.close()
1577
- tqdm._instances.clear()
1578
- break
1579
 
1580
- # Overwrite 'existing' elements to add new tables
1581
- existing_reference_df = new_reference_df.dropna(how='all')
1582
- existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1583
- existing_topics_table = new_topic_df.dropna(how='all')
1584
 
 
 
1585
  # The topic table that can be modified does not need the summary column
1586
- modifiable_unique_topics_df = existing_unique_topics_df.drop("Summary", axis=1)
1587
-
1588
- out_time = f"{final_time:0.1f} seconds."
1589
-
1590
- out_message.append('All queries successfully completed in')
1591
 
1592
- final_message_out = '\n'.join(out_message)
1593
- final_message_out = final_message_out + " " + out_time
1594
 
1595
- print(final_message_out)
1596
 
1597
 
1598
  return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
@@ -1683,12 +1723,22 @@ def join_modified_topic_names_to_ref_table(modified_unique_topics_df:pd.DataFram
1683
  def modify_existing_output_tables(original_unique_topics_df:pd.DataFrame, modifiable_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame, text_output_file_list_state:List[str]) -> Tuple:
1684
  '''
1685
  Take a unique_topics table that has been modified, apply these new topic names to the long-form reference_df, and save both tables to file.
1686
- '''
1687
-
1688
- reference_file_path = os.path.basename([x for x in text_output_file_list_state if 'reference' in x][0])
1689
- unique_table_file_path = os.path.basename([x for x in text_output_file_list_state if 'unique' in x][0])
 
 
 
 
1690
 
1691
- print("reference_file_path:", reference_file_path)
 
1692
 
1693
  output_file_list = []
1694
 
 
30
  self.text = text
31
  self.usage_metadata = usage_metadata
32
 
33
+ max_tokens = 4096 # Maximum number of output tokens
34
  timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
35
  number_of_api_retry_attempts = 5
36
+ # Try up to 3 times to get a valid markdown table response with LLM calls, otherwise retry with temperature changed
37
+ MAX_OUTPUT_VALIDATION_ATTEMPTS = 3
38
  max_time_for_loop = 99999
39
  batch_size_default = 5
40
  deduplication_threshold = 90
 
394
  # Clear any existing progress bars
395
  tqdm._instances.clear()
396
 
 
 
 
 
 
397
  progress_bar = range(0,number_of_api_retry_attempts)
398
 
399
  # Generate the model's response
 
470
  # Update the conversation history with the new prompt and response
471
  conversation_history.append({'role': 'user', 'parts': [prompt]})
472
 
 
 
473
  # Check if is a LLama.cpp model response
474
  # Check if the response is a ResponseObject
475
  if isinstance(response, ResponseObject):
 
734
 
735
  return out_df, is_error
736
 
737
+ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
738
+ system_prompt: str,
739
+ conversation_history: List[dict],
740
+ whole_conversation: List[str],
741
+ whole_conversation_metadata: List[str],
742
+ model: object,
743
+ config: dict,
744
+ model_choice: str,
745
+ temperature: float,
746
+ reported_batch_no: int,
747
+ local_model: object,
748
+ MAX_OUTPUT_VALIDATION_ATTEMPTS: int,
749
+ master:bool=False) -> Tuple[List[ResponseObject], List[dict], List[str], List[str], str]:
750
+ """
751
+ Call the large language model with checks for a valid markdown table.
752
+
753
+ Parameters:
754
+ - batch_prompts (List[str]): A list of prompts to be processed.
755
+ - system_prompt (str): The system prompt.
756
+ - conversation_history (List[dict]): The history of the conversation.
757
+ - whole_conversation (List[str]): The complete conversation including prompts and responses.
758
+ - whole_conversation_metadata (List[str]): Metadata about the whole conversation.
759
+ - model (object): The model to use for processing the prompts.
760
+ - config (dict): Configuration for the model.
761
+ - model_choice (str): The choice of model to use.
762
+ - temperature (float): The temperature parameter for the model.
763
+ - reported_batch_no (int): The reported batch number.
764
+ - local_model (object): The local model to use.
765
+ - MAX_OUTPUT_VALIDATION_ATTEMPTS (int): The maximum number of attempts to validate the output.
766
+ - master (bool, optional): Boolean to determine whether this call is for the master output table.
767
+
768
+ Returns:
769
+ - Tuple[List[ResponseObject], List[dict], List[str], List[str], str]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, the updated whole conversation metadata, and the response text.
770
+ """
771
+
772
+ call_temperature = temperature # This is correct now with the fixed parameter name
773
+
774
+ # Update Gemini config with the temperature settings
775
+ config = ai.GenerationConfig(temperature=call_temperature, max_output_tokens=max_tokens)
776
+
777
+ for attempt in range(MAX_OUTPUT_VALIDATION_ATTEMPTS):
778
+ # Process requests to large language model
779
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(
780
+ batch_prompts, system_prompt, conversation_history, whole_conversation,
781
+ whole_conversation_metadata, model, config, model_choice,
782
+ call_temperature, reported_batch_no, local_model, master=master
783
+ )
784
+
785
+ stripped_response = responses[-1].text.strip()
786
+
787
+ # Check if response meets our criteria (length and contains table)
788
+ if len(stripped_response) > 120 and '|' in stripped_response:
789
+ print(f"Attempt {attempt + 1} produced response with markdown table.")
790
+ break # Success - exit loop
791
+
792
+ # Increase temperature for next attempt
793
+ call_temperature = temperature + (0.1 * (attempt + 1))
794
+ print(f"Attempt {attempt + 1} resulted in invalid table: {stripped_response}. "
795
+ f"Trying again with temperature: {call_temperature}")
796
+
797
+ else: # This runs if no break occurred (all attempts failed)
798
+ print(f"Failed to get valid response after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts")
799
+
800
+ return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
801
 
802
  def write_llm_output_and_logs(responses: List[ResponseObject],
803
  whole_conversation: List[str],
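For orientation, the new call_llm_with_markdown_table_checks helper returns the same tuple as process_requests, so a call with the arguments named in its signature would look like the following (an illustration using the variable names already present in extract_topics, not a line from the commit):

responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(
    batch_prompts, system_prompt, conversation_history, whole_conversation,
    whole_conversation_metadata, model, config, model_choice, temperature,
    reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master=True)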
 
943
  # Create a new DataFrame from the reference data
944
  new_reference_df = pd.DataFrame(reference_data)
945
 
946
+ #print("new_reference_df:", new_reference_df)
947
 
948
  # Append on old reference data
949
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
 
1094
  llama_cpp_prefix = "<start_of_turn>user\n"
1095
  llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
1096
 
 
 
 
1097
  # If you have a file input but no file data it hasn't yet been loaded. Load it here.
1098
  if file_data.empty:
1099
  print("No data table found, loading from file")
1100
  try:
1101
+ #print("in_data_file:", in_data_file)
1102
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
1103
+ #print("in_colnames:", in_colnames_drop)
1104
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
1105
+ #print("file_data loaded in:", file_data)
1106
  except:
1107
  # Check if files and text exist
1108
  out_message = "Please enter a data file to summarise."
 
1116
 
1117
  # If this is the first time around, set variables to 0/blank
1118
  if first_loop_state==True:
1119
+ print("This is the first time through the loop, resetting latest_batch_completed to 0")
1120
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1121
  latest_batch_completed = 0
1122
  out_message = []
 
1128
  local_model, tokenizer = load_model()
1129
  print("Local model loaded:", local_model)
1130
 
1131
+
1132
+
1133
+
1134
+ if num_batches > 0:
1135
+ progress_measure = round(latest_batch_completed / num_batches, 1)
1136
+ progress(progress_measure, desc="Querying large language model")
1137
+ else:
1138
+ progress(0.1, desc="Querying large language model")
1139
 
1140
+ if latest_batch_completed < num_batches:
 
 
 
 
1141
 
1142
+ # Load file
1143
+ # If out message or out_file_paths are blank, change to a list so it can be appended to
1144
+ if isinstance(out_message, str):
1145
+ out_message = [out_message]
1146
 
1147
+ if not out_file_paths:
1148
+ out_file_paths = []
1149
+
1150
+
1151
+ if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1152
+ out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1153
+ print(out_message)
1154
+ raise Exception(out_message)
1155
+ #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1156
+
1157
+
1158
+ if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1159
+ elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
1160
+ elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
1161
+ else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
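+ # The final else is a fallback: any unrecognised sentiment option defaults to the three-way Negative/Neutral/Positive instruction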
1162
+
1163
+ topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1164
+ topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
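+ # The range starts at latest_batch_completed, so re-running this function resumes from the last completed batch rather than starting over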
1165
 
1166
+ for i in topics_loop:
1167
+ #for latest_batch_completed in range(num_batches):
1168
+ reported_batch_no = latest_batch_completed + 1
1169
+ print("Running query batch", str(reported_batch_no))
1170
 
1171
+ # Call the function to prepare the input table
1172
+ simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
1173
+ #log_files_output_paths.append(simplified_csv_table_path)
1174
 
1175
+ # Conversation history
1176
+ conversation_history = []
1177
 
1178
+ #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
 
1179
 
1180
+ # If the latest batch of responses contains at least one instance of text
1181
+ if not batch_basic_response_df.empty:
1182
 
1183
+ # From the second batch onwards, or when the user has supplied an existing list of candidate topics, the model refers back to the current master topic table when assigning topics to the new batch
1184
+ if latest_batch_completed >= 1 or candidate_topics is not None:
1185
 
1186
+ # Prepare Gemini models before query
1187
+ if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1188
+ print("Using Gemini model:", model_choice)
1189
+ model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1190
+ elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
1191
+ print("Using AWS Bedrock model:", model_choice)
1192
+ else:
1193
+ print("Using local model:", model_choice)
1194
+
1195
+ # Preparing candidate topics if no topics currently exist
1196
+ if candidate_topics and existing_unique_topics_df.empty:
1197
+ progress(0.1, "Creating revised zero shot topics table")
1198
+
1199
+ # 'Zero shot topics' are those supplied by the user
1200
+ max_topic_no = 120
1201
+ zero_shot_topics = read_file(candidate_topics.name)
1202
+
1203
+ # Max 120 topics allowed
1204
+ if zero_shot_topics.shape[0] > max_topic_no:
1205
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1206
+ zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1207
+
1208
+ # Forward slashes in the topic names seem to confuse the model
1209
+ if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1210
+ for x in zero_shot_topics.columns:
1211
+ zero_shot_topics.loc[:, x] = (
1212
+ zero_shot_topics.loc[:, x]
1213
+ .str.strip()
1214
+ .str.replace('\n', ' ')
1215
+ .str.replace('\r', ' ')
1216
+ .str.replace('/', ' or ')
1217
+ .str.lower()
1218
+ .str.capitalize())
1219
+
1220
+ # If number of columns is 1, keep only subtopics
1221
+ if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
1222
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1223
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1224
+ # Allow for possibility that the user only wants to set general topics and not subtopics
1225
+ elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
1226
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1227
+ zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1228
+ # If general topic and subtopic are specified
1229
+ elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1230
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1231
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1232
+ # If number of columns is 2, keep general topics and subtopics
1233
+ elif zero_shot_topics.shape[1] == 2:
1234
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1235
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1236
+ else:
1237
+ # If there are more columns, just assume that the first column was meant to be a subtopic
1238
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1239
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
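+ # In short: a single column without a 'General Topic' header is treated as subtopics; a lone 'General Topic' column gives general topics only; two columns (or explicit 'General Topic'/'Subtopic' headers) give both; anything wider falls back to treating the first column as subtopics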
1240
+
1241
+ # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1242
+ if force_zero_shot_radio == "Yes":
1243
+ zero_shot_topics_gen_topics_list.append("")
1244
+ zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
1245
+
1246
+ if create_revised_general_topics == True:
1247
+ # Create the most up to date list of topics and subtopics.
1248
+ # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1249
+ unique_topics_df = pd.DataFrame(data={
1250
+ "General Topic":zero_shot_topics_gen_topics_list,
1251
+ "Subtopic":zero_shot_topics_subtopics_list
1252
+ })
1253
+ unique_topics_markdown = unique_topics_df.to_markdown()
1254
 
1255
+ print("unique_topics_markdown:", unique_topics_markdown)
1256
+
1257
+ formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1258
 
1259
+ # Format the general_topics prompt with the topics
1260
+ formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
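+ # This optional extra LLM call asks the model to suggest a 'General Topic' for each user-supplied subtopic; the result is written out below as zero_shot_topics_with_general_topics.csv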
 
1261
 
1262
+ if model_choice == "gemma_2b_it_local":
1263
+ formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1264
 
1265
+ formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1266
 
1267
+ whole_conversation = []
1268
 
1269
+ general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(formatted_general_topics_prompt_list, formatted_general_topics_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1270
 
1271
+ # Convert response text to a markdown table
1272
+ try:
1273
+ zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1274
+ print("Output revised zero shot topics table is:", zero_shot_topics_df)
1275
 
1276
+ zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1277
+ zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1278
+ out_file_paths.append(zero_shot_revised_path)
1279
 
1280
+ except Exception as e:
1281
+ print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1282
+ zero_shot_topics_df = pd.DataFrame(data={
1283
+ "General Topic":zero_shot_topics_gen_topics_list,
1284
+ "Subtopic":zero_shot_topics_subtopics_list})
1285
 
1286
+ if zero_shot_topics_df.empty:
1287
+ print("Creation of revised general topics df failed, reverting to original list")
1288
+ zero_shot_topics_df = pd.DataFrame(data={
1289
+ "General Topic":zero_shot_topics_gen_topics_list,
1290
+ "Subtopic":zero_shot_topics_subtopics_list})
1291
+ else:
1292
+ zero_shot_topics_df = pd.DataFrame(data={
1293
+ "General Topic":zero_shot_topics_gen_topics_list,
1294
+ "Subtopic":zero_shot_topics_subtopics_list})
1295
+
1296
+
1297
+ # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1298
+ if not existing_unique_topics_df.empty:
1299
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1300
+ else:
1301
+ existing_unique_topics_df = zero_shot_topics_df
1302
+
1303
+ if candidate_topics and not zero_shot_topics_df.empty:
1304
+ # If you have already created revised zero shot topics, concat to the current
1305
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
1306
+
1307
+ #all_topic_tables_df_merged = existing_unique_topics_df
1308
+ existing_unique_topics_df["Response References"] = ""
1309
+ existing_unique_topics_df.fillna("", inplace=True)
1310
+ existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1311
+ existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1312
+
1313
+ # print("existing_unique_topics_df:", existing_unique_topics_df)
1314
+
1315
+ # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1316
+ if force_zero_shot_radio == "Yes":
1317
+ unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1318
+ topic_assignment_prompt = force_existing_topics_prompt
1319
+ else:
1320
+ unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1321
+ topic_assignment_prompt = allow_new_topics_prompt
1322
+
1323
 
1324
+ # Format the summary prompt with the response table and topics
1325
+ formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1326
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
1327
+
1328
 
1329
+ if model_choice == "gemma_2b_it_local":
1330
+ formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1331
+ full_prompt = formatted_summary_prompt
1332
+ else:
1333
+ full_prompt = formatted_system_prompt + formatted_summary_prompt
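+ # For the local Gemma model the system prompt has to be folded into the single llama.cpp-style prompt; for the other models, full_prompt appears to be used only for the prompt log written out below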
1334
+
1335
+ #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1336
 
1337
+ # Define the output file path for the formatted prompt
1338
+ formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
 
1339
 
1340
+ # Write the formatted prompt to the specified file
1341
+ try:
1342
+ with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1343
+ f.write(full_prompt)
1344
+ except Exception as e:
1345
+ print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1346
 
1347
+ summary_prompt_list = [formatted_summary_prompt]
1348
 
1349
+ # print("master_summary_prompt_list:", summary_prompt_list[0])
 
1350
 
1351
+ summary_conversation_history = []
1352
+ summary_whole_conversation = []
1353
 
1354
+ # Process requests to large language model
1355
+ # responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1356
 
1357
+ responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1358
 
1359
+ # print("responses:", responses[-1].text)
1360
+ # print("Whole conversation metadata:", whole_conversation_metadata)
1361
 
1362
+ topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1363
 
1364
+ # Write final output to text file for logging purposes
1365
+ try:
1366
+ final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1367
 
1368
+ if isinstance(responses[-1], ResponseObject):
1369
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1370
+ f.write(responses[-1].text)
1371
+ elif "choices" in responses[-1]:
1372
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1373
+ f.write(responses[-1]["choices"][0]['text'])
1374
+ else:
1375
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1376
+ f.write(responses[-1].text)
1377
 
1378
+ except Exception as e:
1379
+ print("Error in returning model response:", e)
1380
+
1381
 
1382
+ # If error in table parsing, leave function
1383
+ if is_error == True:
1384
+ final_message_out = "Could not complete summary, error in LLM output."
1385
+ raise Exception(final_message_out)
1386
+ #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1387
 
1388
+ # Write outputs to csv
1389
+ ## Topics with references
1390
+ new_topic_df.to_csv(topic_table_out_path, index=None)
1391
+ log_files_output_paths.append(topic_table_out_path)
1392
 
1393
+ ## Reference table mapping response numbers to topics
1394
+ new_reference_df.to_csv(reference_table_out_path, index=None)
1395
+ out_file_paths.append(reference_table_out_path)
1396
 
1397
+ ## Unique topic list
1398
+ new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
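+ # Note that duplicates are dropped on 'Subtopic' alone, so the same subtopic listed under two different general topics keeps only its first occurrence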
1399
 
1400
+ new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1401
+ out_file_paths.append(unique_topics_df_out_path)
1402
+
1403
+ # Outputs for markdown table output
1404
+ unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1405
+ unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
1406
 
1407
+ #whole_conversation_metadata.append(whole_conversation_metadata_str)
1408
+ whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
1409
+
1410
 
1411
+ #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
1412
+ #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
1413
 
1414
+ out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
1415
+ log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
1416
 
1417
+ #print("out_file_paths at end of loop:", out_file_paths)
1418
 
1419
+ # If this is the first batch and no candidate topics were supplied, run this
1420
+ else:
1421
+ #system_prompt = system_prompt + normalised_simple_markdown_table
1422
+
1423
+ # Prepare Gemini models before query
1424
+ if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1425
+ print("Using Gemini model:", model_choice)
1426
+ model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1427
+ elif model_choice in ["gemma_2b_it_local"]:
1428
+ print("Using local Gemma 2b model")
1429
+ else:
1430
+ print("Using AWS Bedrock model:", model_choice)
1431
 
1432
+ formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1433
 
1434
+ formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1435
 
1436
+ if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1437
+ else: formatted_prompt2 = prompt2
1438
+
1439
+ if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1440
+ else: formatted_prompt3 = prompt3
1441
 
1442
+ if model_choice == "gemma_2b_it_local":
1443
+ formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1444
+ formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1445
+ formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
1446
 
1447
+ batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
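+ # Slicing by number_of_prompts_used means that, for example, a value of 1 sends only the initial table prompt; prompts 2 and 3 act as optional follow-up prompts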
1448
+
1449
+ whole_conversation = [formatted_initial_table_system_prompt]
1450
 
1451
+
1452
 
1453
+
1454
 
1455
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
1456
 
 
1457
 
1458
+ topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=True)
1459
 
1460
+ # If error in table parsing, leave function
1461
+ if is_error == True:
1462
+ raise Exception("Error in output table parsing")
1463
+ # unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
 
1464
 
1465
+
1466
+ #all_topic_tables_df.append(topic_table_df)
1467
 
1468
+ topic_table_df.to_csv(topic_table_out_path, index=None)
1469
+ out_file_paths.append(topic_table_out_path)
1470
 
1471
+ reference_df.to_csv(reference_table_out_path, index=None)
1472
+ out_file_paths.append(reference_table_out_path)
1473
 
1474
+ ## Unique topic list
 
1475
 
1476
+ new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
 
1477
 
1478
+ new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1479
+ out_file_paths.append(unique_topics_df_out_path)
1480
+
1481
+ #all_markdown_topic_tables.append(markdown_table)
1482
 
1483
+ whole_conversation_metadata.append(whole_conversation_metadata_str)
1484
+ whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
1485
+
1486
+ # Write final output to text file also
1487
+ try:
1488
+ final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1489
+
1490
+ if isinstance(responses[-1], ResponseObject):
1491
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1492
+ f.write(responses[-1].text)
1493
+ unique_table_df_display_table_markdown = responses[-1].text
1494
+ elif "choices" in responses[-1]:
1495
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1496
+ f.write(responses[-1]["choices"][0]['text'])
1497
+ unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1498
+ else:
1499
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1500
+ f.write(responses[-1].text)
1501
+ unique_table_df_display_table_markdown = responses[-1].text
1502
 
1503
+ log_files_output_paths.append(final_table_output_path)
1504
 
1505
+ except Exception as e:
1506
+ print("Error in returning model response:", e)
1507
+
1508
+ new_topic_df = topic_table_df
1509
+ new_reference_df = reference_df
1510
 
1511
+ else:
1512
+ print("Current batch of responses contains no text, moving on to the next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
 
1513
 
1514
+ # Increase the latest batch completed count unless we are past the last batch number
1515
+ if latest_batch_completed <= num_batches:
1516
+ print("Completed batch number:", str(reported_batch_no))
1517
+ latest_batch_completed += 1
 
1518
 
1519
+ toc = time.perf_counter()
1520
+ final_time = toc - tic
1521
 
1522
+ if final_time > max_time_for_loop:
1523
+ print("Max time reached, breaking loop.")
1524
+ topics_loop.close()
1525
+ tqdm._instances.clear()
1526
+ break
1527
 
1528
+ # Overwrite 'existing' elements to add new tables
1529
+ existing_reference_df = new_reference_df.dropna(how='all')
1530
+ existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1531
+ existing_topics_table = new_topic_df.dropna(how='all')
1532
 
1533
+ # The topic table that can be modified does not need the summary column
1534
+ modifiable_unique_topics_df = existing_unique_topics_df.drop("Summary", axis=1)
1535
 
1536
+ out_time = f"{final_time:0.1f} seconds."
1537
+
1538
+ out_message.append('All queries successfully completed in')
1539
 
1540
+ final_message_out = '\n'.join(out_message)
1541
+ final_message_out = final_message_out + " " + out_time
1542
 
1543
+ print(final_message_out)
 
1544
 
1545
+ # If we have extracted topics from the last batch, return the input out_message and file list to the relevant components
1546
+ if latest_batch_completed >= num_batches:
1547
+ print("Last batch reached, returning batch:", str(latest_batch_completed))
1548
+ # Set to a very high number so as not to mess with subsequent file processing by the user
1549
+ #latest_batch_completed = 999
1550
 
1551
+ toc = time.perf_counter()
1552
+ final_time = (toc - tic) + time_taken
1553
+ out_time = f"Everything finished in {round(final_time,1)} seconds."
1554
+ print(out_time)
1555
 
1556
+ print("All summaries completed. Creating outputs.")
1557
 
1558
+ model_choice_clean = model_name_map[model_choice]
1559
+ # Clean the chosen column name for use in output file names
1560
+ in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
1561
 
1562
+ # Need to shorten output file names as full-length names may be too long
1563
+ file_name = clean_column_name(file_name, max_length=30)
1564
 
1565
+ # Save outputs for each batch. If master file created, label file as master
1566
+ file_path_details = f"{file_name}_col_{in_column_cleaned}"
1567
 
1568
+ # Create a pivoted reference table
1569
+ existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
 
1570
 
1571
+ # Save the new DataFrame to CSV
1572
+ #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1573
+ reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1574
+ reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1575
+ unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1576
+ basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1577
 
1578
+ ## Reference table mapping response numbers to topics
1579
+ existing_reference_df.to_csv(reference_table_out_path, index=None)
1580
+ out_file_paths.append(reference_table_out_path)
1581
 
1582
+ # Create final unique topics table from reference table to ensure consistent numbers
1583
+ final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
1584
 
1585
+ ## Unique topic list
1586
+ final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1587
+ out_file_paths.append(unique_topics_df_out_path)
1588
 
1589
+ # Ensure that we are only returning the final results to outputs
1590
+ out_file_paths = [x for x in out_file_paths if '_final_' in x]
1591
 
1592
+ ## Reference table mapping response numbers to topics
1593
+ existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
1594
+ log_files_output_paths.append(reference_table_out_pivot_path)
1595
 
1596
+ ## Create a dataframe for missing response references:
1597
+ # Assuming existing_reference_df and file_data are already defined
1598
+ # Simplify table to just responses column and the Response reference number
1599
 
1600
+ basic_response_data = get_basic_response_data(file_data, chosen_cols)
1601
 
1602
 
1603
+ # Save simplified file data to log outputs
1604
+ pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
1605
+ log_files_output_paths.append(basic_response_data_out_path)
1606
 
 
1607
 
1608
+ # Step 1: Identify missing references
1609
+ missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
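+ # A response is treated as missing when its 'Reference' number never appears in the reference table's 'Response References' column (both sides are compared as strings)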
1610
 
1611
+ # Step 2: Create a new DataFrame with the same columns as existing_reference_df
1612
+ missing_df = pd.DataFrame(columns=existing_reference_df.columns)
1613
 
1614
+ # Step 3: Populate the new DataFrame
1615
+ missing_df['Response References'] = missing_references['Reference']
1616
+ missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
 
1617
 
1618
+ # Display the new DataFrame
1619
+ #print("missing_df:", missing_df)
1620
 
1621
+ missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1622
+ missing_df.to_csv(missing_df_out_path, index=None)
1623
+ log_files_output_paths.append(missing_df_out_path)
1624
 
1625
+ out_file_paths = list(set(out_file_paths))
1626
+ log_files_output_paths = list(set(log_files_output_paths))
1627
 
1628
+ final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1629
+
1630
  # The topic table that can be modified does not need the summary column
1631
+ modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
1632
 
1633
+ print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
 
1634
 
1635
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
1636
 
1637
 
1638
  return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
 
1723
  def modify_existing_output_tables(original_unique_topics_df:pd.DataFrame, modifiable_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame, text_output_file_list_state:List[str]) -> Tuple:
1724
  '''
1725
  Take a unique_topics table that has been modified, apply these new topic names to the long-form reference_df, and save both tables to file.
1726
+ '''
1727
+
1728
+ # Ensure text_output_file_list_state is a flat list
1729
+ if any(isinstance(i, list) for i in text_output_file_list_state):
1730
+ text_output_file_list_state = [item for sublist in text_output_file_list_state for item in sublist] # Flatten list
1731
+
1732
+ # Extract file paths safely
1733
+ reference_files = [x for x in text_output_file_list_state if 'reference' in x]
1734
+ unique_files = [x for x in text_output_file_list_state if 'unique' in x]
1735
+
1736
+ # Ensure files exist before accessing
1737
+ reference_file_path = os.path.basename(reference_files[0]) if reference_files else None
1738
+ unique_table_file_path = os.path.basename(unique_files[0]) if unique_files else None
1739
 
1740
+ print("Reference File:", reference_file_path)
1741
+ print("Unique Table File:", unique_table_file_path)
1742
 
1743
  output_file_list = []
1744