seanpedrickcase committed on
Commit
71fcefe
·
1 Parent(s): 75d1651

Improved zero shot 'forced' categorisation and prompts

Browse files
Files changed (3) hide show
  1. app.py +10 -9
  2. tools/llm_api_call.py +42 -24
  3. tools/prompts.py +17 -9
app.py CHANGED
@@ -112,10 +112,11 @@ with app:
112
  in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
113
 
114
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
115
- in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
116
 
117
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
118
- candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic titles below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for either or both of these lists to be specified.")
 
119
 
120
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
121
 
@@ -123,7 +124,7 @@ with app:
123
 
124
  extract_topics_btn = gr.Button("Extract topics", variant="primary")
125
 
126
- text_output_summary = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
127
  text_output_file = gr.File(height=file_input_height, label="Output files")
128
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
129
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
@@ -253,14 +254,14 @@ with app:
253
  success(load_in_data_file,
254
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
255
  success(fn=extract_topics,
256
- inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
257
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
258
 
259
 
260
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
261
  latest_batch_completed.change(fn=extract_topics,
262
- inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox],
263
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
264
  success(fn = reveal_feedback_buttons,
265
  outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
266
 
@@ -308,9 +309,9 @@ with app:
308
 
309
  # User submitted feedback
310
  feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
311
- feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], feedback_data_folder)
312
 
313
- data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], None, preprocess=False).\
314
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
315
 
316
  in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
 
112
  in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
113
 
114
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
115
+ in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
116
 
117
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
118
+ candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
119
+ force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
120
 
121
  context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
122
 
 
124
 
125
  extract_topics_btn = gr.Button("Extract topics", variant="primary")
126
 
127
+ display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
128
  text_output_file = gr.File(height=file_input_height, label="Output files")
129
  latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
130
  # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 
254
  success(load_in_data_file,
255
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
256
  success(fn=extract_topics,
257
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
258
+ outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
259
 
260
 
261
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
262
  latest_batch_completed.change(fn=extract_topics,
263
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
264
+ outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
265
  success(fn = reveal_feedback_buttons,
266
  outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
267
 
 
309
 
310
  # User submitted feedback
311
  feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
312
+ feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], feedback_data_folder)
313
 
314
+ data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], None, preprocess=False).\
315
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
316
 
317
  in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
tools/llm_api_call.py CHANGED
@@ -20,7 +20,7 @@ from io import StringIO
20
 
21
  GradioFileData = gr.FileData
22
 
23
- from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
24
  from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
25
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
26
 
@@ -884,7 +884,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
884
  # Create a new DataFrame from the reference data
885
  new_reference_df = pd.DataFrame(reference_data)
886
 
887
- #print("new_reference_df:", new_reference_df)
888
 
889
  # Append on old reference data
890
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
@@ -897,6 +897,9 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
897
  out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
898
  except Exception as e:
899
  print("Could not convert Response References column to integer due to", e)
 
 
 
900
 
901
  out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
902
 
@@ -941,7 +944,7 @@ def extract_topics(in_data_file,
941
  existing_topics_table:pd.DataFrame,
942
  existing_reference_df:pd.DataFrame,
943
  existing_unique_topics_df:pd.DataFrame,
944
- display_table:str,
945
  file_name:str,
946
  num_batches:int,
947
  in_api_key:str,
@@ -966,6 +969,7 @@ def extract_topics(in_data_file,
966
  context_textbox:str="",
967
  time_taken:float = 0,
968
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
 
969
  max_tokens:int=max_tokens,
970
  model_name_map:dict=model_name_map,
971
  max_time_for_loop:int=max_time_for_loop,
@@ -980,7 +984,7 @@ def extract_topics(in_data_file,
980
  - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
981
  - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
982
  - existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
983
- - display_table (str): Table for display in markdown format.
984
  - file_name (str): File name of the data file.
985
  - num_batches (int): Number of batches required to go through all the response rows.
986
  - in_api_key (str): The API key for authentication.
@@ -1004,6 +1008,8 @@ def extract_topics(in_data_file,
1004
  - batch_size (int): The number of data rows to consider in each request.
1005
  - context_textbox (str, optional): A string giving some context to the consultation/task.
1006
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
 
 
1007
  - max_tokens (int): The maximum number of tokens for the model.
1008
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1009
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
@@ -1168,7 +1174,7 @@ def extract_topics(in_data_file,
1168
  modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
1169
 
1170
  #final_out_message = '\n'.join(out_message)
1171
- return display_table, existing_topics_table, final_out_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1172
 
1173
 
1174
  if num_batches > 0:
@@ -1236,7 +1242,7 @@ def extract_topics(in_data_file,
1236
  else:
1237
  print("Using local model:", model_choice)
1238
 
1239
- # Preparing candidate topics
1240
  if candidate_topics and existing_unique_topics_df.empty:
1241
  progress(0.1, "Creating revised zero shot topics table")
1242
 
@@ -1282,7 +1288,10 @@ def extract_topics(in_data_file,
1282
  zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1283
  zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1284
 
1285
-
 
 
 
1286
 
1287
  if create_revised_general_topics == True:
1288
  # Create the most up to date list of topics and subtopics.
@@ -1334,7 +1343,7 @@ def extract_topics(in_data_file,
1334
  "General Topic":zero_shot_topics_gen_topics_list,
1335
  "Subtopic":zero_shot_topics_subtopics_list})
1336
 
1337
- print("Zero shot topics are:", zero_shot_topics_df)
1338
 
1339
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1340
  if not existing_unique_topics_df.empty:
@@ -1350,14 +1359,24 @@ def extract_topics(in_data_file,
1350
 
1351
  #all_topic_tables_df_merged = existing_unique_topics_df
1352
  existing_unique_topics_df["Response References"] = ""
 
 
 
 
 
1353
 
1354
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
 
 
 
 
 
 
1355
 
1356
- #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1357
 
1358
  # Format the summary prompt with the response table and topics
1359
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1360
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, sentiment_choices=sentiment_prompt)
1361
 
1362
 
1363
  if model_choice == "gemma_2b_it_local":
@@ -1415,7 +1434,7 @@ def extract_topics(in_data_file,
1415
  if is_error == True:
1416
  final_message_out = "Could not complete summary, error in LLM output."
1417
  raise Exception(final_message_out)
1418
- #return display_table, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1419
 
1420
  # Write outputs to csv
1421
  ## Topics with references
@@ -1432,13 +1451,9 @@ def extract_topics(in_data_file,
1432
  new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1433
  out_file_paths.append(unique_topics_df_out_path)
1434
 
1435
- #all_topic_tables_df.append(new_topic_df)
1436
- #all_markdown_topic_tables.append(new_markdown_table)
1437
-
1438
- #display_table = responses[-1].text
1439
-
1440
- # Show unique topics alongside document counts as output
1441
- display_table = new_unique_topics_df.to_markdown(index=False)
1442
 
1443
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1444
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
@@ -1496,7 +1511,8 @@ def extract_topics(in_data_file,
1496
 
1497
  # If error in table parsing, leave function
1498
  if is_error == True:
1499
- display_table, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
 
1500
 
1501
 
1502
  #all_topic_tables_df.append(topic_table_df)
@@ -1526,15 +1542,15 @@ def extract_topics(in_data_file,
1526
  if isinstance(responses[-1], ResponseObject):
1527
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1528
  f.write(responses[-1].text)
1529
- display_table = responses[-1].text
1530
  elif "choices" in responses[-1]:
1531
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1532
  f.write(responses[-1]["choices"][0]['text'])
1533
- display_table =responses[-1]["choices"][0]['text']
1534
  else:
1535
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1536
  f.write(responses[-1].text)
1537
- display_table = responses[-1].text
1538
 
1539
  log_files_output_paths.append(final_table_output_path)
1540
 
@@ -1579,7 +1595,7 @@ def extract_topics(in_data_file,
1579
  print(final_message_out)
1580
 
1581
 
1582
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1583
 
1584
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1585
 
@@ -2304,4 +2320,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
2304
  if latest_summary_completed >= length_all_summaries:
2305
  print("At last summary.")
2306
 
 
 
2307
  return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
 
20
 
21
  GradioFileData = gr.FileData
22
 
23
+ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt
24
  from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
25
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
26
 
 
884
  # Create a new DataFrame from the reference data
885
  new_reference_df = pd.DataFrame(reference_data)
886
 
887
+ print("new_reference_df:", new_reference_df)
888
 
889
  # Append on old reference data
890
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
 
897
  out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
898
  except Exception as e:
899
  print("Could not convert Response References column to integer due to", e)
900
+ print("out_reference_df['Response References']:", out_reference_df["Response References"].head())
901
+
902
+ out_reference_df.to_csv(output_folder + "test_output_reference_df.csv")
903
 
904
  out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
905
 
 
944
  existing_topics_table:pd.DataFrame,
945
  existing_reference_df:pd.DataFrame,
946
  existing_unique_topics_df:pd.DataFrame,
947
+ unique_table_df_display_table_markdown:str,
948
  file_name:str,
949
  num_batches:int,
950
  in_api_key:str,
 
969
  context_textbox:str="",
970
  time_taken:float = 0,
971
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
972
+ force_zero_shot_radio:str = "No",
973
  max_tokens:int=max_tokens,
974
  model_name_map:dict=model_name_map,
975
  max_time_for_loop:int=max_time_for_loop,
 
984
  - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
985
  - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
986
  - existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
987
+ - unique_table_df_display_table_markdown (str): Table for display in markdown format.
988
  - file_name (str): File name of the data file.
989
  - num_batches (int): Number of batches required to go through all the response rows.
990
  - in_api_key (str): The API key for authentication.
 
1008
  - batch_size (int): The number of data rows to consider in each request.
1009
  - context_textbox (str, optional): A string giving some context to the consultation/task.
1010
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
1011
+ - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
1012
+ - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
1013
  - max_tokens (int): The maximum number of tokens for the model.
1014
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1015
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
 
1174
  modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
1175
 
1176
  #final_out_message = '\n'.join(out_message)
1177
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1178
 
1179
 
1180
  if num_batches > 0:
 
1242
  else:
1243
  print("Using local model:", model_choice)
1244
 
1245
+ # Preparing candidate topics if no topics currently exist
1246
  if candidate_topics and existing_unique_topics_df.empty:
1247
  progress(0.1, "Creating revised zero shot topics table")
1248
 
 
1288
  zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1289
  zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1290
 
1291
+ # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1292
+ if force_zero_shot_radio == "Yes":
1293
+ zero_shot_topics_gen_topics_list.append("")
1294
+ zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
1295
 
1296
  if create_revised_general_topics == True:
1297
  # Create the most up to date list of topics and subtopics.
 
1343
  "General Topic":zero_shot_topics_gen_topics_list,
1344
  "Subtopic":zero_shot_topics_subtopics_list})
1345
 
1346
+ #print("Zero shot topics are:", zero_shot_topics_df)
1347
 
1348
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1349
  if not existing_unique_topics_df.empty:
 
1359
 
1360
  #all_topic_tables_df_merged = existing_unique_topics_df
1361
  existing_unique_topics_df["Response References"] = ""
1362
+ existing_unique_topics_df.fillna("", inplace=True)
1363
+ existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1364
+ existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1365
+
1366
+ # print("existing_unique_topics_df:", existing_unique_topics_df)
1367
 
1368
+ # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1369
+ if force_zero_shot_radio == "Yes":
1370
+ unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1371
+ topic_assignment_prompt = force_existing_topics_prompt
1372
+ else:
1373
+ unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1374
+ topic_assignment_prompt = allow_new_topics_prompt
1375
 
 
1376
 
1377
  # Format the summary prompt with the response table and topics
1378
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1379
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
1380
 
1381
 
1382
  if model_choice == "gemma_2b_it_local":
 
1434
  if is_error == True:
1435
  final_message_out = "Could not complete summary, error in LLM output."
1436
  raise Exception(final_message_out)
1437
+ #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1438
 
1439
  # Write outputs to csv
1440
  ## Topics with references
 
1451
  new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
1452
  out_file_paths.append(unique_topics_df_out_path)
1453
 
1454
+ # Outputs for markdown table output
1455
+ unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1456
+ unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
 
 
 
 
1457
 
1458
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1459
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
 
1511
 
1512
  # If error in table parsing, leave function
1513
  if is_error == True:
1514
+ raise Exception("Error in output table parsing")
1515
+ # unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1516
 
1517
 
1518
  #all_topic_tables_df.append(topic_table_df)
 
1542
  if isinstance(responses[-1], ResponseObject):
1543
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1544
  f.write(responses[-1].text)
1545
+ unique_table_df_display_table_markdown = responses[-1].text
1546
  elif "choices" in responses[-1]:
1547
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1548
  f.write(responses[-1]["choices"][0]['text'])
1549
+ unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1550
  else:
1551
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1552
  f.write(responses[-1].text)
1553
+ unique_table_df_display_table_markdown = responses[-1].text
1554
 
1555
  log_files_output_paths.append(final_table_output_path)
1556
 
 
1595
  print(final_message_out)
1596
 
1597
 
1598
+ return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1599
 
1600
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1601
 
 
2320
  if latest_summary_completed >= length_all_summaries:
2321
  print("At last summary.")
2322
 
2323
+ output_files = list(set(output_files))
2324
+
2325
  return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
tools/prompts.py CHANGED
@@ -6,13 +6,15 @@ initial_table_prompt = """The open text data is shown in the following table tha
6
  Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
7
  In the first column identify general topics relevant to responses. Create as many general topics as you can.
8
  In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
9
- In the third column write the sentiment of the subtopic: {sentiment_choices}.
10
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
11
- In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
12
- Do not add any other columns. Do not repeat Subtopics with the same Sentiment. Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
13
 
14
  New table:"""
15
 
 
 
16
  prompt2 = ""
17
 
18
  prompt3 = ""
@@ -21,6 +23,12 @@ prompt3 = ""
21
 
22
  add_existing_topics_system_prompt = system_prompt
23
 
 
 
 
 
 
 
24
  add_existing_topics_prompt = """Responses are shown in the following Response table:
25
  {response_table}
26
 
@@ -28,16 +36,16 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
28
  {topics}
29
 
30
  Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
31
- Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
32
- In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty.
33
- In the third column, write the sentiment of the Subtopic: {sentiment_choices}.
34
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
35
- In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
36
- Do not add any other columns. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
37
- Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
38
 
39
  New table:"""
40
 
 
 
41
 
42
  summarise_topic_descriptions_system_prompt = system_prompt
43
 
 
6
  Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
7
  In the first column identify general topics relevant to responses. Create as many general topics as you can.
8
  In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
9
+ {sentiment_choices}.
10
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
11
+ In the fifth column, write a short summary of the subtopic based on relevant responses - highlight specific issues that appear.
12
+ Do not add any other columns. Do not add any other text to your response.
13
 
14
  New table:"""
15
 
16
+ # Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
17
+
18
  prompt2 = ""
19
 
20
  prompt3 = ""
 
23
 
24
  add_existing_topics_system_prompt = system_prompt
25
 
26
+ force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response references', and 'Summary'.
27
+ In the first column, write 'Not assessed'. In the second column, assign Subtopics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
28
+
29
+ allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
30
+ In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
31
+
32
  add_existing_topics_prompt = """Responses are shown in the following Response table:
33
  {response_table}
34
 
 
36
  {topics}
37
 
38
  Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
39
+ {topic_assignment}
40
+ {sentiment_choices}.
 
41
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
42
+ In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
43
+ Do not add any other columns. Do not add any other text to your response.
 
44
 
45
  New table:"""
46
 
47
+ # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
48
+
49
 
50
  summarise_topic_descriptions_system_prompt = system_prompt
51