Sean Pedrick-Case committed
Commit a21f6e5 · unverified · Parents: fd8dddc, b9301bd

Merge pull request #2 from seanpedrick-case/dev


Various improvements to zero-shot topic modelling/categorisation, varied sentiment and summary options, and more resilient LLM calls.

app.py CHANGED
@@ -3,7 +3,7 @@ import socket
 import spaces
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
-from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
+from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 #from tools.aws_functions import load_data_from_aws

@@ -44,14 +44,20 @@ with app:
 ###
 
 text_output_file_list_state = gr.State([])
+text_output_modify_file_list_state = gr.State([])
 log_files_output_list_state = gr.State([])
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)
+modified_unique_table_change_bool = gr.State(True) # This boolean is used to flag whether a file upload should change just the modified unique table object on the second tab
 
-file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas")
+master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas")
+master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas")
+master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas")
+
+master_modify_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_unique_topics_df_state", visible=False, type="pandas")
+master_modify_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_reference_df_state", visible=False, type="pandas")
+
 
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()

@@ -67,14 +73,15 @@ with app:
 feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
 
 # Summary state objects
-summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
-master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
-master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas")
+master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas")
+master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
 summarised_references_markdown = gr.Markdown("", visible=False)
 summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
 latest_summary_completed_num = gr.Number(0, visible=False)
 
-unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
+reference_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
+unique_topics_table_file_name_textbox = gr.Textbox(label="Unique topics data file name textbox", visible=False)
 
 ###
 # UI LAYOUT

@@ -105,17 +112,20 @@ with app:
 in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
 in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
-in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
+in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
 
 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
+candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
+force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
 
 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
 
-extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
+sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
+
+extract_topics_btn = gr.Button("Extract topics", variant="primary")
 
-text_output_summary = gr.Markdown(value="### Language model response will appear here")
-text_output_file = gr.File(height=file_input_height, label="Output files")
+topic_extraction_output_files = gr.File(height=file_input_height, label="Output files")
+display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
 latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)

@@ -129,14 +139,28 @@ with app:
 with gr.Row():
 s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
-with gr.Tab(label="Deduplicate and summarise topics"):
+with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
 gr.Markdown(
 """
-### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
+Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
 """)
+
+with gr.Accordion("Modify existing topics", open = False):
+modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
+
+save_modified_files_button = gr.Button(value="Save modified topic names")
+
 with gr.Accordion("Upload reference data file and unique data files", open = True):
-summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
-summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
+
+### DEDUPLICATION
+deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
 
 with gr.Row():
 merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])

@@ -144,17 +168,21 @@ with app:
 deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
 
 deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
-
-duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+### SUMMARISATION
+summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
 
 summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
 summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
-summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
+summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
 
-with gr.Tab(label="Continue previous topic extraction"):
+with gr.Tab(label="Continue unfinished topic extraction"):
 gr.Markdown(
 """
-### Load in data files from a previous attempt at extracting topics to continue it.
+### Load in output files from a previous topic extraction process and continue topic extraction with new data.
 """)
 
 with gr.Accordion("Upload reference data file and unique data files", open = True):

@@ -170,7 +198,7 @@ with app:
 """)
 
 in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
-view_table_markdown = gr.Markdown(value = "", label="View table")
+view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
 
 with gr.Tab(label="Topic extraction settings"):
 gr.Markdown(

@@ -183,7 +211,7 @@ with app:
 random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
 
 with gr.Accordion("Prompt settings", open = True):
-number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3)
+number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
 system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
 initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
 prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)

@@ -196,7 +224,7 @@ with app:
 
 # Invisible text box to hold the session hash/username just for logging purposes
 session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
-data_file_names_textbox = gr.Textbox(label = "Data file name", value="", visible=False)
+
 estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
 total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)

@@ -220,40 +248,46 @@ with app:
 ###
 
 # Tabular data upload
-in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
-
-extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
-then(load_in_data_file,
-inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
-fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
+
+extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
+success(load_in_data_file,
+inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
+success(fn=extract_topics,
+inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
+outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
 
-# return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
 
 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
-latest_batch_completed.change(fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
-then(fn = reveal_feedback_buttons,
-outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
+# latest_batch_completed.change(fn=extract_topics,
+# inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
+# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+# success(fn = reveal_feedback_buttons,
+# outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
+
+# If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
+modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
+
+# Modify output table with custom topic names
+save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
 
 # When button pressed, deduplicate data
-deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files, log_files_output])
+deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
+success(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, in_excel_sheets, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, summarisation_input_files, log_files_output, summarised_output_markdown], scroll_to_output=True)
 
 # When button pressed, summarise previous data
 summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
-then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
-then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
+success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
+success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
+success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
 
-latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
+latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output], scroll_to_output=True)
 
 # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
 continue_previous_data_files_btn.click(
-load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches]).\
-then(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, data_file_names_textbox])
+load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
 
 ###
 # LOGGING AND ON APP LOAD FUNCTIONS

@@ -264,21 +298,21 @@ with app:
 access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
 access_callback.setup([session_hash_textbox], access_logs_data_folder)
 session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # Log usage when making a query
 usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-usage_callback.setup([session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
+usage_callback.setup([session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
 
-conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_names_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
+success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # User submitted feedback
 feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-feedback_callback.setup([data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], feedback_data_folder)
+feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], feedback_data_folder)
 
-data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_names_textbox, model_choice, temperature_slide, text_output_summary, conversation_metadata_textbox], None, preprocess=False).\
-then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
+data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], None, preprocess=False).\
+success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 
 in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
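The headline change in the event wiring above is the move from .then() to .success() on Gradio event chains, so each downstream step runs only if the previous step finished without raising an error. A minimal sketch of the pattern (component and function names here are illustrative, not taken from the app):

import gradio as gr

def load_data(path):
    if not path:
        raise gr.Error("No file supplied")  # raising stops a .success() chain
    return f"Loaded {path}"

def run_model(status):
    return status + " -> model finished"

with gr.Blocks() as demo:
    path_box = gr.Textbox(label="File path")
    status_box = gr.Textbox(label="Status")
    result_box = gr.Textbox(label="Result")
    go_btn = gr.Button("Go")

    # .success() fires only when the previous step raised no error,
    # whereas .then() would run the model even after a failed load.
    go_btn.click(load_data, inputs=path_box, outputs=status_box).\
        success(run_model, inputs=status_box, outputs=result_box)

demo.launch()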
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_aws.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.18.0
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
tools/helper_functions.py CHANGED
@@ -15,8 +15,11 @@ def empty_output_vars_extract_topics():
 log_files_output_list_state = []
 conversation_metadata_textbox = ""
 estimated_time_taken_number = 0
+file_data_state = pd.DataFrame()
+reference_data_file_name_textbox = ""
+display_topic_table_markdown = ""
 
-return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number
+return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown
 
 def empty_output_vars_summarise():
 # Empty output objects before summarising files

@@ -106,22 +109,31 @@ def detect_file_type(filename):
 else:
 raise ValueError("Unsupported file type.")
 
-def read_file(filename):
+def read_file(filename:str, sheet:str=""):
 """Read the file based on its detected type."""
 file_type = detect_file_type(filename)
 
 if file_type == 'csv':
 return pd.read_csv(filename, low_memory=False)
 elif file_type == 'xlsx':
-return pd.read_excel(filename)
+if sheet:
+return pd.read_excel(filename, sheet_name=sheet)
+else:
+return pd.read_excel(filename)
 elif file_type == 'parquet':
 return pd.read_parquet(filename)
 
 # Wrap text in each column to the specified max width, including whole words
-def wrap_text(text, max_width=60):
+def wrap_text(text:str, max_width=60, max_text_length=None):
 if not isinstance(text, str):
 return text
 
+# If max_text_length is set, truncate the text and add ellipsis
+if max_text_length and len(text) > max_text_length:
+text = text[:max_text_length] + '...'
+
+text = text.replace('\r\n', '<br>').replace('\n', '<br>')
+
 words = text.split()
 if not words:
 return text
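For reference, hypothetical calls against the two updated helper signatures shown above (the file names are placeholders):

from tools.helper_functions import read_file, wrap_text

df_sheet = read_file("responses.xlsx", sheet="Sheet2")  # read a named Excel sheet
df_first = read_file("responses.xlsx")                  # empty sheet falls back to the default sheet
df_csv = read_file("responses.csv")                     # non-Excel types ignore the sheet argument

# Truncate anything over 200 characters, then wrap; newlines become <br>
# so the text renders inside markdown/HTML tables
wrapped = wrap_text("First line\nSecond line of a long response", max_width=60, max_text_length=200)
print(wrapped)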
tools/llm_api_call.py CHANGED
The diff for this file is too large to render. See raw diff
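The commit message promises "more resilient LLM calls", but since this file's diff is not rendered the exact mechanism is not visible here. Resilience in this context usually means retrying failed API calls with a backoff delay; the sketch below illustrates the general shape only, and none of its names are taken from the repository:

import time

def call_llm_with_retries(call_fn, prompt, max_retries=3, base_wait=5.0):
    """Call call_fn(prompt), retrying with a growing wait on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            return call_fn(prompt)
        except Exception as err:
            if attempt == max_retries:
                raise  # give up after the final attempt
            wait = base_wait * attempt
            print(f"LLM call failed ({err}); retry {attempt}/{max_retries} in {wait}s")
            time.sleep(wait)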
 
tools/prompts.py CHANGED
@@ -5,14 +5,16 @@ initial_table_prompt = """The open text data is shown in the following table tha
 
 Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
 In the first column identify general topics relevant to responses. Create as many general topics as you can.
-In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
-In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
+In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
+{sentiment_choices}.
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column.
-In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
-Do not add any other columns. Do not repeat Subtopics with the same Sentiment. Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
+In the fifth column, write a short summary of the subtopic based on relevant responses - highlight specific issues that appear.
+Do not add any other columns. Do not add any other text to your response.
 
 New table:"""
 
+# Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
+
 prompt2 = ""
 
 prompt3 = ""

@@ -21,6 +23,12 @@ prompt3 = ""
 
 add_existing_topics_system_prompt = system_prompt
 
+force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response references', and 'Summary'.
+In the first column, write 'Not assessed'. In the second column, assign Subtopics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
+
+allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
+In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
+
 add_existing_topics_prompt = """Responses are shown in the following Response table:
 {response_table}
 

@@ -28,16 +36,16 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
 {topics}
 
 Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
-Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
-In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible.
-In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
+{topic_assignment}
+{sentiment_choices}.
 In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column.
-In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
-Do not add any other columns. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
-Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
+In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
+Do not add any other columns. Do not add any other text to your response.
 
 New table:"""
 
+# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
+
 
 summarise_topic_descriptions_system_prompt = system_prompt
 

@@ -45,11 +53,13 @@ summarise_topic_descriptions_prompt = """Below is a table with number of paragra
 
 '{summaries}'
 
-Your task is to make a consolidated summary of the above text. Return a summary up to two paragraphs long that includes as much detail as possible from the original text. Return only the summary and no other text.
+Your task is to make a consolidated summary of the above text. {summary_format}. Return only the summary and no other text.
 
 Summary:"""
 
 
+## The following didn't work well in testing and so is not currently used
+
 create_general_topics_system_prompt = system_prompt
 
 create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:

@@ -60,6 +70,10 @@ Your task is to create a General Topic name for each Subtopic. The new Topics ta
 New Topics table:"""
 
 
+
+
+
+
 # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
 # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
 # Summarise the following text in less than {length} words: "{text}"\n
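The prompts above now carry {sentiment_choices}, {topic_assignment}, and {summary_format} placeholders that the UI options in app.py (sentiment_checkbox, force_zero_shot_radio, summarise_format_radio) presumably feed at call time. The filling logic lives in tools/llm_api_call.py, whose diff is too large to render, so the sketch below is an assumption about how the formatting might work rather than the app's actual code:

# Assumed wiring between the UI options and the prompt placeholders.
# The dictionary and variable names below are illustrative only.
from tools.prompts import add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt

# Map the sentiment_checkbox value to an instruction for the third column
sentiment_instructions = {
    "Negative, Neutral, or Positive": "In the third column write the sentiment of the Subtopic: Negative, Neutral, or Positive",
    "Negative or Positive": "In the third column write the sentiment of the Subtopic: Negative or Positive",
    "Do not assess sentiment": "In the third column write 'Not assessed'",
}

# force_zero_shot_radio would choose between the two topic-assignment fragments
topic_assignment = force_existing_topics_prompt   # "Yes": only use supplied topics
# topic_assignment = allow_new_topics_prompt      # "No": allow new topics as well

formatted_prompt = add_existing_topics_prompt.format(
    response_table="| Reference | Response |\n| 1 | Example response text |",
    topics="| General Topic | Subtopic |\n| Housing | Parking |",
    topic_assignment=topic_assignment,
    sentiment_choices=sentiment_instructions["Negative, Neutral, or Positive"],
)
print(formatted_prompt)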