Commit
·
71fcefe
1
Parent(s):
75d1651
Improved zero shot 'forced' categorisation and prompts
Browse files- app.py +10 -9
- tools/llm_api_call.py +42 -24
- tools/prompts.py +17 -9
app.py
CHANGED
@@ -112,10 +112,11 @@ with app:
|
|
112 |
in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
113 |
|
114 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
|
115 |
-
in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
|
116 |
|
117 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
118 |
-
candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic
|
|
|
119 |
|
120 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
121 |
|
@@ -123,7 +124,7 @@ with app:
|
|
123 |
|
124 |
extract_topics_btn = gr.Button("Extract topics", variant="primary")
|
125 |
|
126 |
-
|
127 |
text_output_file = gr.File(height=file_input_height, label="Output files")
|
128 |
latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
129 |
# Duplicate version of the above variable for when you don't want to initiate the summarisation loop
|
@@ -253,14 +254,14 @@ with app:
|
|
253 |
success(load_in_data_file,
|
254 |
inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
|
255 |
success(fn=extract_topics,
|
256 |
-
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state,
|
257 |
-
outputs=[
|
258 |
|
259 |
|
260 |
# If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
|
261 |
latest_batch_completed.change(fn=extract_topics,
|
262 |
-
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state,
|
263 |
-
outputs=[
|
264 |
success(fn = reveal_feedback_buttons,
|
265 |
outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
|
266 |
|
@@ -308,9 +309,9 @@ with app:
|
|
308 |
|
309 |
# User submitted feedback
|
310 |
feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
311 |
-
feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide,
|
312 |
|
313 |
-
data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide,
|
314 |
success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
315 |
|
316 |
in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
|
|
|
112 |
in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
113 |
|
114 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
|
115 |
+
in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
116 |
|
117 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
118 |
+
candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
|
119 |
+
force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
|
120 |
|
121 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
122 |
|
|
|
124 |
|
125 |
extract_topics_btn = gr.Button("Extract topics", variant="primary")
|
126 |
|
127 |
+
display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
|
128 |
text_output_file = gr.File(height=file_input_height, label="Output files")
|
129 |
latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
130 |
# Duplicate version of the above variable for when you don't want to initiate the summarisation loop
|
|
|
254 |
success(load_in_data_file,
|
255 |
inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
|
256 |
success(fn=extract_topics,
|
257 |
+
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
|
258 |
+
outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
|
259 |
|
260 |
|
261 |
# If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
|
262 |
latest_batch_completed.change(fn=extract_topics,
|
263 |
+
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
|
264 |
+
outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
|
265 |
success(fn = reveal_feedback_buttons,
|
266 |
outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
|
267 |
|
|
|
309 |
|
310 |
# User submitted feedback
|
311 |
feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
312 |
+
feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], feedback_data_folder)
|
313 |
|
314 |
+
data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], None, preprocess=False).\
|
315 |
success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
316 |
|
317 |
in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
|
tools/llm_api_call.py
CHANGED
@@ -20,7 +20,7 @@ from io import StringIO
|
|
20 |
|
21 |
GradioFileData = gr.FileData
|
22 |
|
23 |
-
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
|
24 |
from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
|
25 |
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
|
26 |
|
@@ -884,7 +884,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
884 |
# Create a new DataFrame from the reference data
|
885 |
new_reference_df = pd.DataFrame(reference_data)
|
886 |
|
887 |
-
|
888 |
|
889 |
# Append on old reference data
|
890 |
out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
|
@@ -897,6 +897,9 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
897 |
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
|
898 |
except Exception as e:
|
899 |
print("Could not convert Response References column to integer due to", e)
|
|
|
|
|
|
|
900 |
|
901 |
out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
902 |
|
@@ -941,7 +944,7 @@ def extract_topics(in_data_file,
|
|
941 |
existing_topics_table:pd.DataFrame,
|
942 |
existing_reference_df:pd.DataFrame,
|
943 |
existing_unique_topics_df:pd.DataFrame,
|
944 |
-
|
945 |
file_name:str,
|
946 |
num_batches:int,
|
947 |
in_api_key:str,
|
@@ -966,6 +969,7 @@ def extract_topics(in_data_file,
|
|
966 |
context_textbox:str="",
|
967 |
time_taken:float = 0,
|
968 |
sentiment_checkbox:str = "Negative, Neutral, or Positive",
|
|
|
969 |
max_tokens:int=max_tokens,
|
970 |
model_name_map:dict=model_name_map,
|
971 |
max_time_for_loop:int=max_time_for_loop,
|
@@ -980,7 +984,7 @@ def extract_topics(in_data_file,
|
|
980 |
- existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
|
981 |
- existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
|
982 |
- existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
|
983 |
-
-
|
984 |
- file_name (str): File name of the data file.
|
985 |
- num_batches (int): Number of batches required to go through all the response rows.
|
986 |
- in_api_key (str): The API key for authentication.
|
@@ -1004,6 +1008,8 @@ def extract_topics(in_data_file,
|
|
1004 |
- batch_size (int): The number of data rows to consider in each request.
|
1005 |
- context_textbox (str, optional): A string giving some context to the consultation/task.
|
1006 |
- time_taken (float, optional): The amount of time taken to process the responses up until this point.
|
|
|
|
|
1007 |
- max_tokens (int): The maximum number of tokens for the model.
|
1008 |
- model_name_map (dict, optional): A dictionary mapping full model name to shortened.
|
1009 |
- max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
|
@@ -1168,7 +1174,7 @@ def extract_topics(in_data_file,
|
|
1168 |
modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
|
1169 |
|
1170 |
#final_out_message = '\n'.join(out_message)
|
1171 |
-
return
|
1172 |
|
1173 |
|
1174 |
if num_batches > 0:
|
@@ -1236,7 +1242,7 @@ def extract_topics(in_data_file,
|
|
1236 |
else:
|
1237 |
print("Using local model:", model_choice)
|
1238 |
|
1239 |
-
# Preparing candidate topics
|
1240 |
if candidate_topics and existing_unique_topics_df.empty:
|
1241 |
progress(0.1, "Creating revised zero shot topics table")
|
1242 |
|
@@ -1282,7 +1288,10 @@ def extract_topics(in_data_file,
|
|
1282 |
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1283 |
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1284 |
|
1285 |
-
|
|
|
|
|
|
|
1286 |
|
1287 |
if create_revised_general_topics == True:
|
1288 |
# Create the most up to date list of topics and subtopics.
|
@@ -1334,7 +1343,7 @@ def extract_topics(in_data_file,
|
|
1334 |
"General Topic":zero_shot_topics_gen_topics_list,
|
1335 |
"Subtopic":zero_shot_topics_subtopics_list})
|
1336 |
|
1337 |
-
print("Zero shot topics are:", zero_shot_topics_df)
|
1338 |
|
1339 |
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1340 |
if not existing_unique_topics_df.empty:
|
@@ -1350,14 +1359,24 @@ def extract_topics(in_data_file,
|
|
1350 |
|
1351 |
#all_topic_tables_df_merged = existing_unique_topics_df
|
1352 |
existing_unique_topics_df["Response References"] = ""
|
|
|
|
|
|
|
|
|
|
|
1353 |
|
1354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1355 |
|
1356 |
-
#existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
|
1357 |
|
1358 |
# Format the summary prompt with the response table and topics
|
1359 |
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1360 |
-
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, sentiment_choices=sentiment_prompt)
|
1361 |
|
1362 |
|
1363 |
if model_choice == "gemma_2b_it_local":
|
@@ -1415,7 +1434,7 @@ def extract_topics(in_data_file,
|
|
1415 |
if is_error == True:
|
1416 |
final_message_out = "Could not complete summary, error in LLM output."
|
1417 |
raise Exception(final_message_out)
|
1418 |
-
#return
|
1419 |
|
1420 |
# Write outputs to csv
|
1421 |
## Topics with references
|
@@ -1432,13 +1451,9 @@ def extract_topics(in_data_file,
|
|
1432 |
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1433 |
out_file_paths.append(unique_topics_df_out_path)
|
1434 |
|
1435 |
-
#
|
1436 |
-
|
1437 |
-
|
1438 |
-
#display_table = responses[-1].text
|
1439 |
-
|
1440 |
-
# Show unique topics alongside document counts as output
|
1441 |
-
display_table = new_unique_topics_df.to_markdown(index=False)
|
1442 |
|
1443 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1444 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
@@ -1496,7 +1511,8 @@ def extract_topics(in_data_file,
|
|
1496 |
|
1497 |
# If error in table parsing, leave function
|
1498 |
if is_error == True:
|
1499 |
-
|
|
|
1500 |
|
1501 |
|
1502 |
#all_topic_tables_df.append(topic_table_df)
|
@@ -1526,15 +1542,15 @@ def extract_topics(in_data_file,
|
|
1526 |
if isinstance(responses[-1], ResponseObject):
|
1527 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1528 |
f.write(responses[-1].text)
|
1529 |
-
|
1530 |
elif "choices" in responses[-1]:
|
1531 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1532 |
f.write(responses[-1]["choices"][0]['text'])
|
1533 |
-
|
1534 |
else:
|
1535 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1536 |
f.write(responses[-1].text)
|
1537 |
-
|
1538 |
|
1539 |
log_files_output_paths.append(final_table_output_path)
|
1540 |
|
@@ -1579,7 +1595,7 @@ def extract_topics(in_data_file,
|
|
1579 |
print(final_message_out)
|
1580 |
|
1581 |
|
1582 |
-
return
|
1583 |
|
1584 |
def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
|
1585 |
|
@@ -2304,4 +2320,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
2304 |
if latest_summary_completed >= length_all_summaries:
|
2305 |
print("At last summary.")
|
2306 |
|
|
|
|
|
2307 |
return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
|
|
20 |
|
21 |
GradioFileData = gr.FileData
|
22 |
|
23 |
+
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt
|
24 |
from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
|
25 |
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
|
26 |
|
|
|
884 |
# Create a new DataFrame from the reference data
|
885 |
new_reference_df = pd.DataFrame(reference_data)
|
886 |
|
887 |
+
print("new_reference_df:", new_reference_df)
|
888 |
|
889 |
# Append on old reference data
|
890 |
out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
|
|
|
897 |
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
|
898 |
except Exception as e:
|
899 |
print("Could not convert Response References column to integer due to", e)
|
900 |
+
print("out_reference_df['Response References']:", out_reference_df["Response References"].head())
|
901 |
+
|
902 |
+
out_reference_df.to_csv(output_folder + "test_output_reference_df.csv")
|
903 |
|
904 |
out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
905 |
|
|
|
944 |
existing_topics_table:pd.DataFrame,
|
945 |
existing_reference_df:pd.DataFrame,
|
946 |
existing_unique_topics_df:pd.DataFrame,
|
947 |
+
unique_table_df_display_table_markdown:str,
|
948 |
file_name:str,
|
949 |
num_batches:int,
|
950 |
in_api_key:str,
|
|
|
969 |
context_textbox:str="",
|
970 |
time_taken:float = 0,
|
971 |
sentiment_checkbox:str = "Negative, Neutral, or Positive",
|
972 |
+
force_zero_shot_radio:str = "No",
|
973 |
max_tokens:int=max_tokens,
|
974 |
model_name_map:dict=model_name_map,
|
975 |
max_time_for_loop:int=max_time_for_loop,
|
|
|
984 |
- existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
|
985 |
- existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
|
986 |
- existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
|
987 |
+
- unique_table_df_display_table_markdown (str): Table for display in markdown format.
|
988 |
- file_name (str): File name of the data file.
|
989 |
- num_batches (int): Number of batches required to go through all the response rows.
|
990 |
- in_api_key (str): The API key for authentication.
|
|
|
1008 |
- batch_size (int): The number of data rows to consider in each request.
|
1009 |
- context_textbox (str, optional): A string giving some context to the consultation/task.
|
1010 |
- time_taken (float, optional): The amount of time taken to process the responses up until this point.
|
1011 |
+
- sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
|
1012 |
+
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
1013 |
- max_tokens (int): The maximum number of tokens for the model.
|
1014 |
- model_name_map (dict, optional): A dictionary mapping full model name to shortened.
|
1015 |
- max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
|
|
|
1174 |
modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
|
1175 |
|
1176 |
#final_out_message = '\n'.join(out_message)
|
1177 |
+
return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
|
1178 |
|
1179 |
|
1180 |
if num_batches > 0:
|
|
|
1242 |
else:
|
1243 |
print("Using local model:", model_choice)
|
1244 |
|
1245 |
+
# Preparing candidate topics if no topics currently exist
|
1246 |
if candidate_topics and existing_unique_topics_df.empty:
|
1247 |
progress(0.1, "Creating revised zero shot topics table")
|
1248 |
|
|
|
1288 |
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1289 |
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1290 |
|
1291 |
+
# If the responses are being forced into zero shot topics, allow an option for nothing relevant
|
1292 |
+
if force_zero_shot_radio == "Yes":
|
1293 |
+
zero_shot_topics_gen_topics_list.append("")
|
1294 |
+
zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
|
1295 |
|
1296 |
if create_revised_general_topics == True:
|
1297 |
# Create the most up to date list of topics and subtopics.
|
|
|
1343 |
"General Topic":zero_shot_topics_gen_topics_list,
|
1344 |
"Subtopic":zero_shot_topics_subtopics_list})
|
1345 |
|
1346 |
+
#print("Zero shot topics are:", zero_shot_topics_df)
|
1347 |
|
1348 |
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1349 |
if not existing_unique_topics_df.empty:
|
|
|
1359 |
|
1360 |
#all_topic_tables_df_merged = existing_unique_topics_df
|
1361 |
existing_unique_topics_df["Response References"] = ""
|
1362 |
+
existing_unique_topics_df.fillna("", inplace=True)
|
1363 |
+
existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
|
1364 |
+
existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
|
1365 |
+
|
1366 |
+
# print("existing_unique_topics_df:", existing_unique_topics_df)
|
1367 |
|
1368 |
+
# If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
|
1369 |
+
if force_zero_shot_radio == "Yes":
|
1370 |
+
unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
|
1371 |
+
topic_assignment_prompt = force_existing_topics_prompt
|
1372 |
+
else:
|
1373 |
+
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
|
1374 |
+
topic_assignment_prompt = allow_new_topics_prompt
|
1375 |
|
|
|
1376 |
|
1377 |
# Format the summary prompt with the response table and topics
|
1378 |
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1379 |
+
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
|
1380 |
|
1381 |
|
1382 |
if model_choice == "gemma_2b_it_local":
|
|
|
1434 |
if is_error == True:
|
1435 |
final_message_out = "Could not complete summary, error in LLM output."
|
1436 |
raise Exception(final_message_out)
|
1437 |
+
#return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1438 |
|
1439 |
# Write outputs to csv
|
1440 |
## Topics with references
|
|
|
1451 |
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1452 |
out_file_paths.append(unique_topics_df_out_path)
|
1453 |
|
1454 |
+
# Outputs for markdown table output
|
1455 |
+
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
1456 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
|
|
|
|
|
|
|
|
1457 |
|
1458 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1459 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
|
|
1511 |
|
1512 |
# If error in table parsing, leave function
|
1513 |
if is_error == True:
|
1514 |
+
raise Exception("Error in output table parsing")
|
1515 |
+
# unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1516 |
|
1517 |
|
1518 |
#all_topic_tables_df.append(topic_table_df)
|
|
|
1542 |
if isinstance(responses[-1], ResponseObject):
|
1543 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1544 |
f.write(responses[-1].text)
|
1545 |
+
unique_table_df_display_table_markdown = responses[-1].text
|
1546 |
elif "choices" in responses[-1]:
|
1547 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1548 |
f.write(responses[-1]["choices"][0]['text'])
|
1549 |
+
unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
|
1550 |
else:
|
1551 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1552 |
f.write(responses[-1].text)
|
1553 |
+
unique_table_df_display_table_markdown = responses[-1].text
|
1554 |
|
1555 |
log_files_output_paths.append(final_table_output_path)
|
1556 |
|
|
|
1595 |
print(final_message_out)
|
1596 |
|
1597 |
|
1598 |
+
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
|
1599 |
|
1600 |
def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
|
1601 |
|
|
|
2320 |
if latest_summary_completed >= length_all_summaries:
|
2321 |
print("At last summary.")
|
2322 |
|
2323 |
+
output_files = list(set(output_files))
|
2324 |
+
|
2325 |
return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
tools/prompts.py
CHANGED
@@ -6,13 +6,15 @@ initial_table_prompt = """The open text data is shown in the following table tha
|
|
6 |
Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
7 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
8 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
|
9 |
-
|
10 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
11 |
-
In the fifth
|
12 |
-
Do not add any other columns. Do not
|
13 |
|
14 |
New table:"""
|
15 |
|
|
|
|
|
16 |
prompt2 = ""
|
17 |
|
18 |
prompt3 = ""
|
@@ -21,6 +23,12 @@ prompt3 = ""
|
|
21 |
|
22 |
add_existing_topics_system_prompt = system_prompt
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
25 |
{response_table}
|
26 |
|
@@ -28,16 +36,16 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
|
|
28 |
{topics}
|
29 |
|
30 |
Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
|
31 |
-
|
32 |
-
|
33 |
-
In the third column, write the sentiment of the Subtopic: {sentiment_choices}.
|
34 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
35 |
-
In the fifth
|
36 |
-
Do not add any other columns.
|
37 |
-
Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
|
38 |
|
39 |
New table:"""
|
40 |
|
|
|
|
|
41 |
|
42 |
summarise_topic_descriptions_system_prompt = system_prompt
|
43 |
|
|
|
6 |
Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
7 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
8 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
|
9 |
+
{sentiment_choices}.
|
10 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
11 |
+
In the fifth column, write a short summary of the subtopic based on relevant responses - highlight specific issues that appear.
|
12 |
+
Do not add any other columns. Do not add any other text to your response.
|
13 |
|
14 |
New table:"""
|
15 |
|
16 |
+
# Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
|
17 |
+
|
18 |
prompt2 = ""
|
19 |
|
20 |
prompt3 = ""
|
|
|
23 |
|
24 |
add_existing_topics_system_prompt = system_prompt
|
25 |
|
26 |
+
force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response references', and 'Summary'.
|
27 |
+
In the first column, write 'Not assessed'. In the second column, assign Subtopics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
|
28 |
+
|
29 |
+
allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
30 |
+
In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
|
31 |
+
|
32 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
33 |
{response_table}
|
34 |
|
|
|
36 |
{topics}
|
37 |
|
38 |
Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
|
39 |
+
{topic_assignment}
|
40 |
+
{sentiment_choices}.
|
|
|
41 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
42 |
+
In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
|
43 |
+
Do not add any other columns. Do not add any other text to your response.
|
|
|
44 |
|
45 |
New table:"""
|
46 |
|
47 |
+
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
48 |
+
|
49 |
|
50 |
summarise_topic_descriptions_system_prompt = system_prompt
|
51 |
|