Commit b9301bd
Parent(s): 71fcefe

Upgraded Gradio. More resilient to cases where LLM calls do not return valid markdown tables (will reattempt with a different temperature). Minor fixes.
Files changed:
- app.py +11 -11
- requirements.txt +1 -1
- requirements_aws.txt +1 -1
- requirements_gpu.txt +1 -1
- tools/helper_functions.py +5 -2
- tools/llm_api_call.py +491 -441
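The resilience change works by validating each LLM response before accepting it: if the reply does not look like a markdown table, the call is retried with a progressively higher temperature (see the new call_llm_with_markdown_table_checks function in tools/llm_api_call.py below). A minimal sketch of that idea, with a generic call_model callable standing in for the app's own request helper:

# Sketch of the retry-on-invalid-table behaviour added in this commit (not the app's exact code).
# `call_model` is a placeholder for the existing request helper; the acceptance rule
# (response longer than 120 characters and containing at least one '|') mirrors the diff below.
MAX_OUTPUT_VALIDATION_ATTEMPTS = 3

def call_with_table_checks(prompt: str, base_temperature: float, call_model) -> str:
    temperature = base_temperature
    response_text = ""
    for attempt in range(MAX_OUTPUT_VALIDATION_ATTEMPTS):
        response_text = call_model(prompt, temperature).strip()
        # Accept the response if it plausibly contains a markdown table
        if len(response_text) > 120 and "|" in response_text:
            break
        # Otherwise nudge the temperature up and try again
        temperature = base_temperature + 0.1 * (attempt + 1)
    else:
        print(f"No valid markdown table after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts")
    return response_text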
app.py
CHANGED
@@ -124,8 +124,8 @@ with app:
 
    extract_topics_btn = gr.Button("Extract topics", variant="primary")
 
-
-
+   topic_extraction_output_files = gr.File(height=file_input_height, label="Output files")
+   display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
    latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
    # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
    latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 
@@ -250,27 +250,27 @@
    # Tabular data upload
    in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
 
-   extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state,
+   extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
       success(load_in_data_file,
      inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
       success(fn=extract_topics,
      inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
-     outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state,
+     outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
 
 
    # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
-   latest_batch_completed.change(fn=extract_topics,
-
-
-
-
+   # latest_batch_completed.change(fn=extract_topics,
+   #    inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
+   #    outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+   #    success(fn = reveal_feedback_buttons,
+   #        outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
 
    # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
-   modification_input_files.
+   modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
 
 
    # Modify output table with custom topic names
-   save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state,
+   save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
 
    # When button pressed, deduplicate data
    deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
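The rewired event above resets the output components first and only then starts the long-running extraction, using Gradio's .success() chaining. A minimal self-contained sketch of the same pattern (generic component and function names, not the app's own):

import gradio as gr

def reset_outputs():
    # Clear previous results before a new run
    return "", None

def long_job(x):
    # Placeholder for a long-running task that also produces an output file
    path = "output.txt"
    with open(path, "w") as f:
        f.write(x)
    return f"Processed: {x}", [path]

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    md = gr.Markdown()
    files = gr.File(label="Output files")
    # Reset first, then run the job only if the reset succeeded - same chaining pattern as in app.py above
    btn.click(fn=reset_outputs, inputs=None, outputs=[md, files]).success(fn=long_job, inputs=inp, outputs=[md, files])

demo.launch()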
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_aws.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
requirements_gpu.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.20.1
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0
tools/helper_functions.py
CHANGED
@@ -15,8 +15,11 @@ def empty_output_vars_extract_topics():
     log_files_output_list_state = []
     conversation_metadata_textbox = ""
     estimated_time_taken_number = 0
+    file_data_state = pd.DataFrame()
+    reference_data_file_name_textbox = ""
+    display_topic_table_markdown = ""
 
-    return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number
+    return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown
 
 def empty_output_vars_summarise():
     # Empty output objects before summarising files
 
@@ -127,7 +130,7 @@ def wrap_text(text:str, max_width=60, max_text_length=None):
 
     # If max_text_length is set, truncate the text and add ellipsis
     if max_text_length and len(text) > max_text_length:
-
+        text = text[:max_text_length] + '...'
 
     text = text.replace('\r\n', '<br>').replace('\n', '<br>')
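The wrap_text fix above restores the truncation step: when max_text_length is set, over-long text is cut and an ellipsis appended before newlines are converted for display. A standalone sketch of that behaviour (simplified, without the wrapping logic):

def truncate_for_display(text: str, max_text_length=None) -> str:
    # If max_text_length is set, truncate the text and add ellipsis
    if max_text_length and len(text) > max_text_length:
        text = text[:max_text_length] + '...'
    # Convert newlines to HTML line breaks for markdown display
    return text.replace('\r\n', '<br>').replace('\n', '<br>')

# Example: a long response is cut to 40 characters plus '...'
print(truncate_for_display("A very long response. " * 10, max_text_length=40))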
tools/llm_api_call.py
CHANGED
@@ -30,9 +30,11 @@ class ResponseObject:
        self.text = text
        self.usage_metadata = usage_metadata
 
-max_tokens = 4096
 timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
 number_of_api_retry_attempts = 5
 max_time_for_loop = 99999
 batch_size_default = 5
 deduplication_threshold = 90
 
@@ -392,11 +394,6 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
    # Clear any existing progress bars
    tqdm._instances.clear()
 
-   # Print the full prompt for debugging purposes
-   #print("full_prompt:", full_prompt)
-
-   #progress_bar = tqdm(range(0,number_of_api_retry_attempts), desc="Calling API with " + str(timeout_wait) + " seconds per retry.", unit="attempts")
-
    progress_bar = range(0,number_of_api_retry_attempts)
 
    # Generate the model's response
 
@@ -473,8 +470,6 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
    # Update the conversation history with the new prompt and response
    conversation_history.append({'role': 'user', 'parts': [prompt]})
 
-   # output_str = output['choices'][0]['text']
-
    # Check if is a LLama.cpp model response
    # Check if the response is a ResponseObject
    if isinstance(response, ResponseObject):
 
@@ -739,6 +734,70 @@ def convert_response_text_to_markdown_table(response_text:str, table_type:str =
 
    return out_df, is_error
 
 
 def write_llm_output_and_logs(responses: List[ResponseObject],
                               whole_conversation: List[str],
 
@@ -884,7 +943,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
    # Create a new DataFrame from the reference data
    new_reference_df = pd.DataFrame(reference_data)
 
-   print("new_reference_df:", new_reference_df)
 
    # Append on old reference data
    out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
 
@@ -1035,18 +1094,15 @@ def extract_topics(in_data_file,
    llama_cpp_prefix = "<start_of_turn>user\n"
    llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
 
-   # Reset output files on each run:
-   # out_file_paths = []
-
    # If you have a file input but no file data it hasn't yet been loaded. Load it here.
    if file_data.empty:
        print("No data table found, loading from file")
        try:
-           print("in_data_file:", in_data_file)
            in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
-           print("in_colnames:", in_colnames_drop)
            file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
-           print("file_data loaded in:", file_data)
        except:
            # Check if files and text exist
            out_message = "Please enter a data file to summarise."
 
@@ -1060,7 +1116,7 @@ def extract_topics(in_data_file,
 
    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:
-
        if (latest_batch_completed == 999) | (latest_batch_completed == 0):
            latest_batch_completed = 0
            out_message = []
|
|
1072 |
local_model, tokenizer = load_model()
|
1073 |
print("Local model loaded:", local_model)
|
1074 |
|
1075 |
-
|
1076 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1077 |
|
1078 |
-
|
1079 |
-
if latest_batch_completed >= num_batches:
|
1080 |
-
print("Last batch reached, returning batch:", str(latest_batch_completed))
|
1081 |
-
# Set to a very high number so as not to mess with subsequent file processing by the user
|
1082 |
-
#latest_batch_completed = 999
|
1083 |
|
1084 |
-
|
1085 |
-
|
1086 |
-
|
1087 |
-
|
1088 |
|
1089 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1090 |
|
1091 |
-
|
1092 |
-
|
1093 |
-
|
|
|
1094 |
|
1095 |
-
|
1096 |
-
|
|
|
1097 |
|
1098 |
-
|
1099 |
-
|
1100 |
|
1101 |
-
|
1102 |
-
existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
|
1103 |
|
1104 |
-
|
1105 |
-
|
1106 |
-
reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1107 |
-
reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1108 |
-
unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1109 |
-
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1110 |
|
1111 |
-
|
1112 |
-
|
1113 |
-
#new_topic_df.to_csv(topic_table_out_path, index=None)
|
1114 |
-
#log_files_output_paths.append(topic_table_out_path)
|
1115 |
|
1116 |
-
|
1117 |
-
|
1118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1119 |
|
1120 |
-
|
1121 |
-
|
|
|
1122 |
|
1123 |
-
|
1124 |
-
|
1125 |
-
out_file_paths.append(unique_topics_df_out_path)
|
1126 |
|
1127 |
-
|
1128 |
-
|
1129 |
|
1130 |
-
|
1131 |
-
existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
|
1132 |
-
log_files_output_paths.append(reference_table_out_pivot_path)
|
1133 |
|
1134 |
-
|
1135 |
-
# Assuming existing_reference_df and file_data are already defined
|
1136 |
-
|
1137 |
-
# Simplify table to just responses column and the Response reference number
|
1138 |
-
|
1139 |
|
1140 |
-
|
1141 |
|
1142 |
-
|
|
|
|
|
|
|
1143 |
|
1144 |
-
|
1145 |
-
|
1146 |
-
|
1147 |
|
|
|
|
|
|
|
|
|
|
|
1148 |
|
1149 |
-
|
1150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1151 |
|
1152 |
-
|
|
|
|
|
|
|
1153 |
|
1154 |
-
|
1155 |
-
|
|
|
|
|
|
|
|
|
|
|
1156 |
|
1157 |
-
|
1158 |
-
|
1159 |
-
missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
|
1160 |
|
1161 |
-
|
1162 |
-
|
|
|
|
|
|
|
|
|
1163 |
|
1164 |
-
|
1165 |
-
missing_df.to_csv(missing_df_out_path, index=None)
|
1166 |
-
log_files_output_paths.append(missing_df_out_path)
|
1167 |
|
1168 |
-
|
1169 |
-
log_files_output_paths = list(set(log_files_output_paths))
|
1170 |
|
1171 |
-
|
1172 |
-
|
1173 |
-
# The topic table that can be modified does not need the summary column
|
1174 |
-
modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
|
1175 |
|
1176 |
-
|
1177 |
-
|
1178 |
-
|
1179 |
-
|
1180 |
-
if num_batches > 0:
|
1181 |
-
progress_measure = round(latest_batch_completed / num_batches, 1)
|
1182 |
-
progress(progress_measure, desc="Querying large language model")
|
1183 |
-
else:
|
1184 |
-
progress(0.1, desc="Querying large language model")
|
1185 |
|
1186 |
-
|
1187 |
-
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
1188 |
-
if isinstance(out_message, str):
|
1189 |
-
out_message = [out_message]
|
1190 |
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
-
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
1196 |
-
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
1197 |
-
print(out_message)
|
1198 |
-
raise Exception(out_message)
|
1199 |
-
#return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
|
1200 |
-
|
1201 |
-
|
1202 |
-
if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1203 |
-
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
|
1204 |
-
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
|
1205 |
-
else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1206 |
-
|
1207 |
-
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
1208 |
-
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
1209 |
|
1210 |
-
|
1211 |
-
#for latest_batch_completed in range(num_batches):
|
1212 |
-
reported_batch_no = latest_batch_completed + 1
|
1213 |
-
print("Running query batch", str(reported_batch_no))
|
1214 |
|
1215 |
-
|
1216 |
-
|
1217 |
-
|
1218 |
|
1219 |
-
|
1220 |
-
|
1221 |
-
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
-
|
1227 |
-
|
1228 |
|
1229 |
-
|
|
|
|
|
1230 |
|
1231 |
-
|
1232 |
-
|
|
|
|
|
|
|
1233 |
|
1234 |
-
|
|
|
|
|
|
|
1235 |
|
1236 |
-
|
1237 |
-
|
1238 |
-
|
1239 |
-
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
1240 |
-
elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
|
1241 |
-
print("Using AWS Bedrock model:", model_choice)
|
1242 |
-
else:
|
1243 |
-
print("Using local model:", model_choice)
|
1244 |
-
|
1245 |
-
# Preparing candidate topics if no topics currently exist
|
1246 |
-
if candidate_topics and existing_unique_topics_df.empty:
|
1247 |
-
progress(0.1, "Creating revised zero shot topics table")
|
1248 |
-
|
1249 |
-
# 'Zero shot topics' are those supplied by the user
|
1250 |
-
max_topic_no = 120
|
1251 |
-
zero_shot_topics = read_file(candidate_topics.name)
|
1252 |
-
|
1253 |
-
# Max 120 topics allowed
|
1254 |
-
if zero_shot_topics.shape[0] > max_topic_no:
|
1255 |
-
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1256 |
-
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
|
1257 |
-
|
1258 |
-
# Forward slashes in the topic names seems to confuse the model
|
1259 |
-
if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
|
1260 |
-
for x in zero_shot_topics.columns:
|
1261 |
-
zero_shot_topics.loc[:, x] = (
|
1262 |
-
zero_shot_topics.loc[:, x]
|
1263 |
-
.str.strip()
|
1264 |
-
.str.replace('\n', ' ')
|
1265 |
-
.str.replace('\r', ' ')
|
1266 |
-
.str.replace('/', ' or ')
|
1267 |
-
.str.lower()
|
1268 |
-
.str.capitalize())
|
1269 |
-
|
1270 |
-
# If number of columns is 1, keep only subtopics
|
1271 |
-
if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
|
1272 |
-
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1273 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1274 |
-
# Allow for possibility that the user only wants to set general topics and not subtopics
|
1275 |
-
elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
|
1276 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1277 |
-
zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
|
1278 |
-
# If general topic and subtopic are specified
|
1279 |
-
elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
|
1280 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1281 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
|
1282 |
-
# If number of columns is 2, keep general topics and subtopics
|
1283 |
-
elif zero_shot_topics.shape[1] == 2:
|
1284 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
|
1285 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
|
1286 |
-
else:
|
1287 |
-
# If there are more columns, just assume that the first column was meant to be a subtopic
|
1288 |
-
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1289 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1290 |
|
1291 |
-
|
1292 |
-
|
1293 |
-
zero_shot_topics_gen_topics_list.append("")
|
1294 |
-
zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
|
1295 |
|
1296 |
-
|
1297 |
-
|
1298 |
-
|
1299 |
-
|
1300 |
-
|
1301 |
-
|
1302 |
-
})
|
1303 |
-
unique_topics_markdown = unique_topics_df.to_markdown()
|
1304 |
|
1305 |
-
|
1306 |
-
|
1307 |
-
|
1308 |
|
1309 |
-
|
1310 |
-
|
1311 |
|
1312 |
-
|
1313 |
-
|
1314 |
|
1315 |
-
|
1316 |
|
1317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1318 |
|
1319 |
-
|
1320 |
|
1321 |
-
|
1322 |
-
try:
|
1323 |
-
zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
|
1324 |
-
print("Output revised zero shot topics table is:", zero_shot_topics_df)
|
1325 |
|
1326 |
-
|
1327 |
-
|
1328 |
-
|
|
|
|
|
1329 |
|
1330 |
-
|
1331 |
-
|
1332 |
-
|
1333 |
-
|
1334 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1335 |
|
1336 |
-
|
1337 |
-
|
1338 |
-
|
1339 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1340 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1341 |
-
else:
|
1342 |
-
zero_shot_topics_df = pd.DataFrame(data={
|
1343 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1344 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1345 |
-
|
1346 |
-
#print("Zero shot topics are:", zero_shot_topics_df)
|
1347 |
-
|
1348 |
-
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1349 |
-
if not existing_unique_topics_df.empty:
|
1350 |
-
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1351 |
-
else:
|
1352 |
-
existing_unique_topics_df = zero_shot_topics_df
|
1353 |
|
1354 |
-
|
1355 |
-
# If you have already created revised zero shot topics, concat to the current
|
1356 |
-
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
|
1357 |
|
1358 |
-
|
1359 |
|
1360 |
-
|
1361 |
-
existing_unique_topics_df["Response References"] = ""
|
1362 |
-
existing_unique_topics_df.fillna("", inplace=True)
|
1363 |
-
existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
|
1364 |
-
existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
|
1365 |
|
1366 |
-
# print("existing_unique_topics_df:", existing_unique_topics_df)
|
1367 |
|
1368 |
-
|
1369 |
-
if force_zero_shot_radio == "Yes":
|
1370 |
-
unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
|
1371 |
-
topic_assignment_prompt = force_existing_topics_prompt
|
1372 |
-
else:
|
1373 |
-
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
|
1374 |
-
topic_assignment_prompt = allow_new_topics_prompt
|
1375 |
-
|
1376 |
-
|
1377 |
-
# Format the summary prompt with the response table and topics
|
1378 |
-
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1379 |
-
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
|
1380 |
-
|
1381 |
|
1382 |
-
|
1383 |
-
|
1384 |
-
|
1385 |
-
|
1386 |
-
full_prompt = formatted_system_prompt + formatted_summary_prompt
|
1387 |
|
1388 |
-
|
1389 |
-
|
1390 |
-
# Define the output file path for the formatted prompt
|
1391 |
-
formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1392 |
-
|
1393 |
-
# Write the formatted prompt to the specified file
|
1394 |
-
try:
|
1395 |
-
with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1396 |
-
f.write(full_prompt)
|
1397 |
-
except Exception as e:
|
1398 |
-
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1399 |
|
1400 |
-
|
|
|
1401 |
|
1402 |
-
|
|
|
1403 |
|
1404 |
-
|
1405 |
-
summary_whole_conversation = []
|
1406 |
|
1407 |
-
|
1408 |
-
responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1409 |
|
1410 |
-
|
1411 |
-
|
|
|
|
|
1412 |
|
1413 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1414 |
|
1415 |
-
|
1416 |
-
try:
|
1417 |
-
final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1418 |
|
1419 |
-
|
1420 |
-
|
1421 |
-
|
1422 |
-
|
1423 |
-
|
1424 |
-
f.write(responses[-1]["choices"][0]['text'])
|
1425 |
-
else:
|
1426 |
-
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1427 |
-
f.write(responses[-1].text)
|
1428 |
|
1429 |
-
|
1430 |
-
|
1431 |
-
|
1432 |
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
#return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1438 |
|
1439 |
-
|
1440 |
-
|
1441 |
-
new_topic_df.to_csv(topic_table_out_path, index=None)
|
1442 |
-
log_files_output_paths.append(topic_table_out_path)
|
1443 |
|
1444 |
-
|
1445 |
-
|
1446 |
-
|
|
|
|
|
1447 |
|
1448 |
-
|
1449 |
-
|
|
|
|
|
1450 |
|
1451 |
-
|
1452 |
-
|
1453 |
-
|
1454 |
-
# Outputs for markdown table output
|
1455 |
-
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
1456 |
-
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
1457 |
|
1458 |
-
|
1459 |
-
|
1460 |
-
|
1461 |
|
1462 |
-
|
1463 |
-
|
1464 |
|
1465 |
-
|
1466 |
-
log_files_output_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
|
1467 |
|
1468 |
-
|
|
|
|
|
|
|
|
|
1469 |
|
1470 |
-
|
1471 |
-
|
1472 |
-
|
1473 |
-
|
1474 |
-
# Prepare Gemini models before query
|
1475 |
-
if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
|
1476 |
-
print("Using Gemini model:", model_choice)
|
1477 |
-
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
1478 |
-
elif model_choice in ["gemma_2b_it_local"]:
|
1479 |
-
print("Using local Gemma 2b model")
|
1480 |
-
else:
|
1481 |
-
print("Using AWS Bedrock model:", model_choice)
|
1482 |
|
1483 |
-
|
1484 |
|
1485 |
-
|
|
|
|
|
1486 |
|
1487 |
-
|
1488 |
-
|
1489 |
-
|
1490 |
-
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1491 |
-
else: formatted_prompt3 = prompt3
|
1492 |
|
1493 |
-
|
1494 |
-
|
1495 |
-
formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
|
1496 |
-
formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
|
1497 |
|
1498 |
-
|
1499 |
-
|
1500 |
-
whole_conversation = [formatted_initial_table_system_prompt]
|
1501 |
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
1505 |
-
|
|
|
|
|
1506 |
|
1507 |
-
|
1508 |
-
|
|
|
1509 |
|
1510 |
-
|
|
|
1511 |
|
1512 |
-
|
1513 |
-
|
1514 |
-
|
1515 |
-
# unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1516 |
-
|
1517 |
-
|
1518 |
-
#all_topic_tables_df.append(topic_table_df)
|
1519 |
|
1520 |
-
|
1521 |
-
|
1522 |
|
1523 |
-
|
1524 |
-
|
|
|
1525 |
|
1526 |
-
|
|
|
|
|
1527 |
|
1528 |
-
|
1529 |
|
1530 |
-
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1531 |
-
out_file_paths.append(unique_topics_df_out_path)
|
1532 |
-
|
1533 |
-
#all_markdown_topic_tables.append(markdown_table)
|
1534 |
|
1535 |
-
|
1536 |
-
|
1537 |
-
|
1538 |
-
# Write final output to text file also
|
1539 |
-
try:
|
1540 |
-
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1541 |
-
|
1542 |
-
if isinstance(responses[-1], ResponseObject):
|
1543 |
-
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1544 |
-
f.write(responses[-1].text)
|
1545 |
-
unique_table_df_display_table_markdown = responses[-1].text
|
1546 |
-
elif "choices" in responses[-1]:
|
1547 |
-
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1548 |
-
f.write(responses[-1]["choices"][0]['text'])
|
1549 |
-
unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
|
1550 |
-
else:
|
1551 |
-
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1552 |
-
f.write(responses[-1].text)
|
1553 |
-
unique_table_df_display_table_markdown = responses[-1].text
|
1554 |
|
1555 |
-
log_files_output_paths.append(final_table_output_path)
|
1556 |
|
1557 |
-
|
1558 |
-
|
1559 |
-
|
1560 |
-
new_topic_df = topic_table_df
|
1561 |
-
new_reference_df = reference_df
|
1562 |
|
1563 |
-
|
1564 |
-
|
1565 |
|
1566 |
-
#
|
1567 |
-
|
1568 |
-
|
1569 |
-
latest_batch_completed += 1
|
1570 |
|
1571 |
-
|
1572 |
-
|
1573 |
|
1574 |
-
|
1575 |
-
|
1576 |
-
|
1577 |
-
tqdm._instances.clear()
|
1578 |
-
break
|
1579 |
|
1580 |
-
|
1581 |
-
|
1582 |
-
existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
|
1583 |
-
existing_topics_table = new_topic_df.dropna(how='all')
|
1584 |
|
|
|
|
|
1585 |
# The topic table that can be modified does not need the summary column
|
1586 |
-
modifiable_unique_topics_df =
|
1587 |
-
|
1588 |
-
out_time = f"{final_time:0.1f} seconds."
|
1589 |
-
|
1590 |
-
out_message.append('All queries successfully completed in')
|
1591 |
|
1592 |
-
|
1593 |
-
final_message_out = final_message_out + " " + out_time
|
1594 |
|
1595 |
-
|
1596 |
|
1597 |
|
1598 |
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
|
@@ -1683,12 +1723,22 @@ def join_modified_topic_names_to_ref_table(modified_unique_topics_df:pd.DataFrame, ...)
 def modify_existing_output_tables(original_unique_topics_df:pd.DataFrame, modifiable_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame, text_output_file_list_state:List[str]) -> Tuple:
    '''
    Take a unique_topics table that has been modified, apply these new topic names to the long-form reference_df, and save both tables to file.
-   '''
-   […]
-
-   print("
 
    output_file_list = []
        self.text = text
        self.usage_metadata = usage_metadata
 
+max_tokens = 4096 # Maximum number of output tokens
 timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
 number_of_api_retry_attempts = 5
+# Try up to 3 times to get a valid markdown table response with LLM calls, otherwise retry with temperature changed
+MAX_OUTPUT_VALIDATION_ATTEMPTS = 3
 max_time_for_loop = 99999
 batch_size_default = 5
 deduplication_threshold = 90
 
    # Clear any existing progress bars
    tqdm._instances.clear()
 
    progress_bar = range(0,number_of_api_retry_attempts)
 
    # Generate the model's response
 
    # Update the conversation history with the new prompt and response
    conversation_history.append({'role': 'user', 'parts': [prompt]})
 
    # Check if is a LLama.cpp model response
    # Check if the response is a ResponseObject
    if isinstance(response, ResponseObject):
 
    return out_df, is_error
 
+def call_llm_with_markdown_table_checks(batch_prompts: List[str],
+                                        system_prompt: str,
+                                        conversation_history: List[dict],
+                                        whole_conversation: List[str],
+                                        whole_conversation_metadata: List[str],
+                                        model: object,
+                                        config: dict,
+                                        model_choice: str,
+                                        temperature: float,
+                                        reported_batch_no: int,
+                                        local_model: object,
+                                        MAX_OUTPUT_VALIDATION_ATTEMPTS: int,
+                                        master:bool=False) -> Tuple[List[ResponseObject], List[dict], List[str], List[str], str]:
+    """
+    Call the large language model with checks for a valid markdown table.
+
+    Parameters:
+    - batch_prompts (List[str]): A list of prompts to be processed.
+    - system_prompt (str): The system prompt.
+    - conversation_history (List[dict]): The history of the conversation.
+    - whole_conversation (List[str]): The complete conversation including prompts and responses.
+    - whole_conversation_metadata (List[str]): Metadata about the whole conversation.
+    - model (object): The model to use for processing the prompts.
+    - config (dict): Configuration for the model.
+    - model_choice (str): The choice of model to use.
+    - temperature (float): The temperature parameter for the model.
+    - reported_batch_no (int): The reported batch number.
+    - local_model (object): The local model to use.
+    - MAX_OUTPUT_VALIDATION_ATTEMPTS (int): The maximum number of attempts to validate the output.
+    - master (bool, optional): Boolean to determine whether this call is for the master output table.
+
+    Returns:
+    - Tuple[List[ResponseObject], List[dict], List[str], List[str], str]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, the updated whole conversation metadata, and the response text.
+    """
+
+    call_temperature = temperature # This is correct now with the fixed parameter name
+
+    # Update Gemini config with the temperature settings
+    config = ai.GenerationConfig(temperature=call_temperature, max_output_tokens=max_tokens)
+
+    for attempt in range(MAX_OUTPUT_VALIDATION_ATTEMPTS):
+        # Process requests to large language model
+        responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(
+            batch_prompts, system_prompt, conversation_history, whole_conversation,
+            whole_conversation_metadata, model, config, model_choice,
+            call_temperature, reported_batch_no, local_model, master=master
+        )
+
+        stripped_response = responses[-1].text.strip()
+
+        # Check if response meets our criteria (length and contains table)
+        if len(stripped_response) > 120 and '|' in stripped_response:
+            print(f"Attempt {attempt + 1} produced response with markdown table.")
+            break # Success - exit loop
+
+        # Increase temperature for next attempt
+        call_temperature = temperature + (0.1 * (attempt + 1))
+        print(f"Attempt {attempt + 1} resulted in invalid table: {stripped_response}. "
+              f"Trying again with temperature: {call_temperature}")
+
+    else: # This runs if no break occurred (all attempts failed)
+        print(f"Failed to get valid response after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts")
+
+    return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
 
 def write_llm_output_and_logs(responses: List[ResponseObject],
                               whole_conversation: List[str],
 
    # Create a new DataFrame from the reference data
    new_reference_df = pd.DataFrame(reference_data)
 
+   #print("new_reference_df:", new_reference_df)
 
    # Append on old reference data
    out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
 
    llama_cpp_prefix = "<start_of_turn>user\n"
    llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
 
    # If you have a file input but no file data it hasn't yet been loaded. Load it here.
    if file_data.empty:
        print("No data table found, loading from file")
        try:
+           #print("in_data_file:", in_data_file)
            in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
+           #print("in_colnames:", in_colnames_drop)
            file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
+           #print("file_data loaded in:", file_data)
        except:
            # Check if files and text exist
            out_message = "Please enter a data file to summarise."
 
    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:
+       print("This is the first time through the loop, resetting latest_batch_completed to 0")
        if (latest_batch_completed == 999) | (latest_batch_completed == 0):
            latest_batch_completed = 0
            out_message = []
|
|
1128 |
local_model, tokenizer = load_model()
|
1129 |
print("Local model loaded:", local_model)
|
1130 |
|
1131 |
+
|
1132 |
+
|
1133 |
+
|
1134 |
+
if num_batches > 0:
|
1135 |
+
progress_measure = round(latest_batch_completed / num_batches, 1)
|
1136 |
+
progress(progress_measure, desc="Querying large language model")
|
1137 |
+
else:
|
1138 |
+
progress(0.1, desc="Querying large language model")
|
1139 |
|
1140 |
+
if latest_batch_completed < num_batches:
|
|
|
|
|
|
|
|
|
1141 |
|
1142 |
+
# Load file
|
1143 |
+
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
1144 |
+
if isinstance(out_message, str):
|
1145 |
+
out_message = [out_message]
|
1146 |
|
1147 |
+
if not out_file_paths:
|
1148 |
+
out_file_paths = []
|
1149 |
+
|
1150 |
+
|
1151 |
+
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
1152 |
+
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
1153 |
+
print(out_message)
|
1154 |
+
raise Exception(out_message)
|
1155 |
+
#return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
|
1156 |
+
|
1157 |
+
|
1158 |
+
if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1159 |
+
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
|
1160 |
+
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
|
1161 |
+
else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1162 |
+
|
1163 |
+
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
1164 |
+
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
1165 |
|
1166 |
+
for i in topics_loop:
|
1167 |
+
#for latest_batch_completed in range(num_batches):
|
1168 |
+
reported_batch_no = latest_batch_completed + 1
|
1169 |
+
print("Running query batch", str(reported_batch_no))
|
1170 |
|
1171 |
+
# Call the function to prepare the input table
|
1172 |
+
simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
|
1173 |
+
#log_files_output_paths.append(simplified_csv_table_path)
|
1174 |
|
1175 |
+
# Conversation history
|
1176 |
+
conversation_history = []
|
1177 |
|
1178 |
+
#print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
|
|
|
1179 |
|
1180 |
+
# If the latest batch of responses contains at least one instance of text
|
1181 |
+
if not batch_basic_response_df.empty:
|
|
|
|
|
|
|
|
|
1182 |
|
1183 |
+
# If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
|
1184 |
+
if latest_batch_completed >= 1 or candidate_topics is not None:
|
|
|
|
|
1185 |
|
1186 |
+
# Prepare Gemini models before query
|
1187 |
+
if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
|
1188 |
+
print("Using Gemini model:", model_choice)
|
1189 |
+
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
1190 |
+
elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
|
1191 |
+
print("Using AWS Bedrock model:", model_choice)
|
1192 |
+
else:
|
1193 |
+
print("Using local model:", model_choice)
|
1194 |
+
|
1195 |
+
# Preparing candidate topics if no topics currently exist
|
1196 |
+
if candidate_topics and existing_unique_topics_df.empty:
|
1197 |
+
progress(0.1, "Creating revised zero shot topics table")
|
1198 |
+
|
1199 |
+
# 'Zero shot topics' are those supplied by the user
|
1200 |
+
max_topic_no = 120
|
1201 |
+
zero_shot_topics = read_file(candidate_topics.name)
|
1202 |
+
|
1203 |
+
# Max 120 topics allowed
|
1204 |
+
if zero_shot_topics.shape[0] > max_topic_no:
|
1205 |
+
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1206 |
+
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
|
1207 |
+
|
1208 |
+
# Forward slashes in the topic names seems to confuse the model
|
1209 |
+
if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
|
1210 |
+
for x in zero_shot_topics.columns:
|
1211 |
+
zero_shot_topics.loc[:, x] = (
|
1212 |
+
zero_shot_topics.loc[:, x]
|
1213 |
+
.str.strip()
|
1214 |
+
.str.replace('\n', ' ')
|
1215 |
+
.str.replace('\r', ' ')
|
1216 |
+
.str.replace('/', ' or ')
|
1217 |
+
.str.lower()
|
1218 |
+
.str.capitalize())
|
1219 |
+
|
1220 |
+
# If number of columns is 1, keep only subtopics
|
1221 |
+
if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
|
1222 |
+
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1223 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1224 |
+
# Allow for possibility that the user only wants to set general topics and not subtopics
|
1225 |
+
elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
|
1226 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1227 |
+
zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
|
1228 |
+
# If general topic and subtopic are specified
|
1229 |
+
elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
|
1230 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1231 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
|
1232 |
+
# If number of columns is 2, keep general topics and subtopics
|
1233 |
+
elif zero_shot_topics.shape[1] == 2:
|
1234 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
|
1235 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
|
1236 |
+
else:
|
1237 |
+
# If there are more columns, just assume that the first column was meant to be a subtopic
|
1238 |
+
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1239 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1240 |
+
|
1241 |
+
# If the responses are being forced into zero shot topics, allow an option for nothing relevant
|
1242 |
+
if force_zero_shot_radio == "Yes":
|
1243 |
+
zero_shot_topics_gen_topics_list.append("")
|
1244 |
+
zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
|
1245 |
+
|
1246 |
+
if create_revised_general_topics == True:
|
1247 |
+
# Create the most up to date list of topics and subtopics.
|
1248 |
+
# If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
|
1249 |
+
unique_topics_df = pd.DataFrame(data={
|
1250 |
+
"General Topic":zero_shot_topics_gen_topics_list,
|
1251 |
+
"Subtopic":zero_shot_topics_subtopics_list
|
1252 |
+
})
|
1253 |
+
unique_topics_markdown = unique_topics_df.to_markdown()
|
1254 |
|
1255 |
+
print("unique_topics_markdown:", unique_topics_markdown)
|
1256 |
+
|
1257 |
+
formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1258 |
|
1259 |
+
# Format the general_topics prompt with the topics
|
1260 |
+
formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
|
|
|
1261 |
|
1262 |
+
if model_choice == "gemma_2b_it_local":
|
1263 |
+
formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
|
1264 |
|
1265 |
+
formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
|
|
|
|
|
1266 |
|
1267 |
+
whole_conversation = []
|
|
|
|
|
|
|
|
|
1268 |
|
1269 |
+
general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
1270 |
|
1271 |
+
# Convert response text to a markdown table
|
1272 |
+
try:
|
1273 |
+
zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
|
1274 |
+
print("Output revised zero shot topics table is:", zero_shot_topics_df)
|
1275 |
|
1276 |
+
zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
|
1277 |
+
zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
|
1278 |
+
out_file_paths.append(zero_shot_revised_path)
|
1279 |
|
1280 |
+
except Exception as e:
|
1281 |
+
print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
|
1282 |
+
zero_shot_topics_df = pd.DataFrame(data={
|
1283 |
+
"General Topic":zero_shot_topics_gen_topics_list,
|
1284 |
+
"Subtopic":zero_shot_topics_subtopics_list})
|
1285 |
|
1286 |
+
if zero_shot_topics_df.empty:
|
1287 |
+
print("Creation of revised general topics df failed, reverting to original list")
|
1288 |
+
zero_shot_topics_df = pd.DataFrame(data={
|
1289 |
+
"General Topic":zero_shot_topics_gen_topics_list,
|
1290 |
+
"Subtopic":zero_shot_topics_subtopics_list})
|
1291 |
+
else:
|
1292 |
+
zero_shot_topics_df = pd.DataFrame(data={
|
1293 |
+
"General Topic":zero_shot_topics_gen_topics_list,
|
1294 |
+
"Subtopic":zero_shot_topics_subtopics_list})
|
1295 |
+
|
1296 |
+
|
1297 |
+
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1298 |
+
if not existing_unique_topics_df.empty:
|
1299 |
+
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1300 |
+
else:
|
1301 |
+
existing_unique_topics_df = zero_shot_topics_df
|
1302 |
+
|
1303 |
+
if candidate_topics and not zero_shot_topics_df.empty:
|
1304 |
+
# If you have already created revised zero shot topics, concat to the current
|
1305 |
+
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
|
1306 |
+
|
1307 |
+
#all_topic_tables_df_merged = existing_unique_topics_df
|
1308 |
+
existing_unique_topics_df["Response References"] = ""
|
1309 |
+
existing_unique_topics_df.fillna("", inplace=True)
|
1310 |
+
existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
|
1311 |
+
existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
|
1312 |
+
|
1313 |
+
# print("existing_unique_topics_df:", existing_unique_topics_df)
|
1314 |
+
|
1315 |
+
# If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
|
1316 |
+
if force_zero_shot_radio == "Yes":
|
1317 |
+
unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
|
1318 |
+
topic_assignment_prompt = force_existing_topics_prompt
|
1319 |
+
else:
|
1320 |
+
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
|
1321 |
+
topic_assignment_prompt = allow_new_topics_prompt
|
1322 |
+
|
1323 |
|
1324 |
+
# Format the summary prompt with the response table and topics
|
1325 |
+
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1326 |
+
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
|
1327 |
+
|
1328 |
|
1329 |
+
if model_choice == "gemma_2b_it_local":
|
1330 |
+
formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
|
1331 |
+
full_prompt = formatted_summary_prompt
|
1332 |
+
else:
|
1333 |
+
full_prompt = formatted_system_prompt + formatted_summary_prompt
|
1334 |
+
|
1335 |
+
#latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
|
1336 |
|
1337 |
+
# Define the output file path for the formatted prompt
|
1338 |
+
formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
|
|
1339 |
|
1340 |
+
# Write the formatted prompt to the specified file
|
1341 |
+
try:
|
1342 |
+
with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1343 |
+
f.write(full_prompt)
|
1344 |
+
except Exception as e:
|
1345 |
+
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1346 |
|
1347 |
+
summary_prompt_list = [formatted_summary_prompt]
|
1348 |
|
1349 |
+
# print("master_summary_prompt_list:", summary_prompt_list[0])
|
|
|
1350 |
|
1351 |
+
summary_conversation_history = []
|
1352 |
+
summary_whole_conversation = []
|
1353 |
|
1354 |
+
# Process requests to large language model
|
1355 |
+
# responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1356 |
|
1357 |
+
responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
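This call is the resilience change described in the commit message: the model is re-queried when it does not return a valid markdown table, with the temperature nudged between attempts. The internals of `call_llm_with_markdown_table_checks` are not shown in this diff, so the sketch below only illustrates that validate-and-retry pattern with a stubbed model call; the attempt limit is an assumed value.

```python
import re

MAX_OUTPUT_VALIDATION_ATTEMPTS = 3  # mirrors the constant used above (value assumed)

def looks_like_markdown_table(text: str) -> bool:
    """Rough check: at least a header row, a |---| separator row and one data row."""
    rows = [line for line in text.splitlines() if line.strip().startswith("|")]
    return len(rows) >= 3 and any(re.match(r"^\|[\s:|-]+\|$", line.strip()) for line in rows)

def call_with_table_checks(call_model, prompt: str, temperature: float,
                           max_attempts: int = MAX_OUTPUT_VALIDATION_ATTEMPTS) -> str:
    """Re-query the model with a nudged temperature until it returns a parseable table."""
    current_temperature = temperature
    response_text = ""
    for attempt in range(max_attempts):
        response_text = call_model(prompt, current_temperature)
        if looks_like_markdown_table(response_text):
            return response_text
        print(f"Attempt {attempt + 1}: output was not a valid markdown table, retrying.")
        # Nudge the temperature so the next attempt is not an identical failure
        current_temperature = min(current_temperature + 0.1, 1.0)
    return response_text  # caller decides how to handle a still-invalid table

# Example with a stubbed model call
def fake_model(prompt: str, temperature: float) -> str:
    return "| Topic | Sentiment |\n|---|---|\n| Buses | Negative |"

print(call_with_table_checks(fake_model, "Extract topics...", temperature=0.1))
```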
|
1358 |
|
1359 |
+
# print("responses:", responses[-1].text)
|
1360 |
+
# print("Whole conversation metadata:", whole_conversation_metadata)
|
1361 |
|
1362 |
+
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
|
1363 |
|
1364 |
+
# Write final output to text file for logging purposes
|
1365 |
+
try:
|
1366 |
+
final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1367 |
|
1368 |
+
if isinstance(responses[-1], ResponseObject):
|
1369 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1370 |
+
f.write(responses[-1].text)
|
1371 |
+
elif "choices" in responses[-1]:
|
1372 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1373 |
+
f.write(responses[-1]["choices"][0]['text'])
|
1374 |
+
else:
|
1375 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1376 |
+
f.write(responses[-1].text)
|
1377 |
|
1378 |
+
except Exception as e:
|
1379 |
+
print("Error in returning model response:", e)
|
1380 |
+
|
1381 |
|
1382 |
+
# If error in table parsing, leave function
|
1383 |
+
if is_error == True:
|
1384 |
+
final_message_out = "Could not complete summary, error in LLM output."
|
1385 |
+
raise Exception(final_message_out)
|
1386 |
+
#return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1387 |
|
1388 |
+
# Write outputs to csv
|
1389 |
+
## Topics with references
|
1390 |
+
new_topic_df.to_csv(topic_table_out_path, index=None)
|
1391 |
+
log_files_output_paths.append(topic_table_out_path)
|
1392 |
|
1393 |
+
## Reference table mapping response numbers to topics
|
1394 |
+
new_reference_df.to_csv(reference_table_out_path, index=None)
|
1395 |
+
out_file_paths.append(reference_table_out_path)
|
1396 |
|
1397 |
+
## Unique topic list
|
1398 |
+
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
|
1399 |
|
1400 |
+
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1401 |
+
out_file_paths.append(unique_topics_df_out_path)
|
1402 |
+
|
1403 |
+
# Outputs for markdown table output
|
1404 |
+
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
1405 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
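Long summaries make the markdown preview unreadable, so each cell is truncated and wrapped before `to_markdown` is called. Below is a stand-in for the repo's own `wrap_text` helper, assuming a simple textwrap-based implementation.

```python
import textwrap
import pandas as pd

def wrap_text(value, max_text_length: int = 500, width: int = 60):
    """Truncate long cell values and insert line breaks so the markdown
    preview stays readable (a stand-in for the repo's wrap_text helper)."""
    if not isinstance(value, str):
        return value
    value = value[:max_text_length]
    return "<br>".join(textwrap.wrap(value, width=width)) or value

df = pd.DataFrame({"Subtopic": ["Buses"], "Summary": ["A very long summary " * 20]})
display_df = df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
print(display_df.to_markdown(index=False))
```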
|
1406 |
|
1407 |
+
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1408 |
+
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
1409 |
+
|
1410 |
|
1411 |
+
#out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
|
1412 |
+
#log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
|
1413 |
|
1414 |
+
out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
|
1415 |
+
log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
|
1416 |
|
1417 |
+
#print("out_file_paths at end of loop:", out_file_paths)
|
1418 |
|
1419 |
+
# If this is the first batch, run this
|
1420 |
+
else:
|
1421 |
+
#system_prompt = system_prompt + normalised_simple_markdown_table
|
1422 |
+
|
1423 |
+
# Prepare Gemini models before query
|
1424 |
+
if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
|
1425 |
+
print("Using Gemini model:", model_choice)
|
1426 |
+
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
1427 |
+
elif model_choice in ["gemma_2b_it_local"]:
|
1428 |
+
print("Using local Gemma 2b model")
|
1429 |
+
else:
|
1430 |
+
print("Using AWS Bedrock model:", model_choice)
|
1431 |
|
1432 |
+
formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1433 |
|
1434 |
+
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1435 |
|
1436 |
+
if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1437 |
+
else: formatted_prompt2 = prompt2
|
1438 |
+
|
1439 |
+
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1440 |
+
else: formatted_prompt3 = prompt3
|
1441 |
|
1442 |
+
if model_choice == "gemma_2b_it_local":
|
1443 |
+
formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
|
1444 |
+
formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
|
1445 |
+
formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
|
|
|
1446 |
|
1447 |
+
batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
|
1448 |
+
|
1449 |
+
whole_conversation = [formatted_initial_table_system_prompt]
|
1450 |
|
1451 |
+
|
|
|
|
|
1452 |
|
1453 |
+
|
1454 |
|
1455 |
+
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
|
1456 |
|
|
|
1457 |
|
1458 |
+
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=True)
|
1459 |
|
1460 |
+
# If error in table parsing, leave function
|
1461 |
+
if is_error == True:
|
1462 |
+
raise Exception("Error in output table parsing")
|
1463 |
+
# unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
|
|
1464 |
|
1465 |
+
|
1466 |
+
#all_topic_tables_df.append(topic_table_df)
|
1467 |
|
1468 |
+
topic_table_df.to_csv(topic_table_out_path, index=None)
|
1469 |
+
out_file_paths.append(topic_table_out_path)
|
1470 |
|
1471 |
+
reference_df.to_csv(reference_table_out_path, index=None)
|
1472 |
+
out_file_paths.append(reference_table_out_path)
|
1473 |
|
1474 |
+
## Unique topic list
|
|
|
1475 |
|
1476 |
+
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]).drop_duplicates('Subtopic')
|
|
|
1477 |
|
1478 |
+
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1479 |
+
out_file_paths.append(unique_topics_df_out_path)
|
1480 |
+
|
1481 |
+
#all_markdown_topic_tables.append(markdown_table)
|
1482 |
|
1483 |
+
whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1484 |
+
whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
|
1485 |
+
|
1486 |
+
# Write final output to text file also
|
1487 |
+
try:
|
1488 |
+
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1489 |
+
|
1490 |
+
if isinstance(responses[-1], ResponseObject):
|
1491 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1492 |
+
f.write(responses[-1].text)
|
1493 |
+
unique_table_df_display_table_markdown = responses[-1].text
|
1494 |
+
elif "choices" in responses[-1]:
|
1495 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1496 |
+
f.write(responses[-1]["choices"][0]['text'])
|
1497 |
+
unique_table_df_display_table_markdown = responses[-1]["choices"][0]['text']
|
1498 |
+
else:
|
1499 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1500 |
+
f.write(responses[-1].text)
|
1501 |
+
unique_table_df_display_table_markdown = responses[-1].text
|
1502 |
|
1503 |
+
log_files_output_paths.append(final_table_output_path)
|
1504 |
|
1505 |
+
except Exception as e:
|
1506 |
+
print("Error in returning model response:", e)
|
1507 |
+
|
1508 |
+
new_topic_df = topic_table_df
|
1509 |
+
new_reference_df = reference_df
|
1510 |
|
1511 |
+
else:
|
1512 |
+
print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
|
|
|
1513 |
|
1514 |
+
# Increase latest file completed count unless we are over the last batch number
|
1515 |
+
if latest_batch_completed <= num_batches:
|
1516 |
+
print("Completed batch number:", str(reported_batch_no))
|
1517 |
+
latest_batch_completed += 1
|
|
|
1518 |
|
1519 |
+
toc = time.perf_counter()
|
1520 |
+
final_time = toc - tic
|
1521 |
|
1522 |
+
if final_time > max_time_for_loop:
|
1523 |
+
print("Max time reached, breaking loop.")
|
1524 |
+
topics_loop.close()
|
1525 |
+
tqdm._instances.clear()
|
1526 |
+
break
|
1527 |
|
1528 |
+
# Overwrite 'existing' elements to add new tables
|
1529 |
+
existing_reference_df = new_reference_df.dropna(how='all')
|
1530 |
+
existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
|
1531 |
+
existing_topics_table = new_topic_df.dropna(how='all')
|
1532 |
|
1533 |
+
# The topic table that can be modified does not need the summary column
|
1534 |
+
modifiable_unique_topics_df = existing_unique_topics_df.drop("Summary", axis=1)
|
1535 |
|
1536 |
+
out_time = f"{final_time:0.1f} seconds."
|
1537 |
+
|
1538 |
+
out_message.append('All queries successfully completed in')
|
1539 |
|
1540 |
+
final_message_out = '\n'.join(out_message)
|
1541 |
+
final_message_out = final_message_out + " " + out_time
|
1542 |
|
1543 |
+
print(final_message_out)
|
|
|
1544 |
|
1545 |
+
# If we have extracted topics from the last batch, return the input out_message and file list to the relevant components
|
1546 |
+
if latest_batch_completed >= num_batches:
|
1547 |
+
print("Last batch reached, returning batch:", str(latest_batch_completed))
|
1548 |
+
# Set to a very high number so as not to mess with subsequent file processing by the user
|
1549 |
+
#latest_batch_completed = 999
|
1550 |
|
1551 |
+
toc = time.perf_counter()
|
1552 |
+
final_time = (toc - tic) + time_taken
|
1553 |
+
out_time = f"Everything finished in {round(final_time,1)} seconds."
|
1554 |
+
print(out_time)
|
1555 |
|
1556 |
+
print("All summaries completed. Creating outputs.")
|
1557 |
|
1558 |
+
model_choice_clean = model_name_map[model_choice]
|
1559 |
+
# Shorten the chosen column name for use in output file names
|
1560 |
+
in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
|
1561 |
|
1562 |
+
# Shorten output file names, as full-length names may exceed path length limits
|
1563 |
+
file_name = clean_column_name(file_name, max_length=30)
|
1564 |
|
1565 |
+
# Save outputs for each batch. If master file created, label file as master
|
1566 |
+
file_path_details = f"{file_name}_col_{in_column_cleaned}"
|
1567 |
|
1568 |
+
# Create a pivoted reference table
|
1569 |
+
existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
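`convert_reference_table_to_pivot_table` is defined elsewhere in the module, so its exact output shape is not shown in this diff. One plausible shape for the pivot (one row per response, one column per assigned subtopic) is sketched below on toy data.

```python
import pandas as pd

reference_df = pd.DataFrame({
    "Response References": [1, 1, 2, 3],
    "General Topic": ["Transport", "Housing", "Transport", "Housing"],
    "Subtopic": ["Buses", "Affordable homes", "Cycle lanes", "Repairs"],
})

# One plausible pivot: one row per response, one column per subtopic,
# with a 1 wherever that topic was assigned to the response
pivot = (pd.crosstab(reference_df["Response References"], reference_df["Subtopic"])
           .reset_index())
print(pivot)
```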
|
|
|
1570 |
|
1571 |
+
# Save the new DataFrame to CSV
|
1572 |
+
#topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1573 |
+
reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1574 |
+
reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1575 |
+
unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1576 |
+
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1577 |
|
1578 |
+
## Reference table mapping response numbers to topics
|
1579 |
+
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
1580 |
+
out_file_paths.append(reference_table_out_path)
|
1581 |
|
1582 |
+
# Create final unique topics table from reference table to ensure consistent numbers
|
1583 |
+
final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
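Rebuilding the unique-topics table from the reference table keeps the response counts consistent with the final assignments. The real `create_unique_table_df_from_reference_table` lives elsewhere in the module; a plausible equivalent on toy data, counting distinct responses per topic, is shown below.

```python
import pandas as pd

reference_df = pd.DataFrame({
    "Response References": [1, 2, 2, 3],
    "General Topic": ["Transport", "Transport", "Housing", "Transport"],
    "Subtopic": ["Buses", "Buses", "Repairs", "Buses"],
})

# Derive a unique-topics table whose counts agree with the reference table
unique_topics = (reference_df
                 .groupby(["General Topic", "Subtopic"])["Response References"]
                 .nunique()
                 .reset_index(name="Number of responses")
                 .sort_values("Number of responses", ascending=False))
print(unique_topics)
```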
|
1584 |
|
1585 |
+
## Unique topic list
|
1586 |
+
final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
1587 |
+
out_file_paths.append(unique_topics_df_out_path)
|
1588 |
|
1589 |
+
# Ensure that we are only returning the final results to outputs
|
1590 |
+
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
1591 |
|
1592 |
+
## Reference table mapping response numbers to topics
|
1593 |
+
existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
|
1594 |
+
log_files_output_paths.append(reference_table_out_pivot_path)
|
1595 |
|
1596 |
+
## Create a dataframe for missing response references:
|
1597 |
+
# Assuming existing_reference_df and file_data are already defined
|
1598 |
+
# Simplify table to just responses column and the Response reference number
|
1599 |
|
1600 |
+
basic_response_data = get_basic_response_data(file_data, chosen_cols)
|
1601 |
|
1602 |
|
1603 |
+
# Save simplified file data to log outputs
|
1604 |
+
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
|
1605 |
+
log_files_output_paths.append(basic_response_data_out_path)
|
1606 |
|
|
|
1607 |
|
1608 |
+
# Step 1: Identify missing references
|
1609 |
+
missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
|
1610 |
|
1611 |
+
# Step 2: Create a new DataFrame with the same columns as existing_reference_df
|
1612 |
+
missing_df = pd.DataFrame(columns=existing_reference_df.columns)
|
1613 |
|
1614 |
+
# Step 3: Populate the new DataFrame
|
1615 |
+
missing_df['Response References'] = missing_references['Reference']
|
1616 |
+
missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
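End to end, the missing-references steps above look like the toy example below: any response whose reference number never appears in the reference table is logged in a frame with the same columns, left otherwise empty. The data here is invented purely for illustration.

```python
import numpy as np
import pandas as pd

basic_response_data = pd.DataFrame({"Reference": [1, 2, 3, 4],
                                    "Response": ["a", "b", "c", "d"]})
existing_reference_df = pd.DataFrame({"Response References": ["1", "3"],
                                      "Subtopic": ["Buses", "Repairs"]})

# Step 1: responses whose reference number never received a topic assignment
assigned = existing_reference_df["Response References"].astype(str).unique()
missing_references = basic_response_data[~basic_response_data["Reference"].astype(str).isin(assigned)]

# Steps 2-3: log them in the same shape as the reference table, other columns empty
missing_df = pd.DataFrame(columns=existing_reference_df.columns)
missing_df["Response References"] = missing_references["Reference"].values
missing_df = missing_df.fillna(np.nan)
print(missing_df)
```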
|
|
|
1617 |
|
1618 |
+
# Display the new DataFrame
|
1619 |
+
#print("missing_df:", missing_df)
|
1620 |
|
1621 |
+
missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
1622 |
+
missing_df.to_csv(missing_df_out_path, index=None)
|
1623 |
+
log_files_output_paths.append(missing_df_out_path)
|
1624 |
|
1625 |
+
out_file_paths = list(set(out_file_paths))
|
1626 |
+
log_files_output_paths = list(set(log_files_output_paths))
|
1627 |
|
1628 |
+
final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
|
1629 |
+
|
1630 |
# The topic table that can be modified does not need the summary column
|
1631 |
+
modifiable_unique_topics_df = final_out_unique_topics_df.drop("Summary", axis=1)
|
1632 |
|
1633 |
+
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
|
|
|
1634 |
|
1635 |
+
return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
|
1636 |
|
1637 |
|
1638 |
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
|
|
|
1723 |
def modify_existing_output_tables(original_unique_topics_df:pd.DataFrame, modifiable_unique_topics_df:pd.DataFrame, reference_df:pd.DataFrame, text_output_file_list_state:List[str]) -> Tuple:
|
1724 |
'''
|
1725 |
Take a unique_topics table that has been modified, apply these new topic names to the long-form reference_df, and save both tables to file.
|
1726 |
+
'''
|
1727 |
+
|
1728 |
+
# Ensure text_output_file_list_state is a flat list
|
1729 |
+
if any(isinstance(i, list) for i in text_output_file_list_state):
|
1730 |
+
text_output_file_list_state = [item for sublist in text_output_file_list_state for item in sublist] # Flatten list
|
1731 |
+
|
1732 |
+
# Extract file paths safely
|
1733 |
+
reference_files = [x for x in text_output_file_list_state if 'reference' in x]
|
1734 |
+
unique_files = [x for x in text_output_file_list_state if 'unique' in x]
|
1735 |
+
|
1736 |
+
# Ensure files exist before accessing
|
1737 |
+
reference_file_path = os.path.basename(reference_files[0]) if reference_files else None
|
1738 |
+
unique_table_file_path = os.path.basename(unique_files[0]) if unique_files else None
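The file-selection logic above in miniature: flatten the (possibly nested) state list, then pick out the reference and unique-topics files by name. The paths below are invented for illustration only.

```python
import os

# A state list can contain nested lists of paths after several processing steps
text_output_file_list_state = [["out/consult_final_reference_table.csv"],
                               "out/consult_final_unique_topics.csv"]

# Flatten one level of nesting
flat_files = []
for item in text_output_file_list_state:
    flat_files.extend(item if isinstance(item, list) else [item])

reference_files = [f for f in flat_files if "reference" in f]
unique_files = [f for f in flat_files if "unique" in f]

reference_file_path = os.path.basename(reference_files[0]) if reference_files else None
unique_table_file_path = os.path.basename(unique_files[0]) if unique_files else None
print(reference_file_path, unique_table_file_path)
```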
|
1739 |
|
1740 |
+
print("Reference File:", reference_file_path)
|
1741 |
+
print("Unique Table File:", unique_table_file_path)
|
1742 |
|
1743 |
output_file_list = []
|
1744 |
|