seanpedrickcase committed
Commit: b0e08c8
Parent(s): 854a758

Changed default requirements to the CPU version of llama-cpp-python. Added Gemini 2.0 Flash (gemini-2.0-flash) to the model list. Output file lists should now contain only final files.

app.py CHANGED
@@ -16,7 +16,7 @@ today_rev = datetime.now().strftime("%Y%m%d")
 ensure_output_folder_exists()
 
 host_name = socket.gethostname()
-print("host_name is:", host_name)
+# print("host_name is:", host_name)
 
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
@@ -32,7 +32,7 @@ elif RUN_AWS_FUNCTIONS == "1":
     default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
 
 else:
-    default_model_choice = "gemini-1.5-flash-002"
+    default_model_choice = "gemini-2.0-flash"
 
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -229,6 +229,8 @@ with app:
         inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
         outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")
 
+    # return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
+
     # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
     latest_batch_completed.change(fn=extract_topics,
         inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
@@ -238,15 +240,15 @@ with app:
 
     # When button pressed, deduplicate data
     deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-        then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files])
+        then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files, log_files_output])
 
     # When button pressed, summarise previous data
     summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
        then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
        then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
-       then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
+       then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
 
-    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
+    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
 
     # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
     continue_previous_data_files_btn.click(
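For readers unfamiliar with the event wiring changed above: Gradio chains steps with .click(...).then(...), where one step's outputs (components or gr.State) become the next step's inputs. A minimal, runnable sketch of the same pattern, using hypothetical names rather than this app's components:

# Minimal sketch of the .click(...).then(...) chaining pattern used in app.py.
# Component and function names here are hypothetical, for illustration only.
import gradio as gr

def step_one(text):
    # First step returns a value that is stored in shared state
    return text.upper()

def step_two(state_value):
    # Second step consumes the state produced by step one
    return f"Processed: {state_value}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    state = gr.State()
    out = gr.Textbox(label="Output")
    btn = gr.Button("Run")

    # Each .then() runs after the previous step completes,
    # reading and writing shared components/state.
    btn.click(step_one, inputs=[inp], outputs=[state]).\
        then(step_two, inputs=[state], outputs=[out])

demo.launch()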
requirements.txt CHANGED
@@ -11,10 +11,8 @@ google-generativeai==0.8.3
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.10.1
-torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
-#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-# Specify exact llama_cpp wheel for huggingface compatibility
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
+torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 transformers==4.47.0
 numpy==1.26.4
 typing_extensions==4.12.2
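A quick sanity check after installing from this file, assuming the CPU wheels resolved as pinned (the inverse expectations apply to requirements_gpu.txt below):

# Sanity check after `pip install -r requirements.txt`:
# the CPU build of torch should report no CUDA support,
# and llama-cpp-python should import cleanly.
import torch
import llama_cpp

print("torch version:", torch.__version__)            # expected: 2.5.1+cpu
print("CUDA available:", torch.cuda.is_available())   # expected: False for the CPU wheel
print("llama-cpp-python version:", llama_cpp.__version__)  # expected: 0.2.90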
requirements_cpu.txt → requirements_gpu.txt RENAMED
@@ -11,8 +11,10 @@ google-generativeai==0.8.3
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.10.1
-torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
-llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
+#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Specify exact llama_cpp wheel for huggingface compatibility
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
 transformers==4.47.0
 numpy==1.26.4
 typing_extensions==4.12.2
tools/helper_functions.py CHANGED
@@ -46,14 +46,14 @@ def get_or_create_env_var(var_name, default_value):
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
-RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "0")
+RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
 print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')
 
 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
     model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
 else:
-    model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+    model_full_names = ["gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
     model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]
 
 if RUN_LOCAL_MODEL == "0":
@@ -76,7 +76,7 @@ def get_file_path_with_extension(file_path):
     # Return the basename with its extension
     return basename
 
-def get_file_path_end(file_path):
+def get_file_name_no_ext(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
 
@@ -246,10 +246,6 @@ def put_columns_in_df(in_file):
             # Read each sheet into a DataFrame
             df = pd.read_excel(file_name, sheet_name=sheet_name)
 
-            # Process the DataFrame (e.g., print its contents)
-            print(f"Sheet Name: {sheet_name}")
-            print(df.head()) # Print the first few rows
-
             new_choices.extend(list(df.columns))
 
         all_sheet_names.extend(new_sheet_names)
@@ -261,10 +257,10 @@ def put_columns_in_df(in_file):
     concat_choices.extend(new_choices)
 
     # Drop duplicate columns
-    concat_choices = list(set(concat_choices))
+    concat_choices = sorted(set(concat_choices))
 
     if number_of_excel_files > 0:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end
     else:
         return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end
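The switch from list(set(...)) to sorted(set(...)) above matters because set iteration order is not stable across Python processes (string hashing is randomised), so the dropdown choices, and in particular the default concat_choices[0], could change between sessions. A small sketch of the difference:

# list(set(...)) deduplicates but gives no ordering guarantee across runs,
# so concat_choices[0] was unstable; sorted(set(...)) deduplicates AND
# yields the same order every time.
choices = ["Response", "ID", "Response", "Date"]

unordered = list(set(choices))   # e.g. ['Date', 'Response', 'ID'] - order may vary per process
stable = sorted(set(choices))    # always ['Date', 'ID', 'Response']

print(unordered)
print(stable)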
 
tools/llm_api_call.py CHANGED
@@ -20,7 +20,7 @@ from io import StringIO
 GradioFileData = gr.FileData
 
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
-from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
+from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
 
 # ResponseObject class for AWS Bedrock calls
@@ -65,9 +65,9 @@ def load_in_file(file_path: str, colname:str=""):
     - file_path (str): The path to the file to be processed.
     """
     file_type = detect_file_type(file_path)
-    print("File type is:", file_type)
+    #print("File type is:", file_type)
 
-    file_name = get_file_path_end(file_path)
+    file_name = get_file_name_no_ext(file_path)
     file_data = read_file(file_path)
 
     if colname:
@@ -140,6 +140,14 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
 
     return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name
 
+def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str]) -> pd.DataFrame:
+    basic_response_data = file_data[[chosen_cols]].reset_index(names="Reference")
+    basic_response_data["Reference"] = basic_response_data["Reference"].astype(int) + 1
+    basic_response_data = basic_response_data.rename(columns={chosen_cols: "Response"})
+    basic_response_data["Response"] = basic_response_data["Response"].str.strip()
+
+    return basic_response_data
+
 def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
     """
     Processes a file by simplifying its content based on chosen columns and saves the result to a specified output folder.
@@ -163,11 +171,9 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     simplified_csv_table_path = ""
 
     # Simplify table to just responses column and the Response reference number
-    simple_file = file_data[[chosen_cols]].reset_index(names="Reference")
-    simple_file["Reference"] = simple_file["Reference"].astype(int) + 1
-    simple_file = simple_file.rename(columns={chosen_cols: "Response"})
-    simple_file["Response"] = simple_file["Response"].str.strip()
-    file_len = len(simple_file["Reference"])
+    basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+    file_len = len(basic_response_data["Reference"])
 
 
     # Subset the data for the current batch
@@ -181,35 +187,35 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     else:
         end_row = file_len + 1
 
-    simple_file = simple_file[start_row:end_row] # Select the current batch
+    batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
 
     # Now replace the reference numbers with numbers starting from 1
-    simple_file["Reference"] = simple_file["Reference"] - start_row
+    batch_basic_response_data["Reference"] = batch_basic_response_data["Reference"] - start_row
 
-    #print("simple_file:", simple_file)
+    #print("batch_basic_response_data:", batch_basic_response_data)
 
     # Remove problematic characters including ASCII and various quote marks
     # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
-    simple_file["Response"] = simple_file["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
-    simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
-    simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
-    simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
-    simple_file["Response"] = simple_file["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.strip() # Remove leading and trailing whitespace
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
 
     # Remove blank and extremely short responses
-    simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\
-                                  ~(simple_file["Response"] == "None") &\
-                                  ~(simple_file["Response"] == " ") &\
-                                  ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
+    batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
+                                                              ~(batch_basic_response_data["Response"] == "None") &\
+                                                              ~(batch_basic_response_data["Response"] == " ") &\
+                                                              ~(batch_basic_response_data["Response"] == ""),:]#~(batch_basic_response_data["Response"].str.len() < 5), :]
 
     #simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
-    #simple_file.to_csv(simplified_csv_table_path, index=None)
+    #batch_basic_response_data.to_csv(simplified_csv_table_path, index=None)
 
-    simple_markdown_table = simple_file.to_markdown(index=None)
+    simple_markdown_table = batch_basic_response_data.to_markdown(index=None)
 
     normalised_simple_markdown_table = normalise_string(simple_markdown_table)
 
-    return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, simple_file
+    return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
 
 def replace_punctuation_with_underscore(input_string):
     # Create a translation table where each punctuation character maps to '_'
@@ -368,7 +374,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     progress_bar = range(0,number_of_api_retry_attempts)
 
     # Generate the model's response
-    if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
+    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
 
         for i in progress_bar:
             try:
@@ -841,7 +847,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
     # Create a new DataFrame from the reference data
     new_reference_df = pd.DataFrame(reference_data)
 
-    print("new_reference_df:", new_reference_df)
+    #print("new_reference_df:", new_reference_df)
 
     # Append on old reference data
     out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
@@ -1040,10 +1046,15 @@ def extract_topics(in_data_file,
         # Save outputs for each batch. If master file created, label file as master
        file_path_details = f"{file_name}_col_{in_column_cleaned}"
 
+        # Create a pivoted reference table
+        existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
+
        # Save the new DataFrame to CSV
        #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+        reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
        reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
-        unique_topics_df_out_path = output_folder +file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+        unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+        basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
 
        # Write outputs to csv
        ## Topics with references
@@ -1058,19 +1069,32 @@ def extract_topics(in_data_file,
        existing_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
        out_file_paths.append(unique_topics_df_out_path)
 
+        # Ensure that we are only returning the final results to outputs
+        out_file_paths = [x for x in out_file_paths if '_final_' in x]
+
+        ## Reference table mapping response numbers to topics
+        existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
+        log_files_output_paths.append(reference_table_out_pivot_path)
+
        ## Create a dataframe for missing response references:
        # Assuming existing_reference_df and file_data are already defined
 
        # Simplify table to just responses column and the Response reference number
-        simple_file = file_data[[chosen_cols]].reset_index(names="Reference")
-        simple_file["Reference"] = simple_file["Reference"].astype(int) + 1
-        simple_file = simple_file.rename(columns={chosen_cols: "Response"})
-        simple_file["Response"] = simple_file["Response"].str.strip()
+        basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+        #print("basic_response_data:", basic_response_data)
+
+        # Save simplified file data to log outputs
+        pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
+        log_files_output_paths.append(basic_response_data_out_path)
 
        # Step 1: Identify missing references
-        #print("simple_file:", simple_file)
+        #print("basic_response_data:", basic_response_data)
 
-        missing_references = simple_file[~simple_file['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
+        missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
 
        # Step 2: Create a new DataFrame with the same columns as existing_reference_df
        missing_df = pd.DataFrame(columns=existing_reference_df.columns)
@@ -1126,21 +1150,21 @@ def extract_topics(in_data_file,
     print("Running query batch", str(reported_batch_no))
 
     # Call the function to prepare the input table
-    simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, simple_table_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
-    log_files_output_paths.append(simplified_csv_table_path)
+    simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
+    #log_files_output_paths.append(simplified_csv_table_path)
 
     # Conversation history
     conversation_history = []
 
-    print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
+    #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
 
     # If the latest batch of responses contains at least one instance of text
-    if not simple_table_df.empty:
+    if not batch_basic_response_df.empty:
 
         print("latest_batch_completed:", latest_batch_completed)
 
-        print("candidate_topics:", candidate_topics)
+        #print("candidate_topics:", candidate_topics)
 
        # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
        if latest_batch_completed >= 1 or candidate_topics is not None:
@@ -1148,7 +1172,7 @@
            #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
 
            # Prepare Gemini models before query
-            if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
+            if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                print("Using Gemini model:", model_choice)
                model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
            elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
@@ -1323,14 +1347,14 @@
            out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
            log_files_output_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
 
-            print("out_file_paths at end of loop:", out_file_paths)
+            #print("out_file_paths at end of loop:", out_file_paths)
 
        # If this is the first batch, run this
        else:
            #system_prompt = system_prompt + normalised_simple_markdown_table
 
            # Prepare Gemini models before query
-            if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
+            if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                print("Using Gemini model:", model_choice)
                model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
            else:
@@ -1418,8 +1442,8 @@
    else:
        print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
 
-    # Increase latest file completed count unless we are at the last file
-    if latest_batch_completed != num_batches:
+    # Increase latest file completed count unless we are over the last batch number
+    if latest_batch_completed <= num_batches:
        print("Completed batch number:", str(reported_batch_no))
        latest_batch_completed += 1
 
@@ -1444,10 +1468,44 @@
    final_message_out = '\n'.join(out_message)
    final_message_out = final_message_out + " " + out_time
 
-    print(final_message_out)
+    print(final_message_out)
+
+    #print("out_file_paths:", out_file_paths)
+    #print("log_files_output_paths:", log_files_output_paths)
 
    return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
 
+def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
+
+    df_in = df[['Response References', 'General Topic', 'Subtopic', 'Sentiment']].copy()
+
+    df_in['Response References'] = df_in['Response References'].astype(int)
+
+    # Create a combined category column
+    df_in['Category'] = df_in['General Topic'] + ' - ' + df_in['Subtopic'] + ' - ' + df_in['Sentiment']
+
+    # Create pivot table counting occurrences of each unique combination
+    pivot_table = pd.crosstab(
+        index=df_in['Response References'],
+        columns=[df_in['General Topic'], df_in['Subtopic'], df_in['Sentiment']],
+        margins=True
+    )
+
+    # Flatten column names to make them more readable
+    pivot_table.columns = [' - '.join(col) for col in pivot_table.columns]
+
+    pivot_table.reset_index(inplace=True)
+
+    if not basic_response_data.empty:
+        pivot_table = basic_response_data.merge(pivot_table, right_on="Response References", left_on="Reference", how="left")
+
+        pivot_table.drop("Response References", axis=1, inplace=True)
+
+    # print("pivot_table:", pivot_table)
+
+    return pivot_table
+
+
 # SUMMARISATION FUNCTIONS
 
 def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
@@ -1525,21 +1583,28 @@ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, r
 
     return result_df
 
-def deduplicate_topics(reference_df,
-                       unique_topics_df,
+def deduplicate_topics(reference_df:pd.DataFrame,
+                       unique_topics_df:pd.DataFrame,
                        reference_table_file_name:str,
                       unique_topics_table_file_name:str,
                       merge_sentiment:str= "No",
                       merge_general_topics:str="No",
                       score_threshold:int=deduplication_threshold,
-                       deduplicate_topics:str="Yes"):
+                       in_data_files=[],
+                       chosen_cols:List[str]="",
+                       deduplicate_topics:str="Yes"
+                       ):
    '''
    Deduplicate topics based on a reference and unique topics table
    '''
    output_files = []
+    log_output_files = []
 
-    reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
-    unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
+    reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
+    unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
+
+    if in_data_files and chosen_cols:
+        file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
 
    # Run through this x times to try to get all duplicate topics
    if deduplicate_topics == "Yes":
@@ -1572,7 +1637,7 @@ def deduplicate_topics(reference_df,
 
        else:
            # Join deduplicated columns back to original df
-            deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
+            #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
            # Remove rows where 'deduplicated_category' is blank or NaN
            deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]
 
@@ -1634,8 +1699,12 @@ def deduplicate_topics(reference_df,
    # Remake unique_topics_df based on new reference_df
    unique_topics_df = create_unique_table_df_from_reference_table(reference_df)
 
-    reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
-    unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
+    basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+    reference_df_pivot = convert_reference_table_to_pivot_table(reference_df, basic_response_data)
+
+    reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
+    unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
 
    reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
    unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
@@ -1645,7 +1714,12 @@ def deduplicate_topics(reference_df,
    output_files.append(reference_file_path)
    output_files.append(unique_topics_file_path)
 
-    return reference_df, unique_topics_df, output_files
+    reference_pivot_file_path = output_folder + reference_table_file_name_no_ext + "_pivot_dedup.csv"
+    reference_df_pivot.to_csv(reference_pivot_file_path, index=None)
+
+    log_output_files.append(reference_pivot_file_path)
+
+    return reference_df, unique_topics_df, output_files, log_output_files
 
 def sample_reference_table_summaries(reference_df:pd.DataFrame,
                                     unique_topics_df:pd.DataFrame,
@@ -1700,7 +1774,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
    whole_conversation_metadata = []
 
    # Prepare Gemini models before query
-    if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
+    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
        print("Using Gemini model:", model_choice)
        model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
    else:
@@ -1741,6 +1815,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
                            summarised_outputs:list = [],
                            latest_summary_completed:int = 0,
                            out_metadata_str:str = "",
+                            in_data_files:List[str]=[],
+                            chosen_cols:List[str]=[],
                            output_files:list = [],
                            summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
                            do_summaries="Yes",
@@ -1750,6 +1826,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
    '''
    out_metadata = []
    local_model = []
+    log_output_files = []
    summarised_output_markdown = ""
 
    print("In summarise_output_topics function.")
@@ -1758,6 +1835,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 
    length_all_summaries = len(all_summaries)
 
+    # Load in data file and chosen columns if exists to create pivot table later
+    if in_data_files and chosen_cols:
+        file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
+
    #print("latest_summary_completed:", latest_summary_completed)
    #print("length_all_summaries:", length_all_summaries)
 
@@ -1798,7 +1879,12 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
        # Remove topics that are tagged as 'Not Mentioned'
        unique_table_df_revised = unique_table_df_revised.loc[unique_table_df_revised["Sentiment"] != "Not Mentioned", :]
        reference_table_df_revised = reference_table_df_revised.loc[reference_table_df_revised["Sentiment"] != "Not Mentioned", :]
+
+        basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+        reference_table_df_revised_pivot = convert_reference_table_to_pivot_table(reference_table_df_revised, basic_response_data)
 
+        # Save to file
        unique_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_unique_topic_table_" + model_choice_clean + ".csv"
        unique_table_df_revised.to_csv(unique_table_df_revised_path, index = None)
 
@@ -1807,11 +1893,18 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 
        output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])
 
+        ### Save pivot file to log area
+        reference_table_df_revised_pivot_path = output_folder + batch_file_path_details + "_summarised_reference_table_pivot_" + model_choice_clean + ".csv"
+        reference_table_df_revised_pivot.to_csv(reference_table_df_revised_pivot_path, index=None)
+
+        log_output_files.append(reference_table_df_revised_pivot_path)
+
+        ###
        unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(wrap_text))
 
        summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
 
-        return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
+        return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
 
    tic = time.perf_counter()
 
@@ -1865,4 +1958,4 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
    if latest_summary_completed >= length_all_summaries:
        print("At last summary.")
 
-    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
+    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
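To make the new get_basic_response_data helper concrete, here is a toy run under the assumption of a two-column survey file. The helper keeps only the chosen column, renames it to "Response", and adds a 1-based "Reference" index, mirroring the lines added in the diff above:

# Toy illustration of what get_basic_response_data produces (assumed input shape).
import pandas as pd

file_data = pd.DataFrame({
    "ID": [101, 102, 103],
    "Comments": ["  Great service ", "Too slow", "Friendly staff  "],
})

basic = file_data[["Comments"]].reset_index(names="Reference")
basic["Reference"] = basic["Reference"].astype(int) + 1   # 1-based response numbers
basic = basic.rename(columns={"Comments": "Response"})
basic["Response"] = basic["Response"].str.strip()

print(basic)
# Reference, Response columns: (1, "Great service"), (2, "Too slow"), (3, "Friendly staff")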
 
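The new convert_reference_table_to_pivot_table above is essentially a pd.crosstab over response references and topic columns. A toy version showing the shape of the output, with hypothetical data:

# Toy illustration of the pd.crosstab pivot used by convert_reference_table_to_pivot_table.
import pandas as pd

reference_df = pd.DataFrame({
    "Response References": [1, 1, 2, 3],
    "General Topic": ["Service", "Price", "Service", "Price"],
    "Subtopic": ["Speed", "Value", "Staff", "Value"],
    "Sentiment": ["Negative", "Positive", "Positive", "Negative"],
})

pivot_table = pd.crosstab(
    index=reference_df["Response References"],
    columns=[reference_df["General Topic"], reference_df["Subtopic"], reference_df["Sentiment"]],
    margins=True,  # adds an "All" row/column of totals
)

# Flatten the (General Topic, Subtopic, Sentiment) MultiIndex columns
pivot_table.columns = [" - ".join(col) for col in pivot_table.columns]
pivot_table.reset_index(inplace=True)

print(pivot_table)
# One row per response reference (plus "All"), one 0/1 count column per
# "General Topic - Subtopic - Sentiment" combination.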
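Finally, the deduplication pass driven by deduplicate_score_threshold relies on fuzzy string scores (rapidfuzz is pinned in requirements.txt). A hedged sketch of the general idea of threshold-based topic merging, not the exact logic inside deduplicate_categories:

# Sketch of threshold-based topic merging with rapidfuzz (illustrative only;
# the real deduplicate_categories implementation differs in detail).
from rapidfuzz import fuzz

topics = ["Staff friendliness", "Friendliness of staff", "Opening hours"]
score_threshold = 90  # e.g. the value passed via deduplicate_score_threshold

merged = {}
for topic in topics:
    # Map each topic onto the first already-kept topic it closely matches
    for kept in merged:
        if fuzz.token_sort_ratio(topic, kept) >= score_threshold:
            merged[kept].append(topic)
            break
    else:
        merged[topic] = [topic]

print(merged)
# {'Staff friendliness': ['Staff friendliness', 'Friendliness of staff'],
#  'Opening hours': ['Opening hours']}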