seanpedrickcase committed on
Commit
74d2271
·
1 Parent(s): f0b3bbc

Refactor app.py and related modules for improved topic extraction and summarization: update UI prompts for clarity, enhance file upload functionality, and add error handling to AWS file uploads. Introduce new functions for converting response text to markdown tables and for creating general topics from subtopics, and improve the overall code structure for better maintainability.

Browse files
Files changed (4)
  1. app.py +10 -11
  2. tools/aws_functions.py +25 -20
  3. tools/llm_api_call.py +197 -158
  4. tools/prompts.py +11 -1
app.py CHANGED
@@ -6,7 +6,6 @@ from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
  from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
- from tools.chatfuncs import load_model
10
  #from tools.aws_functions import load_data_from_aws
11
  import gradio as gr
12
  import pandas as pd
@@ -92,23 +91,23 @@ with app:
92
  with gr.Tab(label="Extract topics"):
93
  gr.Markdown(
94
  """
95
- ### Choose a tabular data file (xlsx or csv) of consultation responses to summarise.
96
  """
97
  )
98
  with gr.Row():
99
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
100
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
101
 
102
- with gr.Accordion("Upload xlsx or csv files with consultation responses", open = True):
103
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
104
 
105
- in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet with responses"], multiselect = False, label="Select the Excel sheet that has the responses.", visible=False, allow_custom_value=True)
106
- in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select column that contains the responses (showing columns present across all files).", allow_custom_value=True, interactive=True)
107
 
108
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
109
  candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
110
 
111
- context_textbox = gr.Textbox(label="Write a short description (up to one sentence) giving context to the large language model about the your consultation and any relevant context")
112
 
113
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
114
 
@@ -119,7 +118,7 @@ with app:
119
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
120
 
121
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
122
- data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
123
  choices=["The results were good", "The results were not good"], visible=False)
124
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
125
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
@@ -130,7 +129,7 @@ with app:
130
  with gr.Tab(label="Summarise topic outputs"):
131
  gr.Markdown(
132
  """
133
- ### Load in data files from a consultation summarisation to summarise the outputs.
134
  """)
135
  with gr.Accordion("Upload reference data file and unique data files", open = True):
136
  summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -141,7 +140,7 @@ with app:
141
  with gr.Tab(label="Continue previous topic extraction"):
142
  gr.Markdown(
143
  """
144
- ### Load in data files from a previous attempt at summarising a consultation to continue it.
145
  """)
146
 
147
  with gr.Accordion("Upload reference data file and unique data files", open = True):
@@ -207,7 +206,7 @@ with app:
207
  ###
208
 
209
  # Tabular data upload
210
- in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
211
 
212
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
213
  then(load_in_data_file,
@@ -215,7 +214,7 @@ with app:
215
  fn=extract_topics,
216
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
217
  outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
218
-
219
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
220
  latest_batch_completed.change(fn=extract_topics,
221
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
 
6
  from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 
9
  #from tools.aws_functions import load_data_from_aws
10
  import gradio as gr
11
  import pandas as pd
 
91
  with gr.Tab(label="Extract topics"):
92
  gr.Markdown(
93
  """
94
+ ### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
95
  """
96
  )
97
  with gr.Row():
98
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
99
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
100
 
101
+ with gr.Accordion("Upload xlsx or csv file", open = True):
102
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
103
 
104
+ in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
105
+ in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
106
 
107
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
108
  candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
109
 
110
+ context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
111
 
112
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
113
 
 
118
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
119
 
120
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
121
+ data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
122
  choices=["The results were good", "The results were not good"], visible=False)
123
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
124
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
129
  with gr.Tab(label="Summarise topic outputs"):
130
  gr.Markdown(
131
  """
132
+ ### Load in previously completed Extract Topics output files ('reference_table' and 'unique_topics' files) to summarise the outputs.
133
  """)
134
  with gr.Accordion("Upload reference data file and unique data files", open = True):
135
  summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
140
  with gr.Tab(label="Continue previous topic extraction"):
141
  gr.Markdown(
142
  """
143
+ ### Load in data files from a previous attempt at extracting topics to continue it.
144
  """)
145
 
146
  with gr.Accordion("Upload reference data file and unique data files", open = True):
 
206
  ###
207
 
208
  # Tabular data upload
209
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
210
 
211
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
212
  then(load_in_data_file,
 
214
  fn=extract_topics,
215
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
216
  outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
217
+
218
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
219
  latest_batch_completed.change(fn=extract_topics,
220
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
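For context on the event wiring above: the extract topics button kicks off the first batch, and the latest_batch_completed.change event keeps re-invoking extract_topics until every batch is done. A minimal, self-contained sketch of that pattern (the worker function and component values here are illustrative, not the app's real signatures):

import gradio as gr

def process_next_batch(batch_no, total_batches):
    # Illustrative worker: do one batch of work, then report progress.
    # Returning the counter unchanged once finished stops the .change chain,
    # mirroring the early-return guard used by extract_topics.
    if batch_no >= total_batches:
        return batch_no, "All batches complete"
    return batch_no + 1, f"Completed batch {batch_no + 1} of {total_batches}"

with gr.Blocks() as demo:
    total_batches = gr.Number(value=3, visible=False)
    latest_batch_completed = gr.Number(value=0, visible=False)
    status = gr.Textbox(label="Status")
    start_btn = gr.Button("Start")

    start_btn.click(process_next_batch,
                    inputs=[latest_batch_completed, total_batches],
                    outputs=[latest_batch_completed, status])
    # Each completed batch updates the counter, which fires .change and runs the next batch.
    latest_batch_completed.change(process_next_batch,
                                  inputs=[latest_batch_completed, total_batches],
                                  outputs=[latest_batch_completed, status])

demo.launch()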
tools/aws_functions.py CHANGED
@@ -159,7 +159,7 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
159
 
160
  return files, out_message
161
 
162
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
163
  """
164
  Uploads a file from local machine to Amazon S3.
165
 
@@ -171,31 +171,36 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
171
  Returns:
172
  - Message as variable/printed to console
173
  """
174
- final_out_message = []
175
 
176
- s3_client = boto3.client('s3')
177
 
178
- if isinstance(local_file_paths, str):
179
- local_file_paths = [local_file_paths]
180
 
181
- for file in local_file_paths:
182
- try:
183
- # Get file name off file path
184
- file_name = os.path.basename(file)
185
 
186
- s3_key_full = s3_key + file_name
187
- print("S3 key: ", s3_key_full)
188
 
189
- s3_client.upload_file(file, s3_bucket, s3_key_full)
190
- out_message = "File " + file_name + " uploaded successfully!"
191
- print(out_message)
192
-
193
- except Exception as e:
194
- out_message = f"Error uploading file(s): {e}"
195
- print(out_message)
196
 
197
- final_out_message.append(out_message)
198
- final_out_message_str = '\n'.join(final_out_message)
199
 
200
  return final_out_message_str
201
 
 
159
 
160
  return files, out_message
161
 
162
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
163
  """
164
  Uploads a file from local machine to Amazon S3.
165
 
 
171
  Returns:
172
  - Message as variable/printed to console
173
  """
174
+ if RUN_AWS_FUNCTIONS == "1":
175
 
176
+ final_out_message = []
177
 
178
+ s3_client = boto3.client('s3')
 
179
 
180
+ if isinstance(local_file_paths, str):
181
+ local_file_paths = [local_file_paths]
 
 
182
 
183
+ for file in local_file_paths:
184
+ try:
185
+ # Get file name off file path
186
+ file_name = os.path.basename(file)
187
 
188
+ s3_key_full = s3_key + file_name
189
+ print("S3 key: ", s3_key_full)
190
+
191
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
192
+ out_message = "File " + file_name + " uploaded successfully!"
193
+ print(out_message)
194
+
195
+ except Exception as e:
196
+ out_message = f"Error uploading file(s): {e}"
197
+ print(out_message)
198
 
199
+ final_out_message.append(out_message)
200
+ final_out_message_str = '\n'.join(final_out_message)
201
+
202
+ else:
203
+ final_out_message_str = "Not connected to AWS, no files uploaded."
204
 
205
  return final_out_message_str
206
 
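A hypothetical call to the reworked uploader (the file path and key below are made up for illustration): with RUN_AWS_FUNCTIONS set to "1" it uploads each file via boto3 and reports per-file success or failure, otherwise it skips AWS entirely.

from tools.aws_functions import upload_file_to_s3

# Assumed example path and key prefix - not taken from the repository.
message = upload_file_to_s3(
    local_file_paths=["output/topic_table_batch_1.csv"],
    s3_key="consultation-summaries/outputs/",
)
print(message)
# RUN_AWS_FUNCTIONS == "1": "File topic_table_batch_1.csv uploaded successfully!"
# Otherwise: "Not connected to AWS, no files uploaded."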
tools/llm_api_call.py CHANGED
@@ -7,6 +7,7 @@ import markdown
7
  import time
8
  import boto3
9
  import json
 
10
  import string
11
  import re
12
  import spaces
@@ -18,7 +19,7 @@ from io import StringIO
18
 
19
  GradioFileData = gr.FileData
20
 
21
- from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
22
  from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
23
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
24
 
@@ -77,7 +78,7 @@ def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:in
77
 
78
  try:
79
  file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
80
- num_batches = (len(file_data) // batch_size) + 1
81
  print("Total number of batches:", num_batches)
82
 
83
  except Exception as e:
@@ -195,8 +196,8 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
195
  ~(simple_file["Response"] == " ") &\
196
  ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
197
 
198
- simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
199
- simple_file.to_csv(simplified_csv_table_path, index=None)
200
 
201
  simple_markdown_table = simple_file.to_markdown(index=None)
202
 
@@ -483,18 +484,15 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
483
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)
484
 
485
  if isinstance(response, ResponseObject):
486
- responses.append(response)
487
- whole_conversation.append(prompt)
488
- whole_conversation.append(response.text)
489
  elif 'choices' in response:
490
- responses.append(response)
491
- # Create conversation txt object
492
- whole_conversation.append(prompt)
493
- whole_conversation.append(response['choices'][0]['text'])
494
  else:
495
- responses.append(response)
496
- whole_conversation.append(prompt)
497
- whole_conversation.append(response.text)
 
 
498
 
499
  # Create conversation metadata
500
  if master == False:
@@ -522,7 +520,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
522
  whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
523
 
524
 
525
- return responses, conversation_history, whole_conversation, whole_conversation_metadata
526
 
527
  ### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
528
 
@@ -630,6 +628,66 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
630
 
631
  return out_unique_topics_df
632
 
 
633
 
634
  def write_llm_output_and_logs(responses: List[ResponseObject],
635
  whole_conversation: List[str],
@@ -706,70 +764,18 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
706
 
707
  #log_files_output_paths.append(whole_conversation_path)
708
  log_files_output_paths.append(whole_conversation_path_meta)
709
-
710
- # Convert output table to markdown and then to a pandas dataframe to csv
711
- def remove_before_last_term(input_string: str) -> str:
712
- # Use regex to find the last occurrence of the term
713
- match = re.search(r'(\| ?General Topic)', input_string)
714
- if match:
715
- # Find the last occurrence by using rfind
716
- last_index = input_string.rfind(match.group(0))
717
- return input_string[last_index:] # Return everything from the last match onward
718
- return input_string # Return the original string if the term is not found
719
-
720
- # Check if the last response is a ResponseObject
721
- if isinstance(responses[-1], ResponseObject):
722
- #print("Text response:", responses[-1].text)
723
- start_of_table_response = remove_before_last_term(responses[-1].text)
724
- cleaned_response = clean_markdown_table(start_of_table_response)
725
- print("cleaned_response:", cleaned_response)
726
- elif "choices" in responses[-1]:
727
- #print("Text response:", responses[-1]["choices"][0]['text'])
728
- start_of_table_response = remove_before_last_term(responses[-1]["choices"][0]['text'])
729
- cleaned_response = clean_markdown_table(start_of_table_response)
730
- print("cleaned_response:", cleaned_response)
731
- else:
732
- #print("Text response:", responses[-1].text)
733
- start_of_table_response = remove_before_last_term(responses[-1].text)
734
- cleaned_response = clean_markdown_table(start_of_table_response)
735
- print("cleaned_response:", cleaned_response)
736
 
737
- markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
738
-
739
- #print("markdown_table:", markdown_table)
740
-
741
- # Remove <p> tags and make sure it has a valid HTML structure
742
- html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
743
- html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
744
-
745
- # Now ensure that the HTML structure is correct
746
- if "<table>" not in html_table:
747
- html_table = f"""
748
- <table>
749
- <tr>
750
- <th>General Topic</th>
751
- <th>Subtopic</th>
752
- <th>Sentiment</th>
753
- <th>Response References</th>
754
- <th>Summary</th>
755
- </tr>
756
- {html_table}
757
- </table>
758
- """
759
-
760
- # print("Markdown table as HTML:", html_table)
761
-
762
- html_buffer = StringIO(html_table)
763
 
 
764
  try:
765
- topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
766
  except Exception as e:
767
- print("Error when trying to parse table:", e)
768
- is_error = True
769
- raise ValueError()
770
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
771
 
772
-
773
  # Rename columns to ensure consistent use of data frames later in code
774
  topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
775
 
@@ -887,7 +893,7 @@ def extract_topics(in_data_file,
887
  temperature:float,
888
  chosen_cols:List[str],
889
  model_choice:str,
890
- candidate_topics: GradioFileData = [],
891
  latest_batch_completed:int=0,
892
  out_message:List=[],
893
  out_file_paths:List = [],
@@ -906,11 +912,11 @@ def extract_topics(in_data_file,
906
  time_taken:float = 0,
907
  max_tokens:int=max_tokens,
908
  model_name_map:dict=model_name_map,
909
- max_time_for_loop:int=max_time_for_loop,
910
  progress=Progress(track_tqdm=True)):
911
 
912
  '''
913
- Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
914
 
915
  Parameters:
916
  - in_data_file (gr.File): Gradio file object containing input data
@@ -954,14 +960,18 @@ def extract_topics(in_data_file,
954
  final_time = 0.0
955
  whole_conversation_metadata = []
956
  is_error = False
 
 
 
 
957
  #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
958
  #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
959
- #llama_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
960
- #llama_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
961
- #llama_prefix = "<|user|>\n" # This is for phi 3.5
962
- #llama_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
963
- llama_prefix = "<start_of_turn>user\n"
964
- llama_suffix = "<end_of_turn>\n<start_of_turn>model\n"
965
 
966
  # Reset output files on each run:
967
  # out_file_paths = []
@@ -987,6 +997,7 @@ def extract_topics(in_data_file,
987
 
988
  # If this is the first time around, set variables to 0/blank
989
  if first_loop_state==True:
 
990
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
991
  latest_batch_completed = 0
992
  out_message = []
@@ -998,7 +1009,8 @@ def extract_topics(in_data_file,
998
  local_model, tokenizer = load_model()
999
  print("Local model loaded:", local_model)
1000
 
1001
- #print("latest_batch_completed:", str(latest_batch_completed))
 
1002
 
1003
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1004
  if latest_batch_completed >= num_batches:
@@ -1070,14 +1082,14 @@ def extract_topics(in_data_file,
1070
  log_files_output_paths.append(missing_df_out_path)
1071
 
1072
  out_file_paths = list(set(out_file_paths))
1073
- log_files_output_paths = list(set(log_files_output_paths))
1074
 
1075
- print("out_file_paths:", out_file_paths)
 
1076
 
1077
  #final_out_message = '\n'.join(out_message)
1078
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
1079
-
1080
-
1081
 
1082
  if num_batches > 0:
1083
  progress_measure = round(latest_batch_completed / num_batches, 1)
@@ -1092,8 +1104,7 @@ def extract_topics(in_data_file,
1092
 
1093
  if not out_file_paths:
1094
  out_file_paths = []
1095
-
1096
-
1097
 
1098
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1099
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
@@ -1104,9 +1115,7 @@ def extract_topics(in_data_file,
1104
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1105
 
1106
 
1107
-
1108
  for i in topics_loop:
1109
-
1110
  #for latest_batch_completed in range(num_batches):
1111
  reported_batch_no = latest_batch_completed + 1
1112
  print("Running query batch", str(reported_batch_no))
@@ -1124,11 +1133,12 @@ def extract_topics(in_data_file,
1124
  # If the latest batch of responses contains at least one instance of text
1125
  if not simple_table_df.empty:
1126
 
1127
-
1128
  print("latest_batch_completed:", latest_batch_completed)
1129
 
 
 
1130
  # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
1131
- if latest_batch_completed >= 1 or candidate_topics:
1132
 
1133
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
1134
 
@@ -1141,11 +1151,14 @@ def extract_topics(in_data_file,
1141
  else:
1142
  print("Using local model:", model_choice)
1143
 
1144
- if candidate_topics:
 
 
1145
  # 'Zero shot topics' are those supplied by the user
1146
  max_topic_no = 120
1147
 
1148
  zero_shot_topics = read_file(candidate_topics.name)
 
1149
  if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1150
  zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1151
  # Max 120 topics allowed
@@ -1156,55 +1169,99 @@ def extract_topics(in_data_file,
1156
  zero_shot_topics_list = list(zero_shot_topics_series)
1157
 
1158
  print("Zero shot topics are:", zero_shot_topics_list)
1159
-
1160
- # Create the most up to date list of topics and subtopics.
1161
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1162
- if existing_unique_topics_df.empty:
1163
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
 
 
1164
 
1165
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1166
- elif not existing_unique_topics_df.empty:
1167
- zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1168
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1169
- zero_shot_topics_list_str = zero_shot_topics_list
 
1170
 
1171
- elif set(["General Topic", "Subtopic", "Sentiment"]).issubset(zero_shot_topics.columns):
 
1172
  # Max 120 topics allowed
1173
  if zero_shot_topics.shape[0] > max_topic_no:
1174
  print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1175
  zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1176
 
1177
  if existing_unique_topics_df.empty:
1178
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1], 'Sentiment':zero_shot_topics.iloc[:,2]})
 
 
1179
 
1180
-
 
 
1181
 
1182
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1183
 
1184
  #all_topic_tables_df_merged = existing_unique_topics_df
1185
  existing_unique_topics_df["Response References"] = ""
1186
 
1187
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
1188
 
1189
- #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_master_all_topic_tables_df_merged_" + model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1190
 
1191
  # Format the summary prompt with the response table and topics
1192
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
 
 
1193
 
1194
  if model_choice == "gemma_2b_it_local":
1195
- # add_existing_topics_system_prompt = llama_system_prefix + add_existing_topics_system_prompt + llama_system_suffix
1196
- # formatted_initial_table_prompt = llama_prefix + formatted_summary_prompt + llama_suffix
1197
-
1198
- formatted_initial_table_prompt = llama_prefix + add_existing_topics_system_prompt + formatted_summary_prompt + llama_suffix
1199
 
 
1200
 
1201
  # Define the output file path for the formatted prompt
1202
- formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1203
 
1204
  # Write the formatted prompt to the specified file
1205
  try:
1206
  with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1207
- f.write(formatted_summary_prompt)
1208
  except Exception as e:
1209
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1210
 
@@ -1216,7 +1273,7 @@ def extract_topics(in_data_file,
1216
  summary_whole_conversation = []
1217
 
1218
  # Process requests to large language model
1219
- master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1220
 
1221
  # print("master_summary_response:", master_summary_response[-1].text)
1222
  # print("Whole conversation metadata:", whole_conversation_metadata)
@@ -1253,24 +1310,13 @@ def extract_topics(in_data_file,
1253
 
1254
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1255
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
 
1256
 
 
 
1257
 
1258
- # Write final output to text file also
1259
- #try:
1260
- # new_final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + #model_choice_clean + "_temp_" + str(temperature) + ".txt"
1261
-
1262
- # with open(new_final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1263
- # f.write(display_table)
1264
-
1265
- # log_files_output_paths.append(new_final_table_output_path)
1266
-
1267
- #except Exception as e:
1268
- # print(e)
1269
-
1270
- latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1271
-
1272
- out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
1273
- log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
1274
 
1275
  print("out_file_paths at end of loop:", out_file_paths)
1276
 
@@ -1285,7 +1331,9 @@ def extract_topics(in_data_file,
1285
  else:
1286
  print("Using AWS Bedrock model:", model_choice)
1287
 
1288
- formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, consultation_context=context_textbox, column_name=chosen_cols)
 
 
1289
 
1290
  if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
1291
  else: formatted_prompt2 = prompt2
@@ -1294,21 +1342,16 @@ def extract_topics(in_data_file,
1294
  else: formatted_prompt3 = prompt3
1295
 
1296
  if model_choice == "gemma_2b_it_local":
1297
- # system_prompt = llama_system_prefix + system_prompt + llama_system_suffix
1298
- # formatted_initial_table_prompt = llama_prefix + formatted_initial_table_prompt + llama_suffix
1299
- # formatted_prompt2 = llama_prefix + formatted_prompt2 + llama_suffix
1300
- # formatted_prompt3 = llama_prefix + formatted_prompt3 + llama_suffix
1301
-
1302
- formatted_initial_table_prompt = llama_prefix + system_prompt + formatted_initial_table_prompt + llama_suffix
1303
- formatted_prompt2 = llama_prefix + system_prompt + formatted_prompt2 + llama_suffix
1304
- formatted_prompt3 = llama_prefix + system_prompt + formatted_prompt3 + llama_suffix
1305
 
1306
  batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1307
 
1308
- whole_conversation = [system_prompt]
1309
 
1310
  # Process requests to large language model
1311
- responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
1312
 
1313
  # print("Whole conversation metadata before:", whole_conversation_metadata)
1314
 
@@ -1358,8 +1401,6 @@ def extract_topics(in_data_file,
1358
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1359
  f.write(responses[-1].text)
1360
  display_table = responses[-1].text
1361
-
1362
-
1363
 
1364
  log_files_output_paths.append(final_table_output_path)
1365
 
@@ -1370,11 +1411,11 @@ def extract_topics(in_data_file,
1370
  new_reference_df = reference_df
1371
 
1372
  else:
1373
- print("Current batch of responses contains no text, moving onto next. Batch number:", latest_batch_completed, ". Start row:", start_row, ". End row:", end_row)
1374
 
1375
  # Increase latest file completed count unless we are at the last file
1376
  if latest_batch_completed != num_batches:
1377
- print("Completed batch number:", str(latest_batch_completed))
1378
  latest_batch_completed += 1
1379
 
1380
  toc = time.perf_counter()
@@ -1391,17 +1432,16 @@ def extract_topics(in_data_file,
1391
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1392
  existing_topics_table = new_topic_df.dropna(how='all')
1393
 
1394
- out_time = f"in {final_time:0.1f} seconds."
1395
- print(out_time)
1396
 
1397
  out_message.append('All queries successfully completed in')
1398
 
1399
  final_message_out = '\n'.join(out_message)
1400
- final_message_out = final_message_out + " " + out_time
1401
 
1402
- final_message_out = final_message_out + "\n\nGo to to the LLM settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
1403
 
1404
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths #, final_message_out
1405
 
1406
  # SUMMARISATION FUNCTIONS
1407
 
@@ -1463,7 +1503,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1463
 
1464
  reference_df_unique = reference_df.drop_duplicates("old_category")
1465
 
1466
- reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
1467
 
1468
  # Deduplicate categories within each sentiment group
1469
  deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
@@ -1558,7 +1598,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
1558
  whole_conversation = [summarise_topic_descriptions_system_prompt]
1559
 
1560
  # Process requests to large language model
1561
- responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)
1562
 
1563
  print("Finished summary query")
1564
 
@@ -1569,8 +1609,6 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
1569
  else:
1570
  response_texts = [resp.text for resp in responses]
1571
 
1572
-
1573
-
1574
  latest_response_text = response_texts[-1]
1575
 
1576
  #print("latest_response_text:", latest_response_text)
@@ -1597,6 +1635,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1597
  Create better summaries of the raw batch-level summaries created in the first run of the model.
1598
  '''
1599
  out_metadata = []
 
1600
 
1601
  print("In summarise_output_topics function.")
1602
 
@@ -1672,7 +1711,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1672
  print("Current summary number is:", summary_no)
1673
 
1674
  summary_text = all_summaries[summary_no]
1675
- print("summary_text:", summary_text)
1676
  formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
1677
 
1678
  try:
@@ -1696,7 +1735,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1696
  time_taken = tic - toc
1697
 
1698
  if time_taken > max_time_for_loop:
1699
- print("Time taken for loop is greater than maximum time allowed.")
1700
  summary_loop.close()
1701
  tqdm._instances.clear()
1702
  break
 
7
  import time
8
  import boto3
9
  import json
10
+ import math
11
  import string
12
  import re
13
  import spaces
 
19
 
20
  GradioFileData = gr.FileData
21
 
22
+ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
23
  from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
 
78
 
79
  try:
80
  file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
81
+ num_batches = math.ceil(len(file_data) / batch_size)
82
  print("Total number of batches:", num_batches)
83
 
84
  except Exception as e:
 
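The change from (len(file_data) // batch_size) + 1 to math.ceil(len(file_data) / batch_size) avoids scheduling an extra, empty batch when the row count divides evenly by the batch size, for example:

import math

rows, batch_size = 100, 50
print((rows // batch_size) + 1)        # 3 (old calculation - the last batch would be empty)
print(math.ceil(rows / batch_size))    # 2 (new calculation)
print(math.ceil(101 / batch_size))     # 3 (a partial final batch is still counted)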
196
  ~(simple_file["Response"] == " ") &\
197
  ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
198
 
199
+ #simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
200
+ #simple_file.to_csv(simplified_csv_table_path, index=None)
201
 
202
  simple_markdown_table = simple_file.to_markdown(index=None)
203
 
 
484
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)
485
 
486
  if isinstance(response, ResponseObject):
487
+ response_text = response.text
 
 
488
  elif 'choices' in response:
489
+ response_text = response['choices'][0]['text']
 
 
 
490
  else:
491
+ response_text = response.text
492
+
493
+ responses.append(response)
494
+ whole_conversation.append(prompt)
495
+ whole_conversation.append(response_text)
496
 
497
  # Create conversation metadata
498
  if master == False:
 
520
  whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
521
 
522
 
523
+ return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
524
 
525
  ### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
526
 
 
628
 
629
  return out_unique_topics_df
630
 
631
+ # Convert output table to markdown and then to a pandas dataframe to csv
632
+ def remove_before_last_term(input_string: str) -> str:
633
+ # Use regex to find the last occurrence of the term
634
+ match = re.search(r'(\| ?General Topic)', input_string)
635
+ if match:
636
+ # Find the last occurrence by using rfind
637
+ last_index = input_string.rfind(match.group(0))
638
+ return input_string[last_index:] # Return everything from the last match onward
639
+ return input_string # Return the original string if the term is not found
640
+
641
+ def convert_response_text_to_markdown_table(response_text:str, table_type:str = "Main table"):
642
+ is_error = False
643
+ start_of_table_response = remove_before_last_term(response_text)
644
+ cleaned_response = clean_markdown_table(start_of_table_response)
645
+
646
+ markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
647
+
648
+ # Remove <p> tags and make sure it has a valid HTML structure
649
+ html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
650
+ html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
651
+
652
+ # Now ensure that the HTML structure is correct
653
+ if table_type == "Main table":
654
+ if "<table>" not in html_table:
655
+ html_table = f"""
656
+ <table>
657
+ <tr>
658
+ <th>General Topic</th>
659
+ <th>Subtopic</th>
660
+ <th>Sentiment</th>
661
+ <th>Response References</th>
662
+ <th>Summary</th>
663
+ </tr>
664
+ {html_table}
665
+ </table>
666
+ """
667
+ elif table_type == "Revised topics table":
668
+ if "<table>" not in html_table:
669
+ html_table = f"""
670
+ <table>
671
+ <tr>
672
+ <th>General Topic</th>
673
+ <th>Subtopic</th>
674
+ </tr>
675
+ {html_table}
676
+ </table>
677
+ """
678
+
679
+ html_buffer = StringIO(html_table)
680
+
681
+ try:
682
+ out_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
683
+ except Exception as e:
684
+ print("Error when trying to parse table:", e)
685
+ is_error = True
686
+ raise ValueError()
688
+
689
+ return out_df, is_error
690
+
691
 
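A hypothetical round trip through the new helper; the LLM response string below is invented for illustration, and it assumes convert_response_text_to_markdown_table can be imported from tools.llm_api_call:

from tools.llm_api_call import convert_response_text_to_markdown_table

llm_response = """Here is the requested table:

| General Topic | Subtopic | Sentiment | Response References | Summary |
|---|---|---|---|---|
| Housing | Parking provision | Negative | 1, 4 | Concern about the loss of on-street parking |
"""

# Everything before the last "| General Topic" header is stripped, the markdown
# table is cleaned, rendered to HTML, and read back as a pandas DataFrame.
table_df, is_error = convert_response_text_to_markdown_table(llm_response)
if not is_error:
    print(table_df.columns.tolist())
    # ['General Topic', 'Subtopic', 'Sentiment', 'Response References', 'Summary']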
692
  def write_llm_output_and_logs(responses: List[ResponseObject],
693
  whole_conversation: List[str],
 
764
 
765
  #log_files_output_paths.append(whole_conversation_path)
766
  log_files_output_paths.append(whole_conversation_path_meta)
 
 
 
767
 
768
+ if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
769
+ elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
770
+ else: response_text = responses[-1].text
 
 
771
 
772
+ # Convert response text to a markdown table
773
  try:
774
+ topic_with_response_df, is_error = convert_response_text_to_markdown_table(response_text)
775
  except Exception as e:
776
+ print("Error in parsing markdown table from response text:", e)
 
 
777
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
778
 
 
779
  # Rename columns to ensure consistent use of data frames later in code
780
  topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
781
 
 
893
  temperature:float,
894
  chosen_cols:List[str],
895
  model_choice:str,
896
+ candidate_topics: GradioFileData = None,
897
  latest_batch_completed:int=0,
898
  out_message:List=[],
899
  out_file_paths:List = [],
 
912
  time_taken:float = 0,
913
  max_tokens:int=max_tokens,
914
  model_name_map:dict=model_name_map,
915
+ max_time_for_loop:int=max_time_for_loop,
916
  progress=Progress(track_tqdm=True)):
917
 
918
  '''
919
+ Query an LLM (local Gemma 2B Instruct, Gemini, or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
920
 
921
  Parameters:
922
  - in_data_file (gr.File): Gradio file object containing input data
 
960
  final_time = 0.0
961
  whole_conversation_metadata = []
962
  is_error = False
963
+ create_revised_general_topics = False
964
+ local_model = []
965
+ tokenizer = []
966
+ zero_shot_topics_df = pd.DataFrame()
967
  #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
968
  #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
969
+ #llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
970
+ #llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
971
+ #llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
972
+ #llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
973
+ llama_cpp_prefix = "<start_of_turn>user\n"
974
+ llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
975
 
976
  # Reset output files on each run:
977
  # out_file_paths = []
 
997
 
998
  # If this is the first time around, set variables to 0/blank
999
  if first_loop_state==True:
1000
+ print("This is the first time through the loop")
1001
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1002
  latest_batch_completed = 0
1003
  out_message = []
 
1009
  local_model, tokenizer = load_model()
1010
  print("Local model loaded:", local_model)
1011
 
1012
+ print("latest_batch_completed at start of function:", str(latest_batch_completed))
1013
+ print("total number of batches:", str(num_batches))
1014
 
1015
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1016
  if latest_batch_completed >= num_batches:
 
1082
  log_files_output_paths.append(missing_df_out_path)
1083
 
1084
  out_file_paths = list(set(out_file_paths))
1085
+ log_files_output_paths = list(set(log_files_output_paths))
1086
 
1087
+ summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1088
+ print("summary_out_file_paths:", summary_out_file_paths)
1089
 
1090
  #final_out_message = '\n'.join(out_message)
1091
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, summary_out_file_paths
1092
+
 
1093
 
1094
  if num_batches > 0:
1095
  progress_measure = round(latest_batch_completed / num_batches, 1)
 
1104
 
1105
  if not out_file_paths:
1106
  out_file_paths = []
1107
+
 
1108
 
1109
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1110
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
 
1115
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1116
 
1117
 
 
1118
  for i in topics_loop:
 
1119
  #for latest_batch_completed in range(num_batches):
1120
  reported_batch_no = latest_batch_completed + 1
1121
  print("Running query batch", str(reported_batch_no))
 
1133
  # If the latest batch of responses contains at least one instance of text
1134
  if not simple_table_df.empty:
1135
 
 
1136
  print("latest_batch_completed:", latest_batch_completed)
1137
 
1138
+ print("candidate_topics:", candidate_topics)
1139
+
1140
  # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
1141
+ if latest_batch_completed >= 1 or candidate_topics is not None:
1142
 
1143
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
1144
 
 
1151
  else:
1152
  print("Using local model:", model_choice)
1153
 
1154
+ # Preparing candidate topics
1155
+ if candidate_topics and existing_unique_topics_df.empty:
1156
+ progress(0.1, "Creating revised zero shot topics table")
1157
  # 'Zero shot topics' are those supplied by the user
1158
  max_topic_no = 120
1159
 
1160
  zero_shot_topics = read_file(candidate_topics.name)
1161
+
1162
  if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1163
  zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1164
  # Max 120 topics allowed
 
1169
  zero_shot_topics_list = list(zero_shot_topics_series)
1170
 
1171
  print("Zero shot topics are:", zero_shot_topics_list)
1172
+
1173
+ if create_revised_general_topics == True:
1174
+ # Create the most up to date list of topics and subtopics.
1175
+ # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1176
+ unique_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1177
+ unique_topics_markdown = unique_topics_df.to_markdown()
1178
+
1179
+ print("unique_topics_markdown:", unique_topics_markdown)
1180
+
1181
+ formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1182
+
1183
+ # Format the general_topics prompt with the topics
1184
+ formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1185
+
1186
+ if model_choice == "gemma_2b_it_local":
1187
+ formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1188
+
1189
+ formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1190
+
1191
+ whole_conversation = []
1192
+
1193
+ general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = process_requests(formatted_general_topics_prompt_list, formatted_general_topics_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1194
+
1195
+ # Convert response text to a markdown table
1196
+ try:
1197
+ zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1198
+ print("Output revised zero shot topics table is:", zero_shot_topics_df)
1199
+
1200
+ zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1201
+ zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1202
+ out_file_paths.append(zero_shot_revised_path)
1203
+ except Exception as e:
1204
+ print("Error in parsing markdown table from response text:", e)
1205
+ print("Not adding revised General Topics to table")
1206
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1207
+
1208
+ if zero_shot_topics_df.empty:
1209
+ print("Creation of revised general topics df failed, reverting to original list")
1210
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1211
+ else:
1212
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1213
 
1214
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1215
+ if not existing_unique_topics_df.empty:
 
1216
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1217
+ else:
1218
+ existing_unique_topics_df = zero_shot_topics_df
1219
 
1220
+ # If your zero shot column file already contains General Topic and Subtopic columns
1221
+ if set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1222
  # Max 120 topics allowed
1223
  if zero_shot_topics.shape[0] > max_topic_no:
1224
  print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1225
  zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1226
 
1227
  if existing_unique_topics_df.empty:
1228
+ existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1]})
1229
+
1230
+ zero_shot_topics_df = zero_shot_topics
1231
 
1232
+ if candidate_topics and not zero_shot_topics_df.empty:
1233
+ # If you have already created revised zero shot topics, concat to the current
1234
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
1235
 
1236
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1237
 
1238
  #all_topic_tables_df_merged = existing_unique_topics_df
1239
  existing_unique_topics_df["Response References"] = ""
1240
 
1241
+ unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1242
 
1243
+ #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1244
 
1245
  # Format the summary prompt with the response table and topics
1246
+ formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1247
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
1248
+
1249
 
1250
  if model_choice == "gemma_2b_it_local":
1251
+ formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1252
+ full_prompt = formatted_summary_prompt
1253
+ else:
1254
+ full_prompt = formatted_system_prompt + formatted_summary_prompt
1255
 
1256
+ #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1257
 
1258
  # Define the output file path for the formatted prompt
1259
+ formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1260
 
1261
  # Write the formatted prompt to the specified file
1262
  try:
1263
  with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1264
+ f.write(full_prompt)
1265
  except Exception as e:
1266
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1267
 
 
1273
  summary_whole_conversation = []
1274
 
1275
  # Process requests to large language model
1276
+ master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1277
 
1278
  # print("master_summary_response:", master_summary_response[-1].text)
1279
  # print("Whole conversation metadata:", whole_conversation_metadata)
 
 
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
+
 
+ #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
+ #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
 
+ out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
+ log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
 
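A small, hypothetical example of the batch filtering above (paths invented). Note that `str(reported_batch_no) in path` is a plain substring test, so batch 1 would, for example, also match a file named for batch 10.

```python
reported_batch_no = 2

out_file_paths = [
    "output/topic_table_batch_1.csv",
    "output/topic_table_batch_2.csv",
]
log_files_output_paths = [
    "output/full_prompt_batch_2.txt",
    "output/full_prompt_batch_1.txt",
]

# Keep only the paths produced for the current batch
out_file_paths = [path for path in out_file_paths if str(reported_batch_no) in path]
log_files_output_paths = [path for path in log_files_output_paths if str(reported_batch_no) in path]

print(out_file_paths)          # ['output/topic_table_batch_2.csv']
print(log_files_output_paths)  # ['output/full_prompt_batch_2.txt']
```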
 
  print("out_file_paths at end of loop:", out_file_paths)
 
  else:
  print("Using AWS Bedrock model:", model_choice)
 
+ formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)
+
+ formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
 
  if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
  else: formatted_prompt2 = prompt2
 
  else: formatted_prompt3 = prompt3
 
  if model_choice == "gemma_2b_it_local":
+ formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
+ formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
+ formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
  batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
 
+ whole_conversation = [formatted_initial_table_system_prompt]
 
  # Process requests to large language model
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
 
  # print("Whole conversation metadata before:", whole_conversation_metadata)
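The `process_requests(...)` calls above drive one multi-prompt conversation per batch. The sketch below is a simplified illustration of that general pattern, not the repository's implementation; `send_to_model` is a made-up stand-in for the actual API call.

```python
from typing import Callable, List, Tuple

def run_prompts_sequentially(
    prompts: List[str],
    system_prompt: str,
    send_to_model: Callable[[str], str],
) -> Tuple[List[str], List[str]]:
    """Send each prompt in turn, carrying the growing conversation forward."""
    conversation: List[str] = [system_prompt]
    responses: List[str] = []
    for prompt in prompts:
        conversation.append(prompt)
        reply = send_to_model("\n".join(conversation))  # stand-in for a real LLM call
        conversation.append(reply)
        responses.append(reply)
    return responses, conversation

# Dummy model that just reports how much context it received
responses, conversation = run_prompts_sequentially(
    ["First prompt", "Second prompt"],
    "You are a researcher analysing open text responses.",
    lambda text: f"(received {len(text)} characters)",
)
```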
 
 
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
  f.write(responses[-1].text)
  display_table = responses[-1].text
 
  log_files_output_paths.append(final_table_output_path)
 
  new_reference_df = reference_df
 
  else:
+ print("Current batch of responses contains no text, moving on to the next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
 
  # Increase latest batch completed count unless we are at the last batch
  if latest_batch_completed != num_batches:
+ print("Completed batch number:", str(reported_batch_no))
  latest_batch_completed += 1
 
  toc = time.perf_counter()
 
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
  existing_topics_table = new_topic_df.dropna(how='all')
 
+ out_time = f"{final_time:0.1f} seconds."
 
  out_message.append('All queries successfully completed in')
 
  final_message_out = '\n'.join(out_message)
+ final_message_out = final_message_out + " " + out_time
 
+ print(final_message_out)
 
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
 
  # SUMMARISATION FUNCTIONS
 
 
  reference_df_unique = reference_df.drop_duplicates("old_category")
 
+ #reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
 
  # Deduplicate categories within each sentiment group
  deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
 
  whole_conversation = [summarise_topic_descriptions_system_prompt]
 
  # Process requests to large language model
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)
 
  print("Finished summary query")
 
 
  else:
  response_texts = [resp.text for resp in responses]
 
  latest_response_text = response_texts[-1]
 
  #print("latest_response_text:", latest_response_text)
 
  Create better summaries of the raw batch-level summaries created in the first run of the model.
  '''
  out_metadata = []
+ local_model = []
 
  print("In summarise_output_topics function.")
 
 
  print("Current summary number is:", summary_no)
 
  summary_text = all_summaries[summary_no]
+ #print("summary_text:", summary_text)
  formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
 
  try:
 
  time_taken = tic - toc
 
  if time_taken > max_time_for_loop:
+ print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
  summary_loop.close()
  tqdm._instances.clear()
  break
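The guard above stops the summarisation loop once a wall-clock budget is exceeded so the app can be restarted from where it left off. A stripped-down illustration of the same pattern (the 10-second budget and the dummy workload are arbitrary examples):

```python
import time
from tqdm import tqdm

max_time_for_loop = 10  # seconds; arbitrary example budget
tic = time.perf_counter()

summary_loop = tqdm(range(1000), unit="summaries")
for summary_no in summary_loop:
    time.sleep(0.05)  # stand-in for a call to the large language model

    if time.perf_counter() - tic > max_time_for_loop:
        print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
        summary_loop.close()
        tqdm._instances.clear()
        break
```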
tools/prompts.py CHANGED
@@ -1,4 +1,4 @@
- system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called {column_name}. The context of this analysis is: {consultation_context}. """
+ system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called '{column_name}'. The context of this analysis is '{consultation_context}'."""
 
  initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
  {response_table}
@@ -50,6 +50,16 @@ Your task is to make a consolidated summary of the above text. Return a summary
  Summary:"""
 
 
+ create_general_topics_system_prompt = system_prompt
+
+ create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
+ {topics}
+
+ Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
+
+ New Topics table:"""
+
+
  # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
  # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
  # Summarise the following text in less than {length} words: "{text}"\n
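A short usage sketch for the new prompt (the import path follows this repository's layout; the subtopics and consultation context are invented, and `to_markdown` needs the optional `tabulate` package):

```python
import pandas as pd
from tools.prompts import create_general_topics_prompt, create_general_topics_system_prompt

# Invented subtopics rendered as the markdown table the prompt expects
subtopics = pd.DataFrame({"Subtopic": ["Bus frequency", "Cycle lanes", "Air quality"]})
topics_markdown = subtopics.to_markdown(index=False)

system_text = create_general_topics_system_prompt.format(
    column_name="Response", consultation_context="a local transport consultation"
)
prompt_text = create_general_topics_prompt.format(topics=topics_markdown)

full_prompt = system_text + "\n" + prompt_text  # ready to send to the chosen LLM
print(full_prompt)
```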