seanpedrickcase committed on
Commit
74d2271
·
1 Parent(s): f0b3bbc

Refactor app.py and related modules for improved topic extraction and summarization: update UI prompts for clarity, enhance file upload functionality, and add error handling to AWS file uploads. Introduce new functions for converting response text to markdown tables and for creating general topics from subtopics, and improve the overall code structure for better maintainability.

Browse files
Files changed (4)
  1. app.py +10 -11
  2. tools/aws_functions.py +25 -20
  3. tools/llm_api_call.py +197 -158
  4. tools/prompts.py +11 -1
app.py CHANGED
@@ -6,7 +6,6 @@ from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
6
  from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
9
- from tools.chatfuncs import load_model
10
  #from tools.aws_functions import load_data_from_aws
11
  import gradio as gr
12
  import pandas as pd
@@ -92,23 +91,23 @@ with app:
92
  with gr.Tab(label="Extract topics"):
93
  gr.Markdown(
94
  """
95
- ### Choose a tabular data file (xlsx or csv) of consultation responses to summarise.
96
  """
97
  )
98
  with gr.Row():
99
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
100
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
101
 
102
- with gr.Accordion("Upload xlsx or csv files with consultation responses", open = True):
103
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
104
 
105
- in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet with responses"], multiselect = False, label="Select the Excel sheet that has the responses.", visible=False, allow_custom_value=True)
106
- in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select column that contains the responses (showing columns present across all files).", allow_custom_value=True, interactive=True)
107
 
108
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
109
  candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
110
 
111
- context_textbox = gr.Textbox(label="Write a short description (up to one sentence) giving context to the large language model about the your consultation and any relevant context")
112
 
113
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
114
 
@@ -119,7 +118,7 @@ with app:
119
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
120
 
121
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
122
- data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
123
  choices=["The results were good", "The results were not good"], visible=False)
124
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
125
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
@@ -130,7 +129,7 @@ with app:
130
  with gr.Tab(label="Summarise topic outputs"):
131
  gr.Markdown(
132
  """
133
- ### Load in data files from a consultation summarisation to summarise the outputs.
134
  """)
135
  with gr.Accordion("Upload reference data file and unique data files", open = True):
136
  summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -141,7 +140,7 @@ with app:
141
  with gr.Tab(label="Continue previous topic extraction"):
142
  gr.Markdown(
143
  """
144
- ### Load in data files from a previous attempt at summarising a consultation to continue it.
145
  """)
146
 
147
  with gr.Accordion("Upload reference data file and unique data files", open = True):
@@ -207,7 +206,7 @@ with app:
207
  ###
208
 
209
  # Tabular data upload
210
- in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
211
 
212
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
213
  then(load_in_data_file,
@@ -215,7 +214,7 @@ with app:
215
  fn=extract_topics,
216
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
217
  outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
218
-
219
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
220
  latest_batch_completed.change(fn=extract_topics,
221
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
 
6
  from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
7
  from tools.auth import authenticate_user
8
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 
9
  #from tools.aws_functions import load_data_from_aws
10
  import gradio as gr
11
  import pandas as pd
 
91
  with gr.Tab(label="Extract topics"):
92
  gr.Markdown(
93
  """
94
+ ### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
95
  """
96
  )
97
  with gr.Row():
98
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
99
  in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
100
 
101
+ with gr.Accordion("Upload xlsx or csv file", open = True):
102
  in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
103
 
104
+ in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
105
+ in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
106
 
107
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
108
  candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
109
 
110
+ context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
111
 
112
  extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")
113
 
 
118
  latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
119
 
120
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
121
+ data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
122
  choices=["The results were good", "The results were not good"], visible=False)
123
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
124
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
129
  with gr.Tab(label="Summarise topic outputs"):
130
  gr.Markdown(
131
  """
132
+ ### Load in previously completed Extract Topics output files ('reference_table' and 'unique_topics' files) to summarise the outputs.
133
  """)
134
  with gr.Accordion("Upload reference data file and unique data files", open = True):
135
  summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
140
  with gr.Tab(label="Continue previous topic extraction"):
141
  gr.Markdown(
142
  """
143
+ ### Load in data files from a previous attempt at extracting topics to continue it.
144
  """)
145
 
146
  with gr.Accordion("Upload reference data file and unique data files", open = True):
 
206
  ###
207
 
208
  # Tabular data upload
209
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
210
 
211
  extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
212
  then(load_in_data_file,
 
214
  fn=extract_topics,
215
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
216
  outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
217
+
218
  # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
219
  latest_batch_completed.change(fn=extract_topics,
220
  inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
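For context on the event wiring above: the extract topics button kicks off the first batch, and the latest_batch_completed.change event keeps re-invoking extract_topics until every batch is done. A minimal, self-contained sketch of that pattern (the worker function and component values here are illustrative, not the app's real signatures):

import gradio as gr

def process_next_batch(batch_no, total_batches):
    # Illustrative worker: do one batch of work, then report progress.
    # Returning the counter unchanged once finished stops the .change chain,
    # mirroring the early-return guard used by extract_topics.
    if batch_no >= total_batches:
        return batch_no, "All batches complete"
    return batch_no + 1, f"Completed batch {batch_no + 1} of {total_batches}"

with gr.Blocks() as demo:
    total_batches = gr.Number(value=3, visible=False)
    latest_batch_completed = gr.Number(value=0, visible=False)
    status = gr.Textbox(label="Status")
    start_btn = gr.Button("Start")

    start_btn.click(process_next_batch,
                    inputs=[latest_batch_completed, total_batches],
                    outputs=[latest_batch_completed, status])
    # Each completed batch updates the counter, which fires .change and runs the next batch.
    latest_batch_completed.change(process_next_batch,
                                  inputs=[latest_batch_completed, total_batches],
                                  outputs=[latest_batch_completed, status])

demo.launch()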
tools/aws_functions.py CHANGED
@@ -159,7 +159,7 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
159
 
160
  return files, out_message
161
 
162
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
163
  """
164
  Uploads a file from local machine to Amazon S3.
165
 
@@ -171,31 +171,36 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
171
  Returns:
172
  - Message as variable/printed to console
173
  """
174
- final_out_message = []
175
 
176
- s3_client = boto3.client('s3')
177
 
178
- if isinstance(local_file_paths, str):
179
- local_file_paths = [local_file_paths]
180
 
181
- for file in local_file_paths:
182
- try:
183
- # Get file name off file path
184
- file_name = os.path.basename(file)
185
 
186
- s3_key_full = s3_key + file_name
187
- print("S3 key: ", s3_key_full)
188
 
189
- s3_client.upload_file(file, s3_bucket, s3_key_full)
190
- out_message = "File " + file_name + " uploaded successfully!"
191
- print(out_message)
192
-
193
- except Exception as e:
194
- out_message = f"Error uploading file(s): {e}"
195
- print(out_message)
196
 
197
- final_out_message.append(out_message)
198
- final_out_message_str = '\n'.join(final_out_message)
199
 
200
  return final_out_message_str
201
 
 
159
 
160
  return files, out_message
161
 
162
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
163
  """
164
  Uploads a file from local machine to Amazon S3.
165
 
 
171
  Returns:
172
  - Message as variable/printed to console
173
  """
174
+ if RUN_AWS_FUNCTIONS == "1":
175
 
176
+ final_out_message = []
177
 
178
+ s3_client = boto3.client('s3')
 
179
 
180
+ if isinstance(local_file_paths, str):
181
+ local_file_paths = [local_file_paths]
 
 
182
 
183
+ for file in local_file_paths:
184
+ try:
185
+ # Get file name off file path
186
+ file_name = os.path.basename(file)
187
 
188
+ s3_key_full = s3_key + file_name
189
+ print("S3 key: ", s3_key_full)
190
+
191
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
192
+ out_message = "File " + file_name + " uploaded successfully!"
193
+ print(out_message)
194
+
195
+ except Exception as e:
196
+ out_message = f"Error uploading file(s): {e}"
197
+ print(out_message)
198
 
199
+ final_out_message.append(out_message)
200
+ final_out_message_str = '\n'.join(final_out_message)
201
+
202
+ else:
203
+ final_out_message_str = "Not connected to AWS, no files uploaded."
204
 
205
  return final_out_message_str
206
 
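A hypothetical call to the reworked uploader (the file path and key below are made up for illustration): with RUN_AWS_FUNCTIONS set to "1" it uploads each file via boto3 and reports per-file success or failure, otherwise it skips AWS entirely.

from tools.aws_functions import upload_file_to_s3

# Assumed example path and key prefix - not taken from the repository.
message = upload_file_to_s3(
    local_file_paths=["output/topic_table_batch_1.csv"],
    s3_key="consultation-summaries/outputs/",
)
print(message)
# RUN_AWS_FUNCTIONS == "1": "File topic_table_batch_1.csv uploaded successfully!"
# Otherwise: "Not connected to AWS, no files uploaded."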
tools/llm_api_call.py CHANGED
@@ -7,6 +7,7 @@ import markdown
7
  import time
8
  import boto3
9
  import json
 
10
  import string
11
  import re
12
  import spaces
@@ -18,7 +19,7 @@ from io import StringIO
18
 
19
  GradioFileData = gr.FileData
20
 
21
- from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
22
  from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
23
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
24
 
@@ -77,7 +78,7 @@ def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:in
77
 
78
  try:
79
  file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
80
- num_batches = (len(file_data) // batch_size) + 1
81
  print("Total number of batches:", num_batches)
82
 
83
  except Exception as e:
@@ -195,8 +196,8 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
195
  ~(simple_file["Response"] == " ") &\
196
  ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
197
 
198
- simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
199
- simple_file.to_csv(simplified_csv_table_path, index=None)
200
 
201
  simple_markdown_table = simple_file.to_markdown(index=None)
202
 
@@ -483,18 +484,15 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
483
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)
484
 
485
  if isinstance(response, ResponseObject):
486
- responses.append(response)
487
- whole_conversation.append(prompt)
488
- whole_conversation.append(response.text)
489
  elif 'choices' in response:
490
- responses.append(response)
491
- # Create conversation txt object
492
- whole_conversation.append(prompt)
493
- whole_conversation.append(response['choices'][0]['text'])
494
  else:
495
- responses.append(response)
496
- whole_conversation.append(prompt)
497
- whole_conversation.append(response.text)
 
 
498
 
499
  # Create conversation metadata
500
  if master == False:
@@ -522,7 +520,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
522
  whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
523
 
524
 
525
- return responses, conversation_history, whole_conversation, whole_conversation_metadata
526
 
527
  ### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
528
 
@@ -630,6 +628,66 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
630
 
631
  return out_unique_topics_df
632
 
 
633
 
634
  def write_llm_output_and_logs(responses: List[ResponseObject],
635
  whole_conversation: List[str],
@@ -706,70 +764,18 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
706
 
707
  #log_files_output_paths.append(whole_conversation_path)
708
  log_files_output_paths.append(whole_conversation_path_meta)
709
-
710
- # Convert output table to markdown and then to a pandas dataframe to csv
711
- def remove_before_last_term(input_string: str) -> str:
712
- # Use regex to find the last occurrence of the term
713
- match = re.search(r'(\| ?General Topic)', input_string)
714
- if match:
715
- # Find the last occurrence by using rfind
716
- last_index = input_string.rfind(match.group(0))
717
- return input_string[last_index:] # Return everything from the last match onward
718
- return input_string # Return the original string if the term is not found
719
-
720
- # Check if the last response is a ResponseObject
721
- if isinstance(responses[-1], ResponseObject):
722
- #print("Text response:", responses[-1].text)
723
- start_of_table_response = remove_before_last_term(responses[-1].text)
724
- cleaned_response = clean_markdown_table(start_of_table_response)
725
- print("cleaned_response:", cleaned_response)
726
- elif "choices" in responses[-1]:
727
- #print("Text response:", responses[-1]["choices"][0]['text'])
728
- start_of_table_response = remove_before_last_term(responses[-1]["choices"][0]['text'])
729
- cleaned_response = clean_markdown_table(start_of_table_response)
730
- print("cleaned_response:", cleaned_response)
731
- else:
732
- #print("Text response:", responses[-1].text)
733
- start_of_table_response = remove_before_last_term(responses[-1].text)
734
- cleaned_response = clean_markdown_table(start_of_table_response)
735
- print("cleaned_response:", cleaned_response)
736
 
737
- markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
738
-
739
- #print("markdown_table:", markdown_table)
740
-
741
- # Remove <p> tags and make sure it has a valid HTML structure
742
- html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
743
- html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
744
-
745
- # Now ensure that the HTML structure is correct
746
- if "<table>" not in html_table:
747
- html_table = f"""
748
- <table>
749
- <tr>
750
- <th>General Topic</th>
751
- <th>Subtopic</th>
752
- <th>Sentiment</th>
753
- <th>Response References</th>
754
- <th>Summary</th>
755
- </tr>
756
- {html_table}
757
- </table>
758
- """
759
-
760
- # print("Markdown table as HTML:", html_table)
761
-
762
- html_buffer = StringIO(html_table)
763
 
 
764
  try:
765
- topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
766
  except Exception as e:
767
- print("Error when trying to parse table:", e)
768
- is_error = True
769
- raise ValueError()
770
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
771
 
772
-
773
  # Rename columns to ensure consistent use of data frames later in code
774
  topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
775
 
@@ -887,7 +893,7 @@ def extract_topics(in_data_file,
887
  temperature:float,
888
  chosen_cols:List[str],
889
  model_choice:str,
890
- candidate_topics: GradioFileData = [],
891
  latest_batch_completed:int=0,
892
  out_message:List=[],
893
  out_file_paths:List = [],
@@ -906,11 +912,11 @@ def extract_topics(in_data_file,
906
  time_taken:float = 0,
907
  max_tokens:int=max_tokens,
908
  model_name_map:dict=model_name_map,
909
- max_time_for_loop:int=max_time_for_loop,
910
  progress=Progress(track_tqdm=True)):
911
 
912
  '''
913
- Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
914
 
915
  Parameters:
916
  - in_data_file (gr.File): Gradio file object containing input data
@@ -954,14 +960,18 @@ def extract_topics(in_data_file,
954
  final_time = 0.0
955
  whole_conversation_metadata = []
956
  is_error = False
 
 
 
 
957
  #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
958
  #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
959
- #llama_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
960
- #llama_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
961
- #llama_prefix = "<|user|>\n" # This is for phi 3.5
962
- #llama_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
963
- llama_prefix = "<start_of_turn>user\n"
964
- llama_suffix = "<end_of_turn>\n<start_of_turn>model\n"
965
 
966
  # Reset output files on each run:
967
  # out_file_paths = []
@@ -987,6 +997,7 @@ def extract_topics(in_data_file,
987
 
988
  # If this is the first time around, set variables to 0/blank
989
  if first_loop_state==True:
 
990
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
991
  latest_batch_completed = 0
992
  out_message = []
@@ -998,7 +1009,8 @@ def extract_topics(in_data_file,
998
  local_model, tokenizer = load_model()
999
  print("Local model loaded:", local_model)
1000
 
1001
- #print("latest_batch_completed:", str(latest_batch_completed))
 
1002
 
1003
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1004
  if latest_batch_completed >= num_batches:
@@ -1070,14 +1082,14 @@ def extract_topics(in_data_file,
1070
  log_files_output_paths.append(missing_df_out_path)
1071
 
1072
  out_file_paths = list(set(out_file_paths))
1073
- log_files_output_paths = list(set(log_files_output_paths))
1074
 
1075
- print("out_file_paths:", out_file_paths)
 
1076
 
1077
  #final_out_message = '\n'.join(out_message)
1078
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
1079
-
1080
-
1081
 
1082
  if num_batches > 0:
1083
  progress_measure = round(latest_batch_completed / num_batches, 1)
@@ -1092,8 +1104,7 @@ def extract_topics(in_data_file,
1092
 
1093
  if not out_file_paths:
1094
  out_file_paths = []
1095
-
1096
-
1097
 
1098
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1099
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
@@ -1104,9 +1115,7 @@ def extract_topics(in_data_file,
1104
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1105
 
1106
 
1107
-
1108
  for i in topics_loop:
1109
-
1110
  #for latest_batch_completed in range(num_batches):
1111
  reported_batch_no = latest_batch_completed + 1
1112
  print("Running query batch", str(reported_batch_no))
@@ -1124,11 +1133,12 @@ def extract_topics(in_data_file,
1124
  # If the latest batch of responses contains at least one instance of text
1125
  if not simple_table_df.empty:
1126
 
1127
-
1128
  print("latest_batch_completed:", latest_batch_completed)
1129
 
 
 
1130
  # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
1131
- if latest_batch_completed >= 1 or candidate_topics:
1132
 
1133
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
1134
 
@@ -1141,11 +1151,14 @@ def extract_topics(in_data_file,
1141
  else:
1142
  print("Using local model:", model_choice)
1143
 
1144
- if candidate_topics:
 
 
1145
  # 'Zero shot topics' are those supplied by the user
1146
  max_topic_no = 120
1147
 
1148
  zero_shot_topics = read_file(candidate_topics.name)
 
1149
  if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1150
  zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1151
  # Max 120 topics allowed
@@ -1156,55 +1169,99 @@ def extract_topics(in_data_file,
1156
  zero_shot_topics_list = list(zero_shot_topics_series)
1157
 
1158
  print("Zero shot topics are:", zero_shot_topics_list)
1159
-
1160
- # Create the most up to date list of topics and subtopics.
1161
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1162
- if existing_unique_topics_df.empty:
1163
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
 
 
1164
 
1165
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1166
- elif not existing_unique_topics_df.empty:
1167
- zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1168
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1169
- zero_shot_topics_list_str = zero_shot_topics_list
 
1170
 
1171
- elif set(["General Topic", "Subtopic", "Sentiment"]).issubset(zero_shot_topics.columns):
 
1172
  # Max 120 topics allowed
1173
  if zero_shot_topics.shape[0] > max_topic_no:
1174
  print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1175
  zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1176
 
1177
  if existing_unique_topics_df.empty:
1178
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1], 'Sentiment':zero_shot_topics.iloc[:,2]})
 
 
1179
 
1180
-
 
 
1181
 
1182
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1183
 
1184
  #all_topic_tables_df_merged = existing_unique_topics_df
1185
  existing_unique_topics_df["Response References"] = ""
1186
 
1187
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
1188
 
1189
- #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_master_all_topic_tables_df_merged_" + model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1190
 
1191
  # Format the summary prompt with the response table and topics
1192
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
 
 
1193
 
1194
  if model_choice == "gemma_2b_it_local":
1195
- # add_existing_topics_system_prompt = llama_system_prefix + add_existing_topics_system_prompt + llama_system_suffix
1196
- # formatted_initial_table_prompt = llama_prefix + formatted_summary_prompt + llama_suffix
1197
-
1198
- formatted_initial_table_prompt = llama_prefix + add_existing_topics_system_prompt + formatted_summary_prompt + llama_suffix
1199
 
 
1200
 
1201
  # Define the output file path for the formatted prompt
1202
- formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1203
 
1204
  # Write the formatted prompt to the specified file
1205
  try:
1206
  with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1207
- f.write(formatted_summary_prompt)
1208
  except Exception as e:
1209
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1210
 
@@ -1216,7 +1273,7 @@ def extract_topics(in_data_file,
1216
  summary_whole_conversation = []
1217
 
1218
  # Process requests to large language model
1219
- master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1220
 
1221
  # print("master_summary_response:", master_summary_response[-1].text)
1222
  # print("Whole conversation metadata:", whole_conversation_metadata)
@@ -1253,24 +1310,13 @@ def extract_topics(in_data_file,
1253
 
1254
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1255
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
 
1256
 
 
 
1257
 
1258
- # Write final output to text file also
1259
- #try:
1260
- # new_final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + #model_choice_clean + "_temp_" + str(temperature) + ".txt"
1261
-
1262
- # with open(new_final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1263
- # f.write(display_table)
1264
-
1265
- # log_files_output_paths.append(new_final_table_output_path)
1266
-
1267
- #except Exception as e:
1268
- # print(e)
1269
-
1270
- latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1271
-
1272
- out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
1273
- log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
1274
 
1275
  print("out_file_paths at end of loop:", out_file_paths)
1276
 
@@ -1285,7 +1331,9 @@ def extract_topics(in_data_file,
1285
  else:
1286
  print("Using AWS Bedrock model:", model_choice)
1287
 
1288
- formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, consultation_context=context_textbox, column_name=chosen_cols)
 
 
1289
 
1290
  if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
1291
  else: formatted_prompt2 = prompt2
@@ -1294,21 +1342,16 @@ def extract_topics(in_data_file,
1294
  else: formatted_prompt3 = prompt3
1295
 
1296
  if model_choice == "gemma_2b_it_local":
1297
- # system_prompt = llama_system_prefix + system_prompt + llama_system_suffix
1298
- # formatted_initial_table_prompt = llama_prefix + formatted_initial_table_prompt + llama_suffix
1299
- # formatted_prompt2 = llama_prefix + formatted_prompt2 + llama_suffix
1300
- # formatted_prompt3 = llama_prefix + formatted_prompt3 + llama_suffix
1301
-
1302
- formatted_initial_table_prompt = llama_prefix + system_prompt + formatted_initial_table_prompt + llama_suffix
1303
- formatted_prompt2 = llama_prefix + system_prompt + formatted_prompt2 + llama_suffix
1304
- formatted_prompt3 = llama_prefix + system_prompt + formatted_prompt3 + llama_suffix
1305
 
1306
  batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1307
 
1308
- whole_conversation = [system_prompt]
1309
 
1310
  # Process requests to large language model
1311
- responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
1312
 
1313
  # print("Whole conversation metadata before:", whole_conversation_metadata)
1314
 
@@ -1358,8 +1401,6 @@ def extract_topics(in_data_file,
1358
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1359
  f.write(responses[-1].text)
1360
  display_table = responses[-1].text
1361
-
1362
-
1363
 
1364
  log_files_output_paths.append(final_table_output_path)
1365
 
@@ -1370,11 +1411,11 @@ def extract_topics(in_data_file,
1370
  new_reference_df = reference_df
1371
 
1372
  else:
1373
- print("Current batch of responses contains no text, moving onto next. Batch number:", latest_batch_completed, ". Start row:", start_row, ". End row:", end_row)
1374
 
1375
  # Increase latest file completed count unless we are at the last file
1376
  if latest_batch_completed != num_batches:
1377
- print("Completed batch number:", str(latest_batch_completed))
1378
  latest_batch_completed += 1
1379
 
1380
  toc = time.perf_counter()
@@ -1391,17 +1432,16 @@ def extract_topics(in_data_file,
1391
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
1392
  existing_topics_table = new_topic_df.dropna(how='all')
1393
 
1394
- out_time = f"in {final_time:0.1f} seconds."
1395
- print(out_time)
1396
 
1397
  out_message.append('All queries successfully completed in')
1398
 
1399
  final_message_out = '\n'.join(out_message)
1400
- final_message_out = final_message_out + " " + out_time
1401
 
1402
- final_message_out = final_message_out + "\n\nGo to to the LLM settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
1403
 
1404
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths #, final_message_out
1405
 
1406
  # SUMMARISATION FUNCTIONS
1407
 
@@ -1463,7 +1503,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
1463
 
1464
  reference_df_unique = reference_df.drop_duplicates("old_category")
1465
 
1466
- reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
1467
 
1468
  # Deduplicate categories within each sentiment group
1469
  deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
@@ -1558,7 +1598,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
1558
  whole_conversation = [summarise_topic_descriptions_system_prompt]
1559
 
1560
  # Process requests to large language model
1561
- responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)
1562
 
1563
  print("Finished summary query")
1564
 
@@ -1569,8 +1609,6 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
1569
  else:
1570
  response_texts = [resp.text for resp in responses]
1571
 
1572
-
1573
-
1574
  latest_response_text = response_texts[-1]
1575
 
1576
  #print("latest_response_text:", latest_response_text)
@@ -1597,6 +1635,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1597
  Create better summaries of the raw batch-level summaries created in the first run of the model.
1598
  '''
1599
  out_metadata = []
 
1600
 
1601
  print("In summarise_output_topics function.")
1602
 
@@ -1672,7 +1711,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1672
  print("Current summary number is:", summary_no)
1673
 
1674
  summary_text = all_summaries[summary_no]
1675
- print("summary_text:", summary_text)
1676
  formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
1677
 
1678
  try:
@@ -1696,7 +1735,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1696
  time_taken = tic - toc
1697
 
1698
  if time_taken > max_time_for_loop:
1699
- print("Time taken for loop is greater than maximum time allowed.")
1700
  summary_loop.close()
1701
  tqdm._instances.clear()
1702
  break
 
7
  import time
8
  import boto3
9
  import json
10
+ import math
11
  import string
12
  import re
13
  import spaces
 
19
 
20
  GradioFileData = gr.FileData
21
 
22
+ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
23
  from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
 
78
 
79
  try:
80
  file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
81
+ num_batches = math.ceil(len(file_data) / batch_size)
82
  print("Total number of batches:", num_batches)
83
 
84
  except Exception as e:
 
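The change from (len(file_data) // batch_size) + 1 to math.ceil(len(file_data) / batch_size) avoids scheduling an extra, empty batch when the row count divides evenly by the batch size, for example:

import math

rows, batch_size = 100, 50
print((rows // batch_size) + 1)        # 3 (old calculation - the last batch would be empty)
print(math.ceil(rows / batch_size))    # 2 (new calculation)
print(math.ceil(101 / batch_size))     # 3 (a partial final batch is still counted)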
196
  ~(simple_file["Response"] == " ") &\
197
  ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
198
 
199
+ #simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
200
+ #simple_file.to_csv(simplified_csv_table_path, index=None)
201
 
202
  simple_markdown_table = simple_file.to_markdown(index=None)
203
 
 
484
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)
485
 
486
  if isinstance(response, ResponseObject):
487
+ response_text = response.text
 
 
488
  elif 'choices' in response:
489
+ response_text = response['choices'][0]['text']
 
 
 
490
  else:
491
+ response_text = response.text
492
+
493
+ responses.append(response)
494
+ whole_conversation.append(prompt)
495
+ whole_conversation.append(response_text)
496
 
497
  # Create conversation metadata
498
  if master == False:
 
520
  whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
521
 
522
 
523
+ return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
524
 
525
  ### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
526
 
 
628
 
629
  return out_unique_topics_df
630
 
631
+ # Convert output table to markdown and then to a pandas dataframe to csv
632
+ def remove_before_last_term(input_string: str) -> str:
633
+ # Use regex to find the last occurrence of the term
634
+ match = re.search(r'(\| ?General Topic)', input_string)
635
+ if match:
636
+ # Find the last occurrence by using rfind
637
+ last_index = input_string.rfind(match.group(0))
638
+ return input_string[last_index:] # Return everything from the last match onward
639
+ return input_string # Return the original string if the term is not found
640
+
641
+ def convert_response_text_to_markdown_table(response_text:str, table_type:str = "Main table"):
642
+ is_error = False
643
+ start_of_table_response = remove_before_last_term(response_text)
644
+ cleaned_response = clean_markdown_table(start_of_table_response)
645
+
646
+ markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
647
+
648
+ # Remove <p> tags and make sure it has a valid HTML structure
649
+ html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
650
+ html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
651
+
652
+ # Now ensure that the HTML structure is correct
653
+ if table_type == "Main table":
654
+ if "<table>" not in html_table:
655
+ html_table = f"""
656
+ <table>
657
+ <tr>
658
+ <th>General Topic</th>
659
+ <th>Subtopic</th>
660
+ <th>Sentiment</th>
661
+ <th>Response References</th>
662
+ <th>Summary</th>
663
+ </tr>
664
+ {html_table}
665
+ </table>
666
+ """
667
+ elif table_type == "Revised topics table":
668
+ if "<table>" not in html_table:
669
+ html_table = f"""
670
+ <table>
671
+ <tr>
672
+ <th>General Topic</th>
673
+ <th>Subtopic</th>
674
+ </tr>
675
+ {html_table}
676
+ </table>
677
+ """
678
+
679
+ html_buffer = StringIO(html_table)
680
+
681
+ try:
682
+ out_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
683
+ except Exception as e:
684
+ print("Error when trying to parse table:", e)
685
+ is_error = True
686
+ raise ValueError()
688
+
689
+ return out_df, is_error
690
+
691
 
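A hypothetical round trip through the new helper; the LLM response string below is invented for illustration, and it assumes convert_response_text_to_markdown_table can be imported from tools.llm_api_call:

from tools.llm_api_call import convert_response_text_to_markdown_table

llm_response = """Here is the requested table:

| General Topic | Subtopic | Sentiment | Response References | Summary |
|---|---|---|---|---|
| Housing | Parking provision | Negative | 1, 4 | Concern about the loss of on-street parking |
"""

# Everything before the last "| General Topic" header is stripped, the markdown
# table is cleaned, rendered to HTML, and read back as a pandas DataFrame.
table_df, is_error = convert_response_text_to_markdown_table(llm_response)
if not is_error:
    print(table_df.columns.tolist())
    # ['General Topic', 'Subtopic', 'Sentiment', 'Response References', 'Summary']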
692
  def write_llm_output_and_logs(responses: List[ResponseObject],
693
  whole_conversation: List[str],
 
764
 
765
  #log_files_output_paths.append(whole_conversation_path)
766
  log_files_output_paths.append(whole_conversation_path_meta)
 
 
 
767
 
768
+ if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
769
+ elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
770
+ else: response_text = responses[-1].text
 
 
771
 
772
+ # Convert response text to a markdown table
773
  try:
774
+ topic_with_response_df, is_error = convert_response_text_to_markdown_table(response_text)
775
  except Exception as e:
776
+ print("Error in parsing markdown table from response text:", e)
 
 
777
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
778
 
 
779
  # Rename columns to ensure consistent use of data frames later in code
780
  topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
781
 
 
893
  temperature:float,
894
  chosen_cols:List[str],
895
  model_choice:str,
896
+ candidate_topics: GradioFileData = None,
897
  latest_batch_completed:int=0,
898
  out_message:List=[],
899
  out_file_paths:List = [],
 
912
  time_taken:float = 0,
913
  max_tokens:int=max_tokens,
914
  model_name_map:dict=model_name_map,
915
+ max_time_for_loop:int=max_time_for_loop,
916
  progress=Progress(track_tqdm=True)):
917
 
918
  '''
919
+ Query an LLM (local Gemma 2B Instruct, Gemini, or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
920
 
921
  Parameters:
922
  - in_data_file (gr.File): Gradio file object containing input data
 
960
  final_time = 0.0
961
  whole_conversation_metadata = []
962
  is_error = False
963
+ create_revised_general_topics = False
964
+ local_model = []
965
+ tokenizer = []
966
+ zero_shot_topics_df = pd.DataFrame()
967
  #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
968
  #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
969
+ #llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
970
+ #llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
971
+ #llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
972
+ #llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
973
+ llama_cpp_prefix = "<start_of_turn>user\n"
974
+ llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
975
 
976
  # Reset output files on each run:
977
  # out_file_paths = []
 
997
 
998
  # If this is the first time around, set variables to 0/blank
999
  if first_loop_state==True:
1000
+ print("This is the first time through the loop")
1001
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
1002
  latest_batch_completed = 0
1003
  out_message = []
 
1009
  local_model, tokenizer = load_model()
1010
  print("Local model loaded:", local_model)
1011
 
1012
+ print("latest_batch_completed at start of function:", str(latest_batch_completed))
1013
+ print("total number of batches:", str(num_batches))
1014
 
1015
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
1016
  if latest_batch_completed >= num_batches:
 
1082
  log_files_output_paths.append(missing_df_out_path)
1083
 
1084
  out_file_paths = list(set(out_file_paths))
1085
+ log_files_output_paths = list(set(log_files_output_paths))
1086
 
1087
+ summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
1088
+ print("summary_out_file_paths:", summary_out_file_paths)
1089
 
1090
  #final_out_message = '\n'.join(out_message)
1091
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, summary_out_file_paths
1092
+
 
1093
 
1094
  if num_batches > 0:
1095
  progress_measure = round(latest_batch_completed / num_batches, 1)
 
1104
 
1105
  if not out_file_paths:
1106
  out_file_paths = []
1107
+
 
1108
 
1109
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1110
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
 
1115
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1116
 
1117
 
 
1118
  for i in topics_loop:
 
1119
  #for latest_batch_completed in range(num_batches):
1120
  reported_batch_no = latest_batch_completed + 1
1121
  print("Running query batch", str(reported_batch_no))
 
1133
  # If the latest batch of responses contains at least one instance of text
1134
  if not simple_table_df.empty:
1135
 
 
1136
  print("latest_batch_completed:", latest_batch_completed)
1137
 
1138
+ print("candidate_topics:", candidate_topics)
1139
+
1140
  # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
1141
+ if latest_batch_completed >= 1 or candidate_topics is not None:
1142
 
1143
  #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
1144
 
 
1151
  else:
1152
  print("Using local model:", model_choice)
1153
 
1154
+ # Preparing candidate topics
1155
+ if candidate_topics and existing_unique_topics_df.empty:
1156
+ progress(0.1, "Creating revised zero shot topics table")
1157
  # 'Zero shot topics' are those supplied by the user
1158
  max_topic_no = 120
1159
 
1160
  zero_shot_topics = read_file(candidate_topics.name)
1161
+
1162
  if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1163
  zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1164
  # Max 120 topics allowed
 
1169
  zero_shot_topics_list = list(zero_shot_topics_series)
1170
 
1171
  print("Zero shot topics are:", zero_shot_topics_list)
1172
+
1173
+ if create_revised_general_topics == True:
1174
+ # Create the most up to date list of topics and subtopics.
1175
+ # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1176
+ unique_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1177
+ unique_topics_markdown = unique_topics_df.to_markdown()
1178
+
1179
+ print("unique_topics_markdown:", unique_topics_markdown)
1180
+
1181
+ formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1182
+
1183
+ # Format the general_topics prompt with the topics
1184
+ formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1185
+
1186
+ if model_choice == "gemma_2b_it_local":
1187
+ formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1188
+
1189
+ formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1190
+
1191
+ whole_conversation = []
1192
+
1193
+ general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = process_requests(formatted_general_topics_prompt_list, formatted_general_topics_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1194
+
1195
+ # Convert response text to a markdown table
1196
+ try:
1197
+ zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1198
+ print("Output revised zero shot topics table is:", zero_shot_topics_df)
1199
+
1200
+ zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1201
+ zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1202
+ out_file_paths.append(zero_shot_revised_path)
1203
+ except Exception as e:
1204
+ print("Error in parsing markdown table from response text:", e)
1205
+ print("Not adding revised General Topics to table")
1206
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1207
+
1208
+ if zero_shot_topics_df.empty:
1209
+ print("Creation of revised general topics df failed, reverting to original list")
1210
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1211
+ else:
1212
+ zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
1213
 
1214
  # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1215
+ if not existing_unique_topics_df.empty:
 
1216
  existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1217
+ else:
1218
+ existing_unique_topics_df = zero_shot_topics_df
1219
 
1220
+ # If your zero shot column file already contains General Topic and Subtopic columns
1221
+ if set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1222
  # Max 120 topics allowed
1223
  if zero_shot_topics.shape[0] > max_topic_no:
1224
  print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1225
  zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1226
 
1227
  if existing_unique_topics_df.empty:
1228
+ existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1]})
1229
+
1230
+ zero_shot_topics_df = zero_shot_topics
1231
 
1232
+ if candidate_topics and not zero_shot_topics_df.empty:
1233
+ # If you have already created revised zero shot topics, concat to the current
1234
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
1235
 
1236
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1237
 
1238
  #all_topic_tables_df_merged = existing_unique_topics_df
1239
  existing_unique_topics_df["Response References"] = ""
1240
 
1241
+ unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1242
 
1243
+ #existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
1244
 
1245
  # Format the summary prompt with the response table and topics
1246
+ formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1247
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
1248
+
1249
 
1250
  if model_choice == "gemma_2b_it_local":
1251
+ formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1252
+ full_prompt = formatted_summary_prompt
1253
+ else:
1254
+ full_prompt = formatted_system_prompt + formatted_summary_prompt
1255
 
1256
+ #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
1257
 
1258
  # Define the output file path for the formatted prompt
1259
+ formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1260
 
1261
  # Write the formatted prompt to the specified file
1262
  try:
1263
  with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
1264
+ f.write(full_prompt)
1265
  except Exception as e:
1266
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1267
 
 
1273
  summary_whole_conversation = []
1274
 
1275
  # Process requests to large language model
1276
+ master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1277
 
1278
  # print("master_summary_response:", master_summary_response[-1].text)
1279
  # print("Whole conversation metadata:", whole_conversation_metadata)
 
 
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
+
 
+ #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
+ #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
 
+ out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
+ log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
 
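A small, hypothetical example of the batch filtering above (paths invented). Note that `str(reported_batch_no) in path` is a plain substring test, so batch 1 would, for example, also match a file named for batch 10.

```python
reported_batch_no = 2

out_file_paths = [
    "output/topic_table_batch_1.csv",
    "output/topic_table_batch_2.csv",
]
log_files_output_paths = [
    "output/full_prompt_batch_2.txt",
    "output/full_prompt_batch_1.txt",
]

# Keep only the paths produced for the current batch
out_file_paths = [path for path in out_file_paths if str(reported_batch_no) in path]
log_files_output_paths = [path for path in log_files_output_paths if str(reported_batch_no) in path]

print(out_file_paths)          # ['output/topic_table_batch_2.csv']
print(log_files_output_paths)  # ['output/full_prompt_batch_2.txt']
```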
 
  print("out_file_paths at end of loop:", out_file_paths)
 
  else:
  print("Using AWS Bedrock model:", model_choice)
 
+ formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)
+
+ formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
 
  if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
  else: formatted_prompt2 = prompt2
 
  else: formatted_prompt3 = prompt3
 
  if model_choice == "gemma_2b_it_local":
+ formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
+ formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
+ formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
  batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
 
+ whole_conversation = [formatted_initial_table_system_prompt]
 
  # Process requests to large language model
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
 
  # print("Whole conversation metadata before:", whole_conversation_metadata)
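The `process_requests(...)` calls above drive one multi-prompt conversation per batch. The sketch below is a simplified illustration of that general pattern, not the repository's implementation; `send_to_model` is a made-up stand-in for the actual API call.

```python
from typing import Callable, List, Tuple

def run_prompts_sequentially(
    prompts: List[str],
    system_prompt: str,
    send_to_model: Callable[[str], str],
) -> Tuple[List[str], List[str]]:
    """Send each prompt in turn, carrying the growing conversation forward."""
    conversation: List[str] = [system_prompt]
    responses: List[str] = []
    for prompt in prompts:
        conversation.append(prompt)
        reply = send_to_model("\n".join(conversation))  # stand-in for a real LLM call
        conversation.append(reply)
        responses.append(reply)
    return responses, conversation

# Dummy model that just reports how much context it received
responses, conversation = run_prompts_sequentially(
    ["First prompt", "Second prompt"],
    "You are a researcher analysing open text responses.",
    lambda text: f"(received {len(text)} characters)",
)
```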
 
 
  with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
  f.write(responses[-1].text)
  display_table = responses[-1].text
 
  log_files_output_paths.append(final_table_output_path)
 
  new_reference_df = reference_df
 
  else:
+ print("Current batch of responses contains no text, moving on to the next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
 
  # Increase latest batch completed count unless we are at the last batch
  if latest_batch_completed != num_batches:
+ print("Completed batch number:", str(reported_batch_no))
  latest_batch_completed += 1
 
  toc = time.perf_counter()
 
  existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
  existing_topics_table = new_topic_df.dropna(how='all')
 
+ out_time = f"{final_time:0.1f} seconds."
 
  out_message.append('All queries successfully completed in')
 
  final_message_out = '\n'.join(out_message)
+ final_message_out = final_message_out + " " + out_time
 
+ print(final_message_out)
 
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
 
  # SUMMARISATION FUNCTIONS
 
 
  reference_df_unique = reference_df.drop_duplicates("old_category")
 
+ #reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
 
  # Deduplicate categories within each sentiment group
  deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
 
  whole_conversation = [summarise_topic_descriptions_system_prompt]
 
  # Process requests to large language model
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)
 
  print("Finished summary query")
 
 
  else:
  response_texts = [resp.text for resp in responses]
 
  latest_response_text = response_texts[-1]
 
  #print("latest_response_text:", latest_response_text)
 
  Create better summaries of the raw batch-level summaries created in the first run of the model.
  '''
  out_metadata = []
+ local_model = []
 
  print("In summarise_output_topics function.")
 
 
  print("Current summary number is:", summary_no)
 
  summary_text = all_summaries[summary_no]
+ #print("summary_text:", summary_text)
  formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
 
  try:
 
  time_taken = tic - toc
 
  if time_taken > max_time_for_loop:
+ print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
  summary_loop.close()
  tqdm._instances.clear()
  break
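The guard above stops the summarisation loop once a wall-clock budget is exceeded so the app can be restarted from where it left off. A stripped-down illustration of the same pattern (the 10-second budget and the dummy workload are arbitrary examples):

```python
import time
from tqdm import tqdm

max_time_for_loop = 10  # seconds; arbitrary example budget
tic = time.perf_counter()

summary_loop = tqdm(range(1000), unit="summaries")
for summary_no in summary_loop:
    time.sleep(0.05)  # stand-in for a call to the large language model

    if time.perf_counter() - tic > max_time_for_loop:
        print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
        summary_loop.close()
        tqdm._instances.clear()
        break
```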
tools/prompts.py CHANGED
@@ -1,4 +1,4 @@
- system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called {column_name}. The context of this analysis is: {consultation_context}. """
+ system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called '{column_name}'. The context of this analysis is '{consultation_context}'."""
 
  initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
  {response_table}
@@ -50,6 +50,16 @@ Your task is to make a consolidated summary of the above text. Return a summary
  Summary:"""
 
 
+ create_general_topics_system_prompt = system_prompt
+
+ create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
+ {topics}
+
+ Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
+
+ New Topics table:"""
+
+
  # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
  # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
  # Summarise the following text in less than {length} words: "{text}"\n
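A short usage sketch for the new prompt (the import path follows this repository's layout; the subtopics and consultation context are invented, and `to_markdown` needs the optional `tabulate` package):

```python
import pandas as pd
from tools.prompts import create_general_topics_prompt, create_general_topics_system_prompt

# Invented subtopics rendered as the markdown table the prompt expects
subtopics = pd.DataFrame({"Subtopic": ["Bus frequency", "Cycle lanes", "Air quality"]})
topics_markdown = subtopics.to_markdown(index=False)

system_text = create_general_topics_system_prompt.format(
    column_name="Response", consultation_context="a local transport consultation"
)
prompt_text = create_general_topics_prompt.format(topics=topics_markdown)

full_prompt = system_text + "\n" + prompt_text  # ready to send to the chosen LLM
print(full_prompt)
```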