Spaces: Runtime error

Commit 74d2271
Parent(s): f0b3bbc

Refactor app.py and related modules for improved topic extraction and summarization. Updated UI prompts for clarity, enhanced file upload functionality, and added error handling in AWS file uploads. Introduced new functions for converting response text to markdown tables and for creating general topics from subtopics, and improved overall code structure for better maintainability.

Files changed:
- app.py (+10 -11)
- tools/aws_functions.py (+25 -20)
- tools/llm_api_call.py (+197 -158)
- tools/prompts.py (+11 -1)
app.py CHANGED

@@ -6,7 +6,6 @@ from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
 from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
-from tools.chatfuncs import load_model
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
 import pandas as pd

@@ -92,23 +91,23 @@ with app:
     with gr.Tab(label="Extract topics"):
         gr.Markdown(
             """
-            ### Choose a tabular data file (xlsx or csv) of
+            ### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
             """
         )
         with gr.Row():
             model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
             in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

-        with gr.Accordion("Upload xlsx or csv
+        with gr.Accordion("Upload xlsx or csv file", open = True):
            in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

-           in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet
-           in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select
+           in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
+           in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)

         with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
            candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")

-        context_textbox = gr.Textbox(label="Write
+        context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

         extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")

@@ -119,7 +118,7 @@ with app:
         latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)

         data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
-        data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the
+        data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
            choices=["The results were good", "The results were not good"], visible=False)
         data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
         data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)

@@ -130,7 +129,7 @@ with app:
     with gr.Tab(label="Summarise topic outputs"):
         gr.Markdown(
             """
-            ### Load in
+            ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
             """)
         with gr.Accordion("Upload reference data file and unique data files", open = True):
            summarisation_in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

@@ -141,7 +140,7 @@ with app:
     with gr.Tab(label="Continue previous topic extraction"):
         gr.Markdown(
             """
-            ### Load in data files from a previous attempt at
+            ### Load in data files from a previous attempt at extracting topics to continue it.
             """)

         with gr.Accordion("Upload reference data file and unique data files", open = True):

@@ -207,7 +206,7 @@ with app:
    ###

    # Tabular data upload
-   in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])
+   in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, data_file_names_textbox])

    extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
        then(load_in_data_file,

@@ -215,7 +214,7 @@ with app:
        fn=extract_topics,
        inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
        outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
-
+
    # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
    latest_batch_completed.change(fn=extract_topics,
        inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
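The upload event above hands the chosen files to put_columns_in_df, which populates the column and sheet dropdowns and the file-name textbox. A rough sketch of what such a callback can look like is below; the return format is an assumption for illustration, not the actual implementation in tools/helper_functions.py.

import os
import pandas as pd
import gradio as gr

def put_columns_in_df_sketch(files):
    # Hypothetical stand-in for tools.helper_functions.put_columns_in_df:
    # read the first uploaded tabular file and offer its columns as choices
    # for the 'in_colnames' dropdown, plus a placeholder sheet list and the file name.
    file_path = files[0].name
    if file_path.endswith(".csv"):
        df = pd.read_csv(file_path, nrows=5)
    else:
        df = pd.read_excel(file_path, nrows=5)
    return (gr.update(choices=list(df.columns)),       # in_colnames
            gr.update(choices=["Choose Excel sheet"]),  # in_excel_sheets
            os.path.basename(file_path))                # data_file_names_textbox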
tools/aws_functions.py CHANGED

@@ -159,7 +159,7 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_

    return files, out_message

-def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name, RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
    """
    Uploads a file from local machine to Amazon S3.

@@ -171,31 +171,36 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
    Returns:
    - Message as variable/printed to console
    """
-   local_file_paths = [local_file_paths]
-   # Get file name off file path
-   file_name = os.path.basename(file)
+   if RUN_AWS_FUNCTIONS == "1":
+
+      final_out_message = []
+
+      s3_client = boto3.client('s3')
+
+      if isinstance(local_file_paths, str):
+         local_file_paths = [local_file_paths]
+
+      for file in local_file_paths:
+         try:
+            # Get file name off file path
+            file_name = os.path.basename(file)
+
+            s3_key_full = s3_key + file_name
+            print("S3 key: ", s3_key_full)
+
+            s3_client.upload_file(file, s3_bucket, s3_key_full)
+            out_message = "File " + file_name + " uploaded successfully!"
+            print(out_message)
+
+         except Exception as e:
+            out_message = f"Error uploading file(s): {e}"
+            print(out_message)
+
+         final_out_message.append(out_message)
+         final_out_message_str = '\n'.join(final_out_message)
+
+   else:
+      final_out_message_str("Not connected to AWS, no files uploaded.")

    return final_out_message_str
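For orientation, a minimal, hypothetical sketch of calling the refactored uploader from the app; the file names and key prefix are illustrative, and it assumes the intent of the else branch is simply to report that nothing was uploaded (as committed, that branch calls final_out_message_str as if it were a function).

from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS

# Illustrative file list; in the app these would be the topic extraction output CSVs.
output_files = ["output/topic_table_batch_1.csv", "output/reference_table_batch_1.csv"]

# Uploads only run when the RUN_AWS_FUNCTIONS environment flag is "1";
# each file is uploaded under s3_key + file name, and per-file success or
# error messages are joined into the returned string.
upload_message = upload_file_to_s3(output_files, "topic_extraction_output/", RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS)
print(upload_message)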
tools/llm_api_call.py
CHANGED
@@ -7,6 +7,7 @@ import markdown
|
|
7 |
import time
|
8 |
import boto3
|
9 |
import json
|
|
|
10 |
import string
|
11 |
import re
|
12 |
import spaces
|
@@ -18,7 +19,7 @@ from io import StringIO
|
|
18 |
|
19 |
GradioFileData = gr.FileData
|
20 |
|
21 |
-
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
|
22 |
from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
|
23 |
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
|
24 |
|
@@ -77,7 +78,7 @@ def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:in
|
|
77 |
|
78 |
try:
|
79 |
file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
|
80 |
-
num_batches = (len(file_data)
|
81 |
print("Total number of batches:", num_batches)
|
82 |
|
83 |
except Exception as e:
|
@@ -195,8 +196,8 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
|
|
195 |
~(simple_file["Response"] == " ") &\
|
196 |
~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
|
197 |
|
198 |
-
simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
|
199 |
-
simple_file.to_csv(simplified_csv_table_path, index=None)
|
200 |
|
201 |
simple_markdown_table = simple_file.to_markdown(index=None)
|
202 |
|
@@ -483,18 +484,15 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
|
|
483 |
response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)
|
484 |
|
485 |
if isinstance(response, ResponseObject):
|
486 |
-
|
487 |
-
whole_conversation.append(prompt)
|
488 |
-
whole_conversation.append(response.text)
|
489 |
elif 'choices' in response:
|
490 |
-
|
491 |
-
# Create conversation txt object
|
492 |
-
whole_conversation.append(prompt)
|
493 |
-
whole_conversation.append(response['choices'][0]['text'])
|
494 |
else:
|
495 |
-
|
496 |
-
|
497 |
-
|
|
|
|
|
498 |
|
499 |
# Create conversation metadata
|
500 |
if master == False:
|
@@ -522,7 +520,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
|
|
522 |
whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
|
523 |
|
524 |
|
525 |
-
return responses, conversation_history, whole_conversation, whole_conversation_metadata
|
526 |
|
527 |
### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
|
528 |
|
@@ -630,6 +628,66 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
|
|
630 |
|
631 |
return out_unique_topics_df
|
632 |
|
633 |
|
634 |
def write_llm_output_and_logs(responses: List[ResponseObject],
|
635 |
whole_conversation: List[str],
|
@@ -706,70 +764,18 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
706 |
|
707 |
#log_files_output_paths.append(whole_conversation_path)
|
708 |
log_files_output_paths.append(whole_conversation_path_meta)
|
709 |
-
|
710 |
-
# Convert output table to markdown and then to a pandas dataframe to csv
|
711 |
-
def remove_before_last_term(input_string: str) -> str:
|
712 |
-
# Use regex to find the last occurrence of the term
|
713 |
-
match = re.search(r'(\| ?General Topic)', input_string)
|
714 |
-
if match:
|
715 |
-
# Find the last occurrence by using rfind
|
716 |
-
last_index = input_string.rfind(match.group(0))
|
717 |
-
return input_string[last_index:] # Return everything from the last match onward
|
718 |
-
return input_string # Return the original string if the term is not found
|
719 |
-
|
720 |
-
# Check if the last response is a ResponseObject
|
721 |
-
if isinstance(responses[-1], ResponseObject):
|
722 |
-
#print("Text response:", responses[-1].text)
|
723 |
-
start_of_table_response = remove_before_last_term(responses[-1].text)
|
724 |
-
cleaned_response = clean_markdown_table(start_of_table_response)
|
725 |
-
print("cleaned_response:", cleaned_response)
|
726 |
-
elif "choices" in responses[-1]:
|
727 |
-
#print("Text response:", responses[-1]["choices"][0]['text'])
|
728 |
-
start_of_table_response = remove_before_last_term(responses[-1]["choices"][0]['text'])
|
729 |
-
cleaned_response = clean_markdown_table(start_of_table_response)
|
730 |
-
print("cleaned_response:", cleaned_response)
|
731 |
-
else:
|
732 |
-
#print("Text response:", responses[-1].text)
|
733 |
-
start_of_table_response = remove_before_last_term(responses[-1].text)
|
734 |
-
cleaned_response = clean_markdown_table(start_of_table_response)
|
735 |
-
print("cleaned_response:", cleaned_response)
|
736 |
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
# Remove <p> tags and make sure it has a valid HTML structure
|
742 |
-
html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
|
743 |
-
html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
|
744 |
-
|
745 |
-
# Now ensure that the HTML structure is correct
|
746 |
-
if "<table>" not in html_table:
|
747 |
-
html_table = f"""
|
748 |
-
<table>
|
749 |
-
<tr>
|
750 |
-
<th>General Topic</th>
|
751 |
-
<th>Subtopic</th>
|
752 |
-
<th>Sentiment</th>
|
753 |
-
<th>Response References</th>
|
754 |
-
<th>Summary</th>
|
755 |
-
</tr>
|
756 |
-
{html_table}
|
757 |
-
</table>
|
758 |
-
"""
|
759 |
-
|
760 |
-
# print("Markdown table as HTML:", html_table)
|
761 |
-
|
762 |
-
html_buffer = StringIO(html_table)
|
763 |
|
|
|
764 |
try:
|
765 |
-
topic_with_response_df =
|
766 |
except Exception as e:
|
767 |
-
print("Error
|
768 |
-
is_error = True
|
769 |
-
raise ValueError()
|
770 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
771 |
|
772 |
-
|
773 |
# Rename columns to ensure consistent use of data frames later in code
|
774 |
topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
|
775 |
|
@@ -887,7 +893,7 @@ def extract_topics(in_data_file,
|
|
887 |
temperature:float,
|
888 |
chosen_cols:List[str],
|
889 |
model_choice:str,
|
890 |
-
candidate_topics: GradioFileData =
|
891 |
latest_batch_completed:int=0,
|
892 |
out_message:List=[],
|
893 |
out_file_paths:List = [],
|
@@ -906,11 +912,11 @@ def extract_topics(in_data_file,
|
|
906 |
time_taken:float = 0,
|
907 |
max_tokens:int=max_tokens,
|
908 |
model_name_map:dict=model_name_map,
|
909 |
-
max_time_for_loop:int=max_time_for_loop,
|
910 |
progress=Progress(track_tqdm=True)):
|
911 |
|
912 |
'''
|
913 |
-
Query an LLM (Gemini or
|
914 |
|
915 |
Parameters:
|
916 |
- in_data_file (gr.File): Gradio file object containing input data
|
@@ -954,14 +960,18 @@ def extract_topics(in_data_file,
|
|
954 |
final_time = 0.0
|
955 |
whole_conversation_metadata = []
|
956 |
is_error = False
|
|
|
|
|
|
|
|
|
957 |
#llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
|
958 |
#llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
|
959 |
-
#
|
960 |
-
#
|
961 |
-
#
|
962 |
-
#
|
963 |
-
|
964 |
-
|
965 |
|
966 |
# Reset output files on each run:
|
967 |
# out_file_paths = []
|
@@ -987,6 +997,7 @@ def extract_topics(in_data_file,
|
|
987 |
|
988 |
# If this is the first time around, set variables to 0/blank
|
989 |
if first_loop_state==True:
|
|
|
990 |
if (latest_batch_completed == 999) | (latest_batch_completed == 0):
|
991 |
latest_batch_completed = 0
|
992 |
out_message = []
|
@@ -998,7 +1009,8 @@ def extract_topics(in_data_file,
|
|
998 |
local_model, tokenizer = load_model()
|
999 |
print("Local model loaded:", local_model)
|
1000 |
|
1001 |
-
|
|
|
1002 |
|
1003 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
1004 |
if latest_batch_completed >= num_batches:
|
@@ -1070,14 +1082,14 @@ def extract_topics(in_data_file,
|
|
1070 |
log_files_output_paths.append(missing_df_out_path)
|
1071 |
|
1072 |
out_file_paths = list(set(out_file_paths))
|
1073 |
-
log_files_output_paths = list(set(log_files_output_paths))
|
1074 |
|
1075 |
-
|
|
|
1076 |
|
1077 |
#final_out_message = '\n'.join(out_message)
|
1078 |
-
return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df,
|
1079 |
-
|
1080 |
-
|
1081 |
|
1082 |
if num_batches > 0:
|
1083 |
progress_measure = round(latest_batch_completed / num_batches, 1)
|
@@ -1092,8 +1104,7 @@ def extract_topics(in_data_file,
|
|
1092 |
|
1093 |
if not out_file_paths:
|
1094 |
out_file_paths = []
|
1095 |
-
|
1096 |
-
|
1097 |
|
1098 |
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
1099 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
@@ -1104,9 +1115,7 @@ def extract_topics(in_data_file,
|
|
1104 |
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
1105 |
|
1106 |
|
1107 |
-
|
1108 |
for i in topics_loop:
|
1109 |
-
|
1110 |
#for latest_batch_completed in range(num_batches):
|
1111 |
reported_batch_no = latest_batch_completed + 1
|
1112 |
print("Running query batch", str(reported_batch_no))
|
@@ -1124,11 +1133,12 @@ def extract_topics(in_data_file,
|
|
1124 |
# If the latest batch of responses contains at least one instance of text
|
1125 |
if not simple_table_df.empty:
|
1126 |
|
1127 |
-
|
1128 |
print("latest_batch_completed:", latest_batch_completed)
|
1129 |
|
|
|
|
|
1130 |
# If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
|
1131 |
-
if latest_batch_completed >= 1 or candidate_topics:
|
1132 |
|
1133 |
#print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
|
1134 |
|
@@ -1141,11 +1151,14 @@ def extract_topics(in_data_file,
|
|
1141 |
else:
|
1142 |
print("Using local model:", model_choice)
|
1143 |
|
1144 |
-
|
|
|
|
|
1145 |
# 'Zero shot topics' are those supplied by the user
|
1146 |
max_topic_no = 120
|
1147 |
|
1148 |
zero_shot_topics = read_file(candidate_topics.name)
|
|
|
1149 |
if zero_shot_topics.shape[1] == 1: # Check if there is only one column
|
1150 |
zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
|
1151 |
# Max 120 topics allowed
|
@@ -1156,55 +1169,99 @@ def extract_topics(in_data_file,
|
|
1156 |
zero_shot_topics_list = list(zero_shot_topics_series)
|
1157 |
|
1158 |
print("Zero shot topics are:", zero_shot_topics_list)
|
1159 |
-
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
|
1165 |
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1166 |
-
|
1167 |
-
zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
|
1168 |
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1169 |
-
|
|
|
1170 |
|
1171 |
-
|
|
|
1172 |
# Max 120 topics allowed
|
1173 |
if zero_shot_topics.shape[0] > max_topic_no:
|
1174 |
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1175 |
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
|
1176 |
|
1177 |
if existing_unique_topics_df.empty:
|
1178 |
-
existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1]
|
|
|
|
|
1179 |
|
1180 |
-
|
|
|
|
|
1181 |
|
1182 |
#existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
|
1183 |
|
1184 |
#all_topic_tables_df_merged = existing_unique_topics_df
|
1185 |
existing_unique_topics_df["Response References"] = ""
|
1186 |
|
1187 |
-
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"
|
1188 |
|
1189 |
-
#existing_unique_topics_df.to_csv(output_folder + f"{file_name}
|
1190 |
|
1191 |
# Format the summary prompt with the response table and topics
|
1192 |
-
|
|
|
|
|
1193 |
|
1194 |
if model_choice == "gemma_2b_it_local":
|
1195 |
-
|
1196 |
-
|
1197 |
-
|
1198 |
-
|
1199 |
|
|
|
1200 |
|
1201 |
# Define the output file path for the formatted prompt
|
1202 |
-
formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1203 |
|
1204 |
# Write the formatted prompt to the specified file
|
1205 |
try:
|
1206 |
with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1207 |
-
f.write(
|
1208 |
except Exception as e:
|
1209 |
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1210 |
|
@@ -1216,7 +1273,7 @@ def extract_topics(in_data_file,
|
|
1216 |
summary_whole_conversation = []
|
1217 |
|
1218 |
# Process requests to large language model
|
1219 |
-
master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1220 |
|
1221 |
# print("master_summary_response:", master_summary_response[-1].text)
|
1222 |
# print("Whole conversation metadata:", whole_conversation_metadata)
|
@@ -1253,24 +1310,13 @@ def extract_topics(in_data_file,
|
|
1253 |
|
1254 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1255 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
|
|
1256 |
|
|
|
|
|
1257 |
|
1258 |
-
|
1259 |
-
|
1260 |
-
# new_final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + #model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1261 |
-
|
1262 |
-
# with open(new_final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1263 |
-
# f.write(display_table)
|
1264 |
-
|
1265 |
-
# log_files_output_paths.append(new_final_table_output_path)
|
1266 |
-
|
1267 |
-
#except Exception as e:
|
1268 |
-
# print(e)
|
1269 |
-
|
1270 |
-
latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
|
1271 |
-
|
1272 |
-
out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
|
1273 |
-
log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
|
1274 |
|
1275 |
print("out_file_paths at end of loop:", out_file_paths)
|
1276 |
|
@@ -1285,7 +1331,9 @@ def extract_topics(in_data_file,
|
|
1285 |
else:
|
1286 |
print("Using AWS Bedrock model:", model_choice)
|
1287 |
|
1288 |
-
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table
|
|
|
|
|
1289 |
|
1290 |
if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
|
1291 |
else: formatted_prompt2 = prompt2
|
@@ -1294,21 +1342,16 @@ def extract_topics(in_data_file,
|
|
1294 |
else: formatted_prompt3 = prompt3
|
1295 |
|
1296 |
if model_choice == "gemma_2b_it_local":
|
1297 |
-
|
1298 |
-
|
1299 |
-
|
1300 |
-
# formatted_prompt3 = llama_prefix + formatted_prompt3 + llama_suffix
|
1301 |
-
|
1302 |
-
formatted_initial_table_prompt = llama_prefix + system_prompt + formatted_initial_table_prompt + llama_suffix
|
1303 |
-
formatted_prompt2 = llama_prefix + system_prompt + formatted_prompt2 + llama_suffix
|
1304 |
-
formatted_prompt3 = llama_prefix + system_prompt + formatted_prompt3 + llama_suffix
|
1305 |
|
1306 |
batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
|
1307 |
|
1308 |
-
whole_conversation = [
|
1309 |
|
1310 |
# Process requests to large language model
|
1311 |
-
responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)
|
1312 |
|
1313 |
# print("Whole conversation metadata before:", whole_conversation_metadata)
|
1314 |
|
@@ -1358,8 +1401,6 @@ def extract_topics(in_data_file,
|
|
1358 |
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1359 |
f.write(responses[-1].text)
|
1360 |
display_table = responses[-1].text
|
1361 |
-
|
1362 |
-
|
1363 |
|
1364 |
log_files_output_paths.append(final_table_output_path)
|
1365 |
|
@@ -1370,11 +1411,11 @@ def extract_topics(in_data_file,
|
|
1370 |
new_reference_df = reference_df
|
1371 |
|
1372 |
else:
|
1373 |
-
print("Current batch of responses contains no text, moving onto next. Batch number:", latest_batch_completed, ". Start row:", start_row, ". End row:", end_row)
|
1374 |
|
1375 |
# Increase latest file completed count unless we are at the last file
|
1376 |
if latest_batch_completed != num_batches:
|
1377 |
-
print("Completed batch number:", str(
|
1378 |
latest_batch_completed += 1
|
1379 |
|
1380 |
toc = time.perf_counter()
|
@@ -1391,17 +1432,16 @@ def extract_topics(in_data_file,
|
|
1391 |
existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
|
1392 |
existing_topics_table = new_topic_df.dropna(how='all')
|
1393 |
|
1394 |
-
out_time = f"
|
1395 |
-
print(out_time)
|
1396 |
|
1397 |
out_message.append('All queries successfully completed in')
|
1398 |
|
1399 |
final_message_out = '\n'.join(out_message)
|
1400 |
-
final_message_out = final_message_out + " " + out_time
|
1401 |
|
1402 |
-
final_message_out
|
1403 |
|
1404 |
-
return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
|
1405 |
|
1406 |
# SUMMARISATION FUNCTIONS
|
1407 |
|
@@ -1463,7 +1503,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
|
1463 |
|
1464 |
reference_df_unique = reference_df.drop_duplicates("old_category")
|
1465 |
|
1466 |
-
reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)
|
1467 |
|
1468 |
# Deduplicate categories within each sentiment group
|
1469 |
deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
|
@@ -1558,7 +1598,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
|
|
1558 |
whole_conversation = [summarise_topic_descriptions_system_prompt]
|
1559 |
|
1560 |
# Process requests to large language model
|
1561 |
-
responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)
|
1562 |
|
1563 |
print("Finished summary query")
|
1564 |
|
@@ -1569,8 +1609,6 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
|
|
1569 |
else:
|
1570 |
response_texts = [resp.text for resp in responses]
|
1571 |
|
1572 |
-
|
1573 |
-
|
1574 |
latest_response_text = response_texts[-1]
|
1575 |
|
1576 |
#print("latest_response_text:", latest_response_text)
|
@@ -1597,6 +1635,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1597 |
Create better summaries of the raw batch-level summaries created in the first run of the model.
|
1598 |
'''
|
1599 |
out_metadata = []
|
|
|
1600 |
|
1601 |
print("In summarise_output_topics function.")
|
1602 |
|
@@ -1672,7 +1711,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1672 |
print("Current summary number is:", summary_no)
|
1673 |
|
1674 |
summary_text = all_summaries[summary_no]
|
1675 |
-
print("summary_text:", summary_text)
|
1676 |
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
|
1677 |
|
1678 |
try:
|
@@ -1696,7 +1735,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1696 |
time_taken = tic - toc
|
1697 |
|
1698 |
if time_taken > max_time_for_loop:
|
1699 |
-
print("Time taken for loop is greater than maximum time allowed.")
|
1700 |
summary_loop.close()
|
1701 |
tqdm._instances.clear()
|
1702 |
break
|
|
|
 import time
 import boto3
 import json
+import math
 import string
 import re
 import spaces

 GradioFileData = gr.FileData

+from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
 from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

    try:
        file_data, file_name = load_in_file(file_paths[0], colname=in_colnames)
+       num_batches = math.ceil(len(file_data) / batch_size)
        print("Total number of batches:", num_batches)

    except Exception as e:

        ~(simple_file["Response"] == " ") &\
        ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]

+   #simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
+   #simple_file.to_csv(simplified_csv_table_path, index=None)

    simple_markdown_table = simple_file.to_markdown(index=None)
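The load_in_data_file change above makes the batch count an explicit ceiling division; for example, with illustrative numbers:

import math

# 2,532 responses split into batches of 50 gives ceil(2532 / 50) = 51 batches;
# the first 50 batches hold 50 rows each and the last batch holds the remaining 32.
num_batches = math.ceil(2532 / 50)
print(num_batches)  # 51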
     response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)

     if isinstance(response, ResponseObject):
+        response_text = response.text
     elif 'choices' in response:
+        response_text = response['choices'][0]['text']
     else:
+        response_text = response.text
+
+    responses.append(response)
+    whole_conversation.append(prompt)
+    whole_conversation.append(response_text)

     # Create conversation metadata
     if master == False:

     whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))

+    return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
### INITIAL TOPIC MODEL DEVELOPMENT FUNCTIONS
|
526 |
|
|
|
628 |
|
629 |
return out_unique_topics_df
|
630 |
|
+# Convert output table to markdown and then to a pandas dataframe to csv
+def remove_before_last_term(input_string: str) -> str:
+    # Use regex to find the last occurrence of the term
+    match = re.search(r'(\| ?General Topic)', input_string)
+    if match:
+        # Find the last occurrence by using rfind
+        last_index = input_string.rfind(match.group(0))
+        return input_string[last_index:] # Return everything from the last match onward
+    return input_string # Return the original string if the term is not found
+
+def convert_response_text_to_markdown_table(response_text:str, table_type:str = "Main table"):
+    is_error = False
+    start_of_table_response = remove_before_last_term(response_text)
+    cleaned_response = clean_markdown_table(start_of_table_response)
+
+    markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
+
+    # Remove <p> tags and make sure it has a valid HTML structure
+    html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
+    html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
+
+    # Now ensure that the HTML structure is correct
+    if table_type == "Main table":
+        if "<table>" not in html_table:
+            html_table = f"""
+            <table>
+                <tr>
+                    <th>General Topic</th>
+                    <th>Subtopic</th>
+                    <th>Sentiment</th>
+                    <th>Response References</th>
+                    <th>Summary</th>
+                </tr>
+                {html_table}
+            </table>
+            """
+    elif table_type == "Revised topics table":
+        if "<table>" not in html_table:
+            html_table = f"""
+            <table>
+                <tr>
+                    <th>General Topic</th>
+                    <th>Subtopic</th>
+                </tr>
+                {html_table}
+            </table>
+            """
+
+    html_buffer = StringIO(html_table)
+
+    try:
+        out_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
+    except Exception as e:
+        print("Error when trying to parse table:", e)
+        is_error = True
+        raise ValueError()
+        return pd.DataFrame(), is_error
+
+    return out_df, is_error
|
693 |
whole_conversation: List[str],
|
|
|
764 |
|
765 |
#log_files_output_paths.append(whole_conversation_path)
|
766 |
log_files_output_paths.append(whole_conversation_path_meta)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
767 |
|
768 |
+
if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
|
769 |
+
elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
|
770 |
+
else: response_text = responses[-1].text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
771 |
|
772 |
+
# Convert response text to a markdown table
|
773 |
try:
|
774 |
+
topic_with_response_df, is_error = convert_response_text_to_markdown_table(response_text)
|
775 |
except Exception as e:
|
776 |
+
print("Error in parsing markdown table from response text:", e)
|
|
|
|
|
777 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
778 |
|
|
|
779 |
# Rename columns to ensure consistent use of data frames later in code
|
780 |
topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
|
781 |
|
|
|
893 |
temperature:float,
|
894 |
chosen_cols:List[str],
|
895 |
model_choice:str,
|
896 |
+
candidate_topics: GradioFileData = None,
|
897 |
latest_batch_completed:int=0,
|
898 |
out_message:List=[],
|
899 |
out_file_paths:List = [],
|
|
|
912 |
time_taken:float = 0,
|
913 |
max_tokens:int=max_tokens,
|
914 |
model_name_map:dict=model_name_map,
|
915 |
+
max_time_for_loop:int=max_time_for_loop,
|
916 |
progress=Progress(track_tqdm=True)):
|
917 |
|
918 |
'''
|
919 |
+
Query an LLM (local, (Gemma 2B Instruct, Gemini or Anthropic-based on AWS) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
|
920 |
|
921 |
Parameters:
|
922 |
- in_data_file (gr.File): Gradio file object containing input data
|
|
|
960 |
final_time = 0.0
|
961 |
whole_conversation_metadata = []
|
962 |
is_error = False
|
963 |
+
create_revised_general_topics = False
|
964 |
+
local_model = []
|
965 |
+
tokenizer = []
|
966 |
+
zero_shot_topics_df = pd.DataFrame()
|
967 |
#llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
|
968 |
#llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
|
969 |
+
#llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
|
970 |
+
#llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
|
971 |
+
#llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
|
972 |
+
#llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
|
973 |
+
llama_cpp_prefix = "<start_of_turn>user\n"
|
974 |
+
llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
|
975 |
|
976 |
# Reset output files on each run:
|
977 |
# out_file_paths = []
|
|
|
997 |
|
998 |
# If this is the first time around, set variables to 0/blank
|
999 |
if first_loop_state==True:
|
1000 |
+
print("This is the first time through the loop")
|
1001 |
if (latest_batch_completed == 999) | (latest_batch_completed == 0):
|
1002 |
latest_batch_completed = 0
|
1003 |
out_message = []
|
|
|
1009 |
local_model, tokenizer = load_model()
|
1010 |
print("Local model loaded:", local_model)
|
1011 |
|
1012 |
+
print("latest_batch_completed at start of function:", str(latest_batch_completed))
|
1013 |
+
print("total number of batches:", str(num_batches))
|
1014 |
|
1015 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
1016 |
if latest_batch_completed >= num_batches:
|
|
|
1082 |
log_files_output_paths.append(missing_df_out_path)
|
1083 |
|
1084 |
out_file_paths = list(set(out_file_paths))
|
1085 |
+
log_files_output_paths = list(set(log_files_output_paths))
|
1086 |
|
1087 |
+
summary_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
|
1088 |
+
print("summary_out_file_paths:", summary_out_file_paths)
|
1089 |
|
1090 |
#final_out_message = '\n'.join(out_message)
|
1091 |
+
return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, summary_out_file_paths
|
1092 |
+
|
|
|
1093 |
|
1094 |
if num_batches > 0:
|
1095 |
progress_measure = round(latest_batch_completed / num_batches, 1)
|
|
|
1104 |
|
1105 |
if not out_file_paths:
|
1106 |
out_file_paths = []
|
1107 |
+
|
|
|
1108 |
|
1109 |
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
1110 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
|
|
1115 |
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
1116 |
|
1117 |
|
|
|
1118 |
for i in topics_loop:
|
|
|
1119 |
#for latest_batch_completed in range(num_batches):
|
1120 |
reported_batch_no = latest_batch_completed + 1
|
1121 |
print("Running query batch", str(reported_batch_no))
|
|
|
1133 |
# If the latest batch of responses contains at least one instance of text
|
1134 |
if not simple_table_df.empty:
|
1135 |
|
|
|
1136 |
print("latest_batch_completed:", latest_batch_completed)
|
1137 |
|
1138 |
+
print("candidate_topics:", candidate_topics)
|
1139 |
+
|
1140 |
# If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
|
1141 |
+
if latest_batch_completed >= 1 or candidate_topics is not None:
|
1142 |
|
1143 |
#print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
|
1144 |
|
|
|
1151 |
else:
|
1152 |
print("Using local model:", model_choice)
|
1153 |
|
1154 |
+
# Preparing candidate topics
|
1155 |
+
if candidate_topics and existing_unique_topics_df.empty:
|
1156 |
+
progress(0.1, "Creating revised zero shot topics table")
|
1157 |
# 'Zero shot topics' are those supplied by the user
|
1158 |
max_topic_no = 120
|
1159 |
|
1160 |
zero_shot_topics = read_file(candidate_topics.name)
|
1161 |
+
|
1162 |
if zero_shot_topics.shape[1] == 1: # Check if there is only one column
|
1163 |
zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
|
1164 |
# Max 120 topics allowed
|
|
|
1169 |
zero_shot_topics_list = list(zero_shot_topics_series)
|
1170 |
|
1171 |
print("Zero shot topics are:", zero_shot_topics_list)
|
1172 |
+
|
1173 |
+
if create_revised_general_topics == True:
|
1174 |
+
# Create the most up to date list of topics and subtopics.
|
1175 |
+
# If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
|
1176 |
+
unique_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
|
1177 |
+
unique_topics_markdown = unique_topics_df.to_markdown()
|
1178 |
+
|
1179 |
+
print("unique_topics_markdown:", unique_topics_markdown)
|
1180 |
+
|
1181 |
+
formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1182 |
+
|
1183 |
+
# Format the general_topics prompt with the topics
|
1184 |
+
formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
|
1185 |
+
|
1186 |
+
if model_choice == "gemma_2b_it_local":
|
1187 |
+
formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
|
1188 |
+
|
1189 |
+
formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
|
1190 |
+
|
1191 |
+
whole_conversation = []
|
1192 |
+
|
1193 |
+
general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = process_requests(formatted_general_topics_prompt_list, formatted_general_topics_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1194 |
+
|
1195 |
+
# Convert response text to a markdown table
|
1196 |
+
try:
|
1197 |
+
zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
|
1198 |
+
print("Output revised zero shot topics table is:", zero_shot_topics_df)
|
1199 |
+
|
1200 |
+
zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
|
1201 |
+
zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
|
1202 |
+
out_file_paths.append(zero_shot_revised_path)
|
1203 |
+
except Exception as e:
|
1204 |
+
print("Error in parsing markdown table from response text:", e)
|
1205 |
+
print("Not adding revised General Topics to table")
|
1206 |
+
zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
|
1207 |
+
|
1208 |
+
if zero_shot_topics_df.empty:
|
1209 |
+
print("Creation of revised general topics df failed, reverting to original list")
|
1210 |
+
zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
|
1211 |
+
else:
|
1212 |
+
zero_shot_topics_df = pd.DataFrame(data={"General Topic":[""] * len(zero_shot_topics_list), "Subtopic":zero_shot_topics_list})
|
1213 |
|
1214 |
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1215 |
+
if not existing_unique_topics_df.empty:
|
|
|
1216 |
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1217 |
+
else:
|
1218 |
+
existing_unique_topics_df = zero_shot_topics_df
|
1219 |
|
1220 |
+
# If your zero shot column file already contains General Topic and Subtopic columns
|
1221 |
+
if set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
|
1222 |
# Max 120 topics allowed
|
1223 |
if zero_shot_topics.shape[0] > max_topic_no:
|
1224 |
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1225 |
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
|
1226 |
|
1227 |
if existing_unique_topics_df.empty:
|
1228 |
+
existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1]})
|
1229 |
+
|
1230 |
+
zero_shot_topics_df = zero_shot_topics
|
1231 |
|
1232 |
+
if candidate_topics and not zero_shot_topics_df.empty:
|
1233 |
+
# If you have already created revised zero shot topics, concat to the current
|
1234 |
+
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df])
|
1235 |
|
1236 |
#existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
|
1237 |
|
1238 |
#all_topic_tables_df_merged = existing_unique_topics_df
|
1239 |
existing_unique_topics_df["Response References"] = ""
|
1240 |
|
1241 |
+
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
|
1242 |
|
1243 |
+
#existing_unique_topics_df.to_csv(output_folder + f"{file_name}_existing_unique_topics_df_" + #model_choice_clean + "_temp_" + str(temperature) + "_batch_" + str(latest_batch_completed) + ".csv", index=None)
|
1244 |
|
1245 |
# Format the summary prompt with the response table and topics
|
1246 |
+
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1247 |
+
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
1248 |
+
|
1249 |
|
1250 |
if model_choice == "gemma_2b_it_local":
|
1251 |
+
formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
|
1252 |
+
full_prompt = formatted_summary_prompt
|
1253 |
+
else:
|
1254 |
+
full_prompt = formatted_system_prompt + formatted_summary_prompt
|
1255 |
|
1256 |
+
#latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
|
1257 |
|
1258 |
# Define the output file path for the formatted prompt
|
1259 |
+
formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1260 |
|
1261 |
# Write the formatted prompt to the specified file
|
1262 |
try:
|
1263 |
with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1264 |
+
f.write(full_prompt)
|
1265 |
except Exception as e:
|
1266 |
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1267 |
|
|
|
1273 |
summary_whole_conversation = []
|
1274 |
|
1275 |
# Process requests to large language model
|
1276 |
+
master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1277 |
|
1278 |
# print("master_summary_response:", master_summary_response[-1].text)
|
1279 |
# print("Whole conversation metadata:", whole_conversation_metadata)
|
|
|
1310 |
|
1311 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1312 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
1313 |
+
|
1314 |
|
1315 |
+
#out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
|
1316 |
+
#log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
|
1317 |
|
1318 |
+
out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
|
1319 |
+
log_files_output_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
|
|

print("out_file_paths at end of loop:", out_file_paths)

...

else:
    print("Using AWS Bedrock model:", model_choice)

+    formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)
+
+    formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)

    if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
    else: formatted_prompt2 = prompt2

...

    else: formatted_prompt3 = prompt3

    if model_choice == "gemma_2b_it_local":
+        formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
+        formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
+        formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix

    batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests

+    whole_conversation = [formatted_initial_table_system_prompt]

    # Process requests to large language model
+    responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)

    # print("Whole conversation metadata before:", whole_conversation_metadata)

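For the local gemma_2b_it_local path above, the system prompt and the user prompt are concatenated into one completion-style string and wrapped in llama_cpp_prefix / llama_cpp_suffix, since plain llama-cpp text completion does not apply a chat template for you. A minimal sketch of that wrapping, assuming Gemma-style turn markers (the real prefix and suffix values are defined elsewhere in this repo, and the prompt text here is illustrative):

# Assumed Gemma chat markers; Gemma has no separate system role, so the system
# text is folded into the single user turn.
llama_cpp_prefix = "<start_of_turn>user\n"
llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"

formatted_system_prompt = "You are a researcher analysing open text survey responses."
formatted_summary_prompt = "Summarise the response table shown above."

# Same pattern as in the diff: prefix + system prompt + newline + user prompt + suffix.
full_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
print(full_prompt)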
...

with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
    f.write(responses[-1].text)
display_table = responses[-1].text

log_files_output_paths.append(final_table_output_path)

...

    new_reference_df = reference_df

else:
+    print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)

# Increase latest file completed count unless we are at the last file
if latest_batch_completed != num_batches:
+    print("Completed batch number:", str(reported_batch_no))
    latest_batch_completed += 1

toc = time.perf_counter()

...

existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
existing_topics_table = new_topic_df.dropna(how='all')

+out_time = f"{final_time:0.1f} seconds."

out_message.append('All queries successfully completed in')

final_message_out = '\n'.join(out_message)
+final_message_out = final_message_out + " " + out_time

+print(final_message_out)

+return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths

# SUMMARISATION FUNCTIONS

...

reference_df_unique = reference_df.drop_duplicates("old_category")

+#reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)

# Deduplicate categories within each sentiment group
deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(

...

whole_conversation = [summarise_topic_descriptions_system_prompt]

# Process requests to large language model
+responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)

print("Finished summary query")

...

else:
    response_texts = [resp.text for resp in responses]

latest_response_text = response_texts[-1]

#print("latest_response_text:", latest_response_text)

...

Create better summaries of the raw batch-level summaries created in the first run of the model.
'''
out_metadata = []
+local_model = []

print("In summarise_output_topics function.")

...

print("Current summary number is:", summary_no)

summary_text = all_summaries[summary_no]
+#print("summary_text:", summary_text)
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]

try:

...

time_taken = tic - toc

if time_taken > max_time_for_loop:
+    print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
    summary_loop.close()
    tqdm._instances.clear()
    break
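The summarisation loop above guards against runs that exceed max_time_for_loop, closing the tqdm progress bar and breaking out so the job can presumably be resumed rather than blocking the app. A minimal sketch of that guard, with illustrative variable names and threshold (the real loop state lives in summarise_output_topics):

import time

max_time_for_loop = 99999  # illustrative threshold, in seconds
loop_start = time.perf_counter()

for summary_no in range(10):  # stands in for the loop over all_summaries
    # ... one summarisation request would happen here ...
    time_taken = time.perf_counter() - loop_start
    if time_taken > max_time_for_loop:
        print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
        break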
tools/prompts.py
CHANGED
@@ -1,4 +1,4 @@
-system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called {column_name}. The context of this analysis is
+system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called '{column_name}'. The context of this analysis is '{consultation_context}'."""

initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
{response_table}
@@ -50,6 +50,16 @@ Your task is to make a consolidated summary of the above text. Return a summary
Summary:"""


+create_general_topics_system_prompt = system_prompt
+
+create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
+{topics}
+
+Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
+
+New Topics table:"""
+
+
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
# Summarise the following text in less than {length} words: "{text}"\n
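The new create_general_topics_prompt takes a single {topics} placeholder, so the caller needs to render its subtopics first, for example as a markdown table. A minimal sketch of how these two new prompts might be combined (the DataFrame contents and context string here are illustrative, not taken from llm_api_call.py):

import pandas as pd

from tools.prompts import create_general_topics_prompt, create_general_topics_system_prompt

# Illustrative subtopics; in the app these would come from the extracted topics table.
subtopics_df = pd.DataFrame({"Subtopic": ["Bus frequency", "Cycle lane safety", "Ticket prices"]})
topics_markdown = subtopics_df.to_markdown(index=False)  # requires the tabulate package

formatted_system_prompt = create_general_topics_system_prompt.format(
    consultation_context="a local transport consultation",
    column_name="Response",
)
formatted_general_topics_prompt = create_general_topics_prompt.format(topics=topics_markdown)

# Non-local models receive the system prompt and task prompt concatenated, mirroring the pattern above.
full_prompt = formatted_system_prompt + formatted_general_topics_prompt
print(full_prompt)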