Commit b7f4700
Parent(s): c978ec5

Added support for using local models (specifically Gemma 2b) for topic extraction and summary. Generally improved output format safeguards.

Files changed:
- Dockerfile +5 -2
- README.md +2 -2
- app.py +98 -13
- requirements.txt +6 -1
- requirements_cpu.txt +17 -0
- tools/chatfuncs.py +166 -0
- tools/helper_functions.py +11 -5
- tools/llm_api_call.py +250 -84
- tools/prompts.py +45 -24
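
The local-model path added here is opt-in. Below is a minimal sketch (not part of the commit) of how it would be switched on, using only the environment variables that the changed files read: RUN_LOCAL_MODEL in tools/helper_functions.py, and REPO_ID, MODEL_FILE and GRADIO_SERVER_PORT in app.py.

# Illustrative sketch only; these are the environment variables read by the files changed below.
import os

os.environ["RUN_LOCAL_MODEL"] = "1"                              # adds "gemma_2b_it_local" to the model list and loads it at startup
os.environ["REPO_ID"] = "lmstudio-community/gemma-2-2b-it-GGUF"  # GGUF repo on the Hugging Face Hub (default in app.py's get_model_path)
os.environ["MODEL_FILE"] = "gemma-2-2b-it-Q8_0.gguf"             # quantised model file within that repo
os.environ["GRADIO_SERVER_PORT"] = "7861"                        # port default introduced in this commit

# With these set before app.py runs, load_model() downloads the GGUF file
# (unless it already exists under model/gemma) and initialises llama-cpp-python.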
Dockerfile
CHANGED
@@ -10,9 +10,12 @@ WORKDIR /src

 COPY requirements.txt .

-RUN pip …
+RUN pip uninstall -y typing_extensions \
+    && pip install --no-cache-dir --target=/install typing_extensions==4.12.2 \
+    && pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir --target=/install -r requirements_cpu.txt

-RUN rm …
+RUN rm requirements_cpu.txt

 # Stage 2: Final runtime image
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
README.md
CHANGED
@@ -1,9 +1,9 @@
 ---
-title: Large language model topic
+title: Large language model topic modelling
 emoji: π
 colorFrom: purple
 colorTo: yellow
-sdk: …
+sdk: 5.8.0
 app_file: app.py
 pinned: true
 license: cc-by-nc-4.0
app.py
CHANGED
@@ -1,15 +1,20 @@
 import os
 import socket
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
 from tools.aws_functions import upload_file_to_s3
-from tools.llm_api_call import …
+from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
 import pandas as pd
-…
+import tools.chatfuncs as chatf
+from tools.chatfuncs import llama_cpp_init_config_gpu, llama_cpp_init_config_cpu
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+from torch import cuda, backends
 from datetime import datetime
+
 today_rev = datetime.now().strftime("%Y%m%d")

 ensure_output_folder_exists()

@@ -20,7 +25,87 @@ access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'

-…
+###
+# Load local model
+###
+
+# Check for torch cuda
+print("Is CUDA enabled? ", cuda.is_available())
+print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+if cuda.is_available():
+    torch_device = "cuda"
+    os.system("nvidia-smi")
+else:
+    torch_device = "cpu"
+
+print("Device used is: ", torch_device)
+
+def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=chatf.gpu_config, cpu_config:llama_cpp_init_config_cpu=chatf.cpu_config, torch_device:str=chatf.torch_device):
+    '''
+    Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
+    '''
+    print("Loading model ", local_model_type)
+
+    if local_model_type == "Gemma 2b":
+        if torch_device == "cuda":
+            gpu_config.update_gpu(gpu_layers)
+            gpu_config.update_context(max_context_length)
+            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+        else:
+            gpu_config.update_gpu(gpu_layers)
+            cpu_config.update_gpu(gpu_layers)
+
+            # Update context length according to slider
+            gpu_config.update_context(max_context_length)
+            cpu_config.update_context(max_context_length)
+
+            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+
+        #print(vars(gpu_config))
+        #print(vars(cpu_config))
+
+        def get_model_path():
+            repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
+            filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
+            model_dir = "model/gemma" #"model/phi" # Assuming this is your intended directory
+
+            # Construct the expected local path
+            local_path = os.path.join(model_dir, filename)
+
+            if os.path.exists(local_path):
+                print(f"Model already exists at: {local_path}")
+                return local_path
+            else:
+                print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
+                return hf_hub_download(repo_id=repo_id, filename=filename)
+
+        model_path = get_model_path()
+
+        try:
+            print(vars(gpu_config))
+            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
+
+        except Exception as e:
+            print("GPU load failed")
+            print(e)
+            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
+
+        tokenizer = []
+
+    chatf.model = llama_model
+    chatf.tokenizer = tokenizer
+    chatf.local_model_type = local_model_type
+
+    load_confirmation = "Finished loading model: " + local_model_type
+
+    print(load_confirmation)
+    return local_model_type, load_confirmation, local_model_type
+
+
+# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
+local_model_type = "Gemma 2b"
+if RUN_LOCAL_MODEL == "1":
+    load_model(local_model_type, chatf.gpu_layers, chatf.context_length, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())

@@ -94,7 +179,7 @@ with app:
 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
     candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")

-context_textbox = gr.Textbox(label="Write a short description (one sentence…
+context_textbox = gr.Textbox(label="Write a short description (up to one sentence) giving context to the large language model about the your consultation and any relevant context")

 extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")

@@ -151,7 +236,7 @@ with app:
 Define settings that affect large language model output.
 """)
 with gr.Accordion("Settings for LLM generation", open = True):
-    temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.…
+    temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
     batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0)
     random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

@@ -198,13 +283,13 @@ with app:
 extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
     then(load_in_data_file,
     inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
-    fn=…
-    inputs=[file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-    outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="…
+    fn=extract_topics,
+    inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
+    outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")

 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
-latest_batch_completed.change(fn=…
-    inputs=[file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
+latest_batch_completed.change(fn=extract_topics,
+    inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
     outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files]).\
     then(fn = reveal_feedback_buttons,
     outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

@@ -224,7 +309,7 @@ with app:

 ###
 # LOGGING AND ON APP LOAD FUNCTIONS
-###
+###
 app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

 # Log usernames and times of access to file (to know who is using the app when running on AWS)

@@ -259,7 +344,7 @@ print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
 MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')

-GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '…
+GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7861'))
 print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')

 if __name__ == "__main__":
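
For reference, a minimal standalone sketch of the download-and-load pattern that load_model() above wraps. It assumes the default repo and file names from get_model_path() and that llama-cpp-python is installed; the parameter values mirror the defaults in tools/chatfuncs.py rather than anything mandated by the commit.

# Hedged sketch of the GGUF load path used above; not part of the commit itself.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(repo_id="lmstudio-community/gemma-2-2b-it-GGUF",
                             filename="gemma-2-2b-it-Q8_0.gguf")  # cached locally by huggingface_hub

llm = Llama(model_path=model_path,
            n_ctx=12288,      # matches context_length in tools/chatfuncs.py
            n_gpu_layers=0,   # 0 keeps everything on CPU; -1 offloads all layers when CUDA is available
            seed=42)

out = llm("List three topics mentioned in: 'The park is lovely but the bins are always full.'",
          max_tokens=128, temperature=0.1)
print(out["choices"][0]["text"])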
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.…
+gradio==5.8.0
 boto3==1.35.71
 pyarrow==18.1.0
 openpyxl==3.1.3
@@ -10,3 +10,8 @@ google-generativeai==0.8.3
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.10.1
+torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+transformers==4.47.0
+numpy==1.26.4
+typing_extensions==4.12.2
requirements_cpu.txt
ADDED
@@ -0,0 +1,17 @@
+pandas==2.2.3
+gradio==5.6.0
+boto3==1.35.71
+pyarrow==18.1.0
+openpyxl==3.1.3
+markdown==3.7
+tabulate==0.9.0
+lxml==5.3.0
+google-generativeai==0.8.3
+html5lib==1.1
+beautifulsoup4==4.12.3
+rapidfuzz==3.10.1
+torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+transformers==4.47.0
+numpy==1.26.4
+typing_extensions==4.12.2
tools/chatfuncs.py
ADDED
@@ -0,0 +1,166 @@
+
+from typing import TypeVar
+
+# Model packages
+import torch.cuda
+from transformers import pipeline
+import time
+
+torch.cuda.empty_cache()
+
+PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
+
+model_type = None # global variable setup
+
+full_text = "" # Define dummy source text (full text) just to enable highlight function to load
+
+model = [] # Define empty list for model functions to run
+tokenizer = [] # Define empty list for model functions to run
+
+
+# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
+if torch.cuda.is_available():
+    torch_device = "cuda"
+    gpu_layers = -1
+else:
+    torch_device = "cpu"
+    gpu_layers = 0
+
+print("Running on device:", torch_device)
+threads = torch.get_num_threads() # 8
+print("CPU threads:", threads)
+
+temperature: float = 0.1
+top_k: int = 3
+top_p: float = 1
+repetition_penalty: float = 1.2 # Mild repetition penalty to prevent repeating table rows
+last_n_tokens: int = 512
+max_new_tokens: int = 4096 # 200
+seed: int = 42
+reset: bool = True
+stream: bool = False
+threads: int = threads
+batch_size:int = 256
+context_length:int = 12288
+sample = True
+
+
+class llama_cpp_init_config_gpu:
+    def __init__(self,
+                 last_n_tokens=last_n_tokens,
+                 seed=seed,
+                 n_threads=threads,
+                 n_batch=batch_size,
+                 n_ctx=context_length,
+                 n_gpu_layers=gpu_layers):
+
+        self.last_n_tokens = last_n_tokens
+        self.seed = seed
+        self.n_threads = n_threads
+        self.n_batch = n_batch
+        self.n_ctx = n_ctx
+        self.n_gpu_layers = n_gpu_layers
+        # self.stop: list[str] = field(default_factory=lambda: [stop_string])
+
+    def update_gpu(self, new_value):
+        self.n_gpu_layers = new_value
+
+    def update_context(self, new_value):
+        self.n_ctx = new_value
+
+class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
+    def __init__(self):
+        super().__init__()
+        self.n_gpu_layers = gpu_layers
+        self.n_ctx=context_length
+
+gpu_config = llama_cpp_init_config_gpu()
+cpu_config = llama_cpp_init_config_cpu()
+
+
+class CtransGenGenerationConfig:
+    def __init__(self, temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 repeat_penalty=repetition_penalty,
+                 seed=seed,
+                 stream=stream,
+                 max_tokens=max_new_tokens
+                 ):
+        self.temperature = temperature
+        self.top_k = top_k
+        self.top_p = top_p
+        self.repeat_penalty = repeat_penalty
+        self.seed = seed
+        self.max_tokens=max_tokens
+        self.stream = stream
+
+    def update_temp(self, new_value):
+        self.temperature = new_value
+
+
+def llama_cpp_streaming(history, full_prompt, temperature=temperature):
+
+    gen_config = CtransGenGenerationConfig()
+    gen_config.update_temp(temperature)
+
+    print(vars(gen_config))
+
+    # Pull the generated text from the streamer, and update the model output.
+    start = time.time()
+    NUM_TOKENS=0
+    print('-'*4+'Start Generation'+'-'*4)
+
+    output = model(
+    full_prompt, **vars(gen_config))
+
+    history[-1][1] = ""
+    for out in output:
+
+        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
+            history[-1][1] += out["choices"][0]["text"]
+            NUM_TOKENS+=1
+            yield history
+        else:
+            print(f"Unexpected output structure: {out}")
+
+    time_generate = time.time() - start
+    print('\n')
+    print('-'*4+'End Generation'+'-'*4)
+    print(f'Num of generated tokens: {NUM_TOKENS}')
+    print(f'Time for complete generation: {time_generate}s')
+    print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
+    print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
+
+
+def call_llama_cpp_model(formatted_string, gen_config):
+    """
+    Calls your generation model with parameters from the CtransGenGenerationConfig object.
+
+    Args:
+        formatted_string (str): The formatted input text for the model.
+        gen_config (CtransGenGenerationConfig): An object containing generation parameters.
+    """
+    # Extracting parameters from the gen_config object
+    temperature = gen_config.temperature
+    top_k = gen_config.top_k
+    top_p = gen_config.top_p
+    repeat_penalty = gen_config.repeat_penalty
+    seed = gen_config.seed
+    max_tokens = gen_config.max_tokens
+    stream = gen_config.stream
+
+    # Now you can call your model directly, passing the parameters:
+    output = model(
+        formatted_string,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        repeat_penalty=repeat_penalty,
+        seed=seed,
+        max_tokens=max_tokens,
+        stream=stream#,
+        #stop=["<|eot_id|>", "\n\n"]
+    )
+
+    return output
tools/helper_functions.py
CHANGED
@@ -2,7 +2,6 @@ import os
 import gradio as gr
 import pandas as pd

-
 def empty_output_vars_extract_topics():
     # Empty output objects before processing a new file

@@ -47,12 +46,19 @@ def get_or_create_env_var(var_name, default_value):
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

+RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "0")
+print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')
+
 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-1.5-flash-002", "gemini-1.5-pro-002"]
-    model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro"]
+    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+    model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
 else:
-    model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]
-    model_short_names = ["gemini_flash", "gemini_pro"]
+    model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+    model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]
+
+if RUN_LOCAL_MODEL == "0":
+    model_full_names.remove("gemma_2b_it_local")
+    model_short_names.remove("gemma_local")

 model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
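
A small illustration (not in the commit) of what the mapping above evaluates to with RUN_AWS_FUNCTIONS set to "0" and RUN_LOCAL_MODEL set to "1". Despite the loop variable names, zip(model_full_names, model_short_names) yields (full, short) pairs, so the dictionary is keyed by the full model name; tools/llm_api_call.py relies on this when it looks up model_name_map[model_choice] to build output file names.

# Illustrative only: values mirror tools/helper_functions.py with RUN_AWS_FUNCTIONS="0" and RUN_LOCAL_MODEL="1".
model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]

# zip() pairs (full, short), so the comprehension keys the dict by the full name:
model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
assert model_name_map["gemma_2b_it_local"] == "gemma_local"

model_choice = "gemma_2b_it_local"                   # value passed in from the model dropdown
model_choice_clean = model_name_map[model_choice]    # "gemma_local", used in output file paths in tools/llm_api_call.py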
tools/llm_api_call.py
CHANGED
@@ -15,8 +15,11 @@ from gradio import Progress
|
|
15 |
from typing import List, Tuple
|
16 |
from io import StringIO
|
17 |
|
|
|
|
|
18 |
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
|
19 |
-
from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map
|
|
|
20 |
|
21 |
# ResponseObject class for AWS Bedrock calls
|
22 |
class ResponseObject:
|
@@ -27,8 +30,8 @@ class ResponseObject:
|
|
27 |
max_tokens = 4096
|
28 |
timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
|
29 |
number_of_api_retry_attempts = 5
|
30 |
-
max_time_for_loop =
|
31 |
-
|
32 |
|
33 |
AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
|
34 |
print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
|
@@ -64,7 +67,7 @@ def load_in_file(file_path: str, colname:str=""):
|
|
64 |
|
65 |
file_data[colname] = file_data[colname].astype(str).str.replace("\bnan\b", "", regex=True)
|
66 |
|
67 |
-
print(file_data[colname])
|
68 |
|
69 |
return file_data, file_name
|
70 |
|
@@ -172,16 +175,24 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
|
|
172 |
|
173 |
simple_file = simple_file[start_row:end_row] # Select the current batch
|
174 |
|
175 |
-
|
|
|
|
|
|
|
176 |
|
177 |
# Remove problematic characters including ASCII and various quote marks
|
178 |
# Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
|
179 |
simple_file["Response"] = simple_file["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
|
180 |
simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
|
181 |
simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
|
|
|
|
|
182 |
|
183 |
# Remove blank and extremely short responses
|
184 |
-
simple_file = simple_file.loc[~(simple_file["Response"].isnull())
|
|
|
|
|
|
|
185 |
|
186 |
simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
|
187 |
simple_file.to_csv(simplified_csv_table_path, index=None)
|
@@ -353,7 +364,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
|
|
353 |
|
354 |
for i in progress_bar:
|
355 |
try:
|
356 |
-
print("Calling Gemini model")
|
357 |
#print("full_prompt:", full_prompt)
|
358 |
#print("generation_config:", config)
|
359 |
|
@@ -372,10 +383,10 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
|
|
372 |
|
373 |
if i == number_of_api_retry_attempts:
|
374 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
375 |
-
|
376 |
for i in progress_bar:
|
377 |
try:
|
378 |
-
print("Calling AWS Claude model, attempt", i)
|
379 |
response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
|
380 |
|
381 |
#progress_bar.close()
|
@@ -392,11 +403,43 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
|
|
392 |
|
393 |
if i == number_of_api_retry_attempts:
|
394 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
|
397 |
# Update the conversation history with the new prompt and response
|
398 |
conversation_history.append({'role': 'user', 'parts': [prompt]})
|
399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
|
401 |
# Print the updated conversation history
|
402 |
#print("conversation_history:", conversation_history)
|
@@ -433,16 +476,22 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
|
|
433 |
#print("prompt to LLM:", prompt)
|
434 |
|
435 |
response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature)
|
436 |
-
|
437 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
#print("response.usage_metadata:", response.usage_metadata)
|
439 |
#print("Response.text:", response.text)
|
440 |
#print("responses:", responses)
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
whole_conversation.append(prompt)
|
445 |
-
whole_conversation.append(response.text)
|
446 |
|
447 |
# Create conversation metadata
|
448 |
if master == False:
|
@@ -459,12 +508,15 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
|
|
459 |
whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-output-token-count']))
|
460 |
whole_conversation_metadata.append('x-amzn-bedrock-input-token-count:')
|
461 |
whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-input-token-count']))
|
462 |
-
|
463 |
whole_conversation_metadata.append(str(response.usage_metadata))
|
|
|
|
|
464 |
except KeyError as e:
|
465 |
print(f"Key error: {e} - Check the structure of response.usage_metadata")
|
466 |
else:
|
467 |
print("Response is a string object.")
|
|
|
468 |
|
469 |
|
470 |
return responses, conversation_history, whole_conversation, whole_conversation_metadata
|
@@ -494,20 +546,26 @@ def clean_markdown_table(text: str):
|
|
494 |
if buffer:
|
495 |
merged_lines.append(buffer)
|
496 |
|
497 |
-
#
|
498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
result = []
|
|
|
500 |
|
501 |
for line in merged_lines:
|
502 |
# Strip excessive whitespace around pipes
|
503 |
line = re.sub(r'\s*\|\s*', '|', line.strip())
|
504 |
|
505 |
-
# Replace numbers between pipes with commas and a space
|
506 |
-
line = re.sub(r'(?<=\|)(\s*\d+)(,\s*\d+)+(?=\|)', lambda m: ', '.join(m.group(0).split(',')), line)
|
507 |
-
|
508 |
-
# Replace groups of numbers separated by spaces with commas and a space
|
509 |
-
line = re.sub(r'(?<=\|)(\s*\d+)(\s+\d+)+(?=\|)', lambda m: ', '.join(m.group(0).split()), line)
|
510 |
-
|
511 |
# Fix inconsistent number of pipes by adjusting them to match the header
|
512 |
pipe_count = line.count('|')
|
513 |
if pipe_count < header_pipes:
|
@@ -516,12 +574,17 @@ def clean_markdown_table(text: str):
|
|
516 |
# If too many pipes, split line and keep the first `header_pipes` columns
|
517 |
columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
|
518 |
line = '|'.join(columns)
|
|
|
|
|
519 |
|
520 |
result.append(line)
|
521 |
|
522 |
# Join lines back into the cleaned markdown text
|
523 |
cleaned_text = '\n'.join(result)
|
524 |
|
|
|
|
|
|
|
525 |
return cleaned_text
|
526 |
|
527 |
def clean_column_name(column_name, max_length=20):
|
@@ -642,8 +705,23 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
642 |
log_files_output_paths.append(whole_conversation_path_meta)
|
643 |
|
644 |
# Convert output table to markdown and then to a pandas dataframe to csv
|
645 |
-
|
646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
647 |
|
648 |
markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
|
649 |
|
@@ -653,20 +731,24 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
653 |
html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
|
654 |
html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
|
655 |
|
656 |
-
print("html_table:", html_table)
|
657 |
-
|
658 |
# Now ensure that the HTML structure is correct
|
659 |
if "<table>" not in html_table:
|
660 |
html_table = f"""
|
661 |
<table>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
{html_table}
|
663 |
</table>
|
664 |
"""
|
665 |
|
666 |
# print("Markdown table as HTML:", html_table)
|
667 |
|
668 |
-
html_buffer = StringIO(html_table)
|
669 |
-
|
670 |
|
671 |
try:
|
672 |
topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
|
@@ -678,11 +760,16 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
678 |
|
679 |
|
680 |
# Rename columns to ensure consistent use of data frames later in code
|
681 |
-
topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "
|
682 |
|
683 |
# Fill in NA rows with values from above (topics seem to be included only on one row):
|
684 |
topic_with_response_df = topic_with_response_df.ffill()
|
685 |
|
|
|
|
|
|
|
|
|
|
|
686 |
# Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
|
687 |
topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].str.strip().str.lower().str.capitalize()
|
688 |
topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].str.strip().str.lower().str.capitalize()
|
@@ -695,18 +782,32 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
695 |
|
696 |
# Iterate through each row in the original DataFrame
|
697 |
for index, row in topic_with_response_df.iterrows():
|
698 |
-
references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
|
|
|
|
|
|
|
|
|
699 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
700 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
701 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
702 |
-
summary = row.iloc[
|
|
|
|
|
|
|
703 |
|
704 |
summary = row_number_string_start + summary
|
705 |
|
706 |
# Create a new entry for each reference number
|
707 |
for ref in references:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
reference_data.append({
|
709 |
-
'Response References':
|
710 |
'General Topic': topic,
|
711 |
'Subtopic': subtopic,
|
712 |
'Sentiment': sentiment,
|
@@ -716,6 +817,8 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
716 |
|
717 |
# Create a new DataFrame from the reference data
|
718 |
new_reference_df = pd.DataFrame(reference_data)
|
|
|
|
|
719 |
|
720 |
# Append on old reference data
|
721 |
out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
|
@@ -759,7 +862,10 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
759 |
|
760 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
761 |
|
762 |
-
|
|
|
|
|
|
|
763 |
existing_topics_table:pd.DataFrame,
|
764 |
existing_reference_df:pd.DataFrame,
|
765 |
existing_unique_topics_df:pd.DataFrame,
|
@@ -770,7 +876,7 @@ def llm_query(file_data:pd.DataFrame,
|
|
770 |
temperature:float,
|
771 |
chosen_cols:List[str],
|
772 |
model_choice:str,
|
773 |
-
candidate_topics:
|
774 |
latest_batch_completed:int=0,
|
775 |
out_message:List=[],
|
776 |
out_file_paths:List = [],
|
@@ -783,7 +889,7 @@ def llm_query(file_data:pd.DataFrame,
|
|
783 |
system_prompt:str=system_prompt,
|
784 |
add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
|
785 |
add_existing_topics_prompt:str=add_existing_topics_prompt,
|
786 |
-
|
787 |
batch_size:int=50,
|
788 |
context_textbox:str="",
|
789 |
time_taken:float = 0,
|
@@ -796,6 +902,7 @@ def llm_query(file_data:pd.DataFrame,
|
|
796 |
Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
|
797 |
|
798 |
Parameters:
|
|
|
799 |
- file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
|
800 |
- existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
|
801 |
- existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
|
@@ -806,7 +913,7 @@ def llm_query(file_data:pd.DataFrame,
|
|
806 |
- in_api_key (str): The API key for authentication.
|
807 |
- temperature (float): The temperature parameter for the model.
|
808 |
- chosen_cols (List[str]): A list of chosen columns to process.
|
809 |
-
- candidate_topics (
|
810 |
- model_choice (str): The choice of model to use.
|
811 |
- latest_batch_completed (int): The index of the latest file completed.
|
812 |
- out_message (list): A list to store output messages.
|
@@ -835,16 +942,37 @@ def llm_query(file_data:pd.DataFrame,
|
|
835 |
config = ""
|
836 |
final_time = 0.0
|
837 |
whole_conversation_metadata = []
|
838 |
-
#all_topic_tables_df = []
|
839 |
-
#all_markdown_topic_tables = []
|
840 |
is_error = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
841 |
|
842 |
# Reset output files on each run:
|
843 |
# out_file_paths = []
|
844 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
845 |
#model_choice_clean = replace_punctuation_with_underscore(model_choice)
|
846 |
-
model_choice_clean = model_name_map[model_choice]
|
847 |
-
print("model_choice_clean:", model_choice_clean)
|
848 |
|
849 |
# If this is the first time around, set variables to 0/blank
|
850 |
if first_loop_state==True:
|
@@ -852,8 +980,9 @@ def llm_query(file_data:pd.DataFrame,
|
|
852 |
latest_batch_completed = 0
|
853 |
out_message = []
|
854 |
out_file_paths = []
|
|
|
855 |
|
856 |
-
print("latest_batch_completed:", str(latest_batch_completed))
|
857 |
|
858 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
859 |
if latest_batch_completed >= num_batches:
|
@@ -866,7 +995,6 @@ def llm_query(file_data:pd.DataFrame,
|
|
866 |
out_time = f"Everything finished in {final_time} seconds."
|
867 |
print(out_time)
|
868 |
|
869 |
-
|
870 |
print("All summaries completed. Creating outputs.")
|
871 |
|
872 |
model_choice_clean = model_name_map[model_choice]
|
@@ -931,7 +1059,7 @@ def llm_query(file_data:pd.DataFrame,
|
|
931 |
print("out_file_paths:", out_file_paths)
|
932 |
|
933 |
#final_out_message = '\n'.join(out_message)
|
934 |
-
return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
|
935 |
|
936 |
|
937 |
|
@@ -949,18 +1077,14 @@ def llm_query(file_data:pd.DataFrame,
|
|
949 |
if not out_file_paths:
|
950 |
out_file_paths = []
|
951 |
|
952 |
-
|
953 |
-
if file_data.empty:
|
954 |
-
out_message = "Please enter a data file to summarise."
|
955 |
-
print(out_message)
|
956 |
-
return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
|
957 |
|
958 |
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
959 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
960 |
print(out_message)
|
961 |
return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
|
962 |
|
963 |
-
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses).
|
964 |
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
965 |
|
966 |
for i in topics_loop:
|
@@ -994,39 +1118,53 @@ def llm_query(file_data:pd.DataFrame,
|
|
994 |
if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
|
995 |
print("Using Gemini model:", model_choice)
|
996 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
997 |
-
|
998 |
print("Using AWS Bedrock model:", model_choice)
|
|
|
|
|
999 |
|
1000 |
if candidate_topics:
|
1001 |
# 'Zero shot topics' are those supplied by the user
|
1002 |
-
|
1003 |
-
zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
|
1004 |
-
# Max 150 topics allowed
|
1005 |
-
if len(zero_shot_topics_series) > 120:
|
1006 |
-
print("Maximum 120 topics allowed to fit within large language model context limits.")
|
1007 |
-
zero_shot_topics_series = zero_shot_topics_series.iloc[:120]
|
1008 |
-
|
1009 |
-
zero_shot_topics_list = list(zero_shot_topics_series)
|
1010 |
|
1011 |
-
|
1012 |
-
|
1013 |
-
|
1014 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1015 |
|
1016 |
|
1017 |
-
# Create the most up to date list of topics and subtopics.
|
1018 |
-
# If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
|
1019 |
-
if candidate_topics and existing_unique_topics_df.empty:
|
1020 |
-
existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
|
1021 |
-
|
1022 |
-
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1023 |
-
elif candidate_topics and not existing_unique_topics_df.empty:
|
1024 |
-
zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
|
1025 |
-
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1026 |
-
zero_shot_topics_list_str = zero_shot_topics_list
|
1027 |
|
1028 |
#existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
|
1029 |
|
|
|
|
|
1030 |
|
1031 |
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
|
1032 |
|
@@ -1035,6 +1173,13 @@ def llm_query(file_data:pd.DataFrame,
|
|
1035 |
# Format the summary prompt with the response table and topics
|
1036 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
|
1037 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1038 |
# Define the output file path for the formatted prompt
|
1039 |
formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1040 |
|
@@ -1130,7 +1275,17 @@ def llm_query(file_data:pd.DataFrame,
|
|
1130 |
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
|
1131 |
else: formatted_prompt3 = prompt3
|
1132 |
|
1133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1134 |
|
1135 |
whole_conversation = [system_prompt]
|
1136 |
|
@@ -1173,15 +1328,21 @@ def llm_query(file_data:pd.DataFrame,
|
|
1173 |
try:
|
1174 |
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1175 |
|
1176 |
-
|
1177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1178 |
|
1179 |
log_files_output_paths.append(final_table_output_path)
|
1180 |
|
1181 |
except Exception as e:
|
1182 |
print(e)
|
1183 |
-
|
1184 |
-
display_table = responses[-1].text
|
1185 |
new_topic_df = topic_table_df
|
1186 |
new_reference_df = reference_df
|
1187 |
|
@@ -1260,7 +1421,6 @@ def deduplicate_categories(category_series: pd.Series, join_series:pd.Series, th
|
|
1260 |
|
1261 |
return result_df
|
1262 |
|
1263 |
-
|
1264 |
def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
1265 |
unique_topics_df:pd.DataFrame,
|
1266 |
random_seed:int,
|
@@ -1380,7 +1540,11 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
|
|
1380 |
print("Finished summary query")
|
1381 |
|
1382 |
# Extract text from the `responses` list
|
1383 |
-
|
|
|
|
|
|
|
|
|
1384 |
latest_response_text = response_texts[-1]
|
1385 |
|
1386 |
#print("latest_response_text:", latest_response_text)
|
@@ -1482,6 +1646,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1482 |
try:
|
1483 |
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt)
|
1484 |
summarised_output = response
|
|
|
|
|
1485 |
except Exception as e:
|
1486 |
print(e)
|
1487 |
summarised_output = ""
|
|
|
15 |
from typing import List, Tuple
|
16 |
from io import StringIO
|
17 |
|
18 |
+
GradioFileData = gr.FileData
|
19 |
+
|
20 |
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
|
21 |
+
from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
|
22 |
+
from tools.chatfuncs import model, CtransGenGenerationConfig, temperature, context_length, call_llama_cpp_model
|
23 |
|
24 |
# ResponseObject class for AWS Bedrock calls
|
25 |
class ResponseObject:
|
|
|
30 |
max_tokens = 4096
|
31 |
timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
|
32 |
number_of_api_retry_attempts = 5
|
33 |
+
max_time_for_loop = 99999
|
34 |
+
batch_size_default = 5
|
35 |
|
36 |
AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
|
37 |
print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
|
|
|
67 |
|
68 |
file_data[colname] = file_data[colname].astype(str).str.replace("\bnan\b", "", regex=True)
|
69 |
|
70 |
+
#print(file_data[colname])
|
71 |
|
72 |
return file_data, file_name
|
73 |
|
|
|
175 |
|
176 |
simple_file = simple_file[start_row:end_row] # Select the current batch
|
177 |
|
178 |
+
# Now replace the reference numbers with numbers starting from 1
|
179 |
+
simple_file["Reference"] = simple_file["Reference"] - start_row
|
180 |
+
|
181 |
+
#print("simple_file:", simple_file)
|
182 |
|
183 |
# Remove problematic characters including ASCII and various quote marks
|
184 |
# Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
|
185 |
simple_file["Response"] = simple_file["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
|
186 |
simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
|
187 |
simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
|
188 |
+
simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
|
189 |
+
simple_file["Response"] = simple_file["Response"].str.slice(0, 2500) # Maximum 1,500 character responses
|
190 |
|
191 |
# Remove blank and extremely short responses
|
192 |
+
simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\
|
193 |
+
~(simple_file["Response"] == "None") &\
|
194 |
+
~(simple_file["Response"] == " ") &\
|
195 |
+
~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
|
196 |
|
197 |
simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
|
198 |
simple_file.to_csv(simplified_csv_table_path, index=None)
|
|
|
364 |
|
365 |
for i in progress_bar:
|
366 |
try:
|
367 |
+
print("Calling Gemini model, attempt", i + 1)
|
368 |
#print("full_prompt:", full_prompt)
|
369 |
#print("generation_config:", config)
|
370 |
|
|
|
383 |
|
384 |
if i == number_of_api_retry_attempts:
|
385 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
386 |
+
elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
|
387 |
for i in progress_bar:
|
388 |
try:
|
389 |
+
print("Calling AWS Claude model, attempt", i + 1)
|
390 |
response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
|
391 |
|
392 |
#progress_bar.close()
|
|
|
403 |
|
404 |
if i == number_of_api_retry_attempts:
|
405 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
406 |
+
else:
|
407 |
+
# This is the Gemma model
|
408 |
+
for i in progress_bar:
|
409 |
+
try:
|
410 |
+
print("Calling Gemma 2B Instruct model, attempt", i + 1)
|
411 |
+
|
412 |
+
gen_config = CtransGenGenerationConfig()
|
413 |
+
gen_config.update_temp(temperature)
|
414 |
+
|
415 |
+
response = call_llama_cpp_model(prompt, gen_config)
|
416 |
+
|
417 |
+
#progress_bar.close()
|
418 |
+
#tqdm._instances.clear()
|
419 |
+
|
420 |
+
print("Successful call to Gemma model.")
|
421 |
+
print("Response:", response)
|
422 |
+
break
|
423 |
+
except Exception as e:
|
424 |
+
# If fails, try again after X seconds in case there is a throttle limit
|
425 |
+
print("Call to Gemma model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
|
426 |
+
|
427 |
+
time.sleep(timeout_wait)
|
428 |
+
#response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
|
429 |
+
|
430 |
+
if i == number_of_api_retry_attempts:
|
431 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
432 |
|
433 |
# Update the conversation history with the new prompt and response
|
434 |
conversation_history.append({'role': 'user', 'parts': [prompt]})
|
435 |
+
|
436 |
+
# output_str = output['choices'][0]['text']
|
437 |
+
|
438 |
+
# Check if is a LLama.cpp model response
|
439 |
+
if 'choices' in response:
|
440 |
+
conversation_history.append({'role': 'assistant', 'parts': [response['choices'][0]['text']]})
|
441 |
+
else:
|
442 |
+
conversation_history.append({'role': 'assistant', 'parts': [response.text]})
|
443 |
|
444 |
# Print the updated conversation history
|
445 |
#print("conversation_history:", conversation_history)
|
|
|
476 |
#print("prompt to LLM:", prompt)
|
477 |
|
478 |
response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature)
|
479 |
+
|
480 |
+
if 'choices' in response:
|
481 |
+
responses.append(response)
|
482 |
+
|
483 |
+
# Create conversation txt object
|
484 |
+
whole_conversation.append(prompt)
|
485 |
+
whole_conversation.append(response['choices'][0]['text'])
|
486 |
+
|
487 |
+
else:
|
488 |
+
responses.append(response)
|
489 |
#print("response.usage_metadata:", response.usage_metadata)
|
490 |
#print("Response.text:", response.text)
|
491 |
#print("responses:", responses)
|
492 |
+
# Create conversation txt object
|
493 |
+
whole_conversation.append(prompt)
|
494 |
+
whole_conversation.append(response.text)
|
|
|
|
|
495 |
|
496 |
# Create conversation metadata
|
497 |
if master == False:
|
|
|
508 |
whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-output-token-count']))
|
509 |
whole_conversation_metadata.append('x-amzn-bedrock-input-token-count:')
|
510 |
whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-input-token-count']))
|
511 |
+
elif "gemini" in model_choice:
|
512 |
whole_conversation_metadata.append(str(response.usage_metadata))
|
513 |
+
else:
|
514 |
+
whole_conversation_metadata.append(str(response['usage']))
|
515 |
except KeyError as e:
|
516 |
print(f"Key error: {e} - Check the structure of response.usage_metadata")
|
517 |
else:
|
518 |
print("Response is a string object.")
|
519 |
+
whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
|
520 |
|
521 |
|
522 |
return responses, conversation_history, whole_conversation, whole_conversation_metadata
|
|
|
546 |
if buffer:
|
547 |
merged_lines.append(buffer)
|
548 |
|
549 |
+
# Fix the header separator row if necessary
|
550 |
+
if len(merged_lines) > 1:
|
551 |
+
header_pipes = merged_lines[0].count('|') # Count pipes in the header row
|
552 |
+
header_separator = '|' + '---|' * (header_pipes - 1) # Generate a separator with one cell per column
|
553 |
+
|
554 |
+
# Replace or insert the separator row
|
555 |
+
if not re.match(r'^\|[-:|]+$', merged_lines[1]): # Check if the second row is a valid separator
|
556 |
+
merged_lines.insert(1, header_separator)
|
557 |
+
else:
|
558 |
+
# Adjust the separator to match the header pipes
|
559 |
+
merged_lines[1] = '|' + '---|' * (header_pipes - 1)
|
560 |
+
|
561 |
+
# Ensure consistent number of pipes in each row
|
562 |
result = []
|
563 |
+
header_pipes = merged_lines[0].count('|') # Use the header row to count the number of pipes
|
564 |
|
565 |
for line in merged_lines:
|
566 |
# Strip excessive whitespace around pipes
|
567 |
line = re.sub(r'\s*\|\s*', '|', line.strip())
|
568 |
|
569 |
# Fix inconsistent number of pipes by adjusting them to match the header
|
570 |
pipe_count = line.count('|')
|
571 |
if pipe_count < header_pipes:
|
|
|
574 |
# If too many pipes, split line and keep the first `header_pipes` columns
|
575 |
columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
|
576 |
line = '|'.join(columns)
|
577 |
+
|
578 |
+
line = re.sub(r'(\d),(?=\d)', r'\1, ', line)
|
579 |
|
580 |
result.append(line)
|
581 |
|
582 |
# Join lines back into the cleaned markdown text
|
583 |
cleaned_text = '\n'.join(result)
|
584 |
|
585 |
+
# Replace numbers next to commas and other numbers with a space
|
586 |
+
|
587 |
+
|
588 |
return cleaned_text
|
589 |
|
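A self-contained sketch of the separator repair that `clean_markdown_table` performs above, run on a deliberately malformed table; the helper name and sample text are illustrative, and the real function additionally merges wrapped lines and pads or trims cells to match the header:

```python
import re

def fix_separator_row(table_text: str) -> str:
    """Ensure the second row of a markdown table is a separator matching the header."""
    lines = [line.strip() for line in table_text.strip().split("\n") if line.strip()]
    header_pipes = lines[0].count("|")             # e.g. 6 pipes -> 5 columns
    separator = "|" + "---|" * (header_pipes - 1)  # one '---' cell per column
    if len(lines) > 1 and re.match(r"^\|[-:|]+$", lines[1]):
        lines[1] = separator                       # replace a malformed separator row
    else:
        lines.insert(1, separator)                 # or insert one if it is missing
    return "\n".join(lines)

malformed = ("| General Topic | Subtopic | Sentiment | Response References | Summary |\n"
             "| Parking | Cost | Negative | 1, 3 | Fees are seen as too high |")
print(fix_separator_row(malformed))
```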
590 |
def clean_column_name(column_name, max_length=20):
|
|
|
705 |
log_files_output_paths.append(whole_conversation_path_meta)
|
706 |
|
707 |
# Convert output table to markdown and then to a pandas dataframe to csv
|
708 |
+
def remove_before_last_term(input_string: str) -> str:
|
709 |
+
# Use regex to find the last occurrence of the term
|
710 |
+
match = re.search(r'(\| ?General Topic)', input_string)
|
711 |
+
if match:
|
712 |
+
# Find the last occurrence by using rfind
|
713 |
+
last_index = input_string.rfind(match.group(0))
|
714 |
+
return input_string[last_index:] # Return everything from the last match onward
|
715 |
+
return input_string # Return the original string if the term is not found
|
716 |
+
|
717 |
+
if "choices" in responses[-1]:
|
718 |
+
print("Text response:", responses[-1]["choices"][0]['text'])
|
719 |
+
start_of_table_response = remove_before_last_term(responses[-1]["choices"][0]['text'])
|
720 |
+
cleaned_response = clean_markdown_table(start_of_table_response)
|
721 |
+
print("cleaned_response:", cleaned_response)
|
722 |
+
else:
|
723 |
+
start_of_table_response = remove_before_last_term(responses[-1].text)
|
724 |
+
cleaned_response = clean_markdown_table(start_of_table_response)
|
725 |
|
726 |
markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
|
727 |
|
|
|
731 |
html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
|
732 |
html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
|
733 |
|
|
|
|
|
734 |
# Now ensure that the HTML structure is correct
|
735 |
if "<table>" not in html_table:
|
736 |
html_table = f"""
|
737 |
<table>
|
738 |
+
<tr>
|
739 |
+
<th>General Topic</th>
|
740 |
+
<th>Subtopic</th>
|
741 |
+
<th>Sentiment</th>
|
742 |
+
<th>Response References</th>
|
743 |
+
<th>Summary</th>
|
744 |
+
</tr>
|
745 |
{html_table}
|
746 |
</table>
|
747 |
"""
|
748 |
|
749 |
# print("Markdown table as HTML:", html_table)
|
750 |
|
751 |
+
html_buffer = StringIO(html_table)
|
|
|
752 |
|
753 |
try:
|
754 |
topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
|
|
|
760 |
|
761 |
|
762 |
# Rename columns to ensure consistent use of data frames later in code
|
763 |
+
topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
|
764 |
|
765 |
# Fill in NA rows with values from above (topics seem to be included only on one row):
|
766 |
topic_with_response_df = topic_with_response_df.ffill()
|
767 |
|
768 |
+
#print("topic_with_response_df:", topic_with_response_df)
|
769 |
+
|
770 |
+
# For instances where you end up with float values in Response references
|
771 |
+
topic_with_response_df["Response References"] = topic_with_response_df["Response References"].astype(str).str.replace(".0", "", regex=False)
|
772 |
+
|
773 |
# Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
|
774 |
topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].str.strip().str.lower().str.capitalize()
|
775 |
topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].str.strip().str.lower().str.capitalize()
|
|
|
782 |
|
783 |
# Iterate through each row in the original DataFrame
|
784 |
for index, row in topic_with_response_df.iterrows():
|
785 |
+
#references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
|
786 |
+
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
787 |
+
# If no numbers found in the Response References column, check the Summary column in case reference numbers were put there by mistake
|
788 |
+
if not references:
|
789 |
+
references = re.findall(r'\d+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else []
|
790 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
791 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
792 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
793 |
+
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
794 |
+
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
795 |
+
if not summary and len(str(row.iloc[3])) > 30:
|
796 |
+
summary = row.iloc[3]
|
797 |
|
798 |
summary = row_number_string_start + summary
|
799 |
|
800 |
# Create a new entry for each reference number
|
801 |
for ref in references:
|
802 |
+
# Add start_row back onto reference_number
|
803 |
+
try:
|
804 |
+
response_ref_no = str(int(ref) + int(start_row))
|
805 |
+
except ValueError:
|
806 |
+
print("Reference is not a number")
|
807 |
+
continue
|
808 |
+
|
809 |
reference_data.append({
|
810 |
+
'Response References': response_ref_no,
|
811 |
'General Topic': topic,
|
812 |
'Subtopic': subtopic,
|
813 |
'Sentiment': sentiment,
|
|
|
817 |
|
818 |
# Create a new DataFrame from the reference data
|
819 |
new_reference_df = pd.DataFrame(reference_data)
|
820 |
+
|
821 |
+
print("new_reference_df:", new_reference_df)
|
822 |
|
823 |
# Append on old reference data
|
824 |
out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
|
|
|
862 |
|
863 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
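A condensed sketch of the conversion the function above performs once a clean markdown table has been obtained: markdown text to HTML, HTML to a pandas DataFrame, then one output row per response reference. It assumes a pandas HTML parser such as lxml is installed (as in the app's requirements); the sample table is illustrative:

```python
import re
from io import StringIO

import markdown
import pandas as pd

markdown_response = (
    "| General Topic | Subtopic | Sentiment | Response References | Summary |\n"
    "|---|---|---|---|---|\n"
    "| Parking | Cost | Negative | 1, 3 | Fees are seen as too high |"
)

# Markdown -> HTML -> DataFrame, mirroring the steps above
html_table = markdown.markdown(markdown_response, extensions=["tables"])
topic_df = pd.read_html(StringIO(html_table))[0]
topic_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]

# Expand comma-separated reference numbers into one row per reference
reference_rows = []
for _, row in topic_df.iterrows():
    for ref in re.findall(r"\d+", str(row["Response References"])):
        reference_rows.append({"Response References": ref,
                               "General Topic": row["General Topic"],
                               "Subtopic": row["Subtopic"],
                               "Sentiment": row["Sentiment"]})

reference_df = pd.DataFrame(reference_rows)
print(reference_df)
```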
864 |
|
865 |
+
|
866 |
+
|
867 |
+
def extract_topics(in_data_file,
|
868 |
+
file_data:pd.DataFrame,
|
869 |
existing_topics_table:pd.DataFrame,
|
870 |
existing_reference_df:pd.DataFrame,
|
871 |
existing_unique_topics_df:pd.DataFrame,
|
|
|
876 |
temperature:float,
|
877 |
chosen_cols:List[str],
|
878 |
model_choice:str,
|
879 |
+
candidate_topics: GradioFileData = [],
|
880 |
latest_batch_completed:int=0,
|
881 |
out_message:List=[],
|
882 |
out_file_paths:List = [],
|
|
|
889 |
system_prompt:str=system_prompt,
|
890 |
add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
|
891 |
add_existing_topics_prompt:str=add_existing_topics_prompt,
|
892 |
+
number_of_prompts_used:int=1,
|
893 |
batch_size:int=50,
|
894 |
context_textbox:str="",
|
895 |
time_taken:float = 0,
|
|
|
902 |
Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
|
903 |
|
904 |
Parameters:
|
905 |
+
- in_data_file (gr.File): Gradio file object containing input data
|
906 |
- file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
|
907 |
- existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
|
908 |
- existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
|
|
|
913 |
- in_api_key (str): The API key for authentication.
|
914 |
- temperature (float): The temperature parameter for the model.
|
915 |
- chosen_cols (List[str]): A list of chosen columns to process.
|
916 |
+
- candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
|
917 |
- model_choice (str): The choice of model to use.
|
918 |
- latest_batch_completed (int): The index of the latest file completed.
|
919 |
- out_message (list): A list to store output messages.
|
|
|
942 |
config = ""
|
943 |
final_time = 0.0
|
944 |
whole_conversation_metadata = []
|
|
|
|
|
945 |
is_error = False
|
946 |
+
#llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
|
947 |
+
#llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
|
948 |
+
#llama_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
|
949 |
+
#llama_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
|
950 |
+
#llama_prefix = "<|user|>\n" # This is for phi 3.5
|
951 |
+
#llama_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
|
952 |
+
llama_prefix = "<start_of_turn>user\n"
|
953 |
+
llama_suffix = "<end_of_turn>\n<start_of_turn>model\n"
|
954 |
|
955 |
# Reset output files on each run:
|
956 |
# out_file_paths = []
|
957 |
|
958 |
+
# If you have a file input but no file data it hasn't yet been loaded. Load it here.
|
959 |
+
if file_data.empty:
|
960 |
+
print("No data table found, loading from file")
|
961 |
+
try:
|
962 |
+
print("in_data_file:", in_data_file)
|
963 |
+
in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
|
964 |
+
print("in_colnames:", in_colnames_drop)
|
965 |
+
file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
|
966 |
+
print("file_data loaded in:", file_data)
|
967 |
+
except:
|
968 |
+
# Check if files and text exist
|
969 |
+
out_message = "Please enter a data file to summarise."
|
970 |
+
print(out_message)
|
971 |
+
return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
|
972 |
+
|
973 |
+
|
974 |
#model_choice_clean = replace_punctuation_with_underscore(model_choice)
|
975 |
+
model_choice_clean = model_name_map[model_choice]
|
|
|
976 |
|
977 |
# If this is the first time around, set variables to 0/blank
|
978 |
if first_loop_state==True:
|
|
|
980 |
latest_batch_completed = 0
|
981 |
out_message = []
|
982 |
out_file_paths = []
|
983 |
+
print("model_choice_clean:", model_choice_clean)
|
984 |
|
985 |
+
#print("latest_batch_completed:", str(latest_batch_completed))
|
986 |
|
987 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
988 |
if latest_batch_completed >= num_batches:
|
|
|
995 |
out_time = f"Everything finished in {final_time} seconds."
|
996 |
print(out_time)
|
997 |
|
|
|
998 |
print("All summaries completed. Creating outputs.")
|
999 |
|
1000 |
model_choice_clean = model_name_map[model_choice]
|
|
|
1059 |
print("out_file_paths:", out_file_paths)
|
1060 |
|
1061 |
#final_out_message = '\n'.join(out_message)
|
1062 |
+
return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
|
1063 |
|
1064 |
|
1065 |
|
|
|
1077 |
if not out_file_paths:
|
1078 |
out_file_paths = []
|
1079 |
|
1080 |
+
|
|
1081 |
|
1082 |
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
|
1083 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
1084 |
print(out_message)
|
1085 |
return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
|
1086 |
|
1087 |
+
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
1088 |
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
|
1089 |
|
1090 |
for i in topics_loop:
|
|
|
1118 |
if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
|
1119 |
print("Using Gemini model:", model_choice)
|
1120 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
1121 |
+
elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
|
1122 |
print("Using AWS Bedrock model:", model_choice)
|
1123 |
+
else:
|
1124 |
+
print("Using local model:", model_choice)
|
1125 |
|
1126 |
if candidate_topics:
|
1127 |
# 'Zero shot topics' are those supplied by the user
|
1128 |
+
max_topic_no = 120
|
1129 |
|
1130 |
+
zero_shot_topics = read_file(candidate_topics.name)
|
1131 |
+
if zero_shot_topics.shape[1] == 1: # Check if there is only one column
|
1132 |
+
zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
|
1133 |
+
# Max 120 topics allowed
|
1134 |
+
if len(zero_shot_topics_series) > max_topic_no:
|
1135 |
+
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1136 |
+
zero_shot_topics_series = zero_shot_topics_series.iloc[:max_topic_no]
|
1137 |
+
|
1138 |
+
zero_shot_topics_list = list(zero_shot_topics_series)
|
1139 |
+
|
1140 |
+
print("Zero shot topics are:", zero_shot_topics_list)
|
1141 |
+
|
1142 |
+
# Create the most up to date list of topics and subtopics.
|
1143 |
+
# If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
|
1144 |
+
if existing_unique_topics_df.empty:
|
1145 |
+
existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
|
1146 |
+
|
1147 |
+
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1148 |
+
elif not existing_unique_topics_df.empty:
|
1149 |
+
zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
|
1150 |
+
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1151 |
+
zero_shot_topics_list_str = zero_shot_topics_list
|
1152 |
+
|
1153 |
+
elif set(["General Topic", "Subtopic", "Sentiment"]).issubset(zero_shot_topics.columns):
|
1154 |
+
# Max 120 topics allowed
|
1155 |
+
if zero_shot_topics.shape[0] > max_topic_no:
|
1156 |
+
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1157 |
+
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
|
1158 |
+
|
1159 |
+
if existing_unique_topics_df.empty:
|
1160 |
+
existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1], 'Sentiment':zero_shot_topics.iloc[:,2]})
|
1161 |
|
1162 |
|
1163 |
|
1164 |
#existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
|
1165 |
|
1166 |
+
#all_topic_tables_df_merged = existing_unique_topics_df
|
1167 |
+
existing_unique_topics_df["Response References"] = ""
|
1168 |
|
1169 |
unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
|
1170 |
|
|
|
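A short sketch of the zero-shot topic handling added above, assuming a one-column CSV of candidate topics with a header row; `pd.read_csv` stands in for the repo's `read_file` helper and the file name is illustrative:

```python
import pandas as pd

MAX_TOPICS = 120  # cap to stay within the language model's context limits

# Illustrative input: one column of candidate topics with a header row
zero_shot_topics = pd.read_csv("candidate_topics.csv")

if zero_shot_topics.shape[1] == 1:
    topics = (zero_shot_topics.iloc[:, 0]
              .str.strip()
              .str.lower()
              .str.capitalize()
              .iloc[:MAX_TOPICS])
    # Seed the unique topics table so the next prompt sees the full list
    existing_unique_topics_df = pd.DataFrame({"General Topic": "",
                                              "Subtopic": list(topics),
                                              "Sentiment": ""})
    print(existing_unique_topics_df.head())
```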
1173 |
# Format the summary prompt with the response table and topics
|
1174 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
|
1175 |
|
1176 |
+
if model_choice == "gemma_2b_it_local":
|
1177 |
+
# add_existing_topics_system_prompt = llama_system_prefix + add_existing_topics_system_prompt + llama_system_suffix
|
1178 |
+
# formatted_initial_table_prompt = llama_prefix + formatted_summary_prompt + llama_suffix
|
1179 |
+
|
1180 |
+
formatted_initial_table_prompt = llama_prefix + add_existing_topics_system_prompt + formatted_summary_prompt + llama_suffix
|
1181 |
+
|
1182 |
+
|
1183 |
# Define the output file path for the formatted prompt
|
1184 |
formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1185 |
|
|
|
1275 |
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
|
1276 |
else: formatted_prompt3 = prompt3
|
1277 |
|
1278 |
+
if model_choice == "gemma_2b_it_local":
|
1279 |
+
# system_prompt = llama_system_prefix + system_prompt + llama_system_suffix
|
1280 |
+
# formatted_initial_table_prompt = llama_prefix + formatted_initial_table_prompt + llama_suffix
|
1281 |
+
# formatted_prompt2 = llama_prefix + formatted_prompt2 + llama_suffix
|
1282 |
+
# formatted_prompt3 = llama_prefix + formatted_prompt3 + llama_suffix
|
1283 |
+
|
1284 |
+
formatted_initial_table_prompt = llama_prefix + system_prompt + formatted_initial_table_prompt + llama_suffix
|
1285 |
+
formatted_prompt2 = llama_prefix + system_prompt + formatted_prompt2 + llama_suffix
|
1286 |
+
formatted_prompt3 = llama_prefix + system_prompt + formatted_prompt3 + llama_suffix
|
1287 |
+
|
1288 |
+
batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
|
1289 |
|
1290 |
whole_conversation = [system_prompt]
|
1291 |
|
|
|
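A minimal sketch of the Gemma 2 chat-template wrapping added above for the local model; the prompt text is illustrative:

```python
llama_prefix = "<start_of_turn>user\n"
llama_suffix = "<end_of_turn>\n<start_of_turn>model\n"

system_prompt = "You are a researcher analysing responses from an open text dataset."
formatted_prompt = "Create one markdown table of topics for the responses shown above."

# System prompt and task prompt are concatenated inside a single user turn
gemma_prompt = llama_prefix + system_prompt + formatted_prompt + llama_suffix
print(gemma_prompt)
```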
1328 |
try:
|
1329 |
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
1330 |
|
1331 |
+
if "choices" in responses[-1]:
|
1332 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1333 |
+
f.write(responses[-1]["choices"][0]['text'])
|
1334 |
+
display_table =responses[-1]["choices"][0]['text']
|
1335 |
+
|
1336 |
+
else:
|
1337 |
+
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
1338 |
+
f.write(responses[-1].text)
|
1339 |
+
display_table = responses[-1].text
|
1340 |
|
1341 |
log_files_output_paths.append(final_table_output_path)
|
1342 |
|
1343 |
except Exception as e:
|
1344 |
print(e)
|
1345 |
+
|
|
|
1346 |
new_topic_df = topic_table_df
|
1347 |
new_reference_df = reference_df
|
1348 |
|
|
|
1421 |
|
1422 |
return result_df
|
1423 |
|
|
|
1424 |
def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
1425 |
unique_topics_df:pd.DataFrame,
|
1426 |
random_seed:int,
|
|
|
1540 |
print("Finished summary query")
|
1541 |
|
1542 |
# Extract text from the `responses` list
|
1543 |
+
if "choices" in responses[-1]:
|
1544 |
+
response_texts = [resp["choices"][0]['text'] for resp in responses]
|
1545 |
+
else:
|
1546 |
+
response_texts = [resp.text for resp in responses]
|
1547 |
+
|
1548 |
latest_response_text = response_texts[-1]
|
1549 |
|
1550 |
#print("latest_response_text:", latest_response_text)
|
|
|
1646 |
try:
|
1647 |
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt)
|
1648 |
summarised_output = response
|
1649 |
+
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
1650 |
+
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
1651 |
except Exception as e:
|
1652 |
print(e)
|
1653 |
summarised_output = ""
|
tools/prompts.py
CHANGED
@@ -1,15 +1,17 @@
|
|
1 |
-
system_prompt = """You are a researcher analysing responses from
|
2 |
|
3 |
-
initial_table_prompt = """The
|
4 |
-
|
5 |
-
|
|
|
6 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
7 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
|
8 |
In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
|
9 |
-
In the fourth column
|
10 |
-
In the fifth column
|
|
|
11 |
|
12 |
-
|
13 |
|
14 |
prompt2 = ""
|
15 |
|
@@ -17,32 +19,51 @@ prompt3 = ""
|
|
17 |
|
18 |
## Adding existing topics to consultation responses
|
19 |
|
20 |
-
add_existing_topics_system_prompt =
|
21 |
-
|
22 |
-
add_existing_topics_prompt = """Responses from a recent consultation are shown in the following table:
|
23 |
|
24 |
-
|
|
|
25 |
|
26 |
-
|
|
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
Create a new markdown table to summarise the consultation responses.
|
32 |
-
In the first and second columns, assign responses to the General Topics and Subtopics from the Topics table if they are relevant. If you cannot find a relevant topic, add new General Topics and Subtopics to the table. Make the new Subtopics as specific as possible.
|
33 |
In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
|
34 |
-
In the fourth column
|
35 |
-
In the fifth column, a
|
|
|
|
|
36 |
|
37 |
-
|
38 |
|
39 |
|
40 |
-
summarise_topic_descriptions_system_prompt =
|
41 |
|
42 |
-
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to
|
43 |
|
44 |
'{summaries}'
|
45 |
|
46 |
-
Your
|
47 |
|
48 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called {column_name}. The context of this analysis is: {consultation_context}. """
|
2 |
|
3 |
+
initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
|
4 |
+
{response_table}
|
5 |
+
|
6 |
+
Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
7 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
8 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
|
9 |
In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
|
10 |
+
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
11 |
+
In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
|
12 |
+
Do not add any other columns. Do not repeat Subtopics with the same Sentiment. Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
|
13 |
|
14 |
+
New table:"""
|
15 |
|
16 |
prompt2 = ""
|
17 |
|
|
|
19 |
|
20 |
## Adding existing topics to consultation responses
|
21 |
|
22 |
+
add_existing_topics_system_prompt = system_prompt
|
|
|
|
|
23 |
|
24 |
+
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
25 |
+
{response_table}
|
26 |
|
27 |
+
Topics known to be relevant to this dataset are shown in the following Topics table:
|
28 |
+
{topics}
|
29 |
|
30 |
+
Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
|
31 |
+
Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
32 |
+
In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible.
|
|
|
|
|
33 |
In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
|
34 |
+
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
35 |
+
In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
|
36 |
+
Do not add any other columns. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
37 |
+
Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
|
38 |
|
39 |
+
New table:"""
|
40 |
|
41 |
|
42 |
+
summarise_topic_descriptions_system_prompt = system_prompt
|
43 |
|
44 |
+
summarise_topic_descriptions_prompt = """Below is a table with a number of paragraphs related to the data from the open text column:
|
45 |
|
46 |
'{summaries}'
|
47 |
|
48 |
+
Your task is to make a consolidated summary of the above text. Return a summary up to two paragraphs long that includes as much detail as possible from the original text. Return only the summary and no other text.
|
49 |
+
|
50 |
+
Summary:"""
|
51 |
+
|
52 |
+
|
53 |
+
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
54 |
+
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|
55 |
+
# Summarise the following text in less than {length} words: "{text}"\n
|
56 |
+
# Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"""
|
57 |
+
|
58 |
+
# example_instruction_prompt_phi3 = """<|user|>\n
|
59 |
+
# Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
|
60 |
+
# CONTENT: {summaries}\n
|
61 |
+
# QUESTION: {question}\n
|
62 |
+
# Answer:<|end|>\n
|
63 |
+
# <|assistant|>"""
|
64 |
|
65 |
+
# example_instruction_prompt_gemma = """<start_of_turn>user
|
66 |
+
# Categorise the following text into only one of the following categories that seems most relevant: 'cat1', 'cat2', 'cat3', 'cat4'. Answer only with the choice of category. Do not add any other text. Do not explain your choice.
|
67 |
+
# Text: {text}<end_of_turn>
|
68 |
+
# <start_of_turn>model
|
69 |
+
# Category:"""
|
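The rewritten prompts above are plain `str.format` templates. A short sketch of how they might be filled in before being sent to a model; the sample tables and context are illustrative, and extra keyword arguments passed to `str.format` are simply ignored:

```python
from tools.prompts import system_prompt, initial_table_prompt, add_existing_topics_prompt

response_table = ("| Reference | Response |\n|---|---|\n"
                  "| 1 | The park needs more benches. |")
topics_table = ("| General Topic | Subtopic | Sentiment |\n|---|---|---|\n"
                "| Facilities | Seating | Neutral |")

formatted_system_prompt = system_prompt.format(
    column_name="Response",
    consultation_context="a local park improvement survey")
formatted_initial_prompt = initial_table_prompt.format(response_table=response_table)
formatted_add_existing_prompt = add_existing_topics_prompt.format(
    response_table=response_table, topics=topics_table)

print(formatted_system_prompt)
print(formatted_initial_prompt)
```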