seanpedrickcase committed
Commit b7f4700 · 1 Parent(s): c978ec5

Added support for using local models (specifically Gemma 2b) for topic extraction and summarisation. Generally improved output format safeguards.

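A minimal sketch (not part of the commit itself) of how the new local-model path is switched on, assuming the defaults introduced in the diffs below; RUN_LOCAL_MODEL, REPO_ID, MODEL_FILE and the "gemma_2b_it_local" model name all come from this commit:

import os

# Hypothetical setup snippet: values mirror the defaults added in this commit.
os.environ["RUN_LOCAL_MODEL"] = "1"                              # read in tools/helper_functions.py (default "0")
os.environ["REPO_ID"] = "lmstudio-community/gemma-2-2b-it-GGUF"  # optional override, read in app.py's get_model_path()
os.environ["MODEL_FILE"] = "gemma-2-2b-it-Q8_0.gguf"             # optional override, read in app.py's get_model_path()

# With RUN_LOCAL_MODEL="1", app.py downloads/loads the GGUF at startup and
# "gemma_2b_it_local" appears in the model choice list alongside the Gemini/Claude options.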
Dockerfile CHANGED
@@ -10,9 +10,12 @@ WORKDIR /src

COPY requirements.txt .

- RUN pip install --no-cache-dir --target=/install -r requirements.txt
+ RUN pip uninstall -y typing_extensions \
+ && pip install --no-cache-dir --target=/install typing_extensions==4.12.2 \
+ && pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+ && pip install --no-cache-dir --target=/install -r requirements_cpu.txt

- RUN rm requirements.txt
+ RUN rm requirements_cpu.txt

# Stage 2: Final runtime image
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
README.md CHANGED
@@ -1,9 +1,9 @@
---
- title: Large language model topic modeller
+ title: Large language model topic modelling
emoji: 📝
colorFrom: purple
colorTo: yellow
- sdk: gradio
+ sdk: 5.8.0
app_file: app.py
pinned: true
license: cc-by-nc-4.0
app.py CHANGED
@@ -1,15 +1,20 @@
import os
import socket
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
from tools.aws_functions import upload_file_to_s3
- from tools.llm_api_call import llm_query, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics
+ from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
from tools.auth import authenticate_user
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
#from tools.aws_functions import load_data_from_aws
import gradio as gr
import pandas as pd
-
+ import tools.chatfuncs as chatf
+ from tools.chatfuncs import llama_cpp_init_config_gpu, llama_cpp_init_config_cpu
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ from torch import cuda, backends
from datetime import datetime
+
today_rev = datetime.now().strftime("%Y%m%d")

ensure_output_folder_exists()
@@ -20,7 +25,87 @@ access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'

- batch_size_default = 20
+ ###
+ # Load local model
+ ###
+
+ # Check for torch cuda
+ print("Is CUDA enabled? ", cuda.is_available())
+ print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+ if cuda.is_available():
+     torch_device = "cuda"
+     os.system("nvidia-smi")
+ else:
+     torch_device = "cpu"
+
+ print("Device used is: ", torch_device)
+
+ def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=chatf.gpu_config, cpu_config:llama_cpp_init_config_cpu=chatf.cpu_config, torch_device:str=chatf.torch_device):
+     '''
+     Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
+     '''
+     print("Loading model ", local_model_type)
+
+     if local_model_type == "Gemma 2b":
+         if torch_device == "cuda":
+             gpu_config.update_gpu(gpu_layers)
+             gpu_config.update_context(max_context_length)
+             print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+         else:
+             gpu_config.update_gpu(gpu_layers)
+             cpu_config.update_gpu(gpu_layers)
+
+             # Update context length according to slider
+             gpu_config.update_context(max_context_length)
+             cpu_config.update_context(max_context_length)
+
+             print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+
+         #print(vars(gpu_config))
+         #print(vars(cpu_config))
+
+         def get_model_path():
+             repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # "bartowski/Llama-3.2-3B-Instruct-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
+             filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # "Llama-3.2-3B-Instruct-Q5_K_M.gguf") # "Phi-3-mini-128k-instruct.Q4_K_M.gguf")
+             model_dir = "model/gemma" # "model/phi" # Assuming this is your intended directory
+
+             # Construct the expected local path
+             local_path = os.path.join(model_dir, filename)
+
+             if os.path.exists(local_path):
+                 print(f"Model already exists at: {local_path}")
+                 return local_path
+             else:
+                 print("Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
+                 return hf_hub_download(repo_id=repo_id, filename=filename)
+
+         model_path = get_model_path()
+
+         try:
+             print(vars(gpu_config))
+             llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v=8, flash_attn=True,
+
+         except Exception as e:
+             print("GPU load failed")
+             print(e)
+             llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v=8, flash_attn=True,
+
+         tokenizer = []
+
+     chatf.model = llama_model
+     chatf.tokenizer = tokenizer
+     chatf.local_model_type = local_model_type
+
+     load_confirmation = "Finished loading model: " + local_model_type
+
+     print(load_confirmation)
+     return local_model_type, load_confirmation, local_model_type
+
+
+ # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
+ local_model_type = "Gemma 2b"
+ if RUN_LOCAL_MODEL == "1":
+     load_model(local_model_type, chatf.gpu_layers, chatf.context_length, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

# Create the gradio interface
app = gr.Blocks(theme = gr.themes.Base())
@@ -94,7 +179,7 @@ with app:
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")

- context_textbox = gr.Textbox(label="Write a short description (one sentence of less) giving context to the large language model about the your consultation and any relevant context")
+ context_textbox = gr.Textbox(label="Write a short description (up to one sentence) giving context to the large language model about the your consultation and any relevant context")

extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")

@@ -151,7 +236,7 @@
Define settings that affect large language model output.
""")
with gr.Accordion("Settings for LLM generation", open = True):
- temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, label="Choose LLM temperature setting")
+ temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0)
random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

@@ -198,13 +283,13 @@ with app:
extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number]).\
then(load_in_data_file,
inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
- fn=llm_query,
- inputs=[file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
- outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="llm_query")
+ fn=extract_topics,
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
+ outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")

# If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
- latest_batch_completed.change(fn=llm_query,
- inputs=[file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
+ latest_batch_completed.change(fn=extract_topics,
+ inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files]).\
then(fn = reveal_feedback_buttons,
outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
@@ -224,7 +309,7 @@ with app:

###
# LOGGING AND ON APP LOAD FUNCTIONS
- ###
+ ###
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

# Log usernames and times of access to file (to know who is using the app when running on AWS)
@@ -259,7 +344,7 @@ print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')

- GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
+ GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7861'))
print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')

if __name__ == "__main__":
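For reference, a minimal sketch (not from the repository) of the GGUF lookup order that load_model()/get_model_path() in app.py implements: a previously downloaded copy under model/gemma is preferred, otherwise the file is fetched from the Hugging Face Hub cache. The default repo and filename match the diff above and can be overridden via environment variables.

import os
from huggingface_hub import hf_hub_download

# Assumed defaults, mirroring app.py; REPO_ID / MODEL_FILE are optional overrides.
repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
local_path = os.path.join("model/gemma", filename)

# Use the local copy if present; otherwise download into the Hugging Face cache.
model_path = local_path if os.path.exists(local_path) else hf_hub_download(repo_id=repo_id, filename=filename)
print("Using GGUF at:", model_path)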
requirements.txt CHANGED
@@ -1,5 +1,5 @@
pandas==2.2.3
- gradio==5.6.0
+ gradio==5.8.0
boto3==1.35.71
pyarrow==18.1.0
openpyxl==3.1.3
@@ -10,3 +10,8 @@ google-generativeai==0.8.3
html5lib==1.1
beautifulsoup4==4.12.3
rapidfuzz==3.10.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+ llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+ transformers==4.47.0
+ numpy==1.26.4
+ typing_extensions==4.12.2
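An import-level sanity check (illustrative only, not part of the commit) after installing the pinned GPU stack above:

import torch, llama_cpp, transformers, numpy

print("torch:", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("llama-cpp-python:", llama_cpp.__version__)
print("transformers:", transformers.__version__, "numpy:", numpy.__version__)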
requirements_cpu.txt ADDED
@@ -0,0 +1,17 @@
+ pandas==2.2.3
+ gradio==5.6.0
+ boto3==1.35.71
+ pyarrow==18.1.0
+ openpyxl==3.1.3
+ markdown==3.7
+ tabulate==0.9.0
+ lxml==5.3.0
+ google-generativeai==0.8.3
+ html5lib==1.1
+ beautifulsoup4==4.12.3
+ rapidfuzz==3.10.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+ llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+ transformers==4.47.0
+ numpy==1.26.4
+ typing_extensions==4.12.2
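The same kind of check for the CPU-only pins above; the wheel from the PyTorch CPU index typically reports a "+cpu" local version and no CUDA device (an assumption about the wheel's version tag, not something stated in this commit):

import torch

print(torch.__version__)          # e.g. "2.5.1+cpu" for the CPU-only wheel
print(torch.cuda.is_available())  # expected False in the CPU-only image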
tools/chatfuncs.py ADDED
@@ -0,0 +1,166 @@
+
+ from typing import TypeVar
+
+ # Model packages
+ import torch.cuda
+ from transformers import pipeline
+ import time
+
+ torch.cuda.empty_cache()
+
+ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
+
+ model_type = None # global variable setup
+
+ full_text = "" # Define dummy source text (full text) just to enable highlight function to load
+
+ model = [] # Define empty list for model functions to run
+ tokenizer = [] # Define empty list for model functions to run
+
+ # Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
+ if torch.cuda.is_available():
+     torch_device = "cuda"
+     gpu_layers = -1
+ else:
+     torch_device = "cpu"
+     gpu_layers = 0
+
+ print("Running on device:", torch_device)
+ threads = torch.get_num_threads() # 8
+ print("CPU threads:", threads)
+
+ temperature: float = 0.1
+ top_k: int = 3
+ top_p: float = 1
+ repetition_penalty: float = 1.2 # Mild repetition penalty to prevent repeating table rows
+ last_n_tokens: int = 512
+ max_new_tokens: int = 4096 # 200
+ seed: int = 42
+ reset: bool = True
+ stream: bool = False
+ threads: int = threads
+ batch_size:int = 256
+ context_length:int = 12288
+ sample = True
+
+
+ class llama_cpp_init_config_gpu:
+     def __init__(self,
+                  last_n_tokens=last_n_tokens,
+                  seed=seed,
+                  n_threads=threads,
+                  n_batch=batch_size,
+                  n_ctx=context_length,
+                  n_gpu_layers=gpu_layers):
+
+         self.last_n_tokens = last_n_tokens
+         self.seed = seed
+         self.n_threads = n_threads
+         self.n_batch = n_batch
+         self.n_ctx = n_ctx
+         self.n_gpu_layers = n_gpu_layers
+         # self.stop: list[str] = field(default_factory=lambda: [stop_string])
+
+     def update_gpu(self, new_value):
+         self.n_gpu_layers = new_value
+
+     def update_context(self, new_value):
+         self.n_ctx = new_value
+
+ class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
+     def __init__(self):
+         super().__init__()
+         self.n_gpu_layers = gpu_layers
+         self.n_ctx=context_length
+
+ gpu_config = llama_cpp_init_config_gpu()
+ cpu_config = llama_cpp_init_config_cpu()
+
+
+ class CtransGenGenerationConfig:
+     def __init__(self, temperature=temperature,
+                  top_k=top_k,
+                  top_p=top_p,
+                  repeat_penalty=repetition_penalty,
+                  seed=seed,
+                  stream=stream,
+                  max_tokens=max_new_tokens
+                  ):
+         self.temperature = temperature
+         self.top_k = top_k
+         self.top_p = top_p
+         self.repeat_penalty = repeat_penalty
+         self.seed = seed
+         self.max_tokens=max_tokens
+         self.stream = stream
+
+     def update_temp(self, new_value):
+         self.temperature = new_value
+
+
+ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
+
+     gen_config = CtransGenGenerationConfig()
+     gen_config.update_temp(temperature)
+
+     print(vars(gen_config))
+
+     # Pull the generated text from the streamer, and update the model output.
+     start = time.time()
+     NUM_TOKENS=0
+     print('-'*4+'Start Generation'+'-'*4)
+
+     output = model(
+         full_prompt, **vars(gen_config))
+
+     history[-1][1] = ""
+     for out in output:
+
+         if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
+             history[-1][1] += out["choices"][0]["text"]
+             NUM_TOKENS+=1
+             yield history
+         else:
+             print(f"Unexpected output structure: {out}")
+
+     time_generate = time.time() - start
+     print('\n')
+     print('-'*4+'End Generation'+'-'*4)
+     print(f'Num of generated tokens: {NUM_TOKENS}')
+     print(f'Time for complete generation: {time_generate}s')
+     print(f'Tokens per second: {NUM_TOKENS/time_generate}')
+     print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
+
+
+ def call_llama_cpp_model(formatted_string, gen_config):
+     """
+     Calls your generation model with parameters from the CtransGenGenerationConfig object.
+
+     Args:
+         formatted_string (str): The formatted input text for the model.
+         gen_config (CtransGenGenerationConfig): An object containing generation parameters.
+     """
+     # Extracting parameters from the gen_config object
+     temperature = gen_config.temperature
+     top_k = gen_config.top_k
+     top_p = gen_config.top_p
+     repeat_penalty = gen_config.repeat_penalty
+     seed = gen_config.seed
+     max_tokens = gen_config.max_tokens
+     stream = gen_config.stream
+
+     # Now you can call your model directly, passing the parameters:
+     output = model(
+         formatted_string,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repeat_penalty=repeat_penalty,
+         seed=seed,
+         max_tokens=max_tokens,
+         stream=stream#,
+         #stop=["<|eot_id|>", "\n\n"]
+     )
+
+     return output
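A minimal usage sketch (not from the repository) of this new interface: llm_api_call.py builds a CtransGenGenerationConfig and hands it to call_llama_cpp_model() once app.py has assigned a loaded Llama object to chatf.model.

import tools.chatfuncs as chatf
from tools.chatfuncs import CtransGenGenerationConfig, call_llama_cpp_model

# Assumes load_model() in app.py has already run, so chatf.model holds a llama_cpp.Llama instance.
gen_config = CtransGenGenerationConfig()
gen_config.update_temp(0.1)  # same default temperature the UI slider now uses

output = call_llama_cpp_model("Extract topics from the response table below...", gen_config)

# With stream=False, llama-cpp-python returns an OpenAI-style completion dict.
print(output["choices"][0]["text"])
print(output["usage"])  # token counts, appended to conversation metadata in send_request()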
tools/helper_functions.py CHANGED
@@ -2,7 +2,6 @@ import os
import gradio as gr
import pandas as pd

-
def empty_output_vars_extract_topics():
    # Empty output objects before processing a new file

@@ -47,12 +46,19 @@ def get_or_create_env_var(var_name, default_value):
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

+ RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "0")
+ print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')
+
if RUN_AWS_FUNCTIONS == "1":
-     model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-1.5-flash-002", "gemini-1.5-pro-002"]
-     model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro"]
+     model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+     model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
else:
-     model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]
-     model_short_names = ["gemini_flash", "gemini_pro"]
+     model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
+     model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]
+
+ if RUN_LOCAL_MODEL == "0":
+     model_full_names.remove("gemma_2b_it_local")
+     model_short_names.remove("gemma_local")

model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}

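For clarity, a worked example (not in the commit): because zip(model_full_names, model_short_names) yields (full, short) pairs, the comprehension above keys model_name_map by the full model name, which is exactly how llm_api_call.py looks it up via model_name_map[model_choice].

model_full_names = ["gemini-1.5-flash-002", "gemini-1.5-pro-002", "gemma_2b_it_local"]
model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]

model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
print(model_name_map)
# {'gemini-1.5-flash-002': 'gemini_flash', 'gemini-1.5-pro-002': 'gemini_pro', 'gemma_2b_it_local': 'gemma_local'}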
tools/llm_api_call.py CHANGED
@@ -15,8 +15,11 @@ from gradio import Progress
15
  from typing import List, Tuple
16
  from io import StringIO
17
 
 
 
18
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
19
- from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map
 
20
 
21
  # ResponseObject class for AWS Bedrock calls
22
  class ResponseObject:
@@ -27,8 +30,8 @@ class ResponseObject:
27
  max_tokens = 4096
28
  timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
29
  number_of_api_retry_attempts = 5
30
- max_time_for_loop = 180
31
-
32
 
33
  AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
34
  print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
@@ -64,7 +67,7 @@ def load_in_file(file_path: str, colname:str=""):
64
 
65
  file_data[colname] = file_data[colname].astype(str).str.replace("\bnan\b", "", regex=True)
66
 
67
- print(file_data[colname])
68
 
69
  return file_data, file_name
70
 
@@ -172,16 +175,24 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
172
 
173
  simple_file = simple_file[start_row:end_row] # Select the current batch
174
 
175
- print("simple_file:", simple_file)
 
 
 
176
 
177
  # Remove problematic characters including ASCII and various quote marks
178
  # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
179
  simple_file["Response"] = simple_file["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
180
  simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
181
  simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
 
 
182
 
183
  # Remove blank and extremely short responses
184
- simple_file = simple_file.loc[~(simple_file["Response"].isnull()) & ~(simple_file["Response"] == "None") & ~(simple_file["Response"] == " ") & ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
 
 
 
185
 
186
  simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
187
  simple_file.to_csv(simplified_csv_table_path, index=None)
@@ -353,7 +364,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
353
 
354
  for i in progress_bar:
355
  try:
356
- print("Calling Gemini model")
357
  #print("full_prompt:", full_prompt)
358
  #print("generation_config:", config)
359
 
@@ -372,10 +383,10 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
372
 
373
  if i == number_of_api_retry_attempts:
374
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
375
- else:
376
  for i in progress_bar:
377
  try:
378
- print("Calling AWS Claude model, attempt", i)
379
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
380
 
381
  #progress_bar.close()
@@ -392,11 +403,43 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
392
 
393
  if i == number_of_api_retry_attempts:
394
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
395
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
 
397
  # Update the conversation history with the new prompt and response
398
  conversation_history.append({'role': 'user', 'parts': [prompt]})
399
- conversation_history.append({'role': 'assistant', 'parts': [response.text]})
 
 
 
 
 
 
 
400
 
401
  # Print the updated conversation history
402
  #print("conversation_history:", conversation_history)
@@ -433,16 +476,22 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
433
  #print("prompt to LLM:", prompt)
434
 
435
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature)
436
-
437
- if not isinstance(response, str):
 
 
 
 
 
 
 
 
438
  #print("response.usage_metadata:", response.usage_metadata)
439
  #print("Response.text:", response.text)
440
  #print("responses:", responses)
441
- responses.append(response)
442
-
443
- # Create conversation txt object
444
- whole_conversation.append(prompt)
445
- whole_conversation.append(response.text)
446
 
447
  # Create conversation metadata
448
  if master == False:
@@ -459,12 +508,15 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
459
  whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-output-token-count']))
460
  whole_conversation_metadata.append('x-amzn-bedrock-input-token-count:')
461
  whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-input-token-count']))
462
- else:
463
  whole_conversation_metadata.append(str(response.usage_metadata))
 
 
464
  except KeyError as e:
465
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
466
  else:
467
  print("Response is a string object.")
 
468
 
469
 
470
  return responses, conversation_history, whole_conversation, whole_conversation_metadata
@@ -494,20 +546,26 @@ def clean_markdown_table(text: str):
494
  if buffer:
495
  merged_lines.append(buffer)
496
 
497
- # Ensure consistent number of pipes in each row based on the header
498
- header_pipes = merged_lines[0].count('|') # Use the first row to count number of pipes
 
 
 
 
 
 
 
 
 
 
 
499
  result = []
 
500
 
501
  for line in merged_lines:
502
  # Strip excessive whitespace around pipes
503
  line = re.sub(r'\s*\|\s*', '|', line.strip())
504
 
505
- # Replace numbers between pipes with commas and a space
506
- line = re.sub(r'(?<=\|)(\s*\d+)(,\s*\d+)+(?=\|)', lambda m: ', '.join(m.group(0).split(',')), line)
507
-
508
- # Replace groups of numbers separated by spaces with commas and a space
509
- line = re.sub(r'(?<=\|)(\s*\d+)(\s+\d+)+(?=\|)', lambda m: ', '.join(m.group(0).split()), line)
510
-
511
  # Fix inconsistent number of pipes by adjusting them to match the header
512
  pipe_count = line.count('|')
513
  if pipe_count < header_pipes:
@@ -516,12 +574,17 @@ def clean_markdown_table(text: str):
516
  # If too many pipes, split line and keep the first `header_pipes` columns
517
  columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
518
  line = '|'.join(columns)
 
 
519
 
520
  result.append(line)
521
 
522
  # Join lines back into the cleaned markdown text
523
  cleaned_text = '\n'.join(result)
524
 
 
 
 
525
  return cleaned_text
526
 
527
  def clean_column_name(column_name, max_length=20):
@@ -642,8 +705,23 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
642
  log_files_output_paths.append(whole_conversation_path_meta)
643
 
644
  # Convert output table to markdown and then to a pandas dataframe to csv
645
- # try:
646
- cleaned_response = clean_markdown_table(responses[-1].text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647
 
648
  markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
649
 
@@ -653,20 +731,24 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
653
  html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
654
  html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
655
 
656
- print("html_table:", html_table)
657
-
658
  # Now ensure that the HTML structure is correct
659
  if "<table>" not in html_table:
660
  html_table = f"""
661
  <table>
 
 
 
 
 
 
 
662
  {html_table}
663
  </table>
664
  """
665
 
666
  # print("Markdown table as HTML:", html_table)
667
 
668
- html_buffer = StringIO(html_table)
669
-
670
 
671
  try:
672
  topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
@@ -678,11 +760,16 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
678
 
679
 
680
  # Rename columns to ensure consistent use of data frames later in code
681
- topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Summary", "Response References"]
682
 
683
  # Fill in NA rows with values from above (topics seem to be included only on one row):
684
  topic_with_response_df = topic_with_response_df.ffill()
685
 
 
 
 
 
 
686
  # Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
687
  topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].str.strip().str.lower().str.capitalize()
688
  topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].str.strip().str.lower().str.capitalize()
@@ -695,18 +782,32 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
695
 
696
  # Iterate through each row in the original DataFrame
697
  for index, row in topic_with_response_df.iterrows():
698
- references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
 
 
 
 
699
  topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
700
  subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
701
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
702
- summary = row.iloc[3] if pd.notna(row.iloc[3]) else ""
 
 
 
703
 
704
  summary = row_number_string_start + summary
705
 
706
  # Create a new entry for each reference number
707
  for ref in references:
 
 
 
 
 
 
 
708
  reference_data.append({
709
- 'Response References': ref,
710
  'General Topic': topic,
711
  'Subtopic': subtopic,
712
  'Sentiment': sentiment,
@@ -716,6 +817,8 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
716
 
717
  # Create a new DataFrame from the reference data
718
  new_reference_df = pd.DataFrame(reference_data)
 
 
719
 
720
  # Append on old reference data
721
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
@@ -759,7 +862,10 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
759
 
760
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
761
 
762
- def llm_query(file_data:pd.DataFrame,
 
 
 
763
  existing_topics_table:pd.DataFrame,
764
  existing_reference_df:pd.DataFrame,
765
  existing_unique_topics_df:pd.DataFrame,
@@ -770,7 +876,7 @@ def llm_query(file_data:pd.DataFrame,
770
  temperature:float,
771
  chosen_cols:List[str],
772
  model_choice:str,
773
- candidate_topics: List=[],
774
  latest_batch_completed:int=0,
775
  out_message:List=[],
776
  out_file_paths:List = [],
@@ -783,7 +889,7 @@ def llm_query(file_data:pd.DataFrame,
783
  system_prompt:str=system_prompt,
784
  add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
785
  add_existing_topics_prompt:str=add_existing_topics_prompt,
786
- number_of_requests:int=1,
787
  batch_size:int=50,
788
  context_textbox:str="",
789
  time_taken:float = 0,
@@ -796,6 +902,7 @@ def llm_query(file_data:pd.DataFrame,
796
  Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
797
 
798
  Parameters:
 
799
  - file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
800
  - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
801
  - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
@@ -806,7 +913,7 @@ def llm_query(file_data:pd.DataFrame,
806
  - in_api_key (str): The API key for authentication.
807
  - temperature (float): The temperature parameter for the model.
808
  - chosen_cols (List[str]): A list of chosen columns to process.
809
- - candidate_topics (List): A list of existing candidate topics submitted by the user.
810
  - model_choice (str): The choice of model to use.
811
  - latest_batch_completed (int): The index of the latest file completed.
812
  - out_message (list): A list to store output messages.
@@ -835,16 +942,37 @@ def llm_query(file_data:pd.DataFrame,
835
  config = ""
836
  final_time = 0.0
837
  whole_conversation_metadata = []
838
- #all_topic_tables_df = []
839
- #all_markdown_topic_tables = []
840
  is_error = False
 
 
 
 
 
 
 
 
841
 
842
  # Reset output files on each run:
843
  # out_file_paths = []
844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
845
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
846
- model_choice_clean = model_name_map[model_choice]
847
- print("model_choice_clean:", model_choice_clean)
848
 
849
  # If this is the first time around, set variables to 0/blank
850
  if first_loop_state==True:
@@ -852,8 +980,9 @@ def llm_query(file_data:pd.DataFrame,
852
  latest_batch_completed = 0
853
  out_message = []
854
  out_file_paths = []
 
855
 
856
- print("latest_batch_completed:", str(latest_batch_completed))
857
 
858
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
859
  if latest_batch_completed >= num_batches:
@@ -866,7 +995,6 @@ def llm_query(file_data:pd.DataFrame,
866
  out_time = f"Everything finished in {final_time} seconds."
867
  print(out_time)
868
 
869
-
870
  print("All summaries completed. Creating outputs.")
871
 
872
  model_choice_clean = model_name_map[model_choice]
@@ -931,7 +1059,7 @@ def llm_query(file_data:pd.DataFrame,
931
  print("out_file_paths:", out_file_paths)
932
 
933
  #final_out_message = '\n'.join(out_message)
934
- return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
935
 
936
 
937
 
@@ -949,18 +1077,14 @@ def llm_query(file_data:pd.DataFrame,
949
  if not out_file_paths:
950
  out_file_paths = []
951
 
952
- # Check if files and text exist
953
- if file_data.empty:
954
- out_message = "Please enter a data file to summarise."
955
- print(out_message)
956
- return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
957
 
958
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
959
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
960
  print(out_message)
961
  return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
962
 
963
- topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses). " + str(latest_batch_completed) + " batches completed."
964
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
965
 
966
  for i in topics_loop:
@@ -994,39 +1118,53 @@ def llm_query(file_data:pd.DataFrame,
994
  if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
995
  print("Using Gemini model:", model_choice)
996
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
997
- else:
998
  print("Using AWS Bedrock model:", model_choice)
 
 
999
 
1000
  if candidate_topics:
1001
  # 'Zero shot topics' are those supplied by the user
1002
- zero_shot_topics = read_file(candidate_topics.name)
1003
- zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1004
- # Max 150 topics allowed
1005
- if len(zero_shot_topics_series) > 120:
1006
- print("Maximum 120 topics allowed to fit within large language model context limits.")
1007
- zero_shot_topics_series = zero_shot_topics_series.iloc[:120]
1008
-
1009
- zero_shot_topics_list = list(zero_shot_topics_series)
1010
 
1011
- print("Zero shot topics are:", zero_shot_topics_list)
1012
-
1013
- #all_topic_tables_df_merged = existing_unique_topics_df
1014
- existing_unique_topics_df["Response References"] = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1015
 
1016
 
1017
- # Create the most up to date list of topics and subtopics.
1018
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1019
- if candidate_topics and existing_unique_topics_df.empty:
1020
- existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1021
-
1022
- # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1023
- elif candidate_topics and not existing_unique_topics_df.empty:
1024
- zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1025
- existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1026
- zero_shot_topics_list_str = zero_shot_topics_list
1027
 
1028
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1029
 
 
 
1030
 
1031
  unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
1032
 
@@ -1035,6 +1173,13 @@ def llm_query(file_data:pd.DataFrame,
1035
  # Format the summary prompt with the response table and topics
1036
  formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
1037
 
 
 
 
 
 
 
 
1038
  # Define the output file path for the formatted prompt
1039
  formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1040
 
@@ -1130,7 +1275,17 @@ def llm_query(file_data:pd.DataFrame,
1130
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
1131
  else: formatted_prompt3 = prompt3
1132
 
1133
- batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_requests] # Adjust this list to send fewer requests
 
 
 
 
 
 
 
 
 
 
1134
 
1135
  whole_conversation = [system_prompt]
1136
 
@@ -1173,15 +1328,21 @@ def llm_query(file_data:pd.DataFrame,
1173
  try:
1174
  final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1175
 
1176
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1177
- f.write(responses[-1].text)
 
 
 
 
 
 
 
1178
 
1179
  log_files_output_paths.append(final_table_output_path)
1180
 
1181
  except Exception as e:
1182
  print(e)
1183
-
1184
- display_table = responses[-1].text
1185
  new_topic_df = topic_table_df
1186
  new_reference_df = reference_df
1187
 
@@ -1260,7 +1421,6 @@ def deduplicate_categories(category_series: pd.Series, join_series:pd.Series, th
1260
 
1261
  return result_df
1262
 
1263
-
1264
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
1265
  unique_topics_df:pd.DataFrame,
1266
  random_seed:int,
@@ -1380,7 +1540,11 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
1380
  print("Finished summary query")
1381
 
1382
  # Extract text from the `responses` list
1383
- response_texts = [resp.text for resp in responses]
 
 
 
 
1384
  latest_response_text = response_texts[-1]
1385
 
1386
  #print("latest_response_text:", latest_response_text)
@@ -1482,6 +1646,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1482
  try:
1483
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt)
1484
  summarised_output = response
 
 
1485
  except Exception as e:
1486
  print(e)
1487
  summarised_output = ""
 
15
  from typing import List, Tuple
16
  from io import StringIO
17
 
18
+ GradioFileData = gr.FileData
19
+
20
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
21
+ from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
22
+ from tools.chatfuncs import model, CtransGenGenerationConfig, temperature, context_length, call_llama_cpp_model
23
 
24
  # ResponseObject class for AWS Bedrock calls
25
  class ResponseObject:
 
30
  max_tokens = 4096
31
  timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
32
  number_of_api_retry_attempts = 5
33
+ max_time_for_loop = 99999
34
+ batch_size_default = 5
35
 
36
  AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
37
  print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')
 
67
 
68
  file_data[colname] = file_data[colname].astype(str).str.replace("\bnan\b", "", regex=True)
69
 
70
+ #print(file_data[colname])
71
 
72
  return file_data, file_name
73
 
 
175
 
176
  simple_file = simple_file[start_row:end_row] # Select the current batch
177
 
178
+ # Now replace the reference numbers with numbers starting from 1
179
+ simple_file["Reference"] = simple_file["Reference"] - start_row
180
+
181
+ #print("simple_file:", simple_file)
182
 
183
  # Remove problematic characters including ASCII and various quote marks
184
  # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
185
  simple_file["Response"] = simple_file["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
186
  simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
187
  simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
188
+ simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
189
+ simple_file["Response"] = simple_file["Response"].str.slice(0, 2500) # Maximum 1,500 character responses
190
 
191
  # Remove blank and extremely short responses
192
+ simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\
193
+ ~(simple_file["Response"] == "None") &\
194
+ ~(simple_file["Response"] == " ") &\
195
+ ~(simple_file["Response"] == ""),:]#~(simple_file["Response"].str.len() < 5), :]
196
 
197
  simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
198
  simple_file.to_csv(simplified_csv_table_path, index=None)
 
364
 
365
  for i in progress_bar:
366
  try:
367
+ print("Calling Gemini model, attempt", i + 1)
368
  #print("full_prompt:", full_prompt)
369
  #print("generation_config:", config)
370
 
 
383
 
384
  if i == number_of_api_retry_attempts:
385
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
386
+ elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
387
  for i in progress_bar:
388
  try:
389
+ print("Calling AWS Claude model, attempt", i + 1)
390
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
391
 
392
  #progress_bar.close()
 
403
 
404
  if i == number_of_api_retry_attempts:
405
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
406
+ else:
407
+ # This is the Gemma model
408
+ for i in progress_bar:
409
+ try:
410
+ print("Calling Gemma 2B Instruct model, attempt", i + 1)
411
+
412
+ gen_config = CtransGenGenerationConfig()
413
+ gen_config.update_temp(temperature)
414
+
415
+ response = call_llama_cpp_model(prompt, gen_config)
416
+
417
+ #progress_bar.close()
418
+ #tqdm._instances.clear()
419
+
420
+ print("Successful call to Gemma model.")
421
+ print("Response:", response)
422
+ break
423
+ except Exception as e:
424
+ # If fails, try again after X seconds in case there is a throttle limit
425
+ print("Call to Gemma model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
426
+
427
+ time.sleep(timeout_wait)
428
+ #response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice)
429
+
430
+ if i == number_of_api_retry_attempts:
431
+ return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
432
 
433
  # Update the conversation history with the new prompt and response
434
  conversation_history.append({'role': 'user', 'parts': [prompt]})
435
+
436
+ # output_str = output['choices'][0]['text']
437
+
438
+ # Check if is a LLama.cpp model response
439
+ if 'choices' in response:
440
+ conversation_history.append({'role': 'assistant', 'parts': [response['choices'][0]['text']]})
441
+ else:
442
+ conversation_history.append({'role': 'assistant', 'parts': [response.text]})
443
 
444
  # Print the updated conversation history
445
  #print("conversation_history:", conversation_history)
 
476
  #print("prompt to LLM:", prompt)
477
 
478
  response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature)
479
+
480
+ if 'choices' in response:
481
+ responses.append(response)
482
+
483
+ # Create conversation txt object
484
+ whole_conversation.append(prompt)
485
+ whole_conversation.append(response['choices'][0]['text'])
486
+
487
+ else:
488
+ responses.append(response)
489
  #print("response.usage_metadata:", response.usage_metadata)
490
  #print("Response.text:", response.text)
491
  #print("responses:", responses)
492
+ # Create conversation txt object
493
+ whole_conversation.append(prompt)
494
+ whole_conversation.append(response.text)
 
 
495
 
496
  # Create conversation metadata
497
  if master == False:
 
508
  whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-output-token-count']))
509
  whole_conversation_metadata.append('x-amzn-bedrock-input-token-count:')
510
  whole_conversation_metadata.append(str(response.usage_metadata['HTTPHeaders']['x-amzn-bedrock-input-token-count']))
511
+ elif "gemini" in model_choice:
512
  whole_conversation_metadata.append(str(response.usage_metadata))
513
+ else:
514
+ whole_conversation_metadata.append(str(response['usage']))
515
  except KeyError as e:
516
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
517
  else:
518
  print("Response is a string object.")
519
+ whole_conversation_metadata.append("Length prompt: " + str(len(prompt)) + ". Length response: " + str(len(response)))
520
 
521
 
522
  return responses, conversation_history, whole_conversation, whole_conversation_metadata
 
546
  if buffer:
547
  merged_lines.append(buffer)
548
 
549
+ # Fix the header separator row if necessary
550
+ if len(merged_lines) > 1:
551
+ header_pipes = merged_lines[0].count('|') # Count pipes in the header row
552
+ header_separator = '|---|' * (header_pipes - 1) + '|---|' # Generate proper separator
553
+
554
+ # Replace or insert the separator row
555
+ if not re.match(r'^\|[-:|]+$', merged_lines[1]): # Check if the second row is a valid separator
556
+ merged_lines.insert(1, header_separator)
557
+ else:
558
+ # Adjust the separator to match the header pipes
559
+ merged_lines[1] = '|---|' * (header_pipes - 1) + '|'
560
+
561
+ # Ensure consistent number of pipes in each row
562
  result = []
563
+ header_pipes = merged_lines[0].count('|') # Use the header row to count the number of pipes
564
 
565
  for line in merged_lines:
566
  # Strip excessive whitespace around pipes
567
  line = re.sub(r'\s*\|\s*', '|', line.strip())
568
 
 
 
 
 
 
 
569
  # Fix inconsistent number of pipes by adjusting them to match the header
570
  pipe_count = line.count('|')
571
  if pipe_count < header_pipes:
 
574
  # If too many pipes, split line and keep the first `header_pipes` columns
575
  columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
576
  line = '|'.join(columns)
577
+
578
+ line = re.sub(r'(\d),(?=\d)', r'\1, ', line)
579
 
580
  result.append(line)
581
 
582
  # Join lines back into the cleaned markdown text
583
  cleaned_text = '\n'.join(result)
584
 
585
+ # Replace numbers next to commas and other numbers with a space
586
+
587
+
588
  return cleaned_text
589
 
590
  def clean_column_name(column_name, max_length=20):
 
705
  log_files_output_paths.append(whole_conversation_path_meta)
706
 
707
  # Convert output table to markdown and then to a pandas dataframe to csv
708
+ def remove_before_last_term(input_string: str) -> str:
709
+ # Use regex to find the last occurrence of the term
710
+ match = re.search(r'(\| ?General Topic)', input_string)
711
+ if match:
712
+ # Find the last occurrence by using rfind
713
+ last_index = input_string.rfind(match.group(0))
714
+ return input_string[last_index:] # Return everything from the last match onward
715
+ return input_string # Return the original string if the term is not found
716
+
717
+ if "choices" in responses[-1]:
718
+ print("Text response:", responses[-1]["choices"][0]['text'])
719
+ start_of_table_response = remove_before_last_term(responses[-1]["choices"][0]['text'])
720
+ cleaned_response = clean_markdown_table(start_of_table_response)
721
+ print("cleaned_response:", cleaned_response)
722
+ else:
723
+ start_of_table_response = remove_before_last_term(responses[-1].text)
724
+ cleaned_response = clean_markdown_table(start_of_table_response)
725
 
726
  markdown_table = markdown.markdown(cleaned_response, extensions=['tables'])
727
 
 
731
  html_table = re.sub(r'<p>(.*?)</p>', r'\1', markdown_table)
732
  html_table = html_table.replace('<p>', '').replace('</p>', '').strip()
733
 
 
 
734
  # Now ensure that the HTML structure is correct
735
  if "<table>" not in html_table:
736
  html_table = f"""
737
  <table>
738
+ <tr>
739
+ <th>General Topic</th>
740
+ <th>Subtopic</th>
741
+ <th>Sentiment</th>
742
+ <th>Response References</th>
743
+ <th>Summary</th>
744
+ </tr>
745
  {html_table}
746
  </table>
747
  """
748
 
749
  # print("Markdown table as HTML:", html_table)
750
 
751
+ html_buffer = StringIO(html_table)
 
752
 
753
  try:
754
  topic_with_response_df = pd.read_html(html_buffer)[0] # Assuming the first table in the HTML is the one you want
 
760
 
761
 
762
  # Rename columns to ensure consistent use of data frames later in code
763
+ topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
764
 
765
  # Fill in NA rows with values from above (topics seem to be included only on one row):
766
  topic_with_response_df = topic_with_response_df.ffill()
767
 
768
+ #print("topic_with_response_df:", topic_with_response_df)
769
+
770
+ # For instances where you end up with float values in Response references
771
+ topic_with_response_df["Response References"] = topic_with_response_df["Response References"].astype(str).str.replace(".0", "", regex=False)
772
+
773
  # Strip and lower case topic names to remove issues where model is randomly capitalising topics/sentiment
774
  topic_with_response_df["General Topic"] = topic_with_response_df["General Topic"].str.strip().str.lower().str.capitalize()
775
  topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].str.strip().str.lower().str.capitalize()
 
782
 
783
  # Iterate through each row in the original DataFrame
784
  for index, row in topic_with_response_df.iterrows():
785
+ #references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
786
+ references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
787
+ # If no numbers found in the Response References column, check the Summary column in case reference numbers were put there by mistake
788
+ if not references:
789
+ references = re.findall(r'\d+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else []
790
  topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
791
  subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
792
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
793
+ summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
794
+ # If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
795
+ if not summary and len(str(row.iloc[3])) > 30:
796
+ summary = row.iloc[3]
797
 
798
  summary = row_number_string_start + summary
799
 
800
  # Create a new entry for each reference number
801
  for ref in references:
802
+ # Add start_row back onto reference_number
803
+ try:
804
+ response_ref_no = str(int(ref) + int(start_row))
805
+ except ValueError:
806
+ print("Reference is not a number")
807
+ continue
808
+
809
  reference_data.append({
810
+ 'Response References': response_ref_no,
811
  'General Topic': topic,
812
  'Subtopic': subtopic,
813
  'Sentiment': sentiment,
 
817
 
818
  # Create a new DataFrame from the reference data
819
  new_reference_df = pd.DataFrame(reference_data)
820
+
821
+ print("new_reference_df:", new_reference_df)
822
 
823
  # Append on old reference data
824
  out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
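
A small sketch of the reference-expansion step above: pull the digits out of 'Response References', re-base them onto the full dataset using the batch's start row, and emit one row per reference. The helper name and example values are illustrative.

import re
import pandas as pd

def explode_references(row: pd.Series, start_row: int) -> list:
    # Sketch: one output record per reference number found in 'Response References'
    refs = re.findall(r"\d+", str(row["Response References"]))
    out = []
    for ref in refs:
        try:
            absolute_ref = str(int(ref) + int(start_row))  # re-base onto the full dataset
        except ValueError:
            continue  # skip anything that is not a plain number
        out.append({
            "Response References": absolute_ref,
            "General Topic": row["General Topic"],
            "Subtopic": row["Subtopic"],
            "Sentiment": row["Sentiment"],
        })
    return out

example = pd.Series({
    "General Topic": "Transport", "Subtopic": "Bus frequency",
    "Sentiment": "Negative", "Response References": "1, 3",
})
print(explode_references(example, start_row=50))
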
 
862
 
863
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
864
 
865
+
866
+
867
+ def extract_topics(in_data_file,
868
+ file_data:pd.DataFrame,
869
  existing_topics_table:pd.DataFrame,
870
  existing_reference_df:pd.DataFrame,
871
  existing_unique_topics_df:pd.DataFrame,
 
876
  temperature:float,
877
  chosen_cols:List[str],
878
  model_choice:str,
879
+ candidate_topics: GradioFileData = [],
880
  latest_batch_completed:int=0,
881
  out_message:List=[],
882
  out_file_paths:List = [],
 
889
  system_prompt:str=system_prompt,
890
  add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
891
  add_existing_topics_prompt:str=add_existing_topics_prompt,
892
+ number_of_prompts_used:int=1,
893
  batch_size:int=50,
894
  context_textbox:str="",
895
  time_taken:float = 0,
 
902
  Query an LLM (Gemini or AWS Anthropic-based) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
903
 
904
  Parameters:
905
+ - in_data_file (gr.File): Gradio file object containing input data
906
  - file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
907
  - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
908
  - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
 
913
  - in_api_key (str): The API key for authentication.
914
  - temperature (float): The temperature parameter for the model.
915
  - chosen_cols (List[str]): A list of chosen columns to process.
916
+ - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
917
  - model_choice (str): The choice of model to use.
918
  - latest_batch_completed (int): The index of the latest file completed.
919
  - out_message (list): A list to store output messages.
 
942
  config = ""
943
  final_time = 0.0
944
  whole_conversation_metadata = []
 
 
945
  is_error = False
946
+ #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
947
+ #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
948
+ #llama_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
949
+ #llama_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
950
+ #llama_prefix = "<|user|>\n" # This is for phi 3.5
951
+ #llama_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
952
+ llama_prefix = "<start_of_turn>user\n"
953
+ llama_suffix = "<end_of_turn>\n<start_of_turn>model\n"
954
 
955
  # Reset output files on each run:
956
  # out_file_paths = []
957
 
958
+ # If you have a file input but no file data it hasn't yet been loaded. Load it here.
959
+ if file_data.empty:
960
+ print("No data table found, loading from file")
961
+ try:
962
+ print("in_data_file:", in_data_file)
963
+ in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
964
+ print("in_colnames:", in_colnames_drop)
965
+ file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default)
966
+ print("file_data loaded in:", file_data)
967
+ except Exception:
968
+ # Check if files and text exist
969
+ out_message = "Please enter a data file to summarise."
970
+ print(out_message)
971
+ return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
972
+
973
+
974
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
975
+ model_choice_clean = model_name_map[model_choice]
 
976
 
977
  # If this is the first time around, set variables to 0/blank
978
  if first_loop_state==True:
 
980
  latest_batch_completed = 0
981
  out_message = []
982
  out_file_paths = []
983
+ print("model_choice_clean:", model_choice_clean)
984
 
985
+ #print("latest_batch_completed:", str(latest_batch_completed))
986
 
987
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
988
  if latest_batch_completed >= num_batches:
 
995
  out_time = f"Everything finished in {final_time} seconds."
996
  print(out_time)
997
 
 
998
  print("All summaries completed. Creating outputs.")
999
 
1000
  model_choice_clean = model_name_map[model_choice]
 
1059
  print("out_file_paths:", out_file_paths)
1060
 
1061
  #final_out_message = '\n'.join(out_message)
1062
+ return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
1063
 
1064
 
1065
 
 
1077
  if not out_file_paths:
1078
  out_file_paths = []
1079
 
1080
+
 
 
 
 
1081
 
1082
  if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1083
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1084
  print(out_message)
1085
  return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
1086
 
1087
+ topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
1088
  topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
1089
 
1090
  for i in topics_loop:
 
1118
  if model_choice in ["gemini-1.5-flash-002", "gemini-1.5-pro-002"]:
1119
  print("Using Gemini model:", model_choice)
1120
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1121
+ elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
1122
  print("Using AWS Bedrock model:", model_choice)
1123
+ else:
1124
+ print("Using local model:", model_choice)
1125
 
1126
  if candidate_topics:
1127
  # 'Zero shot topics' are those supplied by the user
1128
+ max_topic_no = 120
 
 
 
 
 
 
 
1129
 
1130
+ zero_shot_topics = read_file(candidate_topics.name)
1131
+ if zero_shot_topics.shape[1] == 1: # Check if there is only one column
1132
+ zero_shot_topics_series = zero_shot_topics.iloc[:, 0].str.strip().str.lower().str.capitalize()
1133
+ # Max 120 topics allowed
1134
+ if len(zero_shot_topics_series) > max_topic_no:
1135
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1136
+ zero_shot_topics_series = zero_shot_topics_series.iloc[:max_topic_no]
1137
+
1138
+ zero_shot_topics_list = list(zero_shot_topics_series)
1139
+
1140
+ print("Zero shot topics are:", zero_shot_topics_list)
1141
+
1142
+ # Create the most up to date list of topics and subtopics.
1143
+ # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1144
+ if existing_unique_topics_df.empty:
1145
+ existing_unique_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1146
+
1147
+ # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1148
+ elif not existing_unique_topics_df.empty:
1149
+ zero_shot_topics_df = pd.DataFrame(data={'General Topic':'', 'Subtopic':zero_shot_topics_list, 'Sentiment':''})
1150
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1151
+ zero_shot_topics_list_str = zero_shot_topics_list
1152
+
1153
+ elif set(["General Topic", "Subtopic", "Sentiment"]).issubset(zero_shot_topics.columns):
1154
+ # Max 120 topics allowed
1155
+ if zero_shot_topics.shape[0] > max_topic_no:
1156
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1157
+ zero_shot_topics = zero_shot_topics.iloc[:max_topic_no,:]
1158
+
1159
+ if existing_unique_topics_df.empty:
1160
+ existing_unique_topics_df = pd.DataFrame(data={'General Topic':zero_shot_topics.iloc[:,0], 'Subtopic':zero_shot_topics.iloc[:,1], 'Sentiment':zero_shot_topics.iloc[:,2]})
1161
 
1162
 
 
 
 
 
 
 
 
 
 
 
1163
 
1164
  #existing_unique_topics_df.to_csv(output_folder + "Existing topics with zero shot dropped.csv", index = None)
1165
 
1166
+ #all_topic_tables_df_merged = existing_unique_topics_df
1167
+ existing_unique_topics_df["Response References"] = ""
1168
 
1169
  unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic", "Sentiment"]].drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).to_markdown(index=False)
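
A sketch of the zero-shot merge above for the single-column case: normalise the candidate topics, cap them at the topic limit, and fold them into the unique-topics table without duplicating subtopics. The helper and example data are illustrative.

import pandas as pd

MAX_TOPICS = 120  # cap taken from the code above

def merge_zero_shot_topics(zero_shot: pd.DataFrame, existing: pd.DataFrame) -> pd.DataFrame:
    # Sketch: fold a one-column file of candidate topics into the unique-topics table
    topics = zero_shot.iloc[:, 0].astype(str).str.strip().str.lower().str.capitalize()
    topics = topics.iloc[:MAX_TOPICS]  # keep within the model's context budget
    zero_shot_df = pd.DataFrame({"General Topic": "", "Subtopic": topics, "Sentiment": ""})
    if existing.empty:
        return zero_shot_df
    return pd.concat([existing, zero_shot_df]).drop_duplicates("Subtopic")

candidates = pd.DataFrame({"Topic": ["bus frequency", "Cycle lanes"]})
existing = pd.DataFrame(columns=["General Topic", "Subtopic", "Sentiment"])
print(merge_zero_shot_topics(candidates, existing))
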
1170
 
 
1173
  # Format the summary prompt with the response table and topics
1174
  formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, consultation_context=context_textbox, column_name=chosen_cols)
1175
 
1176
+ if model_choice == "gemma_2b_it_local":
1177
+ # add_existing_topics_system_prompt = llama_system_prefix + add_existing_topics_system_prompt + llama_system_suffix
1178
+ # formatted_initial_table_prompt = llama_prefix + formatted_summary_prompt + llama_suffix
1179
+
1180
+ formatted_initial_table_prompt = llama_prefix + add_existing_topics_system_prompt + formatted_summary_prompt + llama_suffix
1181
+
1182
+
1183
  # Define the output file path for the formatted prompt
1184
  formatted_prompt_output_path = output_folder + file_name + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1185
 
 
1275
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
1276
  else: formatted_prompt3 = prompt3
1277
 
1278
+ if model_choice == "gemma_2b_it_local":
1279
+ # system_prompt = llama_system_prefix + system_prompt + llama_system_suffix
1280
+ # formatted_initial_table_prompt = llama_prefix + formatted_initial_table_prompt + llama_suffix
1281
+ # formatted_prompt2 = llama_prefix + formatted_prompt2 + llama_suffix
1282
+ # formatted_prompt3 = llama_prefix + formatted_prompt3 + llama_suffix
1283
+
1284
+ formatted_initial_table_prompt = llama_prefix + system_prompt + formatted_initial_table_prompt + llama_suffix
1285
+ formatted_prompt2 = llama_prefix + system_prompt + formatted_prompt2 + llama_suffix
1286
+ formatted_prompt3 = llama_prefix + system_prompt + formatted_prompt3 + llama_suffix
1287
+
1288
+ batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1289
 
1290
  whole_conversation = [system_prompt]
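
The local-model branch wraps prompts in Gemma's instruction format rather than sending a separate system message. A minimal sketch of that wrapping, using the same turn markers as above; the helper name is illustrative.

GEMMA_PREFIX = "<start_of_turn>user\n"
GEMMA_SUFFIX = "<end_of_turn>\n<start_of_turn>model\n"

def to_gemma_prompt(system_prompt: str, user_prompt: str) -> str:
    # Sketch: system text is prepended to the user turn for the local Gemma model
    return GEMMA_PREFIX + system_prompt + user_prompt + GEMMA_SUFFIX

print(to_gemma_prompt("You are a researcher analysing open text responses. ",
                      "Summarise the table below..."))
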
1291
 
 
1328
  try:
1329
  final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1330
 
1331
+ if "choices" in responses[-1]:
1332
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1333
+ f.write(responses[-1]["choices"][0]['text'])
1334
+ display_table = responses[-1]["choices"][0]['text']
1335
+
1336
+ else:
1337
+ with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1338
+ f.write(responses[-1].text)
1339
+ display_table = responses[-1].text
1340
 
1341
  log_files_output_paths.append(final_table_output_path)
1342
 
1343
  except Exception as e:
1344
  print(e)
1345
+
 
1346
  new_topic_df = topic_table_df
1347
  new_reference_df = reference_df
1348
 
 
1421
 
1422
  return result_df
1423
 
 
1424
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
1425
  unique_topics_df:pd.DataFrame,
1426
  random_seed:int,
 
1540
  print("Finished summary query")
1541
 
1542
  # Extract text from the `responses` list
1543
+ if "choices" in responses[-1]:
1544
+ response_texts = [resp["choices"][0]['text'] for resp in responses]
1545
+ else:
1546
+ response_texts = [resp.text for resp in responses]
1547
+
1548
  latest_response_text = response_texts[-1]
1549
 
1550
  #print("latest_response_text:", latest_response_text)
 
1646
  try:
1647
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt)
1648
  summarised_output = response
1649
+ summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
1650
+ summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
1651
  except Exception as e:
1652
  print(e)
1653
  summarised_output = ""
tools/prompts.py CHANGED
@@ -1,15 +1,17 @@
1
- system_prompt = """You are a researcher analysing responses from a public consultation. . The subject of this consultation is: {consultation_context}. You are analysing a single question from this consultation that is {column_name}."""
2
 
3
- initial_table_prompt = """The responses from the consultation are shown in the following table that contains two columns - Reference and Response:
4
- '{response_table}'
5
- Based on the above table, create a markdown table to summarise the consultation responses.
 
6
  In the first column identify general topics relevant to responses. Create as many general topics as you can.
7
  In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
8
  In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
9
- In the fourth column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear relevant responses.
10
- In the fifth column list the Response reference numbers of responses relevant to the Subtopic separated by commas.
 
11
 
12
- Do not add any other columns. Return the table in markdown format, and don't include any special characters in the table. Do not add any other text to your response."""
13
 
14
  prompt2 = ""
15
 
@@ -17,32 +19,51 @@ prompt3 = ""
17
 
18
  ## Adding existing topics to consultation responses
19
 
20
- add_existing_topics_system_prompt = """You are a researcher analysing responses from a public consultation. The subject of this consultation is: {consultation_context}. You are analysing a single question from this consultation that is {column_name}."""
21
-
22
- add_existing_topics_prompt = """Responses from a recent consultation are shown in the following table:
23
 
24
- '{response_table}'
 
25
 
26
- And below is a table of topics currently known to be relevant to this consultation:
 
27
 
28
- '{topics}'
29
-
30
- Your job is to assign responses from the Response column to existing general topics and subtopics, or to new topics if no existing topics are relevant.
31
- Create a new markdown table to summarise the consultation responses.
32
- In the first and second columns, assign responses to the General Topics and Subtopics from the Topics table if they are relevant. If you cannot find a relevant topic, add new General Topics and Subtopics to the table. Make the new Subtopics as specific as possible.
33
  In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
34
- In the fourth column, a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
35
- In the fifth column, a list of Response reference numbers relevant to the Subtopic separated by commas.
 
 
36
 
37
- Do not add any other columns. Exclude rows for topics that are not assigned to any response. Return the table in markdown format, and do not include any special characters in the table. Do not add any other text to your response."""
38
 
39
 
40
- summarise_topic_descriptions_system_prompt = """You are a researcher analysing responses from a public consultation."""
41
 
42
- summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to consultation responses:
43
 
44
  '{summaries}'
45
 
46
- Your job is to make a consolidated summary of the above text. Return a summary up to two paragraphs long that includes as much detail as possible from the original text. Return only the summary and no other text.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- Summary:"""
 
 
 
 
 
1
+ system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called {column_name}. The context of this analysis is: {consultation_context}. """
2
 
3
+ initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
4
+ {response_table}
5
+
6
+ Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
7
  In the first column identify general topics relevant to responses. Create as many general topics as you can.
8
  In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
9
  In the third column write the sentiment of the subtopic: Negative, Neutral, or Positive.
10
+ In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
11
+ In the fifth and final column, write a short summary of the subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
12
+ Do not add any other columns. Do not repeat Subtopics with the same Sentiment. Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
13
 
14
+ New table:"""
15
 
16
  prompt2 = ""
17
 
 
19
 
20
  ## Adding existing topics to consultation responses
21
 
22
+ add_existing_topics_system_prompt = system_prompt
 
 
23
 
24
+ add_existing_topics_prompt = """Responses are shown in the following Response table:
25
+ {response_table}
26
 
27
+ Topics known to be relevant to this dataset are shown in the following Topics table:
28
+ {topics}
29
 
30
+ Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
31
+ Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
32
+ In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above if they are very relevant to the text of the Response. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible.
 
 
33
  In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
34
+ In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
35
+ In the fifth and final column, write a short summary of the Subtopic based on relevant responses. Highlight specific issues that appear in relevant responses.
36
+ Do not add any other columns. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
37
+ Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
38
 
39
+ New table:"""
40
 
41
 
42
+ summarise_topic_descriptions_system_prompt = system_prompt
43
 
44
+ summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
45
 
46
  '{summaries}'
47
 
48
+ Your task is to make a consolidated summary of the above text. Return a summary up to two paragraphs long that includes as much detail as possible from the original text. Return only the summary and no other text.
49
+
50
+ Summary:"""
51
+
52
+
53
+ # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
54
+ # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
55
+ # Summarise the following text in less than {length} words: "{text}"\n
56
+ # Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"""
57
+
58
+ # example_instruction_prompt_phi3 = """<|user|>\n
59
+ # Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
60
+ # CONTENT: {summaries}\n
61
+ # QUESTION: {question}\n
62
+ # Answer:<|end|>\n
63
+ # <|assistant|>"""
64
 
65
+ # example_instruction_prompt_gemma = """<start_of_turn>user
66
+ # Categorise the following text into only one of the following categories that seems most relevant: 'cat1', 'cat2', 'cat3', 'cat4'. Answer only with the choice of category. Do not add any other text. Do not explain your choice.
67
+ # Text: {text}<end_of_turn>
68
+ # <start_of_turn>model
69
+ # Category:"""