Commit b0e08c8
Parent(s): 854a758
Changed default requirements to CPU version of llama cpp. Added Gemini Flash 2.0 to model list. Output files should contain only final files.
- app.py +7 -5
- requirements.txt +2 -4
- requirements_cpu.txt → requirements_gpu.txt +4 -2
- tools/helper_functions.py +6 -10
- tools/llm_api_call.py +148 -55
app.py
CHANGED
@@ -16,7 +16,7 @@ today_rev = datetime.now().strftime("%Y%m%d")
 ensure_output_folder_exists()

 host_name = socket.gethostname()
-print("host_name is:", host_name)
+# print("host_name is:", host_name)

 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
@@ -32,7 +32,7 @@ elif RUN_AWS_FUNCTIONS == "1":
     default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"

 else:
-    default_model_choice = "gemini-
+    default_model_choice = "gemini-2.0-flash"

 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -229,6 +229,8 @@ with app:
        inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
        outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")

+    # return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
+
     # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
     latest_batch_completed.change(fn=extract_topics,
        inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
@@ -238,15 +240,15 @@ with app:

     # When button pressed, deduplicate data
     deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
-        then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files])
+        then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files, log_files_output])

     # When button pressed, summarise previous data
     summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
        then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
        then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
-        then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
+        then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])

-    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
+    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])

     # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
     continue_previous_data_files_btn.click(
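The wiring above leans on Gradio's chained-events API: `.click()` and `.change()` return a dependency object whose `.then()` runs the next function once the previous one has written its outputs, which is how `duplicate_output_files` from deduplication feeds straight into summarisation. A minimal sketch of the same pattern, with invented component and function names:

```python
import gradio as gr

def step_one(n):
    return n + 1

def step_two(n):
    return f"Final value: {n}"

with gr.Blocks() as demo:
    number = gr.Number(value=0)
    result = gr.Textbox()
    run_btn = gr.Button("Run")

    # Same shape as deduplicate_previous_data_btn.click(...).then(...):
    # step_two only fires after step_one has updated `number`.
    run_btn.click(step_one, inputs=number, outputs=number).\
        then(step_two, inputs=number, outputs=result)
```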
requirements.txt
CHANGED
@@ -11,10 +11,8 @@ google-generativeai==0.8.3
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.10.1
-torch==2.
-
-# Specify exact llama_cpp wheel for huggingface compatibility
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
+torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 transformers==4.47.0
 numpy==1.26.4
 typing_extensions==4.12.2
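With the defaults now pinned to CPU wheels, a quick sanity check that the CPU build of llama-cpp-python is the one that got installed (a sketch; `llama_supports_gpu_offload` is a low-level binding that should be present in 0.2.x builds, but treat its availability as an assumption):

```python
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)
# Expected False for the CPU wheel, True for a CUDA build such as the cu121 wheel
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
```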
requirements_cpu.txt → requirements_gpu.txt
RENAMED
@@ -11,8 +11,10 @@ google-generativeai==0.8.3
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.10.1
-torch==2.
-llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/
+torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
+#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Specify exact llama_cpp wheel for huggingface compatibility
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
 transformers==4.47.0
 numpy==1.26.4
 typing_extensions==4.12.2
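The matching check for torch, using only standard torch APIs; the local version string encodes which index the wheel came from:

```python
import torch

print("torch version:", torch.__version__)      # e.g. "2.5.1+cpu" or "2.4.1+cu121"
print("CUDA available:", torch.cuda.is_available())
```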
tools/helper_functions.py
CHANGED
@@ -46,14 +46,14 @@ def get_or_create_env_var(var_name, default_value):
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

-RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "
+RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
 print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')

 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-
+    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
     model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
 else:
-    model_full_names = ["gemini-
+    model_full_names = ["gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
     model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]

 if RUN_LOCAL_MODEL == "0":
@@ -76,7 +76,7 @@ def get_file_path_with_extension(file_path):
     # Return the basename with its extension
     return basename

-def
+def get_file_name_no_ext(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)

@@ -246,10 +246,6 @@ def put_columns_in_df(in_file):
             # Read each sheet into a DataFrame
             df = pd.read_excel(file_name, sheet_name=sheet_name)

-            # Process the DataFrame (e.g., print its contents)
-            print(f"Sheet Name: {sheet_name}")
-            print(df.head()) # Print the first few rows
-
             new_choices.extend(list(df.columns))

         all_sheet_names.extend(new_sheet_names)
@@ -261,10 +257,10 @@ def put_columns_in_df(in_file):
     concat_choices.extend(new_choices)

     # Drop duplicate columns
-    concat_choices =
+    concat_choices = sorted(set(concat_choices))

     if number_of_excel_files > 0:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end
     else:
         return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end
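Two behavioural notes on the helper changes, as a runnable sketch. The body of get_file_name_no_ext beyond its first line is not visible in the diff, so the splitext call below is an assumption based on the function's name and comments:

```python
import os

def get_file_name_no_ext(file_path):
    # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
    basename = os.path.basename(file_path)
    # Assumed follow-up: drop the extension ("example" from "example.txt")
    return os.path.splitext(basename)[0]

print(get_file_name_no_ext("/path/to/example.txt"))  # example

# The dropdown change: sorted(set(...)) deduplicates column names *and*
# makes the choice order deterministic (alphabetical) across uploads.
concat_choices = ["Response", "ID", "Response"]
print(sorted(set(concat_choices)))  # ['ID', 'Response']
```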
tools/llm_api_call.py
CHANGED
@@ -20,7 +20,7 @@ from io import StringIO
 GradioFileData = gr.FileData

 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
-from tools.helper_functions import output_folder, detect_file_type,
+from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

 # ResponseObject class for AWS Bedrock calls
@@ -65,9 +65,9 @@ def load_in_file(file_path: str, colname:str=""):
     - file_path (str): The path to the file to be processed.
     """
     file_type = detect_file_type(file_path)
-    print("File type is:", file_type)
+    #print("File type is:", file_type)

-    file_name =
+    file_name = get_file_name_no_ext(file_path)
     file_data = read_file(file_path)

     if colname:
@@ -140,6 +140,14 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):

     return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name

+def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str]) -> pd.DataFrame:
+    basic_response_data = file_data[[chosen_cols]].reset_index(names="Reference")
+    basic_response_data["Reference"] = basic_response_data["Reference"].astype(int) + 1
+    basic_response_data = basic_response_data.rename(columns={chosen_cols: "Response"})
+    basic_response_data["Response"] = basic_response_data["Response"].str.strip()
+
+    return basic_response_data
+
 def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
     """
     Processes a file by simplifying its content based on chosen columns and saves the result to a specified output folder.
@@ -163,11 +171,9 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     simplified_csv_table_path = ""

     # Simplify table to just responses column and the Response reference number
-
-
-
-    simple_file["Response"] = simple_file["Response"].str.strip()
-    file_len = len(simple_file["Reference"])
+    basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+    file_len = len(basic_response_data["Reference"])


     # Subset the data for the current batch
@@ -181,35 +187,35 @@
     else:
         end_row = file_len + 1

-
+    batch_basic_response_data = basic_response_data[start_row:end_row]  # Select the current batch

     # Now replace the reference numbers with numbers starting from 1
-
+    batch_basic_response_data["Reference"] = batch_basic_response_data["Reference"] - start_row

-    #print("
+    #print("batch_basic_response_data:", batch_basic_response_data)

     # Remove problematic characters including ASCII and various quote marks
     # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
-
-
-
-
-
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'[\x00-\x1F\x7F]|[“”<>]|\\', '', regex=True)  # Remove control and special characters
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.strip()  # Remove leading and trailing whitespace
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True)  # Replace multiple line breaks with a single line break
+    batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length)  # Maximum 1,500 character responses

     # Remove blank and extremely short responses
-
-    ~(
-    ~(
-    ~(
+    batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
+        ~(batch_basic_response_data["Response"] == "None") &\
+        ~(batch_basic_response_data["Response"] == " ") &\
+        ~(batch_basic_response_data["Response"] == ""),:]  #~(batch_basic_response_data["Response"].str.len() < 5), :]

     #simplified_csv_table_path = output_folder + 'simple_markdown_table_' + file_name + '_row_' + str(start_row) + '_to_' + str(end_row) + '.csv'
-    #
+    #batch_basic_response_data.to_csv(simplified_csv_table_path, index=None)

-    simple_markdown_table =
+    simple_markdown_table = batch_basic_response_data.to_markdown(index=None)

     normalised_simple_markdown_table = normalise_string(simple_markdown_table)

-    return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row,
+    return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data

 def replace_punctuation_with_underscore(input_string):
     # Create a translation table where each punctuation character maps to '_'
@@ -368,7 +374,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     progress_bar = range(0,number_of_api_retry_attempts)

     # Generate the model's response
-    if model_choice in ["gemini-
+    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:

         for i in progress_bar:
             try:
@@ -841,7 +847,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
     # Create a new DataFrame from the reference data
     new_reference_df = pd.DataFrame(reference_data)

-    print("new_reference_df:", new_reference_df)
+    #print("new_reference_df:", new_reference_df)

     # Append on old reference data
     out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
@@ -1040,10 +1046,15 @@ def extract_topics(in_data_file,
            # Save outputs for each batch. If master file created, label file as master
            file_path_details = f"{file_name}_col_{in_column_cleaned}"

+            # Create a pivoted reference table
+            existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
+
            # Save the new DataFrame to CSV
            #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+            reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
            reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
-            unique_topics_df_out_path = output_folder +file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+            unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
+            basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"

            # Write outputs to csv
            ## Topics with references
@@ -1058,19 +1069,32 @@ def extract_topics(in_data_file,
            existing_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
            out_file_paths.append(unique_topics_df_out_path)

+            # Ensure that we are only returning the final results to outputs
+            out_file_paths = [x for x in out_file_paths if '_final_' in x]
+
+            ## Reference table mapping response numbers to topics
+            existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
+            log_files_output_paths.append(reference_table_out_pivot_path)
+
            ## Create a dataframe for missing response references:
            # Assuming existing_reference_df and file_data are already defined

            # Simplify table to just responses column and the Response reference number
-
-
-
-
+
+
+            basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+            #print("basic_response_data:", basic_response_data)
+
+            # Save simplified file data to log outputs
+            pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
+            log_files_output_paths.append(basic_response_data_out_path)
+

            # Step 1: Identify missing references
-            #print("
+            #print("basic_response_data:", basic_response_data)

-            missing_references =
+            missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]

            # Step 2: Create a new DataFrame with the same columns as existing_reference_df
            missing_df = pd.DataFrame(columns=existing_reference_df.columns)
@@ -1126,21 +1150,21 @@ def extract_topics(in_data_file,
        print("Running query batch", str(reported_batch_no))

        # Call the function to prepare the input table
-        simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row,
-        log_files_output_paths.append(simplified_csv_table_path)
+        simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size)
+        #log_files_output_paths.append(simplified_csv_table_path)


        # Conversation history
        conversation_history = []

-        print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
+        #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)

        # If the latest batch of responses contains at least one instance of text
-        if not
+        if not batch_basic_response_df.empty:

            print("latest_batch_completed:", latest_batch_completed)

-            print("candidate_topics:", candidate_topics)
+            #print("candidate_topics:", candidate_topics)

            # If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
            if latest_batch_completed >= 1 or candidate_topics is not None:
@@ -1148,7 +1172,7 @@ def extract_topics(in_data_file,
                #print("normalised_simple_markdown_table:", normalised_simple_markdown_table)

                # Prepare Gemini models before query
-                if model_choice in ["gemini-
+                if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                    print("Using Gemini model:", model_choice)
                    model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
                elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
@@ -1323,14 +1347,14 @@ def extract_topics(in_data_file,
                out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
                log_files_output_paths = [col for col in out_file_paths if str(reported_batch_no) in col]

-                print("out_file_paths at end of loop:", out_file_paths)
+                #print("out_file_paths at end of loop:", out_file_paths)

            # If this is the first batch, run this
            else:
                #system_prompt = system_prompt + normalised_simple_markdown_table

                # Prepare Gemini models before query
-                if model_choice in ["gemini-
+                if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                    print("Using Gemini model:", model_choice)
                    model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
                else:
@@ -1418,8 +1442,8 @@ def extract_topics(in_data_file,
        else:
            print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)

-        # Increase latest file completed count unless we are
-        if latest_batch_completed
+        # Increase latest file completed count unless we are over the last batch number
+        if latest_batch_completed <= num_batches:
            print("Completed batch number:", str(reported_batch_no))
            latest_batch_completed += 1
@@ -1444,10 +1468,44 @@ def extract_topics(in_data_file,
    final_message_out = '\n'.join(out_message)
    final_message_out = final_message_out + " " + out_time

-    print(final_message_out)
+    print(final_message_out)
+
+    #print("out_file_paths:", out_file_paths)
+    #print("log_files_output_paths:", log_files_output_paths)

    return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths

+def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
+
+    df_in = df[['Response References', 'General Topic', 'Subtopic', 'Sentiment']].copy()
+
+    df_in['Response References'] = df_in['Response References'].astype(int)
+
+    # Create a combined category column
+    df_in['Category'] = df_in['General Topic'] + ' - ' + df_in['Subtopic'] + ' - ' + df_in['Sentiment']
+
+    # Create pivot table counting occurrences of each unique combination
+    pivot_table = pd.crosstab(
+        index=df_in['Response References'],
+        columns=[df_in['General Topic'], df_in['Subtopic'], df_in['Sentiment']],
+        margins=True
+    )
+
+    # Flatten column names to make them more readable
+    pivot_table.columns = [' - '.join(col) for col in pivot_table.columns]
+
+    pivot_table.reset_index(inplace=True)
+
+    if not basic_response_data.empty:
+        pivot_table = basic_response_data.merge(pivot_table, right_on="Response References", left_on="Reference", how="left")
+
+        pivot_table.drop("Response References", axis=1, inplace=True)
+
+    # print("pivot_table:", pivot_table)
+
+    return pivot_table
+
+
 # SUMMARISATION FUNCTIONS

 def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
@@ -1525,21 +1583,28 @@ def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, r

     return result_df

-def deduplicate_topics(reference_df,
-                       unique_topics_df,
+def deduplicate_topics(reference_df:pd.DataFrame,
+                       unique_topics_df:pd.DataFrame,
                        reference_table_file_name:str,
                        unique_topics_table_file_name:str,
                        merge_sentiment:str= "No",
                        merge_general_topics:str="No",
                        score_threshold:int=deduplication_threshold,
-
+                       in_data_files=[],
+                       chosen_cols:List[str]="",
+                       deduplicate_topics:str="Yes"
+                       ):
     '''
     Deduplicate topics based on a reference and unique topics table
     '''
     output_files = []
+    log_output_files = []

-    reference_table_file_name_no_ext =
-    unique_topics_table_file_name_no_ext =
+    reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
+    unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)
+
+    if in_data_files and chosen_cols:
+        file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)

     # Run through this x times to try to get all duplicate topics
     if deduplicate_topics == "Yes":
@@ -1572,7 +1637,7 @@ def deduplicate_topics(reference_df,

            else:
                # Join deduplicated columns back to original df
-                deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
+                #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
                # Remove rows where 'deduplicated_category' is blank or NaN
                deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]
@@ -1634,8 +1699,12 @@ def deduplicate_topics(reference_df,
     # Remake unique_topics_df based on new reference_df
     unique_topics_df = create_unique_table_df_from_reference_table(reference_df)

-
-
+    basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+    reference_df_pivot = convert_reference_table_to_pivot_table(reference_df, basic_response_data)
+
+    reference_table_file_name_no_ext = get_file_name_no_ext(reference_table_file_name)
+    unique_topics_table_file_name_no_ext = get_file_name_no_ext(unique_topics_table_file_name)

     reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
     unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
@@ -1645,7 +1714,12 @@ def deduplicate_topics(reference_df,
     output_files.append(reference_file_path)
     output_files.append(unique_topics_file_path)

-
+    reference_pivot_file_path = output_folder + reference_table_file_name_no_ext + "_pivot_dedup.csv"
+    reference_df_pivot.to_csv(reference_pivot_file_path, index=None)
+
+    log_output_files.append(reference_pivot_file_path)
+
+    return reference_df, unique_topics_df, output_files, log_output_files

 def sample_reference_table_summaries(reference_df:pd.DataFrame,
                                      unique_topics_df:pd.DataFrame,
@@ -1700,7 +1774,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
     whole_conversation_metadata = []

     # Prepare Gemini models before query
-    if model_choice in ["gemini-
+    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
        print("Using Gemini model:", model_choice)
        model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
     else:
@@ -1741,6 +1815,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
                            summarised_outputs:list = [],
                            latest_summary_completed:int = 0,
                            out_metadata_str:str = "",
+                            in_data_files:List[str]=[],
+                            chosen_cols:List[str]=[],
                            output_files:list = [],
                            summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
                            do_summaries="Yes",
@@ -1750,6 +1826,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
     '''
     out_metadata = []
     local_model = []
+    log_output_files = []
     summarised_output_markdown = ""

     print("In summarise_output_topics function.")
@@ -1758,6 +1835,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,

     length_all_summaries = len(all_summaries)

+    # Load in data file and chosen columns if exists to create pivot table later
+    if in_data_files and chosen_cols:
+        file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
+
     #print("latest_summary_completed:", latest_summary_completed)
     #print("length_all_summaries:", length_all_summaries)

@@ -1798,7 +1879,12 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
        # Remove topics that are tagged as 'Not Mentioned'
        unique_table_df_revised = unique_table_df_revised.loc[unique_table_df_revised["Sentiment"] != "Not Mentioned", :]
        reference_table_df_revised = reference_table_df_revised.loc[reference_table_df_revised["Sentiment"] != "Not Mentioned", :]
+
+        basic_response_data = get_basic_response_data(file_data, chosen_cols)
+
+        reference_table_df_revised_pivot = convert_reference_table_to_pivot_table(reference_table_df_revised, basic_response_data)

+        # Save to file
        unique_table_df_revised_path = output_folder + batch_file_path_details + "_summarised_unique_topic_table_" + model_choice_clean + ".csv"
        unique_table_df_revised.to_csv(unique_table_df_revised_path, index = None)
@@ -1807,11 +1893,18 @@ def summarise_output_topics(summarised_references:pd.DataFrame,

        output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])

+        ### Save pivot file to log area
+        reference_table_df_revised_pivot_path = output_folder + batch_file_path_details + "_summarised_reference_table_pivot_" + model_choice_clean + ".csv"
+        reference_table_df_revised_pivot.to_csv(reference_table_df_revised_pivot_path, index=None)
+
+        log_output_files.append(reference_table_df_revised_pivot_path)
+
+        ###
        unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(wrap_text))

        summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)

-        return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
+        return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files

     tic = time.perf_counter()
@@ -1865,4 +1958,4 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
     if latest_summary_completed >= length_all_summaries:
        print("At last summary.")

-    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
+    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
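The new get_basic_response_data helper centralises the response-table simplification that data_file_to_markdown_table previously inlined (note that chosen_cols is annotated List[str] but is used as a single column key). A toy run with invented data:

```python
import pandas as pd

# Copied from the diff above
def get_basic_response_data(file_data: pd.DataFrame, chosen_cols) -> pd.DataFrame:
    basic_response_data = file_data[[chosen_cols]].reset_index(names="Reference")
    basic_response_data["Reference"] = basic_response_data["Reference"].astype(int) + 1
    basic_response_data = basic_response_data.rename(columns={chosen_cols: "Response"})
    basic_response_data["Response"] = basic_response_data["Response"].str.strip()
    return basic_response_data

file_data = pd.DataFrame({"Comment": ["  Great service ", "Too slow", " Fine "]})
print(get_basic_response_data(file_data, "Comment"))
#    Reference       Response
# 0          1  Great service
# 1          2       Too slow
# 2          3           Fine
```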
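The character-cleaning chain in data_file_to_markdown_table is easier to follow on a concrete string. The same pandas calls, standalone (max_comment_character_length is assumed to be 1,500 from the inline comment); note that because the \s+ step collapses all whitespace first, the later \n{2,} replacement never fires as ordered:

```python
import pandas as pd

max_comment_character_length = 1500  # assumed from the "Maximum 1,500 character responses" comment

s = pd.Series(['  “Quoted”\x00 reply\n\n\nwith   gaps  '])
s = s.str.replace(r'[\x00-\x1F\x7F]|[“”<>]|\\', '', regex=True)  # control/special characters and curly quotes
s = s.str.strip()                                                # leading/trailing whitespace
s = s.str.replace(r'\s+', ' ', regex=True)                       # collapse all whitespace runs (including newlines)
s = s.str.replace(r'\n{2,}', '\n', regex=True)                   # no-op here: newlines already collapsed above
s = s.str.slice(0, max_comment_character_length)                 # cap response length
print(s[0])  # Quoted reply with gaps
```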
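convert_reference_table_to_pivot_table drives the new pivot log outputs: pd.crosstab counts, per response reference, how often each General Topic/Subtopic/Sentiment combination was assigned, and the flattened column names read "General Topic - Subtopic - Sentiment". A toy run with invented rows:

```python
import pandas as pd

reference_df = pd.DataFrame({
    "Response References": [1, 1, 2],
    "General Topic": ["Service", "Service", "Price"],
    "Subtopic": ["Speed", "Staff", "Value"],
    "Sentiment": ["Negative", "Positive", "Positive"],
})

pivot_table = pd.crosstab(
    index=reference_df["Response References"],
    columns=[reference_df["General Topic"], reference_df["Subtopic"], reference_df["Sentiment"]],
    margins=True,
)
pivot_table.columns = [" - ".join(col) for col in pivot_table.columns]
print(pivot_table.reset_index())
# One row per Response Reference (plus an "All" margin), with count columns
# like "Price - Value - Positive" and "Service - Speed - Negative".
```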
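Finally, the rebuilt missing-references step in extract_topics: any response number present in the simplified data but absent from the reference table is filtered into a missing-references frame. The same one-liner on toy frames:

```python
import pandas as pd

basic_response_data = pd.DataFrame({"Reference": [1, 2, 3],
                                    "Response": ["a", "b", "c"]})
existing_reference_df = pd.DataFrame({"Response References": ["1", "3"]})

missing_references = basic_response_data[
    ~basic_response_data["Reference"].astype(str).isin(
        existing_reference_df["Response References"].astype(str).unique()
    )
]
print(missing_references)  # only the row with Reference == 2
```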