Commit 854a758
Parent(s): a6d1841

Topic deduplication/merging now separated from summarisation. Gradio upgrade

Browse files
- Dockerfile +1 -1
- app.py +40 -23
- requirements.txt +1 -1
- requirements_aws.txt +1 -1
- requirements_cpu.txt +1 -1
- tools/chatfuncs.py +1 -1
- tools/llm_api_call.py +192 -76
Dockerfile
CHANGED
@@ -26,7 +26,7 @@ RUN rm requirements_aws.txt
 # Stage 2: Final runtime image
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

-# Install system dependencies.
+# Install system dependencies.
 RUN apt-get update \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

app.py
CHANGED
@@ -3,7 +3,7 @@ import socket
 import spaces
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
-from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
+from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 #from tools.aws_functions import load_data_from_aws

@@ -21,6 +21,7 @@ print("host_name is:", host_name)
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
+file_input_height = 150

 print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)

@@ -47,10 +48,10 @@ with app:
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)

-file_data_state = gr.State(pd.DataFrame())
-master_topic_df_state = gr.State(pd.DataFrame())
-master_reference_df_state = gr.State(pd.DataFrame())
-master_unique_topics_df_state = gr.State(pd.DataFrame())
+file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas") #gr.State(pd.DataFrame())

 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()

@@ -66,13 +67,15 @@ with app:
 feedback_s3_logs_loc_state = gr.State(feedback_data_folder)

 # Summary state objects
-summary_reference_table_sample_state = gr.State(pd.DataFrame())
-master_reference_df_revised_summaries_state = gr.State(pd.DataFrame())
-master_unique_topics_df_revised_summaries_state = gr.State(pd.DataFrame())
+summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas") # gr.State(pd.DataFrame())
+master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
+master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas") #gr.State(pd.DataFrame())
 summarised_references_markdown = gr.Markdown("", visible=False)
 summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
 latest_summary_completed_num = gr.Number(0, visible=False)

+unique_topics_table_file_textbox = gr.Textbox(label="unique_topics_table_file_textbox", visible=False)
+
 ###
 # UI LAYOUT
 ###

@@ -99,20 +102,20 @@ with app:
 in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

 with gr.Accordion("Upload xlsx or csv file", open = True):
-in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

 in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
 in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)

 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-candidate_topics = gr.File(label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")
+candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have a single column with a header, and all topic keywords below.")

 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

 extract_topics_btn = gr.Button("Extract topics from open text", variant="primary")

 text_output_summary = gr.Markdown(value="### Language model response will appear here")
-text_output_file = gr.File(label="Output files")
+text_output_file = gr.File(height=file_input_height, label="Output files")
 latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
 # Duplicate version of the above variable for when you don't want to initiate the summarisation loop
 latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)

@@ -126,16 +129,26 @@ with app:
 with gr.Row():
 s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)

-with gr.Tab(label="
+with gr.Tab(label="Deduplicate and summarise topics"):
 gr.Markdown(
 """
 ### Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to summarise the outputs.
 """)
 with gr.Accordion("Upload reference data file and unique data files", open = True):
-summarisation_in_previous_data_files = gr.File(label="
+summarisation_in_previous_data_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
+
+with gr.Row():
+merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
+merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
+deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
+
+deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
+
+duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
 summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
-summary_output_files = gr.File(label="Summarised output files", interactive=False)
+summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
 summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")

 with gr.Tab(label="Continue previous topic extraction"):

@@ -145,28 +158,28 @@ with app:
 """)

 with gr.Accordion("Upload reference data file and unique data files", open = True):
-in_previous_data_files = gr.File(label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
 continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")


-with gr.Tab(label="
+with gr.Tab(label="Topic table viewer"):
 gr.Markdown(
 """
 ### View a 'unique_topic_table' csv file in markdown format.
 """)

-in_view_table = gr.File(label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
+in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
 view_table_markdown = gr.Markdown(value = "", label="View table")

-with gr.Tab(label="
+with gr.Tab(label="Topic extraction settings"):
 gr.Markdown(
 """
 Define settings that affect large language model output.
 """)
 with gr.Accordion("Settings for LLM generation", open = True):
 temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
-batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0)
+batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
 random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

 with gr.Accordion("Prompt settings", open = True):

@@ -178,7 +191,7 @@ with app:
 add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
 add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)

-log_files_output = gr.File(label="Log file output", interactive=False)
+log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
 conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)

 # Invisible text box to hold the session hash/username just for logging purposes

@@ -214,18 +227,22 @@ with app:
 inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state, data_file_names_textbox, total_number_of_batches], api_name="load_data").then(\
 fn=extract_topics,
 inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files], api_name="extract_topics")
+outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files], api_name="extract_topics")

 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
 latest_batch_completed.change(fn=extract_topics,
 inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, text_output_summary, data_file_names_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, text_output_summary, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number],
-outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files]).\
+outputs=[text_output_summary, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, summarisation_in_previous_data_files, duplicate_output_files]).\
 then(fn = reveal_feedback_buttons,
 outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

+# When button pressed, deduplicate data
+deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
+then(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, data_file_names_textbox, unique_topics_table_file_textbox, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold], outputs=[master_reference_df_state, master_unique_topics_df_state, duplicate_output_files])
+
 # When button pressed, summarise previous data
 summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
-then(load_in_previous_data_files, inputs=[
+then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
 then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
 then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])

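The event wiring above relies on Gradio's chained events: each .then() step starts only after the previous step finishes, which is how the new "Deduplicate topics" button can load the uploaded files first and deduplicate them second. Below is a minimal sketch of the pattern, not the app's actual code (load_files and deduplicate are hypothetical stand-ins); it also mirrors the commit's trick of using an invisible gr.Dataframe in place of gr.State.

import gradio as gr
import pandas as pd

def load_files(files):
    # Hypothetical stand-in for load_in_previous_data_files
    return pd.DataFrame({"Subtopic": ["Noise", "noise", "Parking"]})

def deduplicate(df):
    # Hypothetical stand-in for deduplicate_topics: case-normalise, then drop duplicates
    df["Subtopic"] = df["Subtopic"].str.lower().str.capitalize()
    return df.drop_duplicates("Subtopic")

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    # Invisible Dataframe used like gr.State, as in the state objects above
    df_state = gr.Dataframe(value=pd.DataFrame(), visible=False, type="pandas")
    out_table = gr.Dataframe()
    btn = gr.Button("Deduplicate topics")
    # Each .then() runs only after the previous step completes
    btn.click(load_files, inputs=[in_files], outputs=[df_state]).\
        then(deduplicate, inputs=[df_state], outputs=[out_table])

demo.launch()
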
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

requirements_aws.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

requirements_cpu.txt
CHANGED
@@ -1,5 +1,5 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.12.0
 spaces==0.31.0
 boto3==1.35.71
 pyarrow==18.1.0

tools/chatfuncs.py
CHANGED
@@ -50,7 +50,7 @@ reset: bool = True
 stream: bool = False
 threads: int = threads
 batch_size:int = 256
-context_length:int =
+context_length:int = 16384
 sample = True

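For context, context_length is the local model's context window. A hedged sketch of how a value like 16384 is typically passed through, assuming llama-cpp-python as the local backend (the actual loader in tools/chatfuncs.py is outside this diff):

from llama_cpp import Llama  # assumption: llama-cpp-python backend

context_length: int = 16384  # value set by this commit

llm = Llama(
    model_path="model.gguf",  # hypothetical model path
    n_ctx=context_length,     # context window, in tokens
    n_threads=8,              # analogous to the threads setting above
    n_batch=256,              # matches the batch_size default above
)
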
tools/llm_api_call.py
CHANGED
@@ -34,6 +34,12 @@ timeout_wait = 30 # AWS now seems to have a 60 second minimum wait between API calls
 number_of_api_retry_attempts = 5
 max_time_for_loop = 99999
 batch_size_default = 5
+deduplication_threshold = 90
+
+MAX_COMMENT_CHARS = get_or_create_env_var('MAX_COMMENT_CHARS', '14000')
+print(f'The value of MAX_COMMENT_CHARS is {MAX_COMMENT_CHARS}')
+
+max_comment_character_length = int(MAX_COMMENT_CHARS)

 AWS_DEFAULT_REGION = get_or_create_env_var('AWS_DEFAULT_REGION', 'eu-west-2')
 print(f'The value of AWS_DEFAULT_REGION is {AWS_DEFAULT_REGION}')

@@ -104,7 +110,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
 if 'reference_table' in file.name:
 try:
 reference_file_data, reference_file_name = load_in_file(file)
-print("reference_file_data:", reference_file_data.head(2))
+#print("reference_file_data:", reference_file_data.head(2))
 out_message = out_message + " Reference file load successful"
 except Exception as e:
 out_message = "Could not load reference file data:" + str(e)

@@ -113,7 +119,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):
 if 'unique_topics' in file.name:
 try:
 unique_file_data, unique_file_name = load_in_file(file)
-print("unique_topics_file:", unique_file_data.head(2))
+#print("unique_topics_file:", unique_file_data.head(2))
 out_message = out_message + " Unique table file load successful"
 except Exception as e:
 out_message = "Could not load unique table file data:" + str(e)

@@ -132,7 +138,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str]):

 print(out_message)

-return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name
+return reference_file_data, unique_file_data, latest_batch, out_message, reference_file_name, unique_file_name

 def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
 """

@@ -188,7 +194,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_cols: List[str], output_folder: str, batch_number: int, batch_size: int) -> Tuple[str, str, str]:
 simple_file["Response"] = simple_file["Response"].str.strip() # Remove leading and trailing whitespace
 simple_file["Response"] = simple_file["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
 simple_file["Response"] = simple_file["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
-simple_file["Response"] = simple_file["Response"].str.slice(0,
+simple_file["Response"] = simple_file["Response"].str.slice(0, max_comment_character_length) # Truncate responses to the maximum comment character length

 # Remove blank and extremely short responses
 simple_file = simple_file.loc[~(simple_file["Response"].isnull()) &\

@@ -988,7 +994,7 @@ def extract_topics(in_data_file,
 # Check if files and text exist
 out_message = "Please enter a data file to summarise."
 print(out_message)
-return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
+return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message

 #model_choice_clean = replace_punctuation_with_underscore(model_choice)

@@ -1087,7 +1093,7 @@ def extract_topics(in_data_file,
 print("summary_out_file_paths:", summary_out_file_paths)

 #final_out_message = '\n'.join(out_message)
-return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time,
+return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, summary_out_file_paths, summary_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths

 if num_batches > 0:

@@ -1108,7 +1114,7 @@ def extract_topics(in_data_file,
 if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
 out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
 print(out_message)
-return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, out_message
+return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message

 topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
 topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")

@@ -1440,74 +1446,125 @@ def extract_topics(in_data_file,

 print(final_message_out)

-return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths
+return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths

 # SUMMARISATION FUNCTIONS

-def deduplicate_categories(category_series: pd.Series, join_series:pd.Series, threshold: float =
+def deduplicate_categories(category_series: pd.Series, join_series: pd.Series, reference_df: pd.DataFrame, merge_sentiment:str="Yes", threshold: float = deduplication_threshold) -> pd.DataFrame:
 """
-Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold
+Deduplicates similar category names in a pandas Series based on a fuzzy matching threshold,
+merging smaller topics into larger topics.

 Parameters:
 category_series (pd.Series): Series containing category names to deduplicate.
-join_series (pd.Series): Additional series used for joining back to original results
+join_series (pd.Series): Additional series used for joining back to original results.
+reference_df (pd.DataFrame): DataFrame containing the reference data to count occurrences.
 threshold (float): Similarity threshold for considering two strings as duplicates.

 Returns:
 pd.DataFrame: DataFrame with columns ['old_category', 'deduplicated_category'].
 """
+# Count occurrences of each category in the reference_df
+category_counts = reference_df['Subtopic'].value_counts().to_dict()
+
 # Initialize the result dictionary
 deduplication_map = {}
-
-#
+
+# First pass: Handle exact matches
+for category in category_series.unique():
+if category in deduplication_map:
+continue
+
+# Find all exact matches
+exact_matches = category_series[category_series.str.lower() == category.lower()].index.tolist()
+if len(exact_matches) > 1:
+# Find the variant with the highest count
+match_counts = {match: category_counts.get(category_series[match], 0) for match in exact_matches}
+most_common = max(match_counts.items(), key=lambda x: x[1])[0]
+most_common_category = category_series[most_common]
+
+# Map all exact matches to the most common variant
+for match in exact_matches:
+deduplication_map[category_series[match]] = most_common_category
+
+# Second pass: Handle fuzzy matches for remaining categories
 for category in category_series.unique():
 # Skip if the category is already processed
 if category in deduplication_map:
 continue

 # Find close matches to the current category, excluding the current category itself
-matches = process.extract(category,
-
-
+matches = process.extract(category,
+[cat for cat in category_series.unique() if cat != category],
+scorer=fuzz.token_set_ratio,
+score_cutoff=threshold)
+
 if matches: # Check if there are any matches
 best_match = max(matches, key=lambda x: x[1]) # Get the match with the highest score
 match, score, _ = best_match # Unpack the best match
-
-
-
+
+# Compare counts to ensure smaller topics merge into larger ones
+if category_counts.get(category, 0) < category_counts.get(match, 0):
+deduplication_map[category] = match # Map the smaller category to the larger one
+else:
+deduplication_map[match] = category # Map the larger category to the smaller one
+else:
+deduplication_map[category] = category # No match found, keep the category as is
+
 # Create the result DataFrame
-
-
-
-
-
+if merge_sentiment == "Yes":
+result_df = pd.DataFrame({
+'old_category': category_series + " | " + join_series,
+'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
+})
+else:
+result_df = pd.DataFrame({
+'old_category': category_series + " | " + join_series,
+'deduplicated_category': category_series.map(lambda x: deduplication_map.get(x, x))
+})
+
 return result_df

-def
-
-
-
-
-
+def deduplicate_topics(reference_df,
+unique_topics_df,
+reference_table_file_name:str,
+unique_topics_table_file_name:str,
+merge_sentiment:str= "No",
+merge_general_topics:str="No",
+score_threshold:int=deduplication_threshold,
+deduplicate_topics:str="Yes"):
+'''
+Deduplicate topics based on a reference and unique topics table
+'''
+output_files = []
+
+reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
+unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)

-#
+# Run through this x times to try to get all duplicate topics
 if deduplicate_topics == "Yes":
+for i in range(0, 5):
+#print("Deduplication run:", i)

-
-for i in range(0, 3):
-print("Run:", i)
-# First, combine duplicate topics in reference_df
-reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+#reference_df_unique[["old_category"]].to_csv(output_folder + "reference_df_unique_old_categories_" + str(i) + ".csv", index=None)

-
-
-
-deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
-lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], threshold=80)
-).reset_index(drop=True) # Reset index after groupby
+if merge_sentiment == "No":
+# First, combine duplicate topics in reference_df
+reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+reference_df_unique = reference_df.drop_duplicates("old_category")
+
+# Deduplicate categories within each sentiment group
+deduplicated_topic_map_df = reference_df_unique.groupby("Sentiment").apply(
+lambda group: deduplicate_categories(group["Subtopic"], group["Sentiment"], reference_df, threshold=score_threshold)
+).reset_index(drop=True) # Reset index after groupby
+else:
+# Deduplicate categories by subtopic name only
+# First, combine duplicate topics in reference_df
+reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
+reference_df_unique = reference_df.drop_duplicates("old_category")
+
+deduplicated_topic_map_df = deduplicate_categories(reference_df_unique["Subtopic"], reference_df_unique["Sentiment"], reference_df, merge_sentiment=merge_sentiment, threshold=score_threshold).reset_index(drop=True)

 if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
 # Check if 'deduplicated_category' contains any values

@@ -1515,10 +1572,11 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,

 else:
 # Join deduplicated columns back to original df
+deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
 # Remove rows where 'deduplicated_category' is blank or NaN
-deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()),
+deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category']]

-
+deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)

 reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")

@@ -1541,9 +1599,65 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
 reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
 reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()

+if merge_general_topics == "Yes":
+# Replace General topic names for each Subtopic with that for the Subtopic with the most responses
+# Step 1: Count the number of occurrences for each General Topic and Subtopic combination
+count_df = reference_df.groupby(['Subtopic', 'General Topic']).size().reset_index(name='Count')
+
+# Step 2: Find the General Topic with the maximum count for each Subtopic
+max_general_topic = count_df.loc[count_df.groupby('Subtopic')['Count'].idxmax()]
+
+# Step 3: Map the General Topic back to the original DataFrame
+reference_df = reference_df.merge(max_general_topic[['Subtopic', 'General Topic']], on='Subtopic', suffixes=('', '_max'), how='left')
+
+reference_df['General Topic'] = reference_df["General Topic_max"].combine_first(reference_df["General Topic"])
+
+if merge_sentiment == "Yes":
+# Step 1: Count the number of occurrences for each General Topic and Subtopic combination
+count_df = reference_df.groupby(['Subtopic', 'Sentiment']).size().reset_index(name='Count')
+
+# Step 2: Determine the number of unique Sentiment values for each Subtopic
+unique_sentiments = count_df.groupby('Subtopic')['Sentiment'].nunique().reset_index(name='UniqueCount')
+
+# Step 3: Update Sentiment to 'Mixed' where there is more than one unique sentiment
+reference_df = reference_df.merge(unique_sentiments, on='Subtopic', how='left')
+reference_df['Sentiment'] = reference_df.apply(
+lambda row: 'Mixed' if row['UniqueCount'] > 1 else row['Sentiment'],
+axis=1
+)
+
+# Clean up the DataFrame by dropping the UniqueCount column
+reference_df.drop(columns=['UniqueCount'], inplace=True)
+
+reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
+
 # Remake unique_topics_df based on new reference_df
 unique_topics_df = create_unique_table_df_from_reference_table(reference_df)

+reference_table_file_name_no_ext = get_file_path_end(reference_table_file_name)
+unique_topics_table_file_name_no_ext = get_file_path_end(unique_topics_table_file_name)
+
+reference_file_path = output_folder + reference_table_file_name_no_ext + "_dedup.csv"
+unique_topics_file_path = output_folder + unique_topics_table_file_name_no_ext + "_dedup.csv"
+reference_df.to_csv(reference_file_path, index = None)
+unique_topics_df.to_csv(unique_topics_file_path, index=None)
+
+output_files.append(reference_file_path)
+output_files.append(unique_topics_file_path)
+
+return reference_df, unique_topics_df, output_files
+
+def sample_reference_table_summaries(reference_df:pd.DataFrame,
+unique_topics_df:pd.DataFrame,
+random_seed:int,
+no_of_sampled_summaries:int=150):
+
+'''
+Sample x number of summaries from which to produce summaries, so that the input token length is not too long.
+'''
+
+all_summaries = pd.DataFrame()
+output_files = []

 reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])

@@ -1629,6 +1743,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 out_metadata_str:str = "",
 output_files:list = [],
 summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
+do_summaries="Yes",
 progress=gr.Progress(track_tqdm=True)):
 '''
 Create better summaries of the raw batch-level summaries created in the first run of the model.

@@ -1711,39 +1826,40 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
 summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")

-summarised_outputs.append(summarised_output)
-out_metadata.extend(metadata)
-out_metadata_str = '. '.join(out_metadata)
-
-latest_summary_completed += 1
-
-# Check if beyond max time allowed for processing and break if necessary
-toc = time.perf_counter()
-time_taken = tic - toc
+if do_summaries == "Yes":
+for summary_no in summary_loop:
+
+print("Current summary number is:", summary_no)
+
+summary_text = all_summaries[summary_no]
+#print("summary_text:", summary_text)
+formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]
+
+try:
+response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
+summarised_output = response
+summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
+summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
+summarised_output = summarised_output.strip()
+except Exception as e:
+print(e)
+summarised_output = ""
+
+summarised_outputs.append(summarised_output)
+out_metadata.extend(metadata)
+out_metadata_str = '. '.join(out_metadata)
+
+latest_summary_completed += 1
+
+# Check if beyond max time allowed for processing and break if necessary
+toc = time.perf_counter()
+time_taken = tic - toc
+
+if time_taken > max_time_for_loop:
+print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
+summary_loop.close()
+tqdm._instances.clear()
+break

 # If all summaries completed
 if latest_summary_completed >= length_all_summaries: