Spaces:
Runtime error
Runtime error
Merge pull request #2 from seanpedrick-case/dev
Browse files
Various improvements to zero shot topic modelling/categorisation. Varied sentiment and summary options. More resilient LLM calls.
- app.py +89 -55
- requirements.txt +1 -1
- requirements_aws.txt +1 -1
- requirements_gpu.txt +1 -1
- tools/helper_functions.py +16 -4
- tools/llm_api_call.py +0 -0
- tools/prompts.py +25 -11
app.py
CHANGED
@@ -3,7 +3,7 @@ import socket
|
|
3 |
import spaces
|
4 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
|
5 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
6 |
-
from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics
|
7 |
from tools.auth import authenticate_user
|
8 |
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
|
9 |
#from tools.aws_functions import load_data_from_aws
|
@@ -44,14 +44,20 @@ with app:
|
|
44 |
###
|
45 |
|
46 |
text_output_file_list_state = gr.State([])
|
|
|
47 |
log_files_output_list_state = gr.State([])
|
48 |
first_loop_state = gr.State(True)
|
49 |
second_loop_state = gr.State(False)
|
|
|
50 |
|
51 |
-
file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas")
|
52 |
-
master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas")
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
|
56 |
session_hash_state = gr.State()
|
57 |
s3_output_folder_state = gr.State()
|
@@ -67,14 +73,15 @@ with app:
|
|
67 |
feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
|
68 |
|
69 |
# Summary state objects
|
70 |
-
summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas")
|
71 |
-
master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas")
|
72 |
-
master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
|
73 |
summarised_references_markdown = gr.Markdown("", visible=False)
|
74 |
summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
|
75 |
latest_summary_completed_num = gr.Number(0, visible=False)
|
76 |
|
77 |
-
|
|
|
78 |
|
79 |
###
|
80 |
# UI LAYOUT
|
@@ -105,17 +112,20 @@ with app:
|
|
105 |
in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
106 |
|
107 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
|
108 |
-
in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest.", allow_custom_value=True, interactive=True)
|
109 |
|
110 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
111 |
-
candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have
|
|
|
112 |
|
113 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
114 |
|
115 |
-
|
|
|
|
|
116 |
|
117 |
-
|
118 |
-
|
119 |
latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
120 |
# Duplicate version of the above variable for when you don't want to initiate the summarisation loop
|
121 |
latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
@@ -129,14 +139,28 @@ with app:
|
|
129 |
with gr.Row():
|
130 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
131 |
|
132 |
-
with gr.Tab(label="
|
133 |
gr.Markdown(
|
134 |
"""
|
135 |
-
|
136 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
140 |
|
141 |
with gr.Row():
|
142 |
merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
@@ -144,17 +168,21 @@ with app:
|
|
144 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
145 |
|
146 |
deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
149 |
|
150 |
summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
|
151 |
summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
|
152 |
-
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
|
153 |
|
154 |
-
with gr.Tab(label="Continue
|
155 |
gr.Markdown(
|
156 |
"""
|
157 |
-
### Load in
|
158 |
""")
|
159 |
|
160 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
@@ -170,7 +198,7 @@ with app:
|
|
170 |
""")
|
171 |
|
172 |
in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
|
173 |
-
view_table_markdown = gr.Markdown(value = "", label="View table")
|
174 |
|
175 |
with gr.Tab(label="Topic extraction settings"):
|
176 |
gr.Markdown(
|
@@ -183,7 +211,7 @@ with app:
|
|
183 |
random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
|
184 |
|
185 |
with gr.Accordion("Prompt settings", open = True):
|
186 |
-
number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3)
|
187 |
system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
|
188 |
initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
|
189 |
prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)
|
@@ -196,7 +224,7 @@ with app:
|
|
196 |
|
197 |
# Invisible text box to hold the session hash/username just for logging purposes
|
198 |
session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
|
199 |
-
|
200 |
estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
201 |
total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
|
202 |
|
@@ -220,40 +248,46 @@ with app:
|
|
220 |
###
|
221 |
|
222 |
# Tabular data upload
|
223 |
-
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets,
|
224 |
-
|
225 |
-
extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state,
|
226 |
-
|
227 |
-
inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state,
|
228 |
-
|
229 |
-
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state,
|
230 |
-
outputs=[
|
231 |
|
232 |
-
# return display_table, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths
|
233 |
|
234 |
# If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
|
235 |
-
latest_batch_completed.change(fn=extract_topics,
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
# When button pressed, deduplicate data
|
242 |
-
deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[
|
243 |
-
|
244 |
|
245 |
# When button pressed, summarise previous data
|
246 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
|
251 |
-
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide,
|
252 |
|
253 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
254 |
continue_previous_data_files_btn.click(
|
255 |
-
load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number], outputs = [file_data_state,
|
256 |
-
|
257 |
|
258 |
###
|
259 |
# LOGGING AND ON APP LOAD FUNCTIONS
|
@@ -264,21 +298,21 @@ with app:
|
|
264 |
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
265 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
266 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
267 |
-
|
268 |
|
269 |
# Log usage usage when making a query
|
270 |
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
271 |
-
usage_callback.setup([session_hash_textbox,
|
272 |
|
273 |
-
conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox,
|
274 |
-
|
275 |
|
276 |
# User submitted feedback
|
277 |
feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
278 |
-
feedback_callback.setup([data_feedback_radio, data_further_details_text,
|
279 |
|
280 |
-
data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text,
|
281 |
-
|
282 |
|
283 |
in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
|
284 |
|
|
|
3 |
import spaces
|
4 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
|
5 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
6 |
+
from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
|
7 |
from tools.auth import authenticate_user
|
8 |
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
|
9 |
#from tools.aws_functions import load_data_from_aws
|
|
|
44 |
###
|
45 |
|
46 |
text_output_file_list_state = gr.State([])
|
47 |
+
text_output_modify_file_list_state = gr.State([])
|
48 |
log_files_output_list_state = gr.State([])
|
49 |
first_loop_state = gr.State(True)
|
50 |
second_loop_state = gr.State(False)
|
51 |
+
modified_unique_table_change_bool = gr.State(True) # This boolean is used to flag whether a file upload should change just the modified unique table object on the second tab
|
52 |
|
53 |
+
file_data_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="file_data_state", visible=False, type="pandas")
|
54 |
+
master_topic_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_topic_df_state", visible=False, type="pandas")
|
55 |
+
master_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_state", visible=False, type="pandas")
|
56 |
+
master_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_state", visible=False, type="pandas")
|
57 |
+
|
58 |
+
master_modify_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_unique_topics_df_state", visible=False, type="pandas")
|
59 |
+
master_modify_reference_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_modify_reference_df_state", visible=False, type="pandas")
|
60 |
+
|
61 |
|
62 |
session_hash_state = gr.State()
|
63 |
s3_output_folder_state = gr.State()
|
|
|
73 |
feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
|
74 |
|
75 |
# Summary state objects
|
76 |
+
summary_reference_table_sample_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summary_reference_table_sample_state", visible=False, type="pandas")
|
77 |
+
master_reference_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_reference_df_revised_summaries_state", visible=False, type="pandas")
|
78 |
+
master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
|
79 |
summarised_references_markdown = gr.Markdown("", visible=False)
|
80 |
summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
|
81 |
latest_summary_completed_num = gr.Number(0, visible=False)
|
82 |
|
83 |
+
reference_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
|
84 |
+
unique_topics_table_file_name_textbox = gr.Textbox(label="Unique topics data file name textbox", visible=False)
|
85 |
|
86 |
###
|
87 |
# UI LAYOUT
|
|
|
112 |
in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
113 |
|
114 |
in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
|
115 |
+
in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
116 |
|
117 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
118 |
+
candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
|
119 |
+
force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
|
120 |
|
121 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
122 |
|
123 |
+
sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
|
124 |
+
|
125 |
+
extract_topics_btn = gr.Button("Extract topics", variant="primary")
|
126 |
|
127 |
+
topic_extraction_output_files = gr.File(height=file_input_height, label="Output files")
|
128 |
+
display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
|
129 |
latest_batch_completed = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
130 |
# Duplicate version of the above variable for when you don't want to initiate the summarisation loop
|
131 |
latest_batch_completed_no_loop = gr.Number(value=0, label="Number of files prepared", interactive=False, visible=False)
|
|
|
139 |
with gr.Row():
|
140 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
141 |
|
142 |
+
with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
|
143 |
gr.Markdown(
|
144 |
"""
|
145 |
+
Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
|
146 |
""")
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
+
with gr.Accordion("Modify existing topics", open = False):
|
151 |
+
modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
152 |
+
|
153 |
+
modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
|
154 |
+
|
155 |
+
save_modified_files_button = gr.Button(value="Save modified topic names")
|
156 |
+
|
157 |
+
|
158 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
159 |
+
|
160 |
+
|
161 |
+
### DEDUPLICATION
|
162 |
+
deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
163 |
+
deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
164 |
|
165 |
with gr.Row():
|
166 |
merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
|
|
168 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
169 |
|
170 |
deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
|
171 |
+
|
172 |
+
|
173 |
+
### SUMMARISATION
|
174 |
+
summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
175 |
+
|
176 |
+
summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
|
177 |
|
178 |
summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
|
179 |
summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
|
180 |
+
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
|
181 |
|
182 |
+
with gr.Tab(label="Continue unfinished topic extraction"):
|
183 |
gr.Markdown(
|
184 |
"""
|
185 |
+
### Load in output files from a previous topic extraction process and continue topic extraction with new data.
|
186 |
""")
|
187 |
|
188 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
|
|
198 |
""")
|
199 |
|
200 |
in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
|
201 |
+
view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
|
202 |
|
203 |
with gr.Tab(label="Topic extraction settings"):
|
204 |
gr.Markdown(
|
|
|
211 |
random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)
|
212 |
|
213 |
with gr.Accordion("Prompt settings", open = True):
|
214 |
+
number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
|
215 |
system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
|
216 |
initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
|
217 |
prompt_2_textbox = gr.Textbox(label = "Prompt 2", lines = 8, value = prompt2, visible=False)
|
|
|
224 |
|
225 |
# Invisible text box to hold the session hash/username just for logging purposes
|
226 |
session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
|
227 |
+
|
228 |
estimated_time_taken_number = gr.Number(label= "Estimated time taken (seconds)", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
229 |
total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
|
230 |
|
|
|
248 |
###
|
249 |
|
250 |
# Tabular data upload
|
251 |
+
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
|
252 |
+
|
253 |
+
extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
|
254 |
+
success(load_in_data_file,
|
255 |
+
inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
|
256 |
+
success(fn=extract_topics,
|
257 |
+
inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
|
258 |
+
outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
|
259 |
|
|
|
260 |
|
261 |
# If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
|
262 |
+
# latest_batch_completed.change(fn=extract_topics,
|
263 |
+
# inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio],
|
264 |
+
# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
|
265 |
+
# success(fn = reveal_feedback_buttons,
|
266 |
+
# outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)
|
267 |
+
|
268 |
+
# If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
|
269 |
+
modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
|
270 |
+
|
271 |
+
|
272 |
+
# Modify output table with custom topic names
|
273 |
+
save_modified_files_button.click(fn=modify_existing_output_tables, inputs=[master_modify_unique_topics_df_state, modifiable_unique_topics_df_state, master_modify_reference_df_state, text_output_modify_file_list_state], outputs=[master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, deduplication_input_files, summarisation_input_files, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, summarised_output_markdown])
|
274 |
|
275 |
# When button pressed, deduplicate data
|
276 |
+
deduplicate_previous_data_btn.click(load_in_previous_data_files, inputs=[deduplication_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
277 |
+
success(deduplicate_topics, inputs=[master_reference_df_state, master_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, in_excel_sheets, merge_sentiment_drop, merge_general_topics_drop, deduplicate_score_threshold, in_data_files, in_colnames], outputs=[master_reference_df_state, master_unique_topics_df_state, summarisation_input_files, log_files_output, summarised_output_markdown], scroll_to_output=True)
|
278 |
|
279 |
# When button pressed, summarise previous data
|
280 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
|
281 |
+
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
282 |
+
success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
|
283 |
+
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
|
284 |
|
285 |
+
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output], scroll_to_output=True)
|
286 |
|
287 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
288 |
continue_previous_data_files_btn.click(
|
289 |
+
load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
|
290 |
+
success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
|
291 |
|
292 |
###
|
293 |
# LOGGING AND ON APP LOAD FUNCTIONS
|
|
|
298 |
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
299 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
300 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
301 |
+
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
302 |
|
303 |
# Log usage usage when making a query
|
304 |
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
305 |
+
usage_callback.setup([session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], usage_data_folder)
|
306 |
|
307 |
+
conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, reference_data_file_name_textbox, model_choice, conversation_metadata_textbox, estimated_time_taken_number], None, preprocess=False).\
|
308 |
+
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
309 |
|
310 |
# User submitted feedback
|
311 |
feedback_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
312 |
+
feedback_callback.setup([data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], feedback_data_folder)
|
313 |
|
314 |
+
data_submit_feedback_btn.click(lambda *args: feedback_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, reference_data_file_name_textbox, model_choice, temperature_slide, display_topic_table_markdown, conversation_metadata_textbox], None, preprocess=False).\
|
315 |
+
success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
316 |
|
317 |
in_view_table.upload(view_table, inputs=[in_view_table], outputs=[view_table_markdown])
|
318 |
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.20.1
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
requirements_aws.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.20.1
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
requirements_gpu.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.20.1
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
tools/helper_functions.py
CHANGED
@@ -15,8 +15,11 @@ def empty_output_vars_extract_topics():
|
|
15 |
log_files_output_list_state = []
|
16 |
conversation_metadata_textbox = ""
|
17 |
estimated_time_taken_number = 0
|
|
|
|
|
|
|
18 |
|
19 |
-
return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number
|
20 |
|
21 |
def empty_output_vars_summarise():
|
22 |
# Empty output objects before summarising files
|
@@ -106,22 +109,31 @@ def detect_file_type(filename):
|
|
106 |
else:
|
107 |
raise ValueError("Unsupported file type.")
|
108 |
|
109 |
-
def read_file(filename):
|
110 |
"""Read the file based on its detected type."""
|
111 |
file_type = detect_file_type(filename)
|
112 |
|
113 |
if file_type == 'csv':
|
114 |
return pd.read_csv(filename, low_memory=False)
|
115 |
elif file_type == 'xlsx':
|
116 |
-
|
|
|
|
|
|
|
117 |
elif file_type == 'parquet':
|
118 |
return pd.read_parquet(filename)
|
119 |
|
120 |
# Wrap text in each column to the specified max width, including whole words
|
121 |
-
def wrap_text(text, max_width=60):
|
122 |
if not isinstance(text, str):
|
123 |
return text
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
words = text.split()
|
126 |
if not words:
|
127 |
return text
|
|
|
15 |
log_files_output_list_state = []
|
16 |
conversation_metadata_textbox = ""
|
17 |
estimated_time_taken_number = 0
|
18 |
+
file_data_state = pd.DataFrame()
|
19 |
+
reference_data_file_name_textbox = ""
|
20 |
+
display_topic_table_markdown = ""
|
21 |
|
22 |
+
return master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown
|
23 |
|
24 |
def empty_output_vars_summarise():
|
25 |
# Empty output objects before summarising files
|
|
|
109 |
else:
|
110 |
raise ValueError("Unsupported file type.")
|
111 |
|
112 |
+
def read_file(filename:str, sheet:str=""):
|
113 |
"""Read the file based on its detected type."""
|
114 |
file_type = detect_file_type(filename)
|
115 |
|
116 |
if file_type == 'csv':
|
117 |
return pd.read_csv(filename, low_memory=False)
|
118 |
elif file_type == 'xlsx':
|
119 |
+
if sheet:
|
120 |
+
return pd.read_excel(filename, sheet_name=sheet)
|
121 |
+
else:
|
122 |
+
return pd.read_excel(filename)
|
123 |
elif file_type == 'parquet':
|
124 |
return pd.read_parquet(filename)
|
125 |
|
126 |
# Wrap text in each column to the specified max width, including whole words
|
127 |
+
def wrap_text(text:str, max_width=60, max_text_length=None):
|
128 |
if not isinstance(text, str):
|
129 |
return text
|
130 |
|
131 |
+
# If max_text_length is set, truncate the text and add ellipsis
|
132 |
+
if max_text_length and len(text) > max_text_length:
|
133 |
+
text = text[:max_text_length] + '...'
|
134 |
+
|
135 |
+
text = text.replace('\r\n', '<br>').replace('\n', '<br>')
|
136 |
+
|
137 |
words = text.split()
|
138 |
if not words:
|
139 |
return text
|
tools/llm_api_call.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tools/prompts.py
CHANGED
@@ -5,14 +5,16 @@ initial_table_prompt = """The open text data is shown in the following table tha
|
|
5 |
|
6 |
Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
7 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
8 |
-
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned.
|
9 |
-
|
10 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
11 |
-
In the fifth
|
12 |
-
Do not add any other columns. Do not
|
13 |
|
14 |
New table:"""
|
15 |
|
|
|
|
|
16 |
prompt2 = ""
|
17 |
|
18 |
prompt3 = ""
|
@@ -21,6 +23,12 @@ prompt3 = ""
|
|
21 |
|
22 |
add_existing_topics_system_prompt = system_prompt
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
25 |
{response_table}
|
26 |
|
@@ -28,16 +36,16 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
|
|
28 |
{topics}
|
29 |
|
30 |
Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
|
31 |
-
|
32 |
-
|
33 |
-
In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive.
|
34 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
35 |
-
In the fifth
|
36 |
-
Do not add any other columns.
|
37 |
-
Return only one table in markdown format containing all relevant topics. Do not add any other text, thoughts, or notes to your response.
|
38 |
|
39 |
New table:"""
|
40 |
|
|
|
|
|
41 |
|
42 |
summarise_topic_descriptions_system_prompt = system_prompt
|
43 |
|
@@ -45,11 +53,13 @@ summarise_topic_descriptions_prompt = """Below is a table with number of paragra
|
|
45 |
|
46 |
'{summaries}'
|
47 |
|
48 |
-
Your task is to make a consolidated summary of the above text.
|
49 |
|
50 |
Summary:"""
|
51 |
|
52 |
|
|
|
|
|
53 |
create_general_topics_system_prompt = system_prompt
|
54 |
|
55 |
create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
|
@@ -60,6 +70,10 @@ Your task is to create a General Topic name for each Subtopic. The new Topics ta
|
|
60 |
New Topics table:"""
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
63 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
64 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|
65 |
# Summarise the following text in less than {length} words: "{text}"\n
|
|
|
5 |
|
6 |
Your task is to create one new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
7 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
8 |
+
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
|
9 |
+
{sentiment_choices}.
|
10 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
11 |
+
In the fifth column, write a short summary of the subtopic based on relevant responses - highlight specific issues that appear.
|
12 |
+
Do not add any other columns. Do not add any other text to your response.
|
13 |
|
14 |
New table:"""
|
15 |
|
16 |
+
# Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
|
17 |
+
|
18 |
prompt2 = ""
|
19 |
|
20 |
prompt3 = ""
|
|
|
23 |
|
24 |
add_existing_topics_system_prompt = system_prompt
|
25 |
|
26 |
+
force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response references', and 'Summary'.
|
27 |
+
In the first column, write 'Not assessed'. In the second column, assign Subtopics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
|
28 |
+
|
29 |
+
allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response references', and 'Summary'.
|
30 |
+
In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
|
31 |
+
|
32 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
33 |
{response_table}
|
34 |
|
|
|
36 |
{topics}
|
37 |
|
38 |
Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
|
39 |
+
{topic_assignment}
|
40 |
+
{sentiment_choices}.
|
|
|
41 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
42 |
+
In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
|
43 |
+
Do not add any other columns. Do not add any other text to your response.
|
|
|
44 |
|
45 |
New table:"""
|
46 |
|
47 |
+
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
48 |
+
|
49 |
|
50 |
summarise_topic_descriptions_system_prompt = system_prompt
|
51 |
|
|
|
53 |
|
54 |
'{summaries}'
|
55 |
|
56 |
+
Your task is to make a consolidated summary of the above text. {summary_format}. Return only the summary and no other text.
|
57 |
|
58 |
Summary:"""
|
59 |
|
60 |
|
61 |
+
## The following didn't work well in testing and so is not currently used
|
62 |
+
|
63 |
create_general_topics_system_prompt = system_prompt
|
64 |
|
65 |
create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
|
|
|
70 |
New Topics table:"""
|
71 |
|
72 |
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
78 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|
79 |
# Summarise the following text in less than {length} words: "{text}"\n
|