Spaces:
Runtime error
Runtime error
Merge pull request #1 from seanpedrick-case/dev
Browse files
- app.py +4 -4
- requirements.txt +2 -2
- requirements_aws.txt +2 -2
- requirements_gpu.txt +2 -2
- tools/llm_api_call.py +32 -10
app.py
CHANGED
@@ -139,15 +139,15 @@ with app:
|
|
139 |
summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
140 |
|
141 |
with gr.Row():
|
142 |
-
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
143 |
merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
|
|
144 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
145 |
|
146 |
deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
|
147 |
|
148 |
duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
149 |
|
150 |
-
summarise_previous_data_btn = gr.Button("Summarise
|
151 |
summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
|
152 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
|
153 |
|
@@ -246,9 +246,9 @@ with app:
|
|
246 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
|
247 |
then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
|
248 |
then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
|
249 |
-
then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
|
250 |
|
251 |
-
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
|
252 |
|
253 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
254 |
continue_previous_data_files_btn.click(
|
|
|
139 |
summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
140 |
|
141 |
with gr.Row():
|
|
|
142 |
merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
143 |
+
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
144 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
145 |
|
146 |
deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
|
147 |
|
148 |
duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
149 |
|
150 |
+
summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
|
151 |
summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
|
152 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
|
153 |
|
|
|
246 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
|
247 |
then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
|
248 |
then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
|
249 |
+
then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
|
250 |
|
251 |
+
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
|
252 |
|
253 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
254 |
continue_previous_data_files_btn.click(
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
@@ -13,6 +13,6 @@ beautifulsoup4==4.12.3
|
|
13 |
rapidfuzz==3.10.1
|
14 |
torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
|
15 |
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
16 |
-
transformers==4.
|
17 |
numpy==1.26.4
|
18 |
typing_extensions==4.12.2
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.18.0
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
13 |
rapidfuzz==3.10.1
|
14 |
torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
|
15 |
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
16 |
+
transformers==4.49.0
|
17 |
numpy==1.26.4
|
18 |
typing_extensions==4.12.2
|
requirements_aws.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
@@ -12,6 +12,6 @@ html5lib==1.1
|
|
12 |
beautifulsoup4==4.12.3
|
13 |
rapidfuzz==3.10.1
|
14 |
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
15 |
-
transformers==4.
|
16 |
numpy==1.26.4
|
17 |
typing_extensions==4.12.2
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.18.0
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
12 |
beautifulsoup4==4.12.3
|
13 |
rapidfuzz==3.10.1
|
14 |
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
15 |
+
transformers==4.49.0
|
16 |
numpy==1.26.4
|
17 |
typing_extensions==4.12.2
|
requirements_gpu.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
pandas==2.2.3
|
2 |
-
gradio==5.
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
@@ -15,6 +15,6 @@ torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
|
|
15 |
#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
16 |
# Specify exact llama_cpp wheel for huggingface compatibility
|
17 |
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
|
18 |
-
transformers==4.
|
19 |
numpy==1.26.4
|
20 |
typing_extensions==4.12.2
|
|
|
1 |
pandas==2.2.3
|
2 |
+
gradio==5.18.0
|
3 |
spaces==0.31.0
|
4 |
boto3==1.35.71
|
5 |
pyarrow==18.1.0
|
|
|
15 |
#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
16 |
# Specify exact llama_cpp wheel for huggingface compatibility
|
17 |
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
|
18 |
+
transformers==4.49.0
|
19 |
numpy==1.26.4
|
20 |
typing_extensions==4.12.2
|
tools/llm_api_call.py
CHANGED
@@ -196,11 +196,11 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
|
|
196 |
|
197 |
# Remove problematic characters including ASCII and various quote marks
|
198 |
# Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
|
199 |
-
batch_basic_response_data["Response"]
|
200 |
-
batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.strip() # Remove leading and trailing whitespace
|
201 |
-
batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
|
202 |
-
batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
|
203 |
-
batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
|
204 |
|
205 |
# Remove blank and extremely short responses
|
206 |
batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
|
@@ -855,6 +855,12 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
855 |
# Remove duplicate Response references for the same topic
|
856 |
out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
857 |
|
|
|
|
|
|
|
|
|
|
|
|
|
858 |
out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
859 |
|
860 |
# Save the new DataFrame to CSV
|
@@ -1817,7 +1823,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1817 |
out_metadata_str:str = "",
|
1818 |
in_data_files:List[str]=[],
|
1819 |
chosen_cols:List[str]=[],
|
1820 |
-
|
|
|
1821 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
1822 |
do_summaries="Yes",
|
1823 |
progress=gr.Progress(track_tqdm=True)):
|
@@ -1826,7 +1833,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1826 |
'''
|
1827 |
out_metadata = []
|
1828 |
local_model = []
|
1829 |
-
log_output_files = []
|
1830 |
summarised_output_markdown = ""
|
1831 |
|
1832 |
print("In summarise_output_topics function.")
|
@@ -1835,12 +1841,23 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1835 |
|
1836 |
length_all_summaries = len(all_summaries)
|
1837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1838 |
# Load in data file and chosen columns if exists to create pivot table later
|
1839 |
if in_data_files and chosen_cols:
|
1840 |
file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
|
1841 |
-
|
1842 |
-
|
1843 |
-
|
|
|
|
|
1844 |
|
1845 |
# If all summaries completed, make final outputs
|
1846 |
if latest_summary_completed >= length_all_summaries:
|
@@ -1866,6 +1883,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1866 |
summarised_references_j = summarised_references[join_plus_summary_cols].drop_duplicates(join_plus_summary_cols)
|
1867 |
|
1868 |
unique_table_df_revised = unique_table_df.merge(summarised_references_j, on = join_cols, how = "left")
|
|
|
1869 |
# If no new summary is available, keep the original
|
1870 |
unique_table_df_revised["Revised summary"] = unique_table_df_revised["Revised summary"].combine_first(unique_table_df_revised["Summary"])
|
1871 |
|
@@ -1904,6 +1922,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
1904 |
|
1905 |
summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
|
1906 |
|
|
|
|
|
|
|
|
|
1907 |
return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
1908 |
|
1909 |
tic = time.perf_counter()
|
|
|
196 |
|
197 |
# Remove problematic characters including ASCII and various quote marks
|
198 |
# Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
|
199 |
+
batch_basic_response_data.loc[:, "Response"]= batch_basic_response_data["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
|
200 |
+
batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.strip() # Remove leading and trailing whitespace
|
201 |
+
batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
|
202 |
+
batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
|
203 |
+
batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
|
204 |
|
205 |
# Remove blank and extremely short responses
|
206 |
batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
|
|
|
855 |
# Remove duplicate Response references for the same topic
|
856 |
out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
857 |
|
858 |
+
# Try converting response references column to int, keep as string if fails
|
859 |
+
try:
|
860 |
+
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
|
861 |
+
except Exception as e:
|
862 |
+
print("Could not convert Response References column to integer due to", e)
|
863 |
+
|
864 |
out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
|
865 |
|
866 |
# Save the new DataFrame to CSV
|
|
|
1823 |
out_metadata_str:str = "",
|
1824 |
in_data_files:List[str]=[],
|
1825 |
chosen_cols:List[str]=[],
|
1826 |
+
log_output_files:list[str]=[],
|
1827 |
+
output_files:list[str] = [],
|
1828 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
1829 |
do_summaries="Yes",
|
1830 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
1833 |
'''
|
1834 |
out_metadata = []
|
1835 |
local_model = []
|
|
|
1836 |
summarised_output_markdown = ""
|
1837 |
|
1838 |
print("In summarise_output_topics function.")
|
|
|
1841 |
|
1842 |
length_all_summaries = len(all_summaries)
|
1843 |
|
1844 |
+
# Check for data for summarisations
|
1845 |
+
if not unique_table_df.empty and not reference_table_df.empty:
|
1846 |
+
print("Unique table and reference table data found.")
|
1847 |
+
else:
|
1848 |
+
out_message = "Please upload a unique topic table and reference table file to continue with summarisation."
|
1849 |
+
print(out_message)
|
1850 |
+
raise(out_message)
|
1851 |
+
return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
1852 |
+
|
1853 |
# Load in data file and chosen columns if exists to create pivot table later
|
1854 |
if in_data_files and chosen_cols:
|
1855 |
file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
|
1856 |
+
else:
|
1857 |
+
out_message = "No file data found, please load a data file on the first tab and select a column."
|
1858 |
+
print(out_message)
|
1859 |
+
raise(out_message)
|
1860 |
+
return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
1861 |
|
1862 |
# If all summaries completed, make final outputs
|
1863 |
if latest_summary_completed >= length_all_summaries:
|
|
|
1883 |
summarised_references_j = summarised_references[join_plus_summary_cols].drop_duplicates(join_plus_summary_cols)
|
1884 |
|
1885 |
unique_table_df_revised = unique_table_df.merge(summarised_references_j, on = join_cols, how = "left")
|
1886 |
+
|
1887 |
# If no new summary is available, keep the original
|
1888 |
unique_table_df_revised["Revised summary"] = unique_table_df_revised["Revised summary"].combine_first(unique_table_df_revised["Summary"])
|
1889 |
|
|
|
1922 |
|
1923 |
summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
|
1924 |
|
1925 |
+
# Ensure same file name not returned twice
|
1926 |
+
output_files = list(set(output_files))
|
1927 |
+
log_output_files = list(set(log_output_files))
|
1928 |
+
|
1929 |
return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
|
1930 |
|
1931 |
tic = time.perf_counter()
|