Sean Pedrick-Case committed on
Commit
fd8dddc
·
unverified ·
2 Parent(s): b0e08c8 64ffd3a

Merge pull request #1 from seanpedrick-case/dev

Browse files
app.py CHANGED
@@ -139,15 +139,15 @@ with app:
139
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
140
 
141
  with gr.Row():
142
- merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
143
  merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
 
144
  deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
145
 
146
  deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
147
 
148
  duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
149
 
150
- summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
151
  summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
152
  summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
153
 
@@ -246,9 +246,9 @@ with app:
246
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
247
  then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
248
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
249
- then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
250
 
251
- latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
252
 
253
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
254
  continue_previous_data_files_btn.click(
 
139
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
140
 
141
  with gr.Row():
 
142
  merge_general_topics_drop = gr.Dropdown(label="Merge general topic values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
143
+ merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
144
  deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
145
 
146
  deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")
147
 
148
  duplicate_output_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
149
 
150
+ summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
151
  summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
152
  summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
153
 
 
246
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
247
  then(load_in_previous_data_files, inputs=[duplicate_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox, unique_topics_table_file_textbox]).\
248
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
249
+ then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
250
 
251
+ latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_colnames, log_files_output_list_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output])
252
 
253
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
254
  continue_previous_data_files_btn.click(
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
@@ -13,6 +13,6 @@ beautifulsoup4==4.12.3
13
  rapidfuzz==3.10.1
14
  torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
15
  llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
16
- transformers==4.47.0
17
  numpy==1.26.4
18
  typing_extensions==4.12.2
 
1
  pandas==2.2.3
2
+ gradio==5.18.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
13
  rapidfuzz==3.10.1
14
  torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
15
  llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
16
+ transformers==4.49.0
17
  numpy==1.26.4
18
  typing_extensions==4.12.2
requirements_aws.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
@@ -12,6 +12,6 @@ html5lib==1.1
12
  beautifulsoup4==4.12.3
13
  rapidfuzz==3.10.1
14
  llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
15
- transformers==4.47.0
16
  numpy==1.26.4
17
  typing_extensions==4.12.2
 
1
  pandas==2.2.3
2
+ gradio==5.18.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
12
  beautifulsoup4==4.12.3
13
  rapidfuzz==3.10.1
14
  llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
15
+ transformers==4.49.0
16
  numpy==1.26.4
17
  typing_extensions==4.12.2
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.2.3
2
- gradio==5.12.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
@@ -15,6 +15,6 @@ torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
15
  #llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
16
  # Specify exact llama_cpp wheel for huggingface compatibility
17
  https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
18
- transformers==4.47.0
19
  numpy==1.26.4
20
  typing_extensions==4.12.2
 
1
  pandas==2.2.3
2
+ gradio==5.18.0
3
  spaces==0.31.0
4
  boto3==1.35.71
5
  pyarrow==18.1.0
 
15
  #llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
16
  # Specify exact llama_cpp wheel for huggingface compatibility
17
  https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
18
+ transformers==4.49.0
19
  numpy==1.26.4
20
  typing_extensions==4.12.2
tools/llm_api_call.py CHANGED
@@ -196,11 +196,11 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
196
 
197
  # Remove problematic characters including ASCII and various quote marks
198
  # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
199
- batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
200
- batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.strip() # Remove leading and trailing whitespace
201
- batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
202
- batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
203
- batch_basic_response_data["Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
204
 
205
  # Remove blank and extremely short responses
206
  batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
@@ -855,6 +855,12 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
855
  # Remove duplicate Response references for the same topic
856
  out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
857
 
 
 
 
 
 
 
858
  out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
859
 
860
  # Save the new DataFrame to CSV
@@ -1817,7 +1823,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1817
  out_metadata_str:str = "",
1818
  in_data_files:List[str]=[],
1819
  chosen_cols:List[str]=[],
1820
- output_files:list = [],
 
1821
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
1822
  do_summaries="Yes",
1823
  progress=gr.Progress(track_tqdm=True)):
@@ -1826,7 +1833,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1826
  '''
1827
  out_metadata = []
1828
  local_model = []
1829
- log_output_files = []
1830
  summarised_output_markdown = ""
1831
 
1832
  print("In summarise_output_topics function.")
@@ -1835,12 +1841,23 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1835
 
1836
  length_all_summaries = len(all_summaries)
1837
 
 
 
 
 
 
 
 
 
 
1838
  # Load in data file and chosen columns if exists to create pivot table later
1839
  if in_data_files and chosen_cols:
1840
  file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
1841
-
1842
- #print("latest_summary_completed:", latest_summary_completed)
1843
- #print("length_all_summaries:", length_all_summaries)
 
 
1844
 
1845
  # If all summaries completed, make final outputs
1846
  if latest_summary_completed >= length_all_summaries:
@@ -1866,6 +1883,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1866
  summarised_references_j = summarised_references[join_plus_summary_cols].drop_duplicates(join_plus_summary_cols)
1867
 
1868
  unique_table_df_revised = unique_table_df.merge(summarised_references_j, on = join_cols, how = "left")
 
1869
  # If no new summary is available, keep the original
1870
  unique_table_df_revised["Revised summary"] = unique_table_df_revised["Revised summary"].combine_first(unique_table_df_revised["Summary"])
1871
 
@@ -1904,6 +1922,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1904
 
1905
  summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
1906
 
 
 
 
 
1907
  return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1908
 
1909
  tic = time.perf_counter()
 
196
 
197
  # Remove problematic characters including ASCII and various quote marks
198
  # Remove problematic characters including control characters, special characters, and excessive leading/trailing whitespace
199
+ batch_basic_response_data.loc[:, "Response"]= batch_basic_response_data["Response"].str.replace(r'[\x00-\x1F\x7F]|[""<>]|\\', '', regex=True) # Remove control and special characters
200
+ batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.strip() # Remove leading and trailing whitespace
201
+ batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
202
+ batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.replace(r'\n{2,}', '\n', regex=True) # Replace multiple line breaks with a single line break
203
+ batch_basic_response_data.loc[:, "Response"] = batch_basic_response_data["Response"].str.slice(0, max_comment_character_length) # Maximum 1,500 character responses
204
 
205
  # Remove blank and extremely short responses
206
  batch_basic_response_data = batch_basic_response_data.loc[~(batch_basic_response_data["Response"].isnull()) &\
 
855
  # Remove duplicate Response references for the same topic
856
  out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
857
 
858
+ # Try converting response references column to int, keep as string if fails
859
+ try:
860
+ out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
861
+ except Exception as e:
862
+ print("Could not convert Response References column to integer due to", e)
863
+
864
  out_reference_df.sort_values(["Start row of group", "Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
865
 
866
  # Save the new DataFrame to CSV
 
1823
  out_metadata_str:str = "",
1824
  in_data_files:List[str]=[],
1825
  chosen_cols:List[str]=[],
1826
+ log_output_files:list[str]=[],
1827
+ output_files:list[str] = [],
1828
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
1829
  do_summaries="Yes",
1830
  progress=gr.Progress(track_tqdm=True)):
 
1833
  '''
1834
  out_metadata = []
1835
  local_model = []
 
1836
  summarised_output_markdown = ""
1837
 
1838
  print("In summarise_output_topics function.")
 
1841
 
1842
  length_all_summaries = len(all_summaries)
1843
 
1844
+ # Check for data for summarisations
1845
+ if not unique_table_df.empty and not reference_table_df.empty:
1846
+ print("Unique table and reference table data found.")
1847
+ else:
1848
+ out_message = "Please upload a unique topic table and reference table file to continue with summarisation."
1849
+ print(out_message)
1850
+ raise(out_message)
1851
+ return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1852
+
1853
  # Load in data file and chosen columns if exists to create pivot table later
1854
  if in_data_files and chosen_cols:
1855
  file_data, data_file_names_textbox, total_number_of_batches = load_in_data_file(in_data_files, chosen_cols, 1)
1856
+ else:
1857
+ out_message = "No file data found, please load a data file on the first tab and select a column."
1858
+ print(out_message)
1859
+ raise(out_message)
1860
+ return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1861
 
1862
  # If all summaries completed, make final outputs
1863
  if latest_summary_completed >= length_all_summaries:
 
1883
  summarised_references_j = summarised_references[join_plus_summary_cols].drop_duplicates(join_plus_summary_cols)
1884
 
1885
  unique_table_df_revised = unique_table_df.merge(summarised_references_j, on = join_cols, how = "left")
1886
+
1887
  # If no new summary is available, keep the original
1888
  unique_table_df_revised["Revised summary"] = unique_table_df_revised["Revised summary"].combine_first(unique_table_df_revised["Summary"])
1889
 
 
1922
 
1923
  summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
1924
 
1925
+ # Ensure same file name not returned twice
1926
+ output_files = list(set(output_files))
1927
+ log_output_files = list(set(log_output_files))
1928
+
1929
  return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown, log_output_files
1930
 
1931
  tic = time.perf_counter()