Sean-Case committed
Commit e0f53cc · 1 Parent(s): ff32b4a

Now should save embeddings by default. Added random seed to representation model.

Files changed (2):
  1. app.py +9 -1
  2. funcs/representation_model.py +2 -2
app.py CHANGED
@@ -242,7 +242,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
 
+    if return_intermediate_files == "Yes":
+        print("Saving embeddings to file")
+        semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+        np.savez_compressed(semantic_search_file_name, embeddings_out)
+
+        output_list.append(semantic_search_file_name)
+
     # Visualise the topics:
+    print("Creating visualisation")
     topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
 
     return output_text, output_list, topics_vis
@@ -290,7 +298,7 @@ with block:
     with gr.Accordion("Data load and processing options", open = True):
         with gr.Row():
             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
-            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
             embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
         with gr.Row():
             low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
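The net effect of these two hunks: `extract_topics` now writes the document embeddings to a compressed NumPy archive whenever `return_intermediate_files` is "Yes", and the dropdown's default flips from "No" to "Yes", so embeddings are saved by default. A minimal sketch of the save/load round trip this enables, with a random array standing in for the real `embeddings_out` and a hypothetical file name in place of the one derived from the input file:

```python
import numpy as np

# Hypothetical stand-in for the embeddings produced earlier in extract_topics
embeddings_out = np.random.rand(100, 384).astype(np.float32)

# Save as the commit does; passing the array positionally stores it
# under the default key 'arr_0' inside the .npz archive
semantic_search_file_name = "example_embeddings.npz"  # hypothetical name
np.savez_compressed(semantic_search_file_name, embeddings_out)

# On a later run, load the file instead of regenerating the embeddings
with np.load(semantic_search_file_name) as npz:
    embeddings_loaded = npz["arr_0"]

assert np.allclose(embeddings_out, embeddings_loaded)
```

Because `np.savez_compressed` receives the array positionally rather than as a keyword argument, any code reloading the file needs to read the default `arr_0` key.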
funcs/representation_model.py CHANGED
@@ -9,7 +9,7 @@ import torch.cuda
 from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
 from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
 
-
+random_seed = 42
 
 chosen_prompt = open_hermes_prompt # stablelm_prompt
 chosen_start_tag = open_hermes_start # stablelm_start
@@ -117,7 +117,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
 
 ## Create representation model parameters ##
 # KeyBERT
-keybert = KeyBERTInspired()
+keybert = KeyBERTInspired(random_state=random_seed)
 
 def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
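For context, `KeyBERTInspired` samples representative documents internally, and its constructor exposes a `random_state` argument; pinning it makes that sampling, and hence the extracted topic keywords, repeatable across runs. A minimal sketch, assuming only that `bertopic` is installed (the `BERTopic` wiring in the trailing comment is illustrative, not taken from this commit):

```python
from bertopic.representation import KeyBERTInspired

# Module-level seed, as in the commit, so every component that needs
# determinism can share one value
random_seed = 42

# Seeding the representation model fixes its internal sampling of
# candidate documents, making keyword extraction reproducible
keybert = KeyBERTInspired(random_state=random_seed)

# Illustrative usage (not part of this commit): hand the representation
# model to BERTopic when building the topic model
# from bertopic import BERTopic
# topic_model = BERTopic(representation_model=keybert)
```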