Sean-Case committed
Commit e0f53cc · 1 Parent(s): ff32b4a

Now should save embeddings by default. Added random seed to representation model.

Files changed (2):
  1. app.py +9 -1
  2. funcs/representation_model.py +2 -2
app.py CHANGED
@@ -242,7 +242,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
 
+    if return_intermediate_files == "Yes":
+        print("Saving embeddings to file")
+        semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+        np.savez_compressed(semantic_search_file_name, embeddings_out)
+
+        output_list.append(semantic_search_file_name)
+
     # Visualise the topics:
+    print("Creating visualisation")
     topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
 
     return output_text, output_list, topics_vis
@@ -290,7 +298,7 @@ with block:
     with gr.Accordion("Data load and processing options", open = True):
         with gr.Row():
             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
-            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
             embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
         with gr.Row():
             low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
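The net effect of these two hunks: `extract_topics` now writes the document embeddings to a compressed NumPy archive whenever `return_intermediate_files` is "Yes", and the dropdown's default flips from "No" to "Yes", so embeddings are saved by default. A minimal sketch of the save/load round trip this enables, with a random array standing in for the real `embeddings_out` and a hypothetical file name in place of the one derived from the input file:

```python
import numpy as np

# Hypothetical stand-in for the embeddings produced earlier in extract_topics
embeddings_out = np.random.rand(100, 384).astype(np.float32)

# Save as the commit does; passing the array positionally stores it
# under the default key 'arr_0' inside the .npz archive
semantic_search_file_name = "example_embeddings.npz"  # hypothetical name
np.savez_compressed(semantic_search_file_name, embeddings_out)

# On a later run, load the file instead of regenerating the embeddings
with np.load(semantic_search_file_name) as npz:
    embeddings_loaded = npz["arr_0"]

assert np.allclose(embeddings_out, embeddings_loaded)
```

Because `np.savez_compressed` receives the array positionally rather than as a keyword argument, any code reloading the file needs to read the default `arr_0` key.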
funcs/representation_model.py CHANGED
@@ -9,7 +9,7 @@ import torch.cuda
 from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
 from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
 
-
+random_seed = 42
 
 chosen_prompt = open_hermes_prompt # stablelm_prompt
 chosen_start_tag = open_hermes_start # stablelm_start
@@ -117,7 +117,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
 
 ## Create representation model parameters ##
 # KeyBERT
-keybert = KeyBERTInspired()
+keybert = KeyBERTInspired(random_state=random_seed)
 
 def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
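For context, `KeyBERTInspired` samples representative documents internally, and its constructor exposes a `random_state` argument; pinning it makes that sampling, and hence the extracted topic keywords, repeatable across runs. A minimal sketch, assuming only that `bertopic` is installed (the `BERTopic` wiring in the trailing comment is illustrative, not taken from this commit):

```python
from bertopic.representation import KeyBERTInspired

# Module-level seed, as in the commit, so every component that needs
# determinism can share one value
random_seed = 42

# Seeding the representation model fixes its internal sampling of
# candidate documents, making keyword extraction reproducible
keybert = KeyBERTInspired(random_state=random_seed)

# Illustrative usage (not part of this commit): hand the representation
# model to BERTopic when building the topic model
# from bertopic import BERTopic
# topic_model = BERTopic(representation_model=keybert)
```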