Spaces:
Running
Running
Sean-Case
committed on
Commit
·
e0f53cc
1
Parent(s):
ff32b4a
Now should save embeddings by default. Added random seed to representation
Browse files
- app.py +9 -1
- funcs/representation_model.py +2 -2
app.py
CHANGED
@@ -242,7 +242,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
242 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
243 |
output_list.append(topic_model_save_name_zip)
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
# Visualise the topics:
|
|
|
246 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
247 |
|
248 |
return output_text, output_list, topics_vis
|
@@ -290,7 +298,7 @@ with block:
|
|
290 |
with gr.Accordion("Data load and processing options", open = True):
|
291 |
with gr.Row():
|
292 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
293 |
-
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
294 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
295 |
with gr.Row():
|
296 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
|
|
|
242 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
243 |
output_list.append(topic_model_save_name_zip)
|
244 |
|
245 |
+
if return_intermediate_files == "Yes":
|
246 |
+
print("Saving embeddings to file")
|
247 |
+
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
|
248 |
+
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
249 |
+
|
250 |
+
output_list.append(semantic_search_file_name)
|
251 |
+
|
252 |
# Visualise the topics:
|
253 |
+
print("Creating visualisation")
|
254 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
255 |
|
256 |
return output_text, output_list, topics_vis
|
|
|
298 |
with gr.Accordion("Data load and processing options", open = True):
|
299 |
with gr.Row():
|
300 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
301 |
+
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
|
302 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
303 |
with gr.Row():
|
304 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
|
funcs/representation_model.py
CHANGED
@@ -9,7 +9,7 @@ import torch.cuda
|
|
9 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
|
10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
11 |
|
12 |
-
|
13 |
|
14 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
15 |
chosen_start_tag = open_hermes_start # stablelm_start
|
@@ -117,7 +117,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
|
|
117 |
|
118 |
## Create representation model parameters ##
|
119 |
# KeyBERT
|
120 |
-
keybert = KeyBERTInspired()
|
121 |
|
122 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
|
123 |
|
|
|
9 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
|
10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
11 |
|
12 |
+
random_seed = 42
|
13 |
|
14 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
15 |
chosen_start_tag = open_hermes_start # stablelm_start
|
|
|
117 |
|
118 |
## Create representation model parameters ##
|
119 |
# KeyBERT
|
120 |
+
keybert = KeyBERTInspired(random_state=random_seed)
|
121 |
|
122 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
|
123 |
|