dejanseo committed on
Commit
a19307d
·
verified ·
1 Parent(s): e846a94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -230,9 +230,9 @@ def load_embedding_model(model_name="mixedbread-ai/mxbai-embed-large-v1"):
230
  return tokenizer, model
231
 
232
  @st.cache_data
233
- def generate_embeddings(text_list, tokenizer, model):
234
  """Generates embeddings for a list of text entries."""
235
- encoded_input = tokenizer(
236
  text_list, padding=True, truncation=True, return_tensors="pt"
237
  )
238
  with torch.no_grad():
@@ -256,7 +256,7 @@ def main():
256
  st.markdown(
257
  """
258
  **General Usage Guide**
259
-
260
  * Both tools work best with larger datasets (hundreds or thousands of entries).
261
  * For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
262
  * Output files are compressed to 16 dimensions.
@@ -270,7 +270,7 @@ def main():
270
  st.header("Compress Your Embeddings")
271
  st.markdown(
272
  """
273
- Upload a CSV file containing pre-existing embeddings.
274
  This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
275
  """
276
  )
@@ -311,7 +311,10 @@ def main():
311
  help="Enter each text entry on a new line. This tool works best with a large sample size.",
312
  )
313
 
314
- if text_input:
 
 
 
315
  text_list = text_input.strip().split("\n")
316
  if len(text_list) == 0:
317
  st.warning("Please enter some text for embedding")
@@ -319,7 +322,7 @@ def main():
319
  try:
320
  with st.spinner("Generating and compressing embeddings..."):
321
  tokenizer, model = load_embedding_model()
322
- embeddings = generate_embeddings(text_list, tokenizer, model)
323
  compressor = veczip(target_dims=16)
324
  retained_indices = compressor.compress(embeddings)
325
  compressed_embeddings = embeddings[:, retained_indices]
 
230
  return tokenizer, model
231
 
232
  @st.cache_data
233
+ def generate_embeddings(_tokenizer, model, text_list):
234
  """Generates embeddings for a list of text entries."""
235
+ encoded_input = _tokenizer(
236
  text_list, padding=True, truncation=True, return_tensors="pt"
237
  )
238
  with torch.no_grad():
 
256
  st.markdown(
257
  """
258
  **General Usage Guide**
259
+
260
  * Both tools work best with larger datasets (hundreds or thousands of entries).
261
  * For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
262
  * Output files are compressed to 16 dimensions.
 
270
  st.header("Compress Your Embeddings")
271
  st.markdown(
272
  """
273
+ Upload a CSV file containing pre-existing embeddings.
274
  This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
275
  """
276
  )
 
311
  help="Enter each text entry on a new line. This tool works best with a large sample size.",
312
  )
313
 
314
+ generate_button = st.button("Generate and Compress")
315
+
316
+
317
+ if generate_button and text_input:
318
  text_list = text_input.strip().split("\n")
319
  if len(text_list) == 0:
320
  st.warning("Please enter some text for embedding")
 
322
  try:
323
  with st.spinner("Generating and compressing embeddings..."):
324
  tokenizer, model = load_embedding_model()
325
+ embeddings = generate_embeddings(tokenizer, model, text_list)
326
  compressor = veczip(target_dims=16)
327
  retained_indices = compressor.compress(embeddings)
328
  compressed_embeddings = embeddings[:, retained_indices]