Spaces:

dejanseo
/

zip-embed

Running

App Files Files Community

dejanseo committed on Dec 12, 2024

Commit

a19307d

verified ·

1 Parent(s): e846a94

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -6

app.py CHANGED Viewed

@@ -230,9 +230,9 @@ def load_embedding_model(model_name="mixedbread-ai/mxbai-embed-large-v1"):
     return tokenizer, model
 @st.cache_data
-def generate_embeddings(text_list, tokenizer, model):
     """Generates embeddings for a list of text entries."""
-    encoded_input = tokenizer(
         text_list, padding=True, truncation=True, return_tensors="pt"
     )
     with torch.no_grad():
@@ -256,7 +256,7 @@ def main():
     st.markdown(
         """
         **General Usage Guide**
         *   Both tools work best with larger datasets (hundreds or thousands of entries).
         *   For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
         *   Output files are compressed to 16 dimensions.
@@ -270,7 +270,7 @@ def main():
         st.header("Compress Your Embeddings")
         st.markdown(
             """
-            Upload a CSV file containing pre-existing embeddings.
             This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
             """
         )
@@ -311,7 +311,10 @@ def main():
             help="Enter each text entry on a new line. This tool works best with a large sample size.",
         )
-        if text_input:
             text_list = text_input.strip().split("\n")
             if len(text_list) == 0:
                 st.warning("Please enter some text for embedding")
@@ -319,7 +322,7 @@ def main():
                 try:
                     with st.spinner("Generating and compressing embeddings..."):
                         tokenizer, model = load_embedding_model()
-                        embeddings = generate_embeddings(text_list, tokenizer, model)
                         compressor = veczip(target_dims=16)
                         retained_indices = compressor.compress(embeddings)
                         compressed_embeddings = embeddings[:, retained_indices]

     return tokenizer, model
 @st.cache_data
+def generate_embeddings(_tokenizer, model, text_list):
     """Generates embeddings for a list of text entries."""
+    encoded_input = _tokenizer(
         text_list, padding=True, truncation=True, return_tensors="pt"
     )
     with torch.no_grad():
     st.markdown(
         """
         **General Usage Guide**
         *   Both tools work best with larger datasets (hundreds or thousands of entries).
         *   For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
         *   Output files are compressed to 16 dimensions.
         st.header("Compress Your Embeddings")
         st.markdown(
             """
+            Upload a CSV file containing pre-existing embeddings.
             This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
             """
         )
             help="Enter each text entry on a new line. This tool works best with a large sample size.",
         )
+        generate_button = st.button("Generate and Compress")
+        if generate_button and text_input:
             text_list = text_input.strip().split("\n")
             if len(text_list) == 0:
                 st.warning("Please enter some text for embedding")
                 try:
                     with st.spinner("Generating and compressing embeddings..."):
                         tokenizer, model = load_embedding_model()
+                        embeddings = generate_embeddings(tokenizer, model, text_list)
                         compressor = veczip(target_dims=16)
                         retained_indices = compressor.compress(embeddings)
                         compressed_embeddings = embeddings[:, retained_indices]