Update app.py
Browse files
app.py
CHANGED
@@ -230,9 +230,9 @@ def load_embedding_model(model_name="mixedbread-ai/mxbai-embed-large-v1"):
|
|
230 |
return tokenizer, model
|
231 |
|
232 |
@st.cache_data
|
233 |
-
def generate_embeddings(
|
234 |
"""Generates embeddings for a list of text entries."""
|
235 |
-
encoded_input =
|
236 |
text_list, padding=True, truncation=True, return_tensors="pt"
|
237 |
)
|
238 |
with torch.no_grad():
|
@@ -256,7 +256,7 @@ def main():
|
|
256 |
st.markdown(
|
257 |
"""
|
258 |
**General Usage Guide**
|
259 |
-
|
260 |
* Both tools work best with larger datasets (hundreds or thousands of entries).
|
261 |
* For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
|
262 |
* Output files are compressed to 16 dimensions.
|
@@ -270,7 +270,7 @@ def main():
|
|
270 |
st.header("Compress Your Embeddings")
|
271 |
st.markdown(
|
272 |
"""
|
273 |
-
Upload a CSV file containing pre-existing embeddings.
|
274 |
This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
|
275 |
"""
|
276 |
)
|
@@ -311,7 +311,10 @@ def main():
|
|
311 |
help="Enter each text entry on a new line. This tool works best with a large sample size.",
|
312 |
)
|
313 |
|
314 |
-
|
|
|
|
|
|
|
315 |
text_list = text_input.strip().split("\n")
|
316 |
if len(text_list) == 0:
|
317 |
st.warning("Please enter some text for embedding")
|
@@ -319,7 +322,7 @@ def main():
|
|
319 |
try:
|
320 |
with st.spinner("Generating and compressing embeddings..."):
|
321 |
tokenizer, model = load_embedding_model()
|
322 |
-
embeddings = generate_embeddings(
|
323 |
compressor = veczip(target_dims=16)
|
324 |
retained_indices = compressor.compress(embeddings)
|
325 |
compressed_embeddings = embeddings[:, retained_indices]
|
|
|
230 |
return tokenizer, model
|
231 |
|
232 |
@st.cache_data
|
233 |
+
def generate_embeddings(_tokenizer, model, text_list):
|
234 |
"""Generates embeddings for a list of text entries."""
|
235 |
+
encoded_input = _tokenizer(
|
236 |
text_list, padding=True, truncation=True, return_tensors="pt"
|
237 |
)
|
238 |
with torch.no_grad():
|
|
|
256 |
st.markdown(
|
257 |
"""
|
258 |
**General Usage Guide**
|
259 |
+
|
260 |
* Both tools work best with larger datasets (hundreds or thousands of entries).
|
261 |
* For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
|
262 |
* Output files are compressed to 16 dimensions.
|
|
|
270 |
st.header("Compress Your Embeddings")
|
271 |
st.markdown(
|
272 |
"""
|
273 |
+
Upload a CSV file containing pre-existing embeddings.
|
274 |
This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
|
275 |
"""
|
276 |
)
|
|
|
311 |
help="Enter each text entry on a new line. This tool works best with a large sample size.",
|
312 |
)
|
313 |
|
314 |
+
generate_button = st.button("Generate and Compress")
|
315 |
+
|
316 |
+
|
317 |
+
if generate_button and text_input:
|
318 |
text_list = text_input.strip().split("\n")
|
319 |
if len(text_list) == 0:
|
320 |
st.warning("Please enter some text for embedding")
|
|
|
322 |
try:
|
323 |
with st.spinner("Generating and compressing embeddings..."):
|
324 |
tokenizer, model = load_embedding_model()
|
325 |
+
embeddings = generate_embeddings(tokenizer, model, text_list)
|
326 |
compressor = veczip(target_dims=16)
|
327 |
retained_indices = compressor.compress(embeddings)
|
328 |
compressed_embeddings = embeddings[:, retained_indices]
|