kambris committed
Commit 0ba80af · verified
1 Parent(s): eb5dfd3

Update app.py

Files changed (1)
  1. app.py +14 -9
app.py CHANGED
@@ -235,28 +235,33 @@ def classify_emotion(text, classifier):
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text while preserving all content."""
-    # Split into optimal chunks of exactly 512 tokens
+    # Get exact token counts
     tokenized_text = tokenizer.encode(text)
-    chunks = [tokenized_text[i:i + 512] for i in range(0, len(tokenized_text), 512)]
-    chunk_embeddings = []
+    total_tokens = len(tokenized_text)
+
+    # Create precise chunks of 512 tokens
+    chunks = []
+    for i in range(0, total_tokens, 512):
+        chunk = tokenized_text[i:i + 512]
+        chunks.append(tokenizer.decode(chunk))
 
+    chunk_embeddings = []
     for chunk in chunks:
-        inputs = tokenizer.encode(
-            tokenizer.decode(chunk),
+        inputs = tokenizer(
+            chunk,
             return_tensors="pt",
             padding='max_length',
             max_length=512
         )
-        inputs = inputs.to(model.device)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
         with torch.no_grad():
-            outputs = model(inputs)[0]
+            outputs = model(**inputs)[0]
         embedding = outputs[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
 
     if chunk_embeddings:
-        # Weight each chunk based on its content length
-        weights = np.array([len(chunk) for chunk in chunks])
+        weights = np.array([len(chunk.split()) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
         return weighted_embedding
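
For context, a minimal usage sketch of the updated function (not part of the commit). It assumes the standard Hugging Face transformers API plus the numpy and torch imports the function already relies on; the checkpoint name is an illustrative placeholder, not necessarily the model this Space loads.

# Hypothetical usage sketch -- the checkpoint below is a placeholder
# assumption, not necessarily what app.py actually loads.
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()  # inference mode; the function already disables gradients

text = "A short example document. " * 50  # fits comfortably in one 512-token chunk
embedding = get_embedding_for_text(text, tokenizer, model)
print(embedding.shape)  # (768,) for a BERT-base encoder: one CLS-token vector

One caveat worth noting: decoding a full 512-token chunk and re-tokenizing it can re-add special tokens, so a multi-chunk input may re-encode to slightly more than 512 tokens; with a 512-position encoder that would overflow unless the tokenizer call also truncates.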