kambris committed on
Commit 636f3e1 · verified · 1 Parent(s): 7ad0bec

Update app.py

Files changed (1)
  1. app.py +21 -23
app.py CHANGED
@@ -235,34 +235,32 @@ def classify_emotion(text, classifier):
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
-    chunks = []
-    current_text = ""
-    words = text.split()
+    # First encode the full text to get actual tokens
+    encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
+    all_tokens = encoded['input_ids'][0]
 
-    for word in words:
-        test_text = current_text + " " + word if current_text else word
-        tokens = tokenizer.encode(test_text)
-
-        if len(tokens) >= 512:
-            if current_text:
-                chunks.append(current_text)
-            current_text = word
-        else:
-            current_text = test_text
+    # Split into chunks of 510 tokens to leave room for [CLS] and [SEP]
+    chunk_size = 510
+    chunks = []
 
-    if current_text:
-        chunks.append(current_text)
+    for i in range(0, len(all_tokens), chunk_size):
+        chunk_tokens = all_tokens[i:i + chunk_size]
+        # Add [CLS] and [SEP] tokens
+        chunk_tokens = torch.cat([
+            torch.tensor([tokenizer.cls_token_id]),
+            chunk_tokens,
+            torch.tensor([tokenizer.sep_token_id])
+        ])
+        chunks.append(chunk_tokens)
 
+    # Get embeddings for each chunk
     chunk_embeddings = []
     for chunk in chunks:
-        inputs = tokenizer(
-            chunk,
-            return_tensors="pt",
-            padding=True,
-            max_length=512,
-            truncation=True
-        )
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        # Create proper input format
+        inputs = {
+            'input_ids': chunk.unsqueeze(0).to(model.device),
+            'attention_mask': torch.ones_like(chunk.unsqueeze(0)).to(model.device)
+        }
 
         with torch.no_grad():
             outputs = model(**inputs)[0]
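
For readers who want to try the new chunking logic outside the app, the sketch below reproduces it as a standalone function. Everything up to the model call mirrors the added lines in the hunk above; the final step, taking each chunk's [CLS] vector and mean-pooling across chunks into a single embedding, is an assumption, since the hunk ends before showing how chunk_embeddings are aggregated. The checkpoint names in the usage comment are placeholders, not the ones used by app.py.

import torch

def get_embedding_for_text(text, tokenizer, model):
    """Embed arbitrarily long text by chunking at the token level."""
    # Tokenize once, without special tokens, so chunk boundaries fall on real tokens
    encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    all_tokens = encoded["input_ids"][0]

    # 510 content tokens + [CLS] + [SEP] = 512, a BERT-style model's maximum length
    chunk_size = 510
    chunks = []
    for i in range(0, len(all_tokens), chunk_size):
        chunk_tokens = all_tokens[i:i + chunk_size]
        chunk_tokens = torch.cat([
            torch.tensor([tokenizer.cls_token_id]),
            chunk_tokens,
            torch.tensor([tokenizer.sep_token_id]),
        ])
        chunks.append(chunk_tokens)

    chunk_embeddings = []
    for chunk in chunks:
        inputs = {
            "input_ids": chunk.unsqueeze(0).to(model.device),
            "attention_mask": torch.ones_like(chunk.unsqueeze(0)).to(model.device),
        }
        with torch.no_grad():
            outputs = model(**inputs)[0]        # last hidden state: (1, seq_len, hidden)
        chunk_embeddings.append(outputs[0, 0])  # [CLS] vector for this chunk (assumed)

    # Assumed aggregation: average the per-chunk vectors into one text embedding
    return torch.stack(chunk_embeddings).mean(dim=0)

# Hypothetical usage with placeholder checkpoint names:
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")
# embedding = get_embedding_for_text(long_text, tokenizer, model)

Compared with the removed word-by-word accumulation, which re-encoded the growing string on every word, the new version tokenizes the text once and slices the token sequence directly, so each chunk stays within the 512-token limit without repeated tokenizer calls.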