kambris committed on
Commit
fcc17a2
·
verified ·
1 Parent(s): 636f3e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -21
app.py CHANGED
@@ -235,35 +235,37 @@ def classify_emotion(text, classifier):
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
  """Get embedding for complete text."""
238
- # First encode the full text to get actual tokens
239
- encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
240
- all_tokens = encoded['input_ids'][0]
241
 
242
- # Split into chunks of 510 tokens to leave room for [CLS] and [SEP]
243
  chunk_size = 510
244
  chunks = []
245
 
246
- for i in range(0, len(all_tokens), chunk_size):
247
- chunk_tokens = all_tokens[i:i + chunk_size]
248
- # Add [CLS] and [SEP] tokens
249
- chunk_tokens = torch.cat([
250
- torch.tensor([tokenizer.cls_token_id]),
251
- chunk_tokens,
252
- torch.tensor([tokenizer.sep_token_id])
253
- ])
254
- chunks.append(chunk_tokens)
 
 
 
 
 
 
 
 
255
 
256
- # Get embeddings for each chunk
257
  chunk_embeddings = []
258
  for chunk in chunks:
259
- # Create proper input format
260
- inputs = {
261
- 'input_ids': chunk.unsqueeze(0).to(model.device),
262
- 'attention_mask': torch.ones_like(chunk.unsqueeze(0)).to(model.device)
263
- }
264
-
265
  with torch.no_grad():
266
- outputs = model(**inputs)[0]
267
  embedding = outputs[:, 0, :].cpu().numpy()
268
  chunk_embeddings.append(embedding[0])
269
 
 
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
  """Get embedding for complete text."""
238
+ # Get the raw tokens first
239
+ tokens = tokenizer.tokenize(text)
 
240
 
241
+ # Process in chunks of exactly 510 tokens (512 - 2 special tokens)
242
  chunk_size = 510
243
  chunks = []
244
 
245
+ for i in range(0, len(tokens), chunk_size):
246
+ chunk = tokens[i:i + chunk_size]
247
+ token_ids = tokenizer.convert_tokens_to_ids(chunk)
248
+ # Add special tokens manually
249
+ token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
250
+ # Create attention mask
251
+ attention_mask = [1] * len(token_ids)
252
+ # Pad if needed
253
+ padding_length = 512 - len(token_ids)
254
+ if padding_length > 0:
255
+ token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
256
+ attention_mask = attention_mask + ([0] * padding_length)
257
+
258
+ chunks.append({
259
+ 'input_ids': torch.tensor([token_ids]),
260
+ 'attention_mask': torch.tensor([attention_mask])
261
+ })
262
 
263
+ # Get embeddings
264
  chunk_embeddings = []
265
  for chunk in chunks:
266
+ chunk = {k: v.to(model.device) for k, v in chunk.items()}
 
 
 
 
 
267
  with torch.no_grad():
268
+ outputs = model(**chunk)[0]
269
  embedding = outputs[:, 0, :].cpu().numpy()
270
  chunk_embeddings.append(embedding[0])
271