kambris committed on
Commit
7ad0bec
·
verified ·
1 Parent(s): e5d60e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -36
app.py CHANGED
@@ -234,51 +234,42 @@ def classify_emotion(text, classifier):
234
  return final_emotion
235
 
236
def get_embedding_for_text(text, tokenizer, model):
    """Embed *text* by mean-pooling per-chunk first-token embeddings.

    The full text is tokenized once, split into consecutive 512-token
    chunks, and each chunk is run through *model*; the first-token
    embedding of every chunk is averaged so no content is discarded by
    truncation.

    Returns a 1-D numpy array of length ``model.config.hidden_size``
    (zeros when the text yields no tokens).
    """
    # Tokenize the whole text once so chunk boundaries are exact.
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=True,
    )

    total_length = encoded['input_ids'].size(1)

    # BUG FIX: the original padded input_ids with a literal 0, which is
    # wrong for tokenizers whose pad token id is not 0 — use the real pad id.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    chunk_embeddings = []
    for start in range(0, total_length, 512):
        # Slice out one 512-token window from each encoded tensor.
        chunk_dict = {
            'input_ids': encoded['input_ids'][:, start:start + 512],
            'attention_mask': encoded['attention_mask'][:, start:start + 512],
            'token_type_ids': encoded['token_type_ids'][:, start:start + 512],
        }

        # Right-pad the final (short) chunk up to 512 tokens; the attention
        # mask is padded with 0 so the model ignores the filler positions.
        pad_length = 512 - chunk_dict['input_ids'].size(1)
        if pad_length > 0:
            for key, value in chunk_dict.items():
                fill = pad_id if key == 'input_ids' else 0
                chunk_dict[key] = torch.nn.functional.pad(
                    value, (0, pad_length), 'constant', fill
                )

        # Move tensors to wherever the model lives.
        chunk_dict = {k: v.to(model.device) for k, v in chunk_dict.items()}

        with torch.no_grad():
            outputs = model(**chunk_dict)[0]
        # First-token embedding. NOTE(review): only the first chunk starts
        # with [CLS]; for later chunks this is an approximation.
        embedding = outputs[:, 0, :].cpu().numpy()
        chunk_embeddings.append(embedding[0])

    if chunk_embeddings:
        # Average the per-chunk embeddings into one vector.
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)
284
 
 
234
  return final_emotion
235
 
236
def get_embedding_for_text(text, tokenizer, model):
    """Embed *text* by mean-pooling per-chunk first-token embeddings.

    The text is split on whitespace and words are greedily packed into
    chunks that fit the 512-token model limit; each chunk is encoded and
    run through *model*, and the first-token embeddings are averaged.

    Returns a 1-D numpy array of length ``model.config.hidden_size``
    (zeros when the text contains no words).
    """
    max_tokens = 512

    # BUG FIX (performance): the original re-tokenized the entire growing
    # chunk for every word appended, which is O(n^2) in text length.
    # Tokenize each word once (without special tokens) and keep a running
    # count instead; reserve 2 slots for the [CLS]/[SEP] specials.
    chunks = []
    current_words = []
    current_count = 2
    for word in text.split():
        n_tok = len(tokenizer.encode(word, add_special_tokens=False))
        if current_words and current_count + n_tok >= max_tokens:
            # Flush the current chunk and start a new one with this word.
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_count = 2 + n_tok
        else:
            current_words.append(word)
            current_count += n_tok
    if current_words:
        chunks.append(" ".join(current_words))

    chunk_embeddings = []
    for chunk in chunks:
        # truncation is a safety net for a single word that exceeds the limit.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            max_length=max_tokens,
            truncation=True,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)[0]
        # First-token ([CLS]) embedding for this chunk.
        embedding = outputs[:, 0, :].cpu().numpy()
        chunk_embeddings.append(embedding[0])

    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)
275