kambris committed (verified)
Commit: e5d60e7 · 1 parent: 0ba80af

Update app.py

Files changed (1):
  1. app.py +36 -21
app.py CHANGED
@@ -235,36 +235,51 @@ def classify_emotion(text, classifier):
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text while preserving all content."""
-    # Get exact token counts
-    tokenized_text = tokenizer.encode(text)
-    total_tokens = len(tokenized_text)
+    # Pre-tokenize to get exact chunks
+    encoded = tokenizer.encode_plus(
+        text,
+        add_special_tokens=True,
+        return_tensors="pt",
+        return_attention_mask=True,
+        return_token_type_ids=True
+    )
 
-    # Create precise chunks of 512 tokens
-    chunks = []
-    for i in range(0, total_tokens, 512):
-        chunk = tokenized_text[i:i + 512]
-        chunks.append(tokenizer.decode(chunk))
+    # Get total length
+    total_length = encoded['input_ids'].size(1)
 
+    # Process in chunks of 512 tokens
     chunk_embeddings = []
-    for chunk in chunks:
-        inputs = tokenizer(
-            chunk,
-            return_tensors="pt",
-            padding='max_length',
-            max_length=512
-        )
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    for i in range(0, total_length, 512):
+        # Extract chunk
+        chunk_dict = {
+            'input_ids': encoded['input_ids'][:, i:i + 512],
+            'attention_mask': encoded['attention_mask'][:, i:i + 512],
+            'token_type_ids': encoded['token_type_ids'][:, i:i + 512]
+        }
+
+        # Pad if necessary
+        if chunk_dict['input_ids'].size(1) < 512:
+            pad_length = 512 - chunk_dict['input_ids'].size(1)
+            for key in chunk_dict:
+                chunk_dict[key] = torch.nn.functional.pad(
+                    chunk_dict[key],
+                    (0, pad_length),
+                    'constant',
+                    0
+                )
+
+        # Move to device
+        chunk_dict = {k: v.to(model.device) for k, v in chunk_dict.items()}
 
+        # Get embeddings
         with torch.no_grad():
-            outputs = model(**inputs)[0]
+            outputs = model(**chunk_dict)[0]
             embedding = outputs[:, 0, :].cpu().numpy()
             chunk_embeddings.append(embedding[0])
 
     if chunk_embeddings:
-        weights = np.array([len(chunk.split()) for chunk in chunks])
-        weights = weights / weights.sum()
-        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
-        return weighted_embedding
+        # Average the embeddings
+        return np.mean(chunk_embeddings, axis=0)
     return np.zeros(model.config.hidden_size)
 
 def format_topics(topic_model, topic_counts):
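
For context, a minimal usage sketch of the updated function follows. Everything in it is an assumption rather than part of the diff: that app.py is importable as a module, that it imports torch and numpy as np at the top (both are used in the diff), and that the model is a BERT-style encoder — the "bert-base-uncased" checkpoint name is illustrative only, not necessarily what app.py loads.

    # Minimal usage sketch; checkpoint name and import path are assumptions.
    from transformers import AutoModel, AutoTokenizer

    from app import get_embedding_for_text  # assumes app.py is importable

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    model.eval()  # inference only; the function itself runs under no_grad

    # Long enough to span several 512-token chunks.
    long_text = "word " * 2000
    embedding = get_embedding_for_text(long_text, tokenizer, model)
    print(embedding.shape)  # (hidden_size,), e.g. (768,) for bert-base

Two behavioral changes are visible in the diff: chunking now slices the encoded tensors directly instead of decoding each token slice back to text and re-tokenizing it (a round trip that can shift token boundaries and re-insert special tokens per chunk), and the word-count-weighted average of chunk embeddings is replaced by a plain mean. One caveat: slices after the first do not begin with [CLS], so outputs[:, 0, :] for those chunks reads an ordinary token position rather than a [CLS] embedding.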