Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -235,35 +235,37 @@ def classify_emotion(text, classifier):
|
|
235 |
|
236 |
def get_embedding_for_text(text, tokenizer, model):
|
237 |
"""Get embedding for complete text."""
|
238 |
-
#
|
239 |
-
|
240 |
-
all_tokens = encoded['input_ids'][0]
|
241 |
|
242 |
-
#
|
243 |
chunk_size = 510
|
244 |
chunks = []
|
245 |
|
246 |
-
for i in range(0, len(
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
|
256 |
-
# Get embeddings
|
257 |
chunk_embeddings = []
|
258 |
for chunk in chunks:
|
259 |
-
|
260 |
-
inputs = {
|
261 |
-
'input_ids': chunk.unsqueeze(0).to(model.device),
|
262 |
-
'attention_mask': torch.ones_like(chunk.unsqueeze(0)).to(model.device)
|
263 |
-
}
|
264 |
-
|
265 |
with torch.no_grad():
|
266 |
-
outputs = model(**
|
267 |
embedding = outputs[:, 0, :].cpu().numpy()
|
268 |
chunk_embeddings.append(embedding[0])
|
269 |
|
|
|
235 |
|
236 |
def get_embedding_for_text(text, tokenizer, model):
|
237 |
"""Get embedding for complete text."""
|
238 |
+
# Get the raw tokens first
|
239 |
+
tokens = tokenizer.tokenize(text)
|
|
|
240 |
|
241 |
+
# Process in chunks of exactly 510 tokens (512 - 2 special tokens)
|
242 |
chunk_size = 510
|
243 |
chunks = []
|
244 |
|
245 |
+
for i in range(0, len(tokens), chunk_size):
|
246 |
+
chunk = tokens[i:i + chunk_size]
|
247 |
+
token_ids = tokenizer.convert_tokens_to_ids(chunk)
|
248 |
+
# Add special tokens manually
|
249 |
+
token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
|
250 |
+
# Create attention mask
|
251 |
+
attention_mask = [1] * len(token_ids)
|
252 |
+
# Pad if needed
|
253 |
+
padding_length = 512 - len(token_ids)
|
254 |
+
if padding_length > 0:
|
255 |
+
token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
|
256 |
+
attention_mask = attention_mask + ([0] * padding_length)
|
257 |
+
|
258 |
+
chunks.append({
|
259 |
+
'input_ids': torch.tensor([token_ids]),
|
260 |
+
'attention_mask': torch.tensor([attention_mask])
|
261 |
+
})
|
262 |
|
263 |
+
# Get embeddings
|
264 |
chunk_embeddings = []
|
265 |
for chunk in chunks:
|
266 |
+
chunk = {k: v.to(model.device) for k, v in chunk.items()}
|
|
|
|
|
|
|
|
|
|
|
267 |
with torch.no_grad():
|
268 |
+
outputs = model(**chunk)[0]
|
269 |
embedding = outputs[:, 0, :].cpu().numpy()
|
270 |
chunk_embeddings.append(embedding[0])
|
271 |
|