kambris committed on
Commit
fcc17a2
·
verified ·
1 Parent(s): 636f3e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -21
app.py CHANGED
@@ -235,35 +235,37 @@ def classify_emotion(text, classifier):
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
  """Get embedding for complete text."""
238
- # First encode the full text to get actual tokens
239
- encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
240
- all_tokens = encoded['input_ids'][0]
241
 
242
- # Split into chunks of 510 tokens to leave room for [CLS] and [SEP]
243
  chunk_size = 510
244
  chunks = []
245
 
246
- for i in range(0, len(all_tokens), chunk_size):
247
- chunk_tokens = all_tokens[i:i + chunk_size]
248
- # Add [CLS] and [SEP] tokens
249
- chunk_tokens = torch.cat([
250
- torch.tensor([tokenizer.cls_token_id]),
251
- chunk_tokens,
252
- torch.tensor([tokenizer.sep_token_id])
253
- ])
254
- chunks.append(chunk_tokens)
 
 
 
 
 
 
 
 
255
 
256
- # Get embeddings for each chunk
257
  chunk_embeddings = []
258
  for chunk in chunks:
259
- # Create proper input format
260
- inputs = {
261
- 'input_ids': chunk.unsqueeze(0).to(model.device),
262
- 'attention_mask': torch.ones_like(chunk.unsqueeze(0)).to(model.device)
263
- }
264
-
265
  with torch.no_grad():
266
- outputs = model(**inputs)[0]
267
  embedding = outputs[:, 0, :].cpu().numpy()
268
  chunk_embeddings.append(embedding[0])
269
 
 
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
  """Get embedding for complete text."""
238
+ # Get the raw tokens first
239
+ tokens = tokenizer.tokenize(text)
 
240
 
241
+ # Process in chunks of exactly 510 tokens (512 - 2 special tokens)
242
  chunk_size = 510
243
  chunks = []
244
 
245
+ for i in range(0, len(tokens), chunk_size):
246
+ chunk = tokens[i:i + chunk_size]
247
+ token_ids = tokenizer.convert_tokens_to_ids(chunk)
248
+ # Add special tokens manually
249
+ token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
250
+ # Create attention mask
251
+ attention_mask = [1] * len(token_ids)
252
+ # Pad if needed
253
+ padding_length = 512 - len(token_ids)
254
+ if padding_length > 0:
255
+ token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
256
+ attention_mask = attention_mask + ([0] * padding_length)
257
+
258
+ chunks.append({
259
+ 'input_ids': torch.tensor([token_ids]),
260
+ 'attention_mask': torch.tensor([attention_mask])
261
+ })
262
 
263
+ # Get embeddings
264
  chunk_embeddings = []
265
  for chunk in chunks:
266
+ chunk = {k: v.to(model.device) for k, v in chunk.items()}
 
 
 
 
 
267
  with torch.no_grad():
268
+ outputs = model(**chunk)[0]
269
  embedding = outputs[:, 0, :].cpu().numpy()
270
  chunk_embeddings.append(embedding[0])
271