kambris committed on
Commit
2198b18
·
verified ·
1 Parent(s): fcc17a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -28
app.py CHANGED
@@ -234,45 +234,47 @@ def classify_emotion(text, classifier):
234
  return final_emotion
235
 
236
def get_embedding_for_text(text, tokenizer, model):
    """Return a single embedding vector for the complete *text*.

    The text is tokenized once, split into 510-token chunks (512 max
    positions minus the two special tokens added per chunk), and each
    chunk's [CLS] embedding is computed; the mean over all chunks is
    returned.

    Args:
        text: Input string (may exceed the model's 512-token window).
        tokenizer: HuggingFace-style tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids``, ``cls_token_id``, ``sep_token_id``
            and ``pad_token_id``.
        model: Transformer model; ``model(...)[0]`` is assumed to be the
            last hidden state — TODO confirm against the loaded checkpoint.

    Returns:
        1-D ``np.ndarray`` of size ``model.config.hidden_size`` (a zero
        vector when the text produces no tokens).
    """
    tokens = tokenizer.tokenize(text)

    # 512 max positions minus the two special tokens added per chunk.
    chunk_size = 510
    max_len = 512
    chunk_embeddings = []

    # Embed each chunk as it is built instead of materializing a list of
    # all chunk tensors first — same result, lower peak memory.
    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size]
        token_ids = tokenizer.convert_tokens_to_ids(chunk)
        # Add the special tokens manually.
        token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]

        # Attention mask: 1 for real tokens, 0 for padding.
        attention_mask = [1] * len(token_ids)
        padding_length = max_len - len(token_ids)
        if padding_length > 0:
            token_ids = token_ids + [tokenizer.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        # Build tensors directly on the model's device.
        inputs = {
            'input_ids': torch.tensor([token_ids], device=model.device),
            'attention_mask': torch.tensor([attention_mask], device=model.device),
        }
        with torch.no_grad():
            outputs = model(**inputs)[0]
        # The [CLS] position (index 0) summarizes the chunk.
        chunk_embeddings.append(outputs[:, 0, :].cpu().numpy()[0])

    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)

    # No tokens at all (e.g. empty string): fall back to a zero vector.
    return np.zeros(model.config.hidden_size)
275
-
276
  def format_topics(topic_model, topic_counts):
277
  """Format topics for display."""
278
  formatted_topics = []
 
234
  return final_emotion
235
 
236
def get_embedding_for_text(text, tokenizer, model):
    """Get embedding for complete text by processing in chunks.

    The text is tokenized once, split into 510-token chunks (512 minus
    the two special tokens per chunk), each chunk's [CLS] embedding is
    computed, and the mean over all chunks is returned.

    Args:
        text: Input string (may exceed the model's 512-token window).
        tokenizer: HuggingFace-style tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids``, ``cls_token``, ``sep_token`` and
            ``pad_token_id``.
        model: Transformer model; ``outputs[0]`` is assumed to be the
            last hidden state — TODO confirm against the loaded checkpoint.

    Returns:
        1-D ``np.ndarray`` of size ``model.config.hidden_size`` (a zero
        vector when the text yields no tokens).
    """
    # Tokenize the entire text up front.
    tokens = tokenizer.tokenize(text)

    # Process in chunks of 510 tokens (512 - 2 special tokens).
    chunk_size = 510
    chunk_embeddings = []

    for i in range(0, len(tokens), chunk_size):
        # Wrap the chunk in the tokenizer's own special tokens rather
        # than hard-coded '[CLS]'/'[SEP]' literals, so non-BERT
        # vocabularies keep working.
        chunk_tokens = [tokenizer.cls_token] + tokens[i:i + chunk_size] + [tokenizer.sep_token]

        # Convert to input IDs and pad to the fixed 512-token window.
        input_ids = tokenizer.convert_tokens_to_ids(chunk_tokens)
        input_ids += [tokenizer.pad_token_id] * (512 - len(input_ids))

        # Attention mask: 1 for real tokens, 0 for padding.
        attention_mask = [1] * len(chunk_tokens) + [0] * (512 - len(chunk_tokens))

        # BUG FIX: place tensors on the model's device. The previous
        # revision left them on CPU, which fails as soon as the model
        # lives on a GPU (the earlier version of this function moved
        # them with ``.to(model.device)``).
        input_ids = torch.tensor([input_ids], device=model.device)
        attention_mask = torch.tensor([attention_mask], device=model.device)

        # Get the embedding for this chunk; [CLS] (index 0) summarizes it.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            chunk_embedding = outputs[0][:, 0, :].cpu().numpy()
            chunk_embeddings.append(chunk_embedding[0])

    # Average embeddings from all chunks.
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)

    # Fallback if no embeddings could be generated.
    return np.zeros(model.config.hidden_size)
277
+
278
  def format_topics(topic_model, topic_counts):
279
  """Format topics for display."""
280
  formatted_topics = []