kambris committed
Commit eb5dfd3 · verified · 1 parent: f62fb31

Update app.py

Files changed (1): app.py (+11 −10)
app.py CHANGED
@@ -234,28 +234,29 @@ def classify_emotion(text, classifier):
     return final_emotion
 
 def get_embedding_for_text(text, tokenizer, model):
-    """Get embedding for complete text."""
-    chunks = split_text(text)
+    """Get embedding for complete text while preserving all content."""
+    # Split into optimal chunks of exactly 512 tokens
+    tokenized_text = tokenizer.encode(text)
+    chunks = [tokenized_text[i:i + 512] for i in range(0, len(tokenized_text), 512)]
     chunk_embeddings = []
 
     for chunk in chunks:
-        inputs = tokenizer(
-            chunk,
+        inputs = tokenizer.encode(
+            tokenizer.decode(chunk),
             return_tensors="pt",
-            padding=True,
-            truncation=True,
+            padding='max_length',
             max_length=512
         )
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        inputs = inputs.to(model.device)
 
         with torch.no_grad():
-            # Access the first element of the tuple which contains the hidden states
-            outputs = model(**inputs)[0]
+            outputs = model(inputs)[0]
         embedding = outputs[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
 
     if chunk_embeddings:
-        weights = np.array([len(chunk.split()) for chunk in chunks])
+        # Weight each chunk based on its content length
+        weights = np.array([len(chunk) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
         return weighted_embedding
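
For reference, below is a minimal self-contained sketch of the approach the new version adopts: tokenize the full text once, slice the token ids into fixed-size windows, embed each window, and average the chunk embeddings weighted by token count. The model name (bert-base-uncased) and the dict-style tokenizer call are illustrative assumptions, not part of the commit; the sketch also keeps truncation enabled and reserves two slots for special tokens, two guards the committed code omits (decoding and re-encoding a chunk can shift its length past 512).

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()

def get_embedding_for_text(text, tokenizer, model, max_length=512):
    """Embed arbitrarily long text by chunking at the token level."""
    # Tokenize once, without special tokens, then slice into fixed windows.
    # Reserving two slots leaves room for [CLS]/[SEP] when re-encoding.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    window = max_length - 2
    chunks = [token_ids[i:i + window] for i in range(0, len(token_ids), window)]

    chunk_embeddings = []
    for chunk in chunks:
        inputs = tokenizer(
            tokenizer.decode(chunk),
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            hidden_states = model(**inputs)[0]  # (1, seq_len, hidden_size)
        # Use the [CLS] position as the chunk embedding.
        chunk_embeddings.append(hidden_states[:, 0, :].cpu().numpy()[0])

    if not chunk_embeddings:
        return None
    # Weight each chunk by its token count so a short tail chunk counts less.
    weights = np.array([len(chunk) for chunk in chunks], dtype=float)
    weights /= weights.sum()
    return np.average(chunk_embeddings, axis=0, weights=weights)

embedding = get_embedding_for_text("Some long document ... " * 200, tokenizer, model)
print(embedding.shape)  # (768,) for bert-base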