kambris committed
Commit 0ba80af · verified
1 Parent(s): eb5dfd3

Update app.py

Files changed (1)
  1. app.py +14 -9
app.py CHANGED
@@ -235,28 +235,33 @@ def classify_emotion(text, classifier):
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text while preserving all content."""
-    # Split into optimal chunks of exactly 512 tokens
+    # Get exact token counts
     tokenized_text = tokenizer.encode(text)
-    chunks = [tokenized_text[i:i + 512] for i in range(0, len(tokenized_text), 512)]
-    chunk_embeddings = []
+    total_tokens = len(tokenized_text)
+
+    # Create precise chunks of 512 tokens
+    chunks = []
+    for i in range(0, total_tokens, 512):
+        chunk = tokenized_text[i:i + 512]
+        chunks.append(tokenizer.decode(chunk))
 
+    chunk_embeddings = []
     for chunk in chunks:
-        inputs = tokenizer.encode(
-            tokenizer.decode(chunk),
+        inputs = tokenizer(
+            chunk,
             return_tensors="pt",
             padding='max_length',
             max_length=512
         )
-        inputs = inputs.to(model.device)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
         with torch.no_grad():
-            outputs = model(inputs)[0]
+            outputs = model(**inputs)[0]
         embedding = outputs[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
 
     if chunk_embeddings:
-        # Weight each chunk based on its content length
-        weights = np.array([len(chunk) for chunk in chunks])
+        weights = np.array([len(chunk.split()) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
         return weighted_embedding
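
For context, a minimal usage sketch of the updated function (not part of the commit). It assumes the standard Hugging Face transformers API plus the numpy and torch imports the function already relies on; the checkpoint name is an illustrative placeholder, not necessarily the model this Space loads.

# Hypothetical usage sketch -- the checkpoint below is a placeholder
# assumption, not necessarily what app.py actually loads.
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()  # inference mode; the function already disables gradients

text = "A short example document. " * 50  # fits comfortably in one 512-token chunk
embedding = get_embedding_for_text(text, tokenizer, model)
print(embedding.shape)  # (768,) for a BERT-base encoder: one CLS-token vector

One caveat worth noting: decoding a full 512-token chunk and re-tokenizing it can re-add special tokens, so a multi-chunk input may re-encode to slightly more than 512 tokens; with a 512-position encoder that would overflow unless the tokenizer call also truncates.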