kambris committed on
Commit
cb29d0f
·
verified ·
1 Parent(s): 950bcef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -21
app.py CHANGED
@@ -234,31 +234,27 @@ def classify_emotion(text, classifier):
234
  return final_emotion
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
- """Get embedding for complete text."""
238
- chunks = split_text(text)
 
239
  chunk_embeddings = []
240
 
241
  for chunk in chunks:
242
- try:
243
- inputs = tokenizer(
244
- chunk,
245
- return_tensors="pt",
246
- padding=True,
247
- truncation=True,
248
- max_length=512
249
- )
250
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
251
-
252
- with torch.no_grad():
253
- # Get the correct output format
254
- outputs = model(**inputs)[0] # Access first element of tuple
255
-
256
- embedding = outputs[:, 0, :].cpu().numpy()
257
- chunk_embeddings.append(embedding[0])
258
- except Exception as e:
259
- st.warning(f"Error processing chunk: {str(e)}")
260
- continue
261
 
 
262
  if chunk_embeddings:
263
  weights = np.array([len(chunk.split()) for chunk in chunks])
264
  weights = weights / weights.sum()
 
234
  return final_emotion
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
+ """Get embedding for complete text while preserving all content."""
238
+ # Split into optimal chunks of 512 tokens
239
+ chunks = split_text(text, max_length=512)
240
  chunk_embeddings = []
241
 
242
  for chunk in chunks:
243
+ inputs = tokenizer(
244
+ chunk,
245
+ return_tensors="pt",
246
+ padding=True,
247
+ max_length=512
248
+ )
249
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
250
+
251
+ with torch.no_grad():
252
+ outputs = model(**inputs)[0]
253
+
254
+ embedding = outputs[:, 0, :].cpu().numpy()
255
+ chunk_embeddings.append(embedding[0])
 
 
 
 
 
 
256
 
257
+ # Weight each chunk based on its content
258
  if chunk_embeddings:
259
  weights = np.array([len(chunk.split()) for chunk in chunks])
260
  weights = weights / weights.sum()