Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -235,28 +235,33 @@ def classify_emotion(text, classifier):
|
|
235 |
|
236 |
def get_embedding_for_text(text, tokenizer, model):
|
237 |
"""Get embedding for complete text while preserving all content."""
|
238 |
-
#
|
239 |
tokenized_text = tokenizer.encode(text)
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
|
|
|
|
242 |
|
|
|
243 |
for chunk in chunks:
|
244 |
-
inputs = tokenizer
|
245 |
-
|
246 |
return_tensors="pt",
|
247 |
padding='max_length',
|
248 |
max_length=512
|
249 |
)
|
250 |
-
inputs =
|
251 |
|
252 |
with torch.no_grad():
|
253 |
-
outputs = model(inputs)[0]
|
254 |
embedding = outputs[:, 0, :].cpu().numpy()
|
255 |
chunk_embeddings.append(embedding[0])
|
256 |
|
257 |
if chunk_embeddings:
|
258 |
-
|
259 |
-
weights = np.array([len(chunk) for chunk in chunks])
|
260 |
weights = weights / weights.sum()
|
261 |
weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
|
262 |
return weighted_embedding
|
|
|
def get_embedding_for_text(text, tokenizer, model):
    """Return one embedding vector for *text*, preserving all content.

    Long inputs are split into chunks of at most 512 token ids; each chunk
    is embedded via the model's first-token ([CLS]-position) hidden state,
    and the per-chunk embeddings are combined with a weighted average
    (weight = chunk word count).

    Args:
        text: Input string to embed.
        tokenizer: Hugging Face-style tokenizer (has ``encode``/``decode``
            and is callable returning PyTorch tensors).
        model: Transformer whose first output is the hidden-state tensor
            of shape (batch, seq_len, hidden) — assumed from the
            ``outputs[:, 0, :]`` indexing below.

    Returns:
        A 1-D numpy array of size ``hidden``, or ``None`` when the text
        produces no tokens.
    """
    # Tokenize once so no content is lost to a single-pass truncation.
    tokenized_text = tokenizer.encode(text)
    total_tokens = len(tokenized_text)

    # Split into chunks of at most 512 token ids, decoded back to text.
    chunks = []
    for i in range(0, total_tokens, 512):
        chunk_ids = tokenized_text[i:i + 512]
        chunks.append(tokenizer.decode(chunk_ids))

    chunk_embeddings = []
    for chunk in chunks:
        # truncation=True is required here: decode→re-encode is not
        # length-preserving (special tokens get added again on top of the
        # 512 ids), so without it a re-tokenized chunk can exceed 512 and
        # overflow the model's maximum sequence length at runtime.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)[0]
        # First-token hidden state serves as the chunk embedding.
        embedding = outputs[:, 0, :].cpu().numpy()
        chunk_embeddings.append(embedding[0])

    if chunk_embeddings:
        # Weight chunks by word count so longer chunks contribute more.
        weights = np.array([len(chunk.split()) for chunk in chunks],
                           dtype=float)
        if weights.sum() == 0:
            # Degenerate decode (no whitespace-separated words anywhere):
            # fall back to an unweighted mean instead of dividing by zero.
            weights = np.ones(len(chunks))
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    # No tokens produced -> no embedding; make the implicit None explicit.
    return None