kambris committed on
Commit
6e846e7
·
verified ·
1 Parent(s): c0f831c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -52
app.py CHANGED
@@ -181,60 +181,57 @@ def clean_arabic_text(text):
181
  return ' '.join(cleaned_words)
182
 
183
def classify_emotion(text, classifier):
    """Classify the emotion of a complete text, chunking it to fit the model.

    The text is split on whitespace and packed greedily into chunks whose
    estimated token count stays within the model's 512-token window. Each
    chunk is classified independently; the top-label scores are summed per
    label, averaged over the number of scored chunks, and the label with
    the highest mean is returned. Falls back to "LABEL_2" on any failure.
    """
    try:
        # Greedily pack words into chunks under the 512-token estimate.
        chunks = []
        buffer = []
        buffer_tokens = 0
        for token_word in text.split():
            n_tokens = len(classifier.tokenizer.encode(token_word))
            if buffer_tokens + n_tokens > 512:
                if buffer:
                    chunks.append(' '.join(buffer))
                buffer = [token_word]
                buffer_tokens = n_tokens
            else:
                buffer.append(token_word)
                buffer_tokens += n_tokens
        if buffer:
            chunks.append(' '.join(buffer))
        if not chunks:
            # Whitespace-only input: classify the raw text as one chunk.
            chunks = [text]

        # Classify each chunk; a failing chunk is logged and skipped.
        all_scores = []
        for piece in chunks:
            try:
                # The pipeline returns its top prediction as a single dict.
                all_scores.append(classifier(piece)[0])
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")

        if all_scores:
            # Sum each label's top-prediction score, average over the
            # number of successfully scored chunks, pick the best label.
            totals = {}
            for entry in all_scores:
                totals[entry['label']] = totals.get(entry['label'], 0) + entry['score']
            divisor = len(all_scores)
            averaged = {name: value / divisor for name, value in totals.items()}
            return max(averaged, key=averaged.get)

        return "LABEL_2"

    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  def get_embedding_for_text(text, tokenizer, model):
240
  """Get embedding for complete text."""
 
181
  return ' '.join(cleaned_words)
182
 
183
def classify_emotion(text, classifier):
    """Classify the dominant emotion of `text` with a transformers pipeline.

    The text is split into chunks that fit the model's 512-token window,
    each chunk is scored over ALL labels, and the per-label scores are
    averaged across chunks; the label with the highest mean score wins.

    Args:
        text: Input string (cleaned text; non-string or empty input is
            treated as unclassifiable).
        classifier: A transformers text-classification pipeline exposing
            `.tokenizer` and accepting `return_all_scores=True`.

    Returns:
        The winning label string, or "LABEL_2" (fallback label) for
        empty/invalid input or when no chunk could be classified.
    """
    if not text or not isinstance(text, str):
        return "LABEL_2"

    # Greedily pack words into chunks whose estimated token count stays
    # under the model limit. Per-word encode() includes special tokens,
    # so the estimate overshoots slightly — a safe margin, since the
    # joined chunk is tokenized once with a single special-token pair.
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_tokens = len(classifier.tokenizer.encode(word))
        if current_length + word_tokens > 512:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            # NOTE(review): a single word longer than 512 tokens still
            # becomes an oversized chunk; the per-chunk handler below
            # keeps that from crashing the whole classification.
            current_chunk = [word]
            current_length = word_tokens
        else:
            current_chunk.append(word)
            current_length += word_tokens
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    if not chunks:
        return "LABEL_2"

    # Score every chunk over all labels. A failing chunk is skipped
    # instead of aborting the whole text (restores the resilience the
    # previous revision of this function had).
    # NOTE(review): `return_all_scores=True` is deprecated in newer
    # transformers releases in favour of `top_k=None`; kept here for
    # compatibility with the version this app pins.
    all_scores = []
    for chunk in chunks:
        try:
            all_scores.append(classifier(chunk, return_all_scores=True)[0])
        except Exception as chunk_error:
            st.warning(f"Skipping chunk due to error: {str(chunk_error)}")

    if not all_scores:
        # Every chunk failed — fall back to the neutral label rather
        # than letting max() raise ValueError on an empty mapping.
        return "LABEL_2"

    # Average each label's score across the successfully scored chunks.
    label_scores = {}
    count = len(all_scores)
    for scores in all_scores:
        for score_dict in scores:
            label = score_dict['label']
            label_scores[label] = label_scores.get(label, 0) + score_dict['score']

    avg_scores = {label: total / count for label, total in label_scores.items()}
    return max(avg_scores.items(), key=lambda x: x[1])[0]
235
 
236
  def get_embedding_for_text(text, tokenizer, model):
237
  """Get embedding for complete text."""