kambris committed (verified)
Commit 00bf9b7 · 1 Parent(s): b88eade

Update app.py

Files changed (1):
  1. app.py +44 -7
app.py CHANGED
@@ -53,17 +53,53 @@ def split_text(text, max_length=512):
 
     return chunks
 
+# The beginning of the code remains the same until the classify_emotion function
+
 def classify_emotion(text, classifier):
-    """Classify emotion for complete text."""
+    """Classify emotion for complete text with proper token handling."""
     try:
         # Split text into manageable chunks
-        chunks = split_text(text)
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        # Create chunks that respect the 512 token limit
+        for word in words:
+            # Add word length plus 1 for space
+            word_tokens = len(classifier.tokenizer.encode(word))
+            if current_length + word_tokens > 512:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = word_tokens
+            else:
+                current_chunk.append(word)
+                current_length += word_tokens
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        # If no chunks were created, use the original text with truncation
+        if not chunks:
+            chunks = [text]
 
         all_scores = []
         for chunk in chunks:
-            result = classifier(chunk)
-            scores = result[0]  # Get scores for all labels
-            all_scores.append(scores)
+            try:
+                # Ensure proper truncation
+                inputs = classifier.tokenizer(
+                    chunk,
+                    truncation=True,
+                    max_length=512,
+                    return_tensors="pt"
+                )
+                result = classifier(chunk, truncation=True, max_length=512)
+                scores = result[0]
+                all_scores.append(scores)
+            except Exception as chunk_error:
+                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
+                continue
 
         # Average scores across all chunks
         if all_scores:
@@ -92,6 +128,7 @@ def classify_emotion(text, classifier):
         st.warning(f"Error in emotion classification: {str(e)}")
         return "LABEL_2"  # Default to neutral
 
+
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
     chunks = split_text(text)
@@ -301,7 +338,7 @@ else:
     # Example format
     st.write("### Expected File Format:")
     example_df = pd.DataFrame({
-        'country': ['Egypt', 'Saudi Arabia'],
-        'poem': ['قصيدة مصرية', 'قصيدة سعودية']
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
     })
     st.dataframe(example_df)
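
For context, the split_text helper still called by the surrounding code is not part of this diff. A minimal sketch consistent with its split_text(text, max_length=512) signature, which may differ from the actual body in app.py:

# Hypothetical sketch only: the real split_text body in app.py is not shown in this commit.
def split_text(text, max_length=512):
    """Split text into word chunks of at most max_length words each."""
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks or [text]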
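
The new chunking loop in classify_emotion calls classifier.tokenizer.encode once per word, which works but re-runs the tokenizer many times and counts each word's special tokens separately. A hedged alternative sketch (not what this commit does) encodes the text once and slices the token ids into windows under the model limit:

# Sketch of an alternative approach, not the committed code.
def chunk_by_tokens(text, tokenizer, max_tokens=510):
    # 510 leaves room for the two special tokens most BERT-style models add back.
    ids = tokenizer.encode(text, add_special_tokens=False)
    windows = [ids[i:i + max_tokens] for i in range(0, len(ids), max_tokens)]
    return [tokenizer.decode(window) for window in windows] or [text]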
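
The averaging step referenced by the "# Average scores across all chunks" comment sits outside this hunk. A minimal sketch of per-label averaging, assuming the pipeline returns scores for every label on each chunk (e.g. it was built with top_k=None) and reusing the app's "LABEL_2" neutral fallback:

from collections import defaultdict

def average_chunk_scores(all_scores, fallback="LABEL_2"):
    # all_scores: one list of {"label": ..., "score": ...} dicts per chunk
    totals = defaultdict(float)
    for chunk_scores in all_scores:
        for entry in chunk_scores:
            totals[entry["label"]] += entry["score"]
    if not totals:
        return fallback  # same neutral default classify_emotion falls back to
    # The label with the highest summed score is also the one with the highest mean.
    return max(totals, key=totals.get)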
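
The last hunk only changes the example rows shown for the expected upload format (a 'country' and a 'poem' column). A short sketch of validating an upload against that format; st.file_uploader is assumed here, as the app's actual upload code is not part of this diff:

import pandas as pd
import streamlit as st

uploaded = st.file_uploader("Upload a CSV of poems", type="csv")  # assumed widget, not shown in this diff
if uploaded is not None:
    df = pd.read_csv(uploaded)
    missing = {"country", "poem"} - set(df.columns)
    if missing:
        st.error(f"Missing required column(s): {', '.join(sorted(missing))}")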