Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -53,17 +53,53 @@ def split_text(text, max_length=512):
|
|
53 |
|
54 |
return chunks
|
55 |
|
|
|
|
|
56 |
def classify_emotion(text, classifier):
|
57 |
-
"""Classify emotion for complete text."""
|
58 |
try:
|
59 |
# Split text into manageable chunks
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
all_scores = []
|
63 |
for chunk in chunks:
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
# Average scores across all chunks
|
69 |
if all_scores:
|
@@ -92,6 +128,7 @@ def classify_emotion(text, classifier):
|
|
92 |
st.warning(f"Error in emotion classification: {str(e)}")
|
93 |
return "LABEL_2" # Default to neutral
|
94 |
|
|
|
95 |
def get_embedding_for_text(text, tokenizer, model):
|
96 |
"""Get embedding for complete text."""
|
97 |
chunks = split_text(text)
|
@@ -301,7 +338,7 @@ else:
|
|
301 |
# Example format
|
302 |
st.write("### Expected File Format:")
|
303 |
example_df = pd.DataFrame({
|
304 |
-
'country': ['Egypt', '
|
305 |
-
'poem': ['قصيدة مصرية', 'قصيدة
|
306 |
})
|
307 |
st.dataframe(example_df)
|
|
|
53 |
|
54 |
return chunks
|
55 |
|
56 |
+
# Code above this point is unchanged; only classify_emotion below was modified
|
57 |
+
|
58 |
def classify_emotion(text, classifier):
|
59 |
+
"""Classify emotion for complete text with proper token handling."""
|
60 |
try:
|
61 |
# Split text into manageable chunks
|
62 |
+
words = text.split()
|
63 |
+
chunks = []
|
64 |
+
current_chunk = []
|
65 |
+
current_length = 0
|
66 |
+
|
67 |
+
# Create chunks that respect the 512 token limit
|
68 |
+
for word in words:
|
69 |
+
# Count how many tokens the tokenizer produces for this word
|
70 |
+
word_tokens = len(classifier.tokenizer.encode(word))
|
71 |
+
if current_length + word_tokens > 512:
|
72 |
+
if current_chunk:
|
73 |
+
chunks.append(' '.join(current_chunk))
|
74 |
+
current_chunk = [word]
|
75 |
+
current_length = word_tokens
|
76 |
+
else:
|
77 |
+
current_chunk.append(word)
|
78 |
+
current_length += word_tokens
|
79 |
+
|
80 |
+
if current_chunk:
|
81 |
+
chunks.append(' '.join(current_chunk))
|
82 |
+
|
83 |
+
# If no chunks were created, use the original text with truncation
|
84 |
+
if not chunks:
|
85 |
+
chunks = [text]
|
86 |
|
87 |
all_scores = []
|
88 |
for chunk in chunks:
|
89 |
+
try:
|
90 |
+
# Ensure proper truncation
|
91 |
+
inputs = classifier.tokenizer(
|
92 |
+
chunk,
|
93 |
+
truncation=True,
|
94 |
+
max_length=512,
|
95 |
+
return_tensors="pt"
|
96 |
+
)
|
97 |
+
result = classifier(chunk, truncation=True, max_length=512)
|
98 |
+
scores = result[0]
|
99 |
+
all_scores.append(scores)
|
100 |
+
except Exception as chunk_error:
|
101 |
+
st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
|
102 |
+
continue
|
103 |
|
104 |
# Average scores across all chunks
|
105 |
if all_scores:
|
|
|
128 |
st.warning(f"Error in emotion classification: {str(e)}")
|
129 |
return "LABEL_2" # Default to neutral
|
130 |
|
131 |
+
|
132 |
def get_embedding_for_text(text, tokenizer, model):
|
133 |
"""Get embedding for complete text."""
|
134 |
chunks = split_text(text)
|
|
|
338 |
# Example format
|
339 |
st.write("### Expected File Format:")
|
340 |
example_df = pd.DataFrame({
|
341 |
+
'country': ['Egypt', 'Palestine'],
|
342 |
+
'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
|
343 |
})
|
344 |
st.dataframe(example_df)
|