kambris committed (verified)
Commit 00bf9b7 · 1 Parent(s): b88eade

Update app.py

Files changed (1):
  1. app.py +44 -7
app.py CHANGED
@@ -53,17 +53,53 @@ def split_text(text, max_length=512):
 
     return chunks
 
+# The beginning of the code remains the same until the classify_emotion function
+
 def classify_emotion(text, classifier):
-    """Classify emotion for complete text."""
+    """Classify emotion for complete text with proper token handling."""
     try:
         # Split text into manageable chunks
-        chunks = split_text(text)
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        # Create chunks that respect the 512 token limit
+        for word in words:
+            # Add word length plus 1 for space
+            word_tokens = len(classifier.tokenizer.encode(word))
+            if current_length + word_tokens > 512:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = word_tokens
+            else:
+                current_chunk.append(word)
+                current_length += word_tokens
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        # If no chunks were created, use the original text with truncation
+        if not chunks:
+            chunks = [text]
 
         all_scores = []
         for chunk in chunks:
-            result = classifier(chunk)
-            scores = result[0]  # Get scores for all labels
-            all_scores.append(scores)
+            try:
+                # Ensure proper truncation
+                inputs = classifier.tokenizer(
+                    chunk,
+                    truncation=True,
+                    max_length=512,
+                    return_tensors="pt"
+                )
+                result = classifier(chunk, truncation=True, max_length=512)
+                scores = result[0]
+                all_scores.append(scores)
+            except Exception as chunk_error:
+                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
+                continue
 
         # Average scores across all chunks
         if all_scores:
@@ -92,6 +128,7 @@ def classify_emotion(text, classifier):
         st.warning(f"Error in emotion classification: {str(e)}")
         return "LABEL_2"  # Default to neutral
 
+
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
     chunks = split_text(text)
@@ -301,7 +338,7 @@ else:
     # Example format
     st.write("### Expected File Format:")
     example_df = pd.DataFrame({
-        'country': ['Egypt', 'Saudi Arabia'],
-        'poem': ['قصيدة مصرية', 'قصيدة سعودية']
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
     })
     st.dataframe(example_df)
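
For context, the split_text helper still called by the surrounding code is not part of this diff. A minimal sketch consistent with its split_text(text, max_length=512) signature, which may differ from the actual body in app.py:

# Hypothetical sketch only: the real split_text body in app.py is not shown in this commit.
def split_text(text, max_length=512):
    """Split text into word chunks of at most max_length words each."""
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks or [text]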
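
The new chunking loop in classify_emotion calls classifier.tokenizer.encode once per word, which works but re-runs the tokenizer many times and counts each word's special tokens separately. A hedged alternative sketch (not what this commit does) encodes the text once and slices the token ids into windows under the model limit:

# Sketch of an alternative approach, not the committed code.
def chunk_by_tokens(text, tokenizer, max_tokens=510):
    # 510 leaves room for the two special tokens most BERT-style models add back.
    ids = tokenizer.encode(text, add_special_tokens=False)
    windows = [ids[i:i + max_tokens] for i in range(0, len(ids), max_tokens)]
    return [tokenizer.decode(window) for window in windows] or [text]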
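
The averaging step referenced by the "# Average scores across all chunks" comment sits outside this hunk. A minimal sketch of per-label averaging, assuming the pipeline returns scores for every label on each chunk (e.g. it was built with top_k=None) and reusing the app's "LABEL_2" neutral fallback:

from collections import defaultdict

def average_chunk_scores(all_scores, fallback="LABEL_2"):
    # all_scores: one list of {"label": ..., "score": ...} dicts per chunk
    totals = defaultdict(float)
    for chunk_scores in all_scores:
        for entry in chunk_scores:
            totals[entry["label"]] += entry["score"]
    if not totals:
        return fallback  # same neutral default classify_emotion falls back to
    # The label with the highest summed score is also the one with the highest mean.
    return max(totals, key=totals.get)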
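
The last hunk only changes the example rows shown for the expected upload format (a 'country' and a 'poem' column). A short sketch of validating an upload against that format; st.file_uploader is assumed here, as the app's actual upload code is not part of this diff:

import pandas as pd
import streamlit as st

uploaded = st.file_uploader("Upload a CSV of poems", type="csv")  # assumed widget, not shown in this diff
if uploaded is not None:
    df = pd.read_csv(uploaded)
    missing = {"country", "poem"} - set(df.columns)
    if missing:
        st.error(f"Missing required column(s): {', '.join(sorted(missing))}")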