kambris committed on
Commit
729733d
·
verified ·
1 Parent(s): cebfb12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -170,9 +170,9 @@ def classify_emotion(text, classifier):
170
  return "LABEL_2"
171
 
172
  def get_embedding_for_text(text, tokenizer, model):
173
- """Get embedding for complete text."""
174
  chunks = split_text(text)
175
  chunk_embeddings = []
 
176
 
177
  for chunk in chunks:
178
  try:
@@ -189,18 +189,15 @@ def get_embedding_for_text(text, tokenizer, model):
189
  outputs = model(**inputs)
190
 
191
  embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
192
- chunk_embeddings.append(embedding[0])
193
  except Exception as e:
194
- st.warning(f"Error processing chunk: {str(e)}")
195
  continue
196
 
197
  if chunk_embeddings:
198
- # Convert to numpy array and ensure 2D shape
199
- chunk_embeddings = np.array(chunk_embeddings)
200
- if len(chunk_embeddings.shape) == 1:
201
- chunk_embeddings = chunk_embeddings.reshape(1, -1)
202
- return chunk_embeddings
203
- return np.zeros((1, model.config.hidden_size))
204
 
205
  def format_topics(topic_model, topic_counts):
206
  """Format topics for display."""
@@ -237,6 +234,7 @@ def format_emotions(emotion_counts):
237
 
238
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
239
  summaries = []
 
240
 
241
  topic_model_params = {
242
  "language": "arabic",
@@ -253,7 +251,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
253
  topic_model_params["nr_topics"] = "auto"
254
 
255
  topic_model = BERTopic(
256
- embedding_model=None, # Changed from bert_model to None
257
  **topic_model_params
258
  )
259
 
@@ -270,26 +268,29 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
270
 
271
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
272
  all_emotions = []
 
273
 
274
  # Generate embeddings
275
- embeddings = []
276
  for i, text in enumerate(texts):
277
  try:
278
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
279
- if embedding is not None and not np.isnan(embedding).any():
280
  embeddings.append(embedding)
281
-
282
  progress = (i + 1) / len(texts) * 0.4
283
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
284
  except Exception as e:
285
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
286
  continue
287
 
288
- # Convert embeddings to numpy array
289
- embeddings = np.array(embeddings)
290
-
 
 
 
 
291
  # Process emotions
292
- for i, text in enumerate(texts):
293
  try:
294
  emotion = classify_emotion(text, emotion_classifier)
295
  all_emotions.append(emotion)
@@ -305,8 +306,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
305
  continue
306
 
307
  # Ensure texts and embeddings match
308
- if len(embeddings) != len(texts):
309
- texts = texts[:len(embeddings)]
310
 
311
  # Fit and transform the topic model
312
  topics, probs = topic_model.fit_transform(texts, embeddings)
@@ -329,6 +329,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
329
 
330
  return summaries, topic_model
331
 
 
332
  try:
333
  bert_tokenizer, bert_model, emotion_classifier = load_models()
334
  st.success("Models loaded successfully!")
 
170
  return "LABEL_2"
171
 
172
  def get_embedding_for_text(text, tokenizer, model):
 
173
  chunks = split_text(text)
174
  chunk_embeddings = []
175
+ embedding_size = model.config.hidden_size
176
 
177
  for chunk in chunks:
178
  try:
 
189
  outputs = model(**inputs)
190
 
191
  embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
192
+ chunk_embeddings.append(embedding.reshape(-1))
193
  except Exception as e:
 
194
  continue
195
 
196
  if chunk_embeddings:
197
+ # Ensure consistent shape
198
+ final_embedding = np.mean(chunk_embeddings, axis=0)
199
+ return final_embedding.reshape(-1)
200
+ return np.zeros(embedding_size)
 
 
201
 
202
  def format_topics(topic_model, topic_counts):
203
  """Format topics for display."""
 
234
 
235
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
236
  summaries = []
237
+ embedding_size = bert_model.config.hidden_size
238
 
239
  topic_model_params = {
240
  "language": "arabic",
 
251
  topic_model_params["nr_topics"] = "auto"
252
 
253
  topic_model = BERTopic(
254
+ embedding_model=None,
255
  **topic_model_params
256
  )
257
 
 
268
 
269
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
270
  all_emotions = []
271
+ embeddings = []
272
 
273
  # Generate embeddings
 
274
  for i, text in enumerate(texts):
275
  try:
276
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
277
+ if embedding is not None and embedding.shape[0] == embedding_size:
278
  embeddings.append(embedding)
 
279
  progress = (i + 1) / len(texts) * 0.4
280
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
281
  except Exception as e:
282
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
283
  continue
284
 
285
+ # Convert to numpy array and ensure 2D shape
286
+ if embeddings:
287
+ embeddings = np.vstack(embeddings)
288
+ else:
289
+ st.warning(f"No valid embeddings generated for {country}")
290
+ continue
291
+
292
  # Process emotions
293
+ for i, text in enumerate(texts[:len(embeddings)]):
294
  try:
295
  emotion = classify_emotion(text, emotion_classifier)
296
  all_emotions.append(emotion)
 
306
  continue
307
 
308
  # Ensure texts and embeddings match
309
+ texts = texts[:len(embeddings)]
 
310
 
311
  # Fit and transform the topic model
312
  topics, probs = topic_model.fit_transform(texts, embeddings)
 
329
 
330
  return summaries, topic_model
331
 
332
+
333
  try:
334
  bert_tokenizer, bert_model, emotion_classifier = load_models()
335
  st.success("Models loaded successfully!")