Spaces:

kambris
/

SoLProject

Runtime error

App Files Files Community

kambris committed on Dec 5, 2024

Commit

bd35972

verified ·

1 Parent(s): e9be7bd

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -27

app.py CHANGED Viewed

@@ -39,19 +39,57 @@ st.set_page_config(
 @st.cache_resource
 def load_models():
-    """Load and cache the models to prevent reloading"""
-    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
-    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     emotion_classifier = pipeline(
         "sentiment-analysis",
         model=emotion_model,
-        tokenizer=emotion_tokenizer,
-        return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
@@ -223,31 +261,26 @@ def format_emotions(emotion_counts):
             'count': count
         })
     return formatted_emotions
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
     """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
-    topic_model_params = {
-        "language": "arabic",
-        "calculate_probabilities": True,
-        "min_topic_size": 3,
-        "n_gram_range": (1, 1),
-        "top_n_words": 15,
-        "verbose": True,
-    }
-    st.write(f"Total documents: {len(df)}")
-    st.write(f"Topic strategy: {topic_strategy}")
-    st.write(f"Min topic size: {min_topic_size}")
-    if topic_strategy == "Manual":
-        topic_model_params["nr_topics"] = n_topics
-    else:
-        topic_model_params["nr_topics"] = "auto"
-    topic_model = BERTopic(
-        embedding_model=bert_model,
-        **topic_model_params)
     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,

 @st.cache_resource
 def load_models():
     """Load the tokenizer, the embedding model and the sentiment pipeline.

     The function is wrapped in ``st.cache_resource`` so the loaded objects
     are reused instead of being rebuilt.

     Returns:
         tuple: ``(tokenizer, bert_model, emotion_classifier)`` where
         ``tokenizer`` is the fast tokenizer of the CAMeL-Lab sentiment
         checkpoint, ``bert_model`` is the ``aubmindlab/bert-base-arabertv2``
         encoder and ``emotion_classifier`` is a ``sentiment-analysis``
         pipeline built on the CAMeL-Lab sentiment checkpoint.
     """
     sentiment_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"
     tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint, use_fast=True)
     # torchscript=True is intentionally not passed to either model. That flag
     # exists for TorchScript export and switches the forward pass to plain
     # tuple outputs, whereas the pipeline below looks results up by name on the
     # default dict-style output.
     # NOTE(review): low_cpu_mem_usage=True appears to require the `accelerate`
     # package at load time -- confirm it is listed in the Space requirements.
     # NOTE(review): the returned tokenizer comes from the CAMeL-Lab checkpoint
     # while bert_model is AraBERT v2; if callers tokenize with one and embed
     # with the other, confirm that mixing the two vocabularies is intended.
     bert_model = AutoModel.from_pretrained(
         "aubmindlab/bert-base-arabertv2",
         low_cpu_mem_usage=True,
     )
     emotion_model = AutoModelForSequenceClassification.from_pretrained(
         sentiment_checkpoint,
         low_cpu_mem_usage=True,
     )
     # The pipeline shares the tokenizer loaded above (same checkpoint as the
     # classifier) and groups inputs into batches of 32.
     emotion_classifier = pipeline(
         "sentiment-analysis",
         model=emotion_model,
         tokenizer=tokenizer,
         batch_size=32,
         device=-1,  # -1 keeps inference on the CPU
     )
     return tokenizer, bert_model, emotion_classifier
# Batch helper for the sentiment classifier.
def process_texts_in_batches(texts, batch_size=32, classifier=None):
    """Run a classifier over ``texts`` in fixed-size batches.

    Args:
        texts: Iterable of strings to classify. It is materialized into a
            list first, so generators and other non-sliceable iterables work.
        batch_size: Maximum number of texts handed to the classifier per
            call. Must be at least 1.
        classifier: Callable invoked as
            ``classifier(batch, truncation=True, max_length=512)`` and
            expected to return an iterable of results for that batch. When
            omitted, the module-level name ``emotion_classifier`` is used,
            which is what the function relied on before this parameter
            existed.

    Returns:
        list: The per-batch results concatenated in batch order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        NameError: If ``classifier`` is omitted and no module-level
            ``emotion_classifier`` is defined.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if classifier is None:
        # Backward-compatible fallback: callers that never passed a
        # classifier keep resolving the global exactly as before.
        classifier = emotion_classifier
    texts = list(texts)
    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        results.extend(classifier(batch, truncation=True, max_length=512))
    return results
# Cached embedding lookup, keyed on the text only.
@st.cache_data
def _cached_embedding_for_text(text, _tokenizer, _model):
    """Compute the embedding for ``text`` and let Streamlit memoize it.

    The tokenizer and model parameters start with an underscore so that
    ``st.cache_data`` leaves them out of the cache key; only ``text`` is
    hashed.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def get_cached_embeddings(text, tokenizer, model):
    """Return the embedding for ``text``, reusing a cached result if present.

    Args:
        text: Text to embed; this is the only value that keys the cache.
        tokenizer: Tokenizer forwarded to ``get_embedding_for_text``.
        model: Model forwarded to ``get_embedding_for_text``.

    Returns:
        Whatever ``get_embedding_for_text(text, tokenizer, model)`` returns.

    Note:
        Because the tokenizer and model are not part of the cache key, a call
        with the same text but a different tokenizer/model returns the
        previously cached value. This assumes a single tokenizer/model pair
        per app session -- TODO confirm.
    """
    return _cached_embedding_for_text(text, tokenizer, model)
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
             'count': count
         })
     return formatted_emotions
def get_optimized_topic_model(bert_model, min_topic_size=5, nr_topics="auto"):
    """Build a BERTopic instance with the lightweight settings used by the app.

    Probability calculation and verbose logging are switched off and
    ``low_memory`` is switched on; topics are built from unigrams only.

    Args:
        bert_model: Object passed to BERTopic as ``embedding_model``.
        min_topic_size: Forwarded to BERTopic's ``min_topic_size``. Defaults
            to 5, the value that used to be hard-coded here.
        nr_topics: Forwarded to BERTopic's ``nr_topics``. Defaults to
            ``"auto"``, the value that used to be hard-coded here; pass an
            int to request a fixed number of topics.

    Returns:
        BERTopic: The configured, not yet fitted, topic model.
    """
    # NOTE(review): process_and_summarize accepts min_topic_size and n_topics;
    # it can forward them here now that they are parameters.
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=True,
    )
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
     """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
+    topic_model = get_optimized_topic_model(bert_model)
     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,