kambris committed on
Commit
bd35972
·
verified ·
1 Parent(s): e9be7bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -27
app.py CHANGED
@@ -39,19 +39,57 @@ st.set_page_config(
39
 
40
  @st.cache_resource
41
  def load_models():
42
- """Load and cache the models to prevent reloading"""
43
- tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
44
- bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
45
- emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
46
- emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  emotion_classifier = pipeline(
48
  "sentiment-analysis",
49
  model=emotion_model,
50
- tokenizer=emotion_tokenizer,
51
- return_all_scores=True
 
52
  )
 
53
  return tokenizer, bert_model, emotion_classifier
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def split_text(text, max_length=512):
56
  """Split text into chunks of maximum token length while preserving word boundaries."""
57
  words = text.split()
@@ -223,31 +261,26 @@ def format_emotions(emotion_counts):
223
  'count': count
224
  })
225
  return formatted_emotions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
228
  """Process the data and generate summaries with flexible topic configuration."""
229
  summaries = []
230
 
231
- topic_model_params = {
232
- "language": "arabic",
233
- "calculate_probabilities": True,
234
- "min_topic_size": 3,
235
- "n_gram_range": (1, 1),
236
- "top_n_words": 15,
237
- "verbose": True,
238
- }
239
- st.write(f"Total documents: {len(df)}")
240
- st.write(f"Topic strategy: {topic_strategy}")
241
- st.write(f"Min topic size: {min_topic_size}")
242
-
243
- if topic_strategy == "Manual":
244
- topic_model_params["nr_topics"] = n_topics
245
- else:
246
- topic_model_params["nr_topics"] = "auto"
247
-
248
- topic_model = BERTopic(
249
- embedding_model=bert_model,
250
- **topic_model_params)
251
 
252
  vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
253
  min_df=1,
 
39
 
40
  @st.cache_resource
41
  def load_models():
42
+ """Load and cache the models"""
43
+ # + Added use_fast=True for faster tokenization
44
+ tokenizer = AutoTokenizer.from_pretrained(
45
+ "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
46
+ use_fast=True
47
+ )
48
+
49
+ # + Added torchscript and low_cpu_mem_usage
50
+ bert_model = AutoModel.from_pretrained(
51
+ "aubmindlab/bert-base-arabertv2",
52
+ torchscript=True,
53
+ low_cpu_mem_usage=True
54
+ )
55
+
56
+ # + Added optimizations for emotion model
57
+ emotion_model = AutoModelForSequenceClassification.from_pretrained(
58
+ "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
59
+ torchscript=True,
60
+ low_cpu_mem_usage=True
61
+ )
62
+
63
+ # ~ Changed pipeline configuration to use batching
64
  emotion_classifier = pipeline(
65
  "sentiment-analysis",
66
  model=emotion_model,
67
+ tokenizer=tokenizer,
68
+ batch_size=32,
69
+ device=-1 # + Added to force CPU usage
70
  )
71
+
72
  return tokenizer, bert_model, emotion_classifier
73
 
74
+ # + Added new batch processing function
75
def process_texts_in_batches(texts, batch_size=32, classifier=None):
    """Run the emotion classifier over ``texts`` in fixed-size batches.

    Args:
        texts: Iterable of strings to classify.
        batch_size: Number of texts handed to the classifier per call;
            must be at least 1.
        classifier: Callable taking ``(batch, truncation=..., max_length=...)``
            and returning one result per text. When omitted, the cached
            pipeline returned by ``load_models()`` is used.

    Returns:
        list: Classifier results concatenated in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)  # accept any iterable, not only sliceable sequences
    if not texts:
        return []

    if classifier is None:
        # The previous body read a module-level `emotion_classifier`, but that
        # name only exists as a local inside load_models(), so every call
        # raised NameError. Fetch the cached pipeline explicitly instead.
        _, _, classifier = load_models()

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Inputs longer than the model limit are truncated to 512 tokens.
        results.extend(classifier(batch, truncation=True, max_length=512))
    return results
85
+
86
+ # + Added caching decorator for embeddings
87
@st.cache_data
def _cached_embedding_for_text(text, _tokenizer, _model):
    """Compute the embedding for ``text``, memoized on ``text`` alone.

    The leading underscores tell Streamlit not to hash the tokenizer and
    model arguments; without them ``st.cache_data`` tries to hash those
    objects and fails.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def get_cached_embeddings(text, tokenizer, model):
    """Return the embedding for ``text``, reusing a cached value when present.

    Args:
        text: Input string; this is the only part of the cache key.
        tokenizer: Tokenizer forwarded to ``get_embedding_for_text``.
        model: Model forwarded to ``get_embedding_for_text``.

    Returns:
        Whatever ``get_embedding_for_text`` returns for these arguments.

    Note:
        Because the cache is keyed on ``text`` only, calling this with a
        different tokenizer/model for the same text returns the first
        cached result.
    """
    return _cached_embedding_for_text(text, tokenizer, model)


# Keep the `.clear()` hook that the directly-decorated function exposed.
get_cached_embeddings.clear = _cached_embedding_for_text.clear
91
+
92
+
93
  def split_text(text, max_length=512):
94
  """Split text into chunks of maximum token length while preserving word boundaries."""
95
  words = text.split()
 
261
  'count': count
262
  })
263
  return formatted_emotions
264
+
265
def get_optimized_topic_model(bert_model, nr_topics="auto", min_topic_size=5):
    """Build a BERTopic instance configured for lighter CPU/memory use.

    Args:
        bert_model: Object passed to BERTopic as ``embedding_model``.
        nr_topics: Target number of topics, or ``"auto"`` (the default) to
            let BERTopic reduce topics automatically. Exposed so a caller
            with a manual topic count does not have it silently ignored.
        min_topic_size: Minimum number of documents per topic (default 5).

    Returns:
        BERTopic: A new, unfitted topic model.
    """
    # NOTE(review): BERTopic documents `language` as unused when an explicit
    # embedding model is supplied -- confirm it has any effect here.
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        # Skipping the per-document topic probabilities saves time and memory.
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=True,
    )
277
+
278
 
279
  def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
280
  """Process the data and generate summaries with flexible topic configuration."""
281
  summaries = []
282
 
283
+ topic_model = get_optimized_topic_model(bert_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
  vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
286
  min_df=1,