Spaces: Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -233,7 +233,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
233 |
"n_gram_range": (1, 1),
|
234 |
"top_n_words": 15,
|
235 |
"verbose": True,
|
236 |
-
}
|
237 |
st.write(f"Total documents: {len(df)}")
|
238 |
st.write(f"Topic strategy: {topic_strategy}")
|
239 |
st.write(f"Min topic size: {min_topic_size}")
|
@@ -243,10 +243,14 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
243 |
else:
|
244 |
topic_model_params["nr_topics"] = "auto"
|
245 |
|
246 |
-
topic_model = BERTopic(
|
|
|
|
|
247 |
|
248 |
# Create vectorizer with stop words
|
249 |
-
vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS)
|
|
|
|
|
250 |
topic_model.vectorizer_model = vectorizer
|
251 |
|
252 |
for country, group in df.groupby('country'):
|
@@ -258,11 +262,21 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
258 |
|
259 |
embeddings = []
|
260 |
for i, text in enumerate(texts):
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
progress = (i + 1) / len(texts) * 0.4
|
264 |
progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
|
265 |
|
|
|
|
|
266 |
embeddings = np.array(embeddings)
|
267 |
|
268 |
for i, text in enumerate(texts):
|
@@ -272,16 +286,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
272 |
progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
|
273 |
|
274 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
topics, probs = topic_model.fit_transform(texts, embeddings)
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
if
|
280 |
-
|
|
|
|
|
|
|
281 |
|
282 |
top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
|
283 |
top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
|
284 |
-
|
285 |
summaries.append({
|
286 |
'country': country,
|
287 |
'total_poems': len(texts),
|
@@ -295,7 +318,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
295 |
continue
|
296 |
|
297 |
return summaries, topic_model
|
298 |
-
|
299 |
# Load models
|
300 |
try:
|
301 |
bert_tokenizer, bert_model, emotion_classifier = load_models()
|
|
|
233 |
"n_gram_range": (1, 1),
|
234 |
"top_n_words": 15,
|
235 |
"verbose": True,
|
236 |
+
}
|
237 |
st.write(f"Total documents: {len(df)}")
|
238 |
st.write(f"Topic strategy: {topic_strategy}")
|
239 |
st.write(f"Min topic size: {min_topic_size}")
|
|
|
243 |
else:
|
244 |
topic_model_params["nr_topics"] = "auto"
|
245 |
|
246 |
+
topic_model = BERTopic(
|
247 |
+
embedding_model=bert_model,
|
248 |
+
**topic_model_params)
|
249 |
|
250 |
# Create vectorizer with stop words
|
251 |
+
vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
|
252 |
+
min_df=2,
|
253 |
+
max_df=0.95)
|
254 |
topic_model.vectorizer_model = vectorizer
|
255 |
|
256 |
for country, group in df.groupby('country'):
|
|
|
262 |
|
263 |
embeddings = []
|
264 |
for i, text in enumerate(texts):
|
265 |
+
try:
|
266 |
+
embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
|
267 |
+
if embedding is not None and not np.isnan(embedding).any():
|
268 |
+
embeddings.append(embedding)
|
269 |
+
else:
|
270 |
+
st.warning(f"Invalid embedding generated for text {i+1} in {country}")
|
271 |
+
continue
|
272 |
+
except Exception as e:
|
273 |
+
st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
|
274 |
+
continue
|
275 |
progress = (i + 1) / len(texts) * 0.4
|
276 |
progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
|
277 |
|
278 |
+
if len(embeddings) != len(texts):
|
279 |
+
texts = texts[:len(embeddings)]
|
280 |
embeddings = np.array(embeddings)
|
281 |
|
282 |
for i, text in enumerate(texts):
|
|
|
286 |
progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
|
287 |
|
288 |
try:
|
289 |
+
|
290 |
+
if len(texts) < min_topic_size:
|
291 |
+
st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
|
292 |
+
continue
|
293 |
+
|
294 |
+
|
295 |
topics, probs = topic_model.fit_transform(texts, embeddings)
|
296 |
+
|
297 |
+
|
298 |
+
valid_topics = [t for t in topics if t != -1]
|
299 |
+
if not valid_topics:
|
300 |
+
st.warning(f"No valid topics generated for {country}")
|
301 |
+
continue
|
302 |
+
|
303 |
+
topic_counts = Counter(valid_topics)
|
304 |
|
305 |
top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
|
306 |
top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
|
307 |
+
|
308 |
summaries.append({
|
309 |
'country': country,
|
310 |
'total_poems': len(texts),
|
|
|
318 |
continue
|
319 |
|
320 |
return summaries, topic_model
|
|
|
321 |
# Load models
|
322 |
try:
|
323 |
bert_tokenizer, bert_model, emotion_classifier = load_models()
|