Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import os
|
|
10 |
from wordcloud import WordCloud
|
11 |
import matplotlib.pyplot as plt
|
12 |
import pkg_resources
|
|
|
13 |
|
14 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
15 |
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
|
@@ -51,7 +52,16 @@ def load_models():
|
|
51 |
return_all_scores=True
|
52 |
)
|
53 |
return tokenizer, bert_model, emotion_classifier
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
|
|
55 |
def split_text(text, max_length=512):
|
56 |
"""Split text into chunks of maximum token length while preserving word boundaries."""
|
57 |
words = text.split()
|
@@ -225,96 +235,70 @@ def format_emotions(emotion_counts):
|
|
225 |
return formatted_emotions
|
226 |
|
227 |
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
|
228 |
-
"""Process the data and generate summaries with flexible topic configuration."""
|
229 |
summaries = []
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
"min_topic_size": 3,
|
235 |
-
"n_gram_range": (1, 1),
|
236 |
-
"top_n_words": 15,
|
237 |
-
"verbose": True,
|
238 |
-
}
|
239 |
-
st.write(f"Total documents: {len(df)}")
|
240 |
-
st.write(f"Topic strategy: {topic_strategy}")
|
241 |
-
st.write(f"Min topic size: {min_topic_size}")
|
242 |
-
|
243 |
-
if topic_strategy == "Manual":
|
244 |
-
topic_model_params["nr_topics"] = n_topics
|
245 |
-
else:
|
246 |
-
topic_model_params["nr_topics"] = "auto"
|
247 |
|
248 |
-
|
249 |
-
|
250 |
-
**topic_model_params)
|
251 |
-
|
252 |
-
vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
|
253 |
-
min_df=1,
|
254 |
-
max_df=1.0)
|
255 |
-
topic_model.vectorizer_model = vectorizer
|
256 |
|
257 |
for country, group in df.groupby('country'):
|
258 |
-
|
259 |
-
|
|
|
260 |
|
|
|
261 |
texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
|
262 |
all_emotions = []
|
263 |
|
|
|
264 |
embeddings = []
|
|
|
265 |
for i, text in enumerate(texts):
|
266 |
try:
|
267 |
-
embedding =
|
268 |
if embedding is not None and not np.isnan(embedding).any():
|
269 |
embeddings.append(embedding)
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
273 |
except Exception as e:
|
274 |
-
st.warning(f"Error
|
275 |
continue
|
276 |
-
progress = (i + 1) / len(texts) * 0.4
|
277 |
-
progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
|
278 |
-
|
279 |
-
if len(embeddings) != len(texts):
|
280 |
-
texts = texts[:len(embeddings)]
|
281 |
-
embeddings = np.array(embeddings)
|
282 |
|
|
|
283 |
for i, text in enumerate(texts):
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
|
288 |
-
|
289 |
-
try:
|
290 |
-
|
291 |
-
if len(texts) < min_topic_size:
|
292 |
-
st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
|
293 |
-
continue
|
294 |
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
return summaries, topic_model
|
317 |
-
|
318 |
try:
|
319 |
bert_tokenizer, bert_model, emotion_classifier = load_models()
|
320 |
st.success("Models loaded successfully!")
|
|
|
10 |
from wordcloud import WordCloud
|
11 |
import matplotlib.pyplot as plt
|
12 |
import pkg_resources
|
13 |
+
import gc
|
14 |
|
15 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
16 |
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
|
|
|
52 |
return_all_scores=True
|
53 |
)
|
54 |
return tokenizer, bert_model, emotion_classifier
|
55 |
+
|
56 |
+
@st.cache_data
def _cached_embedding(text, _tokenizer, _model):
    """Compute the embedding for ``text`` via ``get_embedding_for_text``.

    ``_tokenizer`` and ``_model`` carry a leading underscore so that
    ``st.cache_data`` leaves them out of the cache key; only ``text`` is
    hashed.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def cache_embeddings(text, tokenizer, model):
    """Return the embedding for ``text``, reusing a cached result if present.

    Thin wrapper that keeps the original ``(text, tokenizer, model)``
    signature while delegating to a cached helper whose model arguments are
    excluded from Streamlit's argument hashing.

    Args:
        text: The text to embed; this is the only value used as cache key.
        tokenizer: Tokenizer forwarded unchanged to ``get_embedding_for_text``.
        model: Model forwarded unchanged to ``get_embedding_for_text``.

    Returns:
        Whatever ``get_embedding_for_text`` returns for these arguments.

    Note:
        Because the cache key is ``text`` alone, calling this with a
        different tokenizer/model for the same text returns the previously
        cached value.
    """
    return _cached_embedding(text, tokenizer, model)
|
59 |
+
|
60 |
+
@st.cache_data
def _cached_emotion(text, _classifier):
    """Classify the emotion of ``text`` via ``classify_emotion``.

    ``_classifier`` carries a leading underscore so that ``st.cache_data``
    leaves it out of the cache key; only ``text`` is hashed.
    """
    return classify_emotion(text, _classifier)


def cache_emotion_classification(text, classifier):
    """Return the emotion for ``text``, reusing a cached result if present.

    Thin wrapper that keeps the original ``(text, classifier)`` signature
    while delegating to a cached helper whose classifier argument is
    excluded from Streamlit's argument hashing.

    Args:
        text: The text to classify; this is the only value used as cache key.
        classifier: Classifier forwarded unchanged to ``classify_emotion``.

    Returns:
        Whatever ``classify_emotion`` returns for these arguments.

    Note:
        Because the cache key is ``text`` alone, calling this with a
        different classifier for the same text returns the previously
        cached value.
    """
    return _cached_emotion(text, classifier)
|
63 |
|
64 |
+
@st.cache_data
|
65 |
def split_text(text, max_length=512):
|
66 |
"""Split text into chunks of maximum token length while preserving word boundaries."""
|
67 |
words = text.split()
|
|
|
235 |
return formatted_emotions
|
236 |
|
237 |
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Embed and emotion-classify the poems of each country in ``df``.

    For every ``country`` group, the non-null ``poem`` values are cleaned with
    ``clean_arabic_text``, embedded with ``cache_embeddings`` and classified
    with ``cache_emotion_classification``, while a Streamlit progress bar and
    status line report progress.

    Args:
        df: DataFrame with at least ``country`` and ``poem`` columns.
        bert_tokenizer: Tokenizer passed to ``cache_embeddings``.
        bert_model: Model passed to ``cache_embeddings``.
        emotion_classifier: Classifier passed to
            ``cache_emotion_classification``.
        top_n: Not used by the code visible in this function.
        topic_strategy: Not used by the code visible in this function.
        n_topics: Not used by the code visible in this function.
        min_topic_size: Not used by the code visible in this function.

    Returns:
        A ``(summaries, topic_model)`` tuple. With the code present here,
        ``summaries`` stays an empty list and ``topic_model`` stays ``None``
        because the topic-modelling step is missing (see the TODO below).
    """
    summaries = []
    # Bound up front so the final ``return`` cannot raise NameError while the
    # topic-modelling step below is absent.
    topic_model = None

    # One pair of placeholders is reused for every country, so the page does
    # not accumulate a new progress bar per iteration.
    progress_placeholder = st.empty()
    status_message = st.empty()

    for country, group in df.groupby('country'):
        # Clear memory at the start of each country's processing.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Restart the bar at zero for this country.
        progress_bar = progress_placeholder.progress(0)
        status_message.text(f"Processing poems for {country}...")

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        total_texts = len(texts)
        # Refresh the UI roughly once per percent instead of once per poem.
        update_every = max(1, total_texts // 100)

        # Embedding pass: accounts for the first 40% of the bar.
        for i, text in enumerate(texts):
            try:
                embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)

                if i % update_every == 0:
                    progress = (i + 1) / total_texts * 0.4
                    progress_bar.progress(progress)
                    status_message.text(f"Generated embeddings for {i+1}/{total_texts} poems in {country}...")

            except Exception as e:
                # Best effort: report the failing poem and keep going.
                st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                continue

        # Emotion pass: accounts for the next 30% of the bar.
        for i, text in enumerate(texts):
            try:
                emotion = cache_emotion_classification(text, emotion_classifier)
                all_emotions.append(emotion)

                if i % update_every == 0:
                    progress = 0.4 + ((i + 1) / total_texts * 0.3)
                    progress_bar.progress(progress)
                    status_message.text(f"Classified emotions for {i+1}/{total_texts} poems in {country}...")

            except Exception as e:
                # Best effort: report the failing poem and keep going.
                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
                continue

        # Rest of your existing processing code...
        # TODO(review): the topic-modelling / summary-building step is not
        # present here. ``embeddings`` and ``all_emotions`` are collected but
        # nothing is appended to ``summaries`` and ``topic_model`` is never
        # assigned. Restore that step before relying on the return value.

    # Remove the progress widgets once every country has been handled.
    progress_placeholder.empty()
    status_message.empty()

    return summaries, topic_model
|
|
|
302 |
try:
|
303 |
bert_tokenizer, bert_model, emotion_classifier = load_models()
|
304 |
st.success("Models loaded successfully!")
|