Update app.py

app.py CHANGED
@@ -13,8 +13,8 @@ st.set_page_config(
     page_icon="📖",
     layout="wide"
 )
-@st.cache_resource
 
+@st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
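A note on this hunk: the change keeps `@st.cache_resource` directly above `load_models()`, so the Hugging Face objects are built once per process and reused across Streamlit reruns instead of being reloaded on every widget interaction. The hunk cuts off after the tokenizer line; below is a minimal sketch of how the function plausibly continues, given that `load_models()` is later unpacked into three values (the model class and the emotion-pipeline wiring are assumptions, not shown in the diff):

    import streamlit as st
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

    @st.cache_resource
    def load_models():
        """Load and cache the models to prevent reloading"""
        tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
        # Assumed continuation: matching model weights plus an emotion classifier
        model = AutoModelForSequenceClassification.from_pretrained(
            "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"
        )
        emotion_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
        return tokenizer, model, emotion_classifier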
@@ -55,20 +55,21 @@ def split_text(text, max_length=512):
 def clean_arabic_text(text):
     # Add Arabic stop words
     ARABIC_STOP_WORDS = {
+        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'هل', 'اي', 'هذا', 'هذه', 'ذلك',
+        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+        'كان', 'كانت', 'يكون', 'تكون', 'ان', 'لو', 'بعض', 'غير', 'حول',
+        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حول', 'قبل', 'بعد',
+        'و', 'أي', 'كل', 'لن', 'كم', 'هل', 'لو', 'من', 'هو', 'هي', 'قوة',
+        'كما', 'لها', 'منذ', 'فقد', 'لنا', 'ليس', 'لهم', 'حيث', 'هناك',
+        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'لكل', 'عند',
+        'أما', 'لذا', 'لأن', 'لكن', 'وكان', 'لدي', 'فكان', 'فهو', 'فهي',
+        'فلن', 'تكن', 'لكم', 'ولن', 'ولو', 'يكن', 'ولقد', 'ومن', 'لهذا',
+        'اني', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
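Worth flagging while this function is being touched: the `"""Clean Arabic text..."""` string sits after two statements, so Python treats it as a no-op expression rather than the function's docstring, and the large set literal is rebuilt on every call. A small sketch of the more conventional arrangement, with the same behavior assumed (the trailing join is a guess at how the function ends, since the hunk stops at `cleaned_words`):

    ARABIC_STOP_WORDS = frozenset({
        'في', 'من', 'إلى', 'على',  # ... same entries as above, hoisted to module level
    })

    def clean_arabic_text(text):
        """Clean Arabic text by removing stop words and normalizing."""
        words = text.split()
        cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
        return ' '.join(cleaned_words)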
@@ -270,142 +271,142 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
             continue
 
     return summaries, topic_model
 
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
 
+# Main app interface
+st.title("📖 Arabic Poem Analysis")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 
+# File upload
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
+
+if uploaded_file is not None:
+    try:
+        # Read the file
+        if uploaded_file.name.endswith('.csv'):
+            df = pd.read_csv(uploaded_file)
+        else:
+            df = pd.read_excel(uploaded_file)
+
+        # Validate columns
+        required_columns = ['country', 'poem']
+        if not all(col in df.columns for col in required_columns):
+            st.error("File must contain 'country' and 'poem' columns.")
+            st.stop()
+
+        # Clean data
+        df['country'] = df['country'].str.strip()
+        df = df.dropna(subset=['country', 'poem'])
+
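One operational note on the read branch above: `pd.read_csv` works out of the box, but `pd.read_excel` resolves an Excel engine at runtime, so `.xlsx` uploads fail with an ImportError unless `openpyxl` is installed in the Space. A hedged sketch, assuming `openpyxl` is the intended engine and listed in `requirements.txt`:

    # Being explicit about the engine surfaces the dependency up front
    df = pd.read_excel(uploaded_file, engine="openpyxl")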
+        # Add topic modeling controls
+        st.subheader("Topic Modeling Settings")
+        col1, col2 = st.columns(2)
+
+        with col1:
+            topic_strategy = st.radio(
+                "Topic Number Strategy",
+                ["Auto", "Manual"],
+                help="Choose whether to let the model determine the optimal number of topics or set it manually"
+            )
 
+            if topic_strategy == "Manual":
+                n_documents = len(df)
+                max_topics = max(2, min(50, n_documents // 20))
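To make the sizing rule concrete: `max(2, min(50, n_documents // 20))` allows roughly one topic per twenty documents, clamped to the range 2-50, and the slider in the next chunk then defaults to `min(20, max_topics)`. A quick worked example:

    # Sizing rule used above: ~1 topic per 20 documents, clamped to [2, 50]
    for n_documents in (25, 600, 5000):
        print(n_documents, max(2, min(50, n_documents // 20)))
    # 25 -> 2     (25 // 20 = 1, raised to the floor of 2)
    # 600 -> 30   (600 // 20 = 30; the slider default becomes min(20, 30) = 20)
    # 5000 -> 50  (250, capped at 50)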
 
+                n_topics = st.slider(
+                    "Number of Topics",
+                    min_value=2,
+                    max_value=max_topics,
+                    value=min(20, max_topics),
+                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                 )
 
+                st.info(f"""
+                💡 For your dataset of {n_documents:,} documents:
+                - Minimum topics: 2
+                - Maximum topics: {max_topics}
+                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+                """)
+
+        with col2:
+            top_n = st.number_input(
+                "Number of top topics/emotions to display:",
+                min_value=1,
+                max_value=100,
+                value=10
+            )
+
+            min_topic_size = st.slider(
+                "Minimum Topic Size",
+                min_value=10,
+                max_value=100,
+                value=30,
+                help="Minimum number of documents required to form a topic"
+            )
+
+        if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(
+                    df,
+                    bert_tokenizer,
+                    bert_model,
+                    emotion_classifier,
+                    top_n=top_n,
+                    topic_strategy=topic_strategy,
+                    n_topics=n_topics if topic_strategy == "Manual" else None,
+                    min_topic_size=min_topic_size
                 )
 
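`process_and_summarize` is defined earlier in the file, so the diff only shows the call site; passing `n_topics=None` is how the "Auto" strategy is signalled. A sketch of how these two settings presumably reach BERTopic inside that function, assuming they are forwarded unchanged (the variable names here are illustrative):

    from bertopic import BERTopic

    # None -> let BERTopic decide ("auto"); an int -> reduce to that many topics
    topic_model = BERTopic(
        nr_topics=n_topics if n_topics is not None else "auto",
        min_topic_size=min_topic_size,
    )
    topics, probs = topic_model.fit_transform(cleaned_poems)  # cleaned_poems: list of poem strings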
+            if summaries:
+                st.success("Analysis complete!")
+
+                # Display results in tabs
+                tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+
+                with tab1:
+                    for summary in summaries:
+                        with st.expander(f"📖 {summary['country']} ({summary['total_poems']} poems)"):
+                            col1, col2 = st.columns(2)
+
+                            with col1:
+                                st.subheader("Top Topics")
+                                for topic in summary['top_topics']:
+                                    st.write(f"• {topic['topic']}: {topic['count']} poems")
+
+                            with col2:
+                                st.subheader("Emotions")
+                                for emotion in summary['top_emotions']:
+                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+
+                with tab2:
+                    st.subheader("Global Topic Distribution")
+                    topic_info = topic_model.get_topic_info()
+                    for _, row in topic_info.iterrows():
+                        if row['Topic'] == -1:
+                            topic_name = "Miscellaneous"
+                        else:
+                            words = topic_model.get_topic(row['Topic'])
+                            topic_name = " | ".join([word for word, _ in words[:5]])
+                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
+else:
+    st.info("📖 Upload a file to get started!")
 
+    # Example format
+    st.write("### Expected File Format:")
+    example_df = pd.DataFrame({
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
+    })
+    st.dataframe(example_df)
+
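A closing note on the Global Topics tab: `get_topic_info()` returns a DataFrame with (among others) `Topic`, `Count`, and `Name` columns, and BERTopic reserves topic `-1` for outlier documents that fit no cluster, which is why the loop relabels it "Miscellaneous". A sketch of the shape the display code relies on (values illustrative):

    topic_info = topic_model.get_topic_info()  # one row per topic: Topic, Count, Name, ...
    #    Topic  Count            Name
    # 0     -1    412  -1_outlier_terms...    <- outliers, shown as "Miscellaneous"
    # 1      0    180  0_word1_word2...
    top_words = topic_model.get_topic(0)      # [(word, c-TF-IDF weight), ...]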