kambris committed
Commit db1f2f7 · verified
1 Parent(s): 78da58a

Update app.py

Files changed (1)
  1. app.py +144 -143
app.py CHANGED
@@ -13,8 +13,8 @@ st.set_page_config(
     page_icon="📚",
     layout="wide"
 )
-@st.cache_resource
 
+@st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
@@ -55,20 +55,21 @@ def split_text(text, max_length=512):
 def clean_arabic_text(text):
     # Add Arabic stop words
     ARABIC_STOP_WORDS = {
-        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
-        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
-        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
-        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
-        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
-        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
-        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
-        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
-        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
-        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
-        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
-        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
-        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
-        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'}
+        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
+        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+        'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
+        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
+        'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
+        'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
+        'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+        'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
@@ -270,142 +271,142 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
             continue
 
     return summaries, topic_model
-
-# Load models
-try:
-    bert_tokenizer, bert_model, emotion_classifier = load_models()
-    st.success("Models loaded successfully!")
-except Exception as e:
-    st.error(f"Error loading models: {str(e)}")
-    st.stop()
 
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
-# Main app interface
-st.title("📚 Arabic Poem Analysis")
-st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 
+# Main app interface
+st.title("📚 Arabic Poem Analysis")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
-# File upload
-uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 
-if uploaded_file is not None:
-    try:
-        # Read the file
-        if uploaded_file.name.endswith('.csv'):
-            df = pd.read_csv(uploaded_file)
-        else:
-            df = pd.read_excel(uploaded_file)
-
-        # Validate columns
-        required_columns = ['country', 'poem']
-        if not all(col in df.columns for col in required_columns):
-            st.error("File must contain 'country' and 'poem' columns.")
-            st.stop()
-
-        # Clean data
-        df['country'] = df['country'].str.strip()
-        df = df.dropna(subset=['country', 'poem'])
-
-        # Add topic modeling controls
-        st.subheader("Topic Modeling Settings")
-        col1, col2 = st.columns(2)
+# File upload
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
+
+if uploaded_file is not None:
+    try:
+        # Read the file
+        if uploaded_file.name.endswith('.csv'):
+            df = pd.read_csv(uploaded_file)
+        else:
+            df = pd.read_excel(uploaded_file)
+
+        # Validate columns
+        required_columns = ['country', 'poem']
+        if not all(col in df.columns for col in required_columns):
+            st.error("File must contain 'country' and 'poem' columns.")
+            st.stop()
+
+        # Clean data
+        df['country'] = df['country'].str.strip()
+        df = df.dropna(subset=['country', 'poem'])
+
+        # Add topic modeling controls
+        st.subheader("Topic Modeling Settings")
+        col1, col2 = st.columns(2)
 
-        with col1:
-            topic_strategy = st.radio(
-                "Topic Number Strategy",
-                ["Auto", "Manual"],
-                help="Choose whether to let the model determine the optimal number of topics or set it manually"
-            )
+        with col1:
+            topic_strategy = st.radio(
+                "Topic Number Strategy",
+                ["Auto", "Manual"],
+                help="Choose whether to let the model determine the optimal number of topics or set it manually"
+            )
 
-            if topic_strategy == "Manual":
-                # Calculate reasonable max topics based on dataset size
-                n_documents = len(df)
-                max_topics = max(2, min(50, n_documents // 20)) # Ensure minimum of 2
-
-                n_topics = st.slider(
-                    "Number of Topics",
-                    min_value=2,
-                    max_value=max_topics,
-                    value=min(20, max_topics),
-                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
-                )
-
-                st.info(f"""
-                💡 For your dataset of {n_documents:,} documents:
-                - Minimum topics: 2
-                - Maximum topics: {max_topics}
-                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
-                """)
-
-        with col2:
-            top_n = st.number_input(
-                "Number of top topics/emotions to display:",
-                min_value=1,
-                max_value=100,
-                value=10
+            if topic_strategy == "Manual":
+                n_documents = len(df)
+                max_topics = max(2, min(50, n_documents // 20))
+
+                n_topics = st.slider(
+                    "Number of Topics",
+                    min_value=2,
+                    max_value=max_topics,
+                    value=min(20, max_topics),
+                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
+                )
+
+                st.info(f"""
+                💡 For your dataset of {n_documents:,} documents:
+                - Minimum topics: 2
+                - Maximum topics: {max_topics}
+                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+                """)
+
+        with col2:
+            top_n = st.number_input(
+                "Number of top topics/emotions to display:",
+                min_value=1,
+                max_value=100,
+                value=10
             )
 
-            min_topic_size = st.slider(
-                "Minimum Topic Size",
-                min_value=10,
-                max_value=100,
-                value=30,
-                help="Minimum number of documents required to form a topic"
+            min_topic_size = st.slider(
+                "Minimum Topic Size",
+                min_value=10,
+                max_value=100,
+                value=30,
+                help="Minimum number of documents required to form a topic"
             )
 
-        if st.button("Process Data"):
-            with st.spinner("Processing your data..."):
-                summaries, topic_model = process_and_summarize(
-                    df,
-                    bert_tokenizer,
-                    bert_model,
-                    emotion_classifier,
-                    top_n=top_n,
-                    topic_strategy=topic_strategy,
-                    n_topics=n_topics if topic_strategy == "Manual" else None,
-                    min_topic_size=min_topic_size
-                )
+        if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(
+                    df,
+                    bert_tokenizer,
+                    bert_model,
+                    emotion_classifier,
+                    top_n=top_n,
+                    topic_strategy=topic_strategy,
+                    n_topics=n_topics if topic_strategy == "Manual" else None,
+                    min_topic_size=min_topic_size
+                )
 
-                if summaries:
-                    st.success("Analysis complete!")
-
-                    # Display results in tabs
-                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
-
-                    with tab1:
-                        for summary in summaries:
-                            with st.expander(f"📝 {summary['country']} ({summary['total_poems']} poems)"):
-                                col1, col2 = st.columns(2)
-
-                                with col1:
-                                    st.subheader("Top Topics")
-                                    for topic in summary['top_topics']:
-                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
-
-                                with col2:
-                                    st.subheader("Emotions")
-                                    for emotion in summary['top_emotions']:
-                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
-
-                    with tab2:
-                        st.subheader("Global Topic Distribution")
-                        topic_info = topic_model.get_topic_info()
-                        for _, row in topic_info.iterrows():
-                            if row['Topic'] == -1:
-                                topic_name = "Miscellaneous"
-                            else:
-                                words = topic_model.get_topic(row['Topic'])
-                                topic_name = " | ".join([word for word, _ in words[:5]])
-                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
+                if summaries:
+                    st.success("Analysis complete!")
+
+                    # Display results in tabs
+                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+
+                    with tab1:
+                        for summary in summaries:
+                            with st.expander(f"📝 {summary['country']} ({summary['total_poems']} poems)"):
+                                col1, col2 = st.columns(2)
+
+                                with col1:
+                                    st.subheader("Top Topics")
+                                    for topic in summary['top_topics']:
+                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
+
+                                with col2:
+                                    st.subheader("Emotions")
+                                    for emotion in summary['top_emotions']:
+                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+
+                    with tab2:
+                        st.subheader("Global Topic Distribution")
+                        topic_info = topic_model.get_topic_info()
+                        for _, row in topic_info.iterrows():
+                            if row['Topic'] == -1:
+                                topic_name = "Miscellaneous"
+                            else:
+                                words = topic_model.get_topic(row['Topic'])
+                                topic_name = " | ".join([word for word, _ in words[:5]])
+                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
-else:
-    st.info("👆 Upload a file to get started!")
-
-    # Example format
-    st.write("### Expected File Format:")
-    example_df = pd.DataFrame({
-        'country': ['Egypt', 'Palestine'],
-        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
-    })
-    st.dataframe(example_df)
+else:
+    st.info("👆 Upload a file to get started!")
+
+    # Example format
+    st.write("### Expected File Format:")
+    example_df = pd.DataFrame({
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
+    })
+    st.dataframe(example_df)
+
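Note on the first hunk: st.cache_resource caches only the function it directly decorates, so the decorator has to sit immediately above load_models() for the models to be loaded once and reused across Streamlit reruns. A minimal sketch of that pattern, assuming the sequence-classification head is the right model class for the checkpoint shown in the diff:

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification

@st.cache_resource  # first call loads the weights; later script reruns reuse the same objects
def load_models():
    """Load and cache the models to prevent reloading."""
    name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"  # checkpoint from the diff
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name)  # model class is an assumption
    return tokenizer, model

tokenizer, model = load_models()  # cheap after the first run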