kambris committed on
Commit
afa7452
·
verified ·
1 Parent(s): f963213

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -124
app.py CHANGED
@@ -271,138 +271,135 @@ def process_and_summarize(df, top_n=50, topic_strategy="Auto", n_topics=None, mi
271
 
272
  return summaries, topic_model
273
 
274
- # Load models
275
- try:
276
- bert_tokenizer, bert_model, emotion_classifier = load_models()
277
- st.success("Models loaded successfully!")
278
- except Exception as e:
279
- st.error(f"Error loading models: {str(e)}")
280
- st.stop()
 
 
281
 
282
- # Main app interface
283
- st.title("📚 Arabic Poem Analysis")
284
- st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
285
 
286
- # File upload
287
- uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
288
 
289
- if uploaded_file is not None:
290
- try:
291
- # Read the file
292
- if uploaded_file.name.endswith('.csv'):
293
- df = pd.read_csv(uploaded_file)
294
- else:
295
- df = pd.read_excel(uploaded_file)
296
-
297
- # Validate columns
298
- required_columns = ['country', 'poem']
299
- if not all(col in df.columns for col in required_columns):
300
- st.error("File must contain 'country' and 'poem' columns.")
301
- st.stop()
302
-
303
- # Clean data
304
- df['country'] = df['country'].str.strip()
305
- df = df.dropna(subset=['country', 'poem'])
306
-
307
- # Add topic modeling controls
308
- st.subheader("Topic Modeling Settings")
309
- col1, col2 = st.columns(2)
310
-
311
- with col1:
312
- topic_strategy = st.radio(
313
- "Topic Number Strategy",
314
- ["Auto", "Manual"],
315
- help="Choose whether to let the model determine the optimal number of topics or set it manually"
316
- )
317
 
318
- if topic_strategy == "Manual":
319
- # Calculate reasonable max topics based on dataset size
320
- n_documents = len(df)
321
- if n_documents < 1000:
322
- max_topics = min(50, n_documents // 20)
323
- else:
324
- max_topics = min(500, int(np.log10(n_documents) * 100))
325
-
326
- n_topics = st.slider(
327
- "Number of Topics",
328
- min_value=2,
329
- max_value=max_topics,
330
- value=min(20, max_topics),
331
- help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
 
 
 
 
 
332
  )
333
 
334
- st.info(f"""
335
- 💡 For your dataset of {n_documents:,} documents:
336
- - Minimum topics: 2
337
- - Maximum topics: {max_topics}
338
- - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
339
- """)
340
-
341
- with col2:
342
- top_n = st.number_input(
343
- "Number of top topics/emotions to display:",
344
- min_value=1,
345
- max_value=100,
346
- value=10
347
- )
348
 
349
- min_topic_size = st.slider(
350
- "Minimum Topic Size",
351
- min_value=10,
352
- max_value=100,
353
- value=30,
354
- help="Minimum number of documents required to form a topic"
355
- )
356
- except Exception as e:
357
- st.error(f"Error loading models: {str(e)}")
358
- st.stop()
359
- if st.button("Process Data"):
360
- with st.spinner("Processing your data..."):
361
- summaries, topic_model = process_and_summarize(df, top_n=top_n, topic_strategy=topic_strategy, n_topics=n_topics, min_topic_size=min_topic_size)
 
 
362
 
 
 
 
 
 
 
 
 
 
363
 
364
- if summaries:
365
- st.success("Analysis complete!")
366
-
367
- # Display results in tabs
368
- tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
369
-
370
- with tab1:
371
- for summary in summaries:
372
- with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
373
- col1, col2 = st.columns(2)
374
-
375
- with col1:
376
- st.subheader("Top Topics")
377
- for topic in summary['top_topics']:
378
- st.write(f"• {topic['topic']}: {topic['count']} poems")
379
-
380
- with col2:
381
- st.subheader("Emotions")
382
- for emotion in summary['top_emotions']:
383
- st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
384
-
385
- with tab2:
386
- st.subheader("Global Topic Distribution")
387
- topic_info = topic_model.get_topic_info()
388
- for _, row in topic_info.iterrows():
389
- if row['Topic'] == -1:
390
- topic_name = "Miscellaneous"
391
- else:
392
- words = topic_model.get_topic(row['Topic'])
393
- topic_name = " | ".join([word for word, _ in words[:5]])
394
- st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
395
-
396
- except Exception as e:
397
- st.error(f"Error processing file: {str(e)}")
398
- else:
399
- st.info("👆 Upload a file to get started!")
400
 
401
- # Example format
402
- st.write("### Expected File Format:")
403
- example_df = pd.DataFrame({
404
- 'country': ['Egypt', 'Palestine'],
405
- 'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
406
- })
407
- st.dataframe(example_df)
 
 
 
408
 
 
 
 
271
 
272
  return summaries, topic_model
273
 
274
+ # Main application logic
275
+ def main():
276
+ # Load models
277
+ try:
278
+ bert_tokenizer, bert_model, emotion_classifier = load_models()
279
+ st.success("Models loaded successfully!")
280
+ except Exception as e:
281
+ st.error(f"Error loading models: {str(e)}")
282
+ st.stop()
283
 
284
+ # Main app interface
285
+ st.title("📚 Arabic Poem Analysis")
286
+ st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
287
 
288
+ # File upload
289
+ uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
290
 
291
+ if uploaded_file is not None:
292
+ try:
293
+ # Read the file
294
+ if uploaded_file.name.endswith('.csv'):
295
+ df = pd.read_csv(uploaded_file)
296
+ else:
297
+ df = pd.read_excel(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
+ # Validate columns
300
+ required_columns = ['country', 'poem']
301
+ if not all(col in df.columns for col in required_columns):
302
+ st.error("File must contain 'country' and 'poem' columns.")
303
+ st.stop()
304
+
305
+ # Clean data
306
+ df['country'] = df['country'].str.strip()
307
+ df = df.dropna(subset=['country', 'poem'])
308
+
309
+ # Add topic modeling controls
310
+ st.subheader("Topic Modeling Settings")
311
+ col1, col2 = st.columns(2)
312
+
313
+ with col1:
314
+ topic_strategy = st.radio(
315
+ "Topic Number Strategy",
316
+ ["Auto", "Manual"],
317
+ help="Choose whether to let the model determine the optimal number of topics or set it manually"
318
  )
319
 
320
+ if topic_strategy == "Manual":
321
+ n_documents = len(df)
322
+ max_topics = min(500, int(np.log10(n_documents) * 100))
323
+
324
+ n_topics = st.slider(
325
+ "Number of Topics",
326
+ min_value=2,
327
+ max_value=max_topics,
328
+ value=min(20, max_topics),
329
+ help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
330
+ )
 
 
 
331
 
332
+ with col2:
333
+ top_n = st.number_input(
334
+ "Number of top topics/emotions to display:",
335
+ min_value=1,
336
+ max_value=100,
337
+ value=10
338
+ )
339
+
340
+ min_topic_size = st.slider(
341
+ "Minimum Topic Size",
342
+ min_value=10,
343
+ max_value=100,
344
+ value=30,
345
+ help="Minimum number of documents required to form a topic"
346
+ )
347
 
348
+ if st.button("Process Data"):
349
+ with st.spinner("Processing your data..."):
350
+ summaries, topic_model = process_and_summarize(
351
+ df,
352
+ top_n=top_n,
353
+ topic_strategy=topic_strategy,
354
+ n_topics=n_topics if topic_strategy == "Manual" else None,
355
+ min_topic_size=min_topic_size
356
+ )
357
 
358
+ if summaries:
359
+ st.success("Analysis complete!")
360
+
361
+ # Display results in tabs
362
+ tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
363
+
364
+ with tab1:
365
+ for summary in summaries:
366
+ with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
367
+ col1, col2 = st.columns(2)
368
+
369
+ with col1:
370
+ st.subheader("Top Topics")
371
+ for topic in summary['top_topics']:
372
+ st.write(f"• {topic['topic']}: {topic['count']} poems")
373
+
374
+ with col2:
375
+ st.subheader("Emotions")
376
+ for emotion in summary['top_emotions']:
377
+ st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
378
+
379
+ with tab2:
380
+ st.subheader("Global Topic Distribution")
381
+ topic_info = topic_model.get_topic_info()
382
+ for _, row in topic_info.iterrows():
383
+ if row['Topic'] == -1:
384
+ topic_name = "Miscellaneous"
385
+ else:
386
+ words = topic_model.get_topic(row['Topic'])
387
+ topic_name = " | ".join([word for word, _ in words[:5]])
388
+ st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
389
+
390
+ except Exception as e:
391
+ st.error(f"Error processing file: {str(e)}")
 
 
392
 
393
+ else:
394
+ st.info("👆 Upload a file to get started!")
395
+
396
+ # Example format
397
+ st.write("### Expected File Format:")
398
+ example_df = pd.DataFrame({
399
+ 'country': ['Egypt', 'Palestine'],
400
+ 'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
401
+ })
402
+ st.dataframe(example_df)
403
 
404
+ if __name__ == "__main__":
405
+ main()