kambris committed on
Commit
9402b4b
·
verified ·
1 Parent(s): 277802e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -3
app.py CHANGED
@@ -7,6 +7,8 @@ import torch
7
  import numpy as np
8
  from collections import Counter
9
  import os
 
 
10
  # Add Arabic stop words
11
  ARABIC_STOP_WORDS = {
12
  'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
@@ -69,6 +71,21 @@ def split_text(text, max_length=512):
69
 
70
  return chunks
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def clean_arabic_text(text):
73
  """Clean Arabic text by removing stop words and normalizing."""
74
  words = text.split()
@@ -202,18 +219,22 @@ def format_emotions(emotion_counts):
202
  })
203
  return formatted_emotions
204
 
205
- def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=30):
206
  """Process the data and generate summaries with flexible topic configuration."""
207
  summaries = []
208
 
209
  topic_model_params = {
210
  "language": "arabic",
211
  "calculate_probabilities": True,
212
- "min_topic_size": min_topic_size,
213
  "n_gram_range": (1, 2),
214
  "top_n_words": 15,
215
  "verbose": True
 
216
  }
 
 
 
217
 
218
  if topic_strategy == "Manual":
219
  topic_model_params["nr_topics"] = n_topics
@@ -250,7 +271,8 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
250
 
251
  try:
252
  topics, probs = topic_model.fit_transform(texts, embeddings)
253
-
 
254
  topic_counts = Counter(topics)
255
  if -1 in topic_counts:
256
  del topic_counts[-1]
@@ -385,6 +407,12 @@ if uploaded_file is not None:
385
  st.subheader("Emotions")
386
  for emotion in summary['top_emotions']:
387
  st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
 
 
 
 
 
 
388
 
389
  with tab2:
390
  st.subheader("Global Topic Distribution")
 
7
  import numpy as np
8
  from collections import Counter
9
  import os
10
+ from wordcloud import WordCloud
11
+ import matplotlib.pyplot as plt
12
  # Add Arabic stop words
13
  ARABIC_STOP_WORDS = {
14
  'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
 
71
 
72
  return chunks
73
 
74
+ def create_arabic_wordcloud(text, title):
75
+ wordcloud = WordCloud(
76
+ width=1200,
77
+ height=600,
78
+ background_color='white',
79
+ font_path='arial', # Works with system Arabic fonts
80
+ max_words=200
81
+ ).generate(text)
82
+
83
+ fig, ax = plt.subplots(figsize=(15, 8))
84
+ ax.imshow(wordcloud, interpolation='bilinear')
85
+ ax.axis('off')
86
+ ax.set_title(title, fontsize=16, pad=20)
87
+ return fig
88
+
89
  def clean_arabic_text(text):
90
  """Clean Arabic text by removing stop words and normalizing."""
91
  words = text.split()
 
219
  })
220
  return formatted_emotions
221
 
222
+ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=1):
223
  """Process the data and generate summaries with flexible topic configuration."""
224
  summaries = []
225
 
226
  topic_model_params = {
227
  "language": "arabic",
228
  "calculate_probabilities": True,
229
+ "min_topic_size": 1,
230
  "n_gram_range": (1, 2),
231
  "top_n_words": 15,
232
  "verbose": True
233
+ "seed_topic_list": None
234
  }
235
+ st.write(f"Total documents: {len(df)}")
236
+ st.write(f"Topic strategy: {topic_strategy}")
237
+ st.write(f"Min topic size: {min_topic_size}")
238
 
239
  if topic_strategy == "Manual":
240
  topic_model_params["nr_topics"] = n_topics
 
271
 
272
  try:
273
  topics, probs = topic_model.fit_transform(texts, embeddings)
274
+ st.write(f"Number of unique topics: {len(set(topics))}")
275
+ st.write(f"Topic distribution: {Counter(topics)}")
276
  topic_counts = Counter(topics)
277
  if -1 in topic_counts:
278
  del topic_counts[-1]
 
407
  st.subheader("Emotions")
408
  for emotion in summary['top_emotions']:
409
  st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
410
+
411
+ st.subheader("Word Cloud Visualization")
412
+ country_poems = df[df['country'] == summary['country']]['poem']
413
+ combined_text = ' '.join(country_poems)
414
+ wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
415
+ st.pyplot(wordcloud_fig)
416
 
417
  with tab2:
418
  st.subheader("Global Topic Distribution")