kambris committed on
Commit
7173364
·
verified ·
1 Parent(s): 00bf9b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -47
app.py CHANGED
@@ -7,6 +7,24 @@ import numpy as np
7
  from collections import Counter
8
  import os
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Configure page
11
  st.set_page_config(
12
  page_title="Arabic Poem Analysis",
@@ -17,7 +35,6 @@ st.set_page_config(
17
  @st.cache_resource
18
  def load_models():
19
  """Load and cache the models to prevent reloading"""
20
- # Use CAMeL-Lab's tokenizer for consistency with the emotion model
21
  tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
22
  bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
23
  emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
@@ -40,7 +57,7 @@ def split_text(text, max_length=512):
40
  for word in words:
41
  word_length = len(word.split())
42
  if current_length + word_length > max_length:
43
- if current_chunk: # Only append if there are words in the current chunk
44
  chunks.append(' '.join(current_chunk))
45
  current_chunk = [word]
46
  current_length = word_length
@@ -48,25 +65,26 @@ def split_text(text, max_length=512):
48
  current_chunk.append(word)
49
  current_length += word_length
50
 
51
- if current_chunk: # Append the last chunk if it exists
52
  chunks.append(' '.join(current_chunk))
53
 
54
  return chunks
55
 
56
- # The beginning of the code remains the same until the classify_emotion function
 
 
 
 
57
 
58
  def classify_emotion(text, classifier):
59
  """Classify emotion for complete text with proper token handling."""
60
  try:
61
- # Split text into manageable chunks
62
  words = text.split()
63
  chunks = []
64
  current_chunk = []
65
  current_length = 0
66
 
67
- # Create chunks that respect the 512 token limit
68
  for word in words:
69
- # Add word length plus 1 for space
70
  word_tokens = len(classifier.tokenizer.encode(word))
71
  if current_length + word_tokens > 512:
72
  if current_chunk:
@@ -80,14 +98,12 @@ def classify_emotion(text, classifier):
80
  if current_chunk:
81
  chunks.append(' '.join(current_chunk))
82
 
83
- # If no chunks were created, use the original text with truncation
84
  if not chunks:
85
  chunks = [text]
86
 
87
  all_scores = []
88
  for chunk in chunks:
89
  try:
90
- # Ensure proper truncation
91
  inputs = classifier.tokenizer(
92
  chunk,
93
  truncation=True,
@@ -101,13 +117,10 @@ def classify_emotion(text, classifier):
101
  st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
102
  continue
103
 
104
- # Average scores across all chunks
105
  if all_scores:
106
- # Create a dictionary to store summed scores for each label
107
  label_scores = {}
108
  count = len(all_scores)
109
 
110
- # Sum up scores for each label
111
  for scores in all_scores:
112
  for score in scores:
113
  label = score['label']
@@ -115,19 +128,15 @@ def classify_emotion(text, classifier):
115
  label_scores[label] = 0
116
  label_scores[label] += score['score']
117
 
118
- # Calculate averages
119
  avg_scores = {label: score/count for label, score in label_scores.items()}
120
-
121
- # Get the label with highest average score
122
  final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
123
  return final_emotion
124
 
125
- return "LABEL_2" # Default to neutral if no valid results
126
 
127
  except Exception as e:
128
  st.warning(f"Error in emotion classification: {str(e)}")
129
- return "LABEL_2" # Default to neutral
130
-
131
 
132
  def get_embedding_for_text(text, tokenizer, model):
133
  """Get embedding for complete text."""
@@ -155,7 +164,6 @@ def get_embedding_for_text(text, tokenizer, model):
155
  continue
156
 
157
  if chunk_embeddings:
158
- # Use weighted average based on chunk length
159
  weights = np.array([len(chunk.split()) for chunk in chunks])
160
  weights = weights / weights.sum()
161
  weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
@@ -170,7 +178,7 @@ def format_topics(topic_model, topic_counts):
170
  topic_label = "Miscellaneous"
171
  else:
172
  words = topic_model.get_topic(topic_num)
173
- topic_label = " | ".join([word for word, _ in words[:5]]) # Show top 5 words per topic
174
 
175
  formatted_topics.append({
176
  'topic': topic_label,
@@ -180,7 +188,6 @@ def format_topics(topic_model, topic_counts):
180
 
181
  def format_emotions(emotion_counts):
182
  """Format emotions for display."""
183
- # Define emotion labels mapping
184
  EMOTION_LABELS = {
185
  'LABEL_0': 'Negative',
186
  'LABEL_1': 'Positive',
@@ -196,29 +203,35 @@ def format_emotions(emotion_counts):
196
  })
197
  return formatted_emotions
198
 
199
- def process_and_summarize(df, top_n=50):
200
- """Process the data and generate summaries."""
201
  summaries = []
202
 
203
- # Initialize BERTopic with Arabic-specific settings
204
- topic_model = BERTopic(
205
- language="multilingual",
206
- calculate_probabilities=True,
207
- min_topic_size=2, # Allow smaller topic groups
208
- n_gram_range=(1, 3), # Include up to trigrams
209
- top_n_words=15, # Show more words per topic
210
- verbose=True
211
- )
 
 
 
 
 
 
 
 
212
 
213
- # Group by country
214
  for country, group in df.groupby('country'):
215
  progress_text = f"Processing poems for {country}..."
216
  progress_bar = st.progress(0, text=progress_text)
217
 
218
- texts = group['poem'].dropna().tolist()
219
  all_emotions = []
220
 
221
- # Generate embeddings with progress tracking
222
  embeddings = []
223
  for i, text in enumerate(texts):
224
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
@@ -228,7 +241,6 @@ def process_and_summarize(df, top_n=50):
228
 
229
  embeddings = np.array(embeddings)
230
 
231
- # Process emotions with progress tracking
232
  for i, text in enumerate(texts):
233
  emotion = classify_emotion(text, emotion_classifier)
234
  all_emotions.append(emotion)
@@ -236,11 +248,13 @@ def process_and_summarize(df, top_n=50):
236
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
237
 
238
  try:
239
- # Fit topic model
240
- topics, _ = topic_model.fit_transform(texts, embeddings)
 
 
 
241
 
242
- # Format results
243
- top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
244
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
245
 
246
  summaries.append({
@@ -290,14 +304,61 @@ if uploaded_file is not None:
290
  df['country'] = df['country'].str.strip()
291
  df = df.dropna(subset=['country', 'poem'])
292
 
293
- # Process data
294
- top_n = st.number_input("Number of top topics/emotions to display:",
295
- min_value=1, max_value=100, value=10)
296
 
297
- if st.button("Process Data"):
298
- with st.spinner("Processing your data..."):
299
- summaries, topic_model = process_and_summarize(df, top_n=top_n)
 
 
 
 
 
 
 
 
 
 
 
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  if summaries:
302
  st.success("Analysis complete!")
303
 
@@ -341,4 +402,5 @@ else:
341
  'country': ['Egypt', 'Palestine'],
342
  'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
343
  })
344
- st.dataframe(example_df)
 
 
7
  from collections import Counter
8
  import os
9
 
10
# Arabic stop words: tokens dropped by clean_arabic_text and handed to the
# topic model configuration. Each entry is listed once; the original literal
# repeated several of them, which a set silently collapses anyway.
# NOTE(review): 'قوة', 'وقف', 'حاجة', 'لازم' and 'كلم' look like content
# words rather than function words -- confirm they are intentional.
ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
    'أن', 'له', 'قوة',
    'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله',
    'أما', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
    'وهو', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
    'اول', 'انها', 'جميع', 'ايضا',
    'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'ضد',
}
27
+
28
  # Configure page
29
  st.set_page_config(
30
  page_title="Arabic Poem Analysis",
 
35
  @st.cache_resource
36
  def load_models():
37
  """Load and cache the models to prevent reloading"""
 
38
  tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
39
  bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
40
  emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
 
57
  for word in words:
58
  word_length = len(word.split())
59
  if current_length + word_length > max_length:
60
+ if current_chunk:
61
  chunks.append(' '.join(current_chunk))
62
  current_chunk = [word]
63
  current_length = word_length
 
65
  current_chunk.append(word)
66
  current_length += word_length
67
 
68
+ if current_chunk:
69
  chunks.append(' '.join(current_chunk))
70
 
71
  return chunks
72
 
73
# Translation table for str.translate that deletes Arabic harakat
# (U+064B-U+0652), the superscript alef (U+0670) and the tatweel (U+0640).
_ARABIC_MARKS = dict.fromkeys([*range(0x064B, 0x0653), 0x0670, 0x0640])


def clean_arabic_text(text, stop_words=None):
    """Remove stop words and single-character tokens from Arabic text.

    Tokens are obtained by splitting on whitespace. Each token is compared
    against the stop-word collection in a normalized form (diacritics and
    tatweel stripped), so a vocalized token such as a diacritized "من" is
    recognised as the stop word it spells. Tokens that survive are returned
    unchanged, i.e. with their original diacritics.

    Args:
        text: The string to clean.
        stop_words: Optional collection of stop words to filter out.
            Defaults to the module-level ARABIC_STOP_WORDS.

    Returns:
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    if stop_words is None:
        stop_words = ARABIC_STOP_WORDS

    kept = []
    for word in text.split():
        # Compare on the bare letters; the original code matched the raw
        # token, so any vocalized stop word slipped through.
        bare = word.translate(_ARABIC_MARKS)
        if len(bare) > 1 and bare not in stop_words:
            kept.append(word)
    return ' '.join(kept)
78
 
79
  def classify_emotion(text, classifier):
80
  """Classify emotion for complete text with proper token handling."""
81
  try:
 
82
  words = text.split()
83
  chunks = []
84
  current_chunk = []
85
  current_length = 0
86
 
 
87
  for word in words:
 
88
  word_tokens = len(classifier.tokenizer.encode(word))
89
  if current_length + word_tokens > 512:
90
  if current_chunk:
 
98
  if current_chunk:
99
  chunks.append(' '.join(current_chunk))
100
 
 
101
  if not chunks:
102
  chunks = [text]
103
 
104
  all_scores = []
105
  for chunk in chunks:
106
  try:
 
107
  inputs = classifier.tokenizer(
108
  chunk,
109
  truncation=True,
 
117
  st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
118
  continue
119
 
 
120
  if all_scores:
 
121
  label_scores = {}
122
  count = len(all_scores)
123
 
 
124
  for scores in all_scores:
125
  for score in scores:
126
  label = score['label']
 
128
  label_scores[label] = 0
129
  label_scores[label] += score['score']
130
 
 
131
  avg_scores = {label: score/count for label, score in label_scores.items()}
 
 
132
  final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
133
  return final_emotion
134
 
135
+ return "LABEL_2"
136
 
137
  except Exception as e:
138
  st.warning(f"Error in emotion classification: {str(e)}")
139
+ return "LABEL_2"
 
140
 
141
  def get_embedding_for_text(text, tokenizer, model):
142
  """Get embedding for complete text."""
 
164
  continue
165
 
166
  if chunk_embeddings:
 
167
  weights = np.array([len(chunk.split()) for chunk in chunks])
168
  weights = weights / weights.sum()
169
  weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
 
178
  topic_label = "Miscellaneous"
179
  else:
180
  words = topic_model.get_topic(topic_num)
181
+ topic_label = " | ".join([word for word, _ in words[:5]])
182
 
183
  formatted_topics.append({
184
  'topic': topic_label,
 
188
 
189
  def format_emotions(emotion_counts):
190
  """Format emotions for display."""
 
191
  EMOTION_LABELS = {
192
  'LABEL_0': 'Negative',
193
  'LABEL_1': 'Positive',
 
203
  })
204
  return formatted_emotions
205
 
206
+ def process_and_summarize(df, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=30):
207
+ """Process the data and generate summaries with flexible topic configuration."""
208
  summaries = []
209
 
210
+ topic_model_params = {
211
+ "language": "multilingual",
212
+ "calculate_probabilities": True,
213
+ "min_topic_size": min_topic_size,
214
+ "n_gram_range": (1, 3),
215
+ "top_n_words": 15,
216
+ "verbose": True,
217
+ "diversity": 0.5,
218
+ "stop_words": ARABIC_STOP_WORDS
219
+ }
220
+
221
+ if topic_strategy == "Manual" and n_topics is not None:
222
+ topic_model_params["nr_topics"] = n_topics
223
+ else:
224
+ topic_model_params["nr_topics"] = "auto"
225
+
226
+ topic_model = BERTopic(**topic_model_params)
227
 
 
228
  for country, group in df.groupby('country'):
229
  progress_text = f"Processing poems for {country}..."
230
  progress_bar = st.progress(0, text=progress_text)
231
 
232
+ texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
233
  all_emotions = []
234
 
 
235
  embeddings = []
236
  for i, text in enumerate(texts):
237
  embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
 
241
 
242
  embeddings = np.array(embeddings)
243
 
 
244
  for i, text in enumerate(texts):
245
  emotion = classify_emotion(text, emotion_classifier)
246
  all_emotions.append(emotion)
 
248
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
249
 
250
  try:
251
+ topics, probs = topic_model.fit_transform(texts, embeddings)
252
+
253
+ topic_counts = Counter(topics)
254
+ if -1 in topic_counts:
255
+ del topic_counts[-1]
256
 
257
+ top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
 
258
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
259
 
260
  summaries.append({
 
304
  df['country'] = df['country'].str.strip()
305
  df = df.dropna(subset=['country', 'poem'])
306
 
307
# Topic modeling controls: gather the user's settings, then run the analysis
# when the button is pressed.
st.subheader("Topic Modeling Settings")
col1, col2 = st.columns(2)

with col1:
    topic_strategy = st.radio(
        "Topic Number Strategy",
        ["Auto", "Manual"],
        help="Choose whether to let the model determine the optimal number of topics or set it manually"
    )

    # Only the "Manual" branch assigns a slider value; default to None so
    # the process_and_summarize call below does not raise NameError in
    # "Auto" mode (process_and_summarize already treats None as "auto").
    n_topics = None

    if topic_strategy == "Manual":
        # Calculate reasonable max topics based on dataset size
        n_documents = len(df)
        if n_documents < 1000:
            max_topics = min(50, n_documents // 20)
        else:
            max_topics = min(500, int(np.log10(n_documents) * 100))

        # Fewer than 40 documents gives 0 or 1 here, which would put the
        # slider's max_value below its min_value of 2. Clamp to 3 rather
        # than 2 because some Streamlit versions may also reject
        # min_value == max_value -- TODO confirm against the pinned version.
        max_topics = max(3, max_topics)

        n_topics = st.slider(
            "Number of Topics",
            min_value=2,
            max_value=max_topics,
            value=min(20, max_topics),
            help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
        )

        # The upper bound of the recommended range is floored at 2 so the
        # range never reads inverted (e.g. "2-1") for small max_topics.
        st.info(f"""
        💡 For your dataset of {n_documents:,} documents:
        - Minimum topics: 2
        - Maximum topics: {max_topics}
        - Recommended range: {max(2, max_topics//5)}-{max(2, max_topics//2)}
        """)

with col2:
    top_n = st.number_input(
        "Number of top topics/emotions to display:",
        min_value=1,
        max_value=100,
        value=10
    )

    min_topic_size = st.slider(
        "Minimum Topic Size",
        min_value=10,
        max_value=100,
        value=30,
        help="Minimum number of documents required to form a topic"
    )

if st.button("Process Data"):
    with st.spinner("Processing your data..."):
        summaries, topic_model = process_and_summarize(
            df,
            top_n=top_n,
            topic_strategy=topic_strategy,
            n_topics=n_topics,
            min_topic_size=min_topic_size,
        )
360
+
361
+
362
  if summaries:
363
  st.success("Analysis complete!")
364
 
 
402
  'country': ['Egypt', 'Palestine'],
403
  'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
404
  })
405
+ st.dataframe(example_df)
406
+