kambris committed on
Commit
b88eade
·
verified ·
1 Parent(s): 89175c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -93
app.py CHANGED
@@ -17,96 +17,113 @@ st.set_page_config(
17
  @st.cache_resource
18
  def load_models():
19
  """Load and cache the models to prevent reloading"""
20
- bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
 
21
  bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
22
  emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
23
- emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
24
- return bert_tokenizer, bert_model, emotion_classifier
25
-
26
- # Load models
27
- try:
28
- bert_tokenizer, bert_model, emotion_classifier = load_models()
29
- st.success("Models loaded successfully!")
30
- except Exception as e:
31
- st.error(f"Error loading models: {str(e)}")
32
- st.stop()
33
-
34
- # Define emotion labels mapping
35
- EMOTION_LABELS = {
36
- 'LABEL_0': 'Negative',
37
- 'LABEL_1': 'Positive',
38
- 'LABEL_2': 'Neutral'
39
- }
40
 
41
- def chunk_long_text(text, tokenizer, max_length=512):
42
- """Split text into chunks respecting token limit."""
43
- tokens = tokenizer.encode(text, add_special_tokens=False)
44
  chunks = []
45
- text_chunks = []
46
-
47
- for i in range(0, len(tokens), max_length-2):
48
- chunk = tokens[i:i + max_length-2]
49
- full_chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
50
- chunks.append(full_chunk)
51
- text_chunks.append(tokenizer.decode(chunk))
52
 
53
- return chunks, text_chunks
54
-
55
- def get_embedding_for_text(text, tokenizer, model):
56
- """Get embedding for a text, handling long sequences."""
57
- _, text_chunks = chunk_long_text(text, tokenizer)
58
- chunk_embeddings = []
59
-
60
- for chunk in text_chunks:
61
- inputs = tokenizer(chunk,
62
- return_tensors="pt",
63
- padding=True,
64
- truncation=True,
65
- max_length=512)
66
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
67
-
68
- with torch.no_grad():
69
- outputs = model(**inputs)
70
-
71
- embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
72
- chunk_embeddings.append(embedding[0])
73
-
74
- if chunk_embeddings:
75
- return np.mean(chunk_embeddings, axis=0)
76
- return np.zeros(model.config.hidden_size)
77
-
78
- def generate_embeddings(texts, tokenizer, model):
79
- """Generate embeddings for a list of texts."""
80
- embeddings = []
81
 
82
- for text in texts:
83
- try:
84
- embedding = get_embedding_for_text(text, tokenizer, model)
85
- embeddings.append(embedding)
86
- except Exception as e:
87
- st.warning(f"Error processing text: {str(e)}")
88
- embeddings.append(np.zeros(model.config.hidden_size))
89
 
90
- return np.array(embeddings)
91
 
92
- def classify_emotion(text, tokenizer, classifier):
93
- """Classify emotion for a text using majority voting."""
94
  try:
95
- _, text_chunks = chunk_long_text(text, tokenizer)
96
- chunk_emotions = []
97
 
98
- for chunk in text_chunks:
99
- result = classifier(chunk, max_length=512, truncation=True)[0]
100
- chunk_emotions.append(result['label'])
 
 
101
 
102
- if chunk_emotions:
103
- final_emotion = Counter(chunk_emotions).most_common(1)[0][0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  return final_emotion
105
- return "unknown"
 
106
 
107
  except Exception as e:
108
  st.warning(f"Error in emotion classification: {str(e)}")
109
- return "unknown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def format_topics(topic_model, topic_counts):
112
  """Format topics for display."""
@@ -116,7 +133,7 @@ def format_topics(topic_model, topic_counts):
116
  topic_label = "Miscellaneous"
117
  else:
118
  words = topic_model.get_topic(topic_num)
119
- topic_label = " | ".join([word for word, _ in words[:3]])
120
 
121
  formatted_topics.append({
122
  'topic': topic_label,
@@ -126,6 +143,13 @@ def format_topics(topic_model, topic_counts):
126
 
127
  def format_emotions(emotion_counts):
128
  """Format emotions for display."""
 
 
 
 
 
 
 
129
  formatted_emotions = []
130
  for label, count in emotion_counts:
131
  emotion = EMOTION_LABELS.get(label, label)
@@ -139,11 +163,13 @@ def process_and_summarize(df, top_n=50):
139
  """Process the data and generate summaries."""
140
  summaries = []
141
 
142
- # Initialize BERTopic
143
  topic_model = BERTopic(
144
- language="arabic",
145
  calculate_probabilities=True,
146
- min_topic_size=5,
 
 
147
  verbose=True
148
  )
149
 
@@ -153,20 +179,24 @@ def process_and_summarize(df, top_n=50):
153
  progress_bar = st.progress(0, text=progress_text)
154
 
155
  texts = group['poem'].dropna().tolist()
156
- batch_size = 10
157
  all_emotions = []
158
 
159
- # Generate embeddings
160
- embeddings = generate_embeddings(texts, bert_tokenizer, bert_model)
161
- progress_bar.progress(0.33, text="Generating embeddings...")
 
 
 
 
 
 
162
 
163
- # Process emotions
164
- for i in range(0, len(texts), batch_size):
165
- batch_texts = texts[i:i + batch_size]
166
- batch_emotions = [classify_emotion(text, bert_tokenizer, emotion_classifier)
167
- for text in batch_texts]
168
- all_emotions.extend(batch_emotions)
169
- progress_bar.progress(0.66, text="Classifying emotions...")
170
 
171
  try:
172
  # Fit topic model
@@ -183,12 +213,21 @@ def process_and_summarize(df, top_n=50):
183
  'top_emotions': top_emotions
184
  })
185
  progress_bar.progress(1.0, text="Processing complete!")
 
186
  except Exception as e:
187
  st.warning(f"Could not generate topics for {country}: {str(e)}")
188
  continue
189
 
190
  return summaries, topic_model
191
 
 
 
 
 
 
 
 
 
192
  # Main app interface
193
  st.title("📚 Arabic Poem Analysis")
194
  st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
@@ -251,7 +290,7 @@ if uploaded_file is not None:
251
  topic_name = "Miscellaneous"
252
  else:
253
  words = topic_model.get_topic(row['Topic'])
254
- topic_name = " | ".join([word for word, _ in words[:3]])
255
  st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
256
 
257
  except Exception as e:
@@ -265,5 +304,4 @@ else:
265
  'country': ['Egypt', 'Saudi Arabia'],
266
  'poem': ['قصيدة مصرية', 'قصيدة سعودية']
267
  })
268
- st.dataframe(example_df)
269
-
 
17
@st.cache_resource
def load_models():
    """Load and cache the models so Streamlit reruns do not reload them.

    Returns:
        A 3-tuple ``(tokenizer, bert_model, emotion_classifier)``:

        * ``tokenizer`` -- tokenizer belonging to ``bert_model``. The two are
          passed together to ``get_embedding_for_text``, so they must come
          from the same checkpoint; feeding ids produced by another
          vocabulary into the model yields meaningless embeddings.
        * ``bert_model`` -- encoder used to embed poems.
        * ``emotion_classifier`` -- sentiment pipeline that carries its own
          tokenizer and returns the scores of every label.
    """
    embedding_checkpoint = "aubmindlab/bert-base-arabertv2"
    emotion_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"

    # The embedding tokenizer must match the embedding model's vocabulary.
    tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
    bert_model = AutoModel.from_pretrained(embedding_checkpoint)

    # The sentiment model gets its own tokenizer; it is only used inside the
    # pipeline and is loaded exactly once.
    emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)
    emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        # classify_emotion averages the per-label scores, so every label's
        # score is needed rather than just the top one.
        return_all_scores=True,
    )
    return tokenizer, bert_model, emotion_classifier
 
 
 
 
 
 
 
 
 
32
 
33
def split_text(text, max_length=512, tokenizer=None):
    """Split text into whitespace-joined chunks without breaking words.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_length: Maximum size of one chunk, measured in whitespace
            separated words by default, or in sub-word tokens when
            ``tokenizer`` is given.
        tokenizer: Optional object with a ``tokenize(str) -> list`` method.
            When supplied, each word costs as many units as the tokenizer
            produces for it (at least one), so ``max_length`` becomes a real
            token budget. Leave room for special tokens yourself (for
            example pass ``max_length=510`` for a 512-token model).

    Returns:
        A list of chunk strings. Words are re-joined with single spaces, so
        the original spacing and line breaks are not preserved. A single
        word that is larger than ``max_length`` on its own is kept intact as
        its own chunk rather than being split.
    """
    if not text:
        return []

    def cost(word):
        # Without a tokenizer every word counts as exactly one unit.
        if tokenizer is None:
            return 1
        return max(1, len(tokenizer.tokenize(word)))

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = cost(word)
        # Close the running chunk only if it already holds something;
        # otherwise an oversized first word would emit an empty chunk.
        if current_chunk and current_length + word_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
55
 
56
def classify_emotion(text, classifier):
    """Classify the overall emotion of a text of arbitrary length.

    The text is cut into chunks with ``split_text``, every chunk is scored by
    ``classifier``, and the label with the highest score averaged over all
    chunks is returned.

    Args:
        text: The text to classify.
        classifier: A text-classification pipeline that returns the score of
            every label for an input string.

    Returns:
        The winning label string, or ``"LABEL_2"`` when there is nothing to
        score or when classification raises (a warning is shown in the
        Streamlit UI in the latter case).
    """
    try:
        label_totals = {}
        chunk_count = 0

        for chunk in split_text(text):
            # split_text budgets by words, not sub-word tokens, so a chunk
            # can still exceed the model's input limit; truncate instead of
            # letting the model raise and the whole poem fall back to the
            # default label.
            result = classifier(chunk, truncation=True, max_length=512)

            # Depending on the pipeline configuration/version the scores for
            # a single string arrive either nested ([[{...}, ...]]) or flat
            # ([{...}, ...]); accept both.
            scores = result[0] if result and isinstance(result[0], list) else result

            for entry in scores:
                label = entry['label']
                label_totals[label] = label_totals.get(label, 0.0) + entry['score']
            chunk_count += 1

        if not label_totals:
            return "LABEL_2"  # Default to neutral if no valid results

        # Average per label, then pick the label with the highest mean.
        avg_scores = {label: total / chunk_count for label, total in label_totals.items()}
        return max(avg_scores.items(), key=lambda item: item[1])[0]

    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"  # Default to neutral
94
+
95
def get_embedding_for_text(text, tokenizer, model):
    """Embed a text of arbitrary length as a single vector.

    The text is cut into chunks with ``split_text``. Each chunk is encoded by
    ``model`` and represented by the hidden state at position 0 of the last
    layer. The chunk vectors are then combined into a word-count weighted
    average.

    Args:
        text: The text to embed.
        tokenizer: Tokenizer matching ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        A 1-D NumPy array. If no chunk could be embedded (empty text or every
        chunk failed) a zero vector of length ``model.config.hidden_size`` is
        returned.
    """
    chunk_embeddings = []
    # Kept in lock-step with chunk_embeddings: a chunk that fails to embed
    # contributes neither a vector nor a weight, so np.average never sees
    # mismatched lengths.
    chunk_weights = []

    for chunk in split_text(text):
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,  # chunks are word-budgeted; tokens may exceed 512
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        # np.average normalises the weights itself.
        return np.average(chunk_embeddings, axis=0, weights=chunk_weights)
    return np.zeros(model.config.hidden_size)
127
 
128
  def format_topics(topic_model, topic_counts):
129
  """Format topics for display."""
 
133
  topic_label = "Miscellaneous"
134
  else:
135
  words = topic_model.get_topic(topic_num)
136
+ topic_label = " | ".join([word for word, _ in words[:5]]) # Show top 5 words per topic
137
 
138
  formatted_topics.append({
139
  'topic': topic_label,
 
143
 
144
  def format_emotions(emotion_counts):
145
  """Format emotions for display."""
146
+ # Define emotion labels mapping
147
+ EMOTION_LABELS = {
148
+ 'LABEL_0': 'Negative',
149
+ 'LABEL_1': 'Positive',
150
+ 'LABEL_2': 'Neutral'
151
+ }
152
+
153
  formatted_emotions = []
154
  for label, count in emotion_counts:
155
  emotion = EMOTION_LABELS.get(label, label)
 
163
  """Process the data and generate summaries."""
164
  summaries = []
165
 
166
+ # Initialize BERTopic with Arabic-specific settings
167
  topic_model = BERTopic(
168
+ language="multilingual",
169
  calculate_probabilities=True,
170
+ min_topic_size=2, # Allow smaller topic groups
171
+ n_gram_range=(1, 3), # Include up to trigrams
172
+ top_n_words=15, # Show more words per topic
173
  verbose=True
174
  )
175
 
 
179
  progress_bar = st.progress(0, text=progress_text)
180
 
181
  texts = group['poem'].dropna().tolist()
 
182
  all_emotions = []
183
 
184
+ # Generate embeddings with progress tracking
185
+ embeddings = []
186
+ for i, text in enumerate(texts):
187
+ embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
188
+ embeddings.append(embedding)
189
+ progress = (i + 1) / len(texts) * 0.4
190
+ progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
191
+
192
+ embeddings = np.array(embeddings)
193
 
194
+ # Process emotions with progress tracking
195
+ for i, text in enumerate(texts):
196
+ emotion = classify_emotion(text, emotion_classifier)
197
+ all_emotions.append(emotion)
198
+ progress = 0.4 + ((i + 1) / len(texts) * 0.3)
199
+ progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
 
200
 
201
  try:
202
  # Fit topic model
 
213
  'top_emotions': top_emotions
214
  })
215
  progress_bar.progress(1.0, text="Processing complete!")
216
+
217
  except Exception as e:
218
  st.warning(f"Could not generate topics for {country}: {str(e)}")
219
  continue
220
 
221
  return summaries, topic_model
222
 
223
+ # Load models
224
+ try:
225
+ bert_tokenizer, bert_model, emotion_classifier = load_models()
226
+ st.success("Models loaded successfully!")
227
+ except Exception as e:
228
+ st.error(f"Error loading models: {str(e)}")
229
+ st.stop()
230
+
231
  # Main app interface
232
  st.title("📚 Arabic Poem Analysis")
233
  st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 
290
  topic_name = "Miscellaneous"
291
  else:
292
  words = topic_model.get_topic(row['Topic'])
293
+ topic_name = " | ".join([word for word, _ in words[:5]])
294
  st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
295
 
296
  except Exception as e:
 
304
  'country': ['Egypt', 'Saudi Arabia'],
305
  'poem': ['قصيدة مصرية', 'قصيدة سعودية']
306
  })
307
+ st.dataframe(example_df)