Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -17,96 +17,113 @@ st.set_page_config(
|
|
17 |
@st.cache_resource
|
18 |
def load_models():
|
19 |
"""Load and cache the models to prevent reloading"""
|
20 |
-
|
|
|
21 |
bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
|
22 |
emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
st.error(f"Error loading models: {str(e)}")
|
32 |
-
st.stop()
|
33 |
-
|
34 |
-
# Define emotion labels mapping
|
35 |
-
EMOTION_LABELS = {
|
36 |
-
'LABEL_0': 'Negative',
|
37 |
-
'LABEL_1': 'Positive',
|
38 |
-
'LABEL_2': 'Neutral'
|
39 |
-
}
|
40 |
|
41 |
-
def
|
42 |
-
"""Split text into chunks
|
43 |
-
|
44 |
chunks = []
|
45 |
-
|
46 |
-
|
47 |
-
for i in range(0, len(tokens), max_length-2):
|
48 |
-
chunk = tokens[i:i + max_length-2]
|
49 |
-
full_chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
|
50 |
-
chunks.append(full_chunk)
|
51 |
-
text_chunks.append(tokenizer.decode(chunk))
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
padding=True,
|
64 |
-
truncation=True,
|
65 |
-
max_length=512)
|
66 |
-
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
67 |
-
|
68 |
-
with torch.no_grad():
|
69 |
-
outputs = model(**inputs)
|
70 |
-
|
71 |
-
embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
72 |
-
chunk_embeddings.append(embedding[0])
|
73 |
-
|
74 |
-
if chunk_embeddings:
|
75 |
-
return np.mean(chunk_embeddings, axis=0)
|
76 |
-
return np.zeros(model.config.hidden_size)
|
77 |
-
|
78 |
-
def generate_embeddings(texts, tokenizer, model):
|
79 |
-
"""Generate embeddings for a list of texts."""
|
80 |
-
embeddings = []
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
embedding = get_embedding_for_text(text, tokenizer, model)
|
85 |
-
embeddings.append(embedding)
|
86 |
-
except Exception as e:
|
87 |
-
st.warning(f"Error processing text: {str(e)}")
|
88 |
-
embeddings.append(np.zeros(model.config.hidden_size))
|
89 |
|
90 |
-
return
|
91 |
|
92 |
-
def classify_emotion(text,
|
93 |
-
"""Classify emotion for
|
94 |
try:
|
95 |
-
|
96 |
-
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
101 |
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
return final_emotion
|
105 |
-
|
|
|
106 |
|
107 |
except Exception as e:
|
108 |
st.warning(f"Error in emotion classification: {str(e)}")
|
109 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
def format_topics(topic_model, topic_counts):
|
112 |
"""Format topics for display."""
|
@@ -116,7 +133,7 @@ def format_topics(topic_model, topic_counts):
|
|
116 |
topic_label = "Miscellaneous"
|
117 |
else:
|
118 |
words = topic_model.get_topic(topic_num)
|
119 |
-
topic_label = " | ".join([word for word, _ in words[:
|
120 |
|
121 |
formatted_topics.append({
|
122 |
'topic': topic_label,
|
@@ -126,6 +143,13 @@ def format_topics(topic_model, topic_counts):
|
|
126 |
|
127 |
def format_emotions(emotion_counts):
|
128 |
"""Format emotions for display."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
formatted_emotions = []
|
130 |
for label, count in emotion_counts:
|
131 |
emotion = EMOTION_LABELS.get(label, label)
|
@@ -139,11 +163,13 @@ def process_and_summarize(df, top_n=50):
|
|
139 |
"""Process the data and generate summaries."""
|
140 |
summaries = []
|
141 |
|
142 |
-
# Initialize BERTopic
|
143 |
topic_model = BERTopic(
|
144 |
-
language="
|
145 |
calculate_probabilities=True,
|
146 |
-
min_topic_size=
|
|
|
|
|
147 |
verbose=True
|
148 |
)
|
149 |
|
@@ -153,20 +179,24 @@ def process_and_summarize(df, top_n=50):
|
|
153 |
progress_bar = st.progress(0, text=progress_text)
|
154 |
|
155 |
texts = group['poem'].dropna().tolist()
|
156 |
-
batch_size = 10
|
157 |
all_emotions = []
|
158 |
|
159 |
-
# Generate embeddings
|
160 |
-
embeddings =
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
-
# Process emotions
|
164 |
-
for i in
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
progress_bar.progress(0.66, text="Classifying emotions...")
|
170 |
|
171 |
try:
|
172 |
# Fit topic model
|
@@ -183,12 +213,21 @@ def process_and_summarize(df, top_n=50):
|
|
183 |
'top_emotions': top_emotions
|
184 |
})
|
185 |
progress_bar.progress(1.0, text="Processing complete!")
|
|
|
186 |
except Exception as e:
|
187 |
st.warning(f"Could not generate topics for {country}: {str(e)}")
|
188 |
continue
|
189 |
|
190 |
return summaries, topic_model
|
191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
# Main app interface
|
193 |
st.title("📚 Arabic Poem Analysis")
|
194 |
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
|
@@ -251,7 +290,7 @@ if uploaded_file is not None:
|
|
251 |
topic_name = "Miscellaneous"
|
252 |
else:
|
253 |
words = topic_model.get_topic(row['Topic'])
|
254 |
-
topic_name = " | ".join([word for word, _ in words[:
|
255 |
st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
|
256 |
|
257 |
except Exception as e:
|
@@ -265,5 +304,4 @@ else:
|
|
265 |
'country': ['Egypt', 'Saudi Arabia'],
|
266 |
'poem': ['قصيدة مصرية', 'قصيدة سعودية']
|
267 |
})
|
268 |
-
st.dataframe(example_df)
|
269 |
-
|
|
|
17 |
@st.cache_resource
def load_models():
    """Load and cache all models so Streamlit does not reload them on each rerun.

    Returns:
        tuple: ``(tokenizer, bert_model, emotion_classifier)`` where
            - ``tokenizer`` is the tokenizer paired with ``bert_model`` and is
              used for embedding generation,
            - ``bert_model`` is the AraBERT encoder used for embeddings,
            - ``emotion_classifier`` is a sentiment-analysis pipeline that
              returns scores for every label of each input.
    """
    # The embedding tokenizer must share a vocabulary with the embedding
    # model: a CAMeL-Lab tokenizer would feed token ids that are meaningless
    # to AraBERT and yield garbage embeddings.
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        # NOTE(review): deprecated in newer transformers; equivalent to
        # top_k=None. Kept so each call returns per-label score lists, which
        # classify_emotion relies on.
        return_all_scores=True
    )
    return tokenizer, bert_model, emotion_classifier
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
def split_text(text, max_length=512):
    """Split *text* into chunks of at most *max_length* words, preserving word boundaries.

    Note: the limit is counted in whitespace-separated words, not model
    tokens, so a chunk may still exceed ``max_length`` subword tokens after
    tokenization — downstream calls should keep ``truncation=True``.

    Args:
        text: Input string; may be empty or whitespace-only.
        max_length: Maximum number of words per chunk.

    Returns:
        list[str]: Chunks in original order; ``[]`` for empty input.
    """
    chunks = []
    current_chunk = []

    for word in text.split():
        # Each item from str.split() is exactly one word, so the chunk grows
        # by one word at a time. (The original computed len(word.split()),
        # which always equals 1 for whitespace-free words.)
        if len(current_chunk) + 1 > max_length:
            if current_chunk:  # guard: only emit non-empty chunks
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
        else:
            current_chunk.append(word)

    # Flush the trailing partial chunk, if any.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
55 |
|
56 |
+
def classify_emotion(text, classifier):
    """Classify the dominant emotion of a complete (possibly long) text.

    The text is split into chunks; each chunk is scored separately and the
    per-label scores are averaged across chunks. The label with the highest
    average score wins.

    Args:
        text: Arabic text to classify.
        classifier: A transformers sentiment-analysis pipeline configured to
            return all-label scores (each call yields a list of
            ``{'label': ..., 'score': ...}`` dicts per input).

    Returns:
        str: The winning raw label (e.g. ``'LABEL_0'``); ``'LABEL_2'``
        (neutral) when nothing could be scored or an unexpected error occurs.
    """
    try:
        # Split text into manageable chunks.
        chunks = split_text(text)

        all_scores = []
        for chunk in chunks:
            # Keep going when a single chunk fails, mirroring the per-chunk
            # error handling in get_embedding_for_text; previously one bad
            # chunk aborted classification of the whole text.
            try:
                result = classifier(chunk)
                all_scores.append(result[0])  # scores for all labels of this chunk
            except Exception as e:
                st.warning(f"Error in emotion classification: {str(e)}")
                continue

        if not all_scores:
            return "LABEL_2"  # Default to neutral if no valid results

        # Sum up scores for each label across chunks, then average.
        label_scores = {}
        for scores in all_scores:
            for score in scores:
                label = score['label']
                label_scores[label] = label_scores.get(label, 0.0) + score['score']
        count = len(all_scores)
        avg_scores = {label: total / count for label, total in label_scores.items()}

        # The label with the highest average score wins.
        final_emotion = max(avg_scores.items(), key=lambda item: item[1])[0]
        return final_emotion

    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"  # Default to neutral
|
94 |
+
|
95 |
+
def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding vector for a complete (possibly long) text.

    The text is split into chunks; each chunk's [CLS] embedding is extracted
    and the chunk embeddings are combined with a word-count-weighted average.

    Args:
        text: Input text.
        tokenizer: Tokenizer paired with *model*.
        model: A Hugging Face encoder (e.g. BERT) exposing
            ``last_hidden_state`` outputs.

    Returns:
        numpy.ndarray: A 1-D embedding of size ``model.config.hidden_size``;
        a zero vector when no chunk could be processed.
    """
    chunk_embeddings = []
    chunk_weights = []  # word counts of the chunks that actually succeeded

    for chunk in split_text(text):
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # The [CLS] token embedding represents the whole chunk.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        # Weight by chunk length. Weights are tracked alongside successful
        # embeddings: the original weighted over *all* chunks, so np.average
        # crashed with a length mismatch whenever any chunk failed above.
        weights = np.array(chunk_weights, dtype=float)
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
|
127 |
|
128 |
def format_topics(topic_model, topic_counts):
|
129 |
"""Format topics for display."""
|
|
|
133 |
topic_label = "Miscellaneous"
|
134 |
else:
|
135 |
words = topic_model.get_topic(topic_num)
|
136 |
+
topic_label = " | ".join([word for word, _ in words[:5]]) # Show top 5 words per topic
|
137 |
|
138 |
formatted_topics.append({
|
139 |
'topic': topic_label,
|
|
|
143 |
|
144 |
def format_emotions(emotion_counts):
|
145 |
"""Format emotions for display."""
|
146 |
+
# Define emotion labels mapping
|
147 |
+
EMOTION_LABELS = {
|
148 |
+
'LABEL_0': 'Negative',
|
149 |
+
'LABEL_1': 'Positive',
|
150 |
+
'LABEL_2': 'Neutral'
|
151 |
+
}
|
152 |
+
|
153 |
formatted_emotions = []
|
154 |
for label, count in emotion_counts:
|
155 |
emotion = EMOTION_LABELS.get(label, label)
|
|
|
163 |
"""Process the data and generate summaries."""
|
164 |
summaries = []
|
165 |
|
166 |
+
# Initialize BERTopic with Arabic-specific settings
|
167 |
topic_model = BERTopic(
|
168 |
+
language="multilingual",
|
169 |
calculate_probabilities=True,
|
170 |
+
min_topic_size=2, # Allow smaller topic groups
|
171 |
+
n_gram_range=(1, 3), # Include up to trigrams
|
172 |
+
top_n_words=15, # Show more words per topic
|
173 |
verbose=True
|
174 |
)
|
175 |
|
|
|
179 |
progress_bar = st.progress(0, text=progress_text)
|
180 |
|
181 |
texts = group['poem'].dropna().tolist()
|
|
|
182 |
all_emotions = []
|
183 |
|
184 |
+
# Generate embeddings with progress tracking
|
185 |
+
embeddings = []
|
186 |
+
for i, text in enumerate(texts):
|
187 |
+
embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
|
188 |
+
embeddings.append(embedding)
|
189 |
+
progress = (i + 1) / len(texts) * 0.4
|
190 |
+
progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
|
191 |
+
|
192 |
+
embeddings = np.array(embeddings)
|
193 |
|
194 |
+
# Process emotions with progress tracking
|
195 |
+
for i, text in enumerate(texts):
|
196 |
+
emotion = classify_emotion(text, emotion_classifier)
|
197 |
+
all_emotions.append(emotion)
|
198 |
+
progress = 0.4 + ((i + 1) / len(texts) * 0.3)
|
199 |
+
progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
|
|
|
200 |
|
201 |
try:
|
202 |
# Fit topic model
|
|
|
213 |
'top_emotions': top_emotions
|
214 |
})
|
215 |
progress_bar.progress(1.0, text="Processing complete!")
|
216 |
+
|
217 |
except Exception as e:
|
218 |
st.warning(f"Could not generate topics for {country}: {str(e)}")
|
219 |
continue
|
220 |
|
221 |
return summaries, topic_model
|
222 |
|
223 |
+
# Load models once at startup; surface any download/initialization failure
# (e.g. no network, bad model id) to the user and halt the app, since
# nothing below can run without the models.
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()  # halt the Streamlit script run here
|
230 |
+
|
231 |
# Main app interface
|
232 |
st.title("📚 Arabic Poem Analysis")
|
233 |
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
|
|
|
290 |
topic_name = "Miscellaneous"
|
291 |
else:
|
292 |
words = topic_model.get_topic(row['Topic'])
|
293 |
+
topic_name = " | ".join([word for word, _ in words[:5]])
|
294 |
st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
|
295 |
|
296 |
except Exception as e:
|
|
|
304 |
'country': ['Egypt', 'Saudi Arabia'],
|
305 |
'poem': ['قصيدة مصرية', 'قصيدة سعودية']
|
306 |
})
|
307 |
+
st.dataframe(example_df)
|
|