kambris committed (verified)
Commit 2b884bf · 1 Parent(s): 7204906

Update app.py

Files changed (1):
  1. app.py +273 -562

app.py CHANGED
@@ -1,607 +1,318 @@
  import streamlit as st
  import pandas as pd
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
- from sklearn.feature_extraction.text import CountVectorizer
- from bertopic import BERTopic
- import torch
  import numpy as np
- from collections import Counter
- import os
- from wordcloud import WordCloud
  import matplotlib.pyplot as plt
- import pkg_resources
- import folium
- import country_converter as coco
- import time
- import gc
-
- def clear_memory():
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
-     gc.collect()
-
- current_dir = os.path.dirname(os.path.abspath(__file__))
- font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
-
- ARABIC_STOP_WORDS = {
-     'في', 'من', 'إلى', 'على', 'علي', 'عن', 'مع', 'خلال', 'حتي', 'حتى', 'إذا',
-     'ثم', 'أو', 'و', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا',
-     'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'هنا', 'نحن',
-     'انت', 'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل',
-     'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد', 'لم', 'لن', 'لو',
-     'ما', 'ماذا', 'متى', 'كيف', 'اين', 'لماذا', 'الذي', 'التي',
-     'الذين', 'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت',
-     'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن', 'له', 'كما', 'لها',
-     'منذ', 'نفس', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه', 'لدى',
-     'عليه', 'مثل', 'أما', 'لدي', 'فيه', 'كلم', 'لكن', 'ايضا', 'لازم',
-     'يجب', 'صار', 'صارت', 'ضد', 'يا', 'لا', 'اما',
-     'بها', 'ان', 'به', 'الي', 'لما', 'انا', 'اليك', 'لي', 'لك', 'اذا', 'بلا', 'او', 'لديك', 'لديه', 'اني', 'كنت', 'ليس', 'ايها', 'قلت',
-
-     'وثم', 'وأو', 'ول', 'وب', 'وك', 'ولل', 'وال',
-     'وهذا', 'وهذه', 'وذلك', 'وتلك', 'وهؤلاء', 'وهم', 'وهن', 'وهو', 'وهي', 'ونحن',
-     'وانت', 'وانتم', 'وكان', 'وكانت', 'ويكون', 'وتكون', 'واي', 'وكل',
-     'وبعض', 'وغير', 'وحول', 'وعند', 'وقد', 'ولقد', 'ولم', 'ولن', 'ولو',
-     'وما', 'وماذا', 'ومتى', 'وكيف', 'واين', 'ولماذا', 'والذي', 'والتي',
-     'والذين', 'واللاتي', 'واللواتي', 'والان', 'وبين', 'وفوق', 'وهنا', 'وتحت',
-     'وامام', 'وخلف', 'وحين', 'وقبل', 'وبعد', 'وأن', 'وله', 'وكما', 'ولها',
-     'ومنذ', 'ونفس', 'وحيث', 'وهناك', 'وجدا', 'وذات', 'وضمن', 'وانه', 'ولدى',
-     'وعليه', 'ومثل', 'وأما', 'وفيه', 'وكلم', 'ولكن', 'وايضا', 'ولازم',
-     'ويجب', 'وصار', 'وصارت', 'وضد', 'ويا', 'ولا', 'واما',
-     'وبها', 'وان', 'وبه', 'والي', 'ولما', 'وانا', 'واليك', 'ولي', 'ولك', 'وقلت',
-
-     'وفي', 'ومن', 'وعلى', 'وعلي', 'وعن', 'ومع', 'وحتى', 'وإذا',
-     'وهذا', 'وهذه', 'وذلك', 'وتلك', 'وهو', 'وهي', 'ونحن',
-     'وكان', 'وكانت', 'وكل', 'وبعض', 'وحول', 'وعند', 'وقد',
-     'ولقد', 'ولم', 'ولن', 'وما', 'وكيف', 'واين', 'والذي',
-     'وبين', 'وقبل', 'وبعد', 'وله', 'ولها', 'وهناك', 'وانه',
-     'منه', 'الا', 'فيها', 'فلا', 'وكم', 'يكن', 'عليك', 'منها', 'فما', 'لهم', 'يكن', 'واني', 'هل', 'فهل', 'بي', 'نحو', 'كي', 'سوف', 'كنا', 'لنا', 'معا', 'كلما', 'وإذا', 'منه', 'عنه', 'إذ', 'كم', 'بل', 'فيها', 'هكذا', 'لهم', 'ولدى', 'وعليه', 'ومثل',
-
-     'واحد', 'اثنان', 'ثلاثة', 'أربعة', 'خمسة', 'ستة', 'سبعة',
-     'ثمانية', 'تسعة', 'عشرة',
-
-     'الأول', 'الثاني', 'الثالث', 'الرابع', 'الخامس', 'السادس',
-     'السابع', 'الثامن', 'التاسع', 'العاشر'
  }

- COUNTRY_MAPPING = {
-     'مصر': 'Egypt',
-     'السعودية': 'Saudi Arabia',
-     'الإمارات': 'UAE',
-     'الكويت': 'Kuwait',
-     'العراق': 'Iraq',
-     'سوريا': 'Syria',
-     'لبنان': 'Lebanon',
-     'الأردن': 'Jordan',
-     'فلسطين': 'Palestine',
-     'اليمن': 'Yemen',
-     'عمان': 'Oman',
-     'قطر': 'Qatar',
-     'البحرين': 'Bahrain',
-     'السودان': 'Sudan',
-     'ليبيا': 'Libya',
-     'تونس': 'Tunisia',
-     'الجزائر': 'Algeria',
-     'المغرب': 'Morocco',
-     'موريتانيا': 'Mauritania'
- }
-
- st.set_page_config(
-     page_title="Contemporary Arabic Poetry Analysis",
-     page_icon="📚",
-     layout="wide"
- )
-
- @st.cache_resource
- def load_models():
-     """Load and cache the models to prevent reloading"""
-     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
-     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     emotion_classifier = pipeline(
-         "sentiment-analysis",
-         model=emotion_model,
-         tokenizer=emotion_tokenizer,
-         return_all_scores=True
-     )
-     return tokenizer, bert_model, emotion_classifier
-
- def split_text(text, max_length=512):
-     """Split text into chunks of maximum token length while preserving word boundaries."""
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-
-     for word in words:
-         word_length = len(word.split())
-         if current_length + word_length > max_length:
-             if current_chunk:
-                 chunks.append(' '.join(current_chunk))
-             current_chunk = [word]
-             current_length = word_length
-         else:
-             current_chunk.append(word)
-             current_length += word_length
-
-     if current_chunk:
-         chunks.append(' '.join(current_chunk))
-
-     return chunks
-
- def get_country_coordinates():
-     """Returns dictionary of Arab country coordinates"""
-     return {
-         'Egypt': [26.8206, 30.8025],
-         'Saudi Arabia': [23.8859, 45.0792],
-         'UAE': [23.4241, 53.8478],
-         'Kuwait': [29.3117, 47.4818],
-         'Iraq': [33.2232, 43.6793],
-         'Syria': [34.8021, 38.9968],
-         'Lebanon': [33.8547, 35.8623],
-         'Jordan': [30.5852, 36.2384],
-         'Palestine': [31.9522, 35.2332],
-         'Yemen': [15.5527, 48.5164],
-         'Oman': [21.4735, 55.9754],
-         'Qatar': [25.3548, 51.1839],
-         'Bahrain': [26.0667, 50.5577],
-         'Sudan': [12.8628, 30.2176],
-         'Libya': [26.3351, 17.2283],
-         'Tunisia': [33.8869, 9.5375],
-         'Algeria': [28.0339, 1.6596],
-         'Morocco': [31.7917, -7.0926],
-         'Mauritania': [21.0079, -10.9408]
-     }
-
- def create_topic_map(summaries):
-     # Debug print to check incoming data
-     print("DEBUG - First summary emotions:", summaries[0]['top_emotions'])
-
-     coordinates = get_country_coordinates()
-     m = folium.Map(location=[27.0, 42.0], zoom_start=5)
-
-     sentiment_colors = {
-         'LABEL_1': 'green',  # Positive
-         'LABEL_0': 'red',    # Negative
-         'LABEL_2': 'blue'    # Neutral
-     }
-
-     for summary in summaries:
-         country_en = COUNTRY_MAPPING.get(summary['country'])
-         if country_en and country_en in coordinates:
-             REVERSE_EMOTION_LABELS = {
-                 'positive': 'LABEL_1',
-                 'negative': 'LABEL_0',
-                 'neutral': 'LABEL_2'
-             }
-
-             dominant_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "neutral"
-             dominant_label = REVERSE_EMOTION_LABELS.get(dominant_emotion, 'LABEL_2')
-             circle_color = sentiment_colors.get(dominant_label, 'gray')
-
-             # Debug print
-             print(f"DEBUG - Country: {country_en}, Emotion: {dominant_emotion}, Label: {dominant_label}, Color: {circle_color}")
-
-             popup_content = f"""
-             <b>{country_en}</b><br>
-             <b>Sentiment Distribution:</b><br>
-             {'<br>'.join(f"• {e['emotion']}: {e['count']}" for e in summary['top_emotions'][:3])}<br>
-             <b>Top Topic:</b><br>
-             {summary['top_topics'][0]['topic'] if summary['top_topics'] else 'No topics'}<br>
-             Total Poems: {summary['total_poems']}
-             """
-
-             folium.CircleMarker(
-                 location=coordinates[country_en],
-                 radius=10,
-                 popup=folium.Popup(popup_content, max_width=300),
-                 color=circle_color,
-                 fill=True
-             ).add_to(m)
-
-     legend_html = """
-     <div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border: 2px solid grey; border-radius: 5px">
-         <p><b>Sentiment:</b></p>
-         <p><span style="color: green;">●</span> Positive</p>
-         <p><span style="color: red;">●</span> Negative</p>
-         <p><span style="color: blue;">●</span> Neutral</p>
-     </div>
-     """
-     m.get_root().html.add_child(folium.Element(legend_html))
-
-     return m
-
- def create_arabic_wordcloud(text, title):
-     wordcloud = WordCloud(
-         width=1200,
-         height=600,
-         background_color='white',
-         font_path=font_path,
-         max_words=200,
-         stopwords=ARABIC_STOP_WORDS
-     ).generate(text)
-
-     fig, ax = plt.subplots(figsize=(15, 8))
-     ax.imshow(wordcloud, interpolation='bilinear')
-     ax.axis('off')
-     ax.set_title(title, fontsize=16, pad=20)
-     return fig
-
- def clean_arabic_text(text):
-     """Clean Arabic text by removing stop words and normalizing."""
-     words = text.split()
-     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
-     return ' '.join(cleaned_words)
-
- def classify_emotion(text, classifier):
-     """Classify emotion for complete text with proper token handling."""
-     try:
-         words = text.split()
-         chunks = []
-         current_chunk = []
-         current_length = 0
-
-         for word in words:
-             word_tokens = len(classifier.tokenizer.encode(word))
-             if current_length + word_tokens > 512:
-                 if current_chunk:
-                     chunks.append(' '.join(current_chunk))
-                 current_chunk = [word]
-                 current_length = word_tokens
-             else:
-                 current_chunk.append(word)
-                 current_length += word_tokens
-
-         if current_chunk:
-             chunks.append(' '.join(current_chunk))
-
-         if not chunks:
-             chunks = [text]
-
-         all_scores = []
-         for chunk in chunks:
-             try:
-                 inputs = classifier.tokenizer(
-                     chunk,
-                     truncation=True,
-                     max_length=512,
-                     return_tensors="pt"
-                 )
-                 result = classifier(chunk, truncation=True, max_length=512)
-                 scores = result[0]
-                 all_scores.append(scores)
-             except Exception as chunk_error:
-                 st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
-                 continue
-
-         if all_scores:
-             label_scores = {}
-             count = len(all_scores)
-
-             for scores in all_scores:
-                 for score in scores:
-                     label = score['label']
-                     if label not in label_scores:
-                         label_scores[label] = 0
-                     label_scores[label] += score['score']
-
-             avg_scores = {label: score/count for label, score in label_scores.items()}
-             final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
-             return final_emotion
-
-         return "LABEL_2"
-
-     except Exception as e:
-         st.warning(f"Error in emotion classification: {str(e)}")
-         return "LABEL_2"
-
- def get_embedding_for_text(text, tokenizer, model):
-     """Get embedding for complete text."""
-     chunks = split_text(text)
-     chunk_embeddings = []
-
-     for chunk in chunks:
-         try:
-             inputs = tokenizer(
-                 chunk,
-                 return_tensors="pt",
-                 padding=True,
-                 truncation=True,
-                 max_length=512
-             )
-             inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 outputs = model(**inputs)
-
-             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-             chunk_embeddings.append(embedding[0])
-         except Exception as e:
-             st.warning(f"Error processing chunk: {str(e)}")
-             continue
-
-     if chunk_embeddings:
-         weights = np.array([len(chunk.split()) for chunk in chunks])
-         weights = weights / weights.sum()
-         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
-         return weighted_embedding
-     return np.zeros(model.config.hidden_size)
-
- def format_topics(topic_model, topic_counts):
-     """Format topics for display."""
-     formatted_topics = []
-     for topic_num, count in topic_counts:
-         if topic_num == -1:
-             topic_label = "Miscellaneous"
-         else:
-             words = topic_model.get_topic(topic_num)
-             topic_label = " | ".join([word for word, _ in words[:5]])
-
-         formatted_topics.append({
-             'topic': topic_label,
-             'count': count
-         })
-     return formatted_topics
-
- def format_emotions(emotion_counts):
-     """Format emotions for display."""
-     EMOTION_LABELS = {
-         'LABEL_0': 'Negative',
-         'LABEL_1': 'Positive',
-         'LABEL_2': 'Neutral'
-     }
-
-     formatted_emotions = []
-     for label, count in emotion_counts:
-         emotion = EMOTION_LABELS.get(label, label)
-         formatted_emotions.append({
-             'emotion': emotion,
-             'count': count
-         })
-     return formatted_emotions
-
- def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
-     """Process the data and generate summaries with flexible topic configuration."""
-     summaries = []
-
-     topic_model_params = {
-         "language": "arabic",
-         "calculate_probabilities": True,
-         "min_topic_size": 3,
-         "n_gram_range": (1, 1),
-         "top_n_words": 15,
-         "verbose": True,
-     }
-     st.write(f"Total documents: {len(df)}")
-     st.write(f"Topic strategy: {topic_strategy}")
-     st.write(f"Min topic size: {min_topic_size}")
-
-     if topic_strategy == "Manual":
-         topic_model_params["nr_topics"] = n_topics
-     else:
-         topic_model_params["nr_topics"] = "auto"
-
-     topic_model = BERTopic(
-         embedding_model=bert_model,
-         **topic_model_params)
-
-     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
-                                  min_df=1,
-                                  max_df=1.0)
-     topic_model.vectorizer_model = vectorizer
-
-     for country, group in df.groupby('country'):
-         progress_text = f"Processing poems for {country}..."
-         progress_bar = st.progress(0, text=progress_text)
-
-         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
-         all_emotions = []
-
-         embeddings = []
-
-         clear_memory()
-
-         for i, text in enumerate(texts):
-             try:
-                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                 if embedding is not None and not np.isnan(embedding).any():
-                     embeddings.append(embedding)
-                 else:
-                     st.warning(f"Invalid embedding generated for text {i+1} in {country}")
-                     continue
-             except Exception as e:
-                 st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
-                 continue
-             if i % 10 == 0:
-                 clear_memory()
-
-             progress = (i + 1) / len(texts) * 0.4
-             progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
-         if len(embeddings) != len(texts):
-             texts = texts[:len(embeddings)]
-         embeddings = np.array(embeddings)
-
-         clear_memory()
-
-         for i, text in enumerate(texts):
-             emotion = classify_emotion(text, emotion_classifier)
-             all_emotions.append(emotion)
-             if i % 10 == 0:
-                 clear_memory()
-             progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-         try:
-             if len(texts) < min_topic_size:
-                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
-                 continue
-
-             topics, probs = topic_model.fit_transform(texts, embeddings)
-
-             topic_counts = Counter(topics)
-
-             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-             summaries.append({
-                 'country': country,
-                 'total_poems': len(texts),
-                 'top_topics': top_topics,
-                 'top_emotions': top_emotions
-             })
-             progress_bar.progress(1.0, text="Processing complete!")
-
-         except Exception as e:
-             st.warning(f"Could not generate topics for {country}: {str(e)}")
-             continue
-
-     return summaries, topic_model
-
- try:
-     bert_tokenizer, bert_model, emotion_classifier = load_models()
-     st.success("Models loaded successfully!")
- except Exception as e:
-     st.error(f"Error loading models: {str(e)}")
-     st.stop()
-
- # Main app interface
- st.title("📚 Contemporary Arabic Poetry Analysis")
- st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
-
- uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
-
- if uploaded_file is not None:
-     try:
-         if uploaded_file.name.endswith('.csv'):
-             df = pd.read_csv(uploaded_file)
-         else:
-             df = pd.read_excel(uploaded_file)
-
-         required_columns = ['country', 'poem']
-         if not all(col in df.columns for col in required_columns):
-             st.error("File must contain 'country' and 'poem' columns.")
-             st.stop()
-
-         df['country'] = df['country'].str.strip()
-         df = df.dropna(subset=['country', 'poem'])
-         sampled_df = df.groupby('country').apply(lambda x: x.head(20)).reset_index(drop=True)
-
-         st.subheader("Topic Modeling Settings")
-         col1, col2 = st.columns(2)
-
-         with col1:
-             topic_strategy = st.radio(
-                 "Topic Number Strategy",
-                 ["Auto", "Manual"],
-                 help="Choose whether to let the model determine the optimal number of topics or set it manually"
-             )
-
-             if topic_strategy == "Manual":
-                 n_documents = len(df)
-                 max_topics = 500
-                 min_topics = 5
-                 default_topics = 20
-
-                 n_topics = st.slider(
-                     "Number of Topics",
-                     min_value=min_topics,
-                     max_value=max_topics,
-                     value=default_topics,
-                     help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
-                 )
-
-                 st.info(f"""
-                 💡 For your dataset of {n_documents:,} documents:
-                 - Available topic range: {min_topics}-{max_topics}
-                 - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
-                 """)
-
-         with col2:
-             top_n = st.number_input(
-                 "Number of top topics/emotions to display:",
-                 min_value=1,
-                 max_value=100,
-                 value=10
-             )
-
-         if st.button("Process Data"):
-             with st.spinner("Processing your data..."):
-                 summaries, topic_model = process_and_summarize(
-                     sampled_df,
-                     bert_tokenizer,
-                     bert_model,
-                     emotion_classifier,
-                     top_n=top_n,
-                     topic_strategy=topic_strategy,
-                     n_topics=n_topics if topic_strategy == "Manual" else None,
-                     min_topic_size=3
-                 )
-
-             if summaries:
-                 st.success("Analysis complete!")
-
-                 tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
-
-                 with tab1:
-                     for summary in summaries:
-                         with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
-                             col1, col2 = st.columns(2)
-
-                             with col1:
-                                 st.subheader("Top Topics")
-                                 for topic in summary['top_topics']:
-                                     st.write(f"• {topic['topic']}: {topic['count']} poems")
-
-                             with col2:
-                                 st.subheader("Emotions")
-                                 for emotion in summary['top_emotions']:
-                                     st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
-
-                             st.subheader("Word Cloud Visualization")
-                             country_poems = df[df['country'] == summary['country']]['poem']
-                             combined_text = ' '.join(country_poems)
-                             wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
-                             st.pyplot(wordcloud_fig)
-
-                 with tab2:
-                     st.subheader("Global Topic Distribution")
-                     topic_info = topic_model.get_topic_info()
-                     for _, row in topic_info.iterrows():
-                         if row['Topic'] == -1:
-                             topic_name = "Miscellaneous"
-                         else:
-                             words = topic_model.get_topic(row['Topic'])
-                             topic_name = " | ".join([word for word, _ in words[:5]])
-                         st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-
-                 with tab3:
-                     st.subheader("Topic and Sentiment Distribution Map")
-                     topic_map = create_topic_map(summaries)
-                     st.components.v1.html(topic_map._repr_html_(), height=600)
-
-     except Exception as e:
-         st.error(f"Error processing file: {str(e)}")
-
- else:
-     st.info("👆 Upload a file to get started!")
-
-     st.write("### Expected File Format:")
-     example_df = pd.DataFrame({
-         'country': ['Egypt', 'Palestine'],
-         'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
-     })
-     st.dataframe(example_df)
-
  import streamlit as st
  import pandas as pd
  import numpy as np
+ import torch
+ import networkx as nx
+ import plotly.express as px
+ import plotly.graph_objs as go
  import matplotlib.pyplot as plt
+ import seaborn as sns
+ from scipy.signal import savgol_filter
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from wordcloud import WordCloud
+ import spacy
+
+ # Advanced NLP Libraries
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     pipeline,
+     AutoModelForTokenClassification
+ )
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from textstat import flesch_reading_ease, flesch_kincaid_grade
+
+ # Download necessary NLTK resources
+ nltk.download('punkt', quiet=True)
+ nltk.download('stopwords', quiet=True)
+
+ # Load spaCy model (requires separate installation)
+ try:
+     nlp = spacy.load('en_core_web_lg')
+ except:
+     st.error("Please install spaCy and en_core_web_lg model: \n"
+              "pip install spacy\n"
+              "python -m spacy download en_core_web_lg")
+
+ # Constants and Configurations
+ MORAL_FOUNDATIONS = {
+     'care': 'Care/Harm',
+     'fairness': 'Fairness/Cheating',
+     'loyalty': 'Loyalty/Betrayal',
+     'authority': 'Authority/Subversion',
+     'sanctity': 'Sanctity/Degradation'
+ }
+
+ RHETORICAL_DEVICES = {
+     'analogy': ['like', 'as', 'similar to'],
+     'repetition': ['repetitive', 'recurring'],
+     'metaphor': ['as if', 'like', 'represents'],
+     'hyperbole': ['always', 'never', 'absolute'],
+     'rhetorical_question': ['?']
  }

+ class SpeechAnalyzer:
+     def __init__(self):
+         # Load models
+         self.moralbert_tokenizer = AutoTokenizer.from_pretrained("minyoungchang/moralbert")
+         self.moralbert_model = AutoModelForSequenceClassification.from_pretrained("minyoungchang/moralbert")
+         self.sentiment_pipeline = pipeline("sentiment-analysis")
+
+         # Named Entity Recognition
+         self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+         self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
+         self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
+
+     def split_text(self, text, max_length=512, overlap=50):
+         """Split long text into overlapping segments"""
+         words = text.split()
+         segments = []
+         current_segment = []
+         current_length = 0
+
+         for word in words:
+             if current_length + len(word.split()) > max_length:
+                 segments.append(' '.join(current_segment))
+                 current_segment = current_segment[-overlap:] + [word]
+                 current_length = len(' '.join(current_segment).split())
+             else:
+                 current_segment.append(word)
+                 current_length = len(' '.join(current_segment).split())
+
+         if current_segment:
+             segments.append(' '.join(current_segment))
+
+         return segments
+
+     def analyze_moral_foundations(self, text):
+         """Analyze moral foundations in text"""
+         segments = self.split_text(text)
+
+         foundation_scores = {
+             'care': [], 'fairness': [], 'loyalty': [],
+             'authority': [], 'sanctity': []
+         }
+
+         for segment in segments:
+             inputs = self.moralbert_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
+
+             with torch.no_grad():
+                 outputs = self.moralbert_model(**inputs)
+
+             probabilities = torch.softmax(outputs.logits, dim=1)
+
+             for foundation in foundation_scores.keys():
+                 foundation_scores[foundation].append(probabilities[0][1].item())
+
+         aggregated_scores = {
+             foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
+         }
+
+         return aggregated_scores
+
+     def analyze_emotional_trajectory(self, text, window_size=5):
+         """Perform emotional trajectory analysis"""
+         segments = self.split_text(text, max_length=256)
+
+         sentiment_scores = []
+         for segment in segments:
+             result = self.sentiment_pipeline(segment)[0]
+             score = 1 if result['label'] == 'POSITIVE' else -1
+             sentiment_scores.append(score)
+
+         smoothed_scores = (savgol_filter(sentiment_scores, window_length=window_size, polyorder=2)
+                            if len(sentiment_scores) > window_size else sentiment_scores)
+
+         return smoothed_scores
+
+     def detect_named_entities(self, text):
+         """Detect named entities in the text"""
+         entities = self.ner_pipeline(text)
+         return entities
+
+     def extract_key_phrases(self, text, top_n=10):
+         """Extract key phrases using TF-IDF"""
+         vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
+         tfidf_matrix = vectorizer.fit_transform([text])
+         feature_names = vectorizer.get_feature_names_out()
+
+         # Get top phrases by TF-IDF score
+         sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
+         top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
+
+         return top_phrases
+
+     def calculate_readability(self, text):
+         """Calculate readability metrics"""
+         return {
+             'Flesch Reading Ease': flesch_reading_ease(text),
+             'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
+         }
+
+     def detect_rhetorical_devices(self, text):
+         """Detect rhetorical devices"""
+         devices_found = {}
+         for device, markers in RHETORICAL_DEVICES.items():
+             count = sum(text.lower().count(marker) for marker in markers)
+             if count > 0:
+                 devices_found[device] = count
+         return devices_found
+
+     def create_semantic_network(self, text, top_n=20):
+         """Create semantic network graph"""
+         # Use spaCy for advanced parsing
+         doc = nlp(text)
+
+         # Create graph
+         G = nx.Graph()
+
+         # Extract top nouns and their relationships
+         nouns = [token.text for token in doc if token.pos_ == 'NOUN']
+         noun_freq = nltk.FreqDist(nouns)
+         top_nouns = [noun for noun, _ in noun_freq.most_common(top_n)]
+
+         # Add nodes and edges
+         for noun in top_nouns:
+             G.add_node(noun)
+
+         # Connect related nouns
+         for i in range(len(top_nouns)):
+             for j in range(i+1, len(top_nouns)):
+                 if top_nouns[i] in text and top_nouns[j] in text:
+                     G.add_edge(top_nouns[i], top_nouns[j])
+
+         return G
+
+ def main():
+     st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
+     st.title("🗣️ Advanced Political Speech Analysis Toolkit")
+
+     # Initialize analyzer
+     analyzer = SpeechAnalyzer()
+
+     # File upload
+     uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
+
+     if uploaded_file is not None:
+         # Read file (similar to previous implementation)
+         if uploaded_file.name.endswith('.txt'):
+             text = uploaded_file.getvalue().decode('utf-8')
+         elif uploaded_file.name.endswith('.docx'):
+             import docx
+             doc = docx.Document(uploaded_file)
+             text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
+         elif uploaded_file.name.endswith('.pdf'):
+             import PyPDF2
+             pdf_reader = PyPDF2.PdfReader(uploaded_file)
+             text = ' '.join([page.extract_text() for page in pdf_reader.pages])
+
+         # Create tabs for different analyses
+         tab1, tab2, tab3, tab4, tab5 = st.tabs([
+             "Moral Foundations",
+             "Emotional Analysis",
+             "Linguistic Insights",
+             "Semantic Network",
+             "Advanced NLP"
+         ])
+
+         with tab1:
+             st.subheader("Moral Foundations Analysis")
+             moral_scores = analyzer.analyze_moral_foundations(text)
+
+             # Plotly bar chart
+             moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
+             moral_df.index.name = 'Moral Foundation'
+             moral_df = moral_df.reset_index()
+
+             fig = px.bar(
+                 moral_df,
+                 x='Moral Foundation',
+                 y='Score',
+                 title='Moral Foundations Breakdown',
+                 color='Moral Foundation'
+             )
+             st.plotly_chart(fig)
+
+             # Detailed insights
+             for foundation, score in moral_scores.items():
+                 st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
+
+         with tab2:
+             st.subheader("Emotional Trajectory")
+             emotional_trajectory = analyzer.analyze_emotional_trajectory(text)
+
+             # Plotly line chart
+             trajectory_fig = go.Figure(data=go.Scatter(
+                 y=emotional_trajectory,
+                 mode='lines+markers',
+                 name='Emotional Intensity'
+             ))
+             trajectory_fig.update_layout(
+                 title='Speech Emotional Trajectory',
+                 xaxis_title='Speech Segments',
+                 yaxis_title='Emotional Intensity'
+             )
+             st.plotly_chart(trajectory_fig)
+
+         with tab3:
+             st.subheader("Linguistic Complexity")
+             readability = analyzer.calculate_readability(text)
+
+             col1, col2 = st.columns(2)
+             with col1:
+                 st.metric("Flesch Reading Ease", f"{readability['Flesch Reading Ease']:.2f}")
+             with col2:
+                 st.metric("Flesch-Kincaid Grade Level", f"{readability['Flesch-Kincaid Grade Level']:.2f}")
+
+             # Key Phrases
+             st.subheader("Key Phrases")
+             key_phrases = analyzer.extract_key_phrases(text)
+             st.write(", ".join(key_phrases))
+
+         with tab4:
+             st.subheader("Semantic Network")
+             semantic_graph = analyzer.create_semantic_network(text)
+
+             # Convert NetworkX graph to Plotly
+             edge_x = []
+             edge_y = []
+             for edge in semantic_graph.edges():
+                 x0, y0 = semantic_graph.nodes[edge[0]].get('pos', (0,0))
+                 x1, y1 = semantic_graph.nodes[edge[1]].get('pos', (0,0))
+                 edge_x.append(x0)
+                 edge_x.append(x1)
+                 edge_y.append(y0)
+                 edge_y.append(y1)
+
+             # Plotly network visualization
+             network_fig = go.Figure()
+             network_fig.add_trace(go.Scatter(
+                 x=edge_x,
+                 y=edge_y,
+                 mode='lines',
+                 line=dict(width=0.5, color='#888'),
+                 hoverinfo='none'
+             ))
+
+             st.plotly_chart(network_fig)
+
+         with tab5:
+             st.subheader("Advanced NLP Analysis")
+
+             # Named Entities
+             st.write("### Named Entities")
+             named_entities = analyzer.detect_named_entities(text)
+             entities_df = pd.DataFrame(named_entities)
+             st.dataframe(entities_df)
+
+             # Rhetorical Devices
+             st.write("### Rhetorical Devices")
+             rhetorical_devices = analyzer.detect_rhetorical_devices(text)
+             for device, count in rhetorical_devices.items():
+                 st.write(f"**{device.capitalize()}**: {count} instances")
+
+ if __name__ == "__main__":
+     main()