# SoLProject / app.py
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import folium
import gc
def clear_memory():
    """Release cached GPU memory (if any) and run garbage collection."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
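
# Arabic stop words, matched against raw surface forms; common و-prefixed
# variants are listed explicitly alongside the bare forms.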
ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'علي', 'عن', 'مع', 'خلال', 'حتي', 'حتى', 'إذا',
    'ثم', 'أو', 'و', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا',
    'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'هنا', 'نحن',
    'انت', 'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل',
    'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد', 'لم', 'لن', 'لو',
    'ما', 'ماذا', 'متى', 'كيف', 'اين', 'لماذا', 'الذي', 'التي',
    'الذين', 'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت',
    'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن', 'له', 'كما', 'لها',
    'منذ', 'نفس', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه', 'لدى',
    'عليه', 'مثل', 'أما', 'لدي', 'فيه', 'كلم', 'لكن', 'ايضا', 'لازم',
    'يجب', 'صار', 'صارت', 'ضد', 'يا', 'لا', 'اما',
    'بها', 'ان', 'به', 'الي', 'لما', 'انا', 'اليك', 'لي', 'لك', 'اذا', 'بلا',
    'او', 'لديك', 'لديه', 'اني', 'كنت', 'ليس', 'ايها', 'قلت',
    'وثم', 'وأو', 'ول', 'وب', 'وك', 'ولل', 'وال',
    'وهذا', 'وهذه', 'وذلك', 'وتلك', 'وهؤلاء', 'وهم', 'وهن', 'وهو', 'وهي', 'ونحن',
    'وانت', 'وانتم', 'وكان', 'وكانت', 'ويكون', 'وتكون', 'واي', 'وكل',
    'وبعض', 'وغير', 'وحول', 'وعند', 'وقد', 'ولقد', 'ولم', 'ولن', 'ولو',
    'وما', 'وماذا', 'ومتى', 'وكيف', 'واين', 'ولماذا', 'والذي', 'والتي',
    'والذين', 'واللاتي', 'واللواتي', 'والان', 'وبين', 'وفوق', 'وهنا', 'وتحت',
    'وامام', 'وخلف', 'وحين', 'وقبل', 'وبعد', 'وأن', 'وله', 'وكما', 'ولها',
    'ومنذ', 'ونفس', 'وحيث', 'وهناك', 'وجدا', 'وذات', 'وضمن', 'وانه', 'ولدى',
    'وعليه', 'ومثل', 'وأما', 'وفيه', 'وكلم', 'ولكن', 'وايضا', 'ولازم',
    'ويجب', 'وصار', 'وصارت', 'وضد', 'ويا', 'ولا', 'واما',
    'وبها', 'وان', 'وبه', 'والي', 'ولما', 'وانا', 'واليك', 'ولي', 'ولك', 'وقلت',
    'وفي', 'ومن', 'وعلى', 'وعلي', 'وعن', 'ومع', 'وحتى', 'وإذا',
    'منه', 'الا', 'فيها', 'فلا', 'وكم', 'يكن', 'عليك', 'منها', 'فما', 'لهم',
    'واني', 'هل', 'فهل', 'بي', 'نحو', 'كي', 'سوف', 'كنا', 'لنا', 'معا',
    'كلما', 'عنه', 'إذ', 'كم', 'بل', 'هكذا',
    'واحد', 'اثنان', 'ثلاثة', 'أربعة', 'خمسة', 'ستة', 'سبعة',
    'ثمانية', 'تسعة', 'عشرة',
    'الأول', 'الثاني', 'الثالث', 'الرابع', 'الخامس', 'السادس',
    'السابع', 'الثامن', 'التاسع', 'العاشر'
}
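
# Arabic country names as they appear in the data, mapped to the English
# names used for map coordinates and display.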
COUNTRY_MAPPING = {
    'مصر': 'Egypt',
    'السعودية': 'Saudi Arabia',
    'الإمارات': 'UAE',
    'الكويت': 'Kuwait',
    'العراق': 'Iraq',
    'سوريا': 'Syria',
    'لبنان': 'Lebanon',
    'الأردن': 'Jordan',
    'فلسطين': 'Palestine',
    'اليمن': 'Yemen',
    'عمان': 'Oman',
    'قطر': 'Qatar',
    'البحرين': 'Bahrain',
    'السودان': 'Sudan',
    'ليبيا': 'Libya',
    'تونس': 'Tunisia',
    'الجزائر': 'Algeria',
    'المغرب': 'Morocco',
    'موريتانيا': 'Mauritania'
}
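
# Poems from countries not in COUNTRY_MAPPING still get summaries, but are
# skipped when the map is built.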
st.set_page_config(
    page_title="Contemporary Arabic Poetry Analysis",
    page_icon="📚",
    layout="wide"
)
@st.cache_resource
def load_models():
    """Load and cache the models to prevent reloading."""
    # The embedding tokenizer must match the AraBERT embedding model,
    # not the sentiment model.
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True  # deprecated in newer transformers; top_k=None is the equivalent
    )
    return tokenizer, bert_model, emotion_classifier
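
# With return_all_scores=True the pipeline returns one score per label for
# each input, e.g. emotion_classifier("نص")[0] ->
#   [{'label': 'LABEL_0', 'score': ...}, {'label': 'LABEL_1', 'score': ...},
#    {'label': 'LABEL_2', 'score': ...}]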
def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word
    boundaries. Word count is used as a rough proxy for token length."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        if current_length + 1 > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
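
# e.g. a 600-word poem becomes two chunks: the first 512 words and the
# remaining 88.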
def get_country_coordinates():
    """Return approximate center coordinates for each Arab country on the map."""
    return {
        'Egypt': [26.8206, 30.8025],
        'Saudi Arabia': [23.8859, 45.0792],
        'UAE': [23.4241, 53.8478],
        'Kuwait': [29.3117, 47.4818],
        'Iraq': [33.2232, 43.6793],
        'Syria': [34.8021, 38.9968],
        'Lebanon': [33.8547, 35.8623],
        'Jordan': [30.5852, 36.2384],
        'Palestine': [31.9522, 35.2332],
        'Yemen': [15.5527, 48.5164],
        'Oman': [21.4735, 55.9754],
        'Qatar': [25.3548, 51.1839],
        'Bahrain': [26.0667, 50.5577],
        'Sudan': [12.8628, 30.2176],
        'Libya': [26.3351, 17.2283],
        'Tunisia': [33.8869, 9.5375],
        'Algeria': [28.0339, 1.6596],
        'Morocco': [31.7917, -7.0926],
        'Mauritania': [21.0079, -10.9408]
    }
def create_topic_map(summaries):
    """Build a folium map with one marker per country, colored by the
    dominant sentiment and annotated with its top topics."""
    coordinates = get_country_coordinates()
    m = folium.Map(location=[27.0, 42.0], zoom_start=5)
    # format_emotions() has already mapped the raw model labels
    # (LABEL_0/1/2) to these display names.
    sentiment_colors = {
        'Positive': 'green',
        'Negative': 'red',
        'Neutral': 'blue'
    }
    for summary in summaries:
        country_en = COUNTRY_MAPPING.get(summary['country'])
        if country_en and country_en in coordinates:
            dominant_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else 'Neutral'
            circle_color = sentiment_colors.get(dominant_emotion, 'gray')
            popup_content = f"""
            <b>{country_en}</b><br>
            <b>Sentiment Distribution:</b><br>
            {'<br>'.join(f"• {e['emotion']}: {e['count']}" for e in summary['top_emotions'][:3])}<br>
            <b>Top Topic:</b><br>
            {summary['top_topics'][0]['topic'] if summary['top_topics'] else 'No topics'}<br>
            Total Poems: {summary['total_poems']}
            """
            folium.CircleMarker(
                location=coordinates[country_en],
                radius=10,
                popup=folium.Popup(popup_content, max_width=300),
                color=circle_color,
                fill=True
            ).add_to(m)
    legend_html = """
    <div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border: 2px solid grey; border-radius: 5px">
        <p><b>Sentiment:</b></p>
        <p><span style="color: green;">●</span> Positive</p>
        <p><span style="color: red;">●</span> Negative</p>
        <p><span style="color: blue;">●</span> Neutral</p>
    </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))
    return m
def create_arabic_wordcloud(text, title):
    """Generate a word cloud figure from Arabic text using the bundled font."""
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig
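
# Note: WordCloud draws Arabic in isolated, left-to-right letter forms; fully
# shaped right-to-left output would need extra preprocessing (e.g. the
# arabic-reshaper and python-bidi packages, which this app does not use).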
def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)
def classify_emotion(text, classifier):
    """Classify sentiment for a complete text, chunking it so each chunk
    fits within the model's 512-token limit and averaging chunk scores."""
    try:
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        for word in words:
            # Count subword tokens without the special tokens that
            # encode() would otherwise add for every single word.
            word_tokens = len(classifier.tokenizer.encode(word, add_special_tokens=False))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        if not chunks:
            chunks = [text]
        all_scores = []
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=512)
                all_scores.append(result[0])
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue
        if all_scores:
            # Sum each label's scores across chunks, then average.
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                for score in scores:
                    label = score['label']
                    label_scores[label] = label_scores.get(label, 0) + score['score']
            avg_scores = {label: total / count for label, total in label_scores.items()}
            return max(avg_scores.items(), key=lambda x: x[1])[0]
        return "LABEL_2"  # default to neutral when no chunk could be scored
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"
def get_embedding_for_text(text, tokenizer, model):
    """Embed a complete text by averaging the [CLS] embeddings of its
    chunks, weighted by chunk length in words."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_weights = []
    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token's hidden state as the chunk representation.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            # Record weights only for successful chunks so the lists stay aligned.
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        weights = np.array(chunk_weights, dtype=float)
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
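
# Length-weighted averaging approximates a mean over the full document, so
# longer chunks contribute proportionally more to the final embedding.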
def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics
def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate per-country summaries with flexible topic configuration."""
    summaries = []
    topic_model_params = {
        "language": "arabic",
        "calculate_probabilities": True,
        "min_topic_size": min_topic_size,
        "n_gram_range": (1, 1),
        "top_n_words": 15,
        "verbose": True,
    }
    st.write(f"Total documents: {len(df)}")
    st.write(f"Topic strategy: {topic_strategy}")
    st.write(f"Min topic size: {min_topic_size}")
    if topic_strategy == "Manual":
        topic_model_params["nr_topics"] = n_topics
    else:
        topic_model_params["nr_topics"] = "auto"
    # The embedding model is not called during fit because precomputed
    # embeddings are passed to fit_transform below.
    topic_model = BERTopic(
        embedding_model=bert_model,
        **topic_model_params)
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,
                                 max_df=1.0)
    topic_model.vectorizer_model = vectorizer
    # The same BERTopic instance is refit for every country, so the model
    # returned at the end reflects the last country processed.
    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)
        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        valid_texts = []
        clear_memory()
        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    # Track the text alongside its embedding so the two stay
                    # aligned even when an embedding fails mid-list.
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
                    continue
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                continue
            if i % 10 == 0:
                clear_memory()
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
        texts = valid_texts
        embeddings = np.array(embeddings)
        clear_memory()
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            if i % 10 == 0:
                clear_memory()
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                continue
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_counts = Counter(topics)
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue
    return summaries, topic_model
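
# Each summary dict has the shape:
#   {'country': str, 'total_poems': int,
#    'top_topics': [{'topic': str, 'count': int}, ...],
#    'top_emotions': [{'emotion': str, 'count': int}, ...]}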
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()
# Main app interface
st.title("📚 Contemporary Arabic Poetry Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()
        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])
        # Cap the workload: analyze at most the first 20 poems per country.
        sampled_df = df.groupby('country').apply(lambda x: x.head(20)).reset_index(drop=True)
        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)
        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )
            if topic_strategy == "Manual":
                # Bound the slider by the sampled data actually processed;
                # a corpus cannot yield more topics than it has documents.
                n_documents = len(sampled_df)
                min_topics = 5
                max_topics = min(500, max(min_topics + 1, n_documents))
                default_topics = min(20, max_topics)
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )
                st.info(f"""
💡 For your dataset of {n_documents:,} documents:
- Available topic range: {min_topics}-{max_topics}
- Recommended range: {max(min_topics, max_topics // 10)}-{max(min_topics + 1, max_topics // 3)} for optimal coherence
""")
        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )
        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    sampled_df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )
            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
                with tab1:
                    for summary in summaries:
                        with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")
                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)
                with tab2:
                    st.subheader("Global Topic Distribution")
                    # The model was last fit on the final country's poems, so
                    # this "global" view reflects that most recent fit.
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                with tab3:
                    st.subheader("Topic and Sentiment Distribution Map")
                    topic_map = create_topic_map(summaries)
                    st.components.v1.html(topic_map._repr_html_(), height=600)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)