import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
import country_converter as coco
import time
import gc

def clear_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")

ARABIC_STOP_WORDS = {
    'ูู', 'ู ู', 'ุฅูู', 'ุนูู', 'ุนูู', 'ุนู', 'ู ุน', 'ุฎูุงู', 'ุญุชู', 'ุญุชู', 'ุฅุฐุง',
    'ุซู ', 'ุฃู', 'ู', 'ู', 'ุจ', 'ู', 'ูู', 'ุงู', 'ูุฐุง',
    'ูุฐู', 'ุฐูู', 'ุชูู', 'ูุคูุงุก', 'ูู ', 'ูู', 'ูู', 'ูู', 'ููุง', 'ูุญู',
    'ุงูุช', 'ุงูุชู ', 'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ุงู', 'ูู',
    'ุจุนุถ', 'ุบูุฑ', 'ุญูู', 'ุนูุฏ', 'ูุฏ', 'ููุฏ', 'ูู ', 'ูู', 'ูู',
    'ู ุง', 'ู ุงุฐุง', 'ู ุชู', 'ููู', 'ุงูู', 'ูู ุงุฐุง', 'ุงูุฐู', 'ุงูุชู',
    'ุงูุฐูู', 'ุงููุงุชู', 'ุงูููุงุชู', 'ุงูุงู', 'ุจูู', 'ููู', 'ุชุญุช',
    'ุงู ุงู ', 'ุฎูู', 'ุญูู', 'ูุจู', 'ุจุนุฏ', 'ุฃู', 'ูู', 'ูู ุง', 'ููุง',
    'ู ูุฐ', 'ููุณ', 'ุญูุซ', 'ููุงู', 'ุฌุฏุง', 'ุฐุงุช', 'ุถู ู', 'ุงูู', 'ูุฏู',
    'ุนููู', 'ู ุซู', 'ุฃู ุง', 'ูุฏู', 'ููู', 'ููู ', 'ููู', 'ุงูุถุง', 'ูุงุฒู ',
    'ูุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช', 'ุถุฏ', 'ูุง', 'ูุง', 'ุงู ุง',
    'ุจูุง', 'ุงู', 'ุจู', 'ุงูู', 'ูู ุง', 'ุงูุง', 'ุงููู', 'ูู', 'ูู', 'ุงุฐุง', 'ุจูุง', 'ุงู', 'ูุฏูู', 'ูุฏูู', 'ุงูู', 'ููุช', 'ููุณ', 'ุงููุง', 'ููุช',
    'ูุซู ', 'ูุฃู', 'ูู', 'ูุจ', 'ูู', 'ููู', 'ูุงู',
    'ููุฐุง', 'ููุฐู', 'ูุฐูู', 'ูุชูู', 'ููุคูุงุก', 'ููู ', 'ููู', 'ููู', 'ููู', 'ููุญู',
    'ูุงูุช', 'ูุงูุชู ', 'ููุงู', 'ููุงูุช', 'ููููู', 'ูุชููู', 'ูุงู', 'ููู',
    'ูุจุนุถ', 'ูุบูุฑ', 'ูุญูู', 'ูุนูุฏ', 'ููุฏ', 'ูููุฏ', 'ููู ', 'ููู', 'ููู',
    'ูู ุง', 'ูู ุงุฐุง', 'ูู ุชู', 'ูููู', 'ูุงูู', 'ููู ุงุฐุง', 'ูุงูุฐู', 'ูุงูุชู',
    'ูุงูุฐูู', 'ูุงููุงุชู', 'ูุงูููุงุชู', 'ูุงูุงู', 'ูุจูู', 'ูููู', 'ูููุง', 'ูุชุญุช',
    'ูุงู ุงู ', 'ูุฎูู', 'ูุญูู', 'ููุจู', 'ูุจุนุฏ', 'ูุฃู', 'ููู', 'ููู ุง', 'ูููุง',
    'ูู ูุฐ', 'ูููุณ', 'ูุญูุซ', 'ูููุงู', 'ูุฌุฏุง', 'ูุฐุงุช', 'ูุถู ู', 'ูุงูู', 'ููุฏู',
    'ูุนููู', 'ูู ุซู', 'ูุฃู ุง', 'ูููู', 'ูููู ', 'ูููู', 'ูุงูุถุง', 'ููุงุฒู ',
    'ููุฌุจ', 'ูุตุงุฑ', 'ูุตุงุฑุช', 'ูุถุฏ', 'ููุง', 'ููุง', 'ูุงู ุง',
    'ูุจูุง', 'ูุงู', 'ูุจู', 'ูุงูู', 'ููู ุง', 'ูุงูุง', 'ูุงููู', 'ููู', 'ููู', 'ูููุช',
    'ููู', 'ูู ู', 'ูุนูู', 'ูุนูู', 'ูุนู', 'ูู ุน', 'ูุญุชู', 'ูุฅุฐุง',
    'ููุฐุง', 'ููุฐู', 'ูุฐูู', 'ูุชูู', 'ููู', 'ููู', 'ููุญู',
    'ููุงู', 'ููุงูุช', 'ููู', 'ูุจุนุถ', 'ูุญูู', 'ูุนูุฏ', 'ููุฏ',
    'ูููุฏ', 'ููู ', 'ููู', 'ูู ุง', 'ูููู', 'ูุงูู', 'ูุงูุฐู',
    'ูุจูู', 'ููุจู', 'ูุจุนุฏ', 'ููู', 'ูููุง', 'ูููุงู', 'ูุงูู',
    'ู ูู', 'ุงูุง', 'ูููุง', 'ููุง', 'ููู ', 'ููู', 'ุนููู', 'ู ููุง', 'ูู ุง', 'ููู ', 'ููู', 'ูุงูู', 'ูู', 'ููู', 'ุจู', 'ูุญู', 'ูู', 'ุณูู', 'ููุง', 'ููุง', 'ู ุนุง', 'ููู ุง', 'ูุฅุฐุง', 'ู ูู', 'ุนูู', 'ุฅุฐ', 'ูู ', 'ุจู', 'ูููุง', 'ููุฐุง', 'ููู ', 'ููุฏู', 'ูุนููู', 'ูู ุซู',
    'ูุงุญุฏ', 'ุงุซูุงู', 'ุซูุงุซุฉ', 'ุฃุฑุจุนุฉ', 'ุฎู ุณุฉ', 'ุณุชุฉ', 'ุณุจุนุฉ',
    'ุซู ุงููุฉ', 'ุชุณุนุฉ', 'ุนุดุฑุฉ',
    'ุงูุฃูู', 'ุงูุซุงูู', 'ุงูุซุงูุซ', 'ุงูุฑุงุจุน', 'ุงูุฎุงู ุณ', 'ุงูุณุงุฏุณ',
    'ุงูุณุงุจุน', 'ุงูุซุงู ู', 'ุงูุชุงุณุน', 'ุงูุนุงุดุฑ'
}

COUNTRY_MAPPING = {
    'مصر': 'Egypt',
    'السعودية': 'Saudi Arabia',
    'الإمارات': 'UAE',
    'الكويت': 'Kuwait',
    'العراق': 'Iraq',
    'سوريا': 'Syria',
    'لبنان': 'Lebanon',
    'الأردن': 'Jordan',
    'فلسطين': 'Palestine',
    'اليمن': 'Yemen',
    'عمان': 'Oman',
    'قطر': 'Qatar',
    'البحرين': 'Bahrain',
    'السودان': 'Sudan',
    'ليبيا': 'Libya',
    'تونس': 'Tunisia',
    'الجزائر': 'Algeria',
    'المغرب': 'Morocco',
    'موريتانيا': 'Mauritania'
}

st.set_page_config(
    page_title="Contemporary Arabic Poetry Analysis",
    page_icon="📚",
    layout="wide"
)

@st.cache_resource  # cache the loaded models across Streamlit reruns, per the docstring's intent
def load_models():
    """Load and cache the models to prevent reloading."""
    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True  # return a score for every label so chunk scores can be averaged
    )
    return tokenizer, bert_model, emotion_classifier
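
# Note on the classifier's output shape: with return_all_scores=True, each call
# returns one list of {'label', 'score'} dicts covering every class, e.g.
# (scores purely illustrative):
#   emotion_classifier(some_arabic_text)[0]
#   # -> [{'label': 'LABEL_0', 'score': 0.10},
#   #     {'label': 'LABEL_1', 'score': 0.75},
#   #     {'label': 'LABEL_2', 'score': 0.15}]
# LABEL_0/1/2 are the names that format_emotions() and create_topic_map() below
# map to Negative / Positive / Neutral.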

def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word boundaries.

    Word count is used as a cheap proxy for token count; the tokenizers downstream
    truncate to 512 tokens as a safety net.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        if current_length + 1 > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
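
# For example, a 1,200-word poem comes back as three chunks of roughly
# 512, 512 and 176 words; get_embedding_for_text() below embeds each chunk
# separately and averages the chunk embeddings, weighted by chunk length.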

def get_country_coordinates():
    """Returns dictionary of Arab country coordinates."""
    return {
        'Egypt': [26.8206, 30.8025],
        'Saudi Arabia': [23.8859, 45.0792],
        'UAE': [23.4241, 53.8478],
        'Kuwait': [29.3117, 47.4818],
        'Iraq': [33.2232, 43.6793],
        'Syria': [34.8021, 38.9968],
        'Lebanon': [33.8547, 35.8623],
        'Jordan': [30.5852, 36.2384],
        'Palestine': [31.9522, 35.2332],
        'Yemen': [15.5527, 48.5164],
        'Oman': [21.4735, 55.9754],
        'Qatar': [25.3548, 51.1839],
        'Bahrain': [26.0667, 50.5577],
        'Sudan': [12.8628, 30.2176],
        'Libya': [26.3351, 17.2283],
        'Tunisia': [33.8869, 9.5375],
        'Algeria': [28.0339, 1.6596],
        'Morocco': [31.7917, -7.0926],
        'Mauritania': [21.0079, -10.9408]
    }

def create_topic_map(summaries):
    """Build a folium map with one circle marker per country, colored by its dominant sentiment."""
    # Debug print to check incoming data
    print("DEBUG - First summary emotions:", summaries[0]['top_emotions'])
    coordinates = get_country_coordinates()
    m = folium.Map(location=[27.0, 42.0], zoom_start=5)
    sentiment_colors = {
        'LABEL_1': 'green',  # Positive
        'LABEL_0': 'red',    # Negative
        'LABEL_2': 'blue'    # Neutral
    }
    # Map the display names produced by format_emotions() back to raw labels
    REVERSE_EMOTION_LABELS = {
        'positive': 'LABEL_1',
        'negative': 'LABEL_0',
        'neutral': 'LABEL_2'
    }
    for summary in summaries:
        country_en = COUNTRY_MAPPING.get(summary['country'])
        if country_en and country_en in coordinates:
            dominant_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "neutral"
            dominant_label = REVERSE_EMOTION_LABELS.get(dominant_emotion.lower(), 'LABEL_2')
            circle_color = sentiment_colors.get(dominant_label, 'gray')
            # Debug print
            print(f"DEBUG - Country: {country_en}, Emotion: {dominant_emotion}, Label: {dominant_label}, Color: {circle_color}")
            popup_content = f"""
            <b>{country_en}</b><br>
            <b>Sentiment Distribution:</b><br>
            {'<br>'.join(f"• {e['emotion']}: {e['count']}" for e in summary['top_emotions'][:3])}<br>
            <b>Top Topic:</b><br>
            {summary['top_topics'][0]['topic'] if summary['top_topics'] else 'No topics'}<br>
            Total Poems: {summary['total_poems']}
            """
            folium.CircleMarker(
                location=coordinates[country_en],
                radius=10,
                popup=folium.Popup(popup_content, max_width=300),
                color=circle_color,
                fill=True
            ).add_to(m)
    legend_html = """
    <div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border: 2px solid grey; border-radius: 5px">
        <p><b>Sentiment:</b></p>
        <p><span style="color: green;">●</span> Positive</p>
        <p><span style="color: red;">●</span> Negative</p>
        <p><span style="color: blue;">●</span> Neutral</p>
    </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))
    return m

def create_arabic_wordcloud(text, title):
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig

def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)

def classify_emotion(text, classifier):
    """Classify the sentiment of a complete text, chunking it to respect the 512-token limit."""
    try:
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        # Build chunks whose tokenized length stays within the model's 512-token limit
        for word in words:
            word_tokens = len(classifier.tokenizer.encode(word))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        if not chunks:
            chunks = [text]
        all_scores = []
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=512)
                scores = result[0]
                all_scores.append(scores)
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue
        if all_scores:
            # Average each label's score across chunks and return the highest-scoring label
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                for score in scores:
                    label = score['label']
                    label_scores[label] = label_scores.get(label, 0) + score['score']
            avg_scores = {label: score / count for label, score in label_scores.items()}
            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
            return final_emotion
        return "LABEL_2"
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"

def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding for a complete text by averaging chunk embeddings."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_lengths = []
    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token representation as the chunk embedding
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_lengths.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        # Weight each successfully embedded chunk by its word count
        weights = np.array(chunk_lengths, dtype=float)
        weights = weights / weights.sum()
        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
        return weighted_embedding
    return np.zeros(model.config.hidden_size)

def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics

def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
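
# For example (counts illustrative), Counter(['LABEL_1', 'LABEL_1', 'LABEL_0']).most_common(2)
# passed through format_emotions gives:
#   [{'emotion': 'Positive', 'count': 2}, {'emotion': 'Negative', 'count': 1}]
# which is the shape the country summaries and create_topic_map() expect.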

def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate per-country summaries with flexible topic configuration."""
    summaries = []
    topic_model_params = {
        "language": "arabic",
        "calculate_probabilities": True,
        "min_topic_size": min_topic_size,
        "n_gram_range": (1, 1),
        "top_n_words": 15,
        "verbose": True,
    }
    st.write(f"Total documents: {len(df)}")
    st.write(f"Topic strategy: {topic_strategy}")
    st.write(f"Min topic size: {min_topic_size}")
    if topic_strategy == "Manual":
        topic_model_params["nr_topics"] = n_topics
    else:
        topic_model_params["nr_topics"] = "auto"
    topic_model = BERTopic(
        embedding_model=bert_model,
        **topic_model_params)
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,
                                 max_df=1.0)
    topic_model.vectorizer_model = vectorizer
    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)
        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        valid_texts = []
        clear_memory()
        # Stage 1 (0-40% of the bar): embed each poem, keeping texts and embeddings aligned
        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
                    continue
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                continue
            if i % 10 == 0:
                clear_memory()
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
        # Keep only the texts whose embeddings succeeded so both lists stay aligned
        texts = valid_texts
        embeddings = np.array(embeddings)
        clear_memory()
        # Stage 2 (40-70%): classify the emotion of each poem
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            if i % 10 == 0:
                clear_memory()
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
        # Stage 3 (70-100%): fit topics on the precomputed embeddings and summarize
        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                continue
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_counts = Counter(topics)
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue
    return summaries, topic_model

try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Contemporary Arabic Poetry Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()
        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])
        # Cap the workload at 20 poems per country
        sampled_df = df.groupby('country').apply(lambda x: x.head(20)).reset_index(drop=True)
        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)
        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )
            if topic_strategy == "Manual":
                n_documents = len(df)
                max_topics = 500
                min_topics = 5
                default_topics = 20
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (up to {max_topics})"
                )
                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)
        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )
        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    sampled_df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )
            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
                with tab1:
                    for summary in summaries:
                        with st.expander(f"📖 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")
                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems.astype(str))
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)
                with tab2:
                    st.subheader("Global Topic Distribution")
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                with tab3:
                    st.subheader("Topic and Sentiment Distribution Map")
                    topic_map = create_topic_map(summaries)
                    st.components.v1.html(topic_map._repr_html_(), height=600)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("📁 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
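
# A minimal CSV matching this format could look like the sketch below
# (rows purely illustrative; country names may also be given in Arabic,
# which is what COUNTRY_MAPPING above expects when drawing the map):
#
#   country,poem
#   مصر,"نص القصيدة الأولى ..."
#   فلسطين,"نص القصيدة الثانية ..."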