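"""Streamlit app for analysing Arabic poems.

Users upload a CSV or Excel file with `country` and `poem` columns; the app
embeds each poem with AraBERT, classifies sentiment with a CAMeLBERT
classifier, extracts per-country themes with BERTopic, and visualises the
results as country summaries, word clouds, and a folium theme map.
"""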
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static
# Path to the Arabic font bundled with the app, used by the word cloud renderer
current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
    'أو', 'و', 'ف', 'ك', 'ب', 'ل', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حول', 'قبل', 'بعد',
    'أن', 'إن', 'كم', 'هل', 'لك', 'له', 'لها', 'لنا', 'لهم', 'منذ',
    'كما', 'فقد', 'ليس', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه',
    'لدى', 'عليه', 'مثل', 'أما', 'لأن', 'كذلك', 'هكذا', 'ومن',
    'انها', 'جميع', 'ايضا', 'لازم', 'حاجة', 'يجب', 'صار', 'صارت', 'ضد'
}
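# The stop word set above is shared by the word cloud generator and by the
# CountVectorizer that BERTopic uses for topic keyword extraction.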
st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)
@st.cache_resource
def load_models():
    """Load and cache the models"""
    # use_fast=True for faster tokenization
    tokenizer = AutoTokenizer.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        use_fast=True
    )
    # low_cpu_mem_usage reduces peak memory while loading.
    # torchscript is intentionally not enabled: it forces tuple outputs, which
    # would break .last_hidden_state access and the pipeline's post-processing.
    bert_model = AutoModel.from_pretrained(
        "aubmindlab/bert-base-arabertv2",
        low_cpu_mem_usage=True
    )
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        low_cpu_mem_usage=True
    )
    # Batched sentiment pipeline, forced onto CPU
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=tokenizer,
        batch_size=32,
        device=-1  # force CPU usage
    )
    return tokenizer, bert_model, emotion_classifier
# Batch processing helper for the emotion classifier
def process_texts_in_batches(texts, classifier, batch_size=32):
    """Process texts in batches for better CPU utilization"""
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    results = []
    for batch in batches:
        batch_results = classifier(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results
# Cache embeddings to avoid recomputation; the underscore-prefixed arguments
# are excluded from Streamlit's cache key because models are not hashable.
@st.cache_data(show_spinner=False)
def get_cached_embeddings(text, _tokenizer, _model):
    """Cache embeddings to avoid recomputation"""
    return get_embedding_for_text(text, _tokenizer, _model)
# Approximate country centroids (lat, lon) keyed by ISO2 code.
# country_converter resolves names to codes but does not provide coordinates,
# so these are hard-coded approximations for the countries the app expects.
COUNTRY_COORDS = {
    'EG': (26.8, 30.8), 'SA': (23.9, 45.1), 'IQ': (33.2, 43.7), 'SY': (34.8, 39.0),
    'JO': (31.3, 36.8), 'LB': (33.9, 35.9), 'PS': (31.9, 35.2), 'YE': (15.6, 48.0),
    'OM': (21.5, 57.0), 'AE': (23.4, 53.8), 'QA': (25.3, 51.2), 'KW': (29.3, 47.5),
    'BH': (26.0, 50.5), 'LY': (26.3, 17.2), 'TN': (34.0, 9.5), 'DZ': (28.0, 1.7),
    'MA': (31.8, -7.1), 'SD': (12.9, 30.2), 'MR': (20.3, -10.3)
}

def create_theme_map(summaries, topic_model):
    """Create an interactive map showing theme distributions across countries"""
    try:
        # Base map centered on the Arab world
        m = folium.Map(location=[25, 45], zoom_start=4)
        cc = coco.CountryConverter()
        for summary in summaries:
            try:
                # Resolve the country name to an ISO2 code, then look up its centroid
                country_iso = cc.convert(names=[summary['country']], to='ISO2')
                if isinstance(country_iso, list):
                    country_iso = country_iso[0]
                if country_iso not in COUNTRY_COORDS:
                    st.warning(f"No coordinates available for {summary['country']}")
                    continue
                lat, lon = COUNTRY_COORDS[country_iso]
                # Popup content with the top themes for this country
                popup_content = f"""
                <h4>{summary['country']}</h4>
                <b>Top Themes:</b><br>
                {'<br>'.join([f"• {topic['topic']}: {topic['count']}"
                              for topic in summary['top_topics'][:5]])}
                """
                # Add a marker for each country
                folium.CircleMarker(
                    location=[lat, lon],
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                st.warning(f"Could not process {summary['country']}: {str(e)}")
                continue
        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None
def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word boundaries.

    The word count is used as a rough proxy for the tokenizer's token count.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + 1 > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
def create_arabic_wordcloud(text, title):
    """Generate a word cloud figure for Arabic text using the bundled font."""
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig
def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and normalizing."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)
def classify_emotion(text, classifier):
    """Classify emotion for complete text with proper token handling."""
    try:
        # Split the text into chunks that stay within the model's 512-token limit
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        for word in words:
            # Count tokens without special tokens so per-word counts are not inflated
            word_tokens = len(classifier.tokenizer.encode(word, add_special_tokens=False))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        if not chunks:
            chunks = [text]

        all_scores = []
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=512)
                scores = result[0]
                # The pipeline returns a single dict per input unless all scores are
                # requested, so normalise to a list of {label, score} dicts.
                if isinstance(scores, dict):
                    scores = [scores]
                all_scores.append(scores)
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue

        if all_scores:
            # Average the scores per label across chunks and keep the strongest label
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                for score in scores:
                    label = score['label']
                    if label not in label_scores:
                        label_scores[label] = 0
                    label_scores[label] += score['score']
            avg_scores = {label: score / count for label, score in label_scores.items()}
            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
            return final_emotion
        return "LABEL_2"
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"
def get_embedding_for_text(text, tokenizer, model):
    """Get embedding for complete text."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_weights = []  # weight each chunk by its word count

    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token embedding as the chunk representation
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        weights = np.array(chunk_weights, dtype=float)
        weights = weights / weights.sum()
        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
        return weighted_embedding
    return np.zeros(model.config.hidden_size)
def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics
def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
def get_optimized_topic_model(bert_model, nr_topics="auto", min_topic_size=5):
    """Configure BERTopic for better CPU performance."""
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=True
    )
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate summaries with flexible topic configuration."""
    summaries = []

    # Honour the manual topic count when one is provided; otherwise let BERTopic decide
    nr_topics = n_topics if (topic_strategy == "Manual" and n_topics) else "auto"
    topic_model = get_optimized_topic_model(bert_model, nr_topics=nr_topics, min_topic_size=min_topic_size)
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,
                                 max_df=1.0)
    topic_model.vectorizer_model = vectorizer

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        valid_texts = []  # keep texts aligned with their embeddings

        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
                    continue
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                continue
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

        # Only keep the texts whose embeddings were generated successfully
        texts = valid_texts
        embeddings = np.array(embeddings)

        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                continue

            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_counts = Counter(topics)

            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue

    return summaries, topic_model
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()
# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])
st.subheader("Topic Modeling Settings") | |
col1, col2 = st.columns(2) | |
with col1: | |
topic_strategy = st.radio( | |
"Topic Number Strategy", | |
["Auto", "Manual"], | |
help="Choose whether to let the model determine the optimal number of topics or set it manually" | |
) | |
if topic_strategy == "Manual": | |
n_documents = len(df) | |
max_topics = 500 | |
min_topics = 5 | |
default_topics = 20 | |
n_topics = st.slider( | |
"Number of Topics", | |
min_value=min_topics, | |
max_value=max_topics, | |
value=default_topics, | |
help=f"Select the desired number of topics (max {max_topics} based on dataset size)" | |
) | |
st.info(f""" | |
๐ก For your dataset of {n_documents:,} documents: | |
- Available topic range: {min_topics}-{max_topics} | |
- Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence | |
""") | |
with col2: | |
top_n = st.number_input( | |
"Number of top topics/emotions to display:", | |
min_value=1, | |
max_value=100, | |
value=10 | |
) | |
if st.button("Process Data"): | |
with st.spinner("Processing your data..."): | |
summaries, topic_model = process_and_summarize( | |
df, | |
bert_tokenizer, | |
bert_model, | |
emotion_classifier, | |
top_n=top_n, | |
topic_strategy=topic_strategy, | |
n_topics=n_topics if topic_strategy == "Manual" else None, | |
min_topic_size=3 | |
) | |
            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])

                with tab1:
                    for summary in summaries:
                        with st.expander(f"📚 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")
                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)
                with tab2:
                    st.subheader("Global Topic Distribution")
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                with tab3:
                    st.subheader("Thematic Distribution Map")
                    theme_map = create_theme_map(summaries, topic_model)
                    if theme_map is not None:
                        folium_static(theme_map)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
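# To run locally (assuming this script is saved as app.py and the font file
# ArabicR2013-J25x.ttf sits next to it):
#   streamlit run app.py
# Required packages: streamlit, pandas, transformers, torch, scikit-learn,
# bertopic, wordcloud, matplotlib, folium, streamlit-folium, country_converter.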