# SoLProject/app.py
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static
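# Resolve the bundled Arabic font (assumed to ship alongside app.py) used by the word clouds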
current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
ARABIC_STOP_WORDS = {
    'ููŠ', 'ู…ู†', 'ุฅู„ู‰', 'ุนู„ู‰', 'ุนู†', 'ู…ุน', 'ุฎู„ุงู„', 'ุญุชู‰', 'ุฅุฐุง', 'ุซู…',
    'ุฃูˆ', 'ูˆ', 'ู', 'ู„', 'ุจ', 'ูƒ', 'ู„ู„', 'ุงู„', 'ู‡ุฐุง', 'ู‡ุฐู‡', 'ุฐู„ูƒ',
    'ุชู„ูƒ', 'ู‡ุคู„ุงุก', 'ู‡ู…', 'ู‡ู†', 'ู‡ูˆ', 'ู‡ูŠ', 'ู†ุญู†', 'ุงู†ุช', 'ุงู†ุชู…',
    'ูƒุงู†', 'ูƒุงู†ุช', 'ูŠูƒูˆู†', 'ุชูƒูˆู†', 'ุงูŠ', 'ูƒู„', 'ุจุนุถ', 'ุบูŠุฑ', 'ุญูˆู„',
    'ุนู†ุฏ', 'ู‚ุฏ', 'ู„ู‚ุฏ', 'ู„ู…', 'ู„ู†', 'ู„ูˆ', 'ู…ุง', 'ู…ุงุฐุง', 'ู…ุชู‰', 'ูƒูŠู',
    'ุงูŠู†', 'ู„ู…ุงุฐุง', 'ุงู„ุฐูŠ', 'ุงู„ุชูŠ', 'ุงู„ุฐูŠู†', 'ุงู„ู„ุงุชูŠ', 'ุงู„ู„ูˆุงุชูŠ',
    'ุงู„ุงู†', 'ุจูŠู†', 'ููˆู‚', 'ุชุญุช', 'ุงู…ุงู…', 'ุฎู„ู', 'ุญูŠู†', 'ู‚ุจู„', 'ุจุนุฏ',
    'ุฃู†', 'ู„ู‡', 'ู‚ูˆุฉ', 'ูƒู…ุง', 'ู„ู‡ุง', 'ู…ู†ุฐ', 'ูˆู‚ุฏ', 'ูˆู„ุง', 'ู†ูุณ', 'ูˆู„ู…',
    'ุญูŠุซ', 'ู‡ู†ุงูƒ', 'ุฌุฏุง', 'ุฐุงุช', 'ุถู…ู†', 'ุงู†ู‡', 'ู„ุฏู‰', 'ุนู„ูŠู‡', 'ู…ุซู„',
    'ูˆู„ู‡', 'ุฃู…ุง', 'ูˆุฃู†', 'ูˆูƒู„', 'ูˆู‚ุงู„', 'ู„ุฏูŠ', 'ูˆูƒุงู†', 'ููŠู‡', 'ูˆู‡ูŠ',
    'ูˆู‡ูˆ', 'ูƒู„ู…', 'ู„ูƒู†', 'ูˆููŠ', 'ูˆู‚ู', 'ูˆู„ู‚ุฏ', 'ูˆู…ู†', 'ูˆู‡ุฐุง', 'ุงูˆู„',
    'ุงู†ู‡ุง', 'ุฌู…ูŠุน', 'ุงูŠุถุง', 'ู„ุงุฒู…', 'ุญุงุฌุฉ', 'ุนู„ูŠ', 'ูŠุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช',
    'ุถุฏ'
}
st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)
@st.cache_resource
def load_models():
    """Load and cache the tokenizer, embedding model, and emotion classifier."""
    # Fast (Rust-backed) tokenizer for quicker preprocessing
    tokenizer = AutoTokenizer.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        use_fast=True
    )
    # AraBERT encoder used only to produce embeddings. Note: torchscript=True
    # would make the model return plain tuples, breaking the .last_hidden_state
    # access in get_embedding_for_text, so it is deliberately not set here.
    bert_model = AutoModel.from_pretrained(
        "aubmindlab/bert-base-arabertv2",
        low_cpu_mem_usage=True
    )
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        low_cpu_mem_usage=True
    )
    # Batched sentiment pipeline, pinned to CPU (device=-1)
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=tokenizer,
        batch_size=32,
        device=-1
    )
    return tokenizer, bert_model, emotion_classifier
def process_texts_in_batches(texts, classifier, batch_size=32):
    """Run the emotion classifier over texts in batches for better CPU utilization."""
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    results = []
    for batch in batches:
        batch_results = classifier(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results
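# Hypothetical usage sketch (this helper is not wired into the app below):
#   _, _, clf = load_models()
#   labels = process_texts_in_batches(["ู‚ุตูŠุฏุฉ ุฃูˆู„ู‰", "ู‚ุตูŠุฏุฉ ุซุงู†ูŠุฉ"], clf)
#   -> one {'label': ..., 'score': ...} dict per input text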
@st.cache_data
def get_cached_embeddings(text, _tokenizer, _model):
    """Cache embeddings per text. The leading underscores tell Streamlit not to
    try hashing the unhashable tokenizer/model objects."""
    return get_embedding_for_text(text, _tokenizer, _model)
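# Approximate country centroids (ISO2 -> (lat, lon)) for the map markers.
# country_converter maps names between classification schemes but does not
# supply coordinates, so these are hand-entered rough estimates; extend as needed.
COUNTRY_COORDS = {
    'EG': (26.8, 30.8), 'SA': (23.9, 45.1), 'IQ': (33.2, 43.7), 'SY': (34.8, 39.0),
    'JO': (30.6, 36.2), 'LB': (33.9, 35.9), 'PS': (31.9, 35.2), 'MA': (31.8, -7.1),
    'DZ': (28.0, 1.7), 'TN': (33.9, 9.5), 'LY': (26.3, 17.2), 'SD': (12.9, 30.2),
    'YE': (15.6, 48.5), 'OM': (21.5, 55.9), 'AE': (23.4, 53.8), 'QA': (25.3, 51.2),
    'KW': (29.3, 47.5), 'BH': (26.0, 50.5), 'MR': (21.0, -10.9)
}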
def create_theme_map(summaries, topic_model):
    """Create an interactive map showing theme distributions across countries."""
    try:
        # Base map centered on the Arab world
        m = folium.Map(location=[25, 45], zoom_start=4)
        cc = coco.CountryConverter()
        for summary in summaries:
            try:
                # Normalize the country name to ISO2, then look up its coordinates
                # (coco itself does not provide latitude/longitude)
                country_iso = cc.convert(names=[summary['country']], to='ISO2')
                if isinstance(country_iso, list):
                    country_iso = country_iso[0]
                if country_iso not in COUNTRY_COORDS:
                    st.warning(f"No coordinates available for {summary['country']}")
                    continue
                lat, lon = COUNTRY_COORDS[country_iso]
                # Popup listing the country's top themes
                popup_content = f"""
                <h4>{summary['country']}</h4>
                <b>Top Themes:</b><br>
                {'<br>'.join([f"• {topic['topic']}: {topic['count']}"
                              for topic in summary['top_topics'][:5]])}
                """
                folium.CircleMarker(
                    location=[lat, lon],
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                st.warning(f"Could not process {summary['country']}: {str(e)}")
                continue
        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None
def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word
    boundaries. Word count is a cheap proxy for the 512-token model limit."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        # One unit per word; the tokenizer calls downstream still truncate to 512 tokens
        if current_length + 1 > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
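# Worked example: a 1,200-word text splits into word-bounded chunks of at most 512:
#   split_text(' '.join(['ูƒู„ู…ุฉ'] * 1200))  ->  chunks of 512, 512, and 176 words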
def create_arabic_wordcloud(text, title):
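    """Render a word cloud from Arabic text using the bundled font.

    Note: WordCloud does not shape Arabic glyphs; for fully connected letters,
    preprocessing with arabic_reshaper and python-bidi is usually needed.
    """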
wordcloud = WordCloud(
width=1200,
height=600,
background_color='white',
font_path=font_path,
max_words=200,
stopwords=ARABIC_STOP_WORDS
).generate(text)
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title(title, fontsize=16, pad=20)
return fig
def clean_arabic_text(text):
"""Clean Arabic text by removing stop words and normalizing."""
words = text.split()
cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
return ' '.join(cleaned_words)
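# e.g. clean_arabic_text("ููŠ ุงู„ู„ูŠู„ ู†ุฌูˆู…") -> "ุงู„ู„ูŠู„ ู†ุฌูˆู…"
# ("ููŠ" is a stop word, and single-character tokens are dropped as well)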
def classify_emotion(text, classifier):
"""Classify emotion for complete text with proper token handling."""
try:
words = text.split()
chunks = []
current_chunk = []
current_length = 0
for word in words:
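            # encode() adds special tokens on every call, so this overcounts a
            # little, which only makes the 512-token chunking more conservative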
word_tokens = len(classifier.tokenizer.encode(word))
if current_length + word_tokens > 512:
if current_chunk:
chunks.append(' '.join(current_chunk))
current_chunk = [word]
current_length = word_tokens
else:
current_chunk.append(word)
current_length += word_tokens
if current_chunk:
chunks.append(' '.join(current_chunk))
if not chunks:
chunks = [text]
all_scores = []
        for chunk in chunks:
            try:
                # top_k=None returns scores for every label, which the averaging
                # below expects (the default output has only the single top label)
                result = classifier(chunk, truncation=True, max_length=512, top_k=None)
                scores = result[0]
                all_scores.append(scores)
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue
if all_scores:
label_scores = {}
count = len(all_scores)
for scores in all_scores:
for score in scores:
label = score['label']
if label not in label_scores:
label_scores[label] = 0
label_scores[label] += score['score']
avg_scores = {label: score/count for label, score in label_scores.items()}
final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
return final_emotion
return "LABEL_2"
except Exception as e:
st.warning(f"Error in emotion classification: {str(e)}")
return "LABEL_2"
def get_embedding_for_text(text, tokenizer, model):
"""Get embedding for complete text."""
    chunks = split_text(text)
    chunk_embeddings = []
    successful_chunks = []  # kept in lockstep with chunk_embeddings for weighting
    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token embedding as the chunk representation
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            successful_chunks.append(chunk)
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        # Average the chunk embeddings, weighting each by its word count so
        # longer chunks contribute proportionally more
        weights = np.array([len(chunk.split()) for chunk in successful_chunks])
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
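# e.g. get_embedding_for_text(poem, tokenizer, model) returns a 1-D vector of
# length model.config.hidden_size (768 for bert-base-arabertv2), or a zero
# vector if every chunk failed.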
def format_topics(topic_model, topic_counts):
"""Format topics for display."""
formatted_topics = []
for topic_num, count in topic_counts:
if topic_num == -1:
topic_label = "Miscellaneous"
else:
words = topic_model.get_topic(topic_num)
topic_label = " | ".join([word for word, _ in words[:5]])
formatted_topics.append({
'topic': topic_label,
'count': count
})
return formatted_topics
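# Produces entries such as {'topic': 'word1 | word2 | word3 | word4 | word5', 'count': 12}
# (the label joins the topic's top five words from the fitted model)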
def format_emotions(emotion_counts):
"""Format emotions for display."""
EMOTION_LABELS = {
'LABEL_0': 'Negative',
'LABEL_1': 'Positive',
'LABEL_2': 'Neutral'
}
formatted_emotions = []
for label, count in emotion_counts:
emotion = EMOTION_LABELS.get(label, label)
formatted_emotions.append({
'emotion': emotion,
'count': count
})
return formatted_emotions
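# e.g. format_emotions([('LABEL_1', 30), ('LABEL_0', 12)])
#   -> [{'emotion': 'Positive', 'count': 30}, {'emotion': 'Negative', 'count': 12}]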
def get_optimized_topic_model(bert_model, nr_topics="auto", min_topic_size=5):
    """Configure BERTopic for better CPU performance.

    Embeddings are precomputed and passed to fit_transform, so the
    embedding_model given here is only a fallback.
    """
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=True
    )
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate per-country summaries with flexible topic configuration."""
    summaries = []
    # Honor the user's topic settings rather than hardcoding them in the model
    nr_topics = n_topics if (topic_strategy == "Manual" and n_topics) else "auto"
    topic_model = get_optimized_topic_model(bert_model, nr_topics=nr_topics, min_topic_size=min_topic_size)
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                 min_df=1,
                                 max_df=1.0)
    topic_model.vectorizer_model = vectorizer
    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)
        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        valid_texts = []  # texts kept in lockstep with their embeddings
        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
        # Keep only texts whose embeddings succeeded; truncating the tail would
        # misalign texts and embeddings whenever a text in the middle fails
        texts = valid_texts
        embeddings = np.array(embeddings)
        if not texts:
            st.warning(f"No valid embeddings were generated for {country}; skipping")
            continue
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                continue
            # Note: the shared topic_model is refit for every country, so the
            # model returned at the end reflects the last country fitted
            topics, _ = topic_model.fit_transform(texts, embeddings)
            topic_counts = Counter(topics)
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
summaries.append({
'country': country,
'total_poems': len(texts),
'top_topics': top_topics,
'top_emotions': top_emotions
})
progress_bar.progress(1.0, text="Processing complete!")
except Exception as e:
st.warning(f"Could not generate topics for {country}: {str(e)}")
continue
return summaries, topic_model
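# Each summary produced above has the shape:
#   {'country': str, 'total_poems': int,
#    'top_topics': [{'topic': str, 'count': int}, ...],
#    'top_emotions': [{'emotion': str, 'count': int}, ...]}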
try:
bert_tokenizer, bert_model, emotion_classifier = load_models()
st.success("Models loaded successfully!")
except Exception as e:
st.error(f"Error loading models: {str(e)}")
st.stop()
# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
if uploaded_file is not None:
try:
if uploaded_file.name.endswith('.csv'):
df = pd.read_csv(uploaded_file)
else:
df = pd.read_excel(uploaded_file)
required_columns = ['country', 'poem']
if not all(col in df.columns for col in required_columns):
st.error("File must contain 'country' and 'poem' columns.")
st.stop()
df['country'] = df['country'].str.strip()
df = df.dropna(subset=['country', 'poem'])
st.subheader("Topic Modeling Settings")
col1, col2 = st.columns(2)
with col1:
topic_strategy = st.radio(
"Topic Number Strategy",
["Auto", "Manual"],
help="Choose whether to let the model determine the optimal number of topics or set it manually"
)
            if topic_strategy == "Manual":
                n_documents = len(df)
                min_topics = 5
                # Cap the slider by the dataset size instead of a fixed 500
                max_topics = max(min_topics + 1, min(500, n_documents))
                default_topics = min(20, max_topics)
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )
                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)
with col2:
top_n = st.number_input(
"Number of top topics/emotions to display:",
min_value=1,
max_value=100,
value=10
)
if st.button("Process Data"):
with st.spinner("Processing your data..."):
summaries, topic_model = process_and_summarize(
df,
bert_tokenizer,
bert_model,
emotion_classifier,
top_n=top_n,
topic_strategy=topic_strategy,
n_topics=n_topics if topic_strategy == "Manual" else None,
min_topic_size=3
)
if summaries:
st.success("Analysis complete!")
tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])
with tab1:
for summary in summaries:
                            with st.expander(f"📝 {summary['country']} ({summary['total_poems']} poems)"):
col1, col2 = st.columns(2)
with col1:
st.subheader("Top Topics")
for topic in summary['top_topics']:
                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
with col2:
st.subheader("Emotions")
for emotion in summary['top_emotions']:
                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
st.subheader("Word Cloud Visualization")
country_poems = df[df['country'] == summary['country']]['poem']
combined_text = ' '.join(country_poems)
wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
st.pyplot(wordcloud_fig)
with tab2:
st.subheader("Global Topic Distribution")
topic_info = topic_model.get_topic_info()
for _, row in topic_info.iterrows():
if row['Topic'] == -1:
topic_name = "Miscellaneous"
else:
words = topic_model.get_topic(row['Topic'])
topic_name = " | ".join([word for word, _ in words[:5]])
                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                    with tab3:
                        st.subheader("Thematic Distribution Map")
                        theme_map = create_theme_map(summaries, topic_model)
                        # create_theme_map returns None on failure, which would crash folium_static
                        if theme_map is not None:
                            folium_static(theme_map)
except Exception as e:
st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
st.write("### Expected File Format:")
example_df = pd.DataFrame({
'country': ['Egypt', 'Palestine'],
'poem': ['ู‚ุตูŠุฏุฉ ู…ุตุฑูŠุฉ', 'ู‚ุตูŠุฏุฉ ูู„ุณุทูŠู†ูŠุฉ']
})
st.dataframe(example_df)