import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
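
# To try the app locally (assuming this file is saved as app.py):
#   streamlit run app.py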

ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'هل', 'ال', 'هذا', 'هذه', 'ذلك',
    'تلك', 'هؤلاء', 'هم', 'هو', 'هي', 'هن', 'نحن', 'انت', 'انتم',
    'كان', 'كانت', 'يكون', 'تكون', 'ان', 'كل', 'بعض', 'غير', 'حول',
    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حول', 'قبل', 'بعد',
    'و', 'أي', 'كي', 'لك', 'كم', 'هل', 'له', 'من', 'لي', 'في', 'قلة',
    'كما', 'لها', 'منذ', 'فقد', 'هنا', 'ليس', 'لهم', 'حيث', 'هناك',
    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'فيه', 'عند',
    'أما', 'هذي', 'لأن', 'لكن', 'وكان', 'لدي', 'فكان', 'لكي', 'كله',
    'فهل', 'تلك', 'ولم', 'ولن', 'ولو', 'فلن', 'فلقد', 'ومن', 'هكذا',
    'الى', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
    'لازم', 'حاجة', 'على', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
}


st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)

@st.cache_resource
def load_models():
    """Load and cache the models to prevent reloading on every rerun."""
    # The embedding tokenizer must come from the same checkpoint as the
    # embedding model, so both point at AraBERT.
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    # Emotion/sentiment classification uses the CAMeL-Lab MSA sentiment model.
    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True
    )
    return tokenizer, bert_model, emotion_classifier

def split_text(text, max_words=512):
    """Split text into chunks of at most `max_words` words, preserving word boundaries.

    This counts words, not model tokens; the tokenizer's own truncation
    (max_length=512) remains the hard limit on token count downstream.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + 1 > max_words:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
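
# Illustrative check of the chunker (with a hypothetical `poem_text` string):
#   chunks = split_text(poem_text)
#   assert all(len(c.split()) <= 512 for c in chunks)
#   assert ' '.join(chunks) == ' '.join(poem_text.split())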


def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)

def classify_emotion(text, classifier):
    """Classify emotion for a complete text with proper token handling."""
    try:
        # Build chunks whose token counts stay under the model's 512-token
        # limit. Encoding each word separately includes special tokens, which
        # overestimates slightly and keeps chunks safely under the limit.
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_tokens = len(classifier.tokenizer.encode(word))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        if not chunks:
            chunks = [text]

        all_scores = []
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=512)
                scores = result[0]
                all_scores.append(scores)
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue

        if all_scores:
            # Average each label's score across chunks and pick the winner.
            label_scores = {}
            count = len(all_scores)

            for scores in all_scores:
                for score in scores:
                    label = score['label']
                    label_scores[label] = label_scores.get(label, 0) + score['score']

            avg_scores = {label: score / count for label, score in label_scores.items()}
            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
            return final_emotion

        # Fall back to the neutral label when no chunk could be scored.
        return "LABEL_2"

    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"
def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding for a complete text via weighted chunk averaging."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_weights = []

    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Use the [CLS] token embedding as the chunk representation.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        # Weight each chunk by its word count so longer chunks contribute
        # more; weights are tracked per successful chunk so a failed chunk
        # cannot misalign them.
        weights = np.array(chunk_weights, dtype=float)
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
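
# Design note: the [CLS] vector above is one common sentence-level pooling for
# BERT-style encoders; mean pooling over non-padding tokens is a frequently
# used alternative that may suit some checkpoints better. A minimal sketch,
# reusing `inputs` and `outputs` from the loop above:
#   mask = inputs['attention_mask'].unsqueeze(-1)
#   summed = (outputs.last_hidden_state * mask).sum(dim=1)
#   embedding = (summed / mask.sum(dim=1)).cpu().numpy()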


def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])

        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics

def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }

    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
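
# NOTE: this LABEL_0/1/2 -> name mapping assumes the sentiment checkpoint
# reports bare LABEL_* ids. If the model config defines readable id2label
# names (e.g. 'positive'), the pipeline returns those strings instead, and
# the .get(label, label) fallback above passes them through unchanged.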


def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=30):
    """Process the data and generate summaries with flexible topic configuration."""
    summaries = []

    # Use the Arabic stop word list inside BERTopic's CountVectorizer so
    # topic keywords are not dominated by function words.
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS))

    topic_model_params = {
        "language": "multilingual",
        "calculate_probabilities": True,
        "min_topic_size": min_topic_size,
        "n_gram_range": (1, 3),
        "top_n_words": 15,
        "vectorizer_model": vectorizer,
        "verbose": True
    }

    if topic_strategy == "Manual" and n_topics is not None:
        topic_model_params["nr_topics"] = n_topics
    else:
        topic_model_params["nr_topics"] = "auto"

    topic_model = BERTopic(**topic_model_params)

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        if not texts:
            progress_bar.progress(1.0, text=f"No poems found for {country}.")
            continue
        all_emotions = []

        # Stage 1 (0-40%): document embeddings
        embeddings = []
        for i, text in enumerate(texts):
            embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
            embeddings.append(embedding)
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

        embeddings = np.array(embeddings)

        # Stage 2 (40-70%): emotion classification
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        # Stage 3 (70-100%): topic modeling on the precomputed embeddings
        try:
            topics, probs = topic_model.fit_transform(texts, embeddings)

            topic_counts = Counter(topics)
            # Drop the -1 outlier topic from the per-country summary.
            if -1 in topic_counts:
                del topic_counts[-1]

            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")

        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue

    return summaries, topic_model
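
# Note: the same BERTopic instance is re-fitted for each country group, so the
# "Global Topics" tab below reflects whichever country was processed last.
# Illustrative standalone call (hypothetical toy data; models from load_models()):
#   df = pd.DataFrame({'country': ['Egypt'] * 40, 'poem': ['...'] * 40})
#   summaries, topic_model = process_and_summarize(
#       df, bert_tokenizer, bert_model, emotion_classifier, top_n=10
#   )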


try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        # Read the upload into a DataFrame based on its extension
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        # Validate the required columns before doing any work
        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        # Basic cleaning: trim country names and drop incomplete rows
        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])
        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)

        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )

            if topic_strategy == "Manual":
                n_documents = len(df)
                min_topics = 5
                # Cap the topic count by dataset size (roughly one topic per
                # 50 documents, up to 500) without letting the cap fall below
                # the slider minimum on small datasets.
                max_topics = max(min_topics, min(500, n_documents // 50))
                recommended_topics = max_topics // 5
                default_topics = max(min_topics, min(recommended_topics, 50))

                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )

                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)

        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )

            min_topic_size = st.slider(
                "Minimum Topic Size",
                min_value=10,
                max_value=100,
                value=30,
                help="Minimum number of documents required to form a topic"
            )
        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=min_topic_size
                )
                if summaries:
                    st.success("Analysis complete!")

                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])

                    with tab1:
                        for summary in summaries:
                            with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                                col1, col2 = st.columns(2)

                                with col1:
                                    st.subheader("Top Topics")
                                    for topic in summary['top_topics']:
                                        st.write(f"• {topic['topic']}: {topic['count']} poems")

                                with col2:
                                    st.subheader("Emotions")
                                    for emotion in summary['top_emotions']:
                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                    with tab2:
                        st.subheader("Global Topic Distribution")
                        topic_info = topic_model.get_topic_info()
                        for _, row in topic_info.iterrows():
                            if row['Topic'] == -1:
                                topic_name = "Miscellaneous"
                            else:
                                words = topic_model.get_topic(row['Topic'])
                                topic_name = " | ".join([word for word, _ in words[:5]])
                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")

else:
    st.info("👆 Upload a file to get started!")

    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)