File size: 4,639 Bytes
4b4bf72
 
3f0f6de
4b4bf72
b3d1640
3f0f6de
4b4bf72
3f0f6de
 
 
4b4bf72
3f0f6de
 
 
b3d1640
3f0f6de
b3d1640
3f0f6de
 
631c46c
3f0f6de
 
 
 
 
631c46c
3f0f6de
 
 
 
 
 
 
 
631c46c
3f0f6de
5fce9bd
b3d1640
3f0f6de
 
5fce9bd
 
 
 
 
 
 
3f0f6de
5fce9bd
 
7684baa
5fce9bd
 
 
3f0f6de
 
 
 
7684baa
 
3f0f6de
 
4b4bf72
3f0f6de
 
 
 
 
 
 
 
 
7684baa
3f0f6de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fce9bd
3f0f6de
 
5fce9bd
4b4bf72
 
5fce9bd
3f0f6de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fce9bd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import torch
from collections import Counter

# Streamlit re-executes this script on every widget interaction, so the
# heavyweight model objects are built inside a cached loader and reused
# across reruns instead of being reloaded each time.
@st.cache_resource
def _load_models():
    """Load the AraBERT tokenizer, encoder and classification pipeline.

    Returns:
        A 4-tuple ``(tokenizer, encoder, classifier_model, classifier)``
        where ``classifier`` is a ``text-classification`` pipeline built
        from ``classifier_model`` and ``tokenizer``.
    """
    model_name = "aubmindlab/bert-base-arabertv2"

    # Tokenizer and encoder used for embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    # NOTE(review): this loads the same base checkpoint as the encoder, which
    # does not appear to be fine-tuned for emotion classification -- the
    # classification head is presumably freshly initialised and the labels
    # generic. Confirm and point this at an emotion-tuned checkpoint.
    classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline("text-classification", model=classifier_model, tokenizer=tokenizer)
    return tokenizer, encoder, classifier_model, classifier


bert_tokenizer, bert_model, emotion_model, emotion_classifier = _load_models()

# Function to generate embeddings using AraBERT
def generate_embeddings(texts, chunk_size=512):
    """Embed each text with the module-level AraBERT encoder.

    Every text is tokenized on its own (so no padding tokens leak into the
    pooling), split into windows of at most ``chunk_size`` tokens, and each
    window is mean-pooled over the encoder's last hidden state. The window
    vectors are then combined into ONE vector per text, weighted by window
    length, which equals the mean over all of the text's tokens.

    Args:
        texts: A list of strings (a single string is treated as a
            one-element list).
        chunk_size: Maximum number of tokens fed to the encoder at once.
            Defaults to 512.

    Returns:
        A list with exactly one NumPy array of shape ``(1, hidden_size)``
        per input text, in input order. Empty input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:
        # Tokenize without truncation; long texts are windowed below instead.
        # Windows after the first do not start with the tokenizer's special
        # tokens -- they are plain slices of the full token sequence.
        input_ids = bert_tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0]

        window_vectors = []
        window_lengths = []
        for start in range(0, input_ids.shape[0], chunk_size):
            window = input_ids[start:start + chunk_size].unsqueeze(0)  # add batch dimension
            with torch.no_grad():
                outputs = bert_model(window)
            window_vectors.append(outputs.last_hidden_state.mean(dim=1))
            window_lengths.append(window.shape[1])

        # Length-weighted average of the per-window means -> (1, hidden_size)
        stacked = torch.cat(window_vectors, dim=0)
        weights = torch.tensor(window_lengths, dtype=stacked.dtype).unsqueeze(1)
        text_vector = (stacked * weights).sum(dim=0, keepdim=True) / weights.sum()
        embeddings.append(text_vector.numpy())

    return embeddings

# Function to process the uploaded file and summarize by country
def process_and_summarize(uploaded_file, top_n=50):
    """Read an uploaded poem file and summarize topics and emotions by country.

    A single BERTopic model is fitted on all poems (using the AraBERT
    embeddings from ``generate_embeddings``), so the topic ids reported for
    every country refer to the same returned model. Emotions are predicted
    per poem with the module-level ``emotion_classifier``.

    Args:
        uploaded_file: File-like object with a ``name`` attribute ending in
            ``.csv`` or ``.xlsx`` (case-insensitive). It must contain the
            columns ``country`` and ``poem``.
        top_n: Maximum number of topics / emotions kept per country.

    Returns:
        ``(summaries, topic_model)`` where ``summaries`` is a list of dicts
        with keys ``country``, ``total_poems``, ``top_topics`` and
        ``top_emotions`` (the latter two are ``Counter.most_common`` lists),
        and ``topic_model`` is the fitted BERTopic instance. Returns
        ``(None, None)`` after reporting the problem via ``st.error`` when
        the file type is unsupported, required columns are missing, or no
        usable rows remain.

    Raises:
        ValueError: If the number of embeddings does not match the number
            of poems.
    """
    import numpy as np  # local import: numpy is not imported at file level

    # Determine the file type (extension check is case-insensitive)
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif file_name.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None, None

    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None, None

    # Drop incomplete rows first, then normalise the text columns. The copy
    # avoids writing into a view of the original frame.
    df = df.dropna(subset=required_columns).copy()
    df['country'] = df['country'].astype(str).str.strip()
    df['poem'] = df['poem'].astype(str)
    if df.empty:
        st.error("No poems found in the uploaded file.")
        return None, None

    # Fit ONE topic model on every poem so that topic ids are comparable
    # across countries and match the model handed back to the caller.
    all_texts = df['poem'].tolist()
    st.info("Generating embeddings and topics for all poems...")
    embeddings = np.vstack(generate_embeddings(all_texts))
    if embeddings.shape[0] != len(all_texts):
        raise ValueError(
            f"Expected one embedding per poem ({len(all_texts)}), got {embeddings.shape[0]}."
        )
    topic_model = BERTopic()
    # Documents go first; precomputed vectors are passed via `embeddings=`.
    topics, _ = topic_model.fit_transform(all_texts, embeddings=embeddings)
    df['topic'] = list(topics)

    # Group by country
    summaries = []
    for country, group in df.groupby('country'):
        st.info(f"Processing poems for {country}...")
        texts = group['poem'].tolist()

        # Classify emotions; truncate so long poems do not exceed the
        # model's 512-token input limit.
        st.info(f"Classifying emotions for {country}...")
        emotions = [
            emotion_classifier(text, truncation=True, max_length=512)[0]['label']
            for text in texts
        ]

        # Aggregate topics and emotions
        top_topics = Counter(group['topic'].tolist()).most_common(top_n)
        top_emotions = Counter(emotions).most_common(top_n)

        summaries.append({
            'country': country,
            'total_poems': len(texts),
            'top_topics': top_topics,
            'top_emotions': top_emotions,
        })

    return summaries, topic_model

# ---- Streamlit page: upload a file, run the analysis, render the results ----
st.title("Arabic Poem Topic Modeling & Emotion Classification")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    # Any failure while processing or rendering is reported in the page.
    try:
        top_n = st.number_input(
            "Select the number of top topics/emotions to display:",
            min_value=1,
            max_value=100,
            value=50,
        )

        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
        if summaries is not None:
            st.success("Data successfully processed!")

            # One section per country
            for country_summary in summaries:
                country_name = country_summary['country']
                poem_count = country_summary['total_poems']
                st.write(f"### {country_name}")
                st.write(f"Total Poems: {poem_count}")
                st.write(f"Top {top_n} Topics:")
                st.write(country_summary['top_topics'])
                st.write(f"Top {top_n} Emotions:")
                st.write(country_summary['top_emotions'])

            # Topic table from the returned topic model
            st.write("### Global Topic Information:")
            topic_info = topic_model.get_topic_info()
            st.write(topic_info)
    except Exception as exc:
        st.error(f"Error: {exc}")