Spaces:

kambris
/

SoLProject

Runtime error

File size: 4,649 Bytes

4b4bf72
 
b3d1640
4b4bf72
b3d1640
4b4bf72
b3d1640
 
 
4b4bf72
b3d1640
 
 
 
 
 
 
 
ebe462c
b3d1640
ebe462c
b3d1640
 
 
4b4bf72
 
 
 
 
 
 
 
ebe462c
 
 
 
 
 
 
 
4b4bf72
 
 
b3d1640
 
4b4bf72
b3d1640
4b4bf72
 
ebe462c
4b4bf72
 
 
ebe462c
 
4b4bf72
 
 
 
ebe462c
4b4bf72
 
ebe462c

import streamlit as st
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from bertopic import BERTopic
import torch

# Streamlit re-executes this script on every widget interaction, so the heavy
# model loads are wrapped in cached loaders and only run once per process.
@st.cache_resource
def _load_arat5():
    """Load the ARAT5 tokenizer and model used to embed poems for topic modeling."""
    arat5_tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
    arat5_model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
    return arat5_tokenizer, arat5_model


@st.cache_resource
def _load_emotion_classifier():
    """Load the Arabic emotion text-classification pipeline."""
    return pipeline("text-classification", model="d0r13n/ara-bert-base-arabic-emotion")


# Initialize ARAT5 model and tokenizer for topic modeling
tokenizer, model = _load_arat5()

# Emotion classification pipeline for Arabic (use an Arabic emotion classification model)
emotion_classifier = _load_emotion_classifier()

# Function to get embeddings from ARAT5 for topic modeling
def generate_embeddings(texts, batch_size=32):
    """Embed texts with the ARAT5 encoder using attention-masked mean pooling.

    Args:
        texts: A single string or a sequence of strings to embed.
        batch_size: Number of texts sent through the encoder per forward
            pass. Must be a positive integer; smaller values lower peak
            memory use.

    Returns:
        A 2-D NumPy array with one row per input text. An empty input
        yields an array with zero rows.
    """
    # A bare string is treated as a single document rather than being
    # iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    if not texts:
        # Keep the embedding width consistent even when there is nothing to embed.
        return torch.empty((0, model.config.d_model)).numpy()

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Tokenize the Arabic text for ARAT5
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # The attention mask stops the encoder from attending to padding.
            outputs = model.encoder(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
            )
        hidden = outputs.last_hidden_state
        # Average only over real tokens; a plain mean would also average the
        # padding positions and make an embedding depend on its batch mates.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

    return torch.cat(pooled_batches, dim=0).numpy()

# Function to process the CSV file and return emotion and topic model
def process_file(uploaded_file):
    """Load a CSV of poems and label every poem with an emotion and a topic.

    Args:
        uploaded_file: Path or file-like object accepted by ``pd.read_csv``.
            The CSV must contain a 'date' column and a 'poem' column.

    Returns:
        A DataFrame holding the rows that have a poem, with 'date' parsed to
        datetimes (unparseable values become NaT) and three added columns:
        'year', 'emotion' and 'topic'.

    Raises:
        KeyError: If the 'date' or 'poem' column is missing from the CSV.
    """
    # Load CSV
    df = pd.read_csv(uploaded_file)

    # Display basic info about the CSV. The preview is passed as its own
    # argument so Streamlit renders it as a table instead of a text dump.
    st.write("CSV Loaded Successfully!")
    st.write("Data Preview:", df.head())

    missing_columns = [name for name in ('date', 'poem') if name not in df.columns]
    if missing_columns:
        raise KeyError(f"CSV is missing required column(s): {', '.join(missing_columns)}")

    # Ensure 'date' column is in datetime format and extract the year
    df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Replace 'date' with your actual column name
    df['year'] = df['date'].dt.year

    # Drop rows without a poem *from the frame itself* so the per-poem labels
    # computed below line up one-to-one with the remaining rows.
    df = df.dropna(subset=['poem']).reset_index(drop=True)  # Replace 'poem' with your actual column name
    texts = df['poem'].astype(str).tolist()

    if not texts:
        # Nothing to classify or cluster; return the frame with empty label columns.
        df['emotion'] = pd.Series(dtype='object')
        df['topic'] = pd.Series(dtype='int64')
        return df

    # Emotion Classification: Classify emotions for each poem (Arabic).
    # Truncation keeps long poems within the classifier's input limit.
    df['emotion'] = [
        emotion_classifier(text, truncation=True, max_length=512)[0]['label']
        for text in texts
    ]

    # Topic Modeling using ARAT5 embeddings. BERTopic needs the raw documents
    # as the first argument; precomputed vectors go in via `embeddings`.
    embeddings = generate_embeddings(texts)
    topic_model = BERTopic()
    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)
    df['topic'] = topics

    # Return the processed dataframe
    return df

# Streamlit App
st.title("Arabic Poem Topic Modeling & Emotion Classification with ARAT5")
st.write("Upload a CSV file to perform topic modeling and emotion classification on Arabic poems.")

# File upload widget
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

# If file is uploaded, process and display results
if uploaded_file is not None:
    result_df = process_file(uploaded_file)

    # Show date selection widgets
    st.write("### Filter by Date Range")
    valid_dates = result_df['date'].dropna()

    if valid_dates.empty:
        # Without any parseable date there is no range to offer, so show everything.
        st.warning("No valid dates found in the 'date' column; showing all poems.")
        filtered_df = result_df
        st.write("Filtered Data (all poems; no date range available):")
    else:
        earliest = valid_dates.min().date()
        latest = valid_dates.max().date()
        # Explicit bounds let the user pick any day covered by the data rather
        # than being limited to the widget's default window around `value`.
        start_date = st.date_input("Start Date", value=earliest, min_value=earliest, max_value=latest)
        end_date = st.date_input("End Date", value=latest, min_value=earliest, max_value=latest)

        if start_date > end_date:
            st.warning("Start Date is after End Date; no poems match this range.")

        # Filter data based on selected date range. The widgets return
        # datetime.date objects, which cannot be compared with a datetime64
        # column directly, so convert them to Timestamps first. The upper
        # bound is exclusive on the following day to keep the whole end date.
        range_start = pd.Timestamp(start_date)
        range_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)
        in_range = (result_df['date'] >= range_start) & (result_df['date'] < range_end)
        filtered_df = result_df[in_range]

        st.write(f"Filtered Data (Poems from {start_date} to {end_date}):")

    # Display filtered data, limited to the columns the uploaded CSV actually has.
    display_columns = [
        name
        for name in ('poet_name', 'era', 'poem', 'emotion', 'topic', 'date')
        if name in filtered_df.columns
    ]
    st.write(filtered_df[display_columns])

    # Create buttons to show different summaries
    summary_type = st.radio("Select Summary Type:",
                            ("Emotion and Topic Summary by Date Range",
                             "Global Emotion and Topic Summary"))

    # Display the selected summary
    if summary_type == "Emotion and Topic Summary by Date Range":
        st.write("Emotion and Topic Summary for Selected Date Range:")

        # Emotion Distribution in Date Range
        emotion_counts = filtered_df['emotion'].value_counts()
        st.write("Emotion Counts in Date Range:")
        st.write(emotion_counts)

        # Topic Distribution in Date Range
        topic_counts = filtered_df['topic'].value_counts()
        st.write("Topic Counts in Date Range:")
        st.write(topic_counts)

        # Visualize emotion distribution over the selected range (optional)
        st.bar_chart(emotion_counts, use_container_width=True)

        # Visualize topic distribution over the selected range (optional)
        st.bar_chart(topic_counts, use_container_width=True)

    elif summary_type == "Global Emotion and Topic Summary":
        st.write("Global Emotion and Topic Summary (All Poems):")
        global_emotion_count = result_df['emotion'].value_counts().to_dict()
        global_topic_count = result_df['topic'].value_counts().to_dict()

        st.write(f"Emotion Distribution: {global_emotion_count}")
        st.write(f"Topic Distribution: {global_topic_count}")