# Streamlit app: topic modeling of Arabic poems using AraBERT embeddings and BERTopic.
import streamlit as st
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from bertopic import BERTopic
import torch
import numpy as np

# Load the AraT5 tokenizer and seq2seq model at import time.
# NOTE(review): neither `tokenizer` nor `model` is referenced anywhere else in
# the visible code; the embeddings below come from AraBERT, not AraT5. Confirm
# whether these two (large) downloads are still needed.
tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")

# Build a Hugging Face "feature-extraction" pipeline around AraBERT v2.
# Despite its name, `bert_tokenizer` is the whole pipeline object: the code
# below reaches into it via `bert_tokenizer.tokenizer` and `bert_tokenizer.model`.
# NOTE(review): presumably Streamlit re-executes this module on every
# interaction, which would reload the models each time — consider caching; verify.
bert_tokenizer = pipeline("feature-extraction", model="aubmindlab/bert-base-arabertv2")

# Function to get mean-pooled AraBERT embeddings for topic modeling
def generate_embeddings(texts, max_length=512):
    """Embed each text with the AraBERT model held by ``bert_tokenizer``.

    Texts longer than the model limit are split into consecutive chunks of
    token ids. Each chunk is wrapped in the tokenizer's special tokens, run
    through the model, and mean-pooled over its tokens; the chunk vectors are
    then averaged (unweighted) into a single vector per text.

    Args:
        texts: Iterable of strings to embed.
        max_length: Maximum number of token ids per model call, special
            tokens included. Defaults to 512.

    Returns:
        A list with one ``numpy.ndarray`` of shape ``(1, hidden_size)`` per
        input text, in input order. An empty input yields an empty list.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content tokens
            once the special tokens are accounted for.
    """
    texts = list(texts)
    if not texts:
        return []

    tokenizer = bert_tokenizer.tokenizer
    encoder = bert_tokenizer.model

    # Every chunk gets its own special tokens, so reserve room for them.
    content_length = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    if content_length <= 0:
        raise ValueError("max_length is too small to hold any content tokens.")

    embeddings = []
    for text in texts:
        # Encode once WITHOUT special tokens; they are re-added per chunk so
        # that each chunk is a well-formed model input.
        token_ids = tokenizer.encode(text, add_special_tokens=False, truncation=False)

        chunk_embeddings = []
        # max(..., 1) keeps one (empty) chunk for an empty text so it still
        # receives an embedding instead of the NaN produced by averaging nothing.
        for start in range(0, max(len(token_ids), 1), content_length):
            chunk_ids = tokenizer.build_inputs_with_special_tokens(
                token_ids[start:start + content_length]
            )
            # Feed the ids directly: re-tokenizing a list of ints is invalid,
            # the tokenizer only accepts strings.
            input_ids = torch.tensor([chunk_ids], device=encoder.device)
            attention_mask = torch.ones_like(input_ids)
            with torch.no_grad():
                outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
            # Mean over the sequence axis -> (1, hidden_size); move to CPU
            # before converting so this also works when the model is on GPU.
            chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())

        # Average the chunk vectors into one vector for the whole text.
        embeddings.append(np.mean(np.array(chunk_embeddings), axis=0))

    return embeddings

# Function to process the CSV or Excel file
def process_file(uploaded_file):
    """Read an uploaded poem table and assign a BERTopic topic to each poem.

    The file must contain ``country`` and ``poem`` columns. Rows missing
    either value are dropped, the remaining poems are embedded with
    ``generate_embeddings`` and clustered with BERTopic.

    Args:
        uploaded_file: File-like object with a ``name`` attribute ending in
            ``.csv`` or ``.xlsx`` (case-insensitive), readable by pandas.

    Returns:
        The filtered DataFrame with an added ``topic`` column, or ``None``
        when the input is rejected (the reason is reported via ``st.error``).
    """
    # Determine the file type; compare case-insensitively so "DATA.CSV" works.
    filename = uploaded_file.name.lower()
    if filename.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif filename.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None

    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None

    # Keep only rows that have both a country and a poem.
    df = df.dropna(subset=required_columns)
    if df.empty:
        st.error("No rows with both 'country' and 'poem' values were found.")
        return None

    # Coerce to str so a stray numeric cell does not break tokenization.
    texts = df['poem'].astype(str).tolist()

    # Generate embeddings for all poems and stack them into the 2-D
    # (n_documents, n_features) array BERTopic expects; the reshape accepts
    # per-document vectors of shape (n_features,) or (1, n_features).
    embeddings = generate_embeddings(texts)
    embedding_matrix = np.asarray(embeddings).reshape(len(texts), -1)

    # Perform topic modeling with BERTopic. The documents are the first
    # argument; the precomputed vectors go in through `embeddings=`.
    topic_model = BERTopic()
    topics, _ = topic_model.fit_transform(texts, embeddings=embedding_matrix)
    df['topic'] = topics

    return df

# Streamlit App
# Page flow: show the title, offer an upload widget limited to CSV/XLSX files,
# and once a file is supplied run it through process_file and preview the result.
# NOTE(review): the title mentions emotion classification, but no such step is
# visible in this file — confirm whether it lives elsewhere or the title is stale.
st.title("Arabic Poem Topic Modeling & Emotion Classification")
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        # process_file returns None after reporting a validation problem itself
        # (unsupported format, missing columns), so only a real result is shown.
        result_df = process_file(uploaded_file)
        if result_df is not None:
            st.write("Data successfully processed!")
            # Preview only the first rows of the processed table.
            st.write(result_df.head())
    except Exception as e:
        # Top-level boundary: any failure while reading, embedding, or topic
        # modeling is shown on the page instead of propagating.
        st.error(f"Error: {e}")