Update app.py
app.py
CHANGED
@@ -1,50 +1,43 @@
 import streamlit as st
 import pandas as pd
-from transformers import
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
 from bertopic import BERTopic
 import torch
-
+from collections import Counter
 
-#
-
-
+# Load AraBERT tokenizer and model for embeddings
+bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
+bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
 
-#
-
-
+# Load AraBERT model for emotion classification
+emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
+emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
 
-# Function to
+# Function to generate embeddings using AraBERT
 def generate_embeddings(texts):
-    for text in texts:
-        # Tokenize the text with truncation set to False
-        # We are using the BertTokenizer directly without using the pipeline
-        tokens = bert_tokenizer.encode(text, add_special_tokens=True, truncation=False, padding=False)
-
-        # Split the tokens into chunks of size 512 (maximum length)
-        chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
-
-        poem_embeddings = []
-
-        for chunk in chunked_texts:
-            # Convert the chunk to a tensor and prepare the input for BERT model
-            inputs = torch.tensor(chunk).unsqueeze(0)  # Adding batch dimension
-            with torch.no_grad():
-                outputs = bert_model(inputs)
-            # Get the embeddings from the last hidden state (mean of all token embeddings)
-            chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
+    # Tokenize the list of texts using the tokenizer
+    inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
+
+    # Split large sequences into chunks of size 512
+    chunked_inputs = []
+    for input_ids in inputs['input_ids']:
+        chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
+        chunked_inputs.extend(chunks)
+
+    # Process each chunk and get embeddings
+    embeddings = []
+    for chunk in chunked_inputs:
+        input_tensor = torch.tensor(chunk).unsqueeze(0)  # Add batch dimension
+        with torch.no_grad():
+            outputs = bert_model(input_tensor)
+        chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
+        embeddings.append(chunk_embedding)
+
+    # Return the embeddings averaged across chunks
     return embeddings
 
-# Function to process the uploaded file
-def process_file(uploaded_file):
+# Function to process the uploaded file and summarize by country
+def process_and_summarize(uploaded_file, top_n=50):
     # Determine the file type
     if uploaded_file.name.endswith(".csv"):
         df = pd.read_csv(uploaded_file)
@@ -52,39 +45,75 @@ def process_file(uploaded_file):
         df = pd.read_excel(uploaded_file)
     else:
         st.error("Unsupported file format.")
-        return None
+        return None, None
 
     # Validate required columns
     required_columns = ['country', 'poem']
     missing_columns = [col for col in required_columns if col not in df.columns]
     if missing_columns:
         st.error(f"Missing columns: {', '.join(missing_columns)}")
-        return None
-
-    #
+        return None, None
+
+    # Parse and preprocess the file
+    df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
-
-    texts = df['poem'].dropna().tolist()
 
-    #
-    # Perform topic modeling with BERTopic
+    # Group by country
+    summaries = []
     topic_model = BERTopic()
+    for country, group in df.groupby('country'):
+        st.info(f"Processing poems for {country}...")
+
+        # Combine all poems for the country
+        texts = group['poem'].dropna().tolist()
+
+        # Classify emotions
+        st.info(f"Classifying emotions for {country}...")
+        emotions = [emotion_classifier(text)[0]['label'] for text in texts]
+
+        # Generate embeddings and fit topic model
+        st.info(f"Generating embeddings and topics for {country}...")
+        embeddings = generate_embeddings(texts)
+        topics, _ = topic_model.fit_transform(embeddings)
+
+        # Aggregate topics and emotions
+        top_topics = Counter(topics).most_common(top_n)
+        top_emotions = Counter(emotions).most_common(top_n)
+
+        summaries.append({
+            'country': country,
+            'total_poems': len(texts),
+            'top_topics': top_topics,
+            'top_emotions': top_emotions
+        })
+
+    return summaries, topic_model
 
-#
+# Streamlit App Interface
 st.title("Arabic Poem Topic Modeling & Emotion Classification")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
+
 uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 
 if uploaded_file is not None:
     try:
+        top_n = st.number_input("Select the number of top topics/emotions to display:", min_value=1, max_value=100, value=50)
+
+        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
+        if summaries is not None:
+            st.success("Data successfully processed!")
+
+            # Display summary for each country
+            for summary in summaries:
+                st.write(f"### {summary['country']}")
+                st.write(f"Total Poems: {summary['total_poems']}")
+                st.write(f"Top {top_n} Topics:")
+                st.write(summary['top_topics'])
+                st.write(f"Top {top_n} Emotions:")
+                st.write(summary['top_emotions'])
+
+            # Display overall topics
+            st.write("### Global Topic Information:")
+            st.write(topic_model.get_topic_info())
     except Exception as e:
         st.error(f"Error: {e}")
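As committed, `generate_embeddings` pads every poem in the batch to the longest one, chunks the padded ids, and returns one vector per chunk, so PAD tokens are averaged into the output and the returned list can be longer than `texts`. A minimal per-poem sketch, assuming the `bert_tokenizer` and `bert_model` objects loaded above (the helper name `embed_poems` and the NumPy averaging are our additions, not part of the app):

```python
import numpy as np
import torch

def embed_poems(texts, max_len=512):
    """Return exactly one embedding per poem by averaging its chunk embeddings."""
    vectors = []
    for text in texts:
        # Tokenize each poem on its own: no batch padding, so no PAD tokens skew the mean
        ids = bert_tokenizer.encode(text, add_special_tokens=True, truncation=False)
        chunks = [ids[i:i + max_len] for i in range(0, len(ids), max_len)]
        chunk_vecs = []
        for chunk in chunks:
            input_tensor = torch.tensor([chunk])  # shape (1, chunk_len)
            with torch.no_grad():
                outputs = bert_model(input_tensor)
            # Mean-pool the last hidden state over the token dimension
            chunk_vecs.append(outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy())
        # Collapse the chunk vectors so each poem maps to a single vector
        vectors.append(np.mean(chunk_vecs, axis=0))
    return np.vstack(vectors)  # shape (len(texts), hidden_size)
```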
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
|
4 |
from bertopic import BERTopic
|
5 |
import torch
|
6 |
+
from collections import Counter
|
7 |
|
8 |
+
# Load AraBERT tokenizer and model for embeddings
|
9 |
+
bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
|
10 |
+
bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
|
11 |
|
12 |
+
# Load AraBERT model for emotion classification
|
13 |
+
emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
|
14 |
+
emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
|
15 |
|
16 |
+
# Function to generate embeddings using AraBERT
|
17 |
def generate_embeddings(texts):
|
18 |
+
# Tokenize the list of texts using the tokenizer
|
19 |
+
inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
# Split large sequences into chunks of size 512
|
22 |
+
chunked_inputs = []
|
23 |
+
for input_ids in inputs['input_ids']:
|
24 |
+
chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
|
25 |
+
chunked_inputs.extend(chunks)
|
26 |
|
27 |
+
# Process each chunk and get embeddings
|
28 |
+
embeddings = []
|
29 |
+
for chunk in chunked_inputs:
|
30 |
+
input_tensor = torch.tensor(chunk).unsqueeze(0) # Add batch dimension
|
31 |
+
with torch.no_grad():
|
32 |
+
outputs = bert_model(input_tensor)
|
33 |
+
chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
|
34 |
+
embeddings.append(chunk_embedding)
|
35 |
|
36 |
+
# Return the embeddings averaged across chunks
|
37 |
return embeddings
|
38 |
|
39 |
+
# Function to process the uploaded file and summarize by country
|
40 |
+
def process_and_summarize(uploaded_file, top_n=50):
|
41 |
# Determine the file type
|
42 |
if uploaded_file.name.endswith(".csv"):
|
43 |
df = pd.read_csv(uploaded_file)
|
|
|
45 |
df = pd.read_excel(uploaded_file)
|
46 |
else:
|
47 |
st.error("Unsupported file format.")
|
48 |
+
return None, None
|
49 |
|
50 |
# Validate required columns
|
51 |
required_columns = ['country', 'poem']
|
52 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
53 |
if missing_columns:
|
54 |
st.error(f"Missing columns: {', '.join(missing_columns)}")
|
55 |
+
return None, None
|
56 |
+
|
57 |
+
# Parse and preprocess the file
|
58 |
+
df['country'] = df['country'].str.strip()
|
59 |
df = df.dropna(subset=['country', 'poem'])
|
|
|
|
|
60 |
|
61 |
+
# Group by country
|
62 |
+
summaries = []
|
|
|
|
|
63 |
topic_model = BERTopic()
|
64 |
+
for country, group in df.groupby('country'):
|
65 |
+
st.info(f"Processing poems for {country}...")
|
66 |
+
|
67 |
+
# Combine all poems for the country
|
68 |
+
texts = group['poem'].dropna().tolist()
|
69 |
+
|
70 |
+
# Classify emotions
|
71 |
+
st.info(f"Classifying emotions for {country}...")
|
72 |
+
emotions = [emotion_classifier(text)[0]['label'] for text in texts]
|
73 |
|
74 |
+
# Generate embeddings and fit topic model
|
75 |
+
st.info(f"Generating embeddings and topics for {country}...")
|
76 |
+
embeddings = generate_embeddings(texts)
|
77 |
+
topics, _ = topic_model.fit_transform(embeddings)
|
78 |
+
|
79 |
+
# Aggregate topics and emotions
|
80 |
+
top_topics = Counter(topics).most_common(top_n)
|
81 |
+
top_emotions = Counter(emotions).most_common(top_n)
|
82 |
+
|
83 |
+
summaries.append({
|
84 |
+
'country': country,
|
85 |
+
'total_poems': len(texts),
|
86 |
+
'top_topics': top_topics,
|
87 |
+
'top_emotions': top_emotions
|
88 |
+
})
|
89 |
+
|
90 |
+
return summaries, topic_model
|
91 |
+
|
92 |
+
# Streamlit App Interface
|
93 |
st.title("Arabic Poem Topic Modeling & Emotion Classification")
|
94 |
+
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
|
95 |
+
|
96 |
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
|
97 |
|
98 |
if uploaded_file is not None:
|
99 |
try:
|
100 |
+
top_n = st.number_input("Select the number of top topics/emotions to display:", min_value=1, max_value=100, value=50)
|
101 |
+
|
102 |
+
summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
|
103 |
+
if summaries is not None:
|
104 |
+
st.success("Data successfully processed!")
|
105 |
+
|
106 |
+
# Display summary for each country
|
107 |
+
for summary in summaries:
|
108 |
+
st.write(f"### {summary['country']}")
|
109 |
+
st.write(f"Total Poems: {summary['total_poems']}")
|
110 |
+
st.write(f"Top {top_n} Topics:")
|
111 |
+
st.write(summary['top_topics'])
|
112 |
+
st.write(f"Top {top_n} Emotions:")
|
113 |
+
st.write(summary['top_emotions'])
|
114 |
+
|
115 |
+
# Display overall topics
|
116 |
+
st.write("### Global Topic Information:")
|
117 |
+
st.write(topic_model.get_topic_info())
|
118 |
except Exception as e:
|
119 |
st.error(f"Error: {e}")
|
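Relatedly, `topic_model.fit_transform(embeddings)` hands BERTopic raw vectors where it expects the documents themselves. BERTopic's `fit_transform` takes the texts first and accepts precomputed vectors through its `embeddings=` keyword; a sketch using the `embed_poems` helper above:

```python
# One vector per poem keeps topics aligned with texts, and passing the raw
# documents lets BERTopic extract topic words from the poems themselves.
embedding_matrix = embed_poems(texts)  # shape (len(texts), hidden_size)
topics, probs = topic_model.fit_transform(texts, embeddings=embedding_matrix)
```

With one topic per poem, `Counter(topics).most_common(top_n)` then counts against the same population as `total_poems`.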
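Finally, the emotion pipeline loads the base `aubmindlab/bert-base-arabertv2` checkpoint straight into `AutoModelForSequenceClassification`, so the classification head is randomly initialized and the emitted labels (`LABEL_0`, `LABEL_1`) carry no trained meaning; poems longer than 512 tokens will also raise errors because the call sets no truncation. A sketch of the intended setup, where `"some-org/arabert-emotion"` is a placeholder, not a real model id:

```python
from transformers import pipeline

# Placeholder id: substitute a checkpoint actually fine-tuned for Arabic
# emotion classification before running this.
emotion_classifier = pipeline("text-classification", model="some-org/arabert-emotion")

# truncation=True clips inputs at the model's 512-token limit instead of raising
emotions = [emotion_classifier(text, truncation=True)[0]["label"] for text in texts]
```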