Spaces:

kambris
/

SoLProject

Runtime error

App Files Files Community

SoLProject / app.py

kambris

Update app.py

ebe462c verified 7 months ago

raw

history blame

4.65 kB

	import streamlit as st
	import pandas as pd
	from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
	from bertopic import BERTopic
	import torch

	# Load the AraT5 encoder (used for topic-model embeddings) and the Arabic
	# emotion classifier exactly once per server process.
	#
	# Streamlit re-executes this script on every widget interaction; without
	# caching, all three Hugging Face objects would be re-instantiated on each
	# rerun. st.cache_resource keeps a single shared instance alive instead.
	@st.cache_resource
	def _load_models():
	    """Return ``(tokenizer, model, emotion_classifier)`` for the app.

	    Returns:
	        A 3-tuple of the AraT5 tokenizer, the AraT5 conditional-generation
	        model (only its encoder is used by this file), and a
	        text-classification pipeline for Arabic emotion labels.
	    """
	    arat5_tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
	    arat5_model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
	    # Inference only: make sure dropout and similar layers are disabled.
	    arat5_model.eval()
	    classifier = pipeline(
	        "text-classification",
	        model="d0r13n/ara-bert-base-arabic-emotion",
	    )
	    return arat5_tokenizer, arat5_model, classifier


	# Module-level names are kept so the rest of the script can use them as before.
	tokenizer, model, emotion_classifier = _load_models()

	# Function to get embeddings from ARAT5 for topic modeling
	def generate_embeddings(texts, batch_size=16):
	    """Embed texts with the AraT5 encoder using masked mean pooling.

	    Each text is tokenized (truncated to 512 tokens), passed through the
	    encoder of the module-level ``model``, and reduced to a single vector by
	    averaging the last hidden states over its real (non-padding) tokens.

	    Args:
	        texts: Iterable of strings to embed.
	        batch_size: Number of texts encoded per forward pass. Encoding in
	            mini-batches keeps peak memory bounded for large CSV files.

	    Returns:
	        A 2-D NumPy array with one row per input text. For empty input the
	        array has zero rows.
	    """
	    texts = list(texts)
	    if not texts:
	        # Nothing to tokenize; return an empty matrix of the right width.
	        # NOTE(review): relies on the T5 config exposing `d_model` — confirm.
	        return torch.empty((0, model.config.d_model)).numpy()

	    pooled_batches = []
	    with torch.no_grad():
	        for start in range(0, len(texts), batch_size):
	            batch = texts[start:start + batch_size]
	            inputs = tokenizer(
	                batch,
	                return_tensors="pt",
	                padding=True,
	                truncation=True,
	                max_length=512,
	            )
	            attention_mask = inputs["attention_mask"]
	            # Pass the mask so padded positions are ignored by attention.
	            outputs = model.encoder(
	                input_ids=inputs["input_ids"],
	                attention_mask=attention_mask,
	            )
	            hidden = outputs.last_hidden_state
	            # Average only over real tokens; a plain mean over the sequence
	            # axis would mix in padding vectors and make an embedding depend
	            # on the length of the other texts in the batch.
	            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
	            token_counts = mask.sum(dim=1).clamp(min=1)
	            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

	    return torch.cat(pooled_batches, dim=0).numpy()

	# Function to process the CSV file and return emotion and topic model
	def process_file(uploaded_file):
	    """Load a poem CSV and annotate every poem with an emotion and a topic.

	    The CSV must contain a ``date`` column and a ``poem`` column. Rows whose
	    poem is missing are dropped so the per-poem results stay aligned with
	    the rows of the returned frame.

	    Args:
	        uploaded_file: File-like object accepted by ``pandas.read_csv``
	            (here, the object returned by ``st.file_uploader``).

	    Returns:
	        A DataFrame with the original columns plus ``year`` (from the
	        parsed ``date``), ``emotion`` (classifier label) and ``topic``
	        (BERTopic topic id).

	    Raises:
	        KeyError: If the ``date`` or ``poem`` column is missing.
	    """
	    df = pd.read_csv(uploaded_file)

	    missing_columns = [name for name in ("date", "poem") if name not in df.columns]
	    if missing_columns:
	        raise KeyError(
	            f"CSV is missing required column(s): {', '.join(missing_columns)}"
	        )

	    # Show a short preview as a real table rather than a DataFrame repr
	    # embedded in markdown text.
	    st.write("CSV Loaded Successfully!")
	    st.write("Data Preview:")
	    st.dataframe(df.head())

	    # Unparseable dates become NaT instead of aborting the whole upload.
	    df['date'] = pd.to_datetime(df['date'], errors='coerce')
	    df['year'] = df['date'].dt.year

	    # Drop rows without a poem *in the frame itself*; dropping them only from
	    # the text list would make the emotion/topic assignments below fail with
	    # a length mismatch.
	    has_poem = df['poem'].notna()
	    if not has_poem.all():
	        st.warning(f"Skipped {int((~has_poem).sum())} row(s) with no poem text.")
	        df = df.loc[has_poem].reset_index(drop=True)

	    texts = df['poem'].astype(str).tolist()
	    if not texts:
	        # Nothing to classify or cluster; return a correctly shaped frame.
	        df['emotion'] = pd.Series(dtype="object")
	        df['topic'] = pd.Series(dtype="int64")
	        return df

	    # Truncate long poems to the classifier's input limit instead of failing.
	    df['emotion'] = [
	        emotion_classifier(text, truncation=True, max_length=512)[0]['label']
	        for text in texts
	    ]

	    # BERTopic needs the raw documents for its topic representations; the
	    # precomputed AraT5 vectors are supplied separately via `embeddings=`.
	    embeddings = generate_embeddings(texts)
	    topic_model = BERTopic()
	    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)
	    df['topic'] = topics

	    return df

	# Streamlit App: upload a CSV, annotate it, then browse the results by date
	# range or as a global summary.
	st.title("Arabic Poem Topic Modeling & Emotion Classification with ARAT5")
	st.write("Upload a CSV file to perform topic modeling and emotion classification on Arabic poems.")

	# File upload widget
	uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

	# If file is uploaded, process and display results
	if uploaded_file is not None:
	    result_df = process_file(uploaded_file)

	    st.write("### Filter by Date Range")
	    valid_dates = result_df['date'].dropna()

	    if valid_dates.empty:
	        # Every date failed to parse: a date picker has no sensible bounds,
	        # so skip filtering instead of crashing on NaT.
	        st.warning("No valid dates were found in the 'date' column; showing all poems.")
	        filtered_df = result_df
	        filtered_heading = "Filtered Data (all poems):"
	    else:
	        earliest = valid_dates.min().date()
	        latest = valid_dates.max().date()
	        start_date = st.date_input(
	            "Start Date", value=earliest, min_value=earliest, max_value=latest
	        )
	        end_date = st.date_input(
	            "End Date", value=latest, min_value=earliest, max_value=latest
	        )

	        if start_date > end_date:
	            st.error("Start Date must not be after End Date.")
	            st.stop()

	        # st.date_input returns datetime.date objects; convert them to
	        # Timestamps before comparing with the datetime64 column. The upper
	        # bound is exclusive at the following midnight so that the whole end
	        # day is included.
	        # NOTE(review): assumes the parsed 'date' column is timezone-naive.
	        range_start = pd.Timestamp(start_date)
	        range_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)
	        in_range = (result_df['date'] >= range_start) & (result_df['date'] < range_end)
	        filtered_df = result_df[in_range]
	        filtered_heading = f"Filtered Data (Poems from {start_date} to {end_date}):"

	    # Display filtered data; optional columns are shown only when present so
	    # a CSV without e.g. 'era' does not raise a KeyError.
	    st.write(filtered_heading)
	    wanted_columns = ['poet_name', 'era', 'poem', 'emotion', 'topic', 'date']
	    st.write(filtered_df[[col for col in wanted_columns if col in filtered_df.columns]])

	    # Let the user choose which summary to show
	    summary_type = st.radio(
	        "Select Summary Type:",
	        (
	            "Emotion and Topic Summary by Date Range",
	            "Global Emotion and Topic Summary",
	        ),
	    )

	    if summary_type == "Emotion and Topic Summary by Date Range":
	        st.write("Emotion and Topic Summary for Selected Date Range:")

	        # Emotion distribution in the selected range
	        emotion_counts = filtered_df['emotion'].value_counts()
	        st.write("Emotion Counts in Date Range:")
	        st.write(emotion_counts)

	        # Topic distribution in the selected range
	        topic_counts = filtered_df['topic'].value_counts()
	        st.write("Topic Counts in Date Range:")
	        st.write(topic_counts)

	        # Bar charts of both distributions
	        st.bar_chart(emotion_counts, use_container_width=True)
	        st.bar_chart(topic_counts, use_container_width=True)

	    elif summary_type == "Global Emotion and Topic Summary":
	        st.write("Global Emotion and Topic Summary (All Poems):")
	        global_emotion_count = result_df['emotion'].value_counts().to_dict()
	        global_topic_count = result_df['topic'].value_counts().to_dict()

	        st.write(f"Emotion Distribution: {global_emotion_count}")
	        st.write(f"Topic Distribution: {global_topic_count}")