Spaces:

reysarms
/

song_genre_classification

Running

App Files Files Community

song_genre_classification / app.py

reysarms

revised errors

ab0c289 4 days ago

raw

history blame contribute delete

4.82 kB

	import streamlit as st
	import pandas as pd
	import joblib
	import seaborn as sns
	import matplotlib.pyplot as plt
	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from wordcloud import WordCloud
	from imblearn.over_sampling import SMOTE
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import SGDClassifier
	from sklearn.model_selection import GridSearchCV
	from sklearn.metrics import accuracy_score, classification_report
	import os

	# Download NLTK data
	@st.cache_resource(show_spinner=False)
	def _ensure_nltk_data():
	    """Fetch the NLTK corpora used by the text preprocessing step.

	    Streamlit re-executes this script on every widget interaction, so the
	    downloads are wrapped in a cached function: the body runs once per
	    server process instead of once per rerun.
	    """
	    nltk.download('stopwords')
	    nltk.download('wordnet')


	_ensure_nltk_data()

	st.title("🎤 Lyric Artist Classifier")
	st.write("Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!")

	# Load datasets
	@st.cache_data
	def load_data():
	    """Read the training and test song tables from their CSV files.

	    Returns:
	        A ``(train_df, test_df)`` tuple of DataFrames loaded from
	        ``training_songs.csv`` and ``test_songs.csv`` respectively.
	    """
	    training_frame, test_frame = (
	        pd.read_csv(csv_name)
	        for csv_name in ("training_songs.csv", "test_songs.csv")
	    )
	    return training_frame, test_frame

	train_df, test_df = load_data()

	# Both tables are indexed by 'Lyrics' and 'Artist' further down (the test
	# table is used for evaluation), so check each of them up front and halt
	# with a readable message instead of failing later with a KeyError.
	required_columns = ('Lyrics', 'Artist')
	for frame in (train_df, test_df):
	    if any(column not in frame.columns for column in required_columns):
	        st.error("Dataset must contain 'Lyrics' and 'Artist' columns.")
	        st.stop()

	# Text preprocessing
	# Built once per script run rather than once per lyric: loading the
	# stop-word list and constructing a lemmatizer for every row is pure
	# overhead when this function is applied to a whole DataFrame column.
	_NON_LETTERS = re.compile(r'[^a-z\s]')
	_LEMMATIZER = WordNetLemmatizer()
	_STOP_WORDS = frozenset(stopwords.words('english'))


	def preprocess_text(text):
	    """Normalise raw lyrics into a space-separated string of lemmas.

	    The text is lower-cased, every character other than ``a-z`` and
	    whitespace is removed, English stop words are dropped, and each
	    remaining word is lemmatized.

	    Args:
	        text: The raw lyrics. Anything that is not a ``str`` (for example
	            the NaN pandas uses for an empty CSV cell) is treated as
	            having no lyrics.

	    Returns:
	        The cleaned lyrics, or ``''`` when ``text`` is not a string.
	    """
	    if not isinstance(text, str):
	        return ''
	    cleaned = _NON_LETTERS.sub('', text.lower())
	    return ' '.join(
	        _LEMMATIZER.lemmatize(word)
	        for word in cleaned.split()
	        if word not in _STOP_WORDS
	    )

	# Clean the lyrics column of both tables in place, training table first.
	for lyrics_frame in (train_df, test_df):
	    lyrics_frame['Lyrics'] = lyrics_frame['Lyrics'].apply(preprocess_text)

	# Train model
	@st.cache_resource
	def train_model():
	    """Fit the TF-IDF vectorizer and artist classifier on ``train_df``.

	    The lyrics are vectorized with word 1- to 3-grams, minority artists are
	    oversampled with SMOTE, and an ``SGDClassifier`` with logistic loss is
	    tuned over ``alpha`` with 3-fold cross-validation. The fitted pair is
	    also written to ``model.pkl``.

	    Returns:
	        A ``(vectorizer, best_model)`` tuple.
	    """
	    # Fixed seed so that retraining on the same data yields the same model.
	    seed = 42

	    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
	    features = vectorizer.fit_transform(train_df['Lyrics'])
	    labels = train_df['Artist']

	    # SMOTE interpolates between a sample and its k nearest same-class
	    # neighbours, so k must be smaller than the rarest artist's song count.
	    # Shrink k to fit, and skip oversampling when an artist has one song.
	    rarest_count = int(labels.value_counts().min())
	    neighbours = min(5, rarest_count - 1)
	    if neighbours >= 1:
	        smote = SMOTE(k_neighbors=neighbours, random_state=seed)
	        features, labels = smote.fit_resample(features, labels)

	    model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=seed)
	    param_grid = {'alpha': [0.0001, 0.001, 0.01]}
	    grid_search = GridSearchCV(model, param_grid, cv=3)
	    grid_search.fit(features, labels)
	    best_model = grid_search.best_estimator_

	    joblib.dump((vectorizer, best_model), "model.pkl")
	    return vectorizer, best_model

	# Reuse the persisted (vectorizer, model) pair when model.pkl exists;
	# otherwise train a fresh one.
	vectorizer, model = (
	    joblib.load("model.pkl") if os.path.exists("model.pkl") else train_model()
	)

	# Score the model on the test songs.
	y_test = test_df['Artist']
	X_test = vectorizer.transform(test_df['Lyrics'])
	y_pred = model.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)

	# Tabs
	tab_titles = ["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"]
	tabs = st.tabs(tab_titles)

	# Home tab: introduction followed by the headline accuracy figure.
	with tabs[0]:
	    st.header("Welcome to Lyric Artist Classifier!")
	    for paragraph in (
	        "This AI-powered app predicts the artist of a song based on its lyrics.",
	        "The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.",
	    ):
	        st.write(paragraph)

	    st.subheader("Model Performance")
	    for paragraph in (
	        f"Current Model Accuracy: {accuracy:.2f}",
	        "While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.",
	    ):
	        st.write(paragraph)

	# Prediction tab: free-text lyrics in, three most probable artists out.
	with tabs[1]:
	    st.header("Predict the Artist!")
	    lyrics_input = st.text_area("Enter Lyrics:", height=200)

	    def predict_artist(lyrics):
	        """Return the three most probable ``(artist, probability)`` pairs.

	        The lyrics go through the same preprocessing and vectorizer as the
	        training data before the classifier scores them.
	        """
	        cleaned_lyrics = preprocess_text(lyrics)
	        probabilities = model.predict_proba(vectorizer.transform([cleaned_lyrics]))[0]
	        ranked = list(zip(model.classes_, probabilities))
	        ranked.sort(key=lambda pair: pair[1], reverse=True)
	        return ranked[:3]

	    if st.button("Predict Artist"):
	        if not lyrics_input.strip():
	            st.warning("Please enter some lyrics!")
	        else:
	            best_guesses = predict_artist(lyrics_input)
	            st.success("Top Predictions:")
	            for artist, prob in best_guesses:
	                st.write(f"{artist}: {prob:.2f}")

	# Dataset tab: browse the training table.
	with tabs[2]:
	    st.header("Sample Training Data")
	    # Only 'Lyrics' and 'Artist' are checked when the data is loaded, so
	    # show 'Song' when the table has it instead of raising a KeyError.
	    display_columns = [
	        column for column in ('Artist', 'Song', 'Lyrics')
	        if column in train_df.columns
	    ]
	    st.dataframe(train_df[display_columns], height=400)

	# Visualizations tab: artist frequency chart and a word cloud of the lyrics.
	with tabs[3]:
	    st.header("Visualizations")

	    st.subheader("Artist Distribution")
	    fig, ax = plt.subplots(figsize=(8, 6))
	    top_artists = train_df['Artist'].value_counts().nlargest(20)
	    sns.barplot(x=top_artists.values, y=top_artists.index, palette='coolwarm', ax=ax)
	    ax.set_xlabel("Number of Songs")
	    ax.set_ylabel("Artist")
	    st.pyplot(fig)
	    # pyplot keeps every figure it creates alive until it is closed, and
	    # this script is re-executed on each interaction, so release the
	    # figure once it has been rendered.
	    plt.close(fig)

	    st.subheader("Word Cloud")
	    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
	    fig, ax = plt.subplots()
	    ax.imshow(wordcloud, interpolation='bilinear')
	    ax.axis("off")
	    st.pyplot(fig)
	    plt.close(fig)

	# Model Performance tab: per-class metrics for the test set, one row per
	# entry of the classification report.
	with tabs[4]:
	    st.header("Model Performance")
	    st.subheader("Classification Report")
	    report = classification_report(y_test, y_pred, output_dict=True)
	    report_table = pd.DataFrame(report).T
	    st.dataframe(report_table, height=400)