|
import streamlit as st |
|
import pandas as pd |
|
import joblib |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import re |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from wordcloud import WordCloud |
|
from imblearn.over_sampling import SMOTE |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.linear_model import SGDClassifier |
|
from sklearn.model_selection import GridSearchCV |
|
from sklearn.metrics import accuracy_score, classification_report |
|
import os |
|
|
|
|
|
# Fetch the NLTK corpora that the stop-word filter and the lemmatizer rely on.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Page heading followed by a one-line description of the app.
st.title("🎤 Lyric Artist Classifier")
st.write("Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!")
|
|
|
|
|
@st.cache_data
def load_data(train_path: str = "training_songs.csv", test_path: str = "test_songs.csv"):
    """Read the training and test song tables from CSV files.

    The paths used to be hard-coded; they are now parameters whose defaults
    are the original file names, so ``load_data()`` behaves as before.

    Args:
        train_path: CSV file containing the training songs.
        test_path: CSV file containing the test songs.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file does
            not exist.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df
|
|
|
train_df, test_df = load_data()

# Both frames are read through their 'Lyrics' and 'Artist' columns further down
# (preprocessing, training, evaluation), so check both up front rather than
# only the training set; a malformed test file would otherwise raise KeyError.
required_columns = ('Lyrics', 'Artist')
for dataset_name, frame in (("training", train_df), ("test", test_df)):
    missing_columns = [col for col in required_columns if col not in frame.columns]
    if missing_columns:
        st.error(
            "Dataset must contain 'Lyrics' and 'Artist' columns "
            f"(the {dataset_name} set is missing: {', '.join(missing_columns)})."
        )
        st.stop()
|
|
|
|
|
# Compiled once: matches every character that is not a lowercase letter or whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Holder for the shared lemmatizer and stop-word set. It is filled on first use
# so that they are built once instead of once per preprocessed row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return ``(lemmatizer, stop_words)``, creating them on the first call."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_TOOLS['lemmatizer'], _TEXT_TOOLS['stop_words']


def preprocess_text(text):
    """Normalize raw lyrics into a space-separated string of lemmatized tokens.

    Steps: lowercase, drop every character outside ``a-z`` and whitespace,
    split on whitespace, remove English stop words, lemmatize what is left.

    Args:
        text: Raw lyrics. Non-string values (for example ``None`` or the float
            NaN pandas uses for an empty CSV cell) are treated as empty lyrics
            instead of raising ``AttributeError``.

    Returns:
        The cleaned lyrics as a single string; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    words = _NON_LETTER_RE.sub('', text.lower()).split()
    if not words:
        # Nothing to filter or lemmatize; also avoids touching the corpora.
        return ''

    lemmatizer, stop_words = _get_text_tools()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)
|
|
|
# Replace the raw 'Lyrics' column of each dataset with its cleaned version
# (training set first, then test set).
for frame in (train_df, test_df):
    frame['Lyrics'] = frame['Lyrics'].apply(preprocess_text)
|
|
|
|
|
@st.cache_resource
def train_model():
    """Fit a TF-IDF vectorizer and a grid-searched SGD classifier on ``train_df``.

    Reads the module-level ``train_df`` ('Lyrics' as text, 'Artist' as label),
    oversamples the classes with SMOTE, picks ``alpha`` with a 3-fold grid
    search, and saves the fitted pair to ``model.pkl`` via ``joblib.dump``.

    Returns:
        A ``(vectorizer, best_model)`` tuple: the fitted ``TfidfVectorizer``
        and the best ``SGDClassifier`` found by the grid search.
    """
    # Fixed seed so that retraining on the same data yields the same model.
    random_state = 42

    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
    X_train = vectorizer.fit_transform(train_df['Lyrics'])
    y_train = train_df['Artist']

    # SMOTE synthesizes points between a sample and its k nearest same-class
    # neighbours, so k must be smaller than the rarest class. The default k=5
    # raises ValueError as soon as one artist has five songs or fewer.
    smallest_class = int(y_train.value_counts().min())
    if smallest_class >= 2:
        smote = SMOTE(k_neighbors=min(5, smallest_class - 1), random_state=random_state)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    else:
        # A class with a single song has no neighbour to interpolate with;
        # fall back to the unbalanced data rather than failing outright.
        X_train_resampled, y_train_resampled = X_train, y_train

    model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=random_state)
    param_grid = {'alpha': [0.0001, 0.001, 0.01]}

    # NOTE(review): oversampling before cross-validation lets synthetic samples
    # sit in both the fit and validation folds, so CV scores are optimistic.
    # Moving SMOTE into an imblearn Pipeline inside the grid search would fix
    # that but changes the type of the persisted model -- left as is for now.
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_model = grid_search.best_estimator_

    joblib.dump((vectorizer, best_model), "model.pkl")
    return vectorizer, best_model
|
|
|
# Reuse the persisted (vectorizer, model) pair when model.pkl is present;
# otherwise train a fresh pair.
# NOTE: joblib.load unpickles the file, so only use a model.pkl you trust.
vectorizer, model = joblib.load("model.pkl") if os.path.exists("model.pkl") else train_model()

# Evaluate on the test set; these results feed the Home and Model Performance tabs.
X_test = vectorizer.transform(test_df['Lyrics'])
y_test = test_df['Artist']
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
|
|
|
|
|
# One tab per section of the app; the list order defines the tabs[...] indices.
tab_labels = ["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"]
tabs = st.tabs(tab_labels)

# Home tab: short introduction plus the headline accuracy figure.
with tabs[0]:
    st.header("Welcome to Lyric Artist Classifier!")
    for intro_paragraph in (
        "This AI-powered app predicts the artist of a song based on its lyrics.",
        "The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.",
    ):
        st.write(intro_paragraph)

    st.subheader("Model Performance")
    st.write(f"Current Model Accuracy: **{accuracy:.2f}**")
    st.write("While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.")
|
|
|
# Prediction tab: free-text lyrics in, the most probable artists out.
with tabs[1]:
    st.header("Predict the Artist!")
    lyrics_input = st.text_area("Enter Lyrics:", height=200)

    def predict_artist(lyrics, top_n=3):
        """Return the most probable artists for ``lyrics``.

        Args:
            lyrics: Raw lyrics text; it is cleaned with ``preprocess_text``
                before being vectorized.
            top_n: How many candidates to return (defaults to the original 3).

        Returns:
            A list of ``(artist, probability)`` pairs sorted by descending
            probability. The list is empty when no term of the cleaned lyrics
            is in the vectorizer's vocabulary, because scoring an all-zero
            feature vector would not reflect the input at all.
        """
        X_input = vectorizer.transform([preprocess_text(lyrics)])
        if X_input.nnz == 0:
            return []
        predictions = model.predict_proba(X_input)[0]
        ranked = sorted(zip(model.classes_, predictions), key=lambda pair: pair[1], reverse=True)
        return ranked[:top_n]

    if st.button("Predict Artist"):
        if not lyrics_input.strip():
            st.warning("Please enter some lyrics!")
        else:
            top_artists = predict_artist(lyrics_input)
            if top_artists:
                st.success("Top Predictions:")
                for artist, prob in top_artists:
                    st.write(f"{artist}: {prob:.2f}")
            else:
                # Previously this case still displayed "Top Predictions".
                st.warning("None of the words in these lyrics are known to the model, so no prediction can be made.")
|
|
|
# Dataset tab: show the training table restricted to three columns.
with tabs[2]:
    st.header("Sample Training Data")
    dataset_columns = ['Artist', 'Song', 'Lyrics']
    st.dataframe(train_df[dataset_columns], height=400)
|
|
|
# Visualizations tab: artist frequency bar chart and a word cloud of the lyrics.
with tabs[3]:
    st.header("Visualizations")

    # Horizontal bar chart of the 20 artists with the most training songs.
    st.subheader("Artist Distribution")
    fig, ax = plt.subplots(figsize=(8, 6))
    top_artists = train_df['Artist'].value_counts().nlargest(20)
    sns.barplot(x=top_artists.values, y=top_artists.index, palette='coolwarm', ax=ax)
    ax.set_xlabel("Number of Songs")
    ax.set_ylabel("Artist")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed. The script
    # reruns on each interaction, so unclosed figures would pile up in memory.
    plt.close(fig)

    # Word cloud built from all training lyrics joined into one string.
    st.subheader("Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)  # same reason as above: release the figure once rendered
|
|
|
# Model Performance tab: per-class metrics on the test set as a table.
with tabs[4]:
    st.header("Model Performance")
    st.subheader("Classification Report")
    # output_dict=True yields a nested dict; transposing the resulting frame
    # puts one report entry per row and the metrics in columns.
    report = classification_report(y_test, y_pred, output_dict=True)
    report_table = pd.DataFrame(report).T
    st.dataframe(report_table, height=400)
|
|