Spaces:

reysarms
/

song_genre_classification

Sleeping

App Files Files Community

reysarms committed on 16 days ago

Commit

ab0c289

1 Parent(s): d300c28

revised errors

Browse files

Files changed (5) hide show

app.py +128 -0
model.pkl +3 -0
requirements.txt +6 -1
test_songs.csv +0 -0
training_songs.csv +0 -0

app.py CHANGED Viewed

	@@ -0,0 +1,128 @@

+import streamlit as st
+import pandas as pd
+import joblib
+import seaborn as sns
+import matplotlib.pyplot as plt
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from wordcloud import WordCloud
+from imblearn.over_sampling import SMOTE
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import SGDClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import accuracy_score, classification_report
+import os
+# Fetch the NLTK corpora used by the stop-word filter and the WordNet lemmatizer
+for corpus_name in ('stopwords', 'wordnet'):
+    nltk.download(corpus_name)
+# Page heading and tagline
+st.title("🎤 Lyric Artist Classifier")
+st.write("Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!")
+# Load datasets
+@st.cache_data
+def load_data():
+    """Read the training and test song tables from their CSV files.
+
+    Returns:
+        A ``(train_df, test_df)`` tuple of DataFrames read from
+        ``training_songs.csv`` and ``test_songs.csv`` respectively.
+    """
+    csv_paths = ("training_songs.csv", "test_songs.csv")
+    training_frame, test_frame = (pd.read_csv(path) for path in csv_paths)
+    return training_frame, test_frame
+train_df, test_df = load_data()
+# Both tables are indexed by 'Lyrics' and 'Artist' further down (preprocessing,
+# training, evaluation), so stop with a readable message if either one lacks
+# them instead of failing later with a raw KeyError.
+if 'Lyrics' not in train_df.columns or 'Artist' not in train_df.columns:
+    st.error("Dataset must contain 'Lyrics' and 'Artist' columns.")
+    st.stop()
+if 'Lyrics' not in test_df.columns or 'Artist' not in test_df.columns:
+    st.error("Test dataset must contain 'Lyrics' and 'Artist' columns.")
+    st.stop()
+# Text preprocessing
+# Built once per script run rather than once per lyric: creating the lemmatizer
+# and the stop-word set again for every row is repeated work.
+_LEMMATIZER = WordNetLemmatizer()
+_STOP_WORDS = frozenset(stopwords.words('english'))
+_NON_LETTERS = re.compile(r'[^a-z\s]')
+def preprocess_text(text):
+    """Normalise raw lyrics into a space-separated string of lemmatised words.
+
+    The text is lower-cased, every character other than ``a-z`` and whitespace
+    is removed, English stop words are dropped and each remaining word is
+    lemmatised.
+
+    Args:
+        text: Raw lyrics. Non-string values (for example the NaN that pandas
+            produces for an empty CSV cell) are treated as empty lyrics.
+
+    Returns:
+        The cleaned lyrics as a single string; ``''`` when nothing is left.
+    """
+    # A missing cell arrives as a float NaN, which has no .lower().
+    if not isinstance(text, str):
+        return ''
+    text = _NON_LETTERS.sub('', text.lower())
+    return ' '.join(_LEMMATIZER.lemmatize(word) for word in text.split() if word not in _STOP_WORDS)
+train_df['Lyrics'] = train_df['Lyrics'].apply(preprocess_text)
+test_df['Lyrics'] = test_df['Lyrics'].apply(preprocess_text)
+# Train model
+@st.cache_resource
+def train_model():
+    """Fit the TF-IDF vectorizer and the artist classifier on the training lyrics.
+
+    Reads the module-level ``train_df``, oversamples the vectorised data with
+    SMOTE, picks ``alpha`` for an ``SGDClassifier`` (``loss='log_loss'``) by
+    3-fold grid search and writes the fitted pair to ``model.pkl``.
+
+    Returns:
+        A ``(vectorizer, best_model)`` tuple.
+    """
+    # Fixed seed: SMOTE sampling and SGD shuffling are otherwise random, so
+    # every retrain would yield a different model (and possibly a different
+    # grid-search winner) from the same data.
+    seed = 42
+    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1,3))
+    X_train = vectorizer.fit_transform(train_df['Lyrics'])
+    y_train = train_df['Artist']
+    smote = SMOTE(random_state=seed)
+    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
+    model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=seed)
+    param_grid = {'alpha': [0.0001, 0.001, 0.01]}
+    grid_search = GridSearchCV(model, param_grid, cv=3)
+    grid_search.fit(X_train_resampled, y_train_resampled)
+    best_model = grid_search.best_estimator_
+    # Persist the pair so later runs can load it instead of retraining.
+    joblib.dump((vectorizer, best_model), "model.pkl")
+    return vectorizer, best_model
+# Use the persisted (vectorizer, model) pair when it exists; train otherwise
+vectorizer, model = joblib.load("model.pkl") if os.path.exists("model.pkl") else train_model()
+# Score the model on the test songs
+y_test = test_df['Artist']
+X_test = vectorizer.transform(test_df['Lyrics'])
+y_pred = model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)
+# Tabs
+tab_labels = ["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"]
+tabs = st.tabs(tab_labels)
+# Home tab: short introduction followed by the overall accuracy figure
+with tabs[0]:
+    st.header("Welcome to Lyric Artist Classifier!")
+    for intro_line in (
+        "This AI-powered app predicts the artist of a song based on its lyrics.",
+        "The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.",
+    ):
+        st.write(intro_line)
+    st.subheader("Model Performance")
+    accuracy_text = f"Current Model Accuracy: **{accuracy:.2f}**"
+    st.write(accuracy_text)
+    st.write("While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.")
+with tabs[1]:
+    st.header("Predict the Artist!")
+    lyrics_input = st.text_area("Enter Lyrics:", height=200)
+    def predict_artist(lyrics):
+        """Return up to three ``(artist, probability)`` pairs, most likely first.
+
+        Args:
+            lyrics: Raw lyrics as typed by the user.
+
+        Returns:
+            The three highest-probability ``(artist, probability)`` pairs, or
+            an empty list when no word of the input is in the vectorizer's
+            vocabulary after preprocessing.
+        """
+        X_input = vectorizer.transform([preprocess_text(lyrics)])
+        # An all-zero feature vector carries no information about the input
+        # (e.g. digits only, non-Latin text, only stop words), so any ranking
+        # produced from it would be misleading.
+        if X_input.nnz == 0:
+            return []
+        predictions = model.predict_proba(X_input)[0]
+        top_artists = sorted(zip(model.classes_, predictions), key=lambda x: x[1], reverse=True)[:3]
+        return top_artists
+    if st.button("Predict Artist"):
+        if not lyrics_input.strip():
+            st.warning("Please enter some lyrics!")
+        else:
+            top_artists = predict_artist(lyrics_input)
+            if top_artists:
+                st.success("Top Predictions:")
+                for artist, prob in top_artists:
+                    st.write(f"{artist}: {prob:.2f}")
+            else:
+                st.warning("None of those words are known to the model. Try a longer excerpt of lyrics.")
+with tabs[2]:
+    st.header("Sample Training Data")
+    # Only 'Lyrics' and 'Artist' are checked when the data is loaded; include
+    # 'Song' when the dataset has it rather than raising a KeyError when it
+    # does not.
+    display_columns = [col for col in ('Artist', 'Song', 'Lyrics') if col in train_df.columns]
+    st.dataframe(train_df[display_columns], height=400)
+with tabs[3]:
+    st.header("Visualizations")
+    st.subheader("Artist Distribution")
+    # Horizontal bar chart of the 20 artists with the most training songs
+    fig, ax = plt.subplots(figsize=(8, 6))
+    artist_counts = train_df['Artist'].value_counts().nlargest(20)
+    sns.barplot(x=artist_counts.values, y=artist_counts.index, palette='coolwarm', ax=ax)
+    ax.set_xlabel("Number of Songs")
+    ax.set_ylabel("Artist")
+    st.pyplot(fig)
+    # pyplot keeps every figure it creates until it is closed; the script is
+    # re-executed on each interaction, so close it once it has been rendered.
+    plt.close(fig)
+    st.subheader("Word Cloud")
+    # Word cloud built from all (preprocessed) training lyrics joined together
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
+    fig, ax = plt.subplots()
+    ax.imshow(wordcloud, interpolation='bilinear')
+    ax.axis("off")
+    st.pyplot(fig)
+    plt.close(fig)
+with tabs[4]:
+    st.header("Model Performance")
+    st.subheader("Classification Report")
+    # Build the report as a dict, then transpose so each report entry is a row
+    report = classification_report(y_test, y_pred, output_dict=True)
+    report_table = pd.DataFrame(report).T
+    st.dataframe(report_table, height=400)

model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32367d3342cdceceba5448bf54a74ae3141471ab8cec8a513c792eaf703d2dbc
+size 5583786

requirements.txt CHANGED Viewed

@@ -1,4 +1,9 @@
 streamlit
 pandas
-scikit-learn
 joblib

 streamlit
 pandas
 joblib
+seaborn
+matplotlib
+wordcloud
+scikit-learn
+imblearn
+nltk

test_songs.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

training_songs.csv ADDED Viewed

The diff for this file is too large to render. See raw diff