"""Streamlit app that predicts which artist wrote a set of lyrics.

The script reads ``training_songs.csv`` and ``test_songs.csv`` (relative
paths), cleans the ``Lyrics`` column, fits (or reloads from ``model.pkl``) a
TF-IDF + ``SGDClassifier`` pipeline, evaluates it on the test set and renders
five tabs: Home, Prediction, Dataset, Visualizations and Model Performance.

Streamlit re-executes this file on every user interaction, so expensive steps
(NLTK downloads, lyric cleaning, model loading/training) sit behind Streamlit
caches.
"""
import os
import re

import joblib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import streamlit as st
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from wordcloud import WordCloud

MODEL_PATH = "model.pkl"
REQUIRED_COLUMNS = ("Lyrics", "Artist")
# Fixed seed so that retraining produces the same resampling and the same model.
RANDOM_STATE = 42
# SMOTE's library default; lowered at training time when a class is too small.
DEFAULT_SMOTE_NEIGHBORS = 5


@st.cache_resource
def ensure_nltk_data():
    """Download the NLTK corpora needed for preprocessing.

    Cached so the download check runs once per server process instead of on
    every script rerun.
    """
    for package in ('stopwords', 'wordnet'):
        nltk.download(package, quiet=True)
    return True


ensure_nltk_data()

st.title("🎤 Lyric Artist Classifier")
st.write("Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!")


# Load datasets
@st.cache_data
def load_data():
    """Read the training and test CSV files and return them as two DataFrames."""
    train_df = pd.read_csv("training_songs.csv")
    test_df = pd.read_csv("test_songs.csv")
    return train_df, test_df


train_df, test_df = load_data()

# Both frames are indexed by these columns below, so validate both up front
# rather than failing later with a KeyError.
for frame in (train_df, test_df):
    if any(column not in frame.columns for column in REQUIRED_COLUMNS):
        st.error("Dataset must contain 'Lyrics' and 'Artist' columns.")
        st.stop()

# Text preprocessing
# Built once per script run instead of once per lyric: constructing the
# lemmatizer and reading the stop-word list for every row is pure overhead.
_NON_LETTERS = re.compile(r'[^a-z\s]')
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one lyric string for vectorisation.

    Lower-cases the text, removes every character that is not ``a-z`` or
    whitespace, drops English stop words and lemmatises the remaining words.

    Args:
        text: Raw lyrics. Non-string values (for example the float NaN that
            pandas uses for an empty CSV cell) are treated as empty lyrics.

    Returns:
        The cleaned words joined by single spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    letters_only = _NON_LETTERS.sub('', text.lower())
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in letters_only.split()
        if word not in _STOP_WORDS
    )


@st.cache_data
def clean_lyrics(lyrics):
    """Apply ``preprocess_text`` to every entry of a lyrics Series.

    Cached on the Series content so the row-by-row cleaning is not repeated
    on every Streamlit rerun.
    """
    return lyrics.apply(preprocess_text)


train_df['Lyrics'] = clean_lyrics(train_df['Lyrics'])
test_df['Lyrics'] = clean_lyrics(test_df['Lyrics'])


# Train model
@st.cache_resource
def train_model():
    """Fit the TF-IDF vectorizer and classifier on the module-level ``train_df``.

    The training set is oversampled with SMOTE, ``alpha`` is chosen by a
    3-fold grid search, and the fitted ``(vectorizer, model)`` pair is written
    to ``MODEL_PATH`` with joblib.

    Returns:
        Tuple ``(vectorizer, best_model)``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
    X_train = vectorizer.fit_transform(train_df['Lyrics'])
    y_train = train_df['Artist']

    # SMOTE needs k_neighbors + 1 samples in every class it resamples; with the
    # default of 5 an artist with five or fewer songs raises ValueError. Shrink
    # k to fit the rarest artist, and skip oversampling when an artist has a
    # single song (there is no neighbour to interpolate with).
    smallest_class = int(y_train.value_counts().min())
    if smallest_class >= 2:
        smote = SMOTE(
            k_neighbors=min(DEFAULT_SMOTE_NEIGHBORS, smallest_class - 1),
            random_state=RANDOM_STATE,
        )
        X_train, y_train = smote.fit_resample(X_train, y_train)

    grid_search = GridSearchCV(
        SGDClassifier(loss='log_loss', max_iter=1000, random_state=RANDOM_STATE),
        {'alpha': [0.0001, 0.001, 0.01]},
        cv=3,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    joblib.dump((vectorizer, best_model), MODEL_PATH)
    return vectorizer, best_model


@st.cache_resource
def load_or_train_model():
    """Return ``(vectorizer, model)``, reusing ``MODEL_PATH`` when it exists.

    Cached so the pickle is not re-read from disk on every rerun.
    """
    if os.path.exists(MODEL_PATH):
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only keep a model.pkl that this app produced.
        return joblib.load(MODEL_PATH)
    return train_model()


vectorizer, model = load_or_train_model()

X_test = vectorizer.transform(test_df['Lyrics'])
y_test = test_df['Artist']
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


def predict_artist(lyrics):
    """Return the three most probable ``(artist, probability)`` pairs for *lyrics*."""
    X_input = vectorizer.transform([preprocess_text(lyrics)])
    predictions = model.predict_proba(X_input)[0]
    ranked = sorted(zip(model.classes_, predictions), key=lambda pair: pair[1], reverse=True)
    return ranked[:3]


# Tabs
tabs = st.tabs(["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"])

with tabs[0]:
    st.header("Welcome to Lyric Artist Classifier!")
    st.write("This AI-powered app predicts the artist of a song based on its lyrics.")
    st.write("The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.")
    st.subheader("Model Performance")
    st.write(f"Current Model Accuracy: **{accuracy:.2f}**")
    st.write("While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.")

with tabs[1]:
    st.header("Predict the Artist!")
    lyrics_input = st.text_area("Enter Lyrics:", height=200)

    if st.button("Predict Artist"):
        if lyrics_input.strip():
            top_artists = predict_artist(lyrics_input)
            st.success("Top Predictions:")
            for artist, prob in top_artists:
                st.write(f"{artist}: {prob:.2f}")
        else:
            st.warning("Please enter some lyrics!")

with tabs[2]:
    st.header("Sample Training Data")
    # 'Song' is not one of the validated columns, so only show it when present.
    display_columns = [
        column for column in ('Artist', 'Song', 'Lyrics') if column in train_df.columns
    ]
    st.dataframe(train_df[display_columns], height=400)

with tabs[3]:
    st.header("Visualizations")

    st.subheader("Artist Distribution")
    fig, ax = plt.subplots(figsize=(8, 6))
    top_artists = train_df['Artist'].value_counts().nlargest(20)
    sns.barplot(x=top_artists.values, y=top_artists.index, palette='coolwarm', ax=ax)
    ax.set_xlabel("Number of Songs")
    ax.set_ylabel("Artist")
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this each
    # rerun would leave another figure in memory.
    plt.close(fig)

    st.subheader("Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)

with tabs[4]:
    st.header("Model Performance")
    st.subheader("Classification Report")
    st.dataframe(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T, height=400)