reysarms committed on
Commit
ab0c289
·
1 Parent(s): d300c28

revised errors

Browse files
Files changed (5) hide show
  1. app.py +128 -0
  2. model.pkl +3 -0
  3. requirements.txt +6 -1
  4. test_songs.csv +0 -0
  5. training_songs.csv +0 -0
app.py CHANGED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import joblib
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ import re
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk.stem import WordNetLemmatizer
10
+ from wordcloud import WordCloud
11
+ from imblearn.over_sampling import SMOTE
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.linear_model import SGDClassifier
14
+ from sklearn.model_selection import GridSearchCV
15
+ from sklearn.metrics import accuracy_score, classification_report
16
+ import os
17
+
18
# Fetch the NLTK corpora used by the text-preprocessing step.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Page heading and the one-line pitch shown above the tabs.
st.title("🎤 Lyric Artist Classifier")
st.write("Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!")
24
+
25
+ # Load datasets
26
@st.cache_data
def load_data():
    """Read the training and test song tables from CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames read from
        ``training_songs.csv`` and ``test_songs.csv`` respectively.
    """
    training_frame, test_frame = (
        pd.read_csv(csv_name)
        for csv_name in ("training_songs.csv", "test_songs.csv")
    )
    return training_frame, test_frame
31
+
32
train_df, test_df = load_data()

# Both frames are indexed by 'Lyrics' and 'Artist' further down (preprocessing,
# training and evaluation), so check both and stop with a readable message
# rather than letting a KeyError surface later.
_REQUIRED_COLUMNS = ('Lyrics', 'Artist')
for _dataset_label, _frame in (("Training", train_df), ("Test", test_df)):
    _missing = [col for col in _REQUIRED_COLUMNS if col not in _frame.columns]
    if _missing:
        st.error(
            f"{_dataset_label} dataset must contain 'Lyrics' and 'Artist' columns "
            f"(missing: {', '.join(_missing)})."
        )
        st.stop()
37
+
38
+ # Text preprocessing
39
# Hoisted out of preprocess_text() so they are built once per script run
# instead of once per row when the function is used with DataFrame.apply().
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise raw lyrics into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is removed, English stop words are dropped and the remaining
    words are lemmatized with WordNet.

    Args:
        text: Raw lyrics. Non-string values (for example the NaN pandas
            produces for an empty CSV cell) are treated as empty lyrics.

    Returns:
        The cleaned lyrics as a single string; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no .lower().
        return ''
    cleaned = _NON_LETTER_RE.sub('', text.lower())
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in cleaned.split()
        if word not in _STOP_WORDS
    )
47
+
48
# Clean the lyrics column of both datasets in place, training set first.
for _song_frame in (train_df, test_df):
    _song_frame['Lyrics'] = _song_frame['Lyrics'].apply(preprocess_text)
50
+
51
+ # Train model
52
@st.cache_resource
def train_model():
    """Fit the TF-IDF vectorizer and the artist classifier, then persist both.

    Reads the module-level ``train_df`` ('Lyrics' as text, 'Artist' as label),
    oversamples the classes with SMOTE, grid-searches the ``alpha`` of a
    logistic-loss ``SGDClassifier`` with 3-fold cross-validation and writes
    the ``(vectorizer, best_model)`` pair to ``model.pkl``.

    Returns:
        A ``(vectorizer, best_model)`` tuple: the fitted ``TfidfVectorizer``
        and the best estimator found by the grid search.
    """
    # Fixed seed so that retraining yields the same persisted model; SMOTE
    # and SGDClassifier are both randomised when left unseeded.
    seed = 42

    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
    X_train = vectorizer.fit_transform(train_df['Lyrics'])
    y_train = train_df['Artist']

    # NOTE(review): oversampling happens before the cross-validation split, so
    # synthetic samples derived from a validation fold can end up in the
    # training folds and the CV scores are likely optimistic. Left unchanged
    # here; consider moving SMOTE into an imblearn Pipeline.
    smote = SMOTE(random_state=seed)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=seed)
    param_grid = {'alpha': [0.0001, 0.001, 0.01]}
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_model = grid_search.best_estimator_

    # Persist the pair so later runs can load it instead of retraining.
    joblib.dump((vectorizer, best_model), "model.pkl")
    return vectorizer, best_model
66
+
67
# Reuse the persisted (vectorizer, classifier) pair when it is on disk;
# otherwise train a new one.
vectorizer, model = (
    joblib.load("model.pkl") if os.path.exists("model.pkl") else train_model()
)

# Score the model on the held-out test songs.
X_test = vectorizer.transform(test_df['Lyrics'])
y_test = test_df['Artist']
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
76
+
77
+ # Tabs
78
_tab_labels = ["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"]
tabs = st.tabs(_tab_labels)

# Home tab: short introduction plus the accuracy measured on the test set.
_home_tab = tabs[0]
with _home_tab:
    st.header("Welcome to Lyric Artist Classifier!")
    st.write("This AI-powered app predicts the artist of a song based on its lyrics.")
    st.write("The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.")
    st.subheader("Model Performance")
    _accuracy_line = f"Current Model Accuracy: **{accuracy:.2f}**"
    st.write(_accuracy_line)
    st.write("While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.")
87
+
88
# Prediction tab: free-text lyrics in, three most probable artists out.
with tabs[1]:
    st.header("Predict the Artist!")
    lyrics_input = st.text_area("Enter Lyrics:", height=200)

    def predict_artist(lyrics):
        """Return the three most probable ``(artist, probability)`` pairs.

        The lyrics are cleaned with ``preprocess_text``, vectorized with the
        fitted vectorizer and scored with ``model.predict_proba``; the pairs
        are ordered from most to least probable.
        """
        features = vectorizer.transform([preprocess_text(lyrics)])
        probabilities = model.predict_proba(features)[0]
        ranked = sorted(
            zip(model.classes_, probabilities),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked[:3]

    if st.button("Predict Artist"):
        if not lyrics_input.strip():
            st.warning("Please enter some lyrics!")
        else:
            top_artists = predict_artist(lyrics_input)
            st.success("Top Predictions:")
            for artist, prob in top_artists:
                st.write(f"{artist}: {prob:.2f}")
104
+
105
# Dataset tab: show the (already preprocessed) training table.
with tabs[2]:
    st.header("Sample Training Data")
    # Only 'Lyrics' and 'Artist' are checked when the data is loaded, so
    # 'Song' may be absent; show whichever of the three columns exist instead
    # of raising a KeyError.
    _display_columns = [
        col for col in ('Artist', 'Song', 'Lyrics') if col in train_df.columns
    ]
    st.dataframe(train_df[_display_columns], height=400)
108
+
109
# Visualizations tab: artist frequency bar chart and a lyrics word cloud.
with tabs[3]:
    st.header("Visualizations")

    st.subheader("Artist Distribution")
    fig, ax = plt.subplots(figsize=(8, 6))
    # The 20 artists with the most songs in the training data.
    top_artists = train_df['Artist'].value_counts().nlargest(20)
    sns.barplot(x=top_artists.values, y=top_artists.index, palette='coolwarm', ax=ax)
    ax.set_xlabel("Number of Songs")
    ax.set_ylabel("Artist")
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; the script
    # re-executes on every interaction, so close each one after rendering.
    plt.close(fig)

    st.subheader("Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)
124
+
125
# Model Performance tab: per-class metrics on the test set, one row per label.
with tabs[4]:
    st.header("Model Performance")
    st.subheader("Classification Report")
    _report = classification_report(y_test, y_pred, output_dict=True)
    _report_table = pd.DataFrame(_report).transpose()
    st.dataframe(_report_table, height=400)
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32367d3342cdceceba5448bf54a74ae3141471ab8cec8a513c792eaf703d2dbc
3
+ size 5583786
requirements.txt CHANGED
@@ -1,4 +1,9 @@
1
  streamlit
2
  pandas
3
- scikit-learn
4
  joblib
 
 
 
 
 
 
 
1
  streamlit
2
  pandas
 
3
  joblib
4
+ seaborn
5
+ matplotlib
6
+ wordcloud
7
+ scikit-learn
8
+ imblearn
9
+ nltk
test_songs.csv ADDED
The diff for this file is too large to render. See raw diff
 
training_songs.csv ADDED
The diff for this file is too large to render. See raw diff