# **Music recommender**

# **Load Data**

In [None]:
import pandas as pd
from google.colab import files

# Upload the CSV from the local machine into the Colab runtime.
# files.upload() returns a mapping of uploaded file names to their contents.
uploaded = files.upload()

# Expected file name. When the upload dialog produced a different name
# (and not the expected one), read that file instead of failing on a
# missing "music_data.csv".
# NOTE(review): assumes the keys of `uploaded` are the names the files were
# saved under in the working directory - confirm against the Colab docs.
data_path = "music_data.csv"
if uploaded and data_path not in uploaded:
    data_path = next(iter(uploaded))

# Load the data and discard every row that has at least one missing value.
# The original row labels are kept (no index reset), so gaps in the index
# mark the rows that were dropped.
df = pd.read_csv(data_path).dropna()

# Display the first few rows of the dataset
print(df.head())


Saving music_data.csv to music_data.csv
                                          title  \
0  100 Club 1996 ''We Love You Beatles'' - Live   
1                             Yo Quiero Contigo   
4                                       Emerald   
6                                         Karma   
7                                   Money Blues   

                           release          artist_name   duration  \
0     Sex Pistols - The Interviews          Sex Pistols   88.73751   
1  Sentenciados - Platinum Edition  Baby Rasta & Gringo  167.36608   
4                          Emerald              Bedrock  501.86404   
6         The Diary Of Alicia Keys          Alicia Keys  255.99955   
7                        Slidetime        Joanna Connor  243.66975   

   artist_familiarity  artist_hotttnesss  year  listeners  playcount  \
0            0.731184           0.549204     0        172        210   
1            0.610186           0.355320     0       9753      16911   
4            0.6

In [None]:
# Bare last expression: the notebook renders the first five rows as a table.
df.head()

Unnamed: 0,title,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,listeners,playcount,tags
0,100 Club 1996 ''We Love You Beatles'' - Live,Sex Pistols - The Interviews,Sex Pistols,88.73751,0.731184,0.549204,0,172,210,"The Beatles, title is a full sentence"
1,Yo Quiero Contigo,Sentenciados - Platinum Edition,Baby Rasta & Gringo,167.36608,0.610186,0.35532,0,9753,16911,"Reggaeton, alexis y fido, Eliana, mis videos, ..."
4,Emerald,Emerald,Bedrock,501.86404,0.654039,0.390625,2004,973,2247,dance
6,Karma,The Diary Of Alicia Keys,Alicia Keys,255.99955,0.933916,0.778674,2003,250304,1028356,"rnb, soul, Alicia Keys, female vocalists, Karma"
7,Money Blues,Slidetime,Joanna Connor,243.66975,0.479218,0.332857,0,429,1008,"guitar girl, blues"


In [None]:
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df.info()

# Display summary statistics for numerical columns
print(df.describe())

# Display the number of distinct values for the categorical columns
for column in ('title', 'artist_name', 'tags'):
    print(f"Unique values in '{column}':", df[column].nunique())

<class 'pandas.core.frame.DataFrame'>
Index: 5063 entries, 0 to 9530
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               5063 non-null   object 
 1   release             5063 non-null   object 
 2   artist_name         5063 non-null   object 
 3   duration            5063 non-null   float64
 4   artist_familiarity  5063 non-null   float64
 5   artist_hotttnesss   5063 non-null   float64
 6   year                5063 non-null   int64  
 7   listeners           5063 non-null   int64  
 8   playcount           5063 non-null   int64  
 9   tags                5063 non-null   object 
dtypes: float64(3), int64(3), object(4)
memory usage: 435.1+ KB
None
          duration  artist_familiarity  artist_hotttnesss         year  \
count  5063.000000         5063.000000        5063.000000  5063.000000   
mean    243.156073            0.626861           0.439664  1392.483705   
std     107.732894 

# **Preprocessing**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import joblib
import re

# Normalise free-text fields (tags, artist names) before they are encoded
def clean_text(text):
    """Return a lowercase, letters-only, single-spaced version of `text`.

    Every character that is neither an ASCII letter nor whitespace is
    dropped (digits, punctuation and accented letters included); runs of
    whitespace are then collapsed to one space and the ends are trimmed.

    Args:
        text: String to normalise. Must provide ``.lower()``.

    Returns:
        The normalised string (possibly empty).
    """
    # Lowercase first, then strip everything except letters and whitespace
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Collapse whitespace runs and trim leading/trailing blanks
    return re.sub(r'\s+', ' ', letters_only).strip()

# Normalise the two free-text feature columns in place with clean_text
for text_column in ('tags', 'artist_name'):
    df[text_column] = df[text_column].apply(clean_text)

def label_encode_data(df, columns=('tags', 'title', 'artist_name'), unknown_label='unknown'):
    """Label-encode categorical columns and return the fitted encoders.

    Each column is cast to ``str`` once, and that same string view is used
    both to fit the encoder and to transform the column, so the fitted
    classes and the looked-up values always have the same form.
    `unknown_label` is added to every encoder's classes so callers can map
    unseen values to it later.

    Args:
        df: Input frame; it is not modified (a deep copy is encoded).
        columns: Names of the columns to encode.
        unknown_label: Placeholder class appended to every encoder.

    Returns:
        A tuple ``(encoded_df, label_encoders)`` where ``label_encoders``
        maps each column name to its fitted ``LabelEncoder``.
    """
    encoded = df.copy(deep=True)
    label_encoders = {}

    for column in columns:
        # One string view shared by fit and transform keeps both in agreement
        values = encoded[column].astype(str)
        le = LabelEncoder()
        le.fit(values.unique().tolist() + [unknown_label])
        encoded[column] = le.transform(values)
        label_encoders[column] = le

    return encoded, label_encoders

# Rescale the popularity counters to the [0, 1] range
scaler = MinMaxScaler()
scaler.fit(df[['listeners', 'playcount']])
df[['listeners', 'playcount']] = scaler.transform(df[['listeners', 'playcount']])

# Turn the categorical columns into integer codes (on a copy of df)
df_scaled, label_encoders = label_encode_data(df)

# Persist the fitted encoders and scaler so inference can reuse them
for artifact, destination in (
    (label_encoders, "/content/new_label_encoders.joblib"),
    (scaler, "/content/new_scaler.joblib"),
):
    joblib.dump(artifact, destination)

print("Label encoders and scaler saved successfully.")


Label encoders and scaler saved successfully.


In [None]:
from sklearn.model_selection import train_test_split

# The encoded title is the label; encoded tags and artist are the inputs
y = df_scaled['title']
X = df_scaled[['tags', 'artist_name']]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
print("Data split into training and testing sets.")

# Size of the label space according to the fitted title encoder
num_unique_titles = len(label_encoders['title'].classes_)

# Report the largest label on each side next to the label-space size
print("Maximum value in y_train:", y_train.max())
print("Maximum value in y_test:", y_test.max())
print("Number of unique titles:", num_unique_titles)

# Labels at or beyond the label-space size would be invalid class indices
out_of_bounds_train = y_train[y_train >= num_unique_titles]
out_of_bounds_test = y_test[y_test >= num_unique_titles]

if len(out_of_bounds_train) > 0:
    print("Out-of-bounds values in y_train:", out_of_bounds_train)
if len(out_of_bounds_test) > 0:
    print("Out-of-bounds values in y_test:", out_of_bounds_test)

# Cap every label at the highest valid class index
highest_valid_label = num_unique_titles - 1
y_train = y_train.clip(upper=highest_valid_label)
y_test = y_test.clip(upper=highest_valid_label)

# Show the maxima again after capping
print("Maximum value in y_train after clipping:", y_train.max())
print("Maximum value in y_test after clipping:", y_test.max())


Data split into training and testing sets.
Maximum value in y_train: 4854
Maximum value in y_test: 4850
Number of unique titles: 4855
Maximum value in y_train after clipping: 4854
Maximum value in y_test after clipping: 4850


# **Training**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np

# Fully connected classifier with batch normalisation and dropout
class ImprovedSongRecommender(nn.Module):
    """Three hidden layers (128 -> 256 -> 128) followed by a linear output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.5).
    The output layer emits one raw score (logit) per title class.
    """

    def __init__(self, input_size, num_titles):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            num_titles: Number of output classes.
        """
        super().__init__()
        # Attribute names are also the state_dict keys of saved checkpoints
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.output = nn.Linear(128, num_titles)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class logits for a batch `x` of shape (batch, input_size)."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, batch_norm in hidden_stages:
            x = self.dropout(torch.relu(batch_norm(linear(x))))
        return self.output(x)

# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffling order are repeatable from run to run
# (same seed value as the train/test split).
torch.manual_seed(42)

# Model dimensions: one input per feature column, one output per title class
input_size = X_train.shape[1]  # Number of features in the input
num_unique_titles = len(label_encoders['title'].classes_)  # Number of unique titles including 'unknown'

# Initialize the model with the correct input size and output size
model = ImprovedSongRecommender(input_size, num_unique_titles)

# Initialize the optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Early stopping parameters: stop after `patience` epochs in a row whose
# validation loss did not improve on the best one by more than `min_delta`
patience = 3
min_delta = 0.01
best_val_loss = np.inf
patience_counter = 0

# Function to train the model
def train_model(model, X_train, y_train, X_test, y_test, num_epochs=20, batch_size=10):
    """Train `model` and stop early when the validation loss stalls.

    Relies on the module-level `optimizer`, `criterion`, `scheduler`,
    `patience` and `min_delta`, and reads/updates the module-level
    `best_val_loss` and `patience_counter`.

    Args:
        model: The network to train (left in eval mode on return).
        X_train, X_test: Feature frames; `.values` is cast to float.
        y_train, y_test: Integer class labels aligned with the features.
        num_epochs: Maximum number of epochs to run.
        batch_size: Mini-batch size for both loaders.
    """
    global best_val_loss, patience_counter
    train_loader = DataLoader(
        list(zip(X_train.values.astype(float), y_train)),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        list(zip(X_test.values.astype(float), y_test)),
        batch_size=batch_size,
        shuffle=False,
    )

    for epoch in range(num_epochs):
        # Switch back to training mode at the start of EVERY epoch: the
        # validation phase below puts the model in eval mode, which would
        # otherwise leave dropout off and BatchNorm frozen from epoch 2 on.
        model.train()
        train_loss = 0.0
        train_batches = 0
        for features, labels in train_loader:
            if features.size(0) < 2:
                # BatchNorm1d cannot compute batch statistics from a single
                # sample in training mode, so a 1-sample tail batch is skipped.
                continue
            optimizer.zero_grad()
            outputs = model(features.float())
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1

        # Step the scheduler once per epoch
        scheduler.step()

        # Validation phase (no gradients, dropout off, running BN statistics)
        model.eval()
        validation_loss = 0.0
        with torch.no_grad():
            for features, labels in test_loader:
                outputs = model(features.float())
                loss = criterion(outputs, labels.long())
                validation_loss += loss.item()

        # max(..., 1) avoids a division by zero when a loader yielded nothing
        avg_train_loss = train_loss / max(train_batches, 1)
        avg_val_loss = validation_loss / max(len(test_loader), 1)
        print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')

        # Early stopping: only an improvement larger than min_delta resets the counter
        if avg_val_loss < best_val_loss - min_delta:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

# Run the training loop on the prepared split
train_model(model, X_train, y_train, X_test, y_test)

# Write the learned parameters (state_dict only) to disk
model_path = '/content/improved_model.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)

print("Improved model trained and saved successfully.")


Epoch 1, Training Loss: 8.921830113728841, Validation Loss: 8.836441385979747
Epoch 2, Training Loss: 8.331391870239635, Validation Loss: 9.148561271966672
Epoch 3, Training Loss: 7.494005516429007, Validation Loss: 10.484928570541681
Epoch 4, Training Loss: 6.704833826606657, Validation Loss: 11.745069999320835
Early stopping triggered
Improved model trained and saved successfully.


# **Testing**

In [None]:
import torch
from joblib import load

# Same architecture as the training cell, so the saved weights load into it
class ImprovedSongRecommender(nn.Module):
    """Three hidden layers (128 -> 256 -> 128) followed by a linear output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.5).
    The output layer emits one raw score (logit) per title class.
    """

    def __init__(self, input_size, num_titles):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            num_titles: Number of output classes.
        """
        super().__init__()
        # Attribute names are also the state_dict keys of saved checkpoints
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.output = nn.Linear(128, num_titles)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class logits for a batch `x` of shape (batch, input_size)."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, batch_norm in hidden_stages:
            x = self.dropout(torch.relu(batch_norm(linear(x))))
        return self.output(x)

# Locations of the artifacts written by the preprocessing and training cells
model_path = '/content/improved_model.pth'
label_encoders_path = '/content/new_label_encoders.joblib'
scaler_path = '/content/new_scaler.joblib'

# Load the label encoders and scaler first: the encoders define how many
# title classes the network must output.
# NOTE: joblib files are pickle-based - only load files you created yourself.
label_encoders = load(label_encoders_path)
scaler = load(scaler_path)

# Derive the output size from the title encoder instead of hard-coding it,
# so it always matches the dataset the encoders were fitted on.
num_unique_titles = len(label_encoders['title'].classes_)

# Rebuild the network and load the trained weights on CPU.
# input_size=2 because the model is fed two features: encoded tags and artist.
model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()  # inference mode: dropout off, BatchNorm uses running statistics

# Create a mapping from encoded indices to actual song titles
index_to_song_title = dict(enumerate(label_encoders['title'].classes_))

def _normalize_text(value):
    """Apply the same normalisation the training data went through (clean_text)."""
    import re  # local import keeps this cell independent of the preprocessing imports

    text = "" if value is None else str(value)
    # Lowercase, keep only ASCII letters and whitespace, collapse whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', text).strip()


def _encode_or_unknown(column, value):
    """Encode `value` with the encoder for `column`, falling back to 'unknown'."""
    encoder = label_encoders[column]
    try:
        return encoder.transform([value])[0]
    except ValueError:
        # The encoder rejected a value it was not fitted on
        return encoder.transform(['unknown'])[0]


def encode_input(tags, artist_name):
    """Convert raw user input into the integer codes the model expects.

    The inputs are normalised exactly like the training columns were
    (lowercased, non-letters removed, whitespace collapsed) before the
    lookup; without this, input such as "The Beatles" could never match the
    stored "the beatles" and would always map to 'unknown'.

    Args:
        tags: Tag text entered by the user.
        artist_name: Artist name entered by the user.

    Returns:
        ``[encoded_tags, encoded_artist]``; values the encoders do not know
        are mapped to the code of the 'unknown' placeholder.
    """
    return [
        _encode_or_unknown('tags', _normalize_text(tags)),
        _encode_or_unknown('artist_name', _normalize_text(artist_name)),
    ]

def recommend_songs(tags, artist_name, top_k=5):
    """Return the titles with the highest model scores for the given input.

    Args:
        tags: Tag text describing the kind of music wanted.
        artist_name: Artist name to condition on.
        top_k: Maximum number of titles to return (default 5). It is capped
            at the number of classes the model outputs.

    Returns:
        A list of up to `top_k` song titles, best score first. Indices with
        no entry in `index_to_song_title` are reported as "Unknown song".
    """
    if top_k <= 0:
        return []

    encoded_input = encode_input(tags, artist_name)
    input_tensor = torch.tensor([encoded_input]).float()

    with torch.no_grad():
        output = model(input_tensor)

    # topk raises if k exceeds the number of classes, so cap it
    k = min(top_k, output.shape[-1])
    # Index the single batch row explicitly: squeeze() would turn a k=1
    # result into a bare int instead of a one-element list
    recommendations_indices = torch.topk(output, k).indices[0].tolist()

    return [index_to_song_title.get(idx, "Unknown song") for idx in recommendations_indices]

# Smoke-test the recommender with one example query
tags, artist_name = "rock", "The Beatles"

recommendations = recommend_songs(tags, artist_name)
print("Recommendations:", recommendations)


Recommendations: ['Betrayal Is A Symptom', 'The Earth Will Shake', 'Saturday', 'Firehouse Rock', 'Breathe Easy']


In [None]:
import torch
from joblib import load

# The model class, the loaded weights and encoders, and the helpers
# `encode_input` / `recommend_songs` are all defined by the previous cell.
# They are reused here rather than copied, so there is a single definition
# to maintain and a later copy cannot silently shadow an earlier one.

# Try the recommender with further tag/artist combinations
test_queries = [
    ("pop", "Adele"),
    ("jazz", "Miles Davis"),
]

for tags, artist_name in test_queries:
    recommendations = recommend_songs(tags, artist_name)
    print("Recommendations:", recommendations)


Recommendations: ['Betrayal Is A Symptom', 'Carnival (from "Black Orpheus")', 'Saturday', 'The Earth Will Shake', 'Start!']
Recommendations: ['Old Friends', 'Betrayal Is A Symptom', 'Between Love & Hate', 'Carnival (from "Black Orpheus")', 'Satin Doll']
