Bryan-Az
linked mlops from git dir & added evaluation notebook
9dd7d9c
!pip install python-dotenv
# imports
import pandas as pd
import h5py
import os
from sqlalchemy import create_engine
import requests
import time
from dotenv import load_dotenv
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/db/data/music_data.csv')
df.dropna(inplace=True)
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch.optim as optim
# Encode categorical data
label_encoders = {}
unknown_label = 'unknown' # Define an unknown label
for column in ['artist_name', 'tags', 'title']:
le = LabelEncoder()
# Get unique categories plus an 'unknown' category
unique_categories = df[column].unique().tolist()
# Add 'unknown' to the list of categories
unique_categories.append(unknown_label)
# Fit the LabelEncoder to these categories
le.fit(unique_categories)
df[column] = le.transform(df[column].astype(str))
# Store the encoder
label_encoders[column] = le
# Normalize numerical features
scaler = MinMaxScaler()
df[['duration', 'listeners', 'playcount']] = scaler.fit_transform(
df[['duration', 'listeners', 'playcount']])
# Split data into features and target
X = df[['artist_name', 'tags', 'duration', 'listeners', 'playcount']]
y = df['title']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
class SongRecommender(nn.Module):
def __init__(self):
super(SongRecommender, self).__init__()
self.fc1 = nn.Linear(5, 128) # Adjust input features if needed
self.fc2 = nn.Linear(128, 256)
self.fc3 = nn.Linear(256, 128)
# Output size = number of unique titles including 'unknown'
# Add 1 for the 'unknown' label
self.output = nn.Linear(128, len(y.unique()) + 1)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = self.output(x)
return x
model = SongRecommender()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
def train_model(model, X_train, y_train, X_test, y_test):
train_loader = DataLoader(
list(zip(X_train.values.astype(float), y_train)), batch_size=50, shuffle=True)
test_loader = DataLoader(
list(zip(X_test.values.astype(float), y_test)), batch_size=50, shuffle=False)
model.train()
for epoch in range(50): # Number of epochs
train_loss = 0
for features, labels in train_loader:
optimizer.zero_grad()
outputs = model(torch.tensor(features).float())
# Ensure labels are long type
loss = criterion(outputs, torch.tensor(labels).long())
loss.backward()
optimizer.step()
train_loss += loss.item()
# Validation phase
model.eval()
validation_loss = 0
for features, labels in test_loader:
outputs = model(torch.tensor(features).float())
loss = criterion(outputs, torch.tensor(labels).long())
validation_loss += loss.item()
print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}')
train_model(model, X_train, y_train, X_test, y_test)
# save the model
torch.save(model.state_dict(), 'model.pth')
# load the model
model = SongRecommender()
def recommend_songs(model, input_features):
model.eval()
print(input_features)
with torch.no_grad():
try:
artist_index = label_encoders['artist_name'].transform(
[input_features['artist_name']])
except ValueError:
artist_index = label_encoders['artist_name'].transform(['unknown'])
try:
tags_index = label_encoders['tags'].transform(
[input_features['tags']])
except ValueError:
tags_index = label_encoders['tags'].transform(['unknown'])
# Create a DataFrame with feature names
scaled_features = pd.DataFrame(
[[input_features['duration'], input_features['listeners'],
input_features['playcount']]],
columns=['duration', 'listeners', 'playcount']
)
scaled_features = scaler.transform(scaled_features)[0]
features = torch.tensor(
[artist_index[0], tags_index[0], *scaled_features]).float().unsqueeze(0)
predictions = model(features)
top_5_values, top_5_indices = predictions.topk(5)
recommended_song_ids = top_5_indices.squeeze().tolist()
return label_encoders['title'].inverse_transform(recommended_song_ids)
import requests
def fetch_song_data(api_key, artist_name, track_name):
url = "http://ws.audioscrobbler.com/2.0/"
params = {
'method': 'track.getInfo',
'api_key': api_key,
'artist': artist_name,
'track': track_name,
'format': 'json'
}
response = requests.get(url, params=params)
print(response.content)
return response.json() if response.status_code == 200 else {}
def parse_song_data(song_data):
if song_data and 'track' in song_data:
track = song_data['track']
return {
'artist_name': track['artist']['name'],
'tags': ', '.join([tag['name'] for tag in track.get('toptags', {}).get('tag', [])]),
'duration': float(track.get('duration', 0)),
'listeners': int(track.get('listeners', 0)),
'playcount': int(track.get('playcount', 0)),
'album': track.get('album', {}).get('title', 'Unknown')
}
return {}
from dotenv import load_dotenv
import os
load_dotenv()
api_key = os.getenv('LASTFM_API_KEY')
artist_name = 'Lagy Gaga'
track_name = 'Poker Face'
# Fetch and parse song data
song_data = fetch_song_data(api_key, artist_name, track_name)
parsed_data = parse_song_data(song_data)
print(song_data)
# if the song is not found, or the tags column is empty, print a message
if not parsed_data or not parsed_data['tags']:
print("Song not found or tags not available.")
else:
recommend_songs(model, parsed_data)