{"cells":[{"cell_type":"markdown","metadata":{"id":"5ByvVHnFr-s1"},"source":["Get million song subset data song list\n","Get metadata and join the data\n","\n","use artist similarity and artists to train the model on similarity\n","\n","use last.fm to get additional data on each song to augment this"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1048,"status":"ok","timestamp":1715387243880,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"dO4mavAdsELi","outputId":"d0064229-e1a9-4875-e8f1-ee2b19e36855"},"outputs":[{"ename":"ModuleNotFoundError","evalue":"No module named 'google.colab'","output_type":"error","traceback":["\u001b[1;31m---------------------------------------------------------------------------\u001b[0m","\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgoogle\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcolab\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m drive\n\u001b[0;32m 2\u001b[0m drive\u001b[38;5;241m.\u001b[39mmount(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/drive\u001b[39m\u001b[38;5;124m'\u001b[39m)\n","\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'google.colab'"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":7109,"status":"ok","timestamp":1715387250988,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"RktUo1FTsTm4","outputId":"ef21ad0f-4c34-4693-f5de-e6f5617465d7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: python-dotenv in /usr/local/lib/python3.10/dist-packages (1.0.1)\n"]}],"source":["!pip install python-dotenv"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":1591,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"AkHtP67Sr-s2"},"outputs":[],"source":["# imports\n","import pandas as pd\n","import h5py\n","import os\n","from sqlalchemy import create_engine\n","import requests\n","import time\n","from dotenv import load_dotenv"]},{"cell_type":"code","execution_count":4,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"vIJPMBNFr-s3"},"outputs":[],"source":["pd.set_option('display.max_rows', 100)"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"executionInfo":{"elapsed":2,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"kShsPDUoW0Tm","outputId":"5c3cb25e-e38f-4d9d-9102-1e18fd73618a"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'/content'"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["os.getcwd()"]},{"cell_type":"code","execution_count":6,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1715387252577,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"kmdF3bjKW1kz"},"outputs":[],"source":["os.chdir('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/mlops/notebooks')"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":318,"status":"ok","timestamp":1715387252893,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"JKeiMiYMWwpM","outputId":"1eee2de4-7144-45fb-9269-a7eed567bc2d"},"outputs":[{"name":"stdout","output_type":"stream","text":["label_encoder.joblib model_training.ipynb scaler.joblib\t tracks_eda.ipynb\n","model.pth\t README.md\t\t test_spotify_api.ipynb users_eda.ipynb\n"]}],"source":["! ls"]},{"cell_type":"markdown","metadata":{"id":"sk1jv62kr-s3"},"source":["# Loading Data"]},{"cell_type":"markdown","metadata":{"id":"rTCKoervr-s3"},"source":["## Loading million song subset data"]},{"cell_type":"code","execution_count":9,"metadata":{"executionInfo":{"elapsed":162,"status":"ok","timestamp":1715364589548,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"zdtzGTb4r-s3"},"outputs":[],"source":["# load the data (only loading song_id, metadata contains the rest)\n","def read_song_features(file_path):\n"," with h5py.File(file_path, 'r') as f:\n"," song_id = f['metadata']['songs']['song_id'][0].decode('utf-8')\n"," return {'song_id': song_id}\n","\n","\n","# process all files in a directory into a df\n","def process_all_files_to_dataframe(root_dir):\n"," data = []\n"," print(f\"Checking directory: {root_dir}\")\n","\n"," for subdir, dirs, files in os.walk(root_dir):\n"," print(f\"Currently scanning {subdir} with {len(files)} files\")\n"," for file in files:\n"," if file.endswith('.h5'):\n"," file_path = os.path.join(subdir, file)\n"," print(f\"Processing file: {file_path}\")\n"," song_data = read_song_features(file_path)\n"," data.append(song_data)\n","\n"," if not data:\n"," print(\"No data to process.\")\n","\n"," df = pd.DataFrame(data)\n"," return df"]},{"cell_type":"code","execution_count":11,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":163,"status":"ok","timestamp":1715364643278,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"4WPOjZXOr-s3","outputId":"8ebedd7f-7741-4d5f-d46b-8f8399ff1c54"},"outputs":[{"name":"stdout","output_type":"stream","text":["Checking directory: ../../../data/\n","No data to process.\n"]}],"source":["root_dir = 'data/MillionSongSubset'\n","df = process_all_files_to_dataframe(root_dir)"]},{"cell_type":"markdown","metadata":{"id":"cM5Cf9MEr-s3"},"source":["### Loading million song subset metadata from sqlite db"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"JBos93r4r-s3"},"outputs":[],"source":["# load metadata from sqlite\n","def load_data_from_sqlite(db_path, table_name):\n"," engine = create_engine(f'sqlite:///{db_path}')\n"," query = f\"SELECT * FROM {table_name}\"\n"," df = pd.read_sql_query(query, engine)\n"," return df\n","\n","# load metadata and merge with song data\n","db_path3 = 'data/MillionSongSubsetMetadata/track_metadata.db'\n","df3 = load_data_from_sqlite(db_path3, 'songs')\n","df = df.merge(df3, on='song_id', how='left')\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"B2zIZ4T6r-s4"},"outputs":[],"source":["columns_to_drop = ['track_id', 'artist_id', 'song_id', 'artist_mbid', 'track_7digitalid', 'shs_perf', 'shs_work']\n","\n","for column in columns_to_drop:\n"," if column in df.columns:\n"," df.drop(columns=[column], inplace=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"NWP5gZpzr-s4"},"outputs":[],"source":["df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"TcgnhEffr-s4"},"outputs":[],"source":["df.head()"]},{"cell_type":"markdown","metadata":{"id":"9XEtFgNrr-s4"},"source":["## Loading last.fm data"]},{"cell_type":"code","execution_count":8,"metadata":{"executionInfo":{"elapsed":214,"status":"ok","timestamp":1715387284033,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ce8MTL8dr-s4"},"outputs":[],"source":["def fetch_data(api_key, method, params):\n"," base_url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params['api_key'] = api_key\n"," params['method'] = method\n"," params['format'] = 'json'\n"," response = requests.get(base_url, params=params)\n"," return response.json()\n","\n","\n","def get_artist_info(api_key, artist_name):\n"," params = {'artist': artist_name}\n"," return fetch_data(api_key, 'artist.getInfo', params)\n","\n","\n","def get_track_info(api_key, artist_name, track_name):\n"," params = {'artist': artist_name, 'track': track_name}\n"," return fetch_data(api_key, 'track.getInfo', params)\n","\n","\n","def batch_fetch_data(api_key, items, fetch_function, sleep_time=1):\n"," results = []\n"," for item in items:\n"," result = fetch_function(api_key, *item)\n"," results.append(result)\n"," # time.sleep(sleep_time)\n"," return results"]},{"cell_type":"code","execution_count":9,"metadata":{"executionInfo":{"elapsed":1,"status":"ok","timestamp":1715387284241,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"4NqoJJOYr-s4"},"outputs":[],"source":["# load LASTFM_API_KEY from .env\n","import requests\n","load_dotenv()\n","api_key = os.getenv('LASTFM_API_KEY')\n","\n","\n","def fetch_lastfm_data(api_key, artist_name, track_name):\n"," base_url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params = {\n"," 'method': 'track.getInfo',\n"," 'api_key': api_key,\n"," 'artist': artist_name,\n"," 'track': track_name,\n"," 'format': 'json'\n"," }\n"," response = requests.get(base_url, params=params)\n"," if response.status_code == 200 and response.text.strip():\n"," return response.json()\n"," else:\n"," return None\n","\n","\n","def parse_lastfm_data(data):\n"," if data and 'track' in data:\n"," track = data['track']\n"," return {\n"," 'listeners': track.get('listeners', '0'),\n"," 'playcount': track.get('playcount', '0'),\n"," 'tags': ', '.join(tag['name'] for tag in track.get('toptags', {}).get('tag', [])),\n"," }\n"," return None"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":216},"executionInfo":{"elapsed":357,"status":"error","timestamp":1715387298868,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"prQYTEpGr-s4","outputId":"7f04e536-cd64-41d8-861f-1ef035e99cab"},"outputs":[{"ename":"NameError","evalue":"name 'df' is not defined","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mapi_key\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'LASTFM_API_KEY'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msubset_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mtracks_skipped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"]}],"source":["from tqdm import tqdm\n","tqdm.pandas()\n","\n","load_dotenv()\n","api_key = os.getenv('LASTFM_API_KEY')\n","subset_df = df.head(1000)\n","\n","tracks_skipped = 0\n","\n","\n","def fetch_and_parse(row):\n"," global tracks_skipped\n"," data = fetch_lastfm_data(api_key, row['artist_name'], row['title'])\n"," if data is None:\n"," tracks_skipped += 1\n"," return None\n"," parsed_data = parse_lastfm_data(data)\n"," if parsed_data is None:\n"," tracks_skipped += 1\n"," return parsed_data\n","\n","\n","# Use progress_apply instead of apply\n","subset_df['lastfm_data'] = subset_df.progress_apply(fetch_and_parse, axis=1)\n","\n","# Remove rows where lastfm_data is None\n","subset_df = subset_df[subset_df['lastfm_data'].notna()]\n","\n","subset_df.reset_index(drop=True, inplace=True)\n","track_details_df = pd.json_normalize(subset_df['lastfm_data'])\n","mixed = pd.concat(\n"," [subset_df.drop(columns=['lastfm_data']), track_details_df], axis=1)\n","\n","print(f\"Tracks skipped: {tracks_skipped}\")\n","\n","mixed.to_csv('data/music_data_small.csv', index=False)"]},{"cell_type":"markdown","metadata":{"id":"0wxV_-P6r-s5"},"source":["## Data processing"]},{"cell_type":"code","execution_count":191,"metadata":{"executionInfo":{"elapsed":191,"status":"ok","timestamp":1715390440593,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"vMUDiJbjr-s5"},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"..\\..\\db\\data\\music_data.csv\")\n","df.dropna(inplace=True)"]},{"cell_type":"code","execution_count":192,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":258},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1715390440851,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"gHQ3NQr7rBN4","outputId":"6897914c-b362-4ea0-94e0-a5030f95fe88"},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titlereleaseartist_namedurationartist_familiarityartist_hotttnesssyearlistenersplaycounttags
0100 Club 1996 ''We Love You Beatles'' - LiveSex Pistols - The InterviewsSex Pistols88.737510.7311840.5492040172210The Beatles, title is a full sentence
1Yo Quiero ContigoSentenciados - Platinum EditionBaby Rasta & Gringo167.366080.6101860.3553200975316911Reggaeton, alexis y fido, Eliana, mis videos, ...
4EmeraldEmeraldBedrock501.864040.6540390.39062520049732247dance
6KarmaThe Diary Of Alicia KeysAlicia Keys255.999550.9339160.77867420032503041028356rnb, soul, Alicia Keys, female vocalists, Karma
7Money BluesSlidetimeJoanna Connor243.669750.4792180.33285704291008guitar girl, blues
\n","
"],"text/plain":[" title \\\n","0 100 Club 1996 ''We Love You Beatles'' - Live \n","1 Yo Quiero Contigo \n","4 Emerald \n","6 Karma \n","7 Money Blues \n","\n"," release artist_name duration \\\n","0 Sex Pistols - The Interviews Sex Pistols 88.73751 \n","1 Sentenciados - Platinum Edition Baby Rasta & Gringo 167.36608 \n","4 Emerald Bedrock 501.86404 \n","6 The Diary Of Alicia Keys Alicia Keys 255.99955 \n","7 Slidetime Joanna Connor 243.66975 \n","\n"," artist_familiarity artist_hotttnesss year listeners playcount \\\n","0 0.731184 0.549204 0 172 210 \n","1 0.610186 0.355320 0 9753 16911 \n","4 0.654039 0.390625 2004 973 2247 \n","6 0.933916 0.778674 2003 250304 1028356 \n","7 0.479218 0.332857 0 429 1008 \n","\n"," tags \n","0 The Beatles, title is a full sentence \n","1 Reggaeton, alexis y fido, Eliana, mis videos, ... \n","4 dance \n","6 rnb, soul, Alicia Keys, female vocalists, Karma \n","7 guitar girl, blues "]},"execution_count":192,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":193,"metadata":{"executionInfo":{"elapsed":142,"status":"ok","timestamp":1715390441226,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ltW1gvc7r-s5"},"outputs":[],"source":["import pandas as pd\n","import torch\n","from torch.utils.data import DataLoader\n","import torch.nn as nn\n","import torch.nn.functional as F\n","from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n","from sklearn.model_selection import train_test_split\n","import torch.optim as optim\n","\n","def label_encode_data(df):\n"," df = df.copy(deep=True)\n"," # Encode categorical data\n"," label_encoders = {}\n"," unknown_label = 'unknown' # Define an unknown label\n","\n"," for column in ['tags', 'title']:\n"," le = LabelEncoder()\n","\n"," # Get unique categories plus an 'unknown' category\n"," unique_categories = df[column].unique().tolist()\n"," # Add 'unknown' to the list of categories\n"," unique_categories.append(unknown_label)\n","\n"," # Fit the LabelEncoder to these categories\n"," le.fit(unique_categories)\n"," df[column] = le.transform(df[column].astype(str))\n","\n"," # Store the encoder\n"," label_encoders[column] = le\n","\n"," return df, label_encoders\n","\n","\n","# Normalize numerical features\n","scaler = MinMaxScaler()\n","df[['listeners', 'playcount']] = scaler.fit_transform(\n"," df[['listeners', 'playcount']])\n","\n","# Label encode categorical features\n","df_scaled, label_encoder_training = label_encode_data(df)\n","\n","# Split data into features and target\n","X = df_scaled[['tags']]\n","y = df_scaled['title']\n","\n","# Split the dataset into training and testing sets\n","X_train, X_test, y_train, y_test = train_test_split(\n"," X, y, test_size=0.2, random_state=42)"]},{"cell_type":"code","execution_count":194,"metadata":{"executionInfo":{"elapsed":166,"status":"ok","timestamp":1715390465207,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"G3RFJN6Ur-s5"},"outputs":[],"source":["class SongRecommender(nn.Module):\n"," def __init__(self):\n"," super(SongRecommender, self).__init__()\n"," self.fc1 = nn.Linear(1, 128) # Adjust input features if needed\n"," self.fc2 = nn.Linear(128, 256)\n"," self.fc3 = nn.Linear(256, 128)\n"," # Output size = number of unique titles including 'unknown'\n"," # Add 1 for the 'unknown' label\n"," self.output = nn.Linear(128, len(y.unique()) + 1)\n","\n"," def forward(self, x):\n"," x = F.relu(self.fc1(x))\n"," x = F.relu(self.fc2(x))\n"," x = F.relu(self.fc3(x))\n"," x = self.output(x)\n"," return x\n","\n","\n","model = SongRecommender()\n","optimizer = optim.Adam(model.parameters(), lr=0.001)\n","criterion = nn.CrossEntropyLoss()"]},{"cell_type":"code","execution_count":195,"metadata":{"executionInfo":{"elapsed":160,"status":"ok","timestamp":1715390466326,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"-HenNJLnr-s5"},"outputs":[],"source":["def train_model(model, X_train, y_train, X_test, y_test):\n"," train_loader = DataLoader(\n"," list(zip(X_train.values.astype(float), y_train)), batch_size=10, shuffle=True)\n"," test_loader = DataLoader(\n"," list(zip(X_test.values.astype(float), y_test)), batch_size=10, shuffle=False)\n","\n"," model.train()\n"," for epoch in range(10): # Number of epochs\n"," train_loss = 0\n"," for features, labels in train_loader:\n"," optimizer.zero_grad()\n"," outputs = model(torch.tensor(features).float())\n"," # Ensure labels are long type\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"," loss.backward()\n"," optimizer.step()\n"," train_loss += loss.item()\n","\n"," # Validation phase\n"," model.eval()\n"," validation_loss = 0\n"," for features, labels in test_loader:\n"," outputs = model(torch.tensor(features).float())\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"," validation_loss += loss.item()\n","\n"," print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}')"]},{"cell_type":"code","execution_count":196,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":138831,"status":"ok","timestamp":1715390606602,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"gNpxg0ANr-s5","outputId":"c7e9ce0c-3653-4e9a-b3ee-2b9d88da4364"},"outputs":[{"name":"stderr","output_type":"stream","text":["C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:12: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," outputs = model(torch.tensor(features).float())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:14: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," loss = criterion(outputs, torch.tensor(labels).long())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," outputs = model(torch.tensor(features).float())\n","C:\\Users\\Nickk\\AppData\\Local\\Temp\\ipykernel_13264\\1321601871.py:24: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"," loss = criterion(outputs, torch.tensor(labels).long())\n"]},{"name":"stdout","output_type":"stream","text":["Epoch 1, Training Loss: 14.161600421387472, Validation Loss: 8.646175272324506\n","Epoch 2, Training Loss: 8.468926938374837, Validation Loss: 8.906991397633272\n","Epoch 3, Training Loss: 8.42033219749545, Validation Loss: 9.14518429251278\n","Epoch 4, Training Loss: 8.428513119544512, Validation Loss: 9.366180943507775\n","Epoch 5, Training Loss: 8.350075204872791, Validation Loss: 9.573424189698462\n","Epoch 6, Training Loss: 8.334989405267033, Validation Loss: 9.770331466899199\n","Epoch 7, Training Loss: 8.404972361340935, Validation Loss: 9.958629150016636\n","Epoch 8, Training Loss: 8.490517691624017, Validation Loss: 10.354363404068293\n","Epoch 9, Training Loss: 8.405202573611412, Validation Loss: 10.315738350737329\n","Epoch 10, Training Loss: 8.300552919175889, Validation Loss: 10.487916422825233\n"]}],"source":["train_model(model, X_train, y_train, X_test, y_test)"]},{"cell_type":"code","execution_count":197,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SongRecommender(\n"," (fc1): Linear(in_features=1, out_features=128, bias=True)\n"," (fc2): Linear(in_features=128, out_features=256, bias=True)\n"," (fc3): Linear(in_features=256, out_features=128, bias=True)\n"," (output): Linear(in_features=128, out_features=4855, bias=True)\n",")\n"]}],"source":["print(model)"]},{"cell_type":"code","execution_count":198,"metadata":{"executionInfo":{"elapsed":138,"status":"ok","timestamp":1715390703802,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"1i30qNdCr-s5"},"outputs":[],"source":["# save the model\n","torch.save(model.state_dict(), './model.pth')"]},{"cell_type":"code","execution_count":199,"metadata":{"executionInfo":{"elapsed":1,"status":"ok","timestamp":1715390703994,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"iCAMAEj5r-s5"},"outputs":[],"source":["# load the model\n","model = SongRecommender()"]},{"cell_type":"code","execution_count":200,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":292,"status":"ok","timestamp":1715390704465,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"jjkMlHqDHS-Z","outputId":"469d9395-1aa0-4695-98e0-ee867cd31e6b"},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
artist_nametitletagslistenersplaycount
0Sex Pistols100 Club 1996 ''We Love You Beatles'' - LiveThe Beatles, title is a full sentence0.0000700.000009
1Baby Rasta & GringoYo Quiero ContigoReggaeton, alexis y fido, Eliana, mis videos, ...0.0039780.000729
4BedrockEmeralddance0.0003970.000097
6Alicia KeysKarmarnb, soul, Alicia Keys, female vocalists, Karma0.1021030.044359
7Joanna ConnorMoney Bluesguitar girl, blues0.0001750.000043
\n","
"],"text/plain":[" artist_name title \\\n","0 Sex Pistols 100 Club 1996 ''We Love You Beatles'' - Live \n","1 Baby Rasta & Gringo Yo Quiero Contigo \n","4 Bedrock Emerald \n","6 Alicia Keys Karma \n","7 Joanna Connor Money Blues \n","\n"," tags listeners playcount \n","0 The Beatles, title is a full sentence 0.000070 0.000009 \n","1 Reggaeton, alexis y fido, Eliana, mis videos, ... 0.003978 0.000729 \n","4 dance 0.000397 0.000097 \n","6 rnb, soul, Alicia Keys, female vocalists, Karma 0.102103 0.044359 \n","7 guitar girl, blues 0.000175 0.000043 "]},"execution_count":200,"metadata":{},"output_type":"execute_result"}],"source":["df.loc[:, ['artist_name', 'title', 'tags', 'listeners', 'playcount']].head()"]},{"cell_type":"code","execution_count":201,"metadata":{"executionInfo":{"elapsed":186,"status":"ok","timestamp":1715390829249,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"-W3SwScgr-s5"},"outputs":[],"source":["def label_encode_data(df):\n"," df = df.copy(deep=True)\n"," # Encode categorical data\n"," label_encoders = {}\n"," unknown_label = 'unknown' # Define an unknown label\n","\n"," for column in ['tags']:\n"," le = LabelEncoder()\n","\n"," # Get unique categories plus an 'unknown' category\n"," unique_categories = df[column].unique().tolist()\n"," # Add 'unknown' to the list of categories\n"," unique_categories.append(unknown_label)\n","\n"," # Fit the LabelEncoder to these categories\n"," le.fit(unique_categories)\n"," df[column] = le.transform(df[column].astype(str))\n","\n"," # Store the encoder\n"," label_encoders[column] = le\n","\n"," return df, label_encoders\n","\n","\n","def recommend_songs(model, user_data, full_data=df, train_encoder=label_encoder_training):\n"," model.eval()\n"," full_data = full_data.copy(deep=True)\n"," with torch.no_grad():\n","\n"," # Create a DataFrame with feature names\n"," text_features = user_data.loc[:, ['tags']]\n","\n"," # encoding using concatenated full dataset and evaluation set for inference\n"," df = full_data.loc[:, ['tags']]\n"," text_features_full = df.loc[:, ['tags']]\n","\n"," all_labels = pd.concat([text_features, text_features_full], axis=0)\n"," all_labels.reset_index(drop=True, inplace=True)\n","\n"," # Get the encoder based on all categorical features\n"," _, label_encoders = label_encode_data(all_labels)\n","\n"," # encode the user data\n"," label_encoded_data = text_features.copy(deep=True)\n"," for column in ['tags']:\n"," label_encoded_data[column] = label_encoders[column].transform(\n"," label_encoded_data[column].astype(str))\n","\n"," # converting label_encoded_data into a torch tensor as float dtype\n"," all_features = torch.tensor(\n"," label_encoded_data.to_numpy()).float().unsqueeze(0)\n","\n"," # Make predictions\n"," predictions = model(all_features)\n","\n"," predictions = predictions[0, :5, :] # selecting top 5\n"," for row in predictions:\n"," top_5_values, top_5_indices = row.topk(5)\n"," recommended_song_ids = top_5_indices.squeeze().tolist()\n","\n"," try:\n"," recommended_titles = label_encoders['title'].inverse_transform(\n"," recommended_song_ids)\n"," recommended_tags = label_encoders['tags'].inverse_transform(\n"," recommended_song_ids)\n"," except:\n"," recommended_titles = train_encoder['title'].inverse_transform(\n"," recommended_song_ids)[:5]\n"," recommended_tags = train_encoder['tags'].inverse_transform(recommended_song_ids)[\n"," :5]\n","\n"," return list(zip(recommended_titles, recommended_tags))"]},{"cell_type":"code","execution_count":202,"metadata":{},"outputs":[],"source":["user_preferences = pd.read_csv(\"..\\\\..\\\\db\\\\data\\\\user_preferences.csv\")\n","user_preferences.drop('level_0', axis=1, inplace=True)"]},{"cell_type":"code","execution_count":203,"metadata":{"executionInfo":{"elapsed":180,"status":"ok","timestamp":1715388203675,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"o89SnI9-r-s5"},"outputs":[],"source":["import requests\n","\n","\n","def fetch_song_data(api_key, artist_name, track_name):\n"," url = \"http://ws.audioscrobbler.com/2.0/\"\n"," params = {\n"," 'method': 'track.getInfo',\n"," 'api_key': api_key,\n"," 'artist': artist_name,\n"," 'track': track_name,\n"," 'format': 'json'\n"," }\n"," response = requests.get(url, params=params)\n"," print(response.content)\n"," return response.json() if response.status_code == 200 else {}\n","\n","\n","def parse_song_data(song_data):\n"," if song_data and 'track' in song_data:\n"," track = song_data['track']\n"," return {\n"," 'artist_name': track['artist']['name'],\n"," 'tags': ', '.join([tag['name'] for tag in track.get('toptags', {}).get('tag', [])]),\n"," 'duration': float(track.get('duration', 0)),\n"," 'listeners': int(track.get('listeners', 0)),\n"," 'playcount': int(track.get('playcount', 0)),\n"," 'album': track.get('album', {}).get('title', 'Unknown')\n"," }\n"," return {}"]},{"cell_type":"markdown","metadata":{"id":"xm89R7m8Xh-G"},"source":["# Importing the User Data and Making Recommendations\n","Let's make recommendations using the sample user's preferences."]},{"cell_type":"code","execution_count":204,"metadata":{"executionInfo":{"elapsed":171,"status":"ok","timestamp":1715388938035,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"ufzxTjO3YeXT"},"outputs":[],"source":["import numpy as np"]},{"cell_type":"code","execution_count":205,"metadata":{"executionInfo":{"elapsed":379,"status":"ok","timestamp":1715388938616,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"y3hXSZHnXhA7"},"outputs":[],"source":["user_preferences = pd.read_csv(\"..\\\\..\\\\db\\\\data\\\\user_preferences.csv\")\n","user_preferences.drop('level_0', axis=1, inplace=True)"]},{"cell_type":"code","execution_count":206,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":293},"executionInfo":{"elapsed":232,"status":"ok","timestamp":1715388938847,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"OU7aafogGu2t","outputId":"27703100-6719-44ca-8429-cec20e133bbe"},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
songIDartistsonglinktextuserIDlistenersplaycounttags
019632TotoYou Are The Flower/t/toto/you+are+the+flower_20139737.htmlYou never lose a minute, if in it there is lov...02530787344AOR, rock, soft rock, 70s, pop rock
119632TotoYou Are The Flower/t/toto/you+are+the+flower_20139737.htmlYou never lose a minute, if in it there is lov...02530787344AOR, rock, soft rock, 70s, pop rock
225284Billie HolidayI Only Have Eyes For You/b/billie+holiday/i+only+have+eyes+for+you_200...My love must be a kind of blind love, \\r\\nI c...060356178625jazz, female vocal, vocal jazz, blues, female ...
343594Michael BoltonOnly A Woman Like You/m/michael+bolton/only+a+woman+like+you_101792...It's beautiful, your honesty \\r\\nYou cry when...0459513266Ballad, romantic, soul, pop, cool
450200Rascal FlattsThe Day Before You/r/rascal+flatts/the+day+before+you_10238985.htmlI had all but given up \\r\\nOn finding the one...02207786012country, rock, contemporary country, seen live...
\n","
"],"text/plain":[" songID artist song \\\n","0 19632 Toto You Are The Flower \n","1 19632 Toto You Are The Flower \n","2 25284 Billie Holiday I Only Have Eyes For You \n","3 43594 Michael Bolton Only A Woman Like You \n","4 50200 Rascal Flatts The Day Before You \n","\n"," link \\\n","0 /t/toto/you+are+the+flower_20139737.html \n","1 /t/toto/you+are+the+flower_20139737.html \n","2 /b/billie+holiday/i+only+have+eyes+for+you_200... \n","3 /m/michael+bolton/only+a+woman+like+you_101792... \n","4 /r/rascal+flatts/the+day+before+you_10238985.html \n","\n"," text userID listeners \\\n","0 You never lose a minute, if in it there is lov... 0 25307 \n","1 You never lose a minute, if in it there is lov... 0 25307 \n","2 My love must be a kind of blind love, \\r\\nI c... 0 60356 \n","3 It's beautiful, your honesty \\r\\nYou cry when... 0 4595 \n","4 I had all but given up \\r\\nOn finding the one... 0 22077 \n","\n"," playcount tags \n","0 87344 AOR, rock, soft rock, 70s, pop rock \n","1 87344 AOR, rock, soft rock, 70s, pop rock \n","2 178625 jazz, female vocal, vocal jazz, blues, female ... \n","3 13266 Ballad, romantic, soul, pop, cool \n","4 86012 country, rock, contemporary country, seen live... "]},"execution_count":206,"metadata":{},"output_type":"execute_result"}],"source":["user_preferences.head()"]},{"cell_type":"code","execution_count":216,"metadata":{"executionInfo":{"elapsed":166,"status":"ok","timestamp":1715388941345,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"jVeX1VX9YXj9"},"outputs":[],"source":["sample_user = user_preferences.where(user_preferences['userID'] == np.random.randint(*(0, 9))).dropna()"]},{"cell_type":"markdown","metadata":{"id":"Yu9C90x4Y9lY"},"source":["Hopefully, the neural network makes recommendations of artists that fall into the top 5 for the user."]},{"cell_type":"code","execution_count":217,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":148,"status":"ok","timestamp":1715391053989,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"wnANm0R3YrVa","outputId":"9f890890-055b-41b9-c630-8986442dcaf9"},"outputs":[{"data":{"text/plain":["tags\n","romantic, Love, pop, easy listening, michael bolton 5.0\n","pop 4.0\n","loneliness after dusk, Madonna, demo, never let you go, rebel heart 3.0\n","alternative rock, pop, alternative, pop rock, OneRepublic 3.0\n","pop, boybands, dance, backstreet boys, love at first listen 3.0\n","dtype: float64"]},"execution_count":217,"metadata":{},"output_type":"execute_result"}],"source":["top_5 = sample_user.groupby('tags').count().mean(axis=1).sort_values(ascending=False)[:5]\n","top_5"]},{"cell_type":"code","execution_count":218,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":397,"status":"ok","timestamp":1715390877851,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"m9k_0jVCr-s5","outputId":"9610f359-503f-4b06-d36a-f07d4a79a687"},"outputs":[{"name":"stdout","output_type":"stream","text":["#### RECOMMENDATIONS ###\n"]},{"data":{"text/plain":["[('Blueberry Hill', 'Hip-Hop, hip hop, rap, underground hip-hop, political'),\n"," ('Prognosis', 'mpb, pop, 80s, latin, California'),\n"," ('Money Blues', 'hip hop, rap, Hip-Hop, LL Cool J, Timbaland'),\n"," ('Facedown', 'blues, Old Blues, guitar, slide guitar, gospel'),\n"," ('CB4', 'Kanye West, rnb, 00s, janet jackson, pop')]"]},"execution_count":218,"metadata":{},"output_type":"execute_result"}],"source":["print(\"#### RECOMMENDATIONS ###\")\n","song_recs = recommend_songs(model, sample_user, df) # requires giving main song df for finding embeddings\n","song_recs"]},{"cell_type":"code","execution_count":210,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":276},"executionInfo":{"elapsed":219,"status":"ok","timestamp":1715390907498,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"Klx_gv2v4i6x","outputId":"c2b4dd3e-a48e-411b-9a2c-7963ef922075"},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titlereleaseartist_namedurationartist_familiarityartist_hotttnesssyearlistenersplaycounttags
\n","
"],"text/plain":["Empty DataFrame\n","Columns: [title, release, artist_name, duration, artist_familiarity, artist_hotttnesss, year, listeners, playcount, tags]\n","Index: []"]},"execution_count":210,"metadata":{},"output_type":"execute_result"}],"source":["# finding the song artist in the main dataset\n","df.loc[df['title'].isin(song_recs)]"]},{"cell_type":"code","execution_count":211,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"elapsed":428,"status":"ok","timestamp":1715391084128,"user":{"displayName":"Bryan Alexis Ambriz","userId":"16154433038435291108"},"user_tz":420},"id":"AmOx_KyQRALU","outputId":"2010ab1c-ea64-4a00-8dc6-186d584d4868"},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
songIDartistsonglinktextuserIDlistenersplaycounttags
\n","
"],"text/plain":["Empty DataFrame\n","Columns: [songID, artist, song, link, text, userID, listeners, playcount, tags]\n","Index: []"]},"execution_count":211,"metadata":{},"output_type":"execute_result"}],"source":["# lets see how it compares to sample user\n","sample_user.where(sample_user['artist'].isin(top_5.index)).dropna()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"rqkW9oKPRe05"},"outputs":[],"source":[]}],"metadata":{"colab":{"collapsed_sections":["rTCKoervr-s3","9XEtFgNrr-s4"],"provenance":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}