Spaces:
Runtime error
Runtime error
File size: 2,884 Bytes
bfaf419 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# -*- coding: utf-8 -*-
"""preprocessed_data
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ssqIgQzcaBw12nyIoBSmaoWGZoaBPPHR
"""
import pandas as pd
from google.colab import files
# NOTE: this section only runs inside a Colab/IPython notebook. Lines that start
# with `!` are IPython shell escapes, and `display` is supplied by the notebook
# environment; neither is available in a plain Python interpreter.

# Open the Colab browser upload widget. The shell commands below copy
# kaggle.json, so that file is expected to be among the uploaded files.
files.upload()
# Put the uploaded Kaggle API token under ~/.kaggle/ (where the kaggle CLI is
# expected to look for it) and make it readable/writable by the owner only.
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the dataset archive; the extract call below expects it in the
# current working directory as spotgen-music-dataset.zip.
!kaggle datasets download -d saurabhshahane/spotgen-music-dataset
# patool is installed at runtime, so its import has to come after the install.
!pip install patool
import patoolib
patoolib.extract_archive('spotgen-music-dataset.zip')
# Load the three CSV tables used by the rest of the script from the extracted
# archive.
data_dir = "SpotGenTrack/Data Sources/"
albums_data = pd.read_csv(data_dir + "spotify_albums.csv")
artists_data = pd.read_csv(data_dir + "spotify_artists.csv")
tracks_data = pd.read_csv(data_dir + "spotify_tracks.csv")
# Quick visual inspection of each table. A bare `.columns` expression is only
# echoed by a notebook when it is the last expression of a cell; as a script
# statement it has no effect.
display(albums_data.head())
albums_data.columns
display(artists_data.head())
artists_data.columns
display(tracks_data.head())
tracks_data.columns
def join_genre_and_date(artist_df, album_df, track_df):
    """Attach album release dates and artist name/genres to the track table.

    Steps, in order:

    1. Look up each track's ``release_date`` through its ``album_id``
       (left join against the album table's ``id``).
    2. Trim the first two and last two characters of ``artists_id``
       (presumably unwrapping a one-element list literal such as
       ``"['abc']"`` -- confirm against the CSV) and use the result to look
       up ``artists_name`` and ``genres`` in the artist table (left join).
    3. Derive ``release_year`` from ``release_date``.
    4. Drop the columns that are not needed downstream.
    5. Keep only rows whose ``release_year`` is 1990 or later.

    ``album_id`` only serves as the join key: it is moved into the index for
    the first join and discarded when the index is replaced by ``artists_id``,
    so it is not part of the result. The input frames are not modified.

    Returns the filtered track frame; its index is *not* renumbered after the
    year filter.
    """
    unwanted_columns = ['Unnamed: 0', 'country', 'track_name_prev', 'track_number', 'type']

    # Lookup tables keyed by the ids the track table refers to.
    release_dates = (
        album_df.rename(columns={'id': "album_id"})
        .set_index('album_id')['release_date']
    )
    artist_info = (
        artist_df.rename(columns={'id': "artists_id", 'name': "artists_name"})
        .set_index('artists_id')[['artists_name', 'genres']]
    )

    # First join: album_id -> release_date.
    dated = track_df.set_index('album_id').join(release_dates, on='album_id')
    dated['artists_id'] = dated['artists_id'].apply(lambda raw_id: raw_id[2:-2])

    # Second join: artists_id -> artist name and genres. Re-indexing on
    # artists_id replaces (and thereby drops) the album_id index.
    merged = (
        dated.set_index('artists_id')
        .join(artist_info, on='artists_id')
        .reset_index(drop=False)
    )

    merged['release_year'] = pd.to_datetime(merged['release_date']).dt.year
    merged = merged.drop(columns=unwanted_columns)

    from_1990_on = merged['release_year'] >= 1990
    return merged[from_1990_on]
def get_filtered_track_df(df, genres_to_include):
    """Return the rows of ``df`` tagged with at least one wanted genre.

    Args:
        df: Track frame with a ``genres`` column whose cells are text shaped
            like ``"['pop', 'rock']"`` (cells are passed through ``str``
            first, so already-parsed lists are accepted as well).
        genres_to_include: Iterable of genre names; a row is kept when any
            of its genres is in this collection.

    Returns:
        A new frame containing only the matching rows, with ``genres``
        replaced by a list of genre strings and the index renumbered from 0.
        ``df`` itself is left untouched.
    """
    wanted = set(genres_to_include)

    def _parse_genres(raw):
        # Strip the surrounding brackets, split on ", ", then strip the quote
        # character on each side of every item. A missing value (NaN) or an
        # empty list both come out as [''], which matches no real genre.
        return [item[1:-1] for item in str(raw)[1:-1].split(", ")]

    # Parse into a separate Series instead of assigning back into ``df``: the
    # caller usually passes a filtered slice, and writing into it both mutates
    # the caller's data and triggers pandas' SettingWithCopy warning.
    parsed = df["genres"].apply(_parse_genres)

    # One boolean per row, evaluated row by row so that duplicate index labels
    # cannot drag unrelated rows into the result. ``astype(bool)`` keeps the
    # mask a valid boolean indexer when ``df`` is empty.
    keep = parsed.apply(lambda genres: not wanted.isdisjoint(genres)).astype(bool)

    # NOTE(review): the previous version renamed "korean pop" to "k-pop" on a
    # temporary frame *after* filtering, which never influenced the result.
    # If "korean pop" is meant to count as "k-pop", that mapping has to be
    # applied to ``parsed`` before ``keep`` is computed -- confirm the intent.
    filtered = df.assign(genres=parsed)[keep]
    return filtered.reset_index(drop=True)
# Join release dates and artist genres onto the track table.
track_with_year_and_genre = join_genre_and_date(artists_data, albums_data, tracks_data)

# Genres to keep. `genres` is a second name bound to the same list; it is kept
# in case later notebook cells refer to it.
genres_to_include = genres = ['dance pop', 'electronic', 'electropop', 'hip hop', 'jazz', 'k-pop', 'latin', 'pop', 'pop rap', 'r&b', 'rock']
filtered_track_df = get_filtered_track_df(track_with_year_and_genre, genres_to_include)

# Reduce the Spotify URI to the bare track id. regex=False makes this a literal
# substring replacement regardless of the installed pandas version's default.
filtered_track_df["uri"] = filtered_track_df["uri"].str.replace("spotify:track:", "", regex=False)
filtered_track_df = filtered_track_df.drop(columns=['analysis_url', 'available_markets'])

display(filtered_track_df.head())
filtered_track_df.columns

# Persist the result without the positional index column.
filtered_track_df.to_csv("filtered_track_df.csv", index=False)

# Summary statistics of the raw track table (shown when run as the last
# expression of a notebook cell).
tracks_data.describe()