# -*- coding: utf-8 -*-
"""preprocessed_data

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ssqIgQzcaBw12nyIoBSmaoWGZoaBPPHR

Downloads the SpotGenTrack dataset from Kaggle, joins artist genres and
album release dates onto the track table, keeps the tracks that belong to
a fixed set of genres, and writes the result to ``filtered_track_df.csv``.
"""

import builtins
import shutil
import subprocess
import sys
from pathlib import Path

import pandas as pd

KAGGLE_DATASET = "saurabhshahane/spotgen-music-dataset"
ARCHIVE_NAME = "spotgen-music-dataset.zip"
DATA_DIR = Path("SpotGenTrack/Data Sources")
OUTPUT_CSV = "filtered_track_df.csv"

GENRES_TO_INCLUDE = [
    'dance pop',
    'electronic',
    'electropop',
    'hip hop',
    'jazz',
    'k-pop',
    'latin',
    'pop',
    'pop rap',
    'r&b',
    'rock',
]

# Track columns that are removed from the joined table.
IRRELEVANT_TRACK_COLUMNS = [
    'Unnamed: 0',
    'country',
    'track_name_prev',
    'track_number',
    'type',
]


def _show(obj):
    """Render *obj* with the notebook ``display`` builtin, or ``print`` it.

    ``display`` only exists when running under IPython/Colab; falling back
    to ``print`` keeps the script usable from a plain interpreter.
    """
    getattr(builtins, "display", print)(obj)


def _download_dataset():
    """Install the Kaggle credentials, then download and unpack the dataset."""
    # Imported here so the module can be imported outside Colab.
    from google.colab import files

    # The upload is expected to provide ``kaggle.json`` in the working
    # directory; it is copied into ~/.kaggle below.
    files.upload()

    kaggle_dir = Path.home() / ".kaggle"
    kaggle_dir.mkdir(exist_ok=True)
    shutil.copy("kaggle.json", kaggle_dir)
    (kaggle_dir / "kaggle.json").chmod(0o600)

    subprocess.run(
        ["kaggle", "datasets", "download", "-d", KAGGLE_DATASET],
        check=True,
    )
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "patool"],
        check=True,
    )

    # Only importable once the pip install above has run.
    import patoolib

    patoolib.extract_archive(ARCHIVE_NAME)


def _load_tables():
    """Read the album, artist and track CSV files, in that order."""
    albums = pd.read_csv(DATA_DIR / "spotify_albums.csv")
    artists = pd.read_csv(DATA_DIR / "spotify_artists.csv")
    tracks = pd.read_csv(DATA_DIR / "spotify_tracks.csv")
    return albums, artists, tracks


def join_genre_and_date(artist_df, album_df, track_df, min_year=1990):
    """Attach artist genres and album release dates to the track table.

    Args:
        artist_df: Artist table with ``id``, ``name`` and ``genres`` columns.
        album_df: Album table with ``id`` and ``release_date`` columns.
        track_df: Track table with ``album_id`` and ``artists_id`` columns.
        min_year: Earliest release year to keep (inclusive).

    Returns:
        A new DataFrame with one row per kept track. ``artists_id`` is the
        first column, ``release_date``, ``artists_name``, ``genres`` and
        ``release_year`` are appended, and ``album_id`` plus the columns in
        ``IRRELEVANT_TRACK_COLUMNS`` are absent. Rows whose release year is
        missing or earlier than *min_year* are dropped. The inputs are not
        modified.
    """
    albums = album_df.rename(columns={'id': "album_id"}).set_index('album_id')
    artists = artist_df.rename(
        columns={'id': "artists_id", 'name': "artists_name"}
    ).set_index('artists_id')

    tracks = track_df.set_index('album_id').join(
        albums['release_date'], on='album_id'
    )

    # Strip the two leading and two trailing characters of the raw value
    # (presumably the "['" and "']" of a stringified one-element list).
    # NOTE(review): a value listing several artists would still contain
    # quotes and commas after this and match no artist id -- confirm
    # whether such tracks are meant to end up without genres.
    tracks['artists_id'] = tracks['artists_id'].apply(lambda raw: raw[2:-2])

    # Re-indexing on artists_id discards the album_id index, so album_id is
    # not part of the result.
    tracks = (
        tracks.set_index('artists_id')
        .join(artists[['artists_name', 'genres']], on='artists_id')
        .reset_index(drop=False)
    )

    tracks['release_year'] = pd.to_datetime(tracks['release_date']).dt.year
    # errors="ignore": some of these columns (e.g. the CSV index dump
    # 'Unnamed: 0') may be absent depending on how the table was loaded.
    tracks = tracks.drop(columns=IRRELEVANT_TRACK_COLUMNS, errors="ignore")

    # .copy() hands the caller an independent frame they can assign into.
    return tracks[tracks['release_year'] >= min_year].copy()


def _parse_genres(raw):
    """Turn a stringified list such as ``"['pop', 'rock']"`` into a list.

    The enclosing brackets are stripped, the remainder is split on ``", "``
    and the first and last character (the quotes) of every item is removed.
    An empty list ``"[]"`` or a missing value yields ``['']``.
    """
    return [item[1:-1] for item in str(raw)[1:-1].split(", ")]


def get_filtered_track_df(df, genres_to_include):
    """Keep the tracks that have at least one of the requested genres.

    Args:
        df: Track table whose ``genres`` column holds stringified lists.
        genres_to_include: Genre names; a track is kept when any of its
            genres is in this collection.

    Returns:
        A new DataFrame with a fresh 0..n-1 index whose ``genres`` column
        holds parsed lists of genre names. *df* itself is left untouched.
    """
    parsed = df['genres'].apply(_parse_genres)
    wanted = set(genres_to_include)

    # NOTE(review): this function used to relabel "korean pop" as "k-pop"
    # on a temporary frame *after* the genre filter, which could not change
    # the result. If korean-pop tracks should count as k-pop, normalise the
    # parsed lists here, before the membership test.
    keep = (
        parsed.map(lambda track_genres: not wanted.isdisjoint(track_genres))
        .astype(bool)
        .to_numpy()
    )

    # A positional boolean mask (rather than index labels) selects the right
    # rows even when the index contains duplicates.
    return df.assign(genres=parsed).loc[keep].reset_index(drop=True)


def main():
    """Run the whole preprocessing pipeline and return the filtered table."""
    _download_dataset()

    albums_data, artists_data, tracks_data = _load_tables()
    for table in (albums_data, artists_data, tracks_data):
        _show(table.head())

    track_with_year_and_genre = join_genre_and_date(
        artists_data, albums_data, tracks_data
    )
    filtered_track_df = get_filtered_track_df(
        track_with_year_and_genre, GENRES_TO_INCLUDE
    )

    # Keep only the bare track id; regex=False makes the match literal.
    filtered_track_df["uri"] = filtered_track_df["uri"].str.replace(
        "spotify:track:", "", regex=False
    )
    filtered_track_df = filtered_track_df.drop(
        columns=['analysis_url', 'available_markets']
    )

    _show(filtered_track_df.head())
    filtered_track_df.to_csv(OUTPUT_CSV, index=False)
    return filtered_track_df


if __name__ == "__main__":
    main()