File size: 2,884 Bytes
bfaf419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""preprocessed_data

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ssqIgQzcaBw12nyIoBSmaoWGZoaBPPHR
"""

import pandas as pd

# --- Dataset download (Colab notebook cells) ---------------------------------
# Lines starting with "!" are IPython/Colab shell escapes, not plain Python;
# this section only runs inside a notebook environment.

# Upload a file from the local machine into the Colab working directory.
# NOTE(review): presumably the uploaded file is kaggle.json, since the next
# cells copy a file of that name — confirm.
from google.colab import files
files.upload()

# Place kaggle.json under ~/.kaggle/.
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Download the SpotGen music dataset archive with the Kaggle CLI.
!kaggle datasets download -d saurabhshahane/spotgen-music-dataset

# patool provides the patoolib module used below to unpack the archive.
!pip install patool

import patoolib

# Unpack the downloaded archive.
patoolib.extract_archive('spotgen-music-dataset.zip')

# Directory holding the source CSV files.
data_dir = "SpotGenTrack/Data Sources/"

# Read the album, artist and track tables, in that order.
albums_data, artists_data, tracks_data = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in (
        "spotify_albums.csv",
        "spotify_artists.csv",
        "spotify_tracks.csv",
    )
)

# Preview the album table and its column names.
display(albums_data.head())
albums_data.columns

# Preview the artist table and its column names.
display(artists_data.head())
artists_data.columns

# Preview the track table and its column names.
display(tracks_data.head())
tracks_data.columns

## Join artist genre information and album release dates onto the track dataset,
# drop irrelevant columns,
# and keep only tracks released in 1990 or later.
def join_genre_and_date(artist_df, album_df, track_df, min_year=1990):
    """Attach album release dates and artist genres to tracks, then filter by year.

    Args:
        artist_df: Artist table with at least the columns ``id``, ``name``
            and ``genres``.
        album_df: Album table with at least the columns ``id`` and
            ``release_date``.
        track_df: Track table with at least the columns ``album_id`` and
            ``artists_id``. Each ``artists_id`` value must be a string; its
            first two and last two characters are stripped before matching
            against artist ids (assumes values look like ``"['<id>']"`` —
            TODO confirm against the dataset).
        min_year: Earliest release year (inclusive) to keep. Defaults to 1990.

    Returns:
        A new DataFrame of tracks with ``release_date``, ``artists_name``,
        ``genres`` and ``release_year`` added, restricted to rows whose
        ``release_year`` is at least ``min_year``. The input frames are not
        modified.
    """
    # Lookup tables keyed by the ids used in the track table.
    release_dates = (
        album_df.rename(columns={"id": "album_id"})
        .set_index("album_id")["release_date"]
    )
    artist_info = (
        artist_df.rename(columns={"id": "artists_id", "name": "artists_name"})
        .set_index("artists_id")[["artists_name", "genres"]]
    )

    track = track_df.set_index("album_id").join(release_dates, on="album_id")

    # Strip the two leading and two trailing characters so the value can be
    # matched against an artist id. A value listing several ids keeps its
    # inner quotes and separators and therefore matches no artist row.
    track["artists_id"] = track["artists_id"].apply(lambda raw: raw[2:-2])

    track = track.set_index("artists_id").join(artist_info, on="artists_id")
    track = track.reset_index(drop=False)
    track["release_year"] = pd.to_datetime(track["release_date"]).dt.year

    # Helper columns are optional: ignore the ones this frame does not have
    # instead of failing with a KeyError.
    track = track.drop(
        columns=["Unnamed: 0", "country", "track_name_prev", "track_number", "type"],
        errors="ignore",
    )

    return track[track["release_year"] >= min_year]

def get_filtered_track_df(df, genres_to_include):
    """Return the tracks that have at least one genre in ``genres_to_include``.

    The ``genres`` column is expected to hold the text form of a list of
    quoted names (for example ``"['pop', 'rock']"``). In the result it is
    replaced by a real list of genre strings.

    Args:
        df: Track DataFrame with a ``genres`` column. It is not modified.
        genres_to_include: Collection of genre names to keep.

    Returns:
        A new DataFrame containing the rows of ``df`` with at least one
        genre in ``genres_to_include``, with ``genres`` parsed into lists
        and the index reset to 0..n-1.

    Note:
        Matching is exact. "korean pop" is not treated as an alias of
        "k-pop"; pass both labels to keep tracks tagged with either.
    """
    def _parse_genres(raw):
        # Drop the surrounding brackets, split on ", ", then drop the quote
        # character around each name. Any value is passed through str()
        # first, so missing values parse to [""] and never match a genre.
        return [item[1:-1] for item in str(raw)[1:-1].split(", ")]

    # Work on a copy so the caller's frame keeps its original genres column.
    parsed = df.copy()
    parsed["genres"] = parsed["genres"].apply(_parse_genres)

    # One row per (track, genre); a track's index label repeats per genre.
    exploded = parsed["genres"].explode()
    matching_labels = exploded[exploded.isin(genres_to_include)].index.unique()

    return parsed[parsed.index.isin(matching_labels)].reset_index(drop=True)

# Genres to keep; `genres` is bound to the same list object.
genres_to_include = genres = [
    'dance pop', 'electronic', 'electropop', 'hip hop', 'jazz', 'k-pop',
    'latin', 'pop', 'pop rap', 'r&b', 'rock',
]

# Join the three tables, then keep only tracks matching one of the genres.
track_with_year_and_genre = join_genre_and_date(artists_data, albums_data, tracks_data)
filtered_track_df = get_filtered_track_df(track_with_year_and_genre, genres_to_include)

# Remove the "spotify:track:" text from each URI and drop the
# analysis_url and available_markets columns.
filtered_track_df = filtered_track_df.assign(
    uri=filtered_track_df["uri"].str.replace("spotify:track:", "")
).drop(columns=['analysis_url', 'available_markets'])

# Preview the result and its column names.
display(filtered_track_df.head())
filtered_track_df.columns

# Write the filtered tracks to disk without the index column.
filtered_track_df.to_csv("filtered_track_df.csv", index=False)

# Summary statistics of the raw track table.
tracks_data.describe()