nanhsin commited on
Commit
81291e5
·
verified ·
1 Parent(s): c89a297

Upload data_fetch.py

Browse files
Files changed (1) hide show
  1. data_fetch.py +305 -0
data_fetch.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from datetime import datetime, timedelta
3
+ import json
4
+ import re
5
+ import requests
6
+ import nltk
7
+ from nltk.corpus import cmudict
8
+ # nltk.download('punkt')
9
+ # nltk.download('cmudict')
10
+ from langdetect import detect
11
+ import os
12
+ from dotenv import load_dotenv
13
+
14
+
15
def init():
    '''Load API credentials from the environment into module globals.

    Reads SPOTIFY_CID, SPOTIFY_SECRET and GENIUS_TOKEN from the process
    environment (a .env file is loaded first via python-dotenv) and builds
    the Genius Authorization header.

    Raises:
        RuntimeError: If any required environment variable is missing,
            instead of failing later with an opaque TypeError when the
            Genius header is built from None.
    '''
    load_dotenv()

    global spotify_cid
    global spotify_secret
    global genius_token
    global headers

    spotify_cid = os.getenv("SPOTIFY_CID")
    spotify_secret = os.getenv("SPOTIFY_SECRET")
    genius_token = os.getenv("GENIUS_TOKEN")

    # Fail fast with a clear message rather than crashing on the first
    # API call with missing/None credentials.
    missing = [name for name, value in (
        ("SPOTIFY_CID", spotify_cid),
        ("SPOTIFY_SECRET", spotify_secret),
        ("GENIUS_TOKEN", genius_token),
    ) if not value]
    if missing:
        raise RuntimeError("Missing environment variables: " + ", ".join(missing))

    headers = {"Authorization": "Bearer " + genius_token}
28
+
29
+
30
+ # Cache
31
+
32
CACHE_FILENAME = "cache.json"

def openCache():
    '''Load the JSON cache from CACHE_FILENAME.

    Returns:
        dict: The cached dataset, or an empty dict when the cache file is
            missing or contains invalid JSON.
    '''
    # Catch only the two expected failure modes; the original bare
    # `except:` also hid genuine bugs (and even KeyboardInterrupt).
    try:
        with open(CACHE_FILENAME, "r") as cache_file:
            return json.load(cache_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}

def saveCache(cache_dict):
    '''Serialize cache_dict as JSON and write it to CACHE_FILENAME.

    Parameters:
        cache_dict (dict): The dataset to persist.
    '''
    # The with-statement guarantees the handle is closed even if
    # serialization fails midway.
    with open(CACHE_FILENAME, "w") as cache_file:
        json.dump(cache_dict, cache_file)
51
+
52
+
53
+ # Billboard scraper
54
+
55
def scrapeBillboard(date):
    '''
    Scrape the Billboard Hot 100 chart for a given date.

    Parameters:
        date (datetime.date): The date of the chart.

    Returns:
        list: A list of (title, artist) tuples, one per charted song.

    Raises:
        requests.HTTPError: If Billboard responds with an error status;
            previously an error page was silently parsed into an empty
            chart and the week was lost.
    '''
    url = "https://www.billboard.com/charts/hot-100/" + str(date) + "/"
    # A timeout keeps the weekly scrape loop from hanging forever on a
    # stalled connection.
    html = requests.get(url, timeout=30)
    html.raise_for_status()
    soup = BeautifulSoup(html.content, "html.parser")

    # find_all is the modern bs4 spelling of the deprecated findAll alias.
    rows = soup.find_all("ul", class_="o-chart-results-list-row")

    billboard = []
    for row in rows:
        title = row.find("h3").text.strip()
        artist = row.find("span", class_="a-font-primary-s").text.strip()
        billboard.append((title, artist))

    return billboard
78
+
79
+
80
+ # Spotify API
81
+
82
def getSpotifyToken():
    '''Request a Spotify client-credentials access token.

    Uses the module globals spotify_cid / spotify_secret set by init().

    Returns:
        str: The bearer token for subsequent Spotify API calls.

    Raises:
        requests.HTTPError: If the token endpoint rejects the credentials,
            instead of the opaque KeyError on "access_token" the original
            raised.
    '''
    response = requests.post(
        "https://accounts.spotify.com/api/token",
        data={
            "grant_type": "client_credentials",
            "client_id": spotify_cid,
            "client_secret": spotify_secret,
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["access_token"]
93
+
94
def getSpotifyID(token, title, artist):
    '''Look up the Spotify track ID of a song.

    Parameters:
        token (str): Spotify bearer token.
        title (str): Song title.
        artist (str): Artist name.

    Returns:
        str: The Spotify ID of the best-matching track.

    Raises:
        IndexError: If the search returns no results.
    '''
    headers = {"Authorization": "Bearer " + token}
    # Let requests URL-encode the query; interpolating title/artist
    # straight into the URL broke searches containing &, #, ?, or
    # non-ASCII characters.
    params = {
        "q": f"{title} {artist}",
        "type": "track",
        "market": "US",
        "limit": 1,
    }
    response = requests.get(
        "https://api.spotify.com/v1/search",
        params=params,
        headers=headers,
        timeout=30,
    ).json()
    return response["tracks"]["items"][0]["id"]
100
+
101
def getSpotifyFeatures(token, song_id):
    '''Fetch the Spotify audio-features object for a track.

    Parameters:
        token (str): Spotify bearer token.
        song_id (str): The Spotify track ID.

    Returns:
        dict: The decoded audio-features JSON payload.
    '''
    auth = {"Authorization": "Bearer " + token}
    endpoint = f"https://api.spotify.com/v1/audio-features/{song_id}"
    return requests.get(endpoint, headers=auth).json()
107
+
108
+
109
+ # Genius API
110
+
111
def getGeniusURL(title, artist):
    '''Search Genius for a song and return the URL of the top hit.

    Parameters:
        title (str): Song title.
        artist (str): Artist name.

    Returns:
        str: The Genius page URL of the first search result.
    '''
    # `headers` is the module-level Genius auth header set up by init().
    payload = requests.get(
        "https://api.genius.com/search",
        params={"q": f"{title} {artist}"},
        headers=headers,
    ).json()
    top_hit = payload["response"]["hits"][0]
    return top_hit["result"]["url"]
117
+
118
def getLyrics(url):
    '''Scrape the lyrics of a song from its Genius page.

    Parameters:
        url (str): The Genius song URL.

    Returns:
        str: The lyrics text, with line breaks as newline separators.

    Raises:
        ValueError: If the page has no lyrics container (layout change or
            a non-song page), instead of the opaque AttributeError the
            original raised by calling get_text on None.
    '''
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    container = soup.find("div", {"data-lyrics-container": "true"})
    if container is None:
        raise ValueError(f"No lyrics container found at {url}")
    return container.get_text(separator="\n")
124
+
125
+
126
+ # Readability metrics
127
+
128
def countSyllables(word):
    '''Estimate the number of syllables in an English word.

    Vowel-group heuristic: each maximal run of vowels counts as one
    syllable, a trailing silent "e" is discounted, and a trailing "le"
    (e.g. "table") is added back.

    Parameters:
        word (str): One token, possibly carrying surrounding punctuation.

    Returns:
        int: The estimated syllable count, always at least 1.
    '''
    vowels = 'aeiouy'
    # Strip common surrounding punctuation; the original only stripped
    # ".:;?!", so trailing commas/quotes/parens inflated vowel groups
    # (e.g. "cake," lost its silent-e discount).
    word = word.lower().strip(".,:;?!\"'()[]-")
    # Guard: a token that was pure punctuation (e.g. "..." or "--")
    # used to crash on word[0]; treat it as a single syllable.
    if not word:
        return 1
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count = 1
    return count
145
+
146
def getFRES(lyrics):
    '''Calculate the Flesch reading-ease score (FRES) of a song.

    Each non-empty lyric line is treated as one sentence; syllables are
    estimated with countSyllables().

    Parameters:
        lyrics (str): The raw lyrics text.

    Returns:
        float: The FRES value, or 0.0 for empty lyrics (the formula
            would otherwise divide by zero).
    '''
    # Remove section markers such as [Verse] and [Chorus].
    lyrics = re.sub(r"\[.*\]", "", lyrics)
    sentences = [line for line in lyrics.split("\n") if line]
    words = lyrics.split()
    # Guard: empty lyrics crashed with ZeroDivisionError.
    if not sentences or not words:
        return 0.0
    syllable_count = sum(countSyllables(token) for token in words)
    return (206.835
            - 1.015 * (len(words) / len(sentences))
            - 84.6 * (syllable_count / len(words)))
157
+
158
+
159
def vocabComplex(lyrics):
    '''Calculate the type-token ratio of the lyrics.

    Note: this uses raw lowercased tokens (types), not word stems.

    Parameters:
        lyrics (str): The raw lyrics text.

    Returns:
        float: unique tokens / total tokens, or 0.0 for empty lyrics
            (the original crashed with ZeroDivisionError).
    '''
    tokens = nltk.word_tokenize(lyrics.lower())
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
163
+
164
+
165
def sentenceLength(lyrics):
    '''Calculate the average number of words per sentence.

    Parameters:
        lyrics (str): The raw lyrics text.

    Returns:
        float: Mean words per NLTK-detected sentence, or 0.0 when no
            sentence is found (the original crashed with
            ZeroDivisionError on empty input).
    '''
    sentences = nltk.sent_tokenize(lyrics)
    if not sentences:
        return 0.0
    total_words = sum(len(nltk.word_tokenize(sent)) for sent in sentences)
    return total_words / len(sentences)
170
+
171
+
172
def avgSyllable(lyrics):
    """Calculate the average number of syllables per word.

    Words absent from the CMU pronouncing dictionary contribute zero
    syllables but still count toward the denominator (unchanged from the
    original behavior).

    Parameters:
        lyrics (str): The raw lyrics text.

    Returns:
        float: Average syllables per word, or 0.0 for empty lyrics
            (avoids a ZeroDivisionError).
    """
    # Load the CMU dict once and memoize it on the function; the original
    # re-built it on every call, which is expensive.
    d = getattr(avgSyllable, "_cmudict", None)
    if d is None:
        d = cmudict.dict()
        avgSyllable._cmudict = d
    words = lyrics.split()
    if not words:
        return 0.0
    # Bug fix: len(d[word][0]) counted ALL phonemes, consonants included
    # (e.g. "hello" -> ['HH','AH0','L','OW1'] gave 4, not 2). A syllable
    # corresponds to a vowel phoneme, which CMU marks with a trailing
    # stress digit.
    total_syllables = sum(
        sum(ph[-1].isdigit() for ph in d[w.lower()][0])
        for w in words
        if w.lower() in d
    )
    return total_syllables / len(words)
178
+
179
+
180
+ # Data consolidation
181
+
182
+
183
def addAllFeatures(dataset, billboard):
    '''
    Add new songs on the Billboard Hot 100 to the dataset with all
    features including lyrics.

    Parameters:
        dataset (dict): The dataset; songs live under dataset["data"],
            keyed by "title_artist".
        billboard (list): (title, artist) tuples from the Billboard chart.

    Returns:
        dict: The dataset with the new songs' features and lyrics added.
    '''
    spotify_token = getSpotifyToken()

    for title, artist in billboard:

        # Skip if the song is already in the dataset
        abbrev = title.replace("_", " ") + "_" + artist.replace("_", " ")
        if abbrev in dataset["data"]:
            continue

        try:
            # Get the Spotify features, Genius lyrics, and readability metrics
            print("Running: ", abbrev)
            spotify_id = getSpotifyID(spotify_token, title, artist)
            # Reuse the token obtained above; the original requested a
            # brand-new token from Spotify for every single song.
            features = getSpotifyFeatures(spotify_token, spotify_id)
            genius_url = getGeniusURL(title, artist)
            lyrics = getLyrics(genius_url)
            features["fres"] = getFRES(lyrics)
            features["vocabComplex"] = vocabComplex(lyrics)
            features["sentenceLength"] = sentenceLength(lyrics)
            features["avgSyllable"] = avgSyllable(lyrics)
            features["lyrics"] = lyrics
            features["title"] = title.replace("_", " ")
            features["artist"] = artist.replace("_", " ")
            features["lang"] = detect(lyrics)
        except Exception as err:
            # Best-effort: skip songs not found on Spotify or Genius, but
            # log the reason; the bare `except:` also used to swallow
            # KeyboardInterrupt and real bugs silently.
            print("Not found: ", abbrev, "-", err)
            continue

        # Add the song with features to the dataset
        dataset["data"][abbrev] = features
    return dataset
226
+
227
+
228
def updateCache():
    '''Update the cached dataset with new songs on the Billboard Hot 100.

    On an empty cache, scrapes the past 52 weekly charts; otherwise
    scrapes only the weeks published since the cache was last updated.
    The cache is saved back to disk and a short summary is printed.
    '''
    dataset = openCache()
    # Billboard Hot 100 is updated every Saturday; compute this week's
    # chart date (today.weekday(): Monday=0 ... Saturday=5, Sunday=6, so
    # on Sunday this correctly steps back to the previous Saturday).
    today = datetime.today().date()
    saturday = today + timedelta(days=5 - today.weekday())

    # If cache is empty, seed it with a year's worth of weekly charts.
    if dataset == {}:
        dataset["updated_week"] = str(saturday)
        dataset["data"] = {}
        billboard = []

        for _ in range(52):
            billboard.extend(scrapeBillboard(saturday))
            saturday -= timedelta(days=7)

        # De-duplicate songs that charted in multiple weeks.
        billboard = list(set(billboard))
        dataset = addAllFeatures(dataset, billboard)
        saveCache(dataset)

    # Cache exists: add only the weeks published since the last update.
    else:
        if dataset["updated_week"] != str(saturday):
            last_updated = dataset["updated_week"]
            dataset["updated_week"] = str(saturday)
            billboard = []

            # last_updated was itself recorded as a Saturday, so stepping
            # back 7 days at a time is guaranteed to reach it.
            while str(saturday) != last_updated:
                billboard.extend(scrapeBillboard(saturday))
                saturday -= timedelta(days=7)

            billboard = list(set(billboard))
            dataset = addAllFeatures(dataset, billboard)
            saveCache(dataset)
        else:
            print("Dataset is up to date.")

    print("Data retrieved: ", len(dataset["data"]))
    # Guard the hard-coded debug sample: it crashed with KeyError
    # whenever this particular song was absent from the cache.
    sample_key = "Houdini_Dua Lipa"
    if sample_key in dataset["data"]:
        print("Data sample: ", dataset["data"][sample_key])
270
+
271
+
272
def exportData():
    '''Export the dataset to a JSON file.

    Reads the cached dataset, keeps only the attributes listed below for
    each song, and writes the result to data.json.
    '''
    dataset = openCache()

    # Attributes to keep, in the order they appear in the output objects.
    keys = (
        "id", "title", "artist",
        "danceability", "valence", "speechiness",
        "fres", "vocabComplex", "sentenceLength", "avgSyllable",
        "lyrics", "lang",
    )

    # Project each song's feature dict down to the selected attributes.
    filtered_data = [
        {key: features[key] for key in keys}
        for features in dataset["data"].values()
    ]

    # Export the filtered data to a JSON file
    with open("data.json", "w") as file:
        json.dump(filtered_data, file, indent=4)
300
+
301
+
302
+ if __name__ == "__main__":
303
+ init()
304
+ updateCache()
305
+ exportData()