Upload data_fetch.py
Browse files- data_fetch.py +305 -0
data_fetch.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from bs4 import BeautifulSoup
|
2 |
+
from datetime import datetime, timedelta
|
3 |
+
import json
|
4 |
+
import re
|
5 |
+
import requests
|
6 |
+
import nltk
|
7 |
+
from nltk.corpus import cmudict
|
8 |
+
# nltk.download('punkt')
|
9 |
+
# nltk.download('cmudict')
|
10 |
+
from langdetect import detect
|
11 |
+
import os
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
|
14 |
+
|
15 |
+
def init():
    """Load environment variables and publish API credentials as module globals."""
    load_dotenv()

    global spotify_cid
    global spotify_secret
    global genius_token
    global headers

    spotify_cid = os.getenv("SPOTIFY_CID")
    spotify_secret = os.getenv("SPOTIFY_SECRET")
    genius_token = os.getenv("GENIUS_TOKEN")
    # Default auth header used by every Genius API request in this module.
    headers = {"Authorization": "Bearer " + genius_token}
|
28 |
+
|
29 |
+
|
30 |
+
# Cache

# Name of the JSON file used to persist the scraped dataset between runs.
CACHE_FILENAME = "cache.json"

def openCache():
    """Load the cached dataset from CACHE_FILENAME.

    Returns:
        dict: The cached dataset, or an empty dict when the file is missing,
        unreadable, or contains invalid JSON.
    """
    try:
        # Context manager guarantees the handle is closed even on error.
        with open(CACHE_FILENAME, "r") as cache_file:
            return json.load(cache_file)
    except (OSError, json.JSONDecodeError):
        # First run (no cache yet) or a corrupt cache file: start fresh.
        # Narrowed from the original bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return {}
|
44 |
+
|
45 |
+
def saveCache(cache_dict):
    """Persist the dataset to CACHE_FILENAME as JSON.

    Parameters:
        cache_dict (dict): The dataset to write.
    """
    # Context manager replaces the original open/write/close triple and
    # guarantees the file is closed even if serialization fails midway.
    with open(CACHE_FILENAME, "w") as cache_file:
        json.dump(cache_dict, cache_file)
|
51 |
+
|
52 |
+
|
53 |
+
# Billboard scraper
|
54 |
+
|
55 |
+
def scrapeBillboard(date):
    '''
    Scrape the Billboard Hot 100 chart for a given date.

    Parameters:
        date (datetime.date): The date of the chart.

    Returns:
        list: A list of (title, artist) tuples for each charted song.
    '''
    url = "https://www.billboard.com/charts/hot-100/" + str(date) + "/"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    # Each chart entry is one <ul> row in Billboard's markup.
    # find_all replaces the legacy findAll alias.
    rows = soup.find_all("ul", class_="o-chart-results-list-row")

    billboard = []
    for row in rows:
        title_tag = row.find("h3")
        artist_tag = row.find("span", class_="a-font-primary-s")
        # Skip malformed rows instead of crashing with AttributeError if
        # Billboard's page layout changes.
        if title_tag is None or artist_tag is None:
            continue
        billboard.append((title_tag.text.strip(), artist_tag.text.strip()))

    return billboard
|
78 |
+
|
79 |
+
|
80 |
+
# Spotify API
|
81 |
+
|
82 |
+
def getSpotifyToken():
    """Request a client-credentials access token from the Spotify API."""
    payload = {
        "grant_type": "client_credentials",
        "client_id": spotify_cid,
        "client_secret": spotify_secret,
    }
    token_response = requests.post(
        "https://accounts.spotify.com/api/token",
        data=payload,
    )
    return token_response.json()["access_token"]
|
93 |
+
|
94 |
+
def getSpotifyID(token, title, artist):
    """Get the Spotify track ID of a song.

    Parameters:
        token (str): Spotify API access token.
        title (str): Song title.
        artist (str): Artist name.

    Returns:
        str: The Spotify ID of the first search result.

    Raises:
        IndexError: If the search returns no matching track.
    """
    headers = {"Authorization": "Bearer " + token}
    # Let requests percent-encode the query. The original hand-built URL
    # inserted the raw title/artist, which broke on characters such as
    # '&' or '#' in song names.
    params = {
        "q": f"{title} {artist}",
        "type": "track",
        "market": "US",
        "limit": 1,
    }
    response = requests.get(
        "https://api.spotify.com/v1/search", params=params, headers=headers
    ).json()
    return response["tracks"]["items"][0]["id"]
|
100 |
+
|
101 |
+
def getSpotifyFeatures(token, song_id):
    """Fetch the Spotify audio-features object for a track ID."""
    auth_header = {"Authorization": "Bearer " + token}
    features_url = "https://api.spotify.com/v1/audio-features/" + song_id
    features_response = requests.get(features_url, headers=auth_header)
    return features_response.json()
|
107 |
+
|
108 |
+
|
109 |
+
# Genius API
|
110 |
+
|
111 |
+
def getGeniusURL(title, artist):
    """Search the Genius API and return the URL of the top hit for a song."""
    search_endpoint = "https://api.genius.com/search"
    query = {"q": f"{title} {artist}"}
    # `headers` is the module-level Genius auth header set up by init().
    result = requests.get(search_endpoint, params=query, headers=headers).json()
    top_hit = result["response"]["hits"][0]
    return top_hit["result"]["url"]
|
117 |
+
|
118 |
+
def getLyrics(url):
    """Scrape the lyrics text from a Genius song page."""
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    container = soup.find("div", {"data-lyrics-container": "true"})
    # Newline separator keeps one lyric line per text line.
    return container.get_text(separator="\n")
|
124 |
+
|
125 |
+
|
126 |
+
# Readability metrics
|
127 |
+
|
128 |
+
def countSyllables(word):
    """Estimate the number of syllables in a word via a vowel-group heuristic.

    Parameters:
        word (str): The token to count. Trailing punctuation is stripped.

    Returns:
        int: Estimated syllable count; 0 for a token with no letters.
    """
    count = 0
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    # Guard: pure-punctuation tokens (e.g. "...") strip down to an empty
    # string, which previously raised IndexError on word[0].
    if not word:
        return 0
    if word[0] in vowels:
        count += 1
    # Count each vowel group once: a vowel preceded by a non-vowel.
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # Silent final 'e' ("cake") is not a syllable, but consonant+'le'
    # ("apple") usually is, so add it back.
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    # Every non-empty word has at least one syllable.
    if count == 0:
        count += 1
    return count
|
145 |
+
|
146 |
+
def getFRES(lyrics):
    """Calculate the Flesch reading-ease score (FRES) of a song's lyrics.

    Parameters:
        lyrics (str): Raw lyrics, one line per newline.

    Returns:
        float: The FRES value; 0.0 when the lyrics contain no words or lines.
    """
    # Remove section tags like [Verse], [Chorus]. Non-greedy so two tags on
    # the same line no longer swallow the lyrics between them (the original
    # greedy `\[.*\]` did).
    lyrics = re.sub(r"\[.*?\]", "", lyrics)
    # Each non-empty line is treated as one "sentence".
    sentences = [line for line in lyrics.split("\n") if line]
    words = lyrics.split()
    word_count = len(words)
    sentence_count = len(sentences)
    # Guard against empty input, which previously raised ZeroDivisionError.
    if word_count == 0 or sentence_count == 0:
        return 0.0
    syllable_count = sum(countSyllables(token) for token in words)
    return 206.835 - (1.015 * (word_count / sentence_count)) - (84.6 * (syllable_count / word_count))
|
157 |
+
|
158 |
+
|
159 |
+
def vocabComplex(lyrics):
    """Return the type-token ratio: unique tokens divided by total tokens."""
    all_tokens = nltk.word_tokenize(lyrics.lower())
    unique_tokens = set(all_tokens)
    return len(unique_tokens) / len(all_tokens)
|
163 |
+
|
164 |
+
|
165 |
+
def sentenceLength(lyrics):
    """Return the average number of words per sentence in the lyrics."""
    sentences = nltk.sent_tokenize(lyrics)
    words_per_sentence = [len(nltk.word_tokenize(sentence)) for sentence in sentences]
    return sum(words_per_sentence) / len(sentences)
|
170 |
+
|
171 |
+
|
172 |
+
def _loadCmudict():
    """Build the CMU pronouncing dictionary once and cache it on the function."""
    # cmudict.dict() is expensive; the original rebuilt it on every
    # avgSyllable call.
    if not hasattr(_loadCmudict, "cache"):
        _loadCmudict.cache = cmudict.dict()
    return _loadCmudict.cache


def avgSyllable(lyrics):
    """Calculate the average number of syllables per word.

    Words missing from cmudict contribute 0 syllables but still count in
    the denominator (matching the original's word total).

    Parameters:
        lyrics (str): The lyrics text.

    Returns:
        float: Average syllables per word; 0.0 for empty lyrics.
    """
    d = _loadCmudict()
    words = lyrics.split()
    # Guard: the original raised ZeroDivisionError on empty input.
    if not words:
        return 0.0
    total_syllables = 0
    for word in words:
        pronunciations = d.get(word.lower())
        if pronunciations:
            # A syllable corresponds to a vowel phoneme, which cmudict marks
            # with a trailing stress digit (e.g. 'AE1', 'AH0'). The original
            # len(...) counted consonant phonemes too, overestimating
            # syllables (e.g. "cat" -> ['K','AE1','T'] gave 3, not 1).
            total_syllables += sum(
                1 for phoneme in pronunciations[0] if phoneme[-1].isdigit()
            )
    return total_syllables / len(words)
|
178 |
+
|
179 |
+
|
180 |
+
# Data consolidation
|
181 |
+
|
182 |
+
|
183 |
+
def addAllFeatures(dataset, billboard):
    '''
    Add new songs on the Billboard Hot 100 to the dataset with all features including lyrics.

    Parameters:
        dataset (dict): The dataset; must contain a "data" dict keyed by
            "<title>_<artist>".
        billboard (list): (title, artist) tuples from the Billboard Hot 100.

    Returns:
        dict: The dataset with the new songs' features and lyrics added.
    '''
    spotify_token = getSpotifyToken()

    for title, artist in billboard:

        # Dataset key: underscores inside title/artist are normalized to
        # spaces so the "_" separator stays unambiguous.
        abbrev = title.replace("_", " ") + "_" + artist.replace("_", " ")
        # Skip if the song is already in the dataset
        if abbrev in dataset["data"]:
            continue

        try:
            # Get the Spotify features, Genius lyrics, and readability metrics
            print("Running: ", abbrev)
            spotify_id = getSpotifyID(spotify_token, title, artist)
            # Reuse the token obtained above; the original requested a brand
            # new token from Spotify for every single song.
            features = getSpotifyFeatures(spotify_token, spotify_id)
            genius_url = getGeniusURL(title, artist)
            lyrics = getLyrics(genius_url)
            features["fres"] = getFRES(lyrics)
            features["vocabComplex"] = vocabComplex(lyrics)
            features["sentenceLength"] = sentenceLength(lyrics)
            features["avgSyllable"] = avgSyllable(lyrics)
            features["lyrics"] = lyrics
            features["title"] = title.replace("_", " ")
            features["artist"] = artist.replace("_", " ")
            features["lang"] = detect(lyrics)
        except Exception as exc:
            # Best-effort: skip songs not found on Spotify or Genius, but
            # report the actual error. Narrowed from the original bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            print("Not found: ", abbrev, "-", exc)
            continue

        # Add the song with features to the dataset
        dataset["data"][abbrev] = features
    return dataset
|
226 |
+
|
227 |
+
|
228 |
+
def updateCache():
    '''Update the dataset with new songs on the Billboard Hot 100.'''

    dataset = openCache()
    # Billboard Hot 100 charts are dated on Saturdays; compute this week's
    # chart date (weekday(): Monday=0 .. Saturday=5).
    today = datetime.today().date()
    saturday = today + timedelta(days=5 - today.weekday())

    # If cache is empty, bootstrap with the past year of weekly charts.
    if dataset == {}:
        dataset["updated_week"] = str(saturday)
        dataset["data"] = {}
        billboard = []

        for _ in range(52):
            billboard.extend(scrapeBillboard(saturday))
            saturday -= timedelta(days=7)

        # Deduplicate: songs often chart for many consecutive weeks.
        billboard = list(set(billboard))
        dataset = addAllFeatures(dataset, billboard)
        saveCache(dataset)

    # If cache is not empty, check if the dataset is up to date
    else:
        # If not, fetch only the weeks since the last update.
        if dataset["updated_week"] != str(saturday):
            last_updated = dataset["updated_week"]
            dataset["updated_week"] = str(saturday)
            billboard = []

            # NOTE(review): assumes "updated_week" is always a Saturday date
            # string (as written by this function); if it ever isn't, this
            # loop would not terminate — confirm.
            while str(saturday) != last_updated:
                billboard.extend(scrapeBillboard(saturday))
                saturday -= timedelta(days=7)

            billboard = list(set(billboard))
            dataset = addAllFeatures(dataset, billboard)
            saveCache(dataset)
        else:
            print("Dataset is up to date.")

    print("Data retrieved: ", len(dataset["data"]))
    # Print one entry as a sanity check. The original hard-coded the key
    # "Houdini_Dua Lipa", which raised KeyError whenever that song was
    # absent from the dataset.
    if dataset["data"]:
        sample_key = next(iter(dataset["data"]))
        print("Data sample: ", dataset["data"][sample_key])
|
270 |
+
|
271 |
+
|
272 |
+
def exportData():
    '''Write the cached songs, reduced to selected attributes, to data.json.'''

    dataset = openCache()

    # Keep only the attributes the consumer needs, in this fixed order.
    kept_keys = (
        "id",
        "title",
        "artist",
        "danceability",
        "valence",
        "speechiness",
        "fres",
        "vocabComplex",
        "sentenceLength",
        "avgSyllable",
        "lyrics",
        "lang",
    )
    filtered_data = [
        {key: features[key] for key in kept_keys}
        for features in dataset["data"].values()
    ]

    # Export the filtered data to a JSON file
    with open("data.json", "w") as file:
        json.dump(filtered_data, file, indent=4)
|
300 |
+
|
301 |
+
|
302 |
+
if __name__ == "__main__":
    # Script entry point: load API credentials from the environment, refresh
    # the cache from Billboard/Spotify/Genius, then export the trimmed
    # dataset to data.json.
    init()
    updateCache()
    exportData()
|