|
const axios = require('axios'); |
|
const cheerio = require('cheerio'); |
|
const fs = require('fs'); |
|
const path = require('path'); |
|
|
|
/**
 * Data holder for a single artist.
 *
 * Every instance starts with empty defaults for `name`, `url`, `biography`
 * and `discography`; own enumerable properties of `data` are then copied on
 * top, so `data` may override defaults or add extra fields.
 */
class Artist {
  /**
   * @param {Object} [data] - Optional initial values. Only own enumerable
   *   properties are copied; inherited properties are ignored.
   */
  constructor(data) {
    this.name = '';
    this.url = '';
    this.biography = '';
    this.discography = [];

    if (data) {
      // Object.keys yields own enumerable keys only, and unlike calling
      // data.hasOwnProperty() it also works for null-prototype objects and
      // for data that happens to carry its own `hasOwnProperty` field.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
|
|
|
/**
 * Data holder for a single record (album) entry.
 *
 * Every instance starts with empty defaults (`rating` is 0, `styles` and
 * `tracks` are empty arrays, all other fields are empty strings); own
 * enumerable properties of `data` are then copied on top.
 */
class Record {
  /**
   * @param {Object} [data] - Optional initial values. Only own enumerable
   *   properties are copied; inherited properties are ignored.
   */
  constructor(data) {
    this.artist = '';
    this.title = '';
    this.label = '';
    this.url = '';
    this.rating = 0;
    this.year = '';
    this.genre = '';
    this.text = '';
    this.styles = [];
    this.tracks = [];

    if (data) {
      // Object.keys yields own enumerable keys only, and unlike calling
      // data.hasOwnProperty() it also works for null-prototype objects and
      // for data that happens to carry its own `hasOwnProperty` field.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
|
|
|
/**
 * Scrapes artist biographies, discographies and album reviews starting from
 * one artist page and continuing through that artist's related artists, then
 * writes the results as text files into a timestamped directory.
 *
 * Relies on the file-level `axios`, `cheerio`, `fs` and `path` bindings and on
 * the `Artist` and `Record` classes declared in this file.
 */
class MusicScraper {
  /**
   * Sets up empty state, fixes the output directory name for this run and
   * installs a SIGINT handler that flushes collected data before exiting.
   */
  constructor() {
    // Maximum number of artists scraped in one run.
    this.artistsToGet = 5;
    // Names of artists whose scrape has been started; used to skip duplicates.
    this.artistsFetched = [];
    this.relatedArtists = [];
    this.artists = [];

    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      console.log('Exiting the script...');
      // done() writes whatever was collected so far and exits the process.
      this.done();
    });
  }

  /**
   * Builds a directory name from the current UTC time, e.g.
   * `2024-01-31T12-30-05_data`. Colons are replaced because they are not
   * valid in file names on every platform; fractional seconds are dropped.
   *
   * @returns {string} The directory name.
   */
  getTimestampedDirectoryName() {
    const now = new Date();
    const timestamp = now.toISOString().replace(/:/g, '-').replace(/\..+/, '');
    return `${timestamp}_data`;
  }

  /**
   * Downloads a URL and returns the response body.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<*|false>} The response data, or `false` when `url` is
   *   empty or missing.
   * @throws {Error} When the request fails; the underlying error is kept on
   *   the `cause` property.
   */
  async fetchData(url) {
    if (!url) {
      console.log(`Can't fetch URL since it's null`);
      return false;
    }
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      const wrapped = new Error(`Error fetching data: ${error.message}`);
      // Keep the original error reachable for debugging.
      wrapped.cause = error;
      throw wrapped;
    }
  }

  /**
   * Joins a base URL and a sub-page name with exactly one slash.
   *
   * @param {string} baseUrl - Artist page URL, with or without trailing slash.
   * @param {string} type - Sub-page name such as `biography`.
   * @returns {string} The combined URL.
   */
  generateUrl(baseUrl, type) {
    const trimmedBase = String(baseUrl).replace(/\/+$/, '');
    return `${trimmedBase}/${type}`;
  }

  /**
   * Collapses whitespace runs to single spaces, trims the ends and removes
   * backslashes and double quotes.
   *
   * @param {string} text - Raw text; `null`/`undefined` is treated as empty.
   * @returns {string} The cleaned text.
   */
  cleanText(text) {
    const source = text == null ? '' : String(text);
    return source
      .replace(/\s+/g, ' ')
      .trim()
      .replace(/[\\"]/g, '');
  }

  /**
   * Fetches the `related` sub-page of an artist and extracts the artists
   * listed there.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Artist[]|false>} Related artists, or `false` on error.
   */
  async getRelatedArtists(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'related'));
      return this.scrapeRelatedArtistData(html);
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches the `biography` sub-page of an artist and returns its cleaned text.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<string|false>} Biography text, or `false` on error.
   */
  async getArtistBiography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'biography'));
      const $ = cheerio.load(html);
      return this.cleanText($('section.biography .text').text());
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches the `discography` sub-page of an artist and builds one `Record`
   * (year, title, label, link) per table row.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Record[]|false>} Basic records, or `false` on error.
   */
  async getArtistDiscography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'discography'));
      const $ = cheerio.load(html);
      const records = [];
      $('.discography table tbody tr').each((index, row) => {
        const $row = $(row);
        records.push(
          new Record({
            artist: artist.name || '',
            year: $row.find('.year').text().trim(),
            title: $row.find('.title').text().trim(),
            label: $row.find('.label').text().trim(),
            url: $row.find('.title a').attr('href'),
          })
        );
      });
      return records;
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches a record's own page and returns a new `Record` carrying the basic
   * fields of `record` plus rating, genre, review text and track titles.
   *
   * @param {Record} record - Basic record; `url` points at the record page.
   * @returns {Promise<Record|false>} The detailed record, or `false` when the
   *   record has no URL or the page could not be fetched or parsed.
   */
  async getSingleRecordData(record) {
    try {
      console.log(`Getting record ${record.title}`);
      const html = await this.fetchData(record.url);
      if (html === false) {
        // No URL to fetch (fetchData already logged it); nothing to parse.
        return false;
      }
      const $ = cheerio.load(html);
      const recordData = {
        artist: record.artist || '',
        title: record.title,
        year: record.year,
        label: record.label,
        url: record.url,
        rating: this.cleanText($('.allmusic-rating').text()),
        genre: this.cleanText($('.basic-info .genre div a').text()),
        text: this.cleanText($('section.review .text').text()),
        tracks: [],
      };
      $('.track-listing table tbody tr').each((index, row) => {
        recordData.tracks.push($(row).find('.title a').text().trim());
      });
      return new Record(recordData);
    } catch (error) {
      console.error(`Error scraping ${record.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Extracts related artists from the HTML of a `related` page. List items
   * lacking either a name or a link are skipped.
   *
   * @param {string} html - Page markup.
   * @returns {Artist[]} The related artists found.
   */
  scrapeRelatedArtistData(html) {
    const $ = cheerio.load(html);
    const relatedArtists = [];

    $('.related.similars ul li').each((index, element) => {
      const artist = new Artist({
        name: $(element).text().trim(),
        url: $(element).find('a').attr('href'),
      });

      if (artist.name && artist.url) {
        relatedArtists.push(artist);
      }
    });
    return relatedArtists;
  }

  /**
   * Runs a full scrape: the initial artist first, then its related artists
   * until the list is exhausted or `artistsToGet` is reached, and finally
   * hands over to `done()`, which writes the output and exits the process.
   *
   * @param {Object} [initialArtistData] - `name` and `url` of the artist to
   *   start from; defaults to the artist this script was written for.
   * @returns {Promise<void>}
   */
  async run(
    initialArtistData = {
      name: 'The Abyssinians',
      url: 'https://www.allmusic.com/artist/the-abyssinians-mn0000588943',
    }
  ) {
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log('------------------------');

    const initialArtist = new Artist(initialArtistData);
    const initialData = await this.getArtistData(initialArtist);
    if (initialData) {
      this.artists.push(initialData);
    }

    const related = await this.getRelatedArtists(initialArtist);
    if (related) {
      this.relatedArtists = this.relatedArtists.concat(related);
    }

    for (const artist of this.relatedArtists) {
      if (this.hasReachedArtistLimit()) {
        console.log(`Reached the limit of artists to fetch.`);
        break;
      }
      const data = await this.getArtistData(artist);
      // A falsy result means this artist was skipped (invalid or duplicate);
      // carry on with the next one instead of ending the whole run.
      if (data) {
        this.artists.push(data);
      }
    }

    this.done();
  }

  /**
   * @returns {boolean} `true` once `artistsToGet` artists have been started.
   */
  hasReachedArtistLimit() {
    return this.artistsFetched.length >= this.artistsToGet;
  }

  /**
   * Scrapes biography and discography for one artist and stores them on the
   * passed object. When the detail page of a record cannot be scraped, the
   * basic discography row is kept so the album still appears in the output.
   *
   * @param {Artist} artist - Artist with at least `name` and `url` set; it is
   *   updated in place.
   * @returns {Promise<Artist|false>} The same artist, or `false` when it is
   *   invalid, was already fetched, or the artist limit has been reached.
   */
  async getArtistData(artist) {
    if (!artist || !artist.name || !artist.url) {
      return false;
    }

    if (this.artistsFetched.includes(artist.name)) {
      console.log(`Artist already fetched.`);
      return false;
    }

    if (this.hasReachedArtistLimit()) {
      console.log(`Reached the limit of artists to fetch.`);
      return false;
    }

    this.artistsFetched.push(artist.name);
    const biography = await this.getArtistBiography(artist);
    const discography = await this.getArtistDiscography(artist);

    if (biography) {
      artist.biography = biography;
    }

    if (discography) {
      const records = [];
      for (const record of discography) {
        console.log(record);
        const fullRecord = await this.getSingleRecordData(record);
        records.push(fullRecord || record);
      }
      artist.discography = records;
    }

    return artist;
  }

  /**
   * Logs the collected data, writes it to disk and terminates the process.
   * Exits with code 0 on success and 1 when writing fails.
   */
  done() {
    console.log(this.artists);
    if (this.artists.length > 0) {
      console.log(this.artists[0].discography);
    }

    let exitCode = 0;
    try {
      this.writeToDisk();
    } catch (error) {
      console.error(`Error writing data to disk: ${error.message}`);
      exitCode = 1;
    }
    process.exit(exitCode);
  }

  /**
   * Removes every character that is not an ASCII letter, a digit, an en dash
   * or a space. Used to build safe file names; note that accented and other
   * non-ASCII letters are removed as well.
   *
   * @param {string} string - Text to sanitise.
   * @returns {string} The sanitised text.
   */
  removeSpecialCharacters(string) {
    return string.replace(/[^a-zA-Z0-9– ]/g, '');
  }

  /**
   * @param {Artist} artist - Artist to describe.
   * @returns {string} Contents of the artist's biography file.
   */
  buildBiographyText(artist) {
    return `${artist.name} biography\n${artist.biography}`;
  }

  /**
   * @param {Artist} artist - Artist the record belongs to.
   * @param {Record} record - Record to describe.
   * @returns {string} Contents of the record's review file. The rating line
   *   and the review paragraph are only present when the record has them.
   */
  buildReviewText(artist, record) {
    const lines = [
      `Review of ${record.title} by ${artist.name}`,
      `Artist: ${artist.name}`,
      `Album title: ${record.title}`,
      `Release year: ${record.year}`,
      `Label: ${record.label}`,
      `Genre: ${record.genre}`,
    ];
    if (record.rating) {
      lines.push(`Rating: ${record.rating} out of 10`);
    }
    lines.push('', 'Track listing:');
    for (const track of record.tracks || []) {
      lines.push(`${track}`);
    }
    if (record.text) {
      lines.push('', `Review: ${record.text}`);
    }
    return `${lines.join('\n')}\n`;
  }

  /**
   * Writes one biography file per artist and one review file per record into
   * `<script directory>/<timestampedDirectoryName>`.
   *
   * The directory is created synchronously and under the same base path the
   * files are written to, so the writes cannot run before it exists.
   *
   * @throws {Error} When the directory or a file cannot be written.
   */
  writeToDisk() {
    const outputDir = path.join(__dirname, this.timestampedDirectoryName);
    fs.mkdirSync(outputDir, { recursive: true });
    console.log('Folder created successfully');

    for (const artist of this.artists) {
      // Sanitise the name so characters such as "/" cannot alter the path.
      const bioFileName = `${this.removeSpecialCharacters(artist.name)} biography.txt`;
      fs.writeFileSync(path.join(outputDir, bioFileName), this.buildBiographyText(artist));
      console.log(`Data written to ${bioFileName}`);

      for (const record of artist.discography || []) {
        if (!record) {
          continue;
        }
        const artistAndTitle = this.removeSpecialCharacters(`${artist.name} – ${record.title}`);
        const reviewFileName = `${artistAndTitle} review.txt`;
        fs.writeFileSync(path.join(outputDir, reviewFileName), this.buildReviewText(artist, record));
        console.log(`Data written to ${reviewFileName}`);
      }
    }
  }
}
|
|
|
// Entry point: create the scraper and start a run. run() returns a promise,
// so a rejection handler is attached instead of leaving the promise floating;
// a failed run is reported and reflected in the process exit code.
const scraper = new MusicScraper();

scraper.run().catch((error) => {
  console.error(`Scraper run failed: ${error && error.message ? error.message : error}`);
  process.exitCode = 1;
});
|
|