|
const axios = require('axios'); |
|
const cheerio = require('cheerio'); |
|
const fs = require('fs'); |
|
const path = require('path'); |
|
|
|
/**
 * Scrapes allmusic.com artist pages: starting from one artist's "related"
 * page it collects the related artists, follows their own "related" pages,
 * downloads each artist's biography and finally writes one JSON file per
 * artist into a timestamped directory next to this script.
 */
class MusicScraper {
  /**
   * @param {object} [options]
   * @param {number} [options.relatedNumberToGet=50] Maximum number of
   *   "related" pages that scrapeUrls() will request.
   */
  constructor({ relatedNumberToGet = 50 } = {}) {
    // Number of "related" pages requested so far by scrapeUrls().
    this.relatedFetched = 0;
    this.relatedNumberToGet = relatedNumberToGet;
    // "related" page URLs queued for scrapeUrls().
    this.urls = [];
    // One record per discovered artist (see scrapeRelatedData()).
    this.data = [];
    // Not used by this class; kept so existing readers of the field still work.
    this.fetchedUrls = [];

    // Fixed once at construction so every output file lands in the same folder.
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      // done() terminates the process, so the message must be printed first.
      console.log('Exiting the script...');
      this.done();
    });
  }

  /**
   * Downloads a URL with axios and returns the response body.
   *
   * @param {string} url Address to request.
   * @returns {Promise<*>} The `data` property of the axios response.
   * @throws {Error} Wrapping the underlying failure (available as `cause`).
   */
  async fetchData(url) {
    console.log(`Fetching ${url}`);
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extracts every artist listed under `.related.similars ul li` and appends
   * a record for it to `this.data`. Items without a link or without text are
   * skipped.
   *
   * @param {string} html Markup of a "related" page.
   */
  scrapeRelatedData(html) {
    const $ = cheerio.load(html);

    $('.related.similars ul li').each((_index, element) => {
      const listItem = $(element);
      const link = listItem.find('a').attr('href');
      const text = listItem.text().trim();

      if (!link || !text) {
        return;
      }

      this.data.push({
        link,
        text,
        relatedLink: `${link}/related`,
        biographyLink: `${link}/biography`,
        discographyLink: `${link}/discography`,
        discographies: [],
      });
    });
  }

  /**
   * Requests the queued "related" pages in order and scrapes each of them,
   * stopping once `relatedNumberToGet` pages have been requested. A page that
   * fails to download is logged, still counts towards the limit, and does not
   * stop the loop.
   *
   * @returns {Promise<void>}
   */
  async scrapeUrls() {
    for (const url of this.urls) {
      // Check the budget before counting so exactly `relatedNumberToGet`
      // pages are requested, and stop iterating once it is used up.
      if (this.relatedFetched >= this.relatedNumberToGet) {
        break;
      }
      this.relatedFetched++;

      try {
        const html = await this.fetchData(url);
        this.scrapeRelatedData(html);
      } catch (error) {
        console.error(`Error scraping ${url}: ${error.message}`);
      }
    }
  }

  /**
   * Normalizes scraped text: removes backslashes and double quotes, collapses
   * every whitespace run to a single space and trims the ends.
   *
   * @param {string} text Raw text; null/undefined is treated as empty.
   * @returns {string} The cleaned text.
   */
  cleanText(text) {
    const raw = text == null ? '' : String(text);
    // Strip the characters first so their removal cannot leave double spaces
    // or leading/trailing blanks behind.
    return raw.replace(/[\\"]/g, '').replace(/\s+/g, ' ').trim();
  }

  /**
   * Downloads the biography page of every record in `this.data` and stores
   * the cleaned text of `section.biography .text` on the record as
   * `biography`. Failures are logged per artist and do not stop the loop.
   *
   * @returns {Promise<void>}
   */
  async scrapeBiographies() {
    for (const item of this.data) {
      console.log(`Scraping biography for ${item.text}`);
      try {
        const html = await this.fetchData(item.biographyLink);
        const $ = cheerio.load(html);
        item.biography = this.cleanText($('section.biography .text').text());
      } catch (error) {
        console.error(`Error scraping ${item.biographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Downloads the discography page of every record in `this.data` and pushes
   * one `{ year }` entry per table row found under `.discography` onto the
   * record's `discographies` array (`year` is '' when the row has no `.year`
   * cell). Failures are logged per artist and do not stop the loop.
   * Note: run() does not call this method.
   *
   * @returns {Promise<void>}
   */
  async scrapeDiscographies() {
    for (const item of this.data) {
      console.log(`Scraping discographies for ${item.text}`);
      try {
        const html = await this.fetchData(item.discographyLink);
        const $ = cheerio.load(html);

        // The row must come from the callback argument: inside an arrow
        // function `this` is the scraper instance, not the current element.
        $('.discography').find('tr').each((_index, element) => {
          item.discographies.push({
            year: $(element).find('.year').text().trim(),
          });
        });
      } catch (error) {
        console.error(`Error scraping ${item.discographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Runs the whole crawl: scrape the seed page, follow the related pages,
   * fetch the biographies, then write the results and exit via done().
   * Errors are logged rather than rethrown.
   *
   * @param {string} [initialUrl] "related" page of the seed artist.
   * @returns {Promise<void>}
   */
  async run(initialUrl = 'https://www.allmusic.com/artist/johnny-osbourne-mn0000248916/related') {
    try {
      const initialHtml = await this.fetchData(initialUrl);
      this.scrapeRelatedData(initialHtml);

      this.urls = this.data.map((item) => item.relatedLink);

      await this.scrapeUrls();
      await this.scrapeBiographies();

      this.done();
    } catch (error) {
      console.error(error);
    }
  }

  /**
   * Builds a file-system friendly timestamp from the current time, e.g.
   * `2024-01-31T12-34-56` (ISO string with ':' replaced by '-' and the
   * fractional seconds / zone suffix removed).
   *
   * @returns {string}
   */
  getTimestamp() {
    return new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
  }

  /**
   * @returns {string} `<timestamp>_data`, used as the output directory name.
   */
  getTimestampedDirectoryName() {
    return `${this.getTimestamp()}_data`;
  }

  /**
   * @returns {string} `<timestamp>_data.txt`, used by writeDataToDisk().
   */
  getTimestampedFileName() {
    return `${this.getTimestamp()}_data.txt`;
  }

  /**
   * Writes all collected records as one pretty-printed JSON file next to
   * this script.
   */
  writeDataToDisk() {
    const jsonFileName = this.getTimestampedFileName();
    const jsonFilePath = path.join(__dirname, jsonFileName);
    const jsonData = JSON.stringify(this.data, null, 2);

    fs.writeFileSync(jsonFilePath, jsonData);
    console.log(`Data written to ${jsonFileName}`);
  }

  /**
   * Turns a scraped artist name into a single safe file-name component by
   * replacing path separators, characters that are invalid on common file
   * systems and control characters with '_'.
   *
   * @param {string} name Scraped artist name.
   * @returns {string} Sanitized name, or 'unknown' when nothing is left.
   */
  toSafeFileName(name) {
    const raw = name == null ? '' : String(name);
    const safe = raw.replace(/[\\/:*?"<>|\u0000-\u001f]/g, '_').trim();
    return safe === '' ? 'unknown' : safe;
  }

  /**
   * Writes one pretty-printed JSON file per record in `this.data` into the
   * timestamped directory next to this script, named `<artist name>.txt`.
   * Records whose sanitized names collide overwrite each other.
   *
   * @throws {Error} If the directory or a file cannot be written.
   */
  writeBiographyDataToDisk() {
    const directoryPath = path.join(__dirname, this.timestampedDirectoryName);
    // Synchronous on purpose: the directory has to exist before the
    // synchronous writes below run.
    fs.mkdirSync(directoryPath, { recursive: true });

    for (const item of this.data) {
      const jsonFileName = `${this.toSafeFileName(item.text)}.txt`;
      const jsonFilePath = path.join(directoryPath, jsonFileName);
      fs.writeFileSync(jsonFilePath, JSON.stringify(item, null, 2));
      console.log(`Data written to ${jsonFileName}`);
    }
  }

  /**
   * Prints the collected data, writes the per-artist files and terminates
   * the process (exit code 1 when writing failed, 0 otherwise).
   */
  done() {
    console.log(this.data);

    let exitCode = 0;
    try {
      this.writeBiographyDataToDisk();
    } catch (error) {
      console.error(`Error writing data to disk: ${error.message}`);
      exitCode = 1;
    }
    process.exit(exitCode);
  }
}
|
|
|
// Script entry point: build the scraper and start the crawl. run() catches
// and logs its own failures, so the returned promise is deliberately not
// awaited here.
const scraper = new MusicScraper();

void scraper.run();
|
|