// nk-test / scrapeAllMusic.js
// asylwan's picture
// Upload folder using huggingface_hub
// c51674d
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
/**
 * Crawls AllMusic "related artists" pages starting from a seed artist page,
 * collects each related artist's links and biography text into `this.data`,
 * and writes one JSON file per artist into a timestamped directory.
 *
 * Constructing an instance registers a SIGINT handler that flushes whatever
 * has been collected so far and then terminates the process.
 */
class MusicScraper {
  /**
   * @param {object} [options]
   * @param {number} [options.relatedNumberToGet=50] Maximum number of related
   *   pages that scrapeUrls() will request.
   * @param {string} [options.initialUrl] Seed "related" page run() starts from.
   * @param {string} [options.outputDir=__dirname] Directory under which all
   *   output files and the timestamped folder are created.
   */
  constructor({
    relatedNumberToGet = 50,
    initialUrl = 'https://www.allmusic.com/artist/johnny-osbourne-mn0000248916/related',
    outputDir = __dirname,
  } = {}) {
    this.relatedFetched = 0;
    this.relatedNumberToGet = relatedNumberToGet;
    this.initialUrl = initialUrl;
    this.outputDir = outputDir;
    this.urls = [];
    this.data = [];
    this.fetchedUrls = [];
    /* Create a timestamped folder name */
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      // Log first: done() terminates the process, so nothing placed after it runs.
      console.log('Exiting the script...');
      this.done();
    });
  }

  /**
   * Downloads a page and returns its body.
   *
   * @param {string} url Address to request.
   * @returns {Promise<*>} The response body as provided by axios.
   * @throws {Error} Wraps any request failure; the original error is kept as `cause`.
   */
  async fetchData(url) {
    console.log(`Fetching ${url}`);
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extracts every entry of the `.related.similars ul li` list from a page and
   * appends one record per entry to `this.data`. Entries without a link or
   * without text are skipped.
   *
   * @param {string} html Markup of a "related" page.
   */
  scrapeRelatedData(html) {
    const $ = cheerio.load(html);
    $('.related.similars ul li').each((index, element) => {
      const entry = $(element);
      const link = entry.find('a').attr('href');
      const text = entry.text().trim();
      if (!link || !text) {
        return;
      }
      this.data.push({
        link,
        text,
        relatedLink: `${link}/related`,
        biographyLink: `${link}/biography`,
        discographyLink: `${link}/discography`,
        discographies: [],
      });
    });
  }

  /**
   * Fetches the pages listed in `this.urls` and scrapes their related-artist
   * lists, stopping once `relatedNumberToGet` pages have been attempted.
   * A failing page is logged and still counts as an attempt.
   */
  async scrapeUrls() {
    for (const url of this.urls) {
      // Check the budget before counting so exactly relatedNumberToGet pages are requested.
      if (this.relatedFetched >= this.relatedNumberToGet) {
        break;
      }
      this.relatedFetched++;
      try {
        const html = await this.fetchData(url);
        this.scrapeRelatedData(html);
      } catch (error) {
        console.error(`Error scraping ${url}: ${error.message}`);
      }
    }
  }

  /**
   * Normalizes scraped text: removes double quotes and backslashes, then
   * collapses every run of whitespace into a single space and trims the ends.
   *
   * @param {string} text Raw text.
   * @returns {string} Cleaned text.
   */
  cleanText(text) {
    // Strip characters first so their removal cannot leave double or trailing spaces behind.
    return text.replace(/[\\"]/g, '').replace(/\s+/g, ' ').trim();
  }

  /**
   * Fetches the biography page of every record in `this.data` and stores the
   * cleaned text of `section.biography .text` on the record as `biography`.
   * Failures are logged; the affected record simply gets no `biography`.
   */
  async scrapeBiographies() {
    for (const item of this.data) {
      console.log('Scraping biography for ' + item.text);
      try {
        const html = await this.fetchData(item.biographyLink);
        const $ = cheerio.load(html);
        item.biography = this.cleanText($('section.biography .text').text());
      } catch (error) {
        console.error(`Error scraping ${item.biographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Fetches the discography page of every record in `this.data` and appends
   * one `{ year }` entry per table row found under `.discography`.
   * Failures are logged and the record keeps whatever was collected so far.
   */
  async scrapeDiscographies() {
    for (const item of this.data) {
      console.log('Scraping discographies for ' + item.text);
      try {
        const html = await this.fetchData(item.discographyLink);
        const $ = cheerio.load(html);
        const rows = $('.discography').find('tr');
        console.log(rows.text());
        // The row must come from the callback arguments: an arrow function has no own `this`.
        rows.each((rowIndex, element) => {
          item.discographies.push({
            year: $(element).find('.year').text().trim(),
          });
        });
      } catch (error) {
        console.error(`Error scraping ${item.discographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Full pipeline: scrape the seed page, follow the related pages it lists,
   * collect biographies, then write the results and exit via done().
   * Discography scraping exists (scrapeDiscographies) but is not part of the pipeline.
   * Any error that escapes the individual steps is logged and marks the
   * process exit code as failed.
   */
  async run() {
    try {
      const initialHtml = await this.fetchData(this.initialUrl);
      this.scrapeRelatedData(initialHtml);
      // Only the artists found on the seed page are followed; records added later are not.
      this.urls = this.data.map((item) => item.relatedLink);
      await this.scrapeUrls();
      await this.scrapeBiographies();
      this.done();
    } catch (error) {
      console.error(error);
      process.exitCode = 1;
    }
  }

  /**
   * @returns {string} Current UTC time as `YYYY-MM-DDTHH-MM-SS` (colons replaced
   *   so the value is usable in file names, fractional seconds dropped).
   */
  buildTimestamp() {
    return new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
  }

  /** @returns {string} Directory name of the form `<timestamp>_data`. */
  getTimestampedDirectoryName() {
    return `${this.buildTimestamp()}_data`;
  }

  /** @returns {string} File name of the form `<timestamp>_data.txt`. */
  getTimestampedFileName() {
    return `${this.buildTimestamp()}_data.txt`;
  }

  /**
   * Turns arbitrary text into a single safe file-name component: whitespace is
   * collapsed, path separators / reserved / control characters become `_`.
   *
   * @param {string} name Text to convert (e.g. an artist name such as "AC/DC").
   * @returns {string} Safe name, or `unnamed` when nothing usable remains.
   */
  toSafeFileName(name) {
    const safe = String(name)
      .replace(/\s+/g, ' ')
      .replace(/[<>:"/\\|?*\u0000-\u001f]/g, '_')
      .trim();
    return safe === '' ? 'unnamed' : safe;
  }

  /**
   * Writes all collected records as one pretty-printed JSON document into a
   * timestamped file inside the output directory.
   */
  writeDataToDisk() {
    const jsonFileName = this.getTimestampedFileName();
    const jsonFilePath = path.join(this.outputDir, jsonFileName);
    const jsonData = JSON.stringify(this.data, null, 2); // Pretty-print JSON
    fs.writeFileSync(jsonFilePath, jsonData);
    console.log(`Data written to ${jsonFileName}`);
  }

  /**
   * Writes every collected record to its own pretty-printed JSON file, named
   * after the record's text, inside the timestamped directory. Records whose
   * names map to the same file name overwrite each other.
   *
   * Everything is synchronous on purpose: done() exits the process right
   * after this call, so asynchronous work would never complete.
   *
   * @throws {Error} If the directory or a file cannot be written.
   */
  writeBiographyDataToDisk() {
    const directoryPath = path.join(this.outputDir, this.timestampedDirectoryName);
    fs.mkdirSync(directoryPath, { recursive: true });
    console.log('Folder created successfully');

    for (const item of this.data) {
      const jsonFileName = `${this.toSafeFileName(item.text)}.txt`;
      const jsonFilePath = path.join(directoryPath, jsonFileName);
      fs.writeFileSync(jsonFilePath, JSON.stringify(item, null, 2));
      console.log(`Data written to ${jsonFileName}`);
    }
  }

  /**
   * Prints the collected data, writes it to disk and terminates the process.
   * A write failure is logged and reported through a non-zero exit code.
   */
  done() {
    console.log(this.data);
    try {
      this.writeBiographyDataToDisk();
    } catch (error) {
      console.error(`Error writing data: ${error.message}`);
      process.exitCode = 1;
    } finally {
      process.exit();
    }
  }
}
// Script entry point: build a scraper and start the crawl.
// The promise returned by run() is intentionally not awaited; run() wraps its
// whole body in a try/catch and logs any error itself.
new MusicScraper().run();