// File: helper/index.js
// Page residue from the hosting site: "vyles's picture" (avatar alt text),
// commit "Update index.js" (daa7349, verified).
import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
import bodyParser from 'body-parser'
// Express application instance and the port it listens on (overridable via PORT).
const app = express();
const PORT = process.env.PORT || 7860;

// Pretty-print JSON responses with two-space indentation.
app.set('json spaces', 2);

// CORS is registered first so that error responses produced by the body
// parsers below (malformed JSON, payload too large) also carry CORS headers.
app.use(cors());

app.use(bodyParser.urlencoded({ extended: true }));

// Exactly one JSON parser. The original registered `bodyParser.json()` and then
// `express.json({ limit: '500mb' })`; the first parser always handled the body,
// so its default 100kb limit was the one in force and the 500mb setting never
// applied. The effective limit is now stated explicitly.
// NOTE(review): raise this deliberately if large payloads are really needed;
// the only JSON endpoint accepts at most 10 URLs.
app.use(express.json({ limit: '100kb' }));
// Resource types that are never needed for text extraction.
const BLOCKED_RESOURCE_TYPES = ['font', 'media', 'websocket'];

// URL substrings identifying tracker / social-widget requests to drop.
const BLOCKED_URL_FRAGMENTS = [
  'google-analytics',
  'doubleclick',
  'facebook',
  'twitter',
  'analytics'
];

/**
 * Tells whether a request is the top-level navigation of `page`.
 *
 * `request.frame()` can throw when the frame is not available yet, so the
 * lookup is guarded and treated as "not the main frame" in that case.
 *
 * @param {object} request - Playwright request being routed.
 * @param {object} page - Playwright page that owns the route.
 * @returns {boolean} True for a navigation request issued by the main frame.
 */
function isMainFrameNavigation(request, page) {
  if (!request.isNavigationRequest()) {
    return false;
  }
  try {
    return request.frame() === page.mainFrame();
  } catch {
    return false;
  }
}

/**
 * Decides whether an intercepted request should be aborted.
 *
 * @param {object} request - Playwright request being routed.
 * @param {object} page - Playwright page that owns the route.
 * @returns {boolean} True when the request should be blocked.
 */
function shouldBlockRequest(request, page) {
  // The page being extracted must always load, even when its own URL contains
  // one of the blocked fragments (e.g. a twitter.com or facebook.com page).
  if (isMainFrameNavigation(request, page)) {
    return false;
  }
  if (BLOCKED_RESOURCE_TYPES.includes(request.resourceType())) {
    return true;
  }
  const requestUrl = request.url();
  return BLOCKED_URL_FRAGMENTS.some((fragment) => requestUrl.includes(fragment));
}

/**
 * Route handler: aborts blocked requests and lets everything else through.
 * The abort/continue promises are awaited so a failure is logged here instead
 * of surfacing as an unhandled promise rejection.
 *
 * @param {object} route - Playwright route.
 * @param {object} page - Playwright page that owns the route.
 * @returns {Promise<void>}
 */
async function filterRequest(route, page) {
  try {
    if (shouldBlockRequest(route.request(), page)) {
      await route.abort();
    } else {
      await route.continue();
    }
  } catch (routeError) {
    console.log(`Route handling failed: ${routeError.message}`);
  }
}

/**
 * Runs inside the browser page (passed to `page.evaluate`, so it must not
 * reference anything from the Node scope). Scrolls down in 100px steps every
 * 100ms, up to 30 steps or until the document height is covered, then returns
 * to the top.
 *
 * @returns {Promise<void>}
 */
function scrollPageInBrowser() {
  return new Promise((resolve) => {
    const distance = 100;
    const maxScrolls = 30;
    let totalHeight = 0;
    let scrollCount = 0;
    const timer = setInterval(() => {
      // A document without <body> would otherwise throw on every tick and
      // leave this promise pending forever.
      const scrollHeight = document.body ? document.body.scrollHeight : 0;
      window.scrollBy(0, distance);
      totalHeight += distance;
      scrollCount++;
      if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
        clearInterval(timer);
        window.scrollTo(0, 0);
        resolve();
      }
    }, 100);
  });
}

/**
 * Runs inside the browser page (passed to `page.evaluate`, so it must not
 * reference anything from the Node scope). Collects title, meta description,
 * headings, paragraphs and the main text of the document.
 *
 * @returns {{title: string, metaDescription: string,
 *   headings: {h1: string[], h2: string[]}, paragraphs: string[],
 *   mainText: string, wordCount: number, hasContent: boolean}}
 */
function extractContentInBrowser() {
  const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');

  // Text of an element with script/style/noscript/iframe subtrees removed.
  const getTextContent = (element) => {
    if (!element) return '';
    const clone = element.cloneNode(true);
    clone.querySelectorAll('script, style, noscript, iframe').forEach((el) => el.remove());
    return cleanText(clone.textContent || clone.innerText || '');
  };

  const textsOf = (selector) =>
    Array.from(document.querySelectorAll(selector)).map((el) => getTextContent(el));

  const title = document.title || '';
  const metaDescription =
    document.querySelector('meta[name="description"]')?.content ||
    document.querySelector('meta[property="og:description"]')?.content ||
    '';
  const h1Elements = textsOf('h1').filter((text) => text.length > 0);
  const h2Elements = textsOf('h2').filter((text) => text.length > 0);
  const paragraphs = textsOf('p').filter((text) => text.length > 20);

  const contentSelectors = [
    'main',
    'article',
    '[role="main"]',
    '.content',
    '#content',
    '.post-content',
    '.entry-content',
    '.article-content',
    '.page-content',
    '.main-content',
    '[itemprop="articleBody"]',
    '.story-body',
    '.article-body',
    '.detail__body-text',
    '.detail__body',
    '.itp_bodycontent'
  ];

  // Take the first container holding more than 100 characters of text. When
  // none qualifies, use the whole body. The original kept whatever the LAST
  // selector matched, so a short trailing match could hide the rest of the
  // page. Its "longest div" fallback is dropped: body text already contains
  // every div's text, so that fallback can never be longer than the body.
  let mainText = '';
  for (const selector of contentSelectors) {
    const candidateText = getTextContent(document.querySelector(selector));
    if (candidateText.length > 100) {
      mainText = candidateText;
      break;
    }
  }
  if (!mainText) {
    mainText = getTextContent(document.body);
  }

  const allText = mainText || paragraphs.join(' ') || document.body?.innerText || '';
  const wordCount = allText.split(/\s+/).filter((word) => word.length > 0).length;

  return {
    title,
    metaDescription,
    headings: {
      h1: h1Elements,
      h2: h2Elements
    },
    paragraphs,
    mainText,
    wordCount,
    hasContent: wordCount > 50
  };
}

/**
 * Loads `url` in the given Playwright page and extracts its textual content.
 *
 * Fonts, media, websockets and tracker/social requests are blocked while the
 * page loads; the top-level navigation itself is never blocked. All failures
 * are caught and reported in the returned object, so this function resolves
 * rather than rejects.
 *
 * @param {string} url - Address of the page to extract.
 * @param {object} page - Playwright page to drive; the caller owns its lifecycle.
 * @returns {Promise<{url: string, success: boolean, content?: object,
 *   error?: string, extractedAt: string}>} `content` on success, `error` on failure.
 */
async function extractContentFromUrl(url, page) {
  try {
    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);
    await page.route('**/*', (route) => filterRequest(route, page));

    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 30000
      });
    } catch (navigationError) {
      console.log(`Navigation error for ${url}, trying with load event`);
      await page.goto(url, {
        waitUntil: 'load',
        timeout: 30000
      });
    }

    try {
      await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
    } catch (e) {
      console.log('Body selector timeout, continuing anyway');
    }

    // Give client-side rendering a moment before probing for content.
    await page.waitForTimeout(3000);

    const contentSelectors = ['article', 'main', '.content', '#content'];
    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 5000 });
        break;
      } catch (e) {
        // Best effort: this container did not show up in time, try the next one.
      }
    }

    // Scroll through the page, presumably to trigger lazy-loaded content.
    await page.evaluate(scrollPageInBrowser);
    await page.waitForTimeout(1000);

    const content = await page.evaluate(extractContentInBrowser);
    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    console.error(`Error extracting ${url}:`, error);
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  }
}
/**
 * Tells whether `value` is a string holding an absolute http(s) URL.
 * Other schemes (file:, chrome:, data:, javascript:, ...) are rejected because
 * the value is handed straight to a browser navigation.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean}
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Body: `{ urls: string[] }` with 1 to 10 absolute http(s) URLs.
 * Responds 400 on invalid input, 500 when the browser cannot be driven, and
 * otherwise 200 with per-URL results plus summary statistics.
 *
 * NOTE(review): http(s) URLs pointing at internal/private addresses are still
 * accepted; add host filtering if this service is reachable by untrusted
 * clients.
 */
app.post('/extract-content', async (req, res) => {
  // Guard against a missing body so the destructuring cannot throw inside
  // this async handler.
  const { urls } = req.body ?? {};

  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls'
    });
  }
  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong'
    });
  }
  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request'
    });
  }

  const invalidUrls = urls.filter((url) => !isHttpUrl(url));
  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls
    });
  }
  // Every entry passed validation at this point.
  const validUrls = urls;

  let browser;
  try {
    browser = await chromium.launch({
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process'
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    const results = [];
    // Process URLs sequentially to avoid browser crashes
    for (const url of validUrls) {
      let context;
      try {
        // Context and page creation live inside the try so that a failure for
        // one URL is recorded instead of aborting the whole batch.
        context = await browser.newContext({
          userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
          viewport: { width: 1920, height: 1080 },
          locale: 'en-US',
          timezoneId: 'America/New_York'
        });
        const page = await context.newPage();
        results.push(await extractContentFromUrl(url, page));
      } catch (error) {
        console.error(`Error processing ${url}:`, error);
        results.push({
          url,
          success: false,
          error: error.message,
          extractedAt: new Date().toISOString()
        });
      } finally {
        if (context) {
          // A failed close must not discard the results gathered so far.
          await context.close().catch((closeError) => {
            console.error(`Error closing context for ${url}:`, closeError);
          });
        }
      }
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;
    const emptyContentCount = results.filter(
      (r) => r.success && (!r.content.hasContent || r.content.wordCount < 50)
    ).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount
      },
      results
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message
    });
  } finally {
    if (browser) {
      // The response has already been sent here; a rejection would be an
      // unhandled promise rejection, so log it instead.
      await browser.close().catch((closeError) => {
        console.error('Error closing browser:', closeError);
      });
    }
  }
});
// GET /health: liveness probe reporting that the API is up, with the current time.
app.get('/health', (_req, res) => {
  const status = {
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString()
  };
  res.json(status);
});
// GET /: self-describing index listing the available endpoints and a sample request.
app.get('/', (_req, res) => {
  const endpoints = {
    'POST /extract-content': 'Extract content from URLs',
    'GET /health': 'Health check',
    'GET /': 'API information'
  };
  const usage = {
    method: 'POST',
    endpoint: '/extract-content',
    body: {
      urls: ['https://example.com', 'https://another-site.com']
    }
  };
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints,
    usage
  });
});
/**
 * Central Express error handler (the four-argument signature is what marks it
 * as error-handling middleware).
 *
 * - If the response has already started, the error is delegated to Express's
 *   default handler, since the status and body can no longer be changed.
 * - Errors carrying a 4xx `status`/`statusCode` (e.g. malformed JSON or an
 *   oversized payload reported by the body parser) keep that status instead
 *   of being reported as a server fault.
 * - Everything else is a 500; the message is only exposed in development.
 */
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  if (res.headersSent) {
    return next(err);
  }

  const reportedStatus = err?.status ?? err?.statusCode;
  const isClientError =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 500;
  const exposeMessage = isClientError || process.env.NODE_ENV === 'development';

  res.status(isClientError ? reportedStatus : 500).json({
    success: false,
    message: isClientError ? 'Invalid request' : 'Internal server error',
    error: exposeMessage ? err.message : 'Something went wrong'
  });
});
// Catch-all for requests that matched no route above: reply 404 with a JSON body.
app.use((_req, res) => {
  const notFound = {
    success: false,
    message: 'Endpoint not found'
  };
  res.status(404).json(notFound);
});
// Start the HTTP server and print where the API can be reached.
// The emoji in these messages were mis-encoded (mojibake) and are restored here.
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});