|
import express from 'express'; |
|
import { chromium } from 'playwright'; |
|
import cors from 'cors'; |
|
import bodyParser from 'body-parser' |
|
|
|
// Express application shared by every route and middleware in this file.
const app = express();

// Listening port; falls back to 7860 when PORT is unset or empty.
const PORT = process.env.PORT || 7860;

// Pretty-print JSON responses with two-space indentation.
app.set('json spaces', 2);

// CORS is registered first so that preflight requests and error responses
// produced by the body parsers below still carry the CORS headers.
app.use(cors());

app.use(bodyParser.urlencoded({ extended: true }));

// Register exactly one JSON parser. A second JSON parser stacked behind
// another never sees the body (the first one consumes it or rejects it with
// its own size limit), so its `limit` option would be silently ignored.
// NOTE(review): 500mb is far more than a list of URLs needs — consider
// lowering it to reduce memory pressure from oversized requests.
app.use(express.json({ limit: '500mb' }));
|
|
|
/**
 * Navigates `page` to `url` and extracts the page's readable content.
 *
 * Fonts, media, websockets and requests whose URL contains a known tracker
 * fragment are aborted. Failures are caught and reported through the returned
 * object instead of being thrown.
 *
 * @param {string} url - Address of the page to load.
 * @param {object} page - Playwright page used for the navigation. The caller
 *   owns it and is responsible for closing it.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, or `{ url, success: false, error, extractedAt }` on failure.
 *   `content` holds `title`, `metaDescription`, `headings` (`h1`, `h2`),
 *   `paragraphs`, `mainText`, `wordCount` and `hasContent`.
 */
async function extractContentFromUrl(url, page) {
  const blockedResourceTypes = ['font', 'media', 'websocket'];
  const blockedUrlFragments = [
    'google-analytics',
    'doubleclick',
    'facebook',
    'twitter',
    'analytics',
  ];

  try {
    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);

    await page.route('**/*', async (route) => {
      const request = route.request();
      const requestUrl = request.url();

      // The blocklist is substring based, so the page we were asked to load
      // may itself match it (e.g. a twitter.com or ".../analytics-guide"
      // URL). Never abort the main-frame navigation, otherwise such pages
      // can never be extracted.
      const isMainNavigation =
        request.isNavigationRequest() && request.frame() === page.mainFrame();

      const isBlocked =
        !isMainNavigation &&
        (blockedResourceTypes.includes(request.resourceType()) ||
          blockedUrlFragments.some((fragment) => requestUrl.includes(fragment)));

      try {
        if (isBlocked) {
          await route.abort();
        } else {
          await route.continue();
        }
      } catch (routeError) {
        // The request can disappear while we decide (page closed, navigation
        // cancelled). Routing is best-effort, so report it and move on rather
        // than leaving a rejected promise unhandled.
        console.log(`Route handling failed for ${requestUrl}: ${routeError.message}`);
      }
    });

    // First attempt waits for DOMContentLoaded; if that fails, retry once
    // waiting for the full load event. A failure of the retry propagates to
    // the outer catch.
    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 30000,
      });
    } catch (navigationError) {
      console.log(`Navigation error for ${url}, trying with load event`);
      await page.goto(url, {
        waitUntil: 'load',
        timeout: 30000,
      });
    }

    try {
      await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
    } catch (e) {
      console.log('Body selector timeout, continuing anyway');
    }

    await page.waitForTimeout(3000);

    // Wait (up to 5s each) for the first common content container to show up.
    const contentSelectors = ['article', 'main', '.content', '#content'];
    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 5000 });
        break;
      } catch (e) {
        // Selector did not appear in time; try the next candidate.
      }
    }

    // Scroll down in 100px steps every 100ms (at most 30 steps), then jump
    // back to the top — presumably to trigger lazy-loaded content.
    await page.evaluate(() => {
      return new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 100;
        let scrollCount = 0;
        const maxScrolls = 30;

        const timer = setInterval(() => {
          // A document without <body> must not throw here: an exception
          // inside the interval would leave this promise pending forever.
          const scrollHeight = document.body?.scrollHeight ?? 0;
          window.scrollBy(0, distance);
          totalHeight += distance;
          scrollCount++;

          if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
            clearInterval(timer);
            window.scrollTo(0, 0);
            resolve();
          }
        }, 100);
      });
    });

    await page.waitForTimeout(1000);

    // Runs inside the browser: everything it needs must be defined in the
    // callback itself.
    const content = await page.evaluate(() => {
      // Collapse whitespace runs into single spaces and trim.
      const cleanText = (text) => {
        return text ? text.replace(/\s+/g, ' ').trim() : '';
      };

      // Text of an element with script/style/noscript/iframe nodes removed.
      // Works on a clone so the live DOM is left untouched.
      const getTextContent = (element) => {
        if (!element) return '';

        const clone = element.cloneNode(true);
        clone
          .querySelectorAll('script, style, noscript, iframe')
          .forEach((el) => el.remove());

        return cleanText(clone.textContent || clone.innerText || '');
      };

      const textsOf = (selector) =>
        Array.from(document.querySelectorAll(selector)).map((el) => getTextContent(el));

      const title = document.title || '';
      const metaDescription =
        document.querySelector('meta[name="description"]')?.content ||
        document.querySelector('meta[property="og:description"]')?.content ||
        '';

      const h1Elements = textsOf('h1').filter((text) => text.length > 0);
      const h2Elements = textsOf('h2').filter((text) => text.length > 0);
      const paragraphs = textsOf('p').filter((text) => text.length > 20);

      const contentSelectors = [
        'main',
        'article',
        '[role="main"]',
        '.content',
        '#content',
        '.post-content',
        '.entry-content',
        '.article-content',
        '.page-content',
        '.main-content',
        '[itemprop="articleBody"]',
        '.story-body',
        '.article-body',
        '.detail__body-text',
        '.detail__body',
        '.itp_bodycontent',
      ];

      // Pick the first container holding more than 100 characters of text.
      // A container that matches but is too short must not be kept: only a
      // qualifying candidate is accepted, otherwise we fall back to <body>.
      let mainContent = null;
      for (const selector of contentSelectors) {
        const candidate = document.querySelector(selector);
        if (candidate && getTextContent(candidate).length > 100) {
          mainContent = candidate;
          break;
        }
      }

      // <body> contains the text of every element in it, so once we fall back
      // to it no further "largest <div>" search can find anything longer.
      const finalMainText = getTextContent(mainContent || document.body);

      const allText =
        finalMainText || paragraphs.join(' ') || document.body?.innerText || '';
      const wordCount = allText.split(/\s+/).filter((word) => word.length > 0).length;

      return {
        title,
        metaDescription,
        headings: {
          h1: h1Elements,
          h2: h2Elements,
        },
        paragraphs,
        mainText: finalMainText,
        wordCount,
        hasContent: wordCount > 50,
      };
    });

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    console.error(`Error extracting ${url}:`, error);
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  }
}
|
|
|
/**
 * Returns true when `value` is a string that parses as an absolute
 * http: or https: URL.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') return false;
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Body: `{ urls: string[] }` with 1 to 10 absolute http(s) URLs.
 * Launches one Chromium instance per request, processes the URLs one after
 * another (each in its own browser context) and answers with per-URL results
 * plus summary statistics. Validation problems are answered with 400, an
 * unexpected failure with 500.
 */
app.post('/extract-content', async (req, res) => {
  // req.body is undefined when no body parser handled the request.
  const { urls } = req.body ?? {};

  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls',
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong',
    });
  }

  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request',
    });
  }

  // Only http(s) is accepted: the URLs are opened in a real browser on this
  // machine, so schemes such as file: or chrome: would expose local
  // resources to the caller.
  // NOTE(review): hosts are not restricted, so http(s) URLs pointing at
  // localhost or private networks are still reachable — consider an allow/deny
  // list if this service is exposed to untrusted clients.
  const invalidUrls = urls.filter((url) => !isHttpUrl(url));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls,
    });
  }

  // Every entry passed validation at this point.
  const validUrls = urls;

  let browser;

  try {
    browser = await chromium.launch({
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    const results = [];

    for (const url of validUrls) {
      // A fresh context per URL keeps cookies and storage isolated.
      const context = await browser.newContext({
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/New_York',
      });

      try {
        // Page creation lives inside the try so a failure here is recorded
        // for this URL only and the context is still closed.
        const page = await context.newPage();
        results.push(await extractContentFromUrl(url, page));
      } catch (error) {
        console.error(`Error processing ${url}:`, error);
        results.push({
          url,
          success: false,
          error: error.message,
          extractedAt: new Date().toISOString(),
        });
      } finally {
        try {
          await context.close();
        } catch (closeError) {
          // A failed cleanup must not discard the results gathered so far.
          console.error(`Error closing context for ${url}:`, closeError);
        }
      }
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.filter((r) => !r.success).length;
    const emptyContentCount = results.filter(
      (r) => r.success && (!r.content.hasContent || r.content.wordCount < 50)
    ).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error:', error);
    if (!res.headersSent) {
      res.status(500).json({
        success: false,
        message: 'Terjadi kesalahan saat memproses URLs',
        error: error.message,
      });
    }
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response is already sent; a rejection here would otherwise be
        // an unhandled promise rejection of this async handler.
        console.error('Error closing browser:', closeError);
      }
    }
  }
});
|
|
|
// GET /health — reports that the API is up, together with the current time.
app.get('/health', (_req, res) => {
  const status = {
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString(),
  };

  res.json(status);
});
|
|
|
// GET / — self-describing index: lists the available endpoints and shows an
// example request for the extraction endpoint.
app.get('/', (_req, res) => {
  const endpoints = {
    'POST /extract-content': 'Extract content from URLs',
    'GET /health': 'Health check',
    'GET /': 'API information',
  };

  const usage = {
    method: 'POST',
    endpoint: '/extract-content',
    body: {
      urls: ['https://example.com', 'https://another-site.com'],
    },
  };

  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints,
    usage,
  });
});
|
|
|
/**
 * Central error-handling middleware (Express recognises it by its four
 * parameters).
 *
 * Logs the error and answers with JSON. The status code carried by the error
 * (`status` / `statusCode`, as set for example by the body parsers on
 * malformed or oversized bodies) is honoured when it is a valid 4xx/5xx code;
 * anything else is reported as 500.
 */
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  // Once the response has started a JSON body can no longer be sent; delegate
  // to Express's default handler, which terminates the connection.
  if (res.headersSent) {
    return next(err);
  }

  const reportedStatus = Number(err?.status ?? err?.statusCode);
  const status =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 600
      ? reportedStatus
      : 500;
  const isClientError = status < 500;

  // Details are shown in development, or for client errors that the error
  // object itself marks as safe to expose.
  const showDetails =
    process.env.NODE_ENV === 'development' || (isClientError && err?.expose === true);

  res.status(status).json({
    success: false,
    message: isClientError ? 'Invalid request' : 'Internal server error',
    error: showDetails ? err.message : 'Something went wrong',
  });
});
|
|
|
// Catch-all for requests that no route above handled: answer 404 as JSON.
app.use((_req, res) => {
  const notFound = {
    success: false,
    message: 'Endpoint not found',
  };

  res.status(404).json(notFound);
});
|
|
|
// Start the HTTP server and print where the API, its index page and the
// health check can be reached. The emoji prefixes are written as proper
// UTF-8 characters (they had been stored as mis-decoded bytes).
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📚 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});