|
import express from 'express'; |
|
import { chromium } from 'playwright'; |
|
import cors from 'cors'; |
|
import bodyParser from 'body-parser' |
|
|
|
// Express application instance shared by every route and middleware in this file.
const app = express();

// Listening port: the PORT environment variable when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

// Pretty-print JSON responses with a two-space indent.
app.set('json spaces', 2);

// CORS runs first so that responses produced when a later middleware fails
// (for example a body parser rejecting a payload) still carry CORS headers.
app.use(cors());

// Parse URL-encoded form bodies.
app.use(bodyParser.urlencoded({ extended: true }));

// Exactly one JSON parser, carrying the intended size limit. Previously a
// default-limit `bodyParser.json()` ran ahead of `express.json({ limit })`, so
// the first parser consumed (or rejected) every JSON body and the larger limit
// never took effect.
// NOTE(review): 500mb is far more than a list of at most 10 URLs needs —
// consider lowering it.
app.use(bodyParser.json({ limit: '500mb' }));
|
|
|
/**
 * Opens `url` in a fresh browser context and scrapes basic page content.
 *
 * The function reports failures through its return value instead of rejecting,
 * so one bad URL cannot fail a whole `Promise.all` batch in the caller.
 *
 * @param {string} url - Address to load.
 * @param {object} browser - Browser handle exposing `newContext()`; the caller
 *   in this file passes a Playwright Chromium browser.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, or `{ url, success: false, error, extractedAt }` on failure.
 *   `content` holds `title`, `metaDescription`, `headings` (`h1`/`h2`),
 *   `paragraphs`, `mainText`, up to 20 `links`, up to 10 `images`, and
 *   `wordCount`.
 */
async function extractContentFromUrl(url, browser) {
  // Declared outside the try so the finally block can tell whether a context
  // was actually created before attempting to close it.
  let context;

  try {
    // Context and page creation live inside the try: a failure here becomes a
    // `success: false` result and never leaks a half-initialised context.
    context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });
    const page = await context.newPage();

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    // Fixed 10 s pause after DOMContentLoaded — presumably to give client-side
    // rendering time to finish before scraping.
    await page.waitForTimeout(10000);

    // The callback is handed to `page.evaluate`, so it must be self-contained:
    // it may only use what it defines itself plus the page's own globals.
    const content = await page.evaluate(() => {
      // Collapse every whitespace run to one space and trim; null/undefined -> ''.
      const cleanText = (text) => {
        return text ? text.replace(/\s+/g, ' ').trim() : '';
      };

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const h1Elements = Array.from(document.querySelectorAll('h1')).map((h1) => cleanText(h1.innerText));
      const h2Elements = Array.from(document.querySelectorAll('h2')).map((h2) => cleanText(h2.innerText));

      // Paragraphs of 20 characters or fewer are dropped.
      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map((p) => cleanText(p.innerText))
        .filter((text) => text.length > 20);

      // First element matching any of the common "main content" selectors.
      const mainContent = document.querySelector('main, article, .content, #content, .post-content');
      const mainText = mainContent ? cleanText(mainContent.innerText) : '';

      // Only links that have both visible text and a resolved href are kept.
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          text: cleanText(a.innerText),
          href: a.href,
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        src: img.src,
        alt: img.alt || '',
        title: img.title || '',
      }));

      // Count words in the main text, falling back to the joined paragraphs.
      // Empty tokens are filtered out because ''.split(/\s+/) yields [''],
      // which would otherwise report one word for a page with no text.
      const wordSource = mainText || paragraphs.join(' ');
      const wordCount = wordSource.split(/\s+/).filter(Boolean).length;

      return {
        title,
        metaDescription,
        headings: {
          h1: h1Elements,
          h2: h2Elements,
        },
        paragraphs,
        mainText,
        links: links.slice(0, 20),
        images: images.slice(0, 10),
        wordCount,
      };
    });

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  } finally {
    if (context) {
      try {
        await context.close();
      } catch (closeError) {
        // A cleanup failure is logged, not thrown, so it cannot replace the
        // result that is already being returned.
        console.error(`Failed to close browser context for ${url}:`, closeError);
      }
    }
  }
}
|
|
|
/**
 * Reports whether `value` is a string holding an absolute http: or https: URL.
 *
 * `new URL()` on its own also accepts schemes such as `file:` or `javascript:`
 * and coerces non-string values, none of which should reach a headless browser.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True only for parseable http/https URL strings.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }

  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Expects a JSON body `{ urls: string[] }` holding 1 to 10 http(s) URLs.
 * Launches one Chromium instance for the request, extracts every URL in
 * batches of three, and responds with per-URL results plus success/failure
 * counts. Validation problems answer 400; unexpected failures answer 500.
 */
app.post('/extract-content', async (req, res) => {
  // `req.body` may be undefined when no body parser handled the request.
  const { urls } = req.body ?? {};

  if (!Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls',
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong',
    });
  }

  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request',
    });
  }

  // The request is rejected as a whole if any entry is not an http(s) URL.
  // NOTE(review): hosts are not restricted, so internal addresses such as
  // localhost can still be requested — consider an allow/deny list.
  const invalidUrls = urls.filter((candidate) => !isHttpUrl(candidate));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls,
    });
  }

  let browser;

  try {
    browser = await chromium.launch({
      args: ['--incognito', '--single-process', '--no-sandbox', '--no-zygote', '--no-cache'],
      // When CHROME_BIN is unset this is undefined and no explicit path is passed.
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    console.log(`Memproses ${urls.length} URLs...`);

    // At most three pages are loaded concurrently; each batch must finish
    // before the next one starts.
    const concurrencyLimit = 3;
    const results = [];

    for (let i = 0; i < urls.length; i += concurrencyLimit) {
      const batch = urls.slice(i, i + concurrencyLimit);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser)),
      );
      results.push(...batchResults);
    }

    const successCount = results.filter((result) => result.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${urls.length} URLs`,
      statistics: {
        total: urls.length,
        success: successCount,
        failed: failCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message,
    });
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; a shutdown failure is only
        // logged so it does not surface as an unhandled rejection.
        console.error('Failed to close browser:', closeError);
      }
    }
  }
});
|
|
|
// GET /health — liveness probe: a fixed status message plus the current
// time as an ISO-8601 string.
app.get('/health', (request, response) => {
  const timestamp = new Date().toISOString();
  const payload = {
    success: true,
    message: 'Content Extractor API is running',
    timestamp,
  };

  response.json(payload);
});
|
|
|
// GET / — self-describing index: lists the available endpoints and shows an
// example request for the extraction route.
app.get('/', (request, response) => {
  const endpoints = {
    'POST /extract-content': 'Extract content from URLs',
    'GET /health': 'Health check',
    'GET /': 'API information',
  };

  const exampleBody = {
    urls: ['https://example.com', 'https://another-site.com'],
  };

  const usage = {
    method: 'POST',
    endpoint: '/extract-content',
    body: exampleBody,
  };

  response.json({
    success: true,
    message: 'Content Extractor API',
    endpoints,
    usage,
  });
});
|
|
|
// Error-handling middleware (Express recognises it by the four-parameter
// signature). Logs the error and answers 500 with a JSON body; the real error
// message is exposed only when NODE_ENV is 'development'.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  // Once a response has started it cannot be replaced: writing status/body
  // again would throw. Hand the error to Express's default handler, which
  // closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong',
  });
});
|
|
|
// Fallback middleware: any request that no handler above answered gets a
// JSON 404.
app.use((request, response) => {
  const body = {
    success: false,
    message: 'Endpoint not found',
  };

  response.status(404);
  response.json(body);
});
|
|
|
// Start the HTTP server and log where the API, its index page, and the health
// check can be reached. The emoji prefixes replace mis-encoded bytes that were
// printing as stray 'π' characters.
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});