import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
import bodyParser from 'body-parser';

const app = express();
const PORT = process.env.PORT || 7860;

// Request and scraping limits.
const MAX_URLS_PER_REQUEST = 10;
const CONCURRENCY_LIMIT = 3;
const NAVIGATION_TIMEOUT_MS = 60000;
const SETTLE_DELAY_MS = 15000;

// Only web URLs may be handed to the browser; other schemes (file:, chrome:,
// data:, ...) would let a caller read local resources through the scraper.
const ALLOWED_PROTOCOLS = new Set(['http:', 'https:']);

const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

app.set('json spaces', 2);
app.use(bodyParser.urlencoded({ extended: true }));
app.use(bodyParser.json());
// NOTE(review): a JSON parser is already registered on the line above, so this
// second parser (and its 500mb limit) presumably never handles a request.
// Kept to avoid changing the effective body-size limit; confirm and remove one.
app.use(express.json({ limit: '500mb' }));
app.use(cors());

/**
 * Checks whether a value is a string holding an absolute http(s) URL.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True only for strings that parse as a URL with an
 *   `http:` or `https:` protocol.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    return ALLOWED_PROTOCOLS.has(new URL(value).protocol);
  } catch {
    return false;
  }
}

/**
 * Loads a page in a fresh browser context and extracts its textual content.
 *
 * Never rejects: any failure (context creation, navigation, evaluation) is
 * reported through a `{ success: false, error }` result so that one bad URL
 * cannot fail the batch it is processed with.
 *
 * @param {string} url - Page to load.
 * @param {import('playwright').Browser} browser - Already-launched browser.
 * @returns {Promise<object>} `{ url, success, content | error, extractedAt }`.
 */
async function extractContentFromUrl(url, browser) {
  let context;
  try {
    // Created inside the try so that a setup failure is reported per URL and
    // the context is still closed in `finally`.
    context = await browser.newContext({ userAgent: USER_AGENT });
    const page = await context.newPage();

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAVIGATION_TIMEOUT_MS });
    // Fixed pause to let client-side rendering finish. Playwright discourages
    // waitForTimeout outside of debugging; kept to preserve existing behaviour.
    await page.waitForTimeout(SETTLE_DELAY_MS);

    // Runs inside the page: it cannot reference anything from the Node scope.
    const content = await page.evaluate(() => {
      const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');
      const textOfAll = (selector) =>
        Array.from(document.querySelectorAll(selector)).map((el) => cleanText(el.innerText));

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const h1Elements = textOfAll('h1');
      const h2Elements = textOfAll('h2');
      // Filter out paragraphs that are too short.
      const paragraphs = textOfAll('p').filter((text) => text.length > 20);

      const mainContent = document.querySelector('main, article, .content, #content, .post-content');
      const mainText = mainContent ? cleanText(mainContent.innerText) : '';

      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({ text: cleanText(a.innerText), href: a.href }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        src: img.src,
        alt: img.alt || '',
        title: img.title || '',
      }));

      // Drop empty tokens so that a page without text counts 0 words, not 1.
      const wordCount = (mainText || paragraphs.join(' ')).split(/\s+/).filter(Boolean).length;

      return {
        title,
        metaDescription,
        headings: { h1: h1Elements, h2: h2Elements },
        paragraphs,
        mainText,
        links: links.slice(0, 20),
        images: images.slice(0, 10),
        wordCount,
      };
    });

    return { url, success: true, content, extractedAt: new Date().toISOString() };
  } catch (error) {
    return { url, success: false, error: error.message, extractedAt: new Date().toISOString() };
  } finally {
    if (context) {
      try {
        await context.close();
      } catch (closeError) {
        // A cleanup failure must not replace the result computed above.
        console.error('Failed to close browser context:', closeError);
      }
    }
  }
}

/**
 * POST /extract-content
 * Body: `{ urls: string[] }` with 1 to MAX_URLS_PER_REQUEST http(s) URLs.
 * Responds with per-URL extraction results plus success/failure statistics.
 */
app.post('/extract-content', async (req, res) => {
  const { urls } = req.body;

  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({ success: false, message: 'Body harus berisi array urls' });
  }
  if (urls.length === 0) {
    return res.status(400).json({ success: false, message: 'Array urls tidak boleh kosong' });
  }
  if (urls.length > MAX_URLS_PER_REQUEST) {
    return res
      .status(400)
      .json({ success: false, message: `Maksimal ${MAX_URLS_PER_REQUEST} URLs per request` });
  }

  const invalidUrls = urls.filter((url) => !isHttpUrl(url));
  if (invalidUrls.length > 0) {
    return res.status(400).json({ success: false, message: 'Format URL tidak valid', invalidUrls });
  }

  let browser;
  try {
    browser = await chromium.launch({
      args: ['--incognito', '--single-process', '--no-sandbox', '--no-zygote', '--no-cache'],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    // Process the URLs in fixed-size batches to bound the number of open pages.
    const results = [];
    for (let i = 0; i < urls.length; i += CONCURRENCY_LIMIT) {
      const batch = urls.slice(i, i + CONCURRENCY_LIMIT);
      const batchResults = await Promise.all(batch.map((url) => extractContentFromUrl(url, browser)));
      results.push(...batchResults);
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${urls.length} URLs`,
      statistics: { total: urls.length, success: successCount, failed: failCount },
      results,
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message,
    });
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; a rejection here would otherwise
        // escape the async handler as an unhandled rejection.
        console.error('Failed to close browser:', closeError);
      }
    }
  }
});

/** GET /health - liveness probe. */
app.get('/health', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString(),
  });
});

/** GET / - short description of the API and how to call it. */
app.get('/', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints: {
      'POST /extract-content': 'Extract content from URLs',
      'GET /health': 'Health check',
      'GET /': 'API information',
    },
    usage: {
      method: 'POST',
      endpoint: '/extract-content',
      body: { urls: ['https://example.com', 'https://another-site.com'] },
    },
  });
});

// Fallback for any request that matched no route above.
app.use((req, res) => {
  res.status(404).json({ success: false, message: 'Endpoint not found' });
});

// Error handler, registered last so it also covers the middleware above.
// The four-argument signature is what marks it as an error handler for Express.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  if (res.headersSent) {
    // A response is already in flight; delegate to Express' default handler.
    return next(err);
  }
  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong',
  });
});

app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});