const express = require('express');
const { chromium } = require('playwright');

const app = express();
const PORT = process.env.PORT || 3000;

// Per-URL navigation/extraction timeout used when the client supplies none
// (or supplies something unusable).
const DEFAULT_TIMEOUT_MS = 30000;
// Upper bound on the client-supplied timeout so a single request cannot pin a
// browser process for an arbitrary amount of time.
const MAX_TIMEOUT_MS = 120000;
// Maximum number of URLs accepted by one POST /extract-content request.
const MAX_URLS_PER_REQUEST = 10;
// Caps applied to the extracted payload to keep responses bounded.
const CONTENT_LIMITS = Object.freeze({
  textLength: 10000,
  links: 50,
  images: 20,
  headings: 30,
});

// Middleware: JSON and URL-encoded body parsing.
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));

// CORS middleware (optional): allows any origin and answers preflight
// requests directly.
app.use((req, res, next) => {
  res.header('Access-Control-Allow-Origin', '*');
  res.header('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS');
  res.header(
    'Access-Control-Allow-Headers',
    'Origin, X-Requested-With, Content-Type, Accept, Authorization'
  );

  if (req.method === 'OPTIONS') {
    res.sendStatus(200);
  } else {
    next();
  }
});

/**
 * Collects title, meta description, text, links, images and headings from the
 * current document.
 *
 * This function is serialized and executed inside the browser page via
 * `page.evaluate`, so it must be self-contained: it can only use its argument
 * and browser globals, never variables from this Node module.
 *
 * @param {{textLength: number, links: number, images: number, headings: number}} limits
 *   Maximum sizes for each part of the returned payload.
 * @returns {{title: string, metaDescription: string, textContent: string,
 *   links: Array<{text: string, href: string}>,
 *   images: Array<{alt: string, src: string}>,
 *   headings: Array<{level: string, text: string}>,
 *   wordCount: number}}
 */
function collectPageContent(limits) {
  // Remove script, style and noscript elements before reading the text.
  document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

  const title = document.title || '';
  const metaDescription =
    document.querySelector('meta[name="description"]')?.content || '';
  const textContent = document.body?.innerText || '';

  // Keep only links that have both visible text and a target.
  const links = Array.from(document.querySelectorAll('a[href]'))
    .map((a) => ({ text: a.innerText.trim(), href: a.href }))
    .filter((link) => link.text && link.href);

  const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
    alt: img.alt || '',
    src: img.src,
  }));

  const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))
    .map((h) => ({ level: h.tagName.toLowerCase(), text: h.innerText.trim() }))
    .filter((h) => h.text);

  // Trim first: splitting '' (or text with leading/trailing whitespace) on
  // /\s+/ yields empty tokens that would otherwise be counted as words.
  const trimmedText = textContent.trim();
  const wordCount = trimmedText === '' ? 0 : trimmedText.split(/\s+/).length;

  return {
    title,
    metaDescription,
    textContent: textContent.substring(0, limits.textLength),
    links: links.slice(0, limits.links),
    images: images.slice(0, limits.images),
    headings: headings.slice(0, limits.headings),
    // Counted on the full text, not on the truncated excerpt.
    wordCount,
  };
}

/**
 * Loads a URL in a fresh headless Chromium instance and extracts its content.
 *
 * Never rejects for navigation/extraction failures: they are reported through
 * the `success: false` result shape instead.
 *
 * @param {string} url - Absolute http(s) URL to load.
 * @param {number} [timeout=30000] - Timeout in milliseconds applied to
 *   navigation and to other page operations.
 * @returns {Promise<{url: string, success: boolean, data?: object,
 *   error?: string, extractedAt: string}>}
 */
async function extractContentFromUrl(url, timeout = DEFAULT_TIMEOUT_MS) {
  let browser;

  try {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const context = await browser.newContext({
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });
    const page = await context.newPage();

    page.setDefaultTimeout(timeout);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout });

    // Wait for network activity to settle so late-loaded content is included.
    await page.waitForLoadState('networkidle');

    const content = await page.evaluate(collectPageContent, { ...CONTENT_LIMITS });

    return {
      url,
      success: true,
      data: content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // Always release the browser, on success and on failure. A failing close
    // must not replace the result computed above or reject the caller.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error('Failed to close browser:', closeError);
      }
    }
  }
}

/**
 * Checks that a value is a string containing an absolute http or https URL.
 *
 * NOTE(review): this does not reject loopback/private-network hosts, so a
 * publicly reachable deployment can be used to probe internal services
 * (SSRF). Consider a host allowlist or a resolved-IP check before exposing it.
 *
 * @param {unknown} string - Candidate URL.
 * @returns {boolean} True only for string inputs with an http: or https: scheme.
 */
function isValidUrl(string) {
  // `new URL` stringifies its argument, so e.g. ['http://a.com'] would pass
  // without this guard.
  if (typeof string !== 'string') {
    return false;
  }

  try {
    const url = new URL(string);
    return url.protocol === 'http:' || url.protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * Turns the client-supplied timeout into a usable number of milliseconds.
 *
 * @param {unknown} timeout - Raw `timeout` value from the request body.
 * @returns {number} The value capped at MAX_TIMEOUT_MS, or DEFAULT_TIMEOUT_MS
 *   when the input is missing, non-numeric, non-finite, zero or negative.
 */
function normalizeTimeout(timeout) {
  const parsed = typeof timeout === 'string' ? Number(timeout) : timeout;

  if (typeof parsed !== 'number' || !Number.isFinite(parsed) || parsed <= 0) {
    return DEFAULT_TIMEOUT_MS;
  }

  return Math.min(parsed, MAX_TIMEOUT_MS);
}

// Route: extract content from an array of URLs.
app.post('/extract-content', async (req, res) => {
  try {
    // req.body can be undefined when no body was parsed; treat it as empty so
    // the caller gets the 400 below instead of a 500.
    const { urls, timeout } = req.body ?? {};

    // Input validation.
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        success: false,
        message: 'URLs harus berupa array',
      });
    }

    if (urls.length === 0) {
      return res.status(400).json({
        success: false,
        message: 'Array URLs tidak boleh kosong',
      });
    }

    if (urls.length > MAX_URLS_PER_REQUEST) {
      return res.status(400).json({
        success: false,
        message: `Maksimal ${MAX_URLS_PER_REQUEST} URL per request`,
      });
    }

    // Validate every URL before doing any work.
    const invalidUrls = urls.filter((url) => !isValidUrl(url));
    if (invalidUrls.length > 0) {
      return res.status(400).json({
        success: false,
        message: 'URL tidak valid ditemukan',
        invalidUrls,
      });
    }

    const requestTimeout = normalizeTimeout(timeout);

    // URLs are processed one at a time so that a single request never runs
    // more than one browser process concurrently.
    const results = [];
    for (const url of urls) {
      console.log(`Extracting content from: ${url}`);
      const result = await extractContentFromUrl(url, requestTimeout);
      results.push(result);
    }

    // Statistics.
    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${results.length} URL`,
      statistics: {
        total: results.length,
        success: successCount,
        failed: failCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error in /extract-content:', error);
    res.status(500).json({
      success: false,
      message: 'Internal server error',
      error: error.message,
    });
  }
});

// Route: health check.
app.get('/health', (req, res) => {
  res.json({
    success: true,
    message: 'API is running',
    timestamp: new Date().toISOString(),
  });
});

// Route: API information.
app.get('/', (req, res) => {
  res.json({
    name: 'URL Content Extractor API',
    version: '1.0.0',
    description: 'API untuk mengekstrak konten dari URL menggunakan Playwright',
    endpoints: {
      'POST /extract-content': {
        description: 'Ekstrak konten dari array URL',
        body: {
          urls: ['http://example.com', 'https://example2.com'],
          timeout: DEFAULT_TIMEOUT_MS,
        },
      },
      'GET /health': 'Health check endpoint',
      'GET /': 'API information',
    },
  });
});

// 404 handler: registered without a path so it matches every request that no
// route above handled (a bare '*' path is rejected by Express 5 path matching).
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint tidak ditemukan',
  });
});

// Error handler: registered last so it also covers errors raised by the
// handlers above. The unused `next` parameter is required; Express identifies
// error-handling middleware by its four-argument signature.
app.use((err, req, res, next) => {
  // Once the response has started, only Express' default handler can safely
  // finish/abort it.
  if (res.headersSent) {
    return next(err);
  }

  // Body-parser failures (malformed JSON, oversized payload, ...) carry a 4xx
  // status; report them as client errors rather than as server faults.
  const status = err.status ?? err.statusCode;
  if (Number.isInteger(status) && status >= 400 && status < 500) {
    return res.status(status).json({
      success: false,
      message: 'Permintaan tidak valid',
    });
  }

  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    message: 'Internal server error',
  });
});

// Start server.
app.listen(PORT, () => {
  console.log(`🚀 Server berjalan di port ${PORT}`);
  console.log(`📝 API Documentation: http://localhost:${PORT}`);
  console.log(`❤️ Health Check: http://localhost:${PORT}/health`);
});