import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';

const app = express();
const PORT = process.env.PORT || 3000;

// Desktop Chrome user agent sent with every page load to reduce bot blocking.
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

// Maximum number of URLs accepted by a single /extract-content request.
const MAX_URLS_PER_REQUEST = 10;

// Number of pages opened at the same time within one request.
const CONCURRENCY_LIMIT = 3;

// Middleware
app.use(express.json({ limit: '50mb' }));
app.use(cors());

/**
 * Checks that a value is a string holding an absolute http(s) URL.
 *
 * Other schemes (file:, data:, chrome:, ...) parse fine with `new URL()` but
 * must not be handed to the server-side browser, so they are rejected here.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True when the value is an http: or https: URL string.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // new URL() throws on anything it cannot parse as an absolute URL.
    return false;
  }
}

/**
 * Loads a URL in a new page of the given browser and extracts its content.
 *
 * Failures are reported in the returned object rather than thrown, so one bad
 * URL does not reject the batch it is processed in.
 *
 * @param {string} url - Page to load.
 * @param {import('playwright').Browser} browser - Already launched browser.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, or `{ url, success: false, error, extractedAt }` on failure.
 */
async function extractContentFromUrl(url, browser) {
  let page;
  try {
    // Playwright has no page.setUserAgent(); the user agent is a context
    // option, which browser.newPage() accepts directly.
    page = await browser.newPage({ userAgent: USER_AGENT });

    // Navigate to the URL with a timeout
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000
    });

    // Give late-loading content a moment to render
    await page.waitForTimeout(2000);

    // Everything inside this callback runs in the page, not in Node.
    const content = await page.evaluate(() => {
      // Collapse runs of whitespace and trim; empty string for missing text
      const cleanText = (text) => {
        return text ? text.replace(/\s+/g, ' ').trim() : '';
      };

      const title = document.title || '';

      const metaDescription =
        document.querySelector('meta[name="description"]')?.content || '';

      // Main headings
      const h1Elements = Array.from(document.querySelectorAll('h1'))
        .map((h1) => cleanText(h1.innerText));
      const h2Elements = Array.from(document.querySelectorAll('h2'))
        .map((h2) => cleanText(h2.innerText));

      // All paragraphs, dropping those that are too short
      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map((p) => cleanText(p.innerText))
        .filter((text) => text.length > 20);

      // Text of the main content area, if one is present
      const mainContent = document.querySelector(
        'main, article, .content, #content, .post-content'
      );
      const mainText = mainContent ? cleanText(mainContent.innerText) : '';

      // Links that have both visible text and a target
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          text: cleanText(a.innerText),
          href: a.href
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]'))
        .map((img) => ({
          src: img.src,
          alt: img.alt || '',
          title: img.title || ''
        }));

      // ''.split(/\s+/) yields [''], so empty tokens are dropped to make an
      // empty page count as 0 words instead of 1.
      const wordSource = mainText || paragraphs.join(' ');
      const wordCount = wordSource.split(/\s+/).filter(Boolean).length;

      return {
        title,
        metaDescription,
        headings: {
          h1: h1Elements,
          h2: h2Elements
        },
        paragraphs,
        mainText,
        links: links.slice(0, 20), // Cap the number of links
        images: images.slice(0, 10), // Cap the number of images
        wordCount
      };
    });

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        // Closing can fail when the browser has already gone away; the result
        // for this URL is already decided, so only log it.
        console.warn(`Failed to close page for ${url}:`, closeError.message);
      }
    }
  }
}

// Endpoint that extracts content from multiple URLs
app.post('/extract-content', async (req, res) => {
  // req.body may be undefined when no JSON body was parsed.
  const { urls } = req.body ?? {};

  // Input validation
  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls'
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong'
    });
  }

  if (urls.length > MAX_URLS_PER_REQUEST) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request'
    });
  }

  // URL format validation: only absolute http(s) URLs are accepted
  const invalidUrls = urls.filter((url) => !isHttpUrl(url));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls
    });
  }

  const validUrls = urls;
  let browser;

  try {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    console.log(`Memproses ${validUrls.length} URLs...`);

    // Process the URLs in parallel, one fixed-size batch at a time
    const results = [];

    for (let i = 0; i < validUrls.length; i += CONCURRENCY_LIMIT) {
      const batch = validUrls.slice(i, i + CONCURRENCY_LIMIT);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser))
      );
      results.push(...batchResults);
    }

    // Result statistics
    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount
      },
      results
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message
    });
  } finally {
    if (browser) {
      await browser.close();
    }
  }
});

// Health check endpoint
app.get('/health', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString()
  });
});

// Root endpoint
app.get('/', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints: {
      'POST /extract-content': 'Extract content from URLs',
      'GET /health': 'Health check',
      'GET /': 'API information'
    },
    usage: {
      method: 'POST',
      endpoint: '/extract-content',
      body: {
        urls: ['https://example.com', 'https://another-site.com']
      }
    }
  });
});

// 404 handler. Registered without a path so it matches every request that no
// route above handled; a bare '*' path is not a valid pattern in Express 5.
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint not found'
  });
});

// Error handling middleware. Kept last so it sees errors from everything
// registered above it; the four-argument signature is what marks it as an
// error handler.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong'
  });
});

// Start server
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});