import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
import bodyParser from 'body-parser';

const app = express();
const PORT = process.env.PORT || 7860;

// Resource types that are aborted during navigation; they are not needed for
// text extraction.
const BLOCKED_RESOURCE_TYPES = new Set(['font', 'media']);

// Lazy-load scroll settings. The step cap bounds the scroll phase to roughly
// SCROLL_MAX_STEPS * SCROLL_INTERVAL_MS milliseconds so that pages whose
// height keeps growing (infinite scroll) cannot keep the evaluation pending.
const SCROLL_DISTANCE_PX = 100;
const SCROLL_INTERVAL_MS = 100;
const SCROLL_MAX_STEPS = 300;

// Maximum number of URLs processed concurrently within one request.
const CONCURRENCY_LIMIT = 3;

app.set('json spaces', 2);
// CORS is registered first so its headers are set before any body-parsing
// middleware can fail the request.
app.use(cors());
app.use(bodyParser.urlencoded({ extended: true }));
// Single JSON parser. A second `express.json({ limit: '500mb' })` used to be
// registered after this one, but it never parsed anything because the body had
// already been consumed here, so the effective size limit is unchanged.
app.use(bodyParser.json());

/**
 * Checks whether a value is a string holding an absolute http(s) URL.
 *
 * NOTE(review): this only restricts the scheme. Requests to internal hosts
 * (localhost, link-local metadata addresses, ...) are still possible and
 * should be filtered separately if this service is exposed to untrusted
 * callers.
 *
 * @param {unknown} value - Candidate URL taken from the request body.
 * @returns {boolean} True when the value parses as an http: or https: URL.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // `new URL` throws on unparsable input; that is the expected "invalid" case.
    return false;
  }
}

/**
 * Runs INSIDE the page: scrolls down step by step to trigger lazy loading,
 * then scrolls back to the top. Must stay self-contained because it is
 * serialized and evaluated in the browser.
 *
 * @param {{distance: number, intervalMs: number, maxSteps: number}} options
 * @returns {Promise<void>} Resolves when the bottom is reached or the step
 *   cap is hit.
 */
function scrollThroughPage({ distance, intervalMs, maxSteps }) {
  return new Promise((resolve) => {
    let totalHeight = 0;
    let steps = 0;
    const timer = setInterval(() => {
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;
      steps += 1;
      if (totalHeight >= scrollHeight || steps >= maxSteps) {
        clearInterval(timer);
        window.scrollTo(0, 0);
        resolve();
      }
    }, intervalMs);
  });
}

/**
 * Runs INSIDE the page: collects title, meta description, headings,
 * paragraphs, main text, links, images and a word count from the DOM. Must
 * stay self-contained because it is serialized and evaluated in the browser.
 *
 * @returns {object} Plain serializable summary of the page content.
 */
function collectPageContent() {
  const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');

  // Text of an element with script/style/noscript content stripped. Works on
  // a clone so the live DOM is not modified.
  const getTextContent = (element) => {
    if (!element) {
      return '';
    }
    const clone = element.cloneNode(true);
    clone.querySelectorAll('script, style, noscript').forEach((el) => el.remove());
    return cleanText(clone.textContent || clone.innerText || '');
  };

  const textsOf = (selector) =>
    Array.from(document.querySelectorAll(selector)).map((el) => getTextContent(el));

  const title = document.title || '';
  const metaDescription =
    document.querySelector('meta[name="description"]')?.content ||
    document.querySelector('meta[property="og:description"]')?.content ||
    '';

  const h1Elements = textsOf('h1').filter((text) => text.length > 0);
  const h2Elements = textsOf('h2').filter((text) => text.length > 0);

  // Very short paragraphs are dropped.
  const paragraphs = textsOf('p').filter((text) => text.length > 20);

  // Candidate containers for the main content, tried in order.
  const contentSelectors = [
    'main',
    'article',
    '[role="main"]',
    '.content',
    '#content',
    '.post-content',
    '.entry-content',
    '.article-content',
    '.page-content',
    '.main-content',
    '[itemprop="articleBody"]',
    '.story-body',
    '.article-body',
  ];

  let mainContent = null;
  for (const selector of contentSelectors) {
    mainContent = document.querySelector(selector);
    if (mainContent) {
      break;
    }
  }
  // Fallback: no known container matched, use the whole body.
  if (!mainContent) {
    mainContent = document.body;
  }

  const mainText = getTextContent(mainContent);

  // If the chosen container is empty or nearly empty, fall back to the
  // longest sufficiently large <div>.
  let fallbackText = '';
  if (!mainText || mainText.length < 100) {
    const longDivTexts = textsOf('div')
      .filter((text) => text.length > 200)
      .sort((a, b) => b.length - a.length);
    fallbackText = longDivTexts[0] || '';
  }
  // Prefer the fallback whenever it is longer: previously a short but
  // non-empty mainText always won, so the fallback was computed and discarded.
  const finalMainText = fallbackText.length > mainText.length ? fallbackText : mainText;

  // First 20 links with visible text and a non-javascript: target.
  const links = Array.from(document.querySelectorAll('a[href]'))
    .map((a) => ({ text: getTextContent(a), href: a.href }))
    .filter((link) => link.text && link.href && !link.href.startsWith('javascript:'))
    .slice(0, 20);

  // First 10 images whose source is not an inline data:image URI.
  const images = Array.from(document.querySelectorAll('img[src]'))
    .filter((img) => img.src && !img.src.includes('data:image'))
    .map((img) => ({ src: img.src, alt: img.alt || '', title: img.title || '' }))
    .slice(0, 10);

  const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
  const wordCount = allText.split(/\s+/).filter((word) => word.length > 0).length;

  return {
    title,
    metaDescription,
    headings: { h1: h1Elements, h2: h2Elements },
    paragraphs,
    mainText: finalMainText,
    links,
    images,
    wordCount,
    // Flag indicating whether the page yielded a meaningful amount of text.
    hasContent: wordCount > 50,
  };
}

/**
 * Loads a URL in a fresh browser context and extracts its textual content.
 *
 * Extraction failures are caught and logged and are reported through the
 * returned object rather than thrown.
 *
 * @param {string} url - Page to load.
 * @param {import('playwright').Browser} browser - Already launched browser.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, `{ url, success: false, error, extractedAt }` on failure.
 */
async function extractContentFromUrl(url, browser) {
  let context;
  try {
    // Context and page creation live inside the try block so that a failure
    // here is reported per URL and the context is still closed.
    context = await browser.newContext({
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
    });
    const page = await context.newPage();

    // Intercept requests and block resource types that are not needed.
    await page.route('**/*', async (route) => {
      const isBlocked = BLOCKED_RESOURCE_TYPES.has(route.request().resourceType());
      try {
        await (isBlocked ? route.abort() : route.continue());
      } catch {
        // Deliberately ignored: a route that can no longer be resolved (for
        // example because the page is closing) is not actionable, and must
        // not surface as a floating rejection.
      }
    });

    await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

    // Wait for content to be visible.
    await page.waitForSelector('body', { state: 'visible', timeout: 30000 });

    // Scroll to trigger lazy loading.
    await page.evaluate(scrollThroughPage, {
      distance: SCROLL_DISTANCE_PX,
      intervalMs: SCROLL_INTERVAL_MS,
      maxSteps: SCROLL_MAX_STEPS,
    });

    // Extra settle time after scrolling.
    await page.waitForTimeout(2000);

    const content = await page.evaluate(collectPageContent);

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    console.error(`Error extracting ${url}:`, error);
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  } finally {
    if (context) {
      try {
        await context.close();
      } catch (closeError) {
        // A failed close must not turn an already computed result into a
        // rejection of the whole batch.
        console.error(`Error closing context for ${url}:`, closeError);
      }
    }
  }
}

// POST /extract-content — body: { urls: string[] } with 1 to 10 http(s) URLs.
app.post('/extract-content', async (req, res) => {
  const { urls } = req.body ?? {};

  if (!Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls',
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong',
    });
  }

  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request',
    });
  }

  const validUrls = urls.filter((url) => isHttpUrl(url));
  const invalidUrls = urls.filter((url) => !isHttpUrl(url));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls,
    });
  }

  let browser;
  try {
    browser = await chromium.launch({
      args: [
        '--incognito',
        '--single-process',
        '--no-sandbox',
        '--no-zygote',
        '--no-cache',
        '--disable-dev-shm-usage',
        '--disable-setuid-sandbox',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    // Process the URLs in batches so at most CONCURRENCY_LIMIT pages are
    // open at the same time.
    const results = [];
    for (let i = 0; i < validUrls.length; i += CONCURRENCY_LIMIT) {
      const batch = validUrls.slice(i, i + CONCURRENCY_LIMIT);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser)),
      );
      results.push(...batchResults);
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.filter((r) => !r.success).length;
    const emptyContentCount = results.filter(
      (r) => r.success && (!r.content.hasContent || r.content.wordCount < 50),
    ).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message,
    });
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; only log the cleanup failure.
        console.error('Error closing browser:', closeError);
      }
    }
  }
});

// GET /health — liveness probe.
app.get('/health', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString(),
  });
});

// GET / — short API description.
app.get('/', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints: {
      'POST /extract-content': 'Extract content from URLs',
      'GET /health': 'Health check',
      'GET /': 'API information',
    },
    usage: {
      method: 'POST',
      endpoint: '/extract-content',
      body: {
        urls: ['https://example.com', 'https://another-site.com'],
      },
    },
  });
});

// 404 handler for requests that matched no route.
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint not found',
  });
});

// Error-handling middleware, registered last. The unused `next` parameter is
// required: Express identifies error handlers by their four-argument
// signature.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong',
  });
});

app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});