import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
import bodyParser from 'body-parser';

const app = express();
const PORT = process.env.PORT || 7860;

/** Maximum number of URLs accepted by one POST /extract-content request. */
const MAX_URLS_PER_REQUEST = 10;

/** URL schemes that may be handed to the browser; everything else is rejected. */
const ALLOWED_PROTOCOLS = new Set(['http:', 'https:']);

/** Resource types that are never needed for text extraction. */
const BLOCKED_RESOURCE_TYPES = ['font', 'media', 'websocket'];

/** Requests whose URL contains one of these fragments are aborted. */
const BLOCKED_URL_FRAGMENTS = ['google-analytics', 'doubleclick', 'facebook', 'twitter', 'analytics'];

/** Containers that are waited for (one after another) before scraping starts. */
const CONTENT_WAIT_SELECTORS = ['article', 'main', '.content', '#content'];

app.set('json spaces', 2);
app.use(bodyParser.urlencoded({ extended: true }));
// NOTE(review): two JSON parsers are registered. The first one presumably
// consumes every JSON body, which would make the 500mb limit on the second
// one ineffective -- confirm which limit is intended and drop the other.
app.use(bodyParser.json());
app.use(express.json({ limit: '500mb' }));
app.use(cors());

/**
 * Tells whether a value is a string holding an absolute http(s) URL.
 *
 * Non-web schemes (file:, chrome:, javascript:, ...) are rejected so that a
 * caller cannot make the browser read local or internal resources.
 *
 * @param {unknown} candidate - Value taken from the request body.
 * @returns {boolean} True only for parseable http: or https: URL strings.
 */
function isHttpUrl(candidate) {
  if (typeof candidate !== 'string') {
    return false;
  }
  try {
    return ALLOWED_PROTOCOLS.has(new URL(candidate).protocol);
  } catch {
    // new URL() throws on anything it cannot parse.
    return false;
  }
}

/**
 * Tells whether a request is the navigation of the page's main frame.
 *
 * @param {import('playwright').Page} page - Page that owns the route.
 * @param {import('playwright').Request} request - Intercepted request.
 * @returns {boolean} True for main-frame navigations, false otherwise.
 */
function isMainFrameNavigation(page, request) {
  if (!request.isNavigationRequest()) {
    return false;
  }
  try {
    return request.frame() === page.mainFrame();
  } catch {
    // frame() can throw when the frame is not available; such a request is
    // not the main-frame navigation.
    return false;
  }
}

/**
 * Installs a route that aborts fonts, media, websockets and tracker requests.
 *
 * The main-frame navigation itself is always let through; otherwise a target
 * URL that happens to contain one of the blocked fragments could never load.
 *
 * @param {import('playwright').Page} page - Page to install the route on.
 * @returns {Promise<void>}
 */
async function blockUnwantedRequests(page) {
  await page.route('**/*', async (route) => {
    const request = route.request();
    try {
      if (isMainFrameNavigation(page, request)) {
        await route.continue();
        return;
      }

      const requestUrl = request.url();
      const isBlocked =
        BLOCKED_RESOURCE_TYPES.includes(request.resourceType()) ||
        BLOCKED_URL_FRAGMENTS.some((fragment) => requestUrl.includes(fragment));

      if (isBlocked) {
        await route.abort();
      } else {
        await route.continue();
      }
    } catch (routeError) {
      // Awaiting and catching keeps a failed abort/continue (e.g. the page was
      // closed while the request was in flight) from becoming an unhandled
      // promise rejection.
      console.debug(`Route handling failed for ${request.url()}: ${routeError.message}`);
    }
  });
}

/**
 * Navigates to the URL, retrying once with the `load` event if the first
 * attempt (waiting for `domcontentloaded`) fails.
 *
 * @param {import('playwright').Page} page - Page to navigate.
 * @param {string} url - Target URL.
 * @returns {Promise<void>} Rejects if the second attempt fails as well.
 */
async function navigateWithFallback(page, url) {
  try {
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
  } catch (navigationError) {
    console.log(`Navigation error for ${url}, trying with load event`);
    await page.goto(url, { waitUntil: 'load', timeout: 30000 });
  }
}

/**
 * Best-effort wait for the page to show its content: body visible, a fixed
 * 3 s pause, then the first available content container.
 *
 * @param {import('playwright').Page} page - Page that has been navigated.
 * @returns {Promise<void>} Never rejects because of a selector timeout.
 */
async function waitForContent(page) {
  try {
    await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
  } catch {
    console.log('Body selector timeout, continuing anyway');
  }

  await page.waitForTimeout(3000);

  for (const selector of CONTENT_WAIT_SELECTORS) {
    try {
      await page.waitForSelector(selector, { timeout: 5000 });
      break;
    } catch {
      // Deliberate best effort: this container did not show up in time, so
      // try the next one.
    }
  }
}

/**
 * Scrolls down in 100px steps every 100ms (at most 30 steps), then jumps back
 * to the top. Presumably meant to trigger lazily loaded content.
 *
 * @param {import('playwright').Page} page - Page to scroll.
 * @returns {Promise<void>}
 */
async function scrollThroughPage(page) {
  await page.evaluate(() => {
    return new Promise((resolve) => {
      const distance = 100;
      const maxScrolls = 30;
      let totalHeight = 0;
      let scrollCount = 0;

      const timer = setInterval(() => {
        // Without the null guard a document that has no body would throw on
        // every tick and the promise would never settle.
        const scrollHeight = document.body?.scrollHeight ?? 0;
        window.scrollBy(0, distance);
        totalHeight += distance;
        scrollCount++;

        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
          clearInterval(timer);
          window.scrollTo(0, 0);
          resolve();
        }
      }, 100);
    });
  });
}

/**
 * Runs inside the page and collects title, meta description, headings,
 * paragraphs and the main text.
 *
 * @param {import('playwright').Page} page - Loaded page.
 * @returns {Promise<object>} Object with `title`, `metaDescription`,
 *   `headings` ({ h1, h2 }), `paragraphs`, `mainText`, `wordCount` and
 *   `hasContent` (true when `wordCount` is greater than 50).
 */
async function scrapePage(page) {
  // The callback is evaluated in the browser, so it must not reference
  // anything from the Node.js scope.
  return page.evaluate(() => {
    const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');

    // Text of an element without script/style/noscript/iframe content.
    const getTextContent = (element) => {
      if (!element) {
        return '';
      }
      const clone = element.cloneNode(true);
      clone.querySelectorAll('script, style, noscript, iframe').forEach((el) => el.remove());
      return cleanText(clone.textContent || clone.innerText || '');
    };

    const textsOf = (selector) =>
      Array.from(document.querySelectorAll(selector)).map((el) => getTextContent(el));

    const title = document.title || '';
    const metaDescription =
      document.querySelector('meta[name="description"]')?.content ||
      document.querySelector('meta[property="og:description"]')?.content ||
      '';

    const h1Elements = textsOf('h1').filter((text) => text.length > 0);
    const h2Elements = textsOf('h2').filter((text) => text.length > 0);
    const paragraphs = textsOf('p').filter((text) => text.length > 20);

    const contentSelectors = [
      'main',
      'article',
      '[role="main"]',
      '.content',
      '#content',
      '.post-content',
      '.entry-content',
      '.article-content',
      '.page-content',
      '.main-content',
      '[itemprop="articleBody"]',
      '.story-body',
      '.article-body',
      '.detail__body-text',
      '.detail__body',
      '.itp_bodycontent'
    ];

    // Keep a candidate only when it passes the length threshold, so that a
    // short match of the last selector cannot pre-empt the body fallback.
    let mainContent = null;
    for (const selector of contentSelectors) {
      const candidate = document.querySelector(selector);
      if (candidate && getTextContent(candidate).length > 100) {
        mainContent = candidate;
        break;
      }
    }
    if (!mainContent) {
      mainContent = document.body;
    }

    const mainText = getTextContent(mainContent);

    let fallbackText = '';
    if (mainText.length < 100) {
      const longestDivText = textsOf('div')
        .filter((text) => text.length > 200)
        .sort((a, b) => b.length - a.length)[0];
      fallbackText = longestDivText || '';
    }

    // Use the fallback whenever it carries more text than a too-short main
    // text (it is only computed when mainText is below 100 characters).
    const finalMainText = fallbackText.length > mainText.length ? fallbackText : mainText;

    const allText = finalMainText || paragraphs.join(' ') || document.body?.innerText || '';
    const wordCount = allText.split(/\s+/).filter((word) => word.length > 0).length;

    return {
      title,
      metaDescription,
      headings: {
        h1: h1Elements,
        h2: h2Elements
      },
      paragraphs,
      mainText: finalMainText,
      wordCount,
      hasContent: wordCount > 50
    };
  });
}

/**
 * Loads a URL in the given page and extracts its textual content.
 *
 * Any failure is caught and reported in the returned object, so this function
 * does not reject.
 *
 * @param {string} url - Address to load.
 * @param {import('playwright').Page} page - Fresh page used for this URL.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, `{ url, success: false, error, extractedAt }` on failure.
 */
async function extractContentFromUrl(url, page) {
  try {
    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);

    await blockUnwantedRequests(page);
    await navigateWithFallback(page, url);
    await waitForContent(page);
    await scrollThroughPage(page);
    await page.waitForTimeout(1000);

    const content = await scrapePage(page);

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    console.error(`Error extracting ${url}:`, error);
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  }
}

/**
 * Closes a Playwright browser or context and logs (instead of propagating) a
 * failure, so that cleanup cannot discard results or crash the process.
 *
 * @param {{ close: () => Promise<void> }} resource - Object to close.
 * @param {string} label - Name used in the log message.
 * @returns {Promise<void>}
 */
async function closeQuietly(resource, label) {
  try {
    await resource.close();
  } catch (closeError) {
    console.error(`Failed to close ${label}:`, closeError);
  }
}

/**
 * POST /extract-content
 *
 * Body: `{ urls: string[] }` with 1 to 10 absolute http(s) URLs. Responds with
 * per-URL results plus statistics, 400 on invalid input, 500 when the browser
 * cannot be launched or used.
 */
app.post('/extract-content', async (req, res) => {
  // req.body may be missing when no body parser handled the request.
  const { urls } = req.body ?? {};

  if (!Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls'
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong'
    });
  }

  if (urls.length > MAX_URLS_PER_REQUEST) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request'
    });
  }

  const validUrls = urls.filter((url) => isHttpUrl(url));
  const invalidUrls = urls.filter((url) => !isHttpUrl(url));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls
    });
  }

  let browser;
  try {
    browser = await chromium.launch({
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process'
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true
    });

    const results = [];

    // Process URLs sequentially to avoid browser crashes
    for (const url of validUrls) {
      const context = await browser.newContext({
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/New_York'
      });

      try {
        const page = await context.newPage();
        const result = await extractContentFromUrl(url, page);
        results.push(result);
      } catch (error) {
        console.error(`Error processing ${url}:`, error);
        results.push({
          url,
          success: false,
          error: error.message,
          extractedAt: new Date().toISOString()
        });
      } finally {
        await closeQuietly(context, 'browser context');
      }
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.filter((r) => !r.success).length;
    // hasContent is `wordCount > 50`, so it already covers `wordCount < 50`.
    const emptyContentCount = results.filter((r) => r.success && !r.content.hasContent).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount
      },
      results
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message
    });
  } finally {
    if (browser) {
      // The response has already been sent here; a rejection would otherwise
      // surface as an unhandled promise rejection.
      await closeQuietly(browser, 'browser');
    }
  }
});

/** GET /health -- liveness probe. */
app.get('/health', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString()
  });
});

/** GET / -- short description of the API and its endpoints. */
app.get('/', (req, res) => {
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints: {
      'POST /extract-content': 'Extract content from URLs',
      'GET /health': 'Health check',
      'GET /': 'API information'
    },
    usage: {
      method: 'POST',
      endpoint: '/extract-content',
      body: {
        urls: ['https://example.com', 'https://another-site.com']
      }
    }
  });
});

// Error-handling middleware: Express recognises it by its four parameters, so
// `next` has to stay in the signature even though it is not used.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong'
  });
});

// Catch-all for requests that matched no route.
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint not found'
  });
});

app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});