|
const express = require('express'); |
|
const { chromium } = require('playwright'); |
|
// Port to listen on: the PORT environment variable when set, otherwise 3000.
const PORT = process.env.PORT || 3000;

// The Express application that all middleware and routes are attached to.
const app = express();

// Request-body parsers, registered in this order: JSON payloads (limit
// '10mb'), then URL-encoded bodies with `extended: true`.
const jsonBodyParser = express.json({ limit: '10mb' });
const formBodyParser = express.urlencoded({ extended: true });

app.use(jsonBodyParser);
app.use(formBodyParser);
|
|
|
|
|
/**
 * CORS middleware: sets the three Access-Control-Allow-* response headers on
 * every request. OPTIONS requests are answered immediately with status 200;
 * every other method is passed on to the next handler.
 */
app.use(function applyCorsHeaders(req, res, next) {
  const corsHeaders = [
    ['Access-Control-Allow-Origin', '*'],
    ['Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS'],
    ['Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept, Authorization'],
  ];

  for (const [headerName, headerValue] of corsHeaders) {
    res.header(headerName, headerValue);
  }

  // Anything that is not an OPTIONS request continues down the chain.
  if (req.method !== 'OPTIONS') {
    next();
    return;
  }

  res.sendStatus(200);
});
|
|
|
|
|
/**
 * Loads `url` in a freshly launched headless Chromium instance and extracts a
 * summary of the page.
 *
 * Errors raised while launching, navigating or evaluating are not thrown to
 * the caller; they are reported through the returned object. The browser is
 * always closed before the function settles, and a failure to close it is
 * logged rather than allowed to replace the result.
 *
 * @param {string} url - Address handed to `page.goto`.
 * @param {number} [timeout=30000] - Timeout in milliseconds, applied as the
 *   page's default timeout and as the navigation timeout.
 * @returns {Promise<object>} On success
 *   `{ url, success: true, data, extractedAt }`, where `data` contains
 *   `title`, `metaDescription`, `textContent` (first 10000 characters),
 *   `links` (first 50, each `{ text, href }`), `images` (first 20, each
 *   `{ alt, src }`), `headings` (first 30, each `{ level, text }`) and
 *   `wordCount` (whitespace-separated tokens in the full text).
 *   On failure `{ url, success: false, error, extractedAt }`.
 */
async function extractContentFromUrl(url, timeout = 30000) {
  let browser;

  try {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const context = await browser.newContext({
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });
    const page = await context.newPage();
    page.setDefaultTimeout(timeout);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout });
    // Wait for the 'networkidle' load state before reading the DOM.
    await page.waitForLoadState('networkidle');

    // The callback is evaluated by Playwright against the page's `document`,
    // so it must not reference anything from this module's scope.
    const content = await page.evaluate(() => {
      // Remove script/style/noscript nodes before collecting text.
      document
        .querySelectorAll('script, style, noscript')
        .forEach((el) => el.remove());

      const title = document.title || '';
      const metaDescription =
        document.querySelector('meta[name="description"]')?.content || '';
      const textContent = document.body?.innerText || '';

      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          // `a[href]` can also match SVG <a> elements, which expose no
          // `innerText`; treat them as text-less so the filter drops them
          // instead of letting `.trim()` throw and fail the whole page.
          text: (a.innerText ?? '').trim(),
          href: a.href,
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map(
        (img) => ({
          alt: img.alt || '',
          src: img.src,
        }),
      );

      const headings = Array.from(
        document.querySelectorAll('h1, h2, h3, h4, h5, h6'),
      )
        .map((h) => ({
          level: h.tagName.toLowerCase(),
          text: h.innerText.trim(),
        }))
        .filter((h) => h.text);

      // Count non-whitespace runs. Splitting on whitespace would report 1
      // for empty text and add phantom words for leading/trailing blanks.
      const words = textContent.match(/\S+/g);

      return {
        title,
        metaDescription,
        textContent: textContent.substring(0, 10000),
        links: links.slice(0, 50),
        images: images.slice(0, 20),
        headings: headings.slice(0, 30),
        wordCount: words ? words.length : 0,
      };
    });

    return {
      url,
      success: true,
      data: content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // Close on every path. A close failure is logged, not thrown, so it can
    // neither discard a finished extraction nor turn the failure object into
    // a rejected promise.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error(`Failed to close browser for ${url}:`, closeError);
      }
    }
  }
}
|
|
|
|
|
/**
 * Reports whether `string` is an absolute URL using the http or https scheme.
 *
 * @param {*} string - Candidate value; anything that is not a string is
 *   rejected outright.
 * @returns {boolean} `true` only for a string that parses as a URL whose
 *   protocol is `http:` or `https:`.
 */
function isValidUrl(string) {
  // `new URL()` stringifies its argument, so without this guard a value such
  // as ['https://example.com'] would be accepted as a valid URL.
  if (typeof string !== 'string') {
    return false;
  }

  let parsed;
  try {
    parsed = new URL(string);
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }

  return parsed.protocol === 'http:' || parsed.protocol === 'https:';
}
|
|
|
|
|
/**
 * POST /extract-content
 *
 * Body: `{ urls: string[], timeout?: number }`.
 *
 * Responds 400 when `urls` is not an array, is empty, has more than 10
 * entries, or contains an entry rejected by `isValidUrl` (those entries are
 * echoed back as `invalidUrls`). Otherwise each URL is extracted one after
 * another and the response carries per-URL results plus success/failure
 * counts. Unexpected exceptions produce a 500.
 *
 * NOTE(security): URLs are only checked by `isValidUrl`, which does not
 * restrict the target host, so a caller can make this server fetch loopback
 * or private-network addresses (SSRF). Restrict the allowed hosts before
 * exposing this endpoint to untrusted clients.
 */
app.post('/extract-content', async (req, res) => {
  const DEFAULT_TIMEOUT_MS = 30000;

  try {
    // Tolerate a missing body so the array check below answers with a 400
    // instead of the destructuring throwing into the 500 path.
    const { urls, timeout } = req.body ?? {};

    if (!Array.isArray(urls)) {
      return res.status(400).json({
        success: false,
        message: 'URLs harus berupa array',
      });
    }

    if (urls.length === 0) {
      return res.status(400).json({
        success: false,
        message: 'Array URLs tidak boleh kosong',
      });
    }

    if (urls.length > 10) {
      return res.status(400).json({
        success: false,
        message: 'Maksimal 10 URL per request',
      });
    }

    const invalidUrls = urls.filter((url) => !isValidUrl(url));
    if (invalidUrls.length > 0) {
      return res.status(400).json({
        success: false,
        message: 'URL tidak valid ditemukan',
        invalidUrls,
      });
    }

    // Only a finite, positive number (or numeric string, as produced by
    // URL-encoded bodies) is forwarded; anything else — missing, zero,
    // negative, NaN, objects — falls back to the default.
    const candidateTimeout =
      typeof timeout === 'string' ? Number(timeout) : timeout;
    const hasUsableTimeout =
      typeof candidateTimeout === 'number' &&
      Number.isFinite(candidateTimeout) &&
      candidateTimeout > 0;
    const requestTimeout = hasUsableTimeout
      ? candidateTimeout
      : DEFAULT_TIMEOUT_MS;

    // Deliberately sequential: each extraction launches its own browser.
    const results = [];
    for (const url of urls) {
      console.log(`Extracting content from: ${url}`);
      const result = await extractContentFromUrl(url, requestTimeout);
      results.push(result);
    }

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${results.length} URL`,
      statistics: {
        total: results.length,
        success: successCount,
        failed: failCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error in /extract-content:', error);
    res.status(500).json({
      success: false,
      message: 'Internal server error',
      error: error.message,
    });
  }
});
|
|
|
|
|
/**
 * GET /health — responds with a fixed "running" payload and the current
 * server time as an ISO-8601 string.
 */
app.get('/health', (req, res) => {
  const timestamp = new Date().toISOString();
  const payload = { success: true, message: 'API is running', timestamp };

  res.json(payload);
});
|
|
|
|
|
/**
 * GET / — responds with a static description of the API: its name, version,
 * a short description and the list of endpoints, including an example
 * request body for POST /extract-content.
 */
app.get('/', (req, res) => {
  const exampleRequestBody = {
    urls: ['http://example.com', 'https://example2.com'],
    timeout: 30000,
  };

  const endpoints = {
    'POST /extract-content': {
      description: 'Ekstrak konten dari array URL',
      body: exampleRequestBody,
    },
    'GET /health': 'Health check endpoint',
    'GET /': 'API information',
  };

  res.json({
    name: 'URL Content Extractor API',
    version: '1.0.0',
    description: 'API untuk mengekstrak konten dari URL menggunakan Playwright',
    endpoints,
  });
});
|
|
|
|
|
/**
 * Error-handling middleware. It keeps all four parameters because Express
 * recognises error handlers by their arity.
 *
 * - If the response has already started, the error is handed to the next
 *   error handler instead of attempting to write a second response.
 * - Errors carrying a 4xx `status`/`statusCode` (for example the ones the
 *   body parsers raise for malformed JSON or an oversized payload) are
 *   answered with that status and the error's message.
 * - Everything else is answered with a generic 500.
 */
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  if (res.headersSent) {
    return next(err);
  }

  const reportedStatus = err?.status ?? err?.statusCode;
  const isClientError =
    Number.isInteger(reportedStatus) &&
    reportedStatus >= 400 &&
    reportedStatus < 500;

  res.status(isClientError ? reportedStatus : 500).json({
    success: false,
    // Server-side failures keep the generic text so internals are not leaked.
    message: isClientError
      ? err.message || 'Bad request'
      : 'Internal server error',
  });
});
|
|
|
|
|
/**
 * Catch-all 404 responder for requests that nothing registered earlier has
 * answered.
 *
 * Mounted without a path so it matches every request. A bare '*' string path
 * matches everything on Express 4 but is rejected by the path matcher in
 * Express 5, whereas a path-less `app.use` works on both.
 */
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint tidak ditemukan',
  });
});
|
|
|
|
|
/**
 * Starts the HTTP server on PORT and, once it is listening, logs the port
 * together with the local documentation and health-check URLs.
 */
app.listen(PORT, () => {
  console.log(`🚀 Server berjalan di port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`❤️ Health Check: http://localhost:${PORT}/health`);
});