helper / index.js
vyles's picture
Create index.js
af34501 verified
raw
history blame
6.61 kB
const express = require('express');
const { chromium } = require('playwright');
// Single Express application instance that every middleware and route attaches to.
const app = express();

// Listening port: the PORT environment variable when it is set to a non-empty
// value, otherwise 3000.
const PORT = process.env.PORT || 3000;

// Body parsers: JSON payloads (capped at 10 MB) first, then URL-encoded forms
// parsed with the extended syntax.
app.use(
  express.json({ limit: '10mb' }),
  express.urlencoded({ extended: true })
);
// CORS middleware (optional): allow any origin and answer preflight requests directly.
app.use((req, res, next) => {
  // Same three headers on every response, set in one call.
  res.set({
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, PUT, DELETE, OPTIONS',
    'Access-Control-Allow-Headers': 'Origin, X-Requested-With, Content-Type, Accept, Authorization',
  });

  // Anything that is not a preflight continues down the middleware chain.
  if (req.method !== 'OPTIONS') {
    next();
    return;
  }

  // Preflight: reply immediately with 200 and do not reach the routes.
  res.sendStatus(200);
});
/**
 * Open a URL in headless Chromium and extract its title, meta description,
 * visible text, links, images and headings.
 *
 * A new browser is launched for every call and is closed in all cases before
 * the returned promise settles. Failures (launch, navigation, timeout,
 * evaluation) are reported through the result object rather than by rejecting.
 *
 * @param {string} url - Address to navigate to.
 * @param {number} [timeout=30000] - Milliseconds; applied as the page's default
 *   timeout and as the navigation timeout.
 * @returns {Promise<object>} On success
 *   `{ url, success: true, data, extractedAt }`, where `data` holds `title`,
 *   `metaDescription`, `textContent` (first 10000 characters), `links` (max 50),
 *   `images` (max 20), `headings` (max 30) and `wordCount`. On failure
 *   `{ url, success: false, error, extractedAt }`.
 */
async function extractContentFromUrl(url, timeout = 30000) {
  let browser;
  try {
    // NOTE(review): the Chromium sandbox is switched off by these flags while
    // the pages being loaded come from request input — confirm this is
    // acceptable for the deployment environment.
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });
    const page = await context.newPage();

    // One timeout value governs every subsequent page operation.
    page.setDefaultTimeout(timeout);

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout,
    });

    // Give late network activity a chance to finish before reading the DOM.
    await page.waitForLoadState('networkidle');

    // This callback executes inside the page, so it can only use DOM globals.
    const content = await page.evaluate(() => {
      // Drop non-visible code/style nodes so they do not pollute the text.
      document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const textContent = document.body?.innerText || '';

      // Keep only anchors that have both visible text and a target.
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          text: a.innerText.trim(),
          href: a.href,
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        alt: img.alt || '',
        src: img.src,
      }));

      // Empty headings carry no information and are skipped.
      const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))
        .map((h) => ({
          level: h.tagName.toLowerCase(),
          text: h.innerText.trim(),
        }))
        .filter((h) => h.text);

      // Splitting untrimmed text on whitespace yields empty tokens at the
      // edges, and an empty string still splits into one token; trim first
      // and treat blank text as zero words.
      const trimmedText = textContent.trim();
      const wordCount = trimmedText === '' ? 0 : trimmedText.split(/\s+/).length;

      return {
        title,
        metaDescription,
        textContent: textContent.substring(0, 10000), // Limit text content
        links: links.slice(0, 50), // Limit links
        images: images.slice(0, 20), // Limit images
        headings: headings.slice(0, 30), // Limit headings
        wordCount,
      };
    });

    return {
      url,
      success: true,
      data: content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      // Non-Error throwables have no `message`; fall back to their string form.
      error: error instanceof Error ? error.message : String(error),
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // Single cleanup point for both outcomes. A failing close is logged and
    // must not replace the result that is already being returned.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error('Failed to close browser:', closeError);
      }
    }
  }
}
/**
 * URL validation: accept only strings that parse as absolute http(s) URLs.
 *
 * Non-string values are rejected up front. Without that check, values such as
 * `['http://a.com']` would be coerced to a string by the URL constructor and
 * pass validation even though they are not usable as a URL by callers.
 *
 * @param {*} string - Candidate value to validate.
 * @returns {boolean} `true` when the value is a string whose parsed protocol is
 *   `http:` or `https:`; `false` otherwise (including unparsable input).
 */
function isValidUrl(string) {
  if (typeof string !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(string);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // The URL constructor throws on anything it cannot parse.
    return false;
  }
}
/**
 * POST /extract-content — extract content from a list of URLs.
 *
 * Request body: `{ urls: string[], timeout?: number }`.
 *   - `urls` must be a non-empty array of at most 10 entries, each accepted by
 *     `isValidUrl`.
 *   - `timeout` (milliseconds) falls back to 30000 when omitted or falsy; any
 *     other value must be a positive finite number.
 *
 * Responses:
 *   - 400 with `{ success: false, message, ... }` on invalid input.
 *   - 200 with `{ success: true, message, statistics, results }`, where each
 *     entry of `results` is whatever `extractContentFromUrl` returned for the
 *     URL at the same position.
 *   - 500 on an unexpected error.
 *
 * NOTE(review): the only URL check performed here is `isValidUrl`; if this
 * endpoint is reachable by untrusted callers, consider restricting which hosts
 * may be fetched — confirm against deployment.
 */
app.post('/extract-content', async (req, res) => {
  try {
    // The body may be absent when no parser handled the request; treat that
    // as an empty payload so the caller gets a 400 rather than a 500.
    const { urls, timeout } = req.body ?? {};

    // Input validation
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        success: false,
        message: 'URLs harus berupa array'
      });
    }

    if (urls.length === 0) {
      return res.status(400).json({
        success: false,
        message: 'Array URLs tidak boleh kosong'
      });
    }

    if (urls.length > 10) {
      return res.status(400).json({
        success: false,
        message: 'Maksimal 10 URL per request'
      });
    }

    // Validate every URL and report all offenders at once.
    const invalidUrls = urls.filter((url) => !isValidUrl(url));
    if (invalidUrls.length > 0) {
      return res.status(400).json({
        success: false,
        message: 'URL tidak valid ditemukan',
        invalidUrls
      });
    }

    // Falsy values keep selecting the default. Everything else is checked
    // before any browser is launched, so a bad timeout fails fast with a 400.
    const requestTimeout = timeout || 30000;
    if (!Number.isFinite(requestTimeout) || requestTimeout <= 0) {
      return res.status(400).json({
        success: false,
        message: 'Timeout harus berupa angka positif (milidetik)'
      });
    }

    // Extract each URL in turn; results keep the order of the request.
    const results = [];
    for (const url of urls) {
      console.log(`Extracting content from: ${url}`);
      const result = await extractContentFromUrl(url, requestTimeout);
      results.push(result);
    }

    // Statistics
    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${results.length} URL`,
      statistics: {
        total: results.length,
        success: successCount,
        failed: failCount
      },
      results
    });
  } catch (error) {
    console.error('Error in /extract-content:', error);
    res.status(500).json({
      success: false,
      message: 'Internal server error',
      error: error.message
    });
  }
});
// Health-check route: reports that the API is up, with the current time.
app.get('/health', (req, res) => {
  const now = new Date();
  const status = {
    success: true,
    message: 'API is running',
    timestamp: now.toISOString(),
  };
  res.json(status);
});
// API information route: describes the service and lists its endpoints.
app.get('/', (req, res) => {
  // Example request body shown to clients of POST /extract-content.
  const extractExample = {
    urls: ['http://example.com', 'https://example2.com'],
    timeout: 30000,
  };

  const endpoints = {
    'POST /extract-content': {
      description: 'Ekstrak konten dari array URL',
      body: extractExample,
    },
    'GET /health': 'Health check endpoint',
    'GET /': 'API information',
  };

  res.json({
    name: 'URL Content Extractor API',
    version: '1.0.0',
    description: 'API untuk mengekstrak konten dari URL menggunakan Playwright',
    endpoints,
  });
});
// Error handler: logs the error and answers with a generic 500 JSON body.
// The four-argument signature is what marks this as error-handling middleware.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  // Once the response has started, a new status/body can no longer be sent;
  // hand the error to the next error handler instead of writing again.
  if (res.headersSent) {
    return next(err);
  }

  res.status(500).json({
    success: false,
    message: 'Internal server error'
  });
});
// 404 handler: any request that reaches this point gets a JSON "not found" reply.
app.use('*', (req, res) => {
  const notFound = {
    success: false,
    message: 'Endpoint tidak ditemukan',
  };
  res.status(404).json(notFound);
});
// Start server and print where the API can be reached.
// The emoji prefixes are written as Unicode escapes so the messages survive
// files being saved or transferred with the wrong character encoding (the
// previous literals had been corrupted into unreadable byte sequences).
app.listen(PORT, () => {
  const baseUrl = `http://localhost:${PORT}`;
  console.log(`\u{1F680} Server berjalan di port ${PORT}`); // rocket
  console.log(`\u{1F4DD} API Documentation: ${baseUrl}`); // memo
  console.log(`\u2764\uFE0F Health Check: ${baseUrl}/health`); // red heart
});