/*
 * helper / index.js
 * Last commit: "Update index.js" (979deb4, verified) by vyles
 * Size: 6.88 kB
 */
import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
// Create the Express application instance that all routes below attach to.
const app = express();
// Listening port: taken from the PORT environment variable, falling back to 3000.
const PORT = process.env.PORT || 3000;
// Middleware: parse JSON request bodies (size limit '50mb'), then apply the
// cors middleware with its default options.
app.use(express.json({ limit: '50mb' }));
app.use(cors());
/**
 * Open `url` in a fresh page of `browser` and scrape its readable content.
 *
 * Extraction problems (page creation, navigation, evaluation) never reject the
 * returned promise; they are reported as a `{ success: false, error }` result
 * so that one bad URL cannot abort a whole batch.
 *
 * @param {string} url - Absolute URL to load.
 * @param {import('playwright').Browser} browser - Launched Playwright browser
 *   (anything exposing a compatible `newPage(options)` works).
 * @returns {Promise<{url: string, success: boolean, content?: object, error?: string, extractedAt: string}>}
 *   On success `content` holds title, metaDescription, headings, paragraphs,
 *   mainText, up to 20 links, up to 10 images and a wordCount.
 */
async function extractContentFromUrl(url, browser) {
  let page;
  try {
    // Playwright's Page has no setUserAgent() (that is the Puppeteer API); the
    // user agent must be supplied when the page/context is created.
    page = await browser.newPage({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });

    // Navigate with a hard timeout so a hanging site cannot stall the batch.
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });

    // Give client-side rendering a short grace period before reading the DOM.
    await page.waitForTimeout(2000);

    // This callback is executed inside the page, so it must be self-contained
    // and must not reference variables from the surrounding Node.js scope.
    const content = await page.evaluate(() => {
      // Collapse runs of whitespace and trim; tolerate null/undefined text.
      const cleanText = (text) => {
        return text ? text.replace(/\s+/g, ' ').trim() : '';
      };

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';

      const h1Elements = Array.from(document.querySelectorAll('h1')).map((h1) => cleanText(h1.innerText));
      const h2Elements = Array.from(document.querySelectorAll('h2')).map((h2) => cleanText(h2.innerText));

      // Keep only paragraphs longer than 20 characters to skip boilerplate.
      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map((p) => cleanText(p.innerText))
        .filter((text) => text.length > 20);

      // Text of the first main-content container, when the page has one.
      const mainContent = document.querySelector('main, article, .content, #content, .post-content');
      const mainText = mainContent ? cleanText(mainContent.innerText) : '';

      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          text: cleanText(a.innerText),
          href: a.href,
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]'))
        .map((img) => ({
          src: img.src,
          alt: img.alt || '',
          title: img.title || '',
        }));

      // ''.split(/\s+/) yields [''], which would count an empty page as one
      // word; dropping empty tokens makes an empty page count as zero.
      const wordSource = mainText || paragraphs.join(' ');
      const wordCount = wordSource.split(/\s+/).filter(Boolean).length;

      return {
        title,
        metaDescription,
        headings: {
          h1: h1Elements,
          h2: h2Elements,
        },
        paragraphs,
        mainText,
        links: links.slice(0, 20), // cap the number of links returned
        images: images.slice(0, 10), // cap the number of images returned
        wordCount,
      };
    });

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // `page` is undefined when newPage() itself failed.
    if (page) {
      await page.close();
    }
  }
}
// Schemes the extractor is allowed to navigate to. Anything else (file:,
// javascript:, data:, chrome:, ...) parses fine with `new URL` but would let a
// caller read local files or run arbitrary schemes in the headless browser.
const ALLOWED_URL_PROTOCOLS = new Set(['http:', 'https:']);

/** Return true when `value` is a string that parses as an absolute http(s) URL. */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    return ALLOWED_URL_PROTOCOLS.has(new URL(value).protocol);
  } catch {
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Body: `{ urls: string[] }` with 1 to 10 absolute http(s) URLs.
 * Responds 400 when the body is malformed or any URL is rejected (the rejected
 * entries are echoed in `invalidUrls`), 500 when the browser cannot be
 * launched or processing fails unexpectedly, and 200 with per-URL results and
 * success/failure statistics otherwise.
 *
 * NOTE(review): http(s) URLs pointing at internal hosts are still accepted;
 * restricting the network range is out of scope for this handler.
 */
app.post('/extract-content', async (req, res) => {
  const { urls } = req.body;

  // Input validation
  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls'
    });
  }
  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong'
    });
  }
  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request'
    });
  }

  // URL validation: only string http(s) URLs are accepted.
  const validUrls = [];
  const invalidUrls = [];
  for (const url of urls) {
    (isHttpUrl(url) ? validUrls : invalidUrls).push(url);
  }
  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls
    });
  }

  let browser;
  try {
    // One browser per request; it is always closed in `finally`.
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    console.log(`Memproses ${validUrls.length} URLs...`);

    // Process the URLs in batches so at most `concurrencyLimit` pages are
    // open at the same time.
    const concurrencyLimit = 3;
    const results = [];
    for (let i = 0; i < validUrls.length; i += concurrencyLimit) {
      const batch = validUrls.slice(i, i + concurrencyLimit);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser))
      );
      results.push(...batchResults);
    }

    // Result statistics
    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount
      },
      results
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message
    });
  } finally {
    if (browser) {
      await browser.close();
    }
  }
});
// Health check endpoint: always answers with a success flag, a fixed status
// message and the current time as an ISO-8601 string.
app.get('/health', (_req, res) => {
  const timestamp = new Date().toISOString();
  res.json({
    success: true,
    message: 'Content Extractor API is running',
    timestamp,
  });
});
// Root endpoint: returns a small self-description of the API (the available
// endpoints and an example request for /extract-content).
app.get('/', (_req, res) => {
  const endpoints = {
    'POST /extract-content': 'Extract content from URLs',
    'GET /health': 'Health check',
    'GET /': 'API information',
  };
  const exampleBody = {
    urls: ['https://example.com', 'https://another-site.com'],
  };
  const usage = {
    method: 'POST',
    endpoint: '/extract-content',
    body: exampleBody,
  };
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints,
    usage,
  });
});
// Error handling middleware (four-argument signature). Logs the error and
// answers 500; the real error message is only included in the response when
// NODE_ENV is 'development'.
app.use((err, _req, res, _next) => {
  console.error('Unhandled error:', err);
  const isDevelopment = process.env.NODE_ENV === 'development';
  const payload = {
    success: false,
    message: 'Internal server error',
    error: isDevelopment ? err.message : 'Something went wrong',
  };
  res.status(500).json(payload);
});
// 404 handler: registered without a path pattern (works around a
// path-to-regexp error), so it answers any request no earlier route handled.
app.use((_req, res) => {
  const body = {
    success: false,
    message: 'Endpoint not found',
  };
  res.status(404);
  res.json(body);
});
// Start the HTTP server on PORT and log where the API can be reached.
// The emoji prefixes were mis-encoded (UTF-8 bytes decoded with a legacy
// code page); they are restored to the intended characters here.
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});
export default app;