/*
 * helper / index.js
 * Uploaded by vyles ("Upload index.js"), commit 9db0314 (verified), 6.88 kB.
 * Repository page links: raw, history, blame.
 */
import express from 'express';
import { chromium } from 'playwright';
import cors from 'cors';
import bodyParser from 'body-parser'
// Express application setup: listening port, JSON output formatting,
// request-body parsers and CORS.
const app = express();

// `||` rather than `??` so that an empty PORT variable also falls back to 7860.
const PORT = process.env.PORT || 7860;

// Pretty-print JSON responses with two-space indentation.
app.set('json spaces', 2);

// Parse URL-encoded form bodies.
app.use(bodyParser.urlencoded({ extended: true }));

// Exactly one JSON parser. A second, default-configured JSON parser used to be
// registered ahead of this one; because the first parser to run consumes the
// body, the limit configured here never took effect.
// NOTE(review): 500mb is far more than a list of at most 10 URLs needs —
// consider lowering it.
app.use(express.json({ limit: '500mb' }));

// Allow cross-origin requests using the cors package defaults.
app.use(cors());
/**
 * Open `url` in a fresh browser context and extract its title, meta
 * description, headings, paragraphs, main text, links and images.
 *
 * Failures while creating the context, navigating or extracting are reported
 * in the returned object (`success: false`) instead of being thrown, so one
 * bad URL cannot reject a whole batch. The context is always closed.
 *
 * @param {string} url - Address to navigate to.
 * @param {object} browser - Browser object exposing `newContext()`.
 * @param {object} [options] - Optional timing overrides.
 * @param {number} [options.navigationTimeout=60000] - Navigation timeout in ms.
 * @param {number} [options.settleDelay=15000] - Pause in ms after the
 *   `domcontentloaded` event before the page is read.
 * @returns {Promise<object>} `{ url, success: true, content, extractedAt }` on
 *   success, `{ url, success: false, error, extractedAt }` on failure.
 */
async function extractContentFromUrl(
  url,
  browser,
  { navigationTimeout = 60000, settleDelay = 15000 } = {}
) {
  let context;
  try {
    // Created inside the try block so that a failure here is reported like any
    // other extraction failure.
    context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    });
    const page = await context.newPage();

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: navigationTimeout
    });

    // Fixed pause after DOMContentLoaded — presumably to let client-side
    // rendering finish before the DOM is read.
    if (settleDelay > 0) {
      await page.waitForTimeout(settleDelay);
    }

    // This callback is handed to page.evaluate, so it must be self-contained:
    // it may only use its own locals and the page's `document`.
    const content = await page.evaluate(() => {
      // Collapse runs of whitespace and trim; missing text becomes ''.
      const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');
      const textsOf = (selector) =>
        Array.from(document.querySelectorAll(selector)).map((el) => cleanText(el.innerText));

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';

      // Drop paragraphs that are too short to be meaningful.
      const paragraphs = textsOf('p').filter((text) => text.length > 20);

      const mainContent = document.querySelector('main, article, .content, #content, .post-content');
      const mainText = mainContent ? cleanText(mainContent.innerText) : '';

      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({ text: cleanText(a.innerText), href: a.href }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        src: img.src,
        alt: img.alt || '',
        title: img.title || ''
      }));

      // Prefer the main container's text; fall back to the kept paragraphs.
      // filter(Boolean) removes the empty token ''.split() yields, so a page
      // without text reports 0 words rather than 1.
      const wordSource = mainText || paragraphs.join(' ');
      const wordCount = wordSource.split(/\s+/).filter(Boolean).length;

      return {
        title,
        metaDescription,
        headings: {
          h1: textsOf('h1'),
          h2: textsOf('h2')
        },
        paragraphs,
        mainText,
        links: links.slice(0, 20),
        images: images.slice(0, 10),
        wordCount
      };
    });

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error instanceof Error ? error.message : String(error),
      extractedAt: new Date().toISOString()
    };
  } finally {
    if (context) {
      try {
        await context.close();
      } catch (closeError) {
        // A failed cleanup must not replace the result computed above.
        console.error('Failed to close browser context:', closeError);
      }
    }
  }
}
/**
 * Report whether `value` is a string containing an absolute http(s) URL.
 * Other schemes (file:, chrome:, data:, ...) are rejected so that callers
 * cannot point the server-side browser at local resources.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True only for parseable http: or https: URL strings.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Expects a JSON body `{ urls: string[] }` holding 1 to 10 http(s) URLs.
 * Responds 400 when the body is malformed, 500 when the browser cannot be
 * launched or processing fails, and otherwise 200 with per-URL results plus
 * success/failure counts.
 */
app.post('/extract-content', async (req, res) => {
  // req.body may be undefined when no body parser handled the request.
  const { urls } = req.body ?? {};

  if (!urls || !Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls'
    });
  }
  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong'
    });
  }
  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request'
    });
  }

  const invalidUrls = urls.filter((url) => !isHttpUrl(url));
  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls
    });
  }
  // NOTE(review): http(s) URLs that resolve to internal addresses are still
  // accepted; blocking those needs resolution-time checks outside this handler.

  let browser;
  try {
    browser = await chromium.launch({
      args: ['--incognito', '--single-process', '--no-sandbox', '--no-zygote', '--no-cache'],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    // Process the URLs in batches so that at most `concurrencyLimit` pages
    // are open at the same time.
    const concurrencyLimit = 3;
    const results = [];
    for (let i = 0; i < urls.length; i += concurrencyLimit) {
      const batch = urls.slice(i, i + concurrencyLimit);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser))
      );
      results.push(...batchResults);
    }

    const successCount = results.filter((r) => r.success).length;
    res.json({
      success: true,
      message: `Berhasil memproses ${urls.length} URLs`,
      statistics: {
        total: urls.length,
        success: successCount,
        failed: results.length - successCount
      },
      results
    });
  } catch (error) {
    console.error('Error:', error);
    // Only answer if the success response has not already been started.
    if (!res.headersSent) {
      res.status(500).json({
        success: false,
        message: 'Terjadi kesalahan saat memproses URLs',
        error: error.message
      });
    }
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; a failed shutdown is only logged
        // so it cannot surface as an unhandled rejection.
        console.error('Failed to close browser:', closeError);
      }
    }
  }
});
// GET /health — health check: confirms the API is up and reports the current
// server time as an ISO-8601 timestamp.
app.get('/health', (_req, res) => {
  const status = {
    success: true,
    message: 'Content Extractor API is running',
    timestamp: new Date().toISOString()
  };
  res.json(status);
});
// GET / — API information: lists the available endpoints and shows an example
// request for the extraction endpoint.
app.get('/', (_req, res) => {
  const endpoints = {
    'POST /extract-content': 'Extract content from URLs',
    'GET /health': 'Health check',
    'GET /': 'API information'
  };
  const usage = {
    method: 'POST',
    endpoint: '/extract-content',
    body: {
      urls: ['https://example.com', 'https://another-site.com']
    }
  };
  res.json({
    success: true,
    message: 'Content Extractor API',
    endpoints,
    usage
  });
});
/**
 * Application-wide error middleware (recognised by Express through its four
 * parameters).
 *
 * - If a response has already been started, the error is delegated to the
 *   default Express handler, because a second response cannot be written.
 * - Errors carrying a 4xx `status`/`statusCode` (for example a malformed or
 *   oversized request body rejected by a body parser) keep that status
 *   instead of being reported as a server fault.
 * - Everything else is answered with 500; the error message is only included
 *   when NODE_ENV is 'development'.
 */
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  if (res.headersSent) {
    return next(err);
  }

  const isDevelopment = process.env.NODE_ENV === 'development';
  const reportedStatus = err?.status ?? err?.statusCode;
  const isClientError =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 500;

  if (isClientError) {
    // `expose` marks messages the error's producer considers safe for clients.
    const canShowMessage = isDevelopment || err.expose === true;
    return res.status(reportedStatus).json({
      success: false,
      message: 'Invalid request',
      error: canShowMessage ? err.message : 'Something went wrong'
    });
  }

  res.status(500).json({
    success: false,
    message: 'Internal server error',
    error: isDevelopment ? err.message : 'Something went wrong'
  });
});
// Fallback for requests that no earlier route or middleware answered:
// reply with a JSON 404.
app.use((_req, res) => {
  const notFound = {
    success: false,
    message: 'Endpoint not found'
  };
  res.status(404).json(notFound);
});
// Start the HTTP server and log where the API, its information page and the
// health check can be reached. The emoji prefixes were restored from
// mis-decoded byte sequences.
app.listen(PORT, () => {
  console.log(`🚀 Content Extractor API running on port ${PORT}`);
  console.log(`📖 API Documentation: http://localhost:${PORT}`);
  console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
});