Create index.js
Browse files
index.js
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
const express = require('express');
|
2 |
+
const { chromium } = require('playwright');
|
3 |
+
// Port the server binds to: the PORT environment variable when set, 3000 otherwise.
const PORT = process.env.PORT || 3000;

// The Express application every middleware and route below is attached to.
const app = express();

// Body parsers: JSON payloads (limit '10mb') and URL-encoded form bodies.
const jsonBodyParser = express.json({ limit: '10mb' });
const formBodyParser = express.urlencoded({ extended: true });
app.use(jsonBodyParser);
app.use(formBodyParser);
|
9 |
+
|
10 |
+
// CORS middleware (optional): sets permissive CORS headers on every response
// and answers OPTIONS (preflight) requests directly.
app.use((req, res, next) => {
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, PUT, DELETE, OPTIONS',
    'Access-Control-Allow-Headers': 'Origin, X-Requested-With, Content-Type, Accept, Authorization',
  };
  for (const [headerName, headerValue] of Object.entries(corsHeaders)) {
    res.header(headerName, headerValue);
  }

  // Anything that is not a preflight request continues down the middleware chain.
  if (req.method !== 'OPTIONS') {
    next();
    return;
  }

  // Preflight requests end here with a bare 200.
  res.sendStatus(200);
});
|
21 |
+
|
22 |
+
/**
 * Loads a URL in a headless Chromium instance and extracts a summary of the page.
 *
 * Failures (launch, navigation, timeout, evaluation) are reported through the
 * returned object rather than by rejecting, and the browser is closed on every
 * path.
 *
 * @param {string} url - Address to load.
 * @param {number} [timeout=30000] - Timeout in milliseconds, used for the
 *   navigation and set as the page's default timeout.
 * @returns {Promise<object>} On success `{ url, success: true, data, extractedAt }`,
 *   where `data` holds `title`, `metaDescription`, `textContent` (first 10000
 *   characters), `links` (first 50), `images` (first 20), `headings` (first 30)
 *   and `wordCount` (words in the full, untruncated text).
 *   On failure `{ url, success: false, error, extractedAt }`.
 */
async function extractContentFromUrl(url, timeout = 30000) {
  let browser;
  try {
    // Launch browser
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });

    const page = await context.newPage();

    // Default timeout for subsequent page operations
    page.setDefaultTimeout(timeout);

    // Navigate, then wait for network activity to settle before reading the DOM
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout,
    });
    await page.waitForLoadState('networkidle');

    // This callback is executed by page.evaluate(), so it must be self-contained
    // and may only reference page globals such as `document`.
    const content = await page.evaluate(() => {
      // Remove script and style elements so they do not pollute the text
      document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const textContent = document.body?.innerText || '';

      // Links with both visible text and a target
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({
          text: a.innerText.trim(),
          href: a.href,
        }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        alt: img.alt || '',
        src: img.src,
      }));

      // Non-empty headings, tagged with their level (h1..h6)
      const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))
        .map((h) => ({
          level: h.tagName.toLowerCase(),
          text: h.innerText.trim(),
        }))
        .filter((h) => h.text);

      // Count runs of non-whitespace; split(/\s+/) would report 1 for empty
      // text and count empty tokens produced by leading/trailing whitespace.
      const words = textContent.match(/\S+/g) || [];

      return {
        title,
        metaDescription,
        textContent: textContent.substring(0, 10000), // Limit text content
        links: links.slice(0, 50), // Limit links
        images: images.slice(0, 20), // Limit images
        headings: headings.slice(0, 30), // Limit headings
        wordCount: words.length,
      };
    });

    return {
      url,
      success: true,
      data: content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      // Non-Error values can be thrown too; fall back to their string form.
      error: error?.message ?? String(error),
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // Close exactly once on every path. A failing close must not replace the
    // result computed above, so it is logged instead of rethrown.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error('Failed to close browser:', closeError);
      }
    }
  }
}
|
116 |
+
|
117 |
+
/**
 * URL validation: reports whether a value parses as an absolute http(s) URL.
 *
 * @param {*} string - Candidate value; anything `new URL()` cannot parse is rejected.
 * @returns {boolean} `true` only when parsing succeeds and the protocol is
 *   `http:` or `https:`.
 */
function isValidUrl(string) {
  let parsed;
  try {
    parsed = new URL(string);
  } catch {
    // Not parseable as a URL at all.
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}
|
126 |
+
|
127 |
+
// Route: extract content from a batch of URLs.
//
// Reads `urls` (array of 1-10 http/https URLs) and an optional `timeout`
// (milliseconds) from the request body. URLs are processed one after another;
// per-URL failures are reported inside `results` and do not fail the request.
// Responds 400 on invalid input and 500 on unexpected errors.
app.post('/extract-content', async (req, res) => {
  const DEFAULT_TIMEOUT_MS = 30000;
  // Upper bound on the per-URL timeout so one request cannot hold a browser
  // open for an arbitrarily long time.
  const MAX_TIMEOUT_MS = 120000;

  try {
    // req.body can be undefined when no body parser handled the request;
    // treat that as an empty body so the caller gets a 400, not a 500.
    const { urls, timeout } = req.body ?? {};

    // Validate input
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        success: false,
        message: 'URLs harus berupa array'
      });
    }

    if (urls.length === 0) {
      return res.status(400).json({
        success: false,
        message: 'Array URLs tidak boleh kosong'
      });
    }

    if (urls.length > 10) {
      return res.status(400).json({
        success: false,
        message: 'Maksimal 10 URL per request'
      });
    }

    // Validate every URL
    const invalidUrls = urls.filter((url) => !isValidUrl(url));
    if (invalidUrls.length > 0) {
      return res.status(400).json({
        success: false,
        message: 'URL tidak valid ditemukan',
        invalidUrls
      });
    }

    // Falsy values (missing, null, 0) keep meaning "use the default".
    // Anything else must be a finite, positive number of milliseconds.
    const requestedTimeout = timeout || DEFAULT_TIMEOUT_MS;
    if (
      typeof requestedTimeout !== 'number' ||
      !Number.isFinite(requestedTimeout) ||
      requestedTimeout < 0
    ) {
      return res.status(400).json({
        success: false,
        message: 'Timeout harus berupa angka positif dalam milidetik'
      });
    }
    const requestTimeout = Math.min(requestedTimeout, MAX_TIMEOUT_MS);

    // Extract content from each URL, sequentially
    const results = [];
    for (const url of urls) {
      console.log(`Extracting content from: ${url}`);
      const result = await extractContentFromUrl(url, requestTimeout);
      results.push(result);
    }

    // Compute statistics
    const successCount = results.filter((r) => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${results.length} URL`,
      statistics: {
        total: results.length,
        success: successCount,
        failed: failCount
      },
      results
    });
  } catch (error) {
    console.error('Error in /extract-content:', error);
    res.status(500).json({
      success: false,
      message: 'Internal server error',
      error: error.message
    });
  }
});
|
198 |
+
|
199 |
+
// Health-check route: confirms the API is running and reports the current server time.
app.get('/health', (req, res) => {
  const payload = {
    success: true,
    message: 'API is running',
    timestamp: new Date().toISOString(),
  };
  res.json(payload);
});
|
207 |
+
|
208 |
+
// API info route: returns a static, self-describing summary of the available endpoints.
app.get('/', (req, res) => {
  // Example request shown for the extraction endpoint.
  const extractContentDoc = {
    description: 'Ekstrak konten dari array URL',
    body: {
      urls: ['http://example.com', 'https://example2.com'],
      timeout: 30000,
    },
  };

  const endpoints = {
    'POST /extract-content': extractContentDoc,
    'GET /health': 'Health check endpoint',
    'GET /': 'API information',
  };

  res.json({
    name: 'URL Content Extractor API',
    version: '1.0.0',
    description: 'API untuk mengekstrak konten dari URL menggunakan Playwright',
    endpoints,
  });
});
|
227 |
+
|
228 |
+
// Error handler: turns errors passed down the middleware chain into JSON responses.
//
// - If a response has already started, the error is delegated to Express's
//   default handler via next(err) instead of writing a second response.
// - Errors carrying a 4xx `status`/`statusCode` (e.g. those raised by the body
//   parsers for malformed JSON or oversized payloads) keep that status rather
//   than being reported as a server fault.
// - Everything else becomes a generic 500 with no internal details.
//
// The four-argument signature is what marks this as an error handler, so
// `next` must stay in the parameter list.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  if (res.headersSent) {
    return next(err);
  }

  const reportedStatus = err?.status ?? err?.statusCode;
  const isClientError =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 500;

  if (isClientError) {
    return res.status(reportedStatus).json({
      success: false,
      message: err.message || 'Bad request'
    });
  }

  res.status(500).json({
    success: false,
    message: 'Internal server error'
  });
});
|
236 |
+
|
237 |
+
// 404 handler: reached only when nothing registered earlier produced a response.
//
// Mounted without a path so it matches every request. A bare '*' mount path is
// rejected by Express 5's route parser, whereas a path-less app.use() behaves
// the same on Express 4 and Express 5.
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint tidak ditemukan'
  });
});
|
244 |
+
|
245 |
+
// Start the HTTP server and, once it is listening, print where the API can be reached.
app.listen(PORT, () => {
  const startupLines = [
    `🚀 Server berjalan di port ${PORT}`,
    `📝 API Documentation: http://localhost:${PORT}`,
    `❤️ Health Check: http://localhost:${PORT}/health`,
  ];
  for (const line of startupLines) {
    console.log(line);
  }
});
|