vyles committed on
Commit
af34501
·
verified ·
1 Parent(s): 3bc292d

Create index.js

Browse files
Files changed (1) hide show
  1. index.js +250 -0
index.js ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const express = require('express');
2
+ const { chromium } = require('playwright');
3
const app = express();
const PORT = process.env.PORT || 3000;

// Body parsers: JSON payloads (up to 10mb) and URL-encoded form bodies.
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));

// CORS middleware (optional): allows any origin and answers preflight
// requests directly instead of passing them on to the routes.
app.use((req, res, next) => {
  const corsHeaders = [
    ['Access-Control-Allow-Origin', '*'],
    ['Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS'],
    ['Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept, Authorization'],
  ];
  for (const [headerName, headerValue] of corsHeaders) {
    res.header(headerName, headerValue);
  }

  if (req.method !== 'OPTIONS') {
    next();
    return;
  }
  res.sendStatus(200);
});
21
+
22
/**
 * Loads `url` in a headless Chromium instance and extracts a summary of the
 * rendered page (title, meta description, visible text, links, images,
 * headings and a word count).
 *
 * The function never rejects: every failure, including a failure to close the
 * browser, is reported through the returned result object or logged.
 *
 * @param {string} url - URL to open.
 * @param {number} [timeout=30000] - Timeout in milliseconds, applied as the
 *   page default timeout and as the navigation timeout.
 * @returns {Promise<object>} `{ url, success: true, data, extractedAt }` on
 *   success, or `{ url, success: false, error, extractedAt }` on failure.
 */
async function extractContentFromUrl(url, timeout = 30000) {
  let browser;
  try {
    browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    });
    const page = await context.newPage();
    page.setDefaultTimeout(timeout);

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout,
    });

    // Waiting for network idle is best-effort: pages that keep connections
    // open (polling, analytics, sockets) may never become idle, but their DOM
    // is already loaded at this point and can still be extracted.
    try {
      await page.waitForLoadState('networkidle');
    } catch (waitError) {
      if (waitError?.name !== 'TimeoutError') {
        throw waitError;
      }
    }

    // Runs inside the page, so it can only use browser globals.
    const content = await page.evaluate(() => {
      // Drop non-visible code/style nodes before reading text.
      document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const textContent = document.body?.innerText || '';

      // Keep only links that have both visible text and a target.
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({ text: a.innerText.trim(), href: a.href }))
        .filter((link) => link.text && link.href);

      const images = Array.from(document.querySelectorAll('img[src]')).map((img) => ({
        alt: img.alt || '',
        src: img.src,
      }));

      const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))
        .map((h) => ({ level: h.tagName.toLowerCase(), text: h.innerText.trim() }))
        .filter((h) => h.text);

      // Trim first: splitting '' or padded text on whitespace would count
      // empty strings as words.
      const trimmedText = textContent.trim();
      const wordCount = trimmedText ? trimmedText.split(/\s+/).length : 0;

      return {
        title,
        metaDescription,
        textContent: textContent.substring(0, 10000), // Limit text content
        links: links.slice(0, 50), // Limit links
        images: images.slice(0, 20), // Limit images
        headings: headings.slice(0, 30), // Limit headings
        wordCount, // Counted on the full, untruncated text
      };
    });

    return {
      url,
      success: true,
      data: content,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error?.message ?? String(error),
      extractedAt: new Date().toISOString(),
    };
  } finally {
    // Always release the browser; a failing close must not replace the result.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error('Failed to close browser:', closeError);
      }
    }
  }
}
116
+
117
/**
 * Checks whether a value is an absolute http(s) URL.
 *
 * Non-string values are rejected up front: `new URL()` stringifies its
 * argument, so an array such as `['http://a.com']` would otherwise be
 * accepted as a valid URL.
 *
 * @param {*} string - Candidate value, normally a string.
 * @returns {boolean} `true` only for a string that parses as a URL with the
 *   `http:` or `https:` protocol.
 */
function isValidUrl(string) {
  if (typeof string !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(string);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // `new URL` throws on anything it cannot parse as an absolute URL.
    return false;
  }
}
126
+
127
// Route: extract content from a batch of URLs.
//
// Request body: { urls: string[] (1-10 items), timeout?: number (ms) }.
// Responds 400 on invalid input, otherwise 200 with one result per URL
// (individual URL failures are reported inside `results`, not as HTTP errors).
//
// NOTE(security): the URLs are fetched from this server as given; nothing in
// this handler blocks private or internal addresses. Review before exposing
// this endpoint to untrusted callers.
app.post('/extract-content', async (req, res) => {
  const DEFAULT_TIMEOUT_MS = 30000;
  // Upper bound so one request cannot hold a browser open indefinitely.
  const MAX_TIMEOUT_MS = 120000;

  try {
    // Guard against a missing body so bad requests get a 400, not a 500.
    const { urls, timeout } = req.body ?? {};

    // Validate input
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        success: false,
        message: 'URLs harus berupa array'
      });
    }

    if (urls.length === 0) {
      return res.status(400).json({
        success: false,
        message: 'Array URLs tidak boleh kosong'
      });
    }

    if (urls.length > 10) {
      return res.status(400).json({
        success: false,
        message: 'Maksimal 10 URL per request'
      });
    }

    // Validate every URL
    const invalidUrls = urls.filter(url => !isValidUrl(url));
    if (invalidUrls.length > 0) {
      return res.status(400).json({
        success: false,
        message: 'URL tidak valid ditemukan',
        invalidUrls
      });
    }

    // Resolve the timeout. Missing, null and 0 keep their previous meaning of
    // "use the default"; any other value must be a positive finite number.
    let requestTimeout = DEFAULT_TIMEOUT_MS;
    if (timeout !== undefined && timeout !== null && timeout !== 0) {
      if (typeof timeout !== 'number' || !Number.isFinite(timeout) || timeout < 0) {
        return res.status(400).json({
          success: false,
          message: 'Timeout harus berupa angka positif (milidetik)'
        });
      }
      requestTimeout = Math.min(timeout, MAX_TIMEOUT_MS);
    }

    // Extract content from each URL, one at a time so only a single browser
    // instance is running per request.
    const results = [];
    for (const url of urls) {
      console.log(`Extracting content from: ${url}`);
      const result = await extractContentFromUrl(url, requestTimeout);
      results.push(result);
    }

    // Compute statistics
    const successCount = results.filter(r => r.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${results.length} URL`,
      statistics: {
        total: results.length,
        success: successCount,
        failed: failCount
      },
      results
    });

  } catch (error) {
    console.error('Error in /extract-content:', error);
    res.status(500).json({
      success: false,
      message: 'Internal server error',
      error: error.message
    });
  }
});
198
+
199
// Health-check route: reports that the API is up, with the current time.
app.get('/health', (_req, res) => {
  const healthPayload = {
    success: true,
    message: 'API is running',
    timestamp: new Date().toISOString(),
  };
  res.json(healthPayload);
});
207
+
208
// API info route: describes the service and lists its endpoints.
app.get('/', (_req, res) => {
  // Example request body shown to API consumers.
  const extractContentExample = {
    urls: ['http://example.com', 'https://example2.com'],
    timeout: 30000,
  };

  const endpoints = {
    'POST /extract-content': {
      description: 'Ekstrak konten dari array URL',
      body: extractContentExample,
    },
    'GET /health': 'Health check endpoint',
    'GET /': 'API information',
  };

  res.json({
    name: 'URL Content Extractor API',
    version: '1.0.0',
    description: 'API untuk mengekstrak konten dari URL menggunakan Playwright',
    endpoints,
  });
});
227
+
228
// 404 handler: reached only when no route above matched. Registered without a
// path pattern so it matches every request regardless of Express version.
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint tidak ditemukan'
  });
});

// Error handler: registered last so it also covers errors raised by the
// middleware and routes above (including the 404 handler).
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  // Once a response has started, hand the error to Express' default handler.
  if (res.headersSent) {
    return next(err);
  }

  // Honor an HTTP status attached to the error (e.g. by the body parsers for
  // malformed JSON or oversized payloads) instead of reporting every failure
  // as a server error.
  const reportedStatus = err?.status ?? err?.statusCode;
  const isHttpErrorStatus =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 600;
  const status = isHttpErrorStatus ? reportedStatus : 500;

  res.status(status).json({
    success: false,
    // Client errors carry their own message; server errors stay generic.
    message: status >= 500 ? 'Internal server error' : (err?.message || 'Bad request')
  });
});
244
+
245
// Start the HTTP server and print where the API can be reached.
app.listen(PORT, () => {
  const startupLines = [
    `🚀 Server berjalan di port ${PORT}`,
    `📝 API Documentation: http://localhost:${PORT}`,
    `❤️ Health Check: http://localhost:${PORT}/health`,
  ];
  for (const line of startupLines) {
    console.log(line);
  }
});