vyles committed on
Commit
40d121e
·
verified ·
1 Parent(s): c209578

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +182 -229
index.js CHANGED
@@ -5,264 +5,217 @@ import cors from 'cors';
5
  const app = express();
6
  const PORT = process.env.PORT || 7860;
7
 
8
- // Middleware
9
- app.use(express.json({ limit: '50mb' }));
 
10
  app.use(cors());
11
 
12
- // Utility function untuk mengekstrak konten dari URL
13
  async function extractContentFromUrl(url, browser) {
14
- // MODIFIKASI: Buat konteks baru untuk setiap URL. Ini lebih aman dan memungkinkan isolasi.
15
- const context = await browser.newContext({
16
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
17
- });
18
-
19
- // MODIFIKASI: Buat halaman dari konteks yang sudah memiliki user agent.
20
- const page = await context.newPage();
21
-
22
- try {
23
- // Baris "await page.setUserAgent(...)" yang lama sudah dihapus.
24
-
25
- // Navigate ke URL dengan timeout
26
- await page.goto(url, {
27
- waitUntil: 'domcontentloaded',
28
- timeout: 30000
29
- });
30
-
31
- // Wait untuk memastikan konten dimuat
32
- await page.waitForTimeout(2000);
33
-
34
- // Ekstrak berbagai informasi dari halaman
35
- const content = await page.evaluate(() => {
36
- // Helper function untuk membersihkan text
37
- const cleanText = (text) => {
38
- return text ? text.replace(/\s+/g, ' ').trim() : '';
39
- };
40
-
41
- // Ekstrak title
42
- const title = document.title || '';
43
-
44
- // Ekstrak meta description
45
- const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
46
-
47
- // Ekstrak heading utama
48
- const h1Elements = Array.from(document.querySelectorAll('h1')).map(h1 => cleanText(h1.innerText));
49
- const h2Elements = Array.from(document.querySelectorAll('h2')).map(h2 => cleanText(h2.innerText));
50
-
51
- // Ekstrak semua paragraf
52
- const paragraphs = Array.from(document.querySelectorAll('p'))
53
- .map(p => cleanText(p.innerText))
54
- .filter(text => text.length > 20); // Filter paragraf yang terlalu pendek
55
-
56
- // Ekstrak text dari main content area (jika ada)
57
- const mainContent = document.querySelector('main, article, .content, #content, .post-content');
58
- const mainText = mainContent ? cleanText(mainContent.innerText) : '';
59
-
60
- // Ekstrak semua link
61
- const links = Array.from(document.querySelectorAll('a[href]'))
62
- .map(a => ({
63
- text: cleanText(a.innerText),
64
- href: a.href
65
- }))
66
- .filter(link => link.text && link.href);
67
-
68
- // Ekstrak images
69
- const images = Array.from(document.querySelectorAll('img[src]'))
70
- .map(img => ({
71
- src: img.src,
72
- alt: img.alt || '',
73
- title: img.title || ''
74
- }));
75
-
76
- return {
77
- title,
78
- metaDescription,
79
- headings: {
80
- h1: h1Elements,
81
- h2: h2Elements
82
- },
83
- paragraphs,
84
- mainText,
85
- links: links.slice(0, 20), // Batasi jumlah link
86
- images: images.slice(0, 10), // Batasi jumlah gambar
87
- wordCount: (mainText || paragraphs.join(' ')).split(/\s+/).length
88
- };
89
  });
90
-
91
- return {
92
- url,
93
- success: true,
94
- content,
95
- extractedAt: new Date().toISOString()
96
- };
97
-
98
- } catch (error) {
99
- return {
100
- url,
101
- success: false,
102
- error: error.message,
103
- extractedAt: new Date().toISOString()
104
- };
105
- } finally {
106
- // MODIFIKASI: Tutup konteksnya. Ini akan otomatis menutup halaman yang dibuat dari konteks ini.
107
- await context.close();
108
- }
109
- }
110
-
111
- // Endpoint untuk mengekstrak konten dari multiple URLs
112
- app.post('/extract-content', async (req, res) => {
113
- const { urls } = req.body;
114
-
115
- // Validasi input
116
- if (!urls || !Array.isArray(urls)) {
117
- return res.status(400).json({
118
- success: false,
119
- message: 'Body harus berisi array urls'
120
- });
121
- }
122
-
123
- if (urls.length === 0) {
124
- return res.status(400).json({
125
- success: false,
126
- message: 'Array urls tidak boleh kosong'
127
- });
128
- }
129
-
130
- if (urls.length > 10) {
131
- return res.status(400).json({
132
- success: false,
133
- message: 'Maksimal 10 URLs per request'
134
- });
135
- }
136
-
137
- // Validasi format URL
138
- const validUrls = [];
139
- const invalidUrls = [];
140
-
141
- urls.forEach(url => {
142
  try {
143
- new URL(url);
144
- validUrls.push(url);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  } catch (error) {
146
- invalidUrls.push(url);
 
 
 
 
 
 
 
147
  }
148
- });
149
-
150
- if (invalidUrls.length > 0) {
151
- return res.status(400).json({
152
- success: false,
153
- message: 'Format URL tidak valid',
154
- invalidUrls
155
- });
156
- }
157
-
158
- let browser;
159
-
160
- try {
161
- // Launch browser
162
- browser = await chromium.launch({
163
- args: [
164
- '--incognito',
165
- '--single-process',
166
- '--no-sandbox',
167
- '--no-zygote',
168
- '--no-cache'
169
- ],
170
- executablePath: process.env.CHROME_BIN,
171
- headless: true,
172
- });
173
 
174
- console.log(`Memproses ${validUrls.length} URLs...`);
 
 
 
 
 
175
 
176
- // Process URLs secara paralel dengan batasan concurrency
177
- const concurrencyLimit = 3;
178
- const results = [];
 
 
 
179
 
180
- for (let i = 0; i < validUrls.length; i += concurrencyLimit) {
181
- const batch = validUrls.slice(i, i + concurrencyLimit);
182
- const batchPromises = batch.map(url => extractContentFromUrl(url, browser));
183
- const batchResults = await Promise.all(batchPromises);
184
- results.push(...batchResults);
185
  }
186
 
187
- // Statistik hasil
188
- const successCount = results.filter(r => r.success).length;
189
- const failCount = results.filter(r => !r.success).length;
190
 
191
- res.json({
192
- success: true,
193
- message: `Berhasil memproses ${validUrls.length} URLs`,
194
- statistics: {
195
- total: validUrls.length,
196
- success: successCount,
197
- failed: failCount
198
- },
199
- results
200
  });
201
 
202
- } catch (error) {
203
- console.error('Error:', error);
204
- res.status(500).json({
205
- success: false,
206
- message: 'Terjadi kesalahan saat memproses URLs',
207
- error: error.message
208
- });
209
- } finally {
210
- if (browser) {
211
- await browser.close();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  }
213
- }
214
  });
215
 
216
- // Health check endpoint
217
  app.get('/health', (req, res) => {
218
- res.json({
219
- success: true,
220
- message: 'Content Extractor API is running',
221
- timestamp: new Date().toISOString()
222
- });
223
  });
224
 
225
- // Root endpoint
226
  app.get('/', (req, res) => {
227
- res.json(JSON.stringify({
228
- success: true,
229
- message: 'Content Extractor API',
230
- endpoints: {
231
- 'POST /extract-content': 'Extract content from URLs',
232
- 'GET /health': 'Health check',
233
- 'GET /': 'API information'
234
- },
235
- usage: {
236
- method: 'POST',
237
- endpoint: '/extract-content',
238
- body: {
239
- urls: ['https://example.com', 'https://another-site.com']
240
- }
241
- }
242
- }), 0, 2);
243
  });
244
 
245
- // Error handling middleware
246
  app.use((err, req, res, next) => {
247
- console.error('Unhandled error:', err);
248
- res.status(500).json({
249
- success: false,
250
- message: 'Internal server error',
251
- error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong'
252
- });
253
  });
254
 
255
- // 404 handler
256
  app.use((req, res) => {
257
- res.status(404).json({
258
- success: false,
259
- message: 'Endpoint not found'
260
- });
261
  });
262
 
263
- // Start server
264
  app.listen(PORT, () => {
265
- console.log(`🚀 Content Extractor API running on port ${PORT}`);
266
- console.log(`📖 API Documentation: http://localhost:${PORT}`);
267
- console.log(`🏥 Health Check: http://localhost:${PORT}/health`);
268
  });
 
5
  const app = express();
6
  const PORT = process.env.PORT || 7860;
7
 
8
+ app.set('json spaces', 2)
9
+
10
+ app.use(express.json({ limit: '500mb' }));
11
  app.use(cors());
12
 
 
13
  async function extractContentFromUrl(url, browser) {
14
+ const context = await browser.newContext({
15
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  });
17
+ const page = await context.newPage();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  try {
19
+ await page.goto(url, {
20
+ waitUntil: 'domcontentloaded',
21
+ timeout: 60000
22
+ });
23
+ await page.waitForTimeout(10000);
24
+ const content = await page.evaluate(() => {
25
+ const cleanText = (text) => {
26
+ return text ? text.replace(/\s+/g, ' ').trim() : '';
27
+ };
28
+
29
+ const title = document.title || '';
30
+ const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
31
+ const h1Elements = Array.from(document.querySelectorAll('h1')).map(h1 => cleanText(h1.innerText));
32
+ const h2Elements = Array.from(document.querySelectorAll('h2')).map(h2 => cleanText(h2.innerText));
33
+ const paragraphs = Array.from(document.querySelectorAll('p')).map(p => cleanText(p.innerText)).filter(text => text.length > 20); // Filter paragraf yang terlalu pendek
34
+ const mainContent = document.querySelector('main, article, .content, #content, .post-content');
35
+ const mainText = mainContent ? cleanText(mainContent.innerText) : '';
36
+
37
+ const links = Array.from(document.querySelectorAll('a[href]')).map(a => ({
38
+ text: cleanText(a.innerText),
39
+ href: a.href
40
+ })).filter(link => link.text && link.href);
41
+
42
+ const images = Array.from(document.querySelectorAll('img[src]')).map(img => ({
43
+ src: img.src,
44
+ alt: img.alt || '',
45
+ title: img.title || ''
46
+ }));
47
+
48
+ return {
49
+ title,
50
+ metaDescription,
51
+ headings: {
52
+ h1: h1Elements,
53
+ h2: h2Elements
54
+ },
55
+ paragraphs,
56
+ mainText,
57
+ //links: links.slice(0, 20),
58
+ //images: images.slice(0, 10),
59
+ wordCount: (mainText || paragraphs.join(' ')).split(/\s+/).length
60
+ };
61
+ });
62
+
63
+ return {
64
+ url,
65
+ success: true,
66
+ content,
67
+ extractedAt: new Date().toISOString()
68
+ };
69
  } catch (error) {
70
+ return {
71
+ url,
72
+ success: false,
73
+ error: error.message,
74
+ extractedAt: new Date().toISOString()
75
+ };
76
+ } finally {
77
+ await context.close();
78
  }
79
+ }
80
+
81
/**
 * Reports whether `candidate` is a string that parses as an absolute http(s) URL.
 *
 * Other schemes (file:, chrome:, data:, javascript:, ...) are rejected because the
 * URL is handed to a real browser, which would otherwise open local resources.
 *
 * @param {unknown} candidate - Value taken from the request body.
 * @returns {boolean} `true` only for http: and https: URL strings.
 */
function isHttpUrl(candidate) {
  if (typeof candidate !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(candidate);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // new URL() throws on anything it cannot parse.
    return false;
  }
}

/**
 * POST /extract-content
 *
 * Body: `{ "urls": [ ... ] }` with 1 to 10 http(s) URLs.
 * Responds 400 when the body is malformed or any URL is invalid, 500 when the
 * browser cannot be started or a batch rejects, and otherwise 200 with one
 * result per URL plus success/failure statistics.
 */
app.post('/extract-content', async (req, res) => {
  // req.body may be undefined when no JSON body was parsed; fall back to an
  // empty object so the request gets the 400 below instead of a thrown TypeError.
  const { urls } = req.body ?? {};

  if (!Array.isArray(urls)) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls',
    });
  }

  if (urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Array urls tidak boleh kosong',
    });
  }

  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request',
    });
  }

  // The request is all-or-nothing: a single invalid entry rejects the whole batch.
  const validUrls = urls.filter((url) => isHttpUrl(url));
  const invalidUrls = urls.filter((url) => !isHttpUrl(url));

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid',
      invalidUrls,
    });
  }

  let browser;

  try {
    browser = await chromium.launch({
      args: ['--incognito', '--single-process', '--no-sandbox', '--no-zygote', '--no-cache'],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    console.log(`Memproses ${validUrls.length} URLs...`);

    // URLs are processed in consecutive batches; each batch runs in parallel
    // and must finish before the next one starts.
    const concurrencyLimit = 3;
    const results = [];

    for (let i = 0; i < validUrls.length; i += concurrencyLimit) {
      const batch = validUrls.slice(i, i + concurrencyLimit);
      const batchResults = await Promise.all(
        batch.map((url) => extractContentFromUrl(url, browser)),
      );
      results.push(...batchResults);
    }

    const successCount = results.filter((result) => result.success).length;
    const failCount = results.length - successCount;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
      },
      results,
    });
  } catch (error) {
    console.error('Error:', error);
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs',
      error: error.message,
    });
  } finally {
    // Always release the browser, whether the request succeeded or failed.
    if (browser) {
      await browser.close();
    }
  }
});
173
 
 
174
  app.get('/health', (req, res) => {
175
+ res.json({
176
+ success: true,
177
+ message: 'Content Extractor API is running',
178
+ timestamp: new Date().toISOString()
179
+ });
180
  });
181
 
 
182
  app.get('/', (req, res) => {
183
+ res.json(JSON.stringify({
184
+ success: true,
185
+ message: 'Content Extractor API',
186
+ endpoints: {
187
+ 'POST /extract-content': 'Extract content from URLs',
188
+ 'GET /health': 'Health check',
189
+ 'GET /': 'API information'
190
+ },
191
+ usage: {
192
+ method: 'POST',
193
+ endpoint: '/extract-content',
194
+ body: {
195
+ urls: ['https://example.com', 'https://another-site.com']
196
+ }
197
+ }
198
+ }), 0, 2);
199
  });
200
 
 
201
  app.use((err, req, res, next) => {
202
+ console.error('Unhandled error:', err);
203
+ res.status(500).json({
204
+ success: false,
205
+ message: 'Internal server error',
206
+ error: process.env.NODE_ENV === 'development' ? err.message : 'Something went wrong'
207
+ });
208
  });
209
 
 
210
  app.use((req, res) => {
211
+ res.status(404).json({
212
+ success: false,
213
+ message: 'Endpoint not found'
214
+ });
215
  });
216
 
 
217
  app.listen(PORT, () => {
218
+ console.log(`πŸš€ Content Extractor API running on port ${PORT}`);
219
+ console.log(`πŸ“– API Documentation: http://localhost:${PORT}`);
220
+ console.log(`πŸ₯ Health Check: http://localhost:${PORT}/health`);
221
  });