vyles committed on
Commit
daa7349
·
verified ·
1 Parent(s): af570bb

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +33 -44
index.js CHANGED
@@ -13,15 +13,7 @@ app.use(bodyParser.json())
13
  app.use(express.json({ limit: '500mb' }));
14
  app.use(cors());
15
 
16
- async function extractContentFromUrl(url, browser) {
17
- const context = await browser.newContext({
18
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
19
- viewport: { width: 1920, height: 1080 },
20
- locale: 'en-US',
21
- timezoneId: 'America/New_York'
22
- });
23
- const page = await context.newPage();
24
-
25
  try {
26
  page.setDefaultNavigationTimeout(30000);
27
  page.setDefaultTimeout(30000);
@@ -79,7 +71,7 @@ async function extractContentFromUrl(url, browser) {
79
  let totalHeight = 0;
80
  const distance = 100;
81
  let scrollCount = 0;
82
- const maxScrolls = 30; // Limit scrolling
83
 
84
  const timer = setInterval(() => {
85
  const scrollHeight = document.body.scrollHeight;
@@ -173,23 +165,6 @@ async function extractContentFromUrl(url, browser) {
173
 
174
  const finalMainText = mainText || fallbackText;
175
 
176
- const links = Array.from(document.querySelectorAll('a[href]'))
177
- .map(a => ({
178
- text: getTextContent(a),
179
- href: a.href
180
- }))
181
- .filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
182
- .slice(0, 20);
183
-
184
- const images = Array.from(document.querySelectorAll('img[src]'))
185
- .filter(img => img.src && !img.src.includes('data:image'))
186
- .map(img => ({
187
- src: img.src,
188
- alt: img.alt || '',
189
- title: img.title || ''
190
- }))
191
- .slice(0, 10);
192
-
193
  const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
194
  const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
195
 
@@ -202,8 +177,6 @@ async function extractContentFromUrl(url, browser) {
202
  },
203
  paragraphs,
204
  mainText: finalMainText,
205
- //links,
206
- //images,
207
  wordCount,
208
  hasContent: wordCount > 50
209
  };
@@ -223,8 +196,6 @@ async function extractContentFromUrl(url, browser) {
223
  error: error.message,
224
  extractedAt: new Date().toISOString()
225
  };
226
- } finally {
227
- await context.close();
228
  }
229
  }
230
 
@@ -277,28 +248,46 @@ app.post('/extract-content', async (req, res) => {
277
  try {
278
  browser = await chromium.launch({
279
  args: [
280
- '--incognito',
281
- '--single-process',
282
- '--no-sandbox',
283
- '--no-zygote',
284
- '--no-cache',
285
- '--disable-dev-shm-usage',
286
  '--disable-setuid-sandbox',
 
287
  '--disable-accelerated-2d-canvas',
288
- '--disable-gpu'
 
 
 
289
  ],
290
  executablePath: process.env.CHROME_BIN,
291
  headless: true,
292
  });
293
 
294
- const concurrencyLimit = 3;
295
  const results = [];
296
 
297
- for (let i = 0; i < validUrls.length; i += concurrencyLimit) {
298
- const batch = validUrls.slice(i, i + concurrencyLimit);
299
- const batchPromises = batch.map(url => extractContentFromUrl(url, browser));
300
- const batchResults = await Promise.all(batchPromises);
301
- results.push(...batchResults);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  }
303
 
304
  const successCount = results.filter(r => r.success).length;
 
13
  app.use(express.json({ limit: '500mb' }));
14
  app.use(cors());
15
 
16
+ async function extractContentFromUrl(url, page) {
 
 
 
 
 
 
 
 
17
  try {
18
  page.setDefaultNavigationTimeout(30000);
19
  page.setDefaultTimeout(30000);
 
71
  let totalHeight = 0;
72
  const distance = 100;
73
  let scrollCount = 0;
74
+ const maxScrolls = 30;
75
 
76
  const timer = setInterval(() => {
77
  const scrollHeight = document.body.scrollHeight;
 
165
 
166
  const finalMainText = mainText || fallbackText;
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
169
  const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
170
 
 
177
  },
178
  paragraphs,
179
  mainText: finalMainText,
 
 
180
  wordCount,
181
  hasContent: wordCount > 50
182
  };
 
196
  error: error.message,
197
  extractedAt: new Date().toISOString()
198
  };
 
 
199
  }
200
  }
201
 
 
248
  try {
249
  browser = await chromium.launch({
250
  args: [
251
+ '--no-sandbox',
 
 
 
 
 
252
  '--disable-setuid-sandbox',
253
+ '--disable-dev-shm-usage',
254
  '--disable-accelerated-2d-canvas',
255
+ '--disable-gpu',
256
+ '--disable-blink-features=AutomationControlled',
257
+ '--disable-web-security',
258
+ '--disable-features=IsolateOrigins,site-per-process'
259
  ],
260
  executablePath: process.env.CHROME_BIN,
261
  headless: true,
262
  });
263
 
 
264
  const results = [];
265
 
266
+ // Process URLs sequentially to avoid browser crashes
267
+ for (const url of validUrls) {
268
+ const context = await browser.newContext({
269
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
270
+ viewport: { width: 1920, height: 1080 },
271
+ locale: 'en-US',
272
+ timezoneId: 'America/New_York'
273
+ });
274
+
275
+ const page = await context.newPage();
276
+
277
+ try {
278
+ const result = await extractContentFromUrl(url, page);
279
+ results.push(result);
280
+ } catch (error) {
281
+ console.error(`Error processing ${url}:`, error);
282
+ results.push({
283
+ url,
284
+ success: false,
285
+ error: error.message,
286
+ extractedAt: new Date().toISOString()
287
+ });
288
+ } finally {
289
+ await context.close();
290
+ }
291
  }
292
 
293
  const successCount = results.filter(r => r.success).length;