Update index.js
Browse files
index.js
CHANGED
@@ -13,15 +13,7 @@ app.use(bodyParser.json())
|
|
13 |
app.use(express.json({ limit: '500mb' }));
|
14 |
app.use(cors());
|
15 |
|
16 |
-
async function extractContentFromUrl(url,
|
17 |
-
const context = await browser.newContext({
|
18 |
-
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
19 |
-
viewport: { width: 1920, height: 1080 },
|
20 |
-
locale: 'en-US',
|
21 |
-
timezoneId: 'America/New_York'
|
22 |
-
});
|
23 |
-
const page = await context.newPage();
|
24 |
-
|
25 |
try {
|
26 |
page.setDefaultNavigationTimeout(30000);
|
27 |
page.setDefaultTimeout(30000);
|
@@ -79,7 +71,7 @@ async function extractContentFromUrl(url, browser) {
|
|
79 |
let totalHeight = 0;
|
80 |
const distance = 100;
|
81 |
let scrollCount = 0;
|
82 |
-
const maxScrolls = 30;
|
83 |
|
84 |
const timer = setInterval(() => {
|
85 |
const scrollHeight = document.body.scrollHeight;
|
@@ -173,23 +165,6 @@ async function extractContentFromUrl(url, browser) {
|
|
173 |
|
174 |
const finalMainText = mainText || fallbackText;
|
175 |
|
176 |
-
const links = Array.from(document.querySelectorAll('a[href]'))
|
177 |
-
.map(a => ({
|
178 |
-
text: getTextContent(a),
|
179 |
-
href: a.href
|
180 |
-
}))
|
181 |
-
.filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
|
182 |
-
.slice(0, 20);
|
183 |
-
|
184 |
-
const images = Array.from(document.querySelectorAll('img[src]'))
|
185 |
-
.filter(img => img.src && !img.src.includes('data:image'))
|
186 |
-
.map(img => ({
|
187 |
-
src: img.src,
|
188 |
-
alt: img.alt || '',
|
189 |
-
title: img.title || ''
|
190 |
-
}))
|
191 |
-
.slice(0, 10);
|
192 |
-
|
193 |
const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
|
194 |
const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
|
195 |
|
@@ -202,8 +177,6 @@ async function extractContentFromUrl(url, browser) {
|
|
202 |
},
|
203 |
paragraphs,
|
204 |
mainText: finalMainText,
|
205 |
-
//links,
|
206 |
-
//images,
|
207 |
wordCount,
|
208 |
hasContent: wordCount > 50
|
209 |
};
|
@@ -223,8 +196,6 @@ async function extractContentFromUrl(url, browser) {
|
|
223 |
error: error.message,
|
224 |
extractedAt: new Date().toISOString()
|
225 |
};
|
226 |
-
} finally {
|
227 |
-
await context.close();
|
228 |
}
|
229 |
}
|
230 |
|
@@ -277,28 +248,46 @@ app.post('/extract-content', async (req, res) => {
|
|
277 |
try {
|
278 |
browser = await chromium.launch({
|
279 |
args: [
|
280 |
-
'--
|
281 |
-
'--single-process',
|
282 |
-
'--no-sandbox',
|
283 |
-
'--no-zygote',
|
284 |
-
'--no-cache',
|
285 |
-
'--disable-dev-shm-usage',
|
286 |
'--disable-setuid-sandbox',
|
|
|
287 |
'--disable-accelerated-2d-canvas',
|
288 |
-
'--disable-gpu'
|
|
|
|
|
|
|
289 |
],
|
290 |
executablePath: process.env.CHROME_BIN,
|
291 |
headless: true,
|
292 |
});
|
293 |
|
294 |
-
const concurrencyLimit = 3;
|
295 |
const results = [];
|
296 |
|
297 |
-
|
298 |
-
|
299 |
-
const
|
300 |
-
|
301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
}
|
303 |
|
304 |
const successCount = results.filter(r => r.success).length;
|
|
|
13 |
app.use(express.json({ limit: '500mb' }));
|
14 |
app.use(cors());
|
15 |
|
16 |
+
async function extractContentFromUrl(url, page) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
try {
|
18 |
page.setDefaultNavigationTimeout(30000);
|
19 |
page.setDefaultTimeout(30000);
|
|
|
71 |
let totalHeight = 0;
|
72 |
const distance = 100;
|
73 |
let scrollCount = 0;
|
74 |
+
const maxScrolls = 30;
|
75 |
|
76 |
const timer = setInterval(() => {
|
77 |
const scrollHeight = document.body.scrollHeight;
|
|
|
165 |
|
166 |
const finalMainText = mainText || fallbackText;
|
167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
|
169 |
const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
|
170 |
|
|
|
177 |
},
|
178 |
paragraphs,
|
179 |
mainText: finalMainText,
|
|
|
|
|
180 |
wordCount,
|
181 |
hasContent: wordCount > 50
|
182 |
};
|
|
|
196 |
error: error.message,
|
197 |
extractedAt: new Date().toISOString()
|
198 |
};
|
|
|
|
|
199 |
}
|
200 |
}
|
201 |
|
|
|
248 |
try {
|
249 |
browser = await chromium.launch({
|
250 |
args: [
|
251 |
+
'--no-sandbox',
|
|
|
|
|
|
|
|
|
|
|
252 |
'--disable-setuid-sandbox',
|
253 |
+
'--disable-dev-shm-usage',
|
254 |
'--disable-accelerated-2d-canvas',
|
255 |
+
'--disable-gpu',
|
256 |
+
'--disable-blink-features=AutomationControlled',
|
257 |
+
'--disable-web-security',
|
258 |
+
'--disable-features=IsolateOrigins,site-per-process'
|
259 |
],
|
260 |
executablePath: process.env.CHROME_BIN,
|
261 |
headless: true,
|
262 |
});
|
263 |
|
|
|
264 |
const results = [];
|
265 |
|
266 |
+
// Process URLs sequentially to avoid browser crashes
|
267 |
+
for (const url of validUrls) {
|
268 |
+
const context = await browser.newContext({
|
269 |
+
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
270 |
+
viewport: { width: 1920, height: 1080 },
|
271 |
+
locale: 'en-US',
|
272 |
+
timezoneId: 'America/New_York'
|
273 |
+
});
|
274 |
+
|
275 |
+
const page = await context.newPage();
|
276 |
+
|
277 |
+
try {
|
278 |
+
const result = await extractContentFromUrl(url, page);
|
279 |
+
results.push(result);
|
280 |
+
} catch (error) {
|
281 |
+
console.error(`Error processing ${url}:`, error);
|
282 |
+
results.push({
|
283 |
+
url,
|
284 |
+
success: false,
|
285 |
+
error: error.message,
|
286 |
+
extractedAt: new Date().toISOString()
|
287 |
+
});
|
288 |
+
} finally {
|
289 |
+
await context.close();
|
290 |
+
}
|
291 |
}
|
292 |
|
293 |
const successCount = results.filter(r => r.success).length;
|