vyles committed on
Commit
9a73cbf
·
verified ·
1 Parent(s): 9db0314

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +156 -25
index.js CHANGED
@@ -15,40 +15,157 @@ app.use(cors());
15
 
16
  async function extractContentFromUrl(url, browser) {
17
  const context = await browser.newContext({
18
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
 
 
19
  });
20
  const page = await context.newPage();
 
21
  try {
 
 
 
 
 
 
 
 
 
 
22
  await page.goto(url, {
23
- waitUntil: 'domcontentloaded',
24
  timeout: 60000
25
  });
26
 
27
- await page.waitForTimeout(15000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  const content = await page.evaluate(() => {
30
  const cleanText = (text) => {
31
  return text ? text.replace(/\s+/g, ' ').trim() : '';
32
  };
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  const title = document.title || '';
35
- const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
36
- const h1Elements = Array.from(document.querySelectorAll('h1')).map(h1 => cleanText(h1.innerText));
37
- const h2Elements = Array.from(document.querySelectorAll('h2')).map(h2 => cleanText(h2.innerText));
38
- const paragraphs = Array.from(document.querySelectorAll('p')).map(p => cleanText(p.innerText)).filter(text => text.length > 20); // Filter paragraf yang terlalu pendek
39
- const mainContent = document.querySelector('main, article, .content, #content, .post-content');
40
- const mainText = mainContent ? cleanText(mainContent.innerText) : '';
41
-
42
- const links = Array.from(document.querySelectorAll('a[href]')).map(a => ({
43
- text: cleanText(a.innerText),
44
- href: a.href
45
- })).filter(link => link.text && link.href);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- const images = Array.from(document.querySelectorAll('img[src]')).map(img => ({
48
- src: img.src,
49
- alt: img.alt || '',
50
- title: img.title || ''
51
- }));
 
 
 
 
 
 
 
 
52
 
53
  return {
54
  title,
@@ -58,10 +175,11 @@ async function extractContentFromUrl(url, browser) {
58
  h2: h2Elements
59
  },
60
  paragraphs,
61
- mainText,
62
- links: links.slice(0, 20),
63
- images: images.slice(0, 10),
64
- wordCount: (mainText || paragraphs.join(' ')).split(/\s+/).length
 
65
  };
66
  });
67
 
@@ -72,6 +190,7 @@ async function extractContentFromUrl(url, browser) {
72
  extractedAt: new Date().toISOString()
73
  };
74
  } catch (error) {
 
75
  return {
76
  url,
77
  success: false,
@@ -131,7 +250,17 @@ app.post('/extract-content', async (req, res) => {
131
 
132
  try {
133
  browser = await chromium.launch({
134
- args: ['--incognito', '--single-process', '--no-sandbox', '--no-zygote', '--no-cache'],
 
 
 
 
 
 
 
 
 
 
135
  executablePath: process.env.CHROME_BIN,
136
  headless: true,
137
  });
@@ -148,6 +277,7 @@ app.post('/extract-content', async (req, res) => {
148
 
149
  const successCount = results.filter(r => r.success).length;
150
  const failCount = results.filter(r => !r.success).length;
 
151
 
152
  res.json({
153
  success: true,
@@ -155,7 +285,8 @@ app.post('/extract-content', async (req, res) => {
155
  statistics: {
156
  total: validUrls.length,
157
  success: successCount,
158
- failed: failCount
 
159
  },
160
  results
161
  });
 
15
 
16
  async function extractContentFromUrl(url, browser) {
17
  const context = await browser.newContext({
18
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
19
+ viewport: { width: 1920, height: 1080 },
20
+ locale: 'en-US',
21
+ timezoneId: 'America/New_York'
22
  });
23
  const page = await context.newPage();
24
+
25
  try {
26
+ // Intercept dan block resource yang tidak perlu
27
+ await page.route('**/*', (route) => {
28
+ const resourceType = route.request().resourceType();
29
+ if (['font', 'media'].includes(resourceType)) {
30
+ route.abort();
31
+ } else {
32
+ route.continue();
33
+ }
34
+ });
35
+
36
  await page.goto(url, {
37
+ waitUntil: 'networkidle',
38
  timeout: 60000
39
  });
40
 
41
+ // Wait for content to be visible
42
+ await page.waitForSelector('body', { state: 'visible', timeout: 30000 });
43
+
44
+ // Scroll untuk trigger lazy loading
45
+ await page.evaluate(() => {
46
+ return new Promise((resolve) => {
47
+ let totalHeight = 0;
48
+ const distance = 100;
49
+ const timer = setInterval(() => {
50
+ const scrollHeight = document.body.scrollHeight;
51
+ window.scrollBy(0, distance);
52
+ totalHeight += distance;
53
+
54
+ if(totalHeight >= scrollHeight){
55
+ clearInterval(timer);
56
+ window.scrollTo(0, 0);
57
+ resolve();
58
+ }
59
+ }, 100);
60
+ });
61
+ });
62
+
63
+ // Wait tambahan setelah scroll
64
+ await page.waitForTimeout(2000);
65
 
66
  const content = await page.evaluate(() => {
67
  const cleanText = (text) => {
68
  return text ? text.replace(/\s+/g, ' ').trim() : '';
69
  };
70
 
71
+ // Helper function untuk mendapatkan text content yang lebih baik
72
+ const getTextContent = (element) => {
73
+ if (!element) return '';
74
+
75
+ // Clone element untuk manipulasi
76
+ const clone = element.cloneNode(true);
77
+
78
+ // Remove script dan style tags
79
+ const scripts = clone.querySelectorAll('script, style, noscript');
80
+ scripts.forEach(el => el.remove());
81
+
82
+ // Get text content
83
+ return cleanText(clone.textContent || clone.innerText || '');
84
+ };
85
+
86
  const title = document.title || '';
87
+ const metaDescription = document.querySelector('meta[name="description"]')?.content ||
88
+ document.querySelector('meta[property="og:description"]')?.content || '';
89
+
90
+ // Improved heading extraction
91
+ const h1Elements = Array.from(document.querySelectorAll('h1'))
92
+ .map(h1 => getTextContent(h1))
93
+ .filter(text => text.length > 0);
94
+
95
+ const h2Elements = Array.from(document.querySelectorAll('h2'))
96
+ .map(h2 => getTextContent(h2))
97
+ .filter(text => text.length > 0);
98
+
99
+ // Improved paragraph extraction
100
+ const paragraphs = Array.from(document.querySelectorAll('p'))
101
+ .map(p => getTextContent(p))
102
+ .filter(text => text.length > 20);
103
+
104
+ // Try multiple selectors for main content
105
+ const contentSelectors = [
106
+ 'main',
107
+ 'article',
108
+ '[role="main"]',
109
+ '.content',
110
+ '#content',
111
+ '.post-content',
112
+ '.entry-content',
113
+ '.article-content',
114
+ '.page-content',
115
+ '.main-content',
116
+ '[itemprop="articleBody"]',
117
+ '.story-body',
118
+ '.article-body'
119
+ ];
120
+
121
+ let mainContent = null;
122
+ for (const selector of contentSelectors) {
123
+ mainContent = document.querySelector(selector);
124
+ if (mainContent) break;
125
+ }
126
+
127
+ // Fallback: jika tidak ada main content, ambil dari body
128
+ if (!mainContent) {
129
+ mainContent = document.body;
130
+ }
131
+
132
+ const mainText = getTextContent(mainContent);
133
+
134
+ // Jika mainText masih kosong, coba ambil semua text dari div yang panjang
135
+ let fallbackText = '';
136
+ if (!mainText || mainText.length < 100) {
137
+ const allDivs = Array.from(document.querySelectorAll('div'))
138
+ .map(div => getTextContent(div))
139
+ .filter(text => text.length > 200)
140
+ .sort((a, b) => b.length - a.length);
141
+
142
+ fallbackText = allDivs[0] || '';
143
+ }
144
+
145
+ const finalMainText = mainText || fallbackText;
146
+
147
+ // Extract links dengan filter yang lebih baik
148
+ const links = Array.from(document.querySelectorAll('a[href]'))
149
+ .map(a => ({
150
+ text: getTextContent(a),
151
+ href: a.href
152
+ }))
153
+ .filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
154
+ .slice(0, 20);
155
 
156
+ // Extract images dengan filter yang lebih baik
157
+ const images = Array.from(document.querySelectorAll('img[src]'))
158
+ .filter(img => img.src && !img.src.includes('data:image'))
159
+ .map(img => ({
160
+ src: img.src,
161
+ alt: img.alt || '',
162
+ title: img.title || ''
163
+ }))
164
+ .slice(0, 10);
165
+
166
+ // Calculate word count
167
+ const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
168
+ const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
169
 
170
  return {
171
  title,
 
175
  h2: h2Elements
176
  },
177
  paragraphs,
178
+ mainText: finalMainText,
179
+ links,
180
+ images,
181
+ wordCount,
182
+ hasContent: wordCount > 50 // Flag untuk mengecek apakah ada konten
183
  };
184
  });
185
 
 
190
  extractedAt: new Date().toISOString()
191
  };
192
  } catch (error) {
193
+ console.error(`Error extracting ${url}:`, error);
194
  return {
195
  url,
196
  success: false,
 
250
 
251
  try {
252
  browser = await chromium.launch({
253
+ args: [
254
+ '--incognito',
255
+ '--single-process',
256
+ '--no-sandbox',
257
+ '--no-zygote',
258
+ '--no-cache',
259
+ '--disable-dev-shm-usage',
260
+ '--disable-setuid-sandbox',
261
+ '--disable-accelerated-2d-canvas',
262
+ '--disable-gpu'
263
+ ],
264
  executablePath: process.env.CHROME_BIN,
265
  headless: true,
266
  });
 
277
 
278
  const successCount = results.filter(r => r.success).length;
279
  const failCount = results.filter(r => !r.success).length;
280
+ const emptyContentCount = results.filter(r => r.success && (!r.content.hasContent || r.content.wordCount < 50)).length;
281
 
282
  res.json({
283
  success: true,
 
285
  statistics: {
286
  total: validUrls.length,
287
  success: successCount,
288
+ failed: failCount,
289
+ emptyContent: emptyContentCount
290
  },
291
  results
292
  });