vyles committed on
Commit
95588a7
·
verified ·
1 Parent(s): c80ffd3

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +1 -25
index.js CHANGED
@@ -23,20 +23,16 @@ async function extractContentFromUrl(url, browser) {
23
  const page = await context.newPage();
24
 
25
  try {
26
- // Set default navigation timeout
27
  page.setDefaultNavigationTimeout(30000);
28
  page.setDefaultTimeout(30000);
29
 
30
- // Intercept dan block resource yang tidak perlu
31
  await page.route('**/*', (route) => {
32
  const resourceType = route.request().resourceType();
33
  const url = route.request().url();
34
 
35
- // Block unnecessary resources
36
  if (['font', 'media', 'websocket'].includes(resourceType)) {
37
  route.abort();
38
  }
39
- // Block tracking and ads
40
  else if (url.includes('google-analytics') ||
41
  url.includes('doubleclick') ||
42
  url.includes('facebook') ||
@@ -48,7 +44,6 @@ async function extractContentFromUrl(url, browser) {
48
  }
49
  });
50
 
51
- // Try different wait strategies
52
  try {
53
  await page.goto(url, {
54
  waitUntil: 'domcontentloaded',
@@ -62,28 +57,23 @@ async function extractContentFromUrl(url, browser) {
62
  });
63
  }
64
 
65
- // Wait for body to be visible
66
  try {
67
  await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
68
  } catch (e) {
69
  console.log('Body selector timeout, continuing anyway');
70
  }
71
 
72
- // Wait a bit for dynamic content
73
  await page.waitForTimeout(3000);
74
 
75
- // Try to wait for common content selectors
76
  const contentSelectors = ['article', 'main', '.content', '#content'];
77
  for (const selector of contentSelectors) {
78
  try {
79
  await page.waitForSelector(selector, { timeout: 5000 });
80
  break;
81
  } catch (e) {
82
- // Continue to next selector
83
  }
84
  }
85
 
86
- // Gentle scroll untuk trigger lazy loading
87
  await page.evaluate(() => {
88
  return new Promise((resolve) => {
89
  let totalHeight = 0;
@@ -106,7 +96,6 @@ async function extractContentFromUrl(url, browser) {
106
  });
107
  });
108
 
109
- // Wait sebentar setelah scroll
110
  await page.waitForTimeout(1000);
111
 
112
  const content = await page.evaluate(() => {
@@ -114,18 +103,14 @@ async function extractContentFromUrl(url, browser) {
114
  return text ? text.replace(/\s+/g, ' ').trim() : '';
115
  };
116
 
117
- // Helper function untuk mendapatkan text content yang lebih baik
118
  const getTextContent = (element) => {
119
  if (!element) return '';
120
 
121
- // Clone element untuk manipulasi
122
  const clone = element.cloneNode(true);
123
 
124
- // Remove script dan style tags
125
  const scripts = clone.querySelectorAll('script, style, noscript, iframe');
126
  scripts.forEach(el => el.remove());
127
 
128
- // Get text content
129
  return cleanText(clone.textContent || clone.innerText || '');
130
  };
131
 
@@ -133,7 +118,6 @@ async function extractContentFromUrl(url, browser) {
133
  const metaDescription = document.querySelector('meta[name="description"]')?.content ||
134
  document.querySelector('meta[property="og:description"]')?.content || '';
135
 
136
- // Improved heading extraction
137
  const h1Elements = Array.from(document.querySelectorAll('h1'))
138
  .map(h1 => getTextContent(h1))
139
  .filter(text => text.length > 0);
@@ -142,12 +126,10 @@ async function extractContentFromUrl(url, browser) {
142
  .map(h2 => getTextContent(h2))
143
  .filter(text => text.length > 0);
144
 
145
- // Improved paragraph extraction
146
  const paragraphs = Array.from(document.querySelectorAll('p'))
147
  .map(p => getTextContent(p))
148
  .filter(text => text.length > 20);
149
 
150
- // Try multiple selectors for main content
151
  const contentSelectors = [
152
  'main',
153
  'article',
@@ -162,7 +144,6 @@ async function extractContentFromUrl(url, browser) {
162
  '[itemprop="articleBody"]',
163
  '.story-body',
164
  '.article-body',
165
- // Detik specific selectors
166
  '.detail__body-text',
167
  '.detail__body',
168
  '.itp_bodycontent'
@@ -174,14 +155,12 @@ async function extractContentFromUrl(url, browser) {
174
  if (mainContent && getTextContent(mainContent).length > 100) break;
175
  }
176
 
177
- // Fallback: jika tidak ada main content, ambil dari body
178
  if (!mainContent) {
179
  mainContent = document.body;
180
  }
181
 
182
  const mainText = getTextContent(mainContent);
183
 
184
- // Jika mainText masih kosong, coba ambil semua text dari div yang panjang
185
  let fallbackText = '';
186
  if (!mainText || mainText.length < 100) {
187
  const allDivs = Array.from(document.querySelectorAll('div'))
@@ -194,7 +173,6 @@ async function extractContentFromUrl(url, browser) {
194
 
195
  const finalMainText = mainText || fallbackText;
196
 
197
- // Extract links dengan filter yang lebih baik
198
  const links = Array.from(document.querySelectorAll('a[href]'))
199
  .map(a => ({
200
  text: getTextContent(a),
@@ -203,7 +181,6 @@ async function extractContentFromUrl(url, browser) {
203
  .filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
204
  .slice(0, 20);
205
 
206
- // Extract images dengan filter yang lebih baik
207
  const images = Array.from(document.querySelectorAll('img[src]'))
208
  .filter(img => img.src && !img.src.includes('data:image'))
209
  .map(img => ({
@@ -213,7 +190,6 @@ async function extractContentFromUrl(url, browser) {
213
  }))
214
  .slice(0, 10);
215
 
216
- // Calculate word count
217
  const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
218
  const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
219
 
@@ -229,7 +205,7 @@ async function extractContentFromUrl(url, browser) {
229
  links,
230
  images,
231
  wordCount,
232
- hasContent: wordCount > 50 // Flag untuk mengecek apakah ada konten
233
  };
234
  });
235
 
 
23
  const page = await context.newPage();
24
 
25
  try {
 
26
  page.setDefaultNavigationTimeout(30000);
27
  page.setDefaultTimeout(30000);
28
 
 
29
  await page.route('**/*', (route) => {
30
  const resourceType = route.request().resourceType();
31
  const url = route.request().url();
32
 
 
33
  if (['font', 'media', 'websocket'].includes(resourceType)) {
34
  route.abort();
35
  }
 
36
  else if (url.includes('google-analytics') ||
37
  url.includes('doubleclick') ||
38
  url.includes('facebook') ||
 
44
  }
45
  });
46
 
 
47
  try {
48
  await page.goto(url, {
49
  waitUntil: 'domcontentloaded',
 
57
  });
58
  }
59
 
 
60
  try {
61
  await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
62
  } catch (e) {
63
  console.log('Body selector timeout, continuing anyway');
64
  }
65
 
 
66
  await page.waitForTimeout(3000);
67
 
 
68
  const contentSelectors = ['article', 'main', '.content', '#content'];
69
  for (const selector of contentSelectors) {
70
  try {
71
  await page.waitForSelector(selector, { timeout: 5000 });
72
  break;
73
  } catch (e) {
 
74
  }
75
  }
76
 
 
77
  await page.evaluate(() => {
78
  return new Promise((resolve) => {
79
  let totalHeight = 0;
 
96
  });
97
  });
98
 
 
99
  await page.waitForTimeout(1000);
100
 
101
  const content = await page.evaluate(() => {
 
103
  return text ? text.replace(/\s+/g, ' ').trim() : '';
104
  };
105
 
 
106
  const getTextContent = (element) => {
107
  if (!element) return '';
108
 
 
109
  const clone = element.cloneNode(true);
110
 
 
111
  const scripts = clone.querySelectorAll('script, style, noscript, iframe');
112
  scripts.forEach(el => el.remove());
113
 
 
114
  return cleanText(clone.textContent || clone.innerText || '');
115
  };
116
 
 
118
  const metaDescription = document.querySelector('meta[name="description"]')?.content ||
119
  document.querySelector('meta[property="og:description"]')?.content || '';
120
 
 
121
  const h1Elements = Array.from(document.querySelectorAll('h1'))
122
  .map(h1 => getTextContent(h1))
123
  .filter(text => text.length > 0);
 
126
  .map(h2 => getTextContent(h2))
127
  .filter(text => text.length > 0);
128
 
 
129
  const paragraphs = Array.from(document.querySelectorAll('p'))
130
  .map(p => getTextContent(p))
131
  .filter(text => text.length > 20);
132
 
 
133
  const contentSelectors = [
134
  'main',
135
  'article',
 
144
  '[itemprop="articleBody"]',
145
  '.story-body',
146
  '.article-body',
 
147
  '.detail__body-text',
148
  '.detail__body',
149
  '.itp_bodycontent'
 
155
  if (mainContent && getTextContent(mainContent).length > 100) break;
156
  }
157
 
 
158
  if (!mainContent) {
159
  mainContent = document.body;
160
  }
161
 
162
  const mainText = getTextContent(mainContent);
163
 
 
164
  let fallbackText = '';
165
  if (!mainText || mainText.length < 100) {
166
  const allDivs = Array.from(document.querySelectorAll('div'))
 
173
 
174
  const finalMainText = mainText || fallbackText;
175
 
 
176
  const links = Array.from(document.querySelectorAll('a[href]'))
177
  .map(a => ({
178
  text: getTextContent(a),
 
181
  .filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
182
  .slice(0, 20);
183
 
 
184
  const images = Array.from(document.querySelectorAll('img[src]'))
185
  .filter(img => img.src && !img.src.includes('data:image'))
186
  .map(img => ({
 
190
  }))
191
  .slice(0, 10);
192
 
 
193
  const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
194
  const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
195
 
 
205
  links,
206
  images,
207
  wordCount,
208
+ hasContent: wordCount > 50
209
  };
210
  });
211