Update index.js
Browse files
index.js
CHANGED
@@ -23,20 +23,16 @@ async function extractContentFromUrl(url, browser) {
|
|
23 |
const page = await context.newPage();
|
24 |
|
25 |
try {
|
26 |
-
// Set default navigation timeout
|
27 |
page.setDefaultNavigationTimeout(30000);
|
28 |
page.setDefaultTimeout(30000);
|
29 |
|
30 |
-
// Intercept dan block resource yang tidak perlu
|
31 |
await page.route('**/*', (route) => {
|
32 |
const resourceType = route.request().resourceType();
|
33 |
const url = route.request().url();
|
34 |
|
35 |
-
// Block unnecessary resources
|
36 |
if (['font', 'media', 'websocket'].includes(resourceType)) {
|
37 |
route.abort();
|
38 |
}
|
39 |
-
// Block tracking and ads
|
40 |
else if (url.includes('google-analytics') ||
|
41 |
url.includes('doubleclick') ||
|
42 |
url.includes('facebook') ||
|
@@ -48,7 +44,6 @@ async function extractContentFromUrl(url, browser) {
|
|
48 |
}
|
49 |
});
|
50 |
|
51 |
-
// Try different wait strategies
|
52 |
try {
|
53 |
await page.goto(url, {
|
54 |
waitUntil: 'domcontentloaded',
|
@@ -62,28 +57,23 @@ async function extractContentFromUrl(url, browser) {
|
|
62 |
});
|
63 |
}
|
64 |
|
65 |
-
// Wait for body to be visible
|
66 |
try {
|
67 |
await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
|
68 |
} catch (e) {
|
69 |
console.log('Body selector timeout, continuing anyway');
|
70 |
}
|
71 |
|
72 |
-
// Wait a bit for dynamic content
|
73 |
await page.waitForTimeout(3000);
|
74 |
|
75 |
-
// Try to wait for common content selectors
|
76 |
const contentSelectors = ['article', 'main', '.content', '#content'];
|
77 |
for (const selector of contentSelectors) {
|
78 |
try {
|
79 |
await page.waitForSelector(selector, { timeout: 5000 });
|
80 |
break;
|
81 |
} catch (e) {
|
82 |
-
// Continue to next selector
|
83 |
}
|
84 |
}
|
85 |
|
86 |
-
// Gentle scroll untuk trigger lazy loading
|
87 |
await page.evaluate(() => {
|
88 |
return new Promise((resolve) => {
|
89 |
let totalHeight = 0;
|
@@ -106,7 +96,6 @@ async function extractContentFromUrl(url, browser) {
|
|
106 |
});
|
107 |
});
|
108 |
|
109 |
-
// Wait sebentar setelah scroll
|
110 |
await page.waitForTimeout(1000);
|
111 |
|
112 |
const content = await page.evaluate(() => {
|
@@ -114,18 +103,14 @@ async function extractContentFromUrl(url, browser) {
|
|
114 |
return text ? text.replace(/\s+/g, ' ').trim() : '';
|
115 |
};
|
116 |
|
117 |
-
// Helper function untuk mendapatkan text content yang lebih baik
|
118 |
const getTextContent = (element) => {
|
119 |
if (!element) return '';
|
120 |
|
121 |
-
// Clone element untuk manipulasi
|
122 |
const clone = element.cloneNode(true);
|
123 |
|
124 |
-
// Remove script dan style tags
|
125 |
const scripts = clone.querySelectorAll('script, style, noscript, iframe');
|
126 |
scripts.forEach(el => el.remove());
|
127 |
|
128 |
-
// Get text content
|
129 |
return cleanText(clone.textContent || clone.innerText || '');
|
130 |
};
|
131 |
|
@@ -133,7 +118,6 @@ async function extractContentFromUrl(url, browser) {
|
|
133 |
const metaDescription = document.querySelector('meta[name="description"]')?.content ||
|
134 |
document.querySelector('meta[property="og:description"]')?.content || '';
|
135 |
|
136 |
-
// Improved heading extraction
|
137 |
const h1Elements = Array.from(document.querySelectorAll('h1'))
|
138 |
.map(h1 => getTextContent(h1))
|
139 |
.filter(text => text.length > 0);
|
@@ -142,12 +126,10 @@ async function extractContentFromUrl(url, browser) {
|
|
142 |
.map(h2 => getTextContent(h2))
|
143 |
.filter(text => text.length > 0);
|
144 |
|
145 |
-
// Improved paragraph extraction
|
146 |
const paragraphs = Array.from(document.querySelectorAll('p'))
|
147 |
.map(p => getTextContent(p))
|
148 |
.filter(text => text.length > 20);
|
149 |
|
150 |
-
// Try multiple selectors for main content
|
151 |
const contentSelectors = [
|
152 |
'main',
|
153 |
'article',
|
@@ -162,7 +144,6 @@ async function extractContentFromUrl(url, browser) {
|
|
162 |
'[itemprop="articleBody"]',
|
163 |
'.story-body',
|
164 |
'.article-body',
|
165 |
-
// Detik specific selectors
|
166 |
'.detail__body-text',
|
167 |
'.detail__body',
|
168 |
'.itp_bodycontent'
|
@@ -174,14 +155,12 @@ async function extractContentFromUrl(url, browser) {
|
|
174 |
if (mainContent && getTextContent(mainContent).length > 100) break;
|
175 |
}
|
176 |
|
177 |
-
// Fallback: jika tidak ada main content, ambil dari body
|
178 |
if (!mainContent) {
|
179 |
mainContent = document.body;
|
180 |
}
|
181 |
|
182 |
const mainText = getTextContent(mainContent);
|
183 |
|
184 |
-
// Jika mainText masih kosong, coba ambil semua text dari div yang panjang
|
185 |
let fallbackText = '';
|
186 |
if (!mainText || mainText.length < 100) {
|
187 |
const allDivs = Array.from(document.querySelectorAll('div'))
|
@@ -194,7 +173,6 @@ async function extractContentFromUrl(url, browser) {
|
|
194 |
|
195 |
const finalMainText = mainText || fallbackText;
|
196 |
|
197 |
-
// Extract links dengan filter yang lebih baik
|
198 |
const links = Array.from(document.querySelectorAll('a[href]'))
|
199 |
.map(a => ({
|
200 |
text: getTextContent(a),
|
@@ -203,7 +181,6 @@ async function extractContentFromUrl(url, browser) {
|
|
203 |
.filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
|
204 |
.slice(0, 20);
|
205 |
|
206 |
-
// Extract images dengan filter yang lebih baik
|
207 |
const images = Array.from(document.querySelectorAll('img[src]'))
|
208 |
.filter(img => img.src && !img.src.includes('data:image'))
|
209 |
.map(img => ({
|
@@ -213,7 +190,6 @@ async function extractContentFromUrl(url, browser) {
|
|
213 |
}))
|
214 |
.slice(0, 10);
|
215 |
|
216 |
-
// Calculate word count
|
217 |
const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
|
218 |
const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
|
219 |
|
@@ -229,7 +205,7 @@ async function extractContentFromUrl(url, browser) {
|
|
229 |
links,
|
230 |
images,
|
231 |
wordCount,
|
232 |
-
hasContent: wordCount > 50
|
233 |
};
|
234 |
});
|
235 |
|
|
|
23 |
const page = await context.newPage();
|
24 |
|
25 |
try {
|
|
|
26 |
page.setDefaultNavigationTimeout(30000);
|
27 |
page.setDefaultTimeout(30000);
|
28 |
|
|
|
29 |
await page.route('**/*', (route) => {
|
30 |
const resourceType = route.request().resourceType();
|
31 |
const url = route.request().url();
|
32 |
|
|
|
33 |
if (['font', 'media', 'websocket'].includes(resourceType)) {
|
34 |
route.abort();
|
35 |
}
|
|
|
36 |
else if (url.includes('google-analytics') ||
|
37 |
url.includes('doubleclick') ||
|
38 |
url.includes('facebook') ||
|
|
|
44 |
}
|
45 |
});
|
46 |
|
|
|
47 |
try {
|
48 |
await page.goto(url, {
|
49 |
waitUntil: 'domcontentloaded',
|
|
|
57 |
});
|
58 |
}
|
59 |
|
|
|
60 |
try {
|
61 |
await page.waitForSelector('body', { state: 'visible', timeout: 10000 });
|
62 |
} catch (e) {
|
63 |
console.log('Body selector timeout, continuing anyway');
|
64 |
}
|
65 |
|
|
|
66 |
await page.waitForTimeout(3000);
|
67 |
|
|
|
68 |
const contentSelectors = ['article', 'main', '.content', '#content'];
|
69 |
for (const selector of contentSelectors) {
|
70 |
try {
|
71 |
await page.waitForSelector(selector, { timeout: 5000 });
|
72 |
break;
|
73 |
} catch (e) {
|
|
|
74 |
}
|
75 |
}
|
76 |
|
|
|
77 |
await page.evaluate(() => {
|
78 |
return new Promise((resolve) => {
|
79 |
let totalHeight = 0;
|
|
|
96 |
});
|
97 |
});
|
98 |
|
|
|
99 |
await page.waitForTimeout(1000);
|
100 |
|
101 |
const content = await page.evaluate(() => {
|
|
|
103 |
return text ? text.replace(/\s+/g, ' ').trim() : '';
|
104 |
};
|
105 |
|
|
|
106 |
const getTextContent = (element) => {
|
107 |
if (!element) return '';
|
108 |
|
|
|
109 |
const clone = element.cloneNode(true);
|
110 |
|
|
|
111 |
const scripts = clone.querySelectorAll('script, style, noscript, iframe');
|
112 |
scripts.forEach(el => el.remove());
|
113 |
|
|
|
114 |
return cleanText(clone.textContent || clone.innerText || '');
|
115 |
};
|
116 |
|
|
|
118 |
const metaDescription = document.querySelector('meta[name="description"]')?.content ||
|
119 |
document.querySelector('meta[property="og:description"]')?.content || '';
|
120 |
|
|
|
121 |
const h1Elements = Array.from(document.querySelectorAll('h1'))
|
122 |
.map(h1 => getTextContent(h1))
|
123 |
.filter(text => text.length > 0);
|
|
|
126 |
.map(h2 => getTextContent(h2))
|
127 |
.filter(text => text.length > 0);
|
128 |
|
|
|
129 |
const paragraphs = Array.from(document.querySelectorAll('p'))
|
130 |
.map(p => getTextContent(p))
|
131 |
.filter(text => text.length > 20);
|
132 |
|
|
|
133 |
const contentSelectors = [
|
134 |
'main',
|
135 |
'article',
|
|
|
144 |
'[itemprop="articleBody"]',
|
145 |
'.story-body',
|
146 |
'.article-body',
|
|
|
147 |
'.detail__body-text',
|
148 |
'.detail__body',
|
149 |
'.itp_bodycontent'
|
|
|
155 |
if (mainContent && getTextContent(mainContent).length > 100) break;
|
156 |
}
|
157 |
|
|
|
158 |
if (!mainContent) {
|
159 |
mainContent = document.body;
|
160 |
}
|
161 |
|
162 |
const mainText = getTextContent(mainContent);
|
163 |
|
|
|
164 |
let fallbackText = '';
|
165 |
if (!mainText || mainText.length < 100) {
|
166 |
const allDivs = Array.from(document.querySelectorAll('div'))
|
|
|
173 |
|
174 |
const finalMainText = mainText || fallbackText;
|
175 |
|
|
|
176 |
const links = Array.from(document.querySelectorAll('a[href]'))
|
177 |
.map(a => ({
|
178 |
text: getTextContent(a),
|
|
|
181 |
.filter(link => link.text && link.href && !link.href.startsWith('javascript:'))
|
182 |
.slice(0, 20);
|
183 |
|
|
|
184 |
const images = Array.from(document.querySelectorAll('img[src]'))
|
185 |
.filter(img => img.src && !img.src.includes('data:image'))
|
186 |
.map(img => ({
|
|
|
190 |
}))
|
191 |
.slice(0, 10);
|
192 |
|
|
|
193 |
const allText = finalMainText || paragraphs.join(' ') || document.body.innerText || '';
|
194 |
const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;
|
195 |
|
|
|
205 |
links,
|
206 |
images,
|
207 |
wordCount,
|
208 |
+
hasContent: wordCount > 50
|
209 |
};
|
210 |
});
|
211 |
|