|
const express = require('express'); |
|
const bodyParser = require('body-parser'); |
|
const multer = require('multer'); |
|
const cheerio = require('cheerio'); |
|
const { minifyHtml } = require('./minify'); |
|
const { removeMedia } = require('./removeMedia'); |
|
|
|
const app = express();

// Upper bound for request payloads, expressed once as the string form the
// body parsers take and once as the byte count multer takes.
const MAX_SIZE = '50mb';
const MAX_SIZE_BYTES = 50 * 1024 * 1024;

// Uploads are held in memory: the route handlers read `req.file.buffer`,
// which only exists with memory storage (multer's default, made explicit here).
const upload = multer({
  storage: multer.memoryStorage(),
  limits: {
    fileSize: MAX_SIZE_BYTES,
    fieldSize: MAX_SIZE_BYTES,
  },
});

app.use(express.static('public'));

// One JSON parser and one urlencoded parser are enough. `express.json` and
// `express.urlencoded` are the same body-parser middleware, and a second
// registration is skipped for every request whose body was already parsed.
app.use(bodyParser.json({ limit: MAX_SIZE }));
app.use(bodyParser.urlencoded({
  extended: true,
  limit: MAX_SIZE,
  parameterLimit: 50000,
}));
|
|
|
/**
 * Reduces a thrown value (Error, string, anything) to a message of at most
 * 100 characters so it can be stored in the operation status report.
 *
 * @param {unknown} err - The value that was thrown or reported as an error.
 * @returns {string} The shortened message.
 */
function summarizeStepError(err) {
  const message = err instanceof Error ? err.message : String(err);
  return message.substring(0, 100);
}

/**
 * Runs one best-effort compression step and records its outcome.
 * A failing step is logged and noted in `statusEntry`; it never throws.
 *
 * @param {{success: boolean, error: (string|null)}} statusEntry - Status slot to update.
 * @param {string} label - Human-readable step name used in the warning log.
 * @param {Function} step - The work to perform.
 */
function runCompressionStep(statusEntry, label, step) {
  try {
    step();
    statusEntry.success = true;
  } catch (err) {
    statusEntry.error = summarizeStepError(err);
    console.warn(`${label} failed:`, err);
  }
}

/**
 * Shrinks an HTML document by applying the steps enabled in `options`, in
 * this order: minification, script removal, style removal, media removal,
 * head cleaning, repeated-element thinning and text truncation.
 *
 * Every step is best-effort: a failure is recorded in the returned status
 * object and the remaining steps still run.
 *
 * @param {string} html - The HTML to compress.
 * @param {Object} [options] - Step switches: `minifyHtml`, `removeScripts`,
 *   `removeStyles`, `removeMedia`, `cleanHead`, `handleRepeatingElements`,
 *   `truncateText` (booleans) and `truncateLength` (number).
 * @returns {{html: string, json: string, status: Object}} The compressed
 *   HTML, a pretty-printed JSON description of the remaining elements, and
 *   a per-step `{ success, error }` report.
 */
function compressHtmlForLlm(html, options = {}) {
  const operationStatus = {
    minification: { success: false, error: null },
    cheerioLoad: { success: false, error: null },
    headCleaning: { success: false, error: null },
    scriptRemoval: { success: false, error: null },
    styleRemoval: { success: false, error: null },
    mediaRemoval: { success: false, error: null },
    repeatingElements: { success: false, error: null },
    textTruncation: { success: false, error: null },
  };

  let processed = html;

  if (options.minifyHtml) {
    try {
      const minifyResult = minifyHtml(html, {
        removeScripts: options.removeScripts,
        removeStyles: options.removeStyles,
      });

      if (minifyResult.success) {
        processed = minifyResult.minifiedHtml;
        operationStatus.minification.success = true;
      } else {
        operationStatus.minification.error =
          minifyResult.error?.message || 'Minification failed';
      }
    } catch (err) {
      // A throwing minifier must not abort the pipeline; continue with the
      // unminified input like any other failed step.
      operationStatus.minification.error = summarizeStepError(err);
      console.warn('Minification failed:', err);
    }
  }

  let $ = null;
  try {
    $ = cheerio.load(processed, {
      decodeEntities: false,
      xmlMode: false,
      lowerCaseTags: true,
    });
    operationStatus.cheerioLoad.success = true;
  } catch (err) {
    operationStatus.cheerioLoad.error = summarizeStepError(err);
    console.error('Cheerio load failed:', err);
    // Keep the result shape identical to the success path: without a parsed
    // document the structure listing is simply empty.
    return {
      html: processed,
      json: JSON.stringify([], null, 2),
      status: operationStatus,
    };
  }

  if (options.removeScripts) {
    runCompressionStep(operationStatus.scriptRemoval, 'Script removal', () => {
      $('script').remove();
    });
  }

  if (options.removeStyles) {
    runCompressionStep(operationStatus.styleRemoval, 'Style removal', () => {
      $('style').remove();
      $('link[rel="stylesheet"]').remove();
    });
  }

  if (options.removeMedia) {
    // removeMedia reports failure through its return value as well as by
    // throwing, so it is handled separately from the generic step runner.
    try {
      const mediaResult = removeMedia($);
      if (mediaResult.success) {
        operationStatus.mediaRemoval.success = true;
      } else {
        operationStatus.mediaRemoval.error = summarizeStepError(mediaResult.error);
        console.warn('Media removal failed:', mediaResult.error);
      }
    } catch (err) {
      operationStatus.mediaRemoval.error = summarizeStepError(err);
      console.warn('Media removal failed:', err);
    }
  }

  if (options.cleanHead) {
    runCompressionStep(operationStatus.headCleaning, 'Head cleaning', () => {
      cleanHead($);
    });
  }

  if (options.handleRepeatingElements) {
    runCompressionStep(operationStatus.repeatingElements, 'Repeating element handling', () => {
      handleRepeatingElements($);
    });
  }

  if (options.truncateText) {
    runCompressionStep(operationStatus.textTruncation, 'Text truncation', () => {
      truncateText($, options.truncateLength);
    });
  }

  let finalHtml;
  try {
    finalHtml = $.html();
  } catch (err) {
    console.error('Final HTML generation failed:', err);
    // Fall back to the (possibly minified) input rather than returning nothing.
    finalHtml = processed;
  }

  const structure = generateStructureJson($);

  return {
    html: finalHtml,
    json: JSON.stringify(structure, null, 2),
    status: operationStatus,
  };
}
|
|
|
/**
 * Strips every `<head>` down to the metadata worth keeping: all `<link>`
 * and `<script>` elements are removed, and `<meta>` elements survive only
 * when they declare the charset, carry one of the whitelisted `name`
 * values, or have a `property` containing `og:`.
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 */
function cleanHead($) {
  const keptMetaNames = ['charset', 'viewport', 'description', 'keywords'];

  $('head').each((_, head) => {
    const $head = $(head);
    $head.find('link').remove();
    $head.find('script').remove();

    $head.find('meta').each((_, meta) => {
      const $meta = $(meta);
      const name = $meta.attr('name')?.toLowerCase();
      const property = $meta.attr('property')?.toLowerCase();
      // `<meta charset="utf-8">` has no `name` attribute, so the whitelist
      // alone would never match it; check for the attribute itself.
      const declaresCharset = $meta.attr('charset') !== undefined;

      const shouldKeep =
        declaresCharset ||
        keptMetaNames.includes(name) ||
        Boolean(property?.includes('og:'));

      if (!shouldKeep) {
        $meta.remove();
      }
    });
  });
}
|
|
|
/**
 * Thins out runs of look-alike siblings. For every element with more than
 * three children that `areElementsSimilar` judges similar, only the first
 * child, the last child and the child at index `floor(count / 2)` are kept;
 * the other children are removed from the document.
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 */
function handleRepeatingElements($) {
  $('*').each((_, node) => {
    const kids = $(node).children();
    if (kids.length <= 3 || !areElementsSimilar(kids, $)) {
      return;
    }

    // Position of the surviving middle child, counted within the inner
    // slice (which starts at the second child, hence the -1).
    const keptInnerIndex = Math.floor(kids.length / 2) - 1;

    kids.slice(1, -1).each((innerIndex, kid) => {
      if (innerIndex === keptInnerIndex) {
        return;
      }
      $(kid).remove();
    });
  });
}
|
|
|
/**
 * Shortens the text of every element that has no child elements. Text
 * longer than `truncateLength` is replaced by its first half, `...`, and
 * its last half (for odd lengths the tail gets the extra character).
 *
 * A `truncateLength` that is not a positive finite number leaves the
 * document untouched; a zero or negative limit would otherwise replace all
 * leaf text with `...`.
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 * @param {number} truncateLength - Maximum text length before truncation applies.
 */
function truncateText($, truncateLength) {
  const limit = Number(truncateLength);
  if (!Number.isFinite(limit) || limit <= 0) {
    return;
  }

  const headLength = Math.floor(limit / 2);
  const tailLength = Math.ceil(limit / 2);

  $('*').each((_, elem) => {
    const $elem = $(elem);
    if ($elem.children().length !== 0) {
      return;
    }

    const text = $elem.text();
    if (text.length <= limit) {
      return;
    }

    const head = text.substring(0, headLength);
    const tail = text.substring(text.length - tailLength);
    $elem.text(`${head}...${tail}`);
  });
}
|
|
|
/**
 * Decides whether a set of sibling elements looks like a repeated pattern.
 * An element counts as a match when both its tag name and its exact
 * `class` attribute equal those of the first element.
 *
 * @param {Object} elements - Array-like Cheerio collection of elements.
 * @param {Function} $ - Cheerio instance used to read attributes.
 * @returns {boolean} True when there are at least four elements and more
 *   than 70% of them match the first one.
 */
function areElementsSimilar(elements, $) {
  const total = elements.length;
  if (total < 4) {
    return false;
  }

  const reference = elements[0];
  const referenceTag = reference.tagName;
  const referenceClass = $(reference).attr('class');

  let matches = 0;
  for (let index = 0; index < total; index += 1) {
    const candidate = elements[index];
    if (candidate.tagName !== referenceTag) {
      continue;
    }
    if ($(candidate).attr('class') === referenceClass) {
      matches += 1;
    }
  }

  return matches / total > 0.7;
}
|
|
|
/**
 * Builds a flat, document-order listing of every element in the document.
 * Each entry holds the tag name, the element's attributes (omitted when
 * there are none), the element's own text with child elements excluded
 * (shortened to 25 + `...` + 25 characters when longer than 50, omitted
 * when empty), the number of child elements and a selector from
 * `generateSelector`.
 *
 * @param {Function} $ - Cheerio instance for the loaded document.
 * @returns {Array<Object>} The listing, or an empty array if anything throws.
 */
function generateStructureJson($) {
  // Keeps the start and end of long text so both ends stay recognisable.
  const shorten = (text) => {
    if (text.length <= 50) {
      return text;
    }
    return text.substring(0, 25) + '...' + text.substring(text.length - 25);
  };

  // Text belonging to the element itself: work on a copy so that dropping
  // the child elements does not touch the real document.
  const ownTextOf = ($node) => {
    const copy = $node.clone();
    copy.children().remove();
    return copy.text().trim();
  };

  try {
    const entries = [];

    $('*').each((_, node) => {
      const $node = $(node);
      const attrs = { ...($node.attr() || {}) };
      const hasAttrs = Object.keys(attrs).length > 0;
      const text = shorten(ownTextOf($node));

      entries.push({
        tag: node.tagName,
        attributes: hasAttrs ? attrs : undefined,
        textContent: text || undefined,
        childrenCount: $node.children().length,
        selector: generateSelector($, node),
      });
    });

    return entries;
  } catch (err) {
    console.error('Structure generation failed:', err);
    return [];
  }
}
|
|
|
/**
 * Builds a short CSS-like selector for an element: `tag#id` when the
 * element has an id, otherwise `tag.class1.class2` when it has classes,
 * otherwise just the tag name.
 *
 * Note: id and class values are inserted verbatim; characters that are
 * special in CSS selectors are not escaped.
 *
 * @param {Function} $ - Cheerio instance used to read attributes.
 * @param {Object} element - The DOM node to describe.
 * @returns {string} The selector, or the tag name / `'unknown'` on failure.
 */
function generateSelector($, element) {
  try {
    const $el = $(element);
    const tag = element.tagName;

    const id = $el.attr('id');
    if (id) {
      return `${tag}#${id}`;
    }

    // Split on whitespace and drop empty pieces so that padded or
    // whitespace-only class attributes cannot yield `tag..a.` or `tag.`.
    const classNames = ($el.attr('class') || '').split(/\s+/).filter(Boolean);
    if (classNames.length > 0) {
      return `${tag}.${classNames.join('.')}`;
    }

    return tag;
  } catch (err) {
    console.warn('Selector generation failed:', err);
    // `element` itself may be the reason for the failure (e.g. null).
    return element?.tagName || 'unknown';
  }
}
|
|
|
/**
 * Formats how much smaller `remaining` is than `original` as a percentage
 * rounded to at most two decimals (e.g. `50%`, `66.67%`).
 *
 * @param {number} remaining - Value after processing.
 * @param {number} original - Value before processing.
 * @returns {string} The percentage, or `'N/A'` when `original` is not positive.
 */
function formatReductionPercent(remaining, original) {
  if (!(original > 0)) {
    return 'N/A';
  }
  const percent = (1 - remaining / original) * 100;
  // Round-trip through Number to drop trailing zeros and a possible "-0".
  return `${Number(percent.toFixed(2))}%`;
}

/**
 * Compares the original and processed HTML by element count and by string
 * length.
 *
 * @param {string} html - The HTML before processing.
 * @param {string} processed - The HTML after processing.
 * @returns {Object} `elementReduction` and `sizeReduction` (percent strings
 *   or `'N/A'`), `originalElements`, `remainingElements`, `originalSize`
 *   and `processedSize`. If parsing fails, the element figures are `'N/A'`.
 */
function computeStats(html, processed) {
  try {
    const originalElementCount = cheerio.load(html)('*').length;
    const processedElementCount = cheerio.load(processed)('*').length;
    const originalTextLength = html.length;
    const processedTextLength = processed.length;

    return {
      elementReduction: formatReductionPercent(processedElementCount, originalElementCount),
      sizeReduction: formatReductionPercent(processedTextLength, originalTextLength),
      originalElements: originalElementCount,
      remainingElements: processedElementCount,
      originalSize: originalTextLength,
      processedSize: processedTextLength,
    };
  } catch (err) {
    console.error('Stats computation failed:', err);
    return {
      elementReduction: 'N/A',
      sizeReduction: 'N/A',
      originalElements: 'N/A',
      remainingElements: 'N/A',
      // The inputs may be what caused the failure, so read them defensively.
      originalSize: typeof html === 'string' ? html.length : 'N/A',
      processedSize: typeof processed === 'string' ? processed.length : 'N/A',
    };
  }
}
|
|
|
/**
 * Checks that an extractor script declares a function named `extract`.
 * This is a textual check only: it looks for `function extract(` (any
 * whitespace allowed around the name) and does not parse the script.
 *
 * @param {string} scriptContent - The script source to check.
 * @throws {Error} When the input is not a string or has no such declaration.
 */
function validateScript(scriptContent) {
  const extractDeclaration = /function\s+extract\s*\(/;

  // Non-string input (undefined, numbers, arrays from parsed bodies) is
  // rejected with the same message instead of a stray TypeError.
  if (typeof scriptContent !== 'string' || !extractDeclaration.test(scriptContent)) {
    throw new Error('Script must contain a function named "extract"');
  }
}
|
|
|
/**
 * Runs a caller-supplied extractor script against an HTML string.
 * The script must define `extract(input, cheerio)` and return an object
 * with `success`, `data` and `error` fields. Any failure (validation,
 * syntax error, exception in the script, malformed result) is converted
 * into `{ success: false, data: null, error: <message> }`; this function
 * does not throw.
 *
 * @param {string} html - HTML passed to the script as `input`.
 * @param {string} scriptContent - Source code of the extractor script.
 * @returns {{success: *, data: *, error: *}} The script's result object, or
 *   the failure object described above.
 */
function executeCheerioScript(html, scriptContent) {
  try {
    validateScript(scriptContent);

    // SECURITY: `new Function` runs the script with the full privileges of
    // this process (file system, network, environment). It is NOT a sandbox.
    // Expose this only to trusted callers, or move execution into an
    // isolated process/VM before accepting scripts from untrusted users.
    const extractorFunction = new Function('input', 'cheerio', `
      ${scriptContent}
      return extract(input, cheerio);
    `);

    const result = extractorFunction(html, cheerio);

    if (!result || typeof result !== 'object') {
      throw new Error('Extract function must return an object');
    }

    if (!('success' in result && 'data' in result && 'error' in result)) {
      throw new Error('Return object must contain success, data, and error fields');
    }

    return result;
  } catch (err) {
    return {
      success: false,
      data: null,
      // Scripts may throw anything, including strings or null.
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
|
|
|
/**
 * Interprets a request flag. Multipart and urlencoded bodies deliver the
 * string `'true'`, JSON bodies may deliver the boolean `true`; both enable
 * the option, everything else leaves it off.
 *
 * @param {*} value - Raw value from the request body.
 * @returns {boolean} Whether the flag is switched on.
 */
function isFlagEnabled(value) {
  return value === true || value === 'true';
}

/**
 * POST /process — compresses HTML supplied either as an uploaded
 * `htmlFile` or as the `html` body field, using the option flags in the
 * request body. Responds with the compression result, size/element
 * statistics, the effective options and the per-step status report.
 * Responds 400 when no usable HTML is supplied and 500 on unexpected errors.
 */
app.post('/process', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};
    const htmlContent = req.file ? req.file.buffer.toString('utf8') : body.html;

    // A JSON body can carry a non-string `html`; treat that as missing
    // input rather than failing later on `.trim()`.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    const requestedLength = Number.parseInt(body.truncateLength, 10);

    const options = {
      cleanHead: isFlagEnabled(body.cleanHead),
      removeScripts: isFlagEnabled(body.removeScripts),
      removeStyles: isFlagEnabled(body.removeStyles),
      handleRepeatingElements: isFlagEnabled(body.handleRepeatingElements),
      truncateText: isFlagEnabled(body.truncateText),
      // Missing, unparsable, zero or negative lengths fall back to 100.
      truncateLength: requestedLength > 0 ? requestedLength : 100,
      minifyHtml: isFlagEnabled(body.minifyHtml),
      removeMedia: isFlagEnabled(body.removeMedia),
    };

    const processed = compressHtmlForLlm(htmlContent, options);
    const stats = computeStats(htmlContent, processed.html);

    return res.json({
      success: true,
      result: processed,
      stats: {
        processingTime: `${Date.now() - startTime}ms`,
        elementReduction: stats.elementReduction,
        sizeReduction: stats.sizeReduction,
        originalElements: stats.originalElements,
        remainingElements: stats.remainingElements,
        originalSize: `${stats.originalSize} chars`,
        processedSize: `${stats.processedSize} chars`,
      },
      options,
      operationStatus: processed.status,
    });
  } catch (err) {
    console.error('Processing failed:', err);
    return res.status(500).json({
      error: 'Internal server error.',
      // The thrown value is not guaranteed to be an Error.
      details: String(err?.message ?? err).substring(0, 100),
    });
  }
});
|
|
|
/**
 * POST /extract — runs a caller-supplied extractor script (body field
 * `script`) against HTML supplied either as an uploaded `htmlFile` or as
 * the `html` body field. Responds with the script's `success`, `data` and
 * `error` values plus the processing time. Responds 400 when the HTML or
 * the script is missing and 500 on unexpected errors.
 *
 * SECURITY: the script is executed inside this server process by
 * `executeCheerioScript`; this route must not be reachable by untrusted
 * clients unless that execution is sandboxed.
 */
app.post('/extract', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};
    const htmlContent = req.file ? req.file.buffer.toString('utf8') : body.html;
    const extractorScript = body.script;

    // JSON bodies can carry non-string values; reject them as missing input
    // instead of failing later with a 500.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    if (typeof extractorScript !== 'string' || !extractorScript.trim()) {
      return res.status(400).json({ error: 'No extractor script provided.' });
    }

    const result = executeCheerioScript(htmlContent, extractorScript);

    return res.json({
      success: result.success,
      data: result.data,
      error: result.error,
      processingTime: `${Date.now() - startTime}ms`,
    });
  } catch (err) {
    console.error('Extraction failed:', err);
    return res.status(500).json({
      success: false,
      error: 'Internal server error.',
      // The thrown value is not guaranteed to be an Error.
      details: String(err?.message ?? err).substring(0, 100),
    });
  }
});
|
|
|
// Listening port: taken from the PORT environment variable when it is set
// to a non-empty value, otherwise 3000.
const PORT = process.env.PORT || 3000;

// Start accepting connections and announce the local URL once listening.
app.listen(PORT, function announceServer() {
  console.log(`Server running on http://localhost:${PORT}`);
});