// Hosting-page banner captured together with the source ("Spaces: Sleeping");
// it is not part of the program.
// Third-party dependencies.
const express = require('express');
const bodyParser = require('body-parser');
const multer = require('multer');
const cheerio = require('cheerio');

// Project-local helpers.
const { minifyHtml } = require('./minify');
const { removeMedia } = require('./removeMedia');

const app = express();
// Upload middleware; the /process route reads the uploaded file from req.file.buffer.
const upload = multer();

// Serve static assets from ./public and parse JSON / URL-encoded request bodies.
app.use(express.static('public'));
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: true }));
/**
 * Reduces an error-like value (Error object, string, or anything else) to a
 * message of at most 100 characters, suitable for the status report.
 *
 * @param {unknown} err - The thrown value or reported error.
 * @param {string} [fallback] - Message used when `err` carries no text.
 * @returns {string} The truncated message.
 */
function summarizeStepError(err, fallback = 'Unknown error') {
  const message = typeof err === 'string' ? err : err?.message;
  return String(message || fallback).substring(0, 100);
}

/**
 * Runs one best-effort pipeline step, recording success or the failure
 * message on `statusEntry` instead of letting the exception propagate.
 *
 * @param {{success: boolean, error: (string|null)}} statusEntry - Status slot to update.
 * @param {string} label - Human-readable step name used in the log line.
 * @param {Function} action - The step to execute.
 */
function runOptionalStep(statusEntry, label, action) {
  try {
    action();
    statusEntry.success = true;
  } catch (err) {
    statusEntry.error = summarizeStepError(err);
    console.warn(`${label} failed:`, err);
  }
}

/**
 * Shrinks an HTML document for consumption by an LLM by running a series of
 * optional, independent clean-up steps. A failing step is recorded in the
 * returned status object and the pipeline continues with the next one.
 *
 * @param {string} html - The raw HTML markup.
 * @param {object} [options] - Step switches: `minifyHtml`, `removeScripts`,
 *   `removeStyles`, `removeMedia`, `cleanHead`, `handleRepeatingElements`,
 *   `truncateText`, plus `truncateLength` for the truncation step.
 * @returns {{html: string, json: string, status: object}} The processed
 *   markup, a JSON string describing the element structure, and per-step
 *   success/error information.
 */
function compressHtmlForLlm(html, options = {}) {
  const operationStatus = {
    minification: { success: false, error: null },
    cheerioLoad: { success: false, error: null },
    headCleaning: { success: false, error: null },
    scriptRemoval: { success: false, error: null },
    styleRemoval: { success: false, error: null },
    mediaRemoval: { success: false, error: null },
    repeatingElements: { success: false, error: null },
    textTruncation: { success: false, error: null },
  };
  let processed = html;

  // Step 1: Minification. On failure the original markup is used unchanged.
  if (options.minifyHtml) {
    try {
      const minifyResult = minifyHtml(html, {
        removeScripts: options.removeScripts,
        removeStyles: options.removeStyles,
      });
      if (minifyResult.success) {
        processed = minifyResult.minifiedHtml;
        operationStatus.minification = { success: true, error: null };
      } else {
        operationStatus.minification = {
          success: false,
          error: minifyResult.error?.message || 'Minification failed',
        };
      }
    } catch (err) {
      // Treat a throwing minifier like any other best-effort step.
      operationStatus.minification = {
        success: false,
        error: summarizeStepError(err, 'Minification failed'),
      };
      console.warn('Minification failed:', err);
    }
  }

  // Step 2: Load with Cheerio. Nothing else can run without a parsed document.
  let $;
  try {
    $ = cheerio.load(processed, {
      decodeEntities: false,
      xmlMode: false,
      lowerCaseTags: true,
    });
    operationStatus.cheerioLoad.success = true;
  } catch (err) {
    operationStatus.cheerioLoad.error = summarizeStepError(err);
    console.error('Cheerio load failed:', err);
    // Keep the same result shape as the success path (empty structure).
    return {
      html: processed,
      json: JSON.stringify([], null, 2),
      status: operationStatus,
    };
  }

  // Step 3: Remove scripts
  if (options.removeScripts) {
    runOptionalStep(operationStatus.scriptRemoval, 'Script removal', () => {
      $('script').remove();
    });
  }

  // Step 4: Remove styles
  if (options.removeStyles) {
    runOptionalStep(operationStatus.styleRemoval, 'Style removal', () => {
      $('style').remove();
      $('link[rel="stylesheet"]').remove();
    });
  }

  // Step 5: Remove media. removeMedia reports failure through its result
  // object, so both the result and thrown exceptions are handled.
  if (options.removeMedia) {
    try {
      const mediaResult = removeMedia($);
      if (mediaResult.success) {
        operationStatus.mediaRemoval.success = true;
      } else {
        // `error` may be a string or an Error object; accept both.
        operationStatus.mediaRemoval.error = summarizeStepError(
          mediaResult.error,
          'Media removal failed',
        );
        console.warn('Media removal failed:', mediaResult.error);
      }
    } catch (err) {
      operationStatus.mediaRemoval.error = summarizeStepError(err);
      console.warn('Media removal failed:', err);
    }
  }

  // Step 6: Clean head
  if (options.cleanHead) {
    runOptionalStep(operationStatus.headCleaning, 'Head cleaning', () => {
      cleanHead($);
    });
  }

  // Step 7: Handle repeating elements
  if (options.handleRepeatingElements) {
    runOptionalStep(operationStatus.repeatingElements, 'Repeating element handling', () => {
      handleRepeatingElements($);
    });
  }

  // Step 8: Truncate text
  if (options.truncateText) {
    runOptionalStep(operationStatus.textTruncation, 'Text truncation', () => {
      truncateText($, options.truncateLength);
    });
  }

  // Serialize; fall back to the pre-cheerio markup if serialization fails.
  let finalHtml;
  try {
    finalHtml = $.html();
  } catch (err) {
    console.error('Final HTML generation failed:', err);
    finalHtml = processed;
  }

  const structure = generateStructureJson($);
  return {
    html: finalHtml,
    json: JSON.stringify(structure, null, 2),
    status: operationStatus,
  };
}
/**
 * Strips non-essential nodes from every <head>: all <link> and <script>
 * elements, and every <meta> that is not a charset declaration, one of the
 * allow-listed names, or an Open Graph (`og:`) property.
 *
 * @param {Function} $ - The loaded cheerio document; modified in place.
 */
function cleanHead($) {
  const keptMetaNames = ['charset', 'viewport', 'description', 'keywords'];

  $('head').each((_, head) => {
    const $head = $(head);
    $head.find('link').remove();
    $head.find('script').remove();

    $head.find('meta').each((__, meta) => {
      const $meta = $(meta);
      const name = $meta.attr('name')?.toLowerCase();
      const property = $meta.attr('property')?.toLowerCase();
      // <meta charset="..."> carries the charset as an attribute, not as a
      // `name`, so it has to be detected separately to survive the clean-up.
      const declaresCharset = $meta.attr('charset') !== undefined;

      const isKept =
        declaresCharset ||
        keptMetaNames.includes(name) ||
        Boolean(property?.includes('og:'));
      if (!isKept) {
        $meta.remove();
      }
    });
  });
}
/**
 * Collapses runs of repetitive sibling elements. For every element with more
 * than three children that `areElementsSimilar` judges to be similar, only
 * the first, the middle and the last child are kept; the rest are removed.
 *
 * @param {Function} $ - The loaded cheerio document; modified in place.
 */
function handleRepeatingElements($) {
  $('*').each((_, elem) => {
    const children = $(elem).children();
    const total = children.length;
    if (total <= 3 || !areElementsSimilar(children, $)) {
      return;
    }

    // Keep one representative from the start, the middle and the end.
    const middleIndex = Math.floor(total / 2);
    const lastIndex = total - 1;
    children.each((index, child) => {
      const isKept = index === 0 || index === middleIndex || index === lastIndex;
      if (!isKept) {
        $(child).remove();
      }
    });
  });
}
/**
 * Shortens the text of every leaf element (an element without element
 * children) that is longer than `truncateLength`, keeping the beginning and
 * the end of the text joined by `...`.
 *
 * @param {Function} $ - The loaded cheerio document; modified in place.
 * @param {number} [truncateLength=100] - Maximum number of original
 *   characters to keep. Missing, non-numeric or non-positive values fall
 *   back to 100 instead of silently disabling or wiping the text.
 */
function truncateText($, truncateLength = 100) {
  const DEFAULT_LIMIT = 100;
  const requested = Math.floor(Number(truncateLength));
  const limit = Number.isFinite(requested) && requested > 0 ? requested : DEFAULT_LIMIT;

  // Integer halves so that exactly `limit` original characters survive.
  const headLength = Math.floor(limit / 2);
  const tailLength = limit - headLength;

  $('*').each((_, elem) => {
    const $elem = $(elem);
    if ($elem.children().length > 0) {
      return;
    }
    const text = $elem.text();
    if (text.length <= limit) {
      return;
    }
    const head = text.substring(0, headLength);
    const tail = text.substring(text.length - tailLength);
    $elem.text(`${head}...${tail}`);
  });
}
/**
 * Decides whether a set of sibling elements is repetitive: more than 70% of
 * them must share the first element's tag name and class list.
 *
 * Class lists are compared as sets of tokens, so `"a b"`, `"b  a"` and
 * `" a b "` are treated as equal, and a missing class equals an empty one.
 *
 * @param {object} elements - Cheerio selection of the siblings to compare.
 * @param {Function} $ - The loaded cheerio document.
 * @returns {boolean} `true` when the siblings are similar; always `false`
 *   for fewer than four elements.
 */
function areElementsSimilar(elements, $) {
  if (elements.length < 4) {
    return false;
  }

  const classSignature = (elem) => ($(elem).attr('class') ?? '')
    .split(/\s+/)
    .filter(Boolean)
    .sort()
    .join(' ');

  const firstTag = elements[0].tagName;
  const firstClasses = classSignature(elements[0]);

  let similarCount = 0;
  elements.each((_, elem) => {
    if (elem.tagName === firstTag && classSignature(elem) === firstClasses) {
      similarCount += 1;
    }
  });
  return similarCount / elements.length > 0.7;
}
/**
 * Builds a flat description of every element in the document: tag name,
 * attributes, the element's own (direct) text shortened to 25 + `...` + 25
 * characters when longer than 50, number of element children, and a simple
 * selector produced by `generateSelector`.
 *
 * @param {Function} $ - The loaded cheerio document.
 * @returns {Array<object>} One entry per element in document order, or an
 *   empty array if anything goes wrong while walking the document.
 */
function generateStructureJson($) {
  try {
    const structure = [];
    $('*').each((_, el) => {
      const $el = $(el);
      const attributes = { ...$el.attr() };

      // Read only the element's direct text nodes; this avoids cloning the
      // whole subtree of every element just to discard its children.
      const ownText = $el
        .contents()
        .filter((__, node) => node.type === 'text')
        .text()
        .trim();
      const truncatedText = ownText.length > 50
        ? `${ownText.substring(0, 25)}...${ownText.substring(ownText.length - 25)}`
        : ownText;

      structure.push({
        tag: el.tagName,
        // `undefined` values are dropped by JSON.stringify, keeping output small.
        attributes: Object.keys(attributes).length ? attributes : undefined,
        textContent: truncatedText || undefined,
        childrenCount: $el.children().length,
        selector: generateSelector($, el),
      });
    });
    return structure;
  } catch (err) {
    console.error('Structure generation failed:', err);
    return [];
  }
}
/**
 * Produces a simple selector for an element: `tag#id` when it has an id,
 * otherwise `tag.class1.class2` when it has classes, otherwise just `tag`.
 *
 * @param {Function} $ - The loaded cheerio document.
 * @param {object} element - The DOM element to describe.
 * @returns {string} The selector, or the tag name / `'unknown'` on failure.
 */
function generateSelector($, element) {
  try {
    const $el = $(element);
    const tag = element.tagName;

    const id = $el.attr('id')?.trim();
    if (id) {
      return `${tag}#${id}`;
    }

    // Split into tokens so leading, trailing or repeated whitespace cannot
    // produce empty class segments such as `div..a.`.
    const classNames = ($el.attr('class') || '').split(/\s+/).filter(Boolean);
    return classNames.length > 0 ? `${tag}.${classNames.join('.')}` : tag;
  } catch (err) {
    console.warn('Selector generation failed:', err);
    // `element` itself may be the reason for the failure, so access it safely.
    return element?.tagName || 'unknown';
  }
}
/**
 * Compares the original and processed markup by element count and by
 * string length.
 *
 * @param {string} html - The original markup.
 * @param {string} processed - The markup after processing.
 * @returns {object} `elementReduction` and `sizeReduction` as percentages
 *   with two decimals (or `'N/A'` when they cannot be computed), together
 *   with the raw element counts and character lengths.
 */
function computeStats(html, processed) {
  // Guard the division so an empty baseline yields 'N/A' rather than NaN%.
  const formatReduction = (before, after) => (before > 0
    ? `${((1 - after / before) * 100).toFixed(2)}%`
    : 'N/A');

  try {
    const $original = cheerio.load(html);
    const $processed = cheerio.load(processed);

    const originalElements = $original('*').length;
    const remainingElements = $processed('*').length;
    const originalSize = html.length;
    const processedSize = processed.length;

    return {
      elementReduction: formatReduction(originalElements, remainingElements),
      sizeReduction: formatReduction(originalSize, processedSize),
      originalElements,
      remainingElements,
      originalSize,
      processedSize,
    };
  } catch (err) {
    console.error('Stats computation failed:', err);
    return {
      elementReduction: 'N/A',
      sizeReduction: 'N/A',
      originalElements: 'N/A',
      remainingElements: 'N/A',
      originalSize: html?.length ?? 0,
      processedSize: processed?.length ?? 0,
    };
  }
}
/**
 * Interprets a request flag. Form submissions send the string `'true'`,
 * JSON bodies may send the boolean `true`; both enable the option.
 *
 * @param {unknown} value - Raw value from the request body.
 * @returns {boolean} Whether the flag is switched on.
 */
function parseBooleanFlag(value) {
  return value === true || value === 'true';
}

/**
 * Translates the request body into the options object expected by
 * `compressHtmlForLlm`.
 *
 * @param {object} body - The parsed request body.
 * @returns {object} Processing options; `truncateLength` defaults to 100
 *   when it is missing, not a number, or not positive.
 */
function parseProcessingOptions(body) {
  const requestedLength = Number.parseInt(body.truncateLength, 10);
  return {
    cleanHead: parseBooleanFlag(body.cleanHead),
    removeScripts: parseBooleanFlag(body.removeScripts),
    removeStyles: parseBooleanFlag(body.removeStyles),
    handleRepeatingElements: parseBooleanFlag(body.handleRepeatingElements),
    truncateText: parseBooleanFlag(body.truncateText),
    truncateLength: requestedLength > 0 ? requestedLength : 100,
    minifyHtml: parseBooleanFlag(body.minifyHtml),
    removeMedia: parseBooleanFlag(body.removeMedia),
  };
}

// POST /process — accepts HTML either as an uploaded `htmlFile` or as the
// `html` body field, runs the compression pipeline and returns the result
// together with size/element statistics.
app.post('/process', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};

    // An uploaded file takes precedence over the `html` field.
    const htmlContent = req.file
      ? req.file.buffer.toString('utf8')
      : body.html;

    // Reject missing, blank and non-string input as a client error.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    const options = parseProcessingOptions(body);
    const processed = compressHtmlForLlm(htmlContent, options);
    const stats = computeStats(htmlContent, processed.html);

    return res.json({
      success: true,
      result: processed,
      stats: {
        processingTime: `${Date.now() - startTime}ms`,
        elementReduction: stats.elementReduction,
        sizeReduction: stats.sizeReduction,
        originalElements: stats.originalElements,
        remainingElements: stats.remainingElements,
        originalSize: `${stats.originalSize} chars`,
        processedSize: `${stats.processedSize} chars`,
      },
      options,
      operationStatus: processed.status,
    });
  } catch (err) {
    console.error('Processing failed:', err);
    return res.status(500).json({
      error: 'Internal server error.',
      details: String(err?.message ?? err).substring(0, 100),
    });
  }
});
// Listen on the port given by the PORT environment variable, or 3000 if unset.
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`Server running on http://localhost:${PORT}`);
});