const express = require('express');
const bodyParser = require('body-parser');
const multer = require('multer');
const cheerio = require('cheerio');
const { minifyHtml } = require('./minify');
const { removeMedia } = require('./removeMedia');

const app = express();

// Size limits: a byte count for multer and the equivalent string for the body parsers.
const MAX_SIZE_BYTES = 50 * 1024 * 1024;
const MAX_SIZE = '50mb';

// Fallback used when the request does not carry a usable truncateLength.
const DEFAULT_TRUNCATE_LENGTH = 100;

// Error text stored in status objects and responses is capped at this many characters.
const MAX_ERROR_LENGTH = 100;

// <meta name="..."> values that cleanHead keeps.
const KEPT_META_NAMES = ['charset', 'viewport', 'description', 'keywords'];

// Configure multer with size limits (uploaded file and individual text fields).
const upload = multer({
  limits: {
    fileSize: MAX_SIZE_BYTES,
    fieldSize: MAX_SIZE_BYTES,
  },
});

// Configure static files and body parsers. Each parser is registered once:
// a second JSON/urlencoded parser behind the first would only see bodies that
// have already been parsed.
app.use(express.static('public'));
app.use(bodyParser.json({ limit: MAX_SIZE }));
app.use(
  bodyParser.urlencoded({
    extended: true,
    limit: MAX_SIZE,
    parameterLimit: 50000,
  }),
);

/**
 * Turns a thrown value or an error field into a short message.
 *
 * @param {unknown} err - An Error, a string, or anything else.
 * @param {string} [fallback] - Text used when no message can be read from `err`.
 * @returns {string} The message, capped at MAX_ERROR_LENGTH characters.
 */
function describeError(err, fallback = 'Unknown error') {
  const message = typeof err === 'string' ? err : err?.message;
  return String(message || fallback).substring(0, MAX_ERROR_LENGTH);
}

/**
 * Runs one best-effort processing step and records its outcome.
 *
 * @param {{success: boolean, error: (string|null)}} status - Status entry to update in place.
 * @param {string} label - Human-readable step name used in the warning log.
 * @param {() => void} action - The step itself; a throw marks the step as failed.
 */
function runStep(status, label, action) {
  try {
    action();
    status.success = true;
  } catch (err) {
    status.error = describeError(err);
    console.warn(`${label} failed:`, err);
  }
}

/**
 * Applies the enabled reduction steps to an HTML string and reports which
 * steps succeeded.
 *
 * Every step is best-effort: a failing step is recorded in the returned status
 * and the remaining steps still run. Only a failure to parse the HTML with
 * cheerio stops processing early.
 *
 * @param {string} html - The HTML to process.
 * @param {object} [options] - Step switches: minifyHtml, removeScripts,
 *   removeStyles, removeMedia, cleanHead, handleRepeatingElements,
 *   truncateText, plus truncateLength for the truncation step.
 * @returns {{html: string, json: string, status: object}} The processed HTML,
 *   a pretty-printed JSON description of the remaining elements, and the
 *   per-step status map.
 */
function compressHtmlForLlm(html, options = {}) {
  const operationStatus = {
    minification: { success: false, error: null },
    cheerioLoad: { success: false, error: null },
    headCleaning: { success: false, error: null },
    scriptRemoval: { success: false, error: null },
    styleRemoval: { success: false, error: null },
    mediaRemoval: { success: false, error: null },
    repeatingElements: { success: false, error: null },
    textTruncation: { success: false, error: null },
  };

  let processed = html;

  // Step 1: Minification. minifyHtml is read as returning
  // { success, minifiedHtml, error }; a throw is treated like a failed result.
  if (options.minifyHtml) {
    try {
      const minifyResult = minifyHtml(html, {
        removeScripts: options.removeScripts,
        removeStyles: options.removeStyles,
      });
      if (minifyResult.success) {
        processed = minifyResult.minifiedHtml;
        operationStatus.minification = { success: true, error: null };
      } else {
        operationStatus.minification = {
          success: false,
          error: minifyResult.error?.message || 'Minification failed',
        };
      }
    } catch (err) {
      operationStatus.minification = {
        success: false,
        error: describeError(err, 'Minification failed'),
      };
      console.warn('Minification failed:', err);
    }
  }

  // Step 2: Load with Cheerio. Nothing else can run without a parsed document.
  let $ = null;
  try {
    $ = cheerio.load(processed, {
      decodeEntities: false,
      xmlMode: false,
      lowerCaseTags: true,
    });
    operationStatus.cheerioLoad.success = true;
  } catch (err) {
    operationStatus.cheerioLoad.error = describeError(err);
    console.error('Cheerio load failed:', err);
    // Keep the same keys as the normal return so callers can rely on `json`.
    return { html: processed, json: '[]', status: operationStatus };
  }

  // Step 3: Remove scripts
  if (options.removeScripts) {
    runStep(operationStatus.scriptRemoval, 'Script removal', () => {
      $('script').remove();
    });
  }

  // Step 4: Remove styles
  if (options.removeStyles) {
    runStep(operationStatus.styleRemoval, 'Style removal', () => {
      $('style').remove();
      $('link[rel="stylesheet"]').remove();
    });
  }

  // Step 5: Remove media. removeMedia reports failure through its result
  // object, so it is handled separately from the throw-based steps.
  if (options.removeMedia) {
    try {
      const mediaResult = removeMedia($);
      if (mediaResult.success) {
        operationStatus.mediaRemoval.success = true;
      } else {
        operationStatus.mediaRemoval.error = describeError(
          mediaResult.error,
          'Media removal failed',
        );
        console.warn('Media removal failed:', mediaResult.error);
      }
    } catch (err) {
      operationStatus.mediaRemoval.error = describeError(err);
      console.warn('Media removal failed:', err);
    }
  }

  // Step 6: Clean head
  if (options.cleanHead) {
    runStep(operationStatus.headCleaning, 'Head cleaning', () => {
      cleanHead($);
    });
  }

  // Step 7: Handle repeating elements
  if (options.handleRepeatingElements) {
    runStep(operationStatus.repeatingElements, 'Repeating element handling', () => {
      handleRepeatingElements($);
    });
  }

  // Step 8: Truncate text
  if (options.truncateText) {
    runStep(operationStatus.textTruncation, 'Text truncation', () => {
      truncateText($, options.truncateLength);
    });
  }

  let finalHtml = '';
  try {
    finalHtml = $.html();
  } catch (err) {
    console.error('Final HTML generation failed:', err);
    finalHtml = processed;
  }

  const structure = generateStructureJson($);

  return {
    html: finalHtml,
    json: JSON.stringify(structure, null, 2),
    status: operationStatus,
  };
}

/**
 * Strips every <head> down to the metadata worth keeping.
 *
 * Removes all <link> and <script> elements inside <head>, and every <meta>
 * except: a charset declaration (<meta charset="...">), a meta whose name is
 * listed in KEPT_META_NAMES, or a meta whose property contains "og:".
 *
 * @param {Function} $ - A loaded cheerio instance; modified in place.
 */
function cleanHead($) {
  $('head').each((_, head) => {
    const $head = $(head);
    $head.find('link').remove();
    $head.find('script').remove();

    $head.find('meta').each((_index, meta) => {
      const $meta = $(meta);
      const name = $meta.attr('name')?.toLowerCase();
      const property = $meta.attr('property')?.toLowerCase();
      // The charset declaration is an attribute, not a name value, so it
      // needs its own check or it would be removed with the rest.
      const declaresCharset = $meta.attr('charset') !== undefined;
      const isKept =
        declaresCharset ||
        KEPT_META_NAMES.includes(name) ||
        Boolean(property?.includes('og:'));

      if (!isKept) {
        $meta.remove();
      }
    });
  });
}

/**
 * Collapses long runs of similar sibling elements.
 *
 * For every element with more than three children that areElementsSimilar
 * accepts, only the first child, the last child and one child near the middle
 * are kept; the others are removed.
 *
 * @param {Function} $ - A loaded cheerio instance; modified in place.
 */
function handleRepeatingElements($) {
  $('*').each((_, elem) => {
    const children = $(elem).children();
    if (children.length > 3 && areElementsSimilar(children, $)) {
      // Index of the surviving middle child within slice(1, -1), which is
      // shifted by one relative to the full child list.
      const keptInnerIndex = Math.floor(children.length / 2) - 1;
      children.slice(1, -1).each((i, child) => {
        if (i !== keptInnerIndex) {
          $(child).remove();
        }
      });
    }
  });
}

/**
 * Shortens the text of leaf elements (elements with no element children).
 *
 * Text longer than `truncateLength` is replaced by its first and last
 * truncateLength / 2 characters joined by '...'.
 *
 * @param {Function} $ - A loaded cheerio instance; modified in place.
 * @param {number} truncateLength - Maximum text length left untouched. Values
 *   that are not greater than zero make this a no-op, because they would
 *   otherwise reduce every leaf's text to '...'.
 */
function truncateText($, truncateLength) {
  if (!(truncateLength > 0)) {
    return;
  }

  const half = truncateLength / 2;
  $('*').each((_, elem) => {
    const $elem = $(elem);
    if ($elem.children().length !== 0) {
      return;
    }
    const text = $elem.text();
    if (text.length > truncateLength) {
      $elem.text(`${text.substring(0, half)}...${text.substring(text.length - half)}`);
    }
  });
}

/**
 * Decides whether a set of sibling elements looks like a repeated pattern.
 *
 * @param {object} elements - A cheerio selection of sibling elements.
 * @param {Function} $ - The cheerio instance the elements belong to.
 * @returns {boolean} True when there are at least four elements and more than
 *   70% of them share the first element's tag name and exact class attribute.
 */
function areElementsSimilar(elements, $) {
  if (elements.length < 4) {
    return false;
  }

  const firstTag = elements[0].tagName;
  const firstClasses = $(elements[0]).attr('class');

  let similarCount = 0;
  elements.each((_, elem) => {
    if (elem.tagName === firstTag && $(elem).attr('class') === firstClasses) {
      similarCount++;
    }
  });

  return similarCount / elements.length > 0.7;
}

/**
 * Builds a flat description of every element in the document.
 *
 * @param {Function} $ - A loaded cheerio instance.
 * @returns {Array<object>} One entry per element with its tag, attributes
 *   (omitted when there are none), own text (omitted when empty; shortened to
 *   the first and last 25 characters when longer than 50), number of element
 *   children and a selector from generateSelector. Returns [] when generation
 *   fails.
 */
function generateStructureJson($) {
  try {
    const structure = [];

    $('*').each((_, el) => {
      const $el = $(el);
      const attributes = { ...($el.attr() || {}) };

      // Text of this element only: clone it, drop the child elements, read what is left.
      const textContent = $el.clone().children().remove().end().text().trim();
      const truncatedText =
        textContent.length > 50
          ? `${textContent.substring(0, 25)}...${textContent.substring(textContent.length - 25)}`
          : textContent;

      structure.push({
        tag: el.tagName,
        attributes: Object.keys(attributes).length ? attributes : undefined,
        textContent: truncatedText || undefined,
        childrenCount: $el.children().length,
        selector: generateSelector($, el),
      });
    });

    return structure;
  } catch (err) {
    console.error('Structure generation failed:', err);
    return [];
  }
}

/**
 * Builds a short selector for an element: tag#id when an id is present,
 * otherwise tag.class1.class2 when classes are present, otherwise the tag.
 *
 * @param {Function} $ - The cheerio instance the element belongs to.
 * @param {object} element - The element node.
 * @returns {string} The selector, or the tag name / 'unknown' when building
 *   it fails.
 */
function generateSelector($, element) {
  try {
    const $el = $(element);
    const id = $el.attr('id');
    const className = $el.attr('class');

    let selector = element.tagName;
    if (id) {
      selector += `#${id}`;
    } else if (className) {
      // Split on whitespace and drop empty pieces so padded or blank class
      // attributes cannot produce "tag..a." style selectors.
      const classes = className.split(/\s+/).filter(Boolean);
      if (classes.length > 0) {
        selector += `.${classes.join('.')}`;
      }
    }
    return selector;
  } catch (err) {
    console.warn('Selector generation failed:', err);
    return element?.tagName || 'unknown';
  }
}

/**
 * Formats the relative reduction from `before` to `after` as a percentage.
 *
 * @param {number} before - Original amount.
 * @param {number} after - Remaining amount.
 * @returns {string} The reduction followed by '%', or 'N/A' when `before` is
 *   0 and no ratio exists.
 */
function formatReduction(before, after) {
  if (before === 0) {
    return 'N/A';
  }
  return `${(1 - after / before) * 100}%`;
}

/**
 * Compares the original and processed HTML by element count and string length.
 *
 * @param {string} html - The original HTML.
 * @param {string} processed - The processed HTML.
 * @returns {object} elementReduction and sizeReduction as percentage strings,
 *   plus the raw element counts and string lengths. When parsing fails the
 *   counts and reductions are 'N/A' and only the lengths are reported.
 */
function computeStats(html, processed) {
  try {
    const $original = cheerio.load(html);
    const $processed = cheerio.load(processed);

    const originalElementCount = $original('*').length;
    const processedElementCount = $processed('*').length;

    return {
      elementReduction: formatReduction(originalElementCount, processedElementCount),
      sizeReduction: formatReduction(html.length, processed.length),
      originalElements: originalElementCount,
      remainingElements: processedElementCount,
      originalSize: html.length,
      processedSize: processed.length,
    };
  } catch (err) {
    console.error('Stats computation failed:', err);
    return {
      elementReduction: 'N/A',
      sizeReduction: 'N/A',
      originalElements: 'N/A',
      remainingElements: 'N/A',
      originalSize: html.length,
      processedSize: processed.length,
    };
  }
}

/**
 * Checks that an extractor script has the expected shape.
 *
 * @param {string} scriptContent - The script source.
 * @throws {Error} When the script is not a string or does not contain the
 *   text 'function extract('.
 */
function validateScript(scriptContent) {
  if (typeof scriptContent !== 'string') {
    throw new Error('Script must be a string');
  }
  if (!scriptContent.includes('function extract(')) {
    throw new Error('Script must contain a function named "extract"');
  }
}

/**
 * Runs a caller-supplied extractor script against an HTML string.
 *
 * The script must define `function extract(input, cheerio)` returning an
 * object with `success`, `data` and `error` fields.
 *
 * @param {string} html - The HTML passed to the script as `input`.
 * @param {string} scriptContent - The script source.
 * @returns {{success: boolean, data: *, error: *}} The script's result, or a
 *   failure object carrying the error message when validation or execution
 *   fails.
 */
function executeCheerioScript(html, scriptContent) {
  try {
    validateScript(scriptContent);

    // SECURITY: this compiles and runs request-supplied source code inside the
    // server process with full access to it. validateScript is a shape check,
    // not a sandbox. Only expose this to trusted callers, or move execution
    // into an isolated process/container before accepting untrusted scripts.
    // The newline before `return` keeps a trailing `//` comment in the script
    // from swallowing the call.
    const extractorFunction = new Function(
      'input',
      'cheerio',
      `${scriptContent}\nreturn extract(input, cheerio);`,
    );

    const result = extractorFunction(html, cheerio);

    if (!result || typeof result !== 'object') {
      throw new Error('Extract function must return an object');
    }
    if (!('success' in result && 'data' in result && 'error' in result)) {
      throw new Error('Return object must contain success, data, and error fields');
    }

    return result;
  } catch (err) {
    // Scripts may throw non-Error values, which carry no `message`.
    return {
      success: false,
      data: null,
      error: err?.message ?? String(err),
    };
  }
}

/**
 * Reads the HTML for a request: the uploaded file when there is one,
 * otherwise the `html` body field.
 *
 * @param {object} req - The Express request.
 * @returns {string} The HTML, or '' when none was sent or the field is not a string.
 */
function readHtmlContent(req) {
  if (req.file) {
    return req.file.buffer.toString('utf8');
  }
  const html = req.body?.html;
  return typeof html === 'string' ? html : '';
}

/**
 * Interprets an option flag from a request body.
 *
 * @param {unknown} value - The raw field value.
 * @returns {boolean} True for the string 'true' (form posts) or the boolean
 *   true (JSON bodies).
 */
function isEnabled(value) {
  return value === true || value === 'true';
}

/**
 * Parses the requested truncation length.
 *
 * @param {unknown} value - The raw field value.
 * @returns {number} The parsed positive integer, or DEFAULT_TRUNCATE_LENGTH
 *   when the value is missing, not numeric, or not greater than zero.
 */
function parseTruncateLength(value) {
  const parsed = Number.parseInt(value, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_TRUNCATE_LENGTH;
}

// POST /process: reduce the submitted HTML and report statistics.
app.post('/process', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const htmlContent = readHtmlContent(req);

    if (!htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    const body = req.body ?? {};
    const options = {
      cleanHead: isEnabled(body.cleanHead),
      removeScripts: isEnabled(body.removeScripts),
      removeStyles: isEnabled(body.removeStyles),
      handleRepeatingElements: isEnabled(body.handleRepeatingElements),
      truncateText: isEnabled(body.truncateText),
      truncateLength: parseTruncateLength(body.truncateLength),
      minifyHtml: isEnabled(body.minifyHtml),
      removeMedia: isEnabled(body.removeMedia),
    };

    const processed = compressHtmlForLlm(htmlContent, options);
    const stats = computeStats(htmlContent, processed.html);

    return res.json({
      success: true,
      result: processed,
      stats: {
        processingTime: `${Date.now() - startTime}ms`,
        elementReduction: stats.elementReduction,
        sizeReduction: stats.sizeReduction,
        originalElements: stats.originalElements,
        remainingElements: stats.remainingElements,
        originalSize: `${stats.originalSize} chars`,
        processedSize: `${stats.processedSize} chars`,
      },
      options,
      operationStatus: processed.status,
    });
  } catch (err) {
    console.error('Processing failed:', err);
    return res.status(500).json({
      error: 'Internal server error.',
      details: describeError(err),
    });
  }
});

// POST /extract: run a caller-supplied extractor script against the submitted HTML.
app.post('/extract', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const htmlContent = readHtmlContent(req);
    const extractorScript = req.body?.script;

    if (!htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    if (typeof extractorScript !== 'string' || !extractorScript) {
      return res.status(400).json({ error: 'No extractor script provided.' });
    }

    const result = executeCheerioScript(htmlContent, extractorScript);

    return res.json({
      success: result.success,
      data: result.data,
      error: result.error,
      processingTime: `${Date.now() - startTime}ms`,
    });
  } catch (err) {
    console.error('Extraction failed:', err);
    return res.status(500).json({
      success: false,
      error: 'Internal server error.',
      details: describeError(err),
    });
  }
});

const PORT = process.env.PORT || 3000;

app.listen(PORT, () => {
  console.log(`Server running on http://localhost:${PORT}`);
});