// pvanand's picture
// Update index.js
// 9d9ae82 verified
const express = require('express');
const bodyParser = require('body-parser');
const multer = require('multer');
const cheerio = require('cheerio');
const { minifyHtml } = require('./minify');
const { removeMedia } = require('./removeMedia');
const app = express();

// Single source of truth for the request-size ceiling: the body parsers take
// the string form, multer takes a byte count.
const MAX_SIZE_MB = 50;
const MAX_SIZE = `${MAX_SIZE_MB}mb`;
const MAX_SIZE_BYTES = MAX_SIZE_MB * 1024 * 1024;

// Multer parses multipart/form-data uploads. No `storage` option is given;
// the route handlers read the uploaded file from `req.file.buffer`.
const upload = multer({
  limits: {
    fileSize: MAX_SIZE_BYTES, // largest accepted uploaded file
    fieldSize: MAX_SIZE_BYTES, // largest accepted non-file field value
  },
});

// Serve the static front-end assets from ./public.
app.use(express.static('public'));

// Register each body parser exactly once. Registering express.json() /
// express.urlencoded() in addition to these would add nothing: they are the
// same parsers, and a second parser skips requests that were already parsed.
app.use(bodyParser.json({ limit: MAX_SIZE }));
app.use(bodyParser.urlencoded({
  extended: true,
  limit: MAX_SIZE,
  parameterLimit: 50000,
}));
/**
 * Turns whatever a failed step produced (an Error, a string, or anything
 * else) into a message of at most 100 characters for the status report.
 *
 * @param {*} err - Thrown value or reported error.
 * @param {string} [fallback='Unknown error'] - Used when no message is available.
 * @returns {string} Message truncated to 100 characters.
 */
function describeStepError(err, fallback = 'Unknown error') {
  let message = '';
  if (typeof err === 'string') {
    message = err;
  } else if (err && typeof err.message === 'string') {
    message = err.message;
  } else if (err != null) {
    message = String(err);
  }
  return (message || fallback).substring(0, 100);
}

/**
 * Runs one best-effort DOM step, recording success or the failure message on
 * `status` and logging failures so that none of them is swallowed silently.
 *
 * @param {{success: boolean, error: (string|null)}} status - Entry to update.
 * @param {string} label - Human-readable step name used in the log line.
 * @param {Function} action - The step itself; may throw.
 */
function runCompressionStep(status, label, action) {
  try {
    action();
    status.success = true;
  } catch (err) {
    status.error = describeStepError(err);
    console.warn(`${label} failed:`, err);
  }
}

/**
 * Shrinks an HTML document by running the enabled clean-up steps in a fixed
 * order: minify, parse with cheerio, remove scripts, remove styles, remove
 * media, clean <head>, collapse repeating elements, truncate text.
 *
 * Every step is best-effort: a failure is recorded in the returned `status`
 * and the remaining steps still run. Only a cheerio parse failure stops the
 * pipeline early, because every later step needs the parsed document.
 *
 * @param {string} html - Source markup.
 * @param {object} [options] - Step switches: `minifyHtml`, `removeScripts`,
 *   `removeStyles`, `removeMedia`, `cleanHead`, `handleRepeatingElements`,
 *   `truncateText` (all truthy-enabled) and `truncateLength` (passed to
 *   truncateText).
 * @returns {{html: string, json: string, status: object}} The processed
 *   markup, a pretty-printed JSON outline of the remaining elements, and the
 *   per-step success/error report.
 */
function compressHtmlForLlm(html, options = {}) {
  const operationStatus = {
    minification: { success: false, error: null },
    cheerioLoad: { success: false, error: null },
    headCleaning: { success: false, error: null },
    scriptRemoval: { success: false, error: null },
    styleRemoval: { success: false, error: null },
    mediaRemoval: { success: false, error: null },
    repeatingElements: { success: false, error: null },
    textTruncation: { success: false, error: null },
  };
  let processed = html;

  // Step 1: Minification. A throwing minifier must not abort the pipeline;
  // the later steps simply continue with the unminified markup.
  if (options.minifyHtml) {
    try {
      const minifyResult = minifyHtml(html, {
        removeScripts: options.removeScripts,
        removeStyles: options.removeStyles,
      });
      if (minifyResult.success) {
        processed = minifyResult.minifiedHtml;
        operationStatus.minification.success = true;
      } else {
        operationStatus.minification.error =
          minifyResult.error?.message || 'Minification failed';
      }
    } catch (err) {
      operationStatus.minification.error = describeStepError(err, 'Minification failed');
      console.warn('Minification failed:', err);
    }
  }

  // Step 2: Load with Cheerio.
  let $ = null;
  try {
    $ = cheerio.load(processed, {
      decodeEntities: false,
      xmlMode: false,
      lowerCaseTags: true,
    });
    operationStatus.cheerioLoad.success = true;
  } catch (err) {
    operationStatus.cheerioLoad.error = describeStepError(err);
    console.error('Cheerio load failed:', err);
    // Same result shape as the success path, with an empty outline.
    return { html: processed, json: '[]', status: operationStatus };
  }

  // Step 3: Remove scripts.
  if (options.removeScripts) {
    runCompressionStep(operationStatus.scriptRemoval, 'Script removal', () => {
      $('script').remove();
    });
  }

  // Step 4: Remove styles.
  if (options.removeStyles) {
    runCompressionStep(operationStatus.styleRemoval, 'Style removal', () => {
      $('style').remove();
      $('link[rel="stylesheet"]').remove();
    });
  }

  // Step 5: Remove media. removeMedia reports failure through its return
  // value, whose `error` may be a string or an Error, so it is normalised
  // instead of being assumed to be a string.
  if (options.removeMedia) {
    try {
      const mediaResult = removeMedia($);
      if (mediaResult.success) {
        operationStatus.mediaRemoval.success = true;
      } else {
        operationStatus.mediaRemoval.error =
          describeStepError(mediaResult.error, 'Media removal failed');
        console.warn('Media removal failed:', mediaResult.error);
      }
    } catch (err) {
      operationStatus.mediaRemoval.error = describeStepError(err);
      console.warn('Media removal failed:', err);
    }
  }

  // Step 6: Clean head.
  if (options.cleanHead) {
    runCompressionStep(operationStatus.headCleaning, 'Head cleaning', () => {
      cleanHead($);
    });
  }

  // Step 7: Handle repeating elements.
  if (options.handleRepeatingElements) {
    runCompressionStep(operationStatus.repeatingElements, 'Repeating element handling', () => {
      handleRepeatingElements($);
    });
  }

  // Step 8: Truncate text.
  if (options.truncateText) {
    runCompressionStep(operationStatus.textTruncation, 'Text truncation', () => {
      truncateText($, options.truncateLength);
    });
  }

  // Serialise the document; fall back to the pre-DOM markup if that fails.
  let finalHtml = '';
  try {
    finalHtml = $.html();
  } catch (err) {
    console.error('Final HTML generation failed:', err);
    finalHtml = processed;
  }

  const structure = generateStructureJson($);
  return {
    html: finalHtml,
    json: JSON.stringify(structure, null, 2),
    status: operationStatus,
  };
}
/**
 * Strips every <head> down to what is useful to a reader of the page:
 * all <link> and <script> children are removed, and <meta> tags are kept
 * only when they declare the character set, carry one of the allow-listed
 * `name` values, or have a `property` containing "og:".
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 */
function cleanHead($) {
  const keptMetaNames = ['charset', 'viewport', 'description', 'keywords'];

  $('head').each((_, head) => {
    const $head = $(head);
    $head.find('link').remove();
    $head.find('script').remove();

    $head.find('meta').each((__, meta) => {
      const $meta = $(meta);
      // `<meta charset="utf-8">` stores the encoding in a `charset`
      // attribute and has no `name`, so the name allow-list alone would
      // delete it.
      const declaresCharset = $meta.attr('charset') !== undefined;
      const name = $meta.attr('name')?.toLowerCase();
      const property = $meta.attr('property')?.toLowerCase();
      const isOpenGraph = Boolean(property?.includes('og:'));

      if (!declaresCharset && !keptMetaNames.includes(name) && !isOpenGraph) {
        $meta.remove();
      }
    });
  });
}
/**
 * Collapses long runs of look-alike siblings. For every element that has
 * more than three children judged similar by areElementsSimilar, only the
 * first child, the last child and one child from the middle are kept; the
 * other children are removed from the document.
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 */
function handleRepeatingElements($) {
  $('*').each((_, node) => {
    const kids = $(node).children();
    if (kids.length <= 3 || !areElementsSimilar(kids, $)) {
      return;
    }

    // Position, inside the first/last-trimmed slice, of the one middle
    // sibling that survives.
    const survivorIndex = Math.floor(kids.length / 2) - 1;
    kids.slice(1, -1).each((innerIndex, kid) => {
      if (innerIndex === survivorIndex) {
        return;
      }
      $(kid).remove();
    });
  });
}
/**
 * Shortens the text of every childless element whose text is longer than
 * `truncateLength`, replacing it with the first and last
 * `truncateLength / 2` characters joined by "...".
 *
 * @param {Function} $ - Cheerio instance for the loaded document; mutated in place.
 * @param {number} truncateLength - Length above which text is shortened.
 */
function truncateText($, truncateLength) {
  $('*').each((_, node) => {
    const $node = $(node);
    // Only leaf elements are rewritten.
    if ($node.children().length !== 0) {
      return;
    }

    const fullText = $node.text();
    // Written as a negated ">" so a non-numeric limit leaves text untouched.
    if (!(fullText.length > truncateLength)) {
      return;
    }

    const half = truncateLength / 2;
    const head = fullText.substring(0, half);
    const tail = fullText.substring(fullText.length - half);
    $node.text(`${head}...${tail}`);
  });
}
/**
 * Decides whether a set of sibling elements is "repetitive": there are at
 * least four of them and more than 70% share both the tag name and the exact
 * `class` attribute of the first element.
 *
 * @param {object} elements - Cheerio collection of sibling elements.
 * @param {Function} $ - Cheerio instance used to read attributes.
 * @returns {boolean} True when the siblings are similar enough to collapse.
 */
function areElementsSimilar(elements, $) {
  const total = elements.length;
  if (total < 4) {
    return false;
  }

  const reference = elements[0];
  const referenceTag = reference.tagName;
  const referenceClasses = $(reference).attr('class');

  let matches = 0;
  for (let index = 0; index < total; index += 1) {
    const candidate = elements[index];
    const sameTag = candidate.tagName === referenceTag;
    if (sameTag && $(candidate).attr('class') === referenceClasses) {
      matches += 1;
    }
  }

  return matches / total > 0.7;
}
/**
 * Builds a flat outline of the document: one record per element with its tag
 * name, attributes, a preview of its own text (child elements excluded),
 * number of child elements, and a selector from generateSelector.
 *
 * Own text longer than 50 characters is previewed as its first and last 25
 * characters joined by "...". `attributes` and `textContent` are `undefined`
 * when empty. Any failure is logged and yields an empty array.
 *
 * @param {Function} $ - Cheerio instance for the loaded document.
 * @returns {Array<object>} Element records in `$('*')` iteration order.
 */
function generateStructureJson($) {
  try {
    const outline = [];

    $('*').each((_, node) => {
      const $node = $(node);

      // Copy the attribute map so the record does not alias the DOM node.
      const attrs = {};
      for (const [key, value] of Object.entries($node.attr() || {})) {
        attrs[key] = value;
      }

      // Text of this element only: clone it, drop the child elements, read.
      const ownText = $node.clone().children().remove().end().text().trim();
      let preview = ownText;
      if (ownText.length > 50) {
        preview = `${ownText.substring(0, 25)}...${ownText.substring(ownText.length - 25)}`;
      }

      const hasAttrs = Object.keys(attrs).length > 0;
      outline.push({
        tag: node.tagName,
        attributes: hasAttrs ? attrs : undefined,
        textContent: preview || undefined,
        childrenCount: $node.children().length,
        selector: generateSelector($, node),
      });
    });

    return outline;
  } catch (err) {
    console.error('Structure generation failed:', err);
    return [];
  }
}
/**
 * Builds a short CSS-style selector for an element: `tag#id` when the
 * element has an id, otherwise `tag.class1.class2` when it has class names,
 * otherwise just `tag`.
 *
 * Class names are split on any whitespace and empty pieces are dropped, so
 * padded or whitespace-only `class` values cannot produce empty segments such
 * as `div..a.`. Id and class values are inserted verbatim (not CSS-escaped).
 *
 * @param {Function} $ - Cheerio instance used to read attributes.
 * @param {object} element - DOM element node.
 * @returns {string} The selector, or the bare tag name (or 'unknown') when
 *   selector generation fails.
 */
function generateSelector($, element) {
  try {
    const $el = $(element);
    const tag = element.tagName;

    const id = $el.attr('id');
    if (id) {
      return `${tag}#${id}`;
    }

    const classAttr = $el.attr('class');
    if (classAttr) {
      const classNames = classAttr.split(/\s+/).filter(Boolean);
      if (classNames.length > 0) {
        return `${tag}.${classNames.join('.')}`;
      }
    }

    return tag;
  } catch (err) {
    console.warn('Selector generation failed:', err);
    // `element` itself may be what was invalid, so read it defensively.
    return element?.tagName || 'unknown';
  }
}
/**
 * Formats how much smaller `after` is than `before` as a percentage string.
 *
 * @param {number} before - Original quantity.
 * @param {number} after - Remaining quantity.
 * @returns {string} Percentage rounded to at most two decimals (e.g.
 *   "66.67%", "50%"), or 'N/A' when `before` is not a positive number and the
 *   ratio is therefore undefined.
 */
function formatReduction(before, after) {
  if (!(before > 0)) {
    return 'N/A';
  }
  const percent = (1 - after / before) * 100;
  // Number(...) drops the trailing zeros that toFixed adds ("50.00" -> 50).
  return `${Number(percent.toFixed(2))}%`;
}

/**
 * Compares the original and processed markup by element count and by string
 * length.
 *
 * @param {string} html - Original markup.
 * @param {string} processed - Markup after processing.
 * @returns {{elementReduction: string, sizeReduction: string,
 *   originalElements: (number|string), remainingElements: (number|string),
 *   originalSize: (number|string), processedSize: (number|string)}}
 *   Reduction figures; fields that could not be computed are 'N/A'.
 */
function computeStats(html, processed) {
  try {
    const originalElementCount = cheerio.load(html)('*').length;
    const processedElementCount = cheerio.load(processed)('*').length;
    const originalTextLength = html.length;
    const processedTextLength = processed.length;

    return {
      elementReduction: formatReduction(originalElementCount, processedElementCount),
      sizeReduction: formatReduction(originalTextLength, processedTextLength),
      originalElements: originalElementCount,
      remainingElements: processedElementCount,
      originalSize: originalTextLength,
      processedSize: processedTextLength,
    };
  } catch (err) {
    console.error('Stats computation failed:', err);
    // The inputs may be what caused the failure, so do not assume they are
    // strings when reporting their sizes.
    return {
      elementReduction: 'N/A',
      sizeReduction: 'N/A',
      originalElements: 'N/A',
      remainingElements: 'N/A',
      originalSize: typeof html === 'string' ? html.length : 'N/A',
      processedSize: typeof processed === 'string' ? processed.length : 'N/A',
    };
  }
}
/**
 * Checks that an extractor script defines something named `extract`.
 *
 * Accepted forms are a function declaration (`function extract(`, with any
 * whitespace around the name) and a variable binding (`const`/`let`/`var
 * extract = ...`). This is a textual pre-check only; it does not prove the
 * script is valid or that `extract` is callable.
 *
 * @param {string} scriptContent - Script source text.
 * @throws {TypeError} If `scriptContent` is not a string.
 * @throws {Error} If no definition of `extract` is found.
 */
function validateScript(scriptContent) {
  if (typeof scriptContent !== 'string') {
    throw new TypeError('Script must be a string');
  }

  const declaresExtract =
    /function\s+extract\s*\(/.test(scriptContent) ||
    /\b(?:const|let|var)\s+extract\s*=/.test(scriptContent);

  if (!declaresExtract) {
    throw new Error('Script must contain a function named "extract"');
  }
}
/**
 * Compiles a caller-supplied script and runs its `extract(input, cheerio)`
 * function against the given HTML.
 *
 * SECURITY: the script is compiled with `new Function` and runs inside this
 * process with access to its globals. It is not sandboxed and has no time
 * limit, so only trusted scripts should ever reach this function.
 *
 * @param {string} html - Markup passed to the script as `input`.
 * @param {string} scriptContent - Script source that defines `extract`.
 * @returns {{success: boolean, data: *, error: *}} The object returned by
 *   `extract`, or `{ success: false, data: null, error: <message> }` when
 *   validation, compilation or execution fails or the return value does not
 *   have the required shape.
 */
function executeCheerioScript(html, scriptContent) {
  try {
    validateScript(scriptContent);

    // The newlines around the script keep a trailing `//` comment in it from
    // swallowing the `return` line.
    const extractorFunction = new Function('input', 'cheerio', `
${scriptContent}
return extract(input, cheerio);
`);
    const result = extractorFunction(html, cheerio);

    if (!result || typeof result !== 'object') {
      throw new Error('Extract function must return an object');
    }
    if (!('success' in result && 'data' in result && 'error' in result)) {
      throw new Error('Return object must contain success, data, and error fields');
    }
    return result;
  } catch (err) {
    // User scripts can throw anything, not only Error instances.
    return {
      success: false,
      data: null,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
/**
 * POST /process — compresses an HTML document.
 *
 * The markup comes from the uploaded `htmlFile` when present, otherwise from
 * the `html` body field. Each processing step is enabled by sending its flag
 * as the string 'true' (form posts) or the boolean true (JSON bodies).
 * `truncateLength` must parse to a positive integer; anything else falls
 * back to 100.
 *
 * Responds 400 when no usable HTML is supplied, 500 on unexpected failure,
 * otherwise 200 with the processed result, statistics, the effective
 * options and the per-step status.
 */
app.post('/process', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};
    const htmlContent = req.file
      ? req.file.buffer.toString('utf8')
      : body.html ?? '';

    // A non-string `html` (number, array, object from a JSON or extended
    // urlencoded body) is a client error, not a server crash.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    const isEnabled = (value) => value === true || value === 'true';
    const requestedLength = Number.parseInt(body.truncateLength, 10);

    const options = {
      cleanHead: isEnabled(body.cleanHead),
      removeScripts: isEnabled(body.removeScripts),
      removeStyles: isEnabled(body.removeStyles),
      handleRepeatingElements: isEnabled(body.handleRepeatingElements),
      truncateText: isEnabled(body.truncateText),
      // NaN, zero and negative values all fall back to the default.
      truncateLength: requestedLength > 0 ? requestedLength : 100,
      minifyHtml: isEnabled(body.minifyHtml),
      removeMedia: isEnabled(body.removeMedia),
    };

    const processed = compressHtmlForLlm(htmlContent, options);
    const stats = computeStats(htmlContent, processed.html);

    return res.json({
      success: true,
      result: processed,
      stats: {
        processingTime: `${Date.now() - startTime}ms`,
        elementReduction: stats.elementReduction,
        sizeReduction: stats.sizeReduction,
        originalElements: stats.originalElements,
        remainingElements: stats.remainingElements,
        originalSize: `${stats.originalSize} chars`,
        processedSize: `${stats.processedSize} chars`,
      },
      options,
      operationStatus: processed.status,
    });
  } catch (err) {
    console.error('Processing failed:', err);
    return res.status(500).json({
      error: 'Internal server error.',
      details: String(err?.message ?? err).substring(0, 100),
    });
  }
});
/**
 * POST /extract — runs a caller-supplied extractor script against HTML.
 *
 * The markup comes from the uploaded `htmlFile` when present, otherwise from
 * the `html` body field; the script comes from the `script` body field.
 *
 * SECURITY: the script is handed to executeCheerioScript, which compiles it
 * with `new Function` and runs it inside this process. This endpoint must
 * not be exposed to untrusted clients.
 *
 * Responds 400 when the HTML or the script is missing or not a string, 500
 * on unexpected failure, otherwise 200 with the script's
 * success/data/error result and the processing time.
 */
app.post('/extract', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};
    const htmlContent = req.file
      ? req.file.buffer.toString('utf8')
      : body.html ?? '';
    const extractorScript = body.script;

    // Non-string fields (possible with JSON or extended urlencoded bodies)
    // are client errors, not server crashes.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }
    if (typeof extractorScript !== 'string' || !extractorScript) {
      return res.status(400).json({ error: 'No extractor script provided.' });
    }

    const result = executeCheerioScript(htmlContent, extractorScript);

    return res.json({
      success: result.success,
      data: result.data,
      error: result.error,
      processingTime: `${Date.now() - startTime}ms`,
    });
  } catch (err) {
    console.error('Extraction failed:', err);
    return res.status(500).json({
      success: false,
      error: 'Internal server error.',
      details: String(err?.message ?? err).substring(0, 100),
    });
  }
});
// Listen on the port named by the PORT environment variable, or on 3000 when
// it is unset or empty.
const PORT = process.env.PORT || 3000;

app.listen(PORT, function onListening() {
  console.log(`Server running on http://localhost:${PORT}`);
});