pvanand committed on
Commit
9d9ae82
·
verified ·
1 Parent(s): 705c179

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +77 -71
index.js CHANGED
@@ -6,16 +6,32 @@ const { minifyHtml } = require('./minify');
6
  const { removeMedia } = require('./removeMedia');
7
 
8
  const app = express();
9
- const upload = multer();
10
 
11
- app.use(express.static('public'));
12
- app.use(bodyParser.json());
13
- app.use(bodyParser.urlencoded({ extended: true }));
14
- // Increase JSON body parser limit
15
- app.use(express.json({limit: '50mb'}));
16
- // If you're also using URL-encoded parser
17
- app.use(express.urlencoded({limit: '50mb', extended: true}));
 
 
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  function compressHtmlForLlm(html, options = {}) {
21
  const operationStatus = {
@@ -87,24 +103,23 @@ function compressHtmlForLlm(html, options = {}) {
87
  }
88
  }
89
 
90
-
91
- // Step 5: Remove media
92
- if (options.removeMedia) {
93
  try {
94
- const mediaResult = removeMedia($);
95
- if (mediaResult.success) {
96
  operationStatus.mediaRemoval.success = true;
97
- } else {
98
  operationStatus.mediaRemoval.error = mediaResult.error.substring(0, 100);
99
  console.warn('Media removal failed:', mediaResult.error);
100
- }
101
  } catch (err) {
102
- operationStatus.mediaRemoval.error = err.message.substring(0, 100);
103
- console.warn('Media removal failed:', err);
104
  }
105
- }
106
 
107
- // Step 5: Clean head
108
  if (options.cleanHead) {
109
  try {
110
  cleanHead($);
@@ -114,7 +129,7 @@ if (options.removeMedia) {
114
  }
115
  }
116
 
117
- // Step 6: Handle repeating elements
118
  if (options.handleRepeatingElements) {
119
  try {
120
  handleRepeatingElements($);
@@ -124,7 +139,7 @@ if (options.removeMedia) {
124
  }
125
  }
126
 
127
- // Step 7: Truncate text
128
  if (options.truncateText) {
129
  try {
130
  truncateText($, options.truncateLength);
@@ -292,6 +307,47 @@ function computeStats(html, processed) {
292
  }
293
  }
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  app.post('/process', upload.single('htmlFile'), (req, res) => {
296
  try {
297
  const startTime = Date.now();
@@ -315,7 +371,6 @@ app.post('/process', upload.single('htmlFile'), (req, res) => {
315
  };
316
 
317
  const processed = compressHtmlForLlm(htmlContent, options);
318
-
319
  const stats = computeStats(htmlContent, processed.html);
320
 
321
  return res.json({
@@ -343,55 +398,6 @@ app.post('/process', upload.single('htmlFile'), (req, res) => {
343
  }
344
  });
345
 
346
- // Helper function to validate script structure
347
- function validateScript(scriptContent) {
348
- // Check if script contains function declaration
349
- if (!scriptContent.includes('function extract(')) {
350
- throw new Error('Script must contain a function named "extract"');
351
- }
352
- }
353
-
354
- // Modified function to execute the extract function
355
- function executeCheerioScript(html, scriptContent) {
356
- try {
357
- validateScript(scriptContent);
358
-
359
- // Create a safe context for the script
360
- const context = {
361
- cheerio,
362
- input: html
363
- };
364
-
365
- // Create the function from script content
366
- const extractorFunction = new Function('input', 'cheerio', `
367
- ${scriptContent}
368
- return extract(input, cheerio);
369
- `);
370
-
371
- // Execute the script with provided parameters
372
- const result = extractorFunction(html, cheerio);
373
-
374
- // Validate result structure
375
- if (!result || typeof result !== 'object') {
376
- throw new Error('Extract function must return an object');
377
- }
378
-
379
- if (!('success' in result && 'data' in result && 'error' in result)) {
380
- throw new Error('Return object must contain success, data, and error fields');
381
- }
382
-
383
- return result;
384
-
385
- } catch (err) {
386
- return {
387
- success: false,
388
- data: null,
389
- error: err.message
390
- };
391
- }
392
- }
393
-
394
- // Updated endpoint
395
  app.post('/extract', upload.single('htmlFile'), (req, res) => {
396
  try {
397
  const startTime = Date.now();
 
6
  const { removeMedia } = require('./removeMedia');
7
 
8
  const app = express();
 
9
 
10
// Single source of truth for the request/upload size limit. The byte count
// feeds multer and the string form feeds the body parsers, so the two can
// never drift apart.
const MAX_SIZE_MB = 50;
const MAX_SIZE_BYTES = MAX_SIZE_MB * 1024 * 1024;
const MAX_SIZE = `${MAX_SIZE_MB}mb`;

// Upper bound on the number of URL-encoded parameters accepted per request.
const MAX_URLENCODED_PARAMETERS = 50000;

// Configure multer with size limits for both uploaded files and text fields.
const upload = multer({
  limits: {
    fileSize: MAX_SIZE_BYTES,
    fieldSize: MAX_SIZE_BYTES,
  },
});

// Serve static assets from ./public.
app.use(express.static('public'));

// Configure body parsers with the shared limit. Each parser is registered
// once: the earlier bodyParser.json/urlencoded registrations used identical
// options, so the express.json/urlencoded pair that followed them never
// parsed anything (a body that is already parsed is skipped).
app.use(express.json({ limit: MAX_SIZE }));
app.use(express.urlencoded({
  limit: MAX_SIZE,
  extended: true,
  parameterLimit: MAX_URLENCODED_PARAMETERS,
}));
35
 
36
  function compressHtmlForLlm(html, options = {}) {
37
  const operationStatus = {
 
103
  }
104
  }
105
 
106
+ // Step 5: Remove media
107
+ if (options.removeMedia) {
 
108
  try {
109
+ const mediaResult = removeMedia($);
110
+ if (mediaResult.success) {
111
  operationStatus.mediaRemoval.success = true;
112
+ } else {
113
  operationStatus.mediaRemoval.error = mediaResult.error.substring(0, 100);
114
  console.warn('Media removal failed:', mediaResult.error);
115
+ }
116
  } catch (err) {
117
+ operationStatus.mediaRemoval.error = err.message.substring(0, 100);
118
+ console.warn('Media removal failed:', err);
119
  }
120
+ }
121
 
122
+ // Step 6: Clean head
123
  if (options.cleanHead) {
124
  try {
125
  cleanHead($);
 
129
  }
130
  }
131
 
132
+ // Step 7: Handle repeating elements
133
  if (options.handleRepeatingElements) {
134
  try {
135
  handleRepeatingElements($);
 
139
  }
140
  }
141
 
142
+ // Step 8: Truncate text
143
  if (options.truncateText) {
144
  try {
145
  truncateText($, options.truncateLength);
 
307
  }
308
  }
309
 
310
/**
 * Checks that a user-supplied script declares a function named `extract`.
 *
 * The check is purely textual: it looks for the `function` keyword followed
 * by the identifier `extract` and an opening parenthesis, with any amount of
 * whitespace between them. It does not parse the script, so a match inside a
 * comment or string literal also passes.
 *
 * @param {string} scriptContent - Source text of the script to check.
 * @returns {void}
 * @throws {TypeError} If `scriptContent` is not a string.
 * @throws {Error} If no `function extract(` declaration is found.
 */
function validateScript(scriptContent) {
  if (typeof scriptContent !== 'string') {
    // Fail with an explicit message instead of the opaque
    // "Cannot read properties of undefined" raised by calling a string
    // method on a missing value.
    throw new TypeError('Script content must be a string');
  }

  // Tolerate whitespace variations such as `function extract (`.
  const extractDeclaration = /\bfunction\s+extract\s*\(/;
  if (!extractDeclaration.test(scriptContent)) {
    throw new Error('Script must contain a function named "extract"');
  }
}
315
+
316
/**
 * Runs a user-supplied extraction script against an HTML string.
 *
 * The script text must define `function extract(input, cheerio)`. It is
 * compiled with `new Function`, then `extract` is called with the HTML and
 * the module-level `cheerio` binding. Any failure (validation, compilation,
 * a throw inside the script, or a malformed return value) is converted into
 * a failure result instead of propagating.
 *
 * SECURITY NOTE(review): `new Function` executes the script with the full
 * privileges of this process; it is NOT a sandbox. If `scriptContent` can
 * come from untrusted clients, this should be moved to an isolated runtime.
 *
 * @param {string} html - HTML passed to the script as `input`.
 * @param {string} scriptContent - Script source defining `extract`.
 * @returns {{success: *, data: *, error: *}} The object returned by
 *   `extract`, or `{ success: false, data: null, error: <message> }` when
 *   anything goes wrong.
 */
function executeCheerioScript(html, scriptContent) {
  try {
    validateScript(scriptContent);

    // Wrap the script so its `extract` declaration is in scope for the call.
    const extractorFunction = new Function('input', 'cheerio', `
      ${scriptContent}
      return extract(input, cheerio);
    `);

    const result = extractorFunction(html, cheerio);

    if (!result || typeof result !== 'object') {
      throw new Error('Extract function must return an object');
    }

    const hasRequiredFields = ['success', 'data', 'error'].every(
      (field) => field in result
    );
    if (!hasRequiredFields) {
      throw new Error('Return object must contain success, data, and error fields');
    }

    return result;
  } catch (err) {
    // Scripts may throw non-Error values (e.g. a bare string); fall back to
    // their string form so `error` is never undefined.
    const message = err instanceof Error ? err.message : String(err);
    return {
      success: false,
      data: null,
      error: message,
    };
  }
}
350
+
351
  app.post('/process', upload.single('htmlFile'), (req, res) => {
352
  try {
353
  const startTime = Date.now();
 
371
  };
372
 
373
  const processed = compressHtmlForLlm(htmlContent, options);
 
374
  const stats = computeStats(htmlContent, processed.html);
375
 
376
  return res.json({
 
398
  }
399
  });
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  app.post('/extract', upload.single('htmlFile'), (req, res) => {
402
  try {
403
  const startTime = Date.now();