Update public/index.html
Browse files- public/index.html +555 -319
public/index.html
CHANGED
@@ -1,354 +1,590 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
<div class="option-item">
|
17 |
-
<input type="checkbox" id="cleanHead" name="cleanHead" checked>
|
18 |
-
<label for="cleanHead">Clean head section</label>
|
19 |
-
</div>
|
20 |
-
<div class="option-item">
|
21 |
-
<input type="checkbox" id="removeScripts" name="removeScripts" checked>
|
22 |
-
<label for="removeScripts">Remove scripts</label>
|
23 |
-
</div>
|
24 |
-
<div class="option-item">
|
25 |
-
<input type="checkbox" id="removeStyles" name="removeStyles" checked>
|
26 |
-
<label for="removeStyles">Remove styles</label>
|
27 |
-
</div>
|
28 |
-
<div class="option-item">
|
29 |
-
<input type="checkbox" id="handleRepeatingElements" name="handleRepeatingElements" checked>
|
30 |
-
<label for="handleRepeatingElements">Handle repeating elements</label>
|
31 |
-
</div>
|
32 |
-
<div class="option-item">
|
33 |
-
<input type="checkbox" id="truncateText" name="truncateText" checked>
|
34 |
-
<label for="truncateText">Truncate text</label>
|
35 |
-
</div>
|
36 |
-
<div class="option-item">
|
37 |
-
<label for="truncateLength">Max text length:</label>
|
38 |
-
<input type="number" id="truncateLength" name="truncateLength" value="100" min="10" max="1000">
|
39 |
-
</div>
|
40 |
-
<div class="option-item">
|
41 |
-
<input type="checkbox" id="minifyHtml" name="minifyHtml" checked>
|
42 |
-
<label for="minifyHtml">Minify HTML</label>
|
43 |
-
</div>
|
44 |
-
<div class="option-item">
|
45 |
-
<input type="checkbox" id="removeMedia" name="removeMedia" checked>
|
46 |
-
<label for="removeMedia">Remove media</label>
|
47 |
-
</div>
|
48 |
-
</div>
|
49 |
-
</div>
|
50 |
-
|
51 |
-
<div class="extraction-container">
|
52 |
-
<h3>Data Extraction</h3>
|
53 |
-
<textarea
|
54 |
-
id="extractionQuery"
|
55 |
-
placeholder="Enter your extraction query (e.g., 'extract product title and price')"
|
56 |
-
></textarea>
|
57 |
-
<div class="button-group">
|
58 |
-
<button type="button" id="generateScript">Generate Extraction Script</button>
|
59 |
-
<button type="button" id="executeExtraction" disabled>Execute Extraction</button>
|
60 |
-
</div>
|
61 |
-
</div>
|
62 |
-
|
63 |
-
<div class="button-group">
|
64 |
-
<input type="file" accept=".html,.htm" id="fileInput">
|
65 |
-
<button type="submit">Process HTML</button>
|
66 |
-
</div>
|
67 |
-
</form>
|
68 |
-
|
69 |
-
<div id="operationStatus" class="operation-status" style="display: none;">
|
70 |
-
<h3>Operation Status</h3>
|
71 |
-
<div class="status-grid"></div>
|
72 |
-
</div>
|
73 |
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
</div>
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
</div>
|
103 |
-
</div>
|
104 |
-
<div class="result-content">
|
105 |
-
<pre><code class="language-json" id="jsonOutput"></code></pre>
|
106 |
-
</div>
|
107 |
-
</div>
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
<button onclick="downloadResult('extraction')">Download</button>
|
115 |
-
</div>
|
116 |
-
</div>
|
117 |
-
<div class="result-content">
|
118 |
-
<pre><code class="language-json" id="extractionOutput"></code></pre>
|
119 |
-
</div>
|
120 |
-
</div>
|
121 |
-
</div>
|
122 |
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
const API_KEY = 'ae54a922-ed3a-4634-be4a-4e4dd470800a';
|
132 |
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
const resultsContainer = document.querySelector('.results-container');
|
139 |
-
const statsContainer = document.getElementById('stats');
|
140 |
-
const copyFeedback = document.querySelector('.copy-feedback');
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
// Update tabs
|
146 |
-
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
|
147 |
-
tab.classList.add('active');
|
148 |
-
|
149 |
-
// Update views
|
150 |
-
const view = tab.dataset.view;
|
151 |
-
document.getElementById('htmlView').style.display = view === 'html' ? 'block' : 'none';
|
152 |
-
document.getElementById('jsonView').style.display = view === 'json' ? 'block' : 'none';
|
153 |
-
document.getElementById('extractionView').style.display = view === 'extraction' ? 'block' : 'none';
|
154 |
-
});
|
155 |
-
});
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
reader.onload = (e) => htmlInput.value = e.target.result;
|
163 |
-
reader.readAsText(file);
|
164 |
-
}
|
165 |
-
});
|
166 |
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
'x-api-key': API_KEY,
|
183 |
-
'Content-Type': 'application/json',
|
184 |
-
},
|
185 |
-
body: JSON.stringify({
|
186 |
-
html: htmlContent,
|
187 |
-
user_input: userInput
|
188 |
-
})
|
189 |
-
});
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
document.getElementById('jsonOutput').textContent = JSON.stringify({
|
199 |
-
cheerio_script: currentCheerioScript
|
200 |
-
}, null, 2);
|
201 |
-
Prism.highlightAll();
|
202 |
-
} else {
|
203 |
-
alert('Failed to generate extraction script');
|
204 |
-
}
|
205 |
-
} catch (error) {
|
206 |
-
alert('Error generating script: ' + error.message);
|
207 |
-
}
|
208 |
}
|
209 |
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
}
|
216 |
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
body: JSON.stringify({
|
226 |
-
html: htmlContent,
|
227 |
-
script: currentCheerioScript
|
228 |
-
})
|
229 |
-
});
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
Prism.highlightAll();
|
236 |
-
|
237 |
-
// Switch to extraction view
|
238 |
-
document.querySelector('[data-view="extraction"]').click();
|
239 |
-
} catch (error) {
|
240 |
-
alert('Error executing extraction: ' + error.message);
|
241 |
-
}
|
242 |
}
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
try {
|
256 |
-
const response = await fetch('/process', {
|
257 |
-
method: 'POST',
|
258 |
-
body: formData,
|
259 |
-
});
|
260 |
-
|
261 |
-
const data = await response.json();
|
262 |
-
|
263 |
-
if (data.error) {
|
264 |
-
alert(data.error);
|
265 |
-
return;
|
266 |
-
}
|
267 |
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
.map(([operation, status]) => `
|
275 |
-
<div class="status-item">
|
276 |
-
<div class="status-icon ${status.success ? 'status-success' : 'status-error'}">
|
277 |
-
${status.success ? '✓' : '✗'}
|
278 |
-
</div>
|
279 |
-
<div>
|
280 |
-
<div>${formatLabel(operation)}</div>
|
281 |
-
${status.error ? `<div class="status-message">Error: ${status.error}</div>` : ''}
|
282 |
-
</div>
|
283 |
-
</div>
|
284 |
-
`).join('');
|
285 |
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
<div class="stat-value">${value}</div>
|
293 |
-
</div>
|
294 |
-
`).join('');
|
295 |
|
296 |
-
|
297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
}
|
309 |
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
|
314 |
-
|
315 |
-
|
316 |
-
return key
|
317 |
-
.replace(/([A-Z])/g, ' $1')
|
318 |
-
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
319 |
-
.toLowerCase()
|
320 |
-
.replace(/^./, str => str.toUpperCase())
|
321 |
-
.replace('Html', 'HTML');
|
322 |
}
|
323 |
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
showCopyFeedback();
|
329 |
-
} catch (err) {
|
330 |
-
alert('Failed to copy to clipboard');
|
331 |
-
}
|
332 |
}
|
333 |
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
const url = URL.createObjectURL(blob);
|
338 |
-
const a = document.createElement('a');
|
339 |
-
a.href = url;
|
340 |
-
a.download = `compressed.${type}`;
|
341 |
-
document.body.appendChild(a);
|
342 |
-
a.click();
|
343 |
-
document.body.removeChild(a);
|
344 |
-
URL.revokeObjectURL(url);
|
345 |
}
|
346 |
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
copyFeedback.style.display = 'none';
|
351 |
-
}, 2000);
|
352 |
}
|
353 |
-
|
354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<meta charset="utf-8"/>
|
5 |
+
<title>HTML Compressor for LLM</title>
|
6 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.24.1/themes/prism.min.css">
|
7 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.24.1/plugins/toolbar/prism-toolbar.min.css">
|
8 |
+
<style>
|
9 |
+
:root {
|
10 |
+
--primary-color: #007bff;
|
11 |
+
--secondary-color: #6c757d;
|
12 |
+
--success-color: #28a745;
|
13 |
+
--border-color: #dee2e6;
|
14 |
+
--background-color: #f8f9fa;
|
15 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
+
body {
|
18 |
+
font-family: system-ui, -apple-system, sans-serif;
|
19 |
+
line-height: 1.6;
|
20 |
+
margin: 0;
|
21 |
+
padding: 20px;
|
22 |
+
background: var(--background-color);
|
23 |
+
}
|
24 |
|
25 |
+
.container {
|
26 |
+
max-width: 1200px;
|
27 |
+
margin: 0 auto;
|
28 |
+
background: white;
|
29 |
+
padding: 30px;
|
30 |
+
border-radius: 8px;
|
31 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
32 |
+
}
|
33 |
|
34 |
+
textarea {
|
35 |
+
width: 100%;
|
36 |
+
height: 200px;
|
37 |
+
padding: 12px;
|
38 |
+
border: 1px solid var(--border-color);
|
39 |
+
border-radius: 4px;
|
40 |
+
font-family: 'Monaco', 'Menlo', monospace;
|
41 |
+
font-size: 14px;
|
42 |
+
resize: vertical;
|
43 |
+
margin-bottom: 15px;
|
44 |
+
}
|
|
|
45 |
|
46 |
+
.options-container {
|
47 |
+
background: var(--background-color);
|
48 |
+
padding: 20px;
|
49 |
+
border-radius: 8px;
|
50 |
+
margin: 20px 0;
|
51 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
+
.option-grid {
|
54 |
+
display: grid;
|
55 |
+
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
56 |
+
gap: 15px;
|
57 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
+
.option-item {
|
60 |
+
display: flex;
|
61 |
+
align-items: center;
|
62 |
+
gap: 10px;
|
63 |
+
}
|
64 |
|
65 |
+
.button-group {
|
66 |
+
display: flex;
|
67 |
+
gap: 10px;
|
68 |
+
margin: 15px 0;
|
69 |
+
}
|
|
|
70 |
|
71 |
+
button {
|
72 |
+
background: var(--primary-color);
|
73 |
+
color: white;
|
74 |
+
padding: 8px 16px;
|
75 |
+
border: none;
|
76 |
+
border-radius: 4px;
|
77 |
+
cursor: pointer;
|
78 |
+
transition: background 0.2s;
|
79 |
+
}
|
80 |
|
81 |
+
button:hover {
|
82 |
+
background: #0056b3;
|
83 |
+
}
|
|
|
|
|
|
|
84 |
|
85 |
+
.results-container {
|
86 |
+
margin-top: 30px;
|
87 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
+
.results-tabs {
|
90 |
+
display: flex;
|
91 |
+
gap: 10px;
|
92 |
+
margin-bottom: 15px;
|
93 |
+
}
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
.tab {
|
96 |
+
padding: 8px 16px;
|
97 |
+
cursor: pointer;
|
98 |
+
border: 1px solid var(--border-color);
|
99 |
+
border-radius: 4px;
|
100 |
+
transition: all 0.2s;
|
101 |
+
}
|
102 |
|
103 |
+
.tab.active {
|
104 |
+
background: var(--primary-color);
|
105 |
+
color: white;
|
106 |
+
}
|
107 |
|
108 |
+
.result-panel {
|
109 |
+
border: 1px solid var(--border-color);
|
110 |
+
border-radius: 4px;
|
111 |
+
overflow: hidden;
|
112 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
+
.result-header {
|
115 |
+
display: flex;
|
116 |
+
justify-content: space-between;
|
117 |
+
align-items: center;
|
118 |
+
padding: 10px;
|
119 |
+
background: var(--background-color);
|
120 |
+
border-bottom: 1px solid var(--border-color);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
}
|
122 |
|
123 |
+
.result-content {
|
124 |
+
padding: 15px;
|
125 |
+
overflow: auto;
|
126 |
+
max-height: 500px;
|
127 |
+
}
|
|
|
128 |
|
129 |
+
.stats-grid {
|
130 |
+
display: grid;
|
131 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
132 |
+
gap: 15px;
|
133 |
+
margin: 20px 0;
|
134 |
+
}
|
135 |
|
136 |
+
.stat-item {
|
137 |
+
background: white;
|
138 |
+
padding: 15px;
|
139 |
+
border-radius: 4px;
|
140 |
+
border: 1px solid var(--border-color);
|
141 |
+
}
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
+
.stat-value {
|
144 |
+
font-size: 1.2em;
|
145 |
+
font-weight: bold;
|
146 |
+
color: var(--primary-color);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
}
|
148 |
|
149 |
+
.copy-feedback {
|
150 |
+
position: fixed;
|
151 |
+
bottom: 20px;
|
152 |
+
right: 20px;
|
153 |
+
background: var(--success-color);
|
154 |
+
color: white;
|
155 |
+
padding: 10px 20px;
|
156 |
+
border-radius: 4px;
|
157 |
+
display: none;
|
158 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
+
.operation-status {
|
161 |
+
margin: 20px 0;
|
162 |
+
padding: 15px;
|
163 |
+
border: 1px solid var(--border-color);
|
164 |
+
border-radius: 4px;
|
165 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
+
.status-grid {
|
168 |
+
display: grid;
|
169 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
170 |
+
gap: 10px;
|
171 |
+
margin-top: 10px;
|
172 |
+
}
|
|
|
|
|
|
|
173 |
|
174 |
+
.status-item {
|
175 |
+
display: flex;
|
176 |
+
align-items: center;
|
177 |
+
gap: 8px;
|
178 |
+
padding: 8px;
|
179 |
+
border-radius: 4px;
|
180 |
+
background: var(--background-color);
|
181 |
+
}
|
182 |
|
183 |
+
.status-icon {
|
184 |
+
width: 20px;
|
185 |
+
height: 20px;
|
186 |
+
border-radius: 50%;
|
187 |
+
display: flex;
|
188 |
+
align-items: center;
|
189 |
+
justify-content: center;
|
190 |
+
color: white;
|
191 |
+
font-size: 12px;
|
192 |
+
}
|
193 |
|
194 |
+
.status-success {
|
195 |
+
background: var(--success-color);
|
196 |
+
}
|
197 |
|
198 |
+
.status-error {
|
199 |
+
background: #dc3545;
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
}
|
201 |
|
202 |
+
.status-message {
|
203 |
+
font-size: 0.9em;
|
204 |
+
color: #666;
|
205 |
+
margin-top: 4px;
|
|
|
|
|
|
|
|
|
206 |
}
|
207 |
|
208 |
+
pre {
|
209 |
+
margin: 0;
|
210 |
+
border-radius: 4px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
}
|
212 |
|
213 |
+
code {
|
214 |
+
font-family: 'Monaco', 'Menlo', monospace;
|
215 |
+
font-size: 14px;
|
|
|
|
|
216 |
}
|
217 |
+
/* Add to the existing style section */
|
218 |
+
.extraction-container {
|
219 |
+
margin: 20px 0;
|
220 |
+
padding: 20px;
|
221 |
+
background: var(--background-color);
|
222 |
+
border-radius: 8px;
|
223 |
+
}
|
224 |
+
|
225 |
+
.extraction-container textarea {
|
226 |
+
height: 100px;
|
227 |
+
margin-bottom: 10px;
|
228 |
+
}
|
229 |
+
|
230 |
+
#executeExtraction:disabled {
|
231 |
+
background: var(--secondary-color);
|
232 |
+
cursor: not-allowed;
|
233 |
+
}
|
234 |
+
</style>
|
235 |
+
</head>
|
236 |
+
<body>
|
237 |
+
<div class="container">
|
238 |
+
<h1>HTML Compressor for LLM</h1>
|
239 |
+
<p>Compress HTML content for optimal LLM processing while preserving essential structure.</p>
|
240 |
+
|
241 |
+
<form id="compressorForm">
|
242 |
+
<textarea
|
243 |
+
name="html"
|
244 |
+
id="htmlInput"
|
245 |
+
placeholder="Paste your HTML here or upload a file..."
|
246 |
+
></textarea>
|
247 |
+
|
248 |
+
<div class="options-container">
|
249 |
+
<h3>Compression Options</h3>
|
250 |
+
<div class="option-grid">
|
251 |
+
<div class="option-item">
|
252 |
+
<input type="checkbox" id="cleanHead" name="cleanHead" checked>
|
253 |
+
<label for="cleanHead">Clean head section</label>
|
254 |
+
</div>
|
255 |
+
<div class="option-item">
|
256 |
+
<input type="checkbox" id="removeScripts" name="removeScripts" checked>
|
257 |
+
<label for="removeScripts">Remove scripts</label>
|
258 |
+
</div>
|
259 |
+
<div class="option-item">
|
260 |
+
<input type="checkbox" id="removeStyles" name="removeStyles" checked>
|
261 |
+
<label for="removeStyles">Remove styles</label>
|
262 |
+
</div>
|
263 |
+
<div class="option-item">
|
264 |
+
<input type="checkbox" id="handleRepeatingElements" name="handleRepeatingElements" checked>
|
265 |
+
<label for="handleRepeatingElements">Handle repeating elements</label>
|
266 |
+
</div>
|
267 |
+
<div class="option-item">
|
268 |
+
<input type="checkbox" id="truncateText" name="truncateText" checked>
|
269 |
+
<label for="truncateText">Truncate text</label>
|
270 |
+
</div>
|
271 |
+
<div class="option-item">
|
272 |
+
<label for="truncateLength">Max text length:</label>
|
273 |
+
<input type="number" id="truncateLength" name="truncateLength" value="100" min="10" max="1000">
|
274 |
+
</div>
|
275 |
+
<div class="option-item">
|
276 |
+
<input type="checkbox" id="minifyHtml" name="minifyHtml" checked>
|
277 |
+
<label for="minifyHtml">Minify HTML</label>
|
278 |
+
</div>
|
279 |
+
<div class="option-item">
|
280 |
+
<input type="checkbox" id="removeMedia" name="removeMedia" checked>
|
281 |
+
<label for="removeMedia">Remove media</label>
|
282 |
+
</div>
|
283 |
+
</div>
|
284 |
+
</div>
|
285 |
+
|
286 |
+
<div class="extraction-container">
|
287 |
+
<h3>Data Extraction</h3>
|
288 |
+
<textarea
|
289 |
+
id="extractionQuery"
|
290 |
+
placeholder="Enter your extraction query (e.g., 'extract product title and price')"
|
291 |
+
></textarea>
|
292 |
+
<div class="button-group">
|
293 |
+
<button type="button" id="generateScript">Generate Extraction Script</button>
|
294 |
+
<button type="button" id="executeExtraction" disabled>Execute Extraction</button>
|
295 |
+
</div>
|
296 |
+
</div>
|
297 |
+
|
298 |
+
<div class="button-group">
|
299 |
+
<input type="file" accept=".html,.htm" id="fileInput">
|
300 |
+
<button type="submit">Process HTML</button>
|
301 |
+
</div>
|
302 |
+
</form>
|
303 |
+
|
304 |
+
<div id="operationStatus" class="operation-status" style="display: none;">
|
305 |
+
<h3>Operation Status</h3>
|
306 |
+
<div class="status-grid"></div>
|
307 |
+
</div>
|
308 |
+
|
309 |
+
<div id="stats" class="stats-grid" style="display: none;"></div>
|
310 |
+
|
311 |
+
<div class="results-container" style="display: none;">
|
312 |
+
<div class="results-tabs">
|
313 |
+
<div class="tab active" data-view="html">Compressed HTML</div>
|
314 |
+
<div class="tab" data-view="json">JSON Structure</div>
|
315 |
+
<div class="tab" data-view="extraction">Extraction Results</div>
|
316 |
+
</div>
|
317 |
+
|
318 |
+
<div class="result-panel" id="htmlView">
|
319 |
+
<div class="result-header">
|
320 |
+
<h3>HTML Output</h3>
|
321 |
+
<div class="button-group">
|
322 |
+
<button onclick="copyResult('html')">Copy</button>
|
323 |
+
<button onclick="downloadResult('html')">Download</button>
|
324 |
+
</div>
|
325 |
+
</div>
|
326 |
+
<div class="result-content">
|
327 |
+
<pre><code class="language-html" id="htmlOutput"></code></pre>
|
328 |
+
</div>
|
329 |
+
</div>
|
330 |
+
|
331 |
+
<div class="result-panel" id="jsonView" style="display: none;">
|
332 |
+
<div class="result-header">
|
333 |
+
<h3>JSON Structure</h3>
|
334 |
+
<div class="button-group">
|
335 |
+
<button onclick="copyResult('json')">Copy</button>
|
336 |
+
<button onclick="downloadResult('json')">Download</button>
|
337 |
+
</div>
|
338 |
+
</div>
|
339 |
+
<div class="result-content">
|
340 |
+
<pre><code class="language-json" id="jsonOutput"></code></pre>
|
341 |
+
</div>
|
342 |
+
</div>
|
343 |
+
|
344 |
+
<div class="result-panel" id="extractionView" style="display: none;">
|
345 |
+
<div class="result-header">
|
346 |
+
<h3>Extraction Results</h3>
|
347 |
+
<div class="button-group">
|
348 |
+
<button onclick="copyResult('extraction')">Copy</button>
|
349 |
+
<button onclick="downloadResult('extraction')">Download</button>
|
350 |
+
</div>
|
351 |
+
</div>
|
352 |
+
<div class="result-content">
|
353 |
+
<pre><code class="language-json" id="extractionOutput"></code></pre>
|
354 |
+
</div>
|
355 |
+
</div>
|
356 |
+
</div>
|
357 |
+
|
358 |
+
<div class="copy-feedback">Copied to clipboard!</div>
|
359 |
+
</div>
|
360 |
+
|
361 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.24.1/prism.min.js"></script>
|
362 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.24.1/components/prism-markup.min.js"></script>
|
363 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.24.1/components/prism-json.min.js"></script>
|
364 |
+
<script>
|
365 |
+
const API_URL = 'https://elevatics-ai-web-scraper-chat.hf.space/api/v1/generate-cheerio-script';
|
366 |
+
// NOTE(review): this key is embedded in a script served to the browser and is
// sent from the page as the `x-api-key` header, so anyone who loads the page
// can read it. Consider routing the request through the backend and rotating
// this key.
const API_KEY = 'ae54a922-ed3a-4634-be4a-4e4dd470800a';
|
367 |
+
|
368 |
+
let currentCheerioScript = null;
|
369 |
+
|
370 |
+
const form = document.getElementById('compressorForm');
|
371 |
+
const fileInput = document.getElementById('fileInput');
|
372 |
+
const htmlInput = document.getElementById('htmlInput');
|
373 |
+
const resultsContainer = document.querySelector('.results-container');
|
374 |
+
const statsContainer = document.getElementById('stats');
|
375 |
+
const copyFeedback = document.querySelector('.copy-feedback');
|
376 |
+
|
377 |
+
// Tab switching
|
378 |
+
// Wire every tab so that clicking it marks it active and shows only the
// result panel whose id is "<data-view>View".
for (const tabEl of document.querySelectorAll('.tab')) {
    tabEl.addEventListener('click', () => {
        // Move the "active" marker to the clicked tab.
        for (const other of document.querySelectorAll('.tab')) {
            other.classList.remove('active');
        }
        tabEl.classList.add('active');

        // Show the matching panel and hide the other two.
        const selectedView = tabEl.dataset.view;
        for (const viewName of ['html', 'json', 'extraction']) {
            const panel = document.getElementById(`${viewName}View`);
            panel.style.display = viewName === selectedView ? 'block' : 'none';
        }
    });
}
|
391 |
+
|
392 |
+
// File input handler
|
393 |
+
// When a file is chosen, read it as text and place its contents in the
// HTML input textarea.
fileInput.addEventListener('change', (changeEvent) => {
    const selectedFile = changeEvent.target.files[0];
    if (!selectedFile) {
        return;
    }

    const textReader = new FileReader();
    textReader.onload = (loadEvent) => {
        htmlInput.value = loadEvent.target.result;
    };
    textReader.readAsText(selectedFile);
});
|
401 |
+
|
402 |
+
// Cheerio script generation
|
403 |
+
/**
 * Request a Cheerio extraction script for the processed HTML and the user's
 * extraction query.
 *
 * Reads the processed HTML from #htmlOutput and the query from
 * #extractionQuery, POSTs both to API_URL, and on a `status: 'success'`
 * response stores the script in `currentCheerioScript`, enables the
 * "Execute Extraction" button and shows the script in the JSON view.
 * Failures are reported with alert(); nothing is returned.
 */
async function generateCheerioScript() {
    const htmlContent = document.getElementById('htmlOutput').textContent;
    // Trim so that a query made only of whitespace counts as empty.
    const userInput = document.getElementById('extractionQuery').value.trim();

    if (!htmlContent || !userInput) {
        alert('Please process HTML and enter an extraction query first');
        return;
    }

    try {
        const response = await fetch(API_URL, {
            method: 'POST',
            headers: {
                'accept': 'application/json',
                'x-api-key': API_KEY,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                html: htmlContent,
                user_input: userInput
            })
        });

        // Surface HTTP failures explicitly instead of trying to parse an
        // error page as JSON and reporting a confusing parse error.
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }

        const data = await response.json();

        if (data.status === 'success') {
            currentCheerioScript = data.cheerio_script;
            document.getElementById('executeExtraction').disabled = false;

            // Show the script in the JSON view
            document.getElementById('jsonOutput').textContent = JSON.stringify({
                cheerio_script: currentCheerioScript
            }, null, 2);
            Prism.highlightAll();
        } else {
            alert('Failed to generate extraction script');
        }
    } catch (error) {
        alert('Error generating script: ' + error.message);
    }
}
|
444 |
+
|
445 |
+
// Execute extraction
|
446 |
+
/**
 * Run the previously generated Cheerio script against the processed HTML.
 *
 * POSTs the contents of #htmlOutput together with `currentCheerioScript` to
 * /extract, renders the JSON response in #extractionOutput and switches to
 * the extraction tab. Failures are reported with alert(); nothing is
 * returned.
 */
async function executeExtraction() {
    if (!currentCheerioScript) {
        alert('Please generate an extraction script first');
        return;
    }

    const htmlContent = document.getElementById('htmlOutput').textContent;

    // Nothing to extract from if the processed HTML is empty.
    if (!htmlContent) {
        alert('Please process HTML first');
        return;
    }

    try {
        const response = await fetch('/extract', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({
                html: htmlContent,
                script: currentCheerioScript
            })
        });

        // Do not present the body of a failed request as an extraction result.
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }

        const data = await response.json();

        document.getElementById('extractionOutput').textContent =
            JSON.stringify(data, null, 2);
        Prism.highlightAll();

        // Switch to extraction view
        document.querySelector('[data-view="extraction"]').click();
    } catch (error) {
        alert('Error executing extraction: ' + error.message);
    }
}
|
478 |
+
|
479 |
+
// Form submission
|
480 |
+
// Submit handler: POST the form to /process, then render the per-operation
// status, the statistics and the compressed HTML/JSON outputs.
form.addEventListener('submit', async (e) => {
    e.preventDefault();

    // Values from the server response are interpolated into innerHTML below,
    // so they are escaped first to keep any markup they contain inert.
    const HTML_ESCAPES = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    const escapeHtml = (value) =>
        String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    const formData = new FormData(form);

    // Add checkbox states (each is sent explicitly as its checked value)
    document.querySelectorAll('input[type="checkbox"]').forEach(checkbox => {
        formData.set(checkbox.name, checkbox.checked);
    });

    try {
        const response = await fetch('/process', {
            method: 'POST',
            body: formData,
        });

        const data = await response.json();

        if (data.error) {
            alert(data.error);
            return;
        }

        // A failed request without an `error` field must not be rendered as
        // if it were a successful result.
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }

        // Display operation status
        const statusContainer = document.querySelector('#operationStatus');
        const statusGrid = statusContainer.querySelector('.status-grid');
        statusContainer.style.display = 'block';

        statusGrid.innerHTML = Object.entries(data.operationStatus || {})
            .map(([operation, status]) => `
                <div class="status-item">
                    <div class="status-icon ${status.success ? 'status-success' : 'status-error'}">
                        ${status.success ? '✓' : '✗'}
                    </div>
                    <div>
                        <div>${escapeHtml(formatLabel(operation))}</div>
                        ${status.error ? `<div class="status-message">Error: ${escapeHtml(status.error)}</div>` : ''}
                    </div>
                </div>
            `).join('');

        // Display stats
        statsContainer.style.display = 'grid';
        statsContainer.innerHTML = Object.entries(data.stats || {})
            .map(([key, value]) => `
                <div class="stat-item">
                    <div class="stat-label">${escapeHtml(formatLabel(key))}</div>
                    <div class="stat-value">${escapeHtml(value)}</div>
                </div>
            `).join('');

        // Show results container
        resultsContainer.style.display = 'block';

        // Update outputs with syntax highlighting
        document.getElementById('htmlOutput').textContent = data.result.html;
        document.getElementById('jsonOutput').textContent = data.result.json;

        // Trigger Prism highlighting
        Prism.highlightAll();
    } catch (err) {
        alert('Error processing HTML: ' + err.message);
    }
});
|
544 |
+
|
545 |
+
// Event listeners for extraction
|
546 |
+
document.getElementById('generateScript').addEventListener('click', generateCheerioScript);
|
547 |
+
document.getElementById('executeExtraction').addEventListener('click', executeExtraction);
|
548 |
+
|
549 |
+
// Utility functions
|
550 |
+
/**
 * Turn a camelCase key into a human-readable, sentence-case label.
 *
 * Examples: "removeScripts" -> "Remove scripts", "minifyHtml" -> "Minify HTML",
 * "originalHTMLSize" -> "Original HTML size".
 *
 * @param {string} key - camelCase (or PascalCase) identifier.
 * @returns {string} Label with words separated by spaces, the first letter
 *   capitalised and the word "html" written as "HTML" wherever it occurs.
 */
function formatLabel(key) {
    return String(key)
        // Word boundary between a lowercase letter or digit and a capital.
        .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
        // Keep acronym runs together: "HTMLSize" -> "HTML Size".
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
        .trim()
        .toLowerCase()
        .replace(/^./, (first) => first.toUpperCase())
        // Case-insensitive and global so "html" is fixed at any position,
        // not only when it happens to be the capitalised first word.
        .replace(/\bhtml\b/gi, 'HTML');
}
|
558 |
+
|
559 |
+
/**
 * Copy the text of the "<type>Output" element to the clipboard.
 *
 * Shows the copy-feedback toast on success and an alert on failure.
 *
 * @param {string} type - Output prefix: 'html', 'json' or 'extraction'.
 */
async function copyResult(type) {
    const outputId = `${type}Output`;
    const textToCopy = document.getElementById(outputId).textContent;

    try {
        await navigator.clipboard.writeText(textToCopy);
        showCopyFeedback();
    } catch (copyError) {
        alert('Failed to copy to clipboard');
    }
}
|
568 |
+
|
569 |
+
/**
 * Download the text of the "<type>Output" element as a file.
 *
 * @param {string} type - Output prefix: 'html', 'json' or 'extraction'.
 *   Known types get a proper file name and MIME type; the extraction view
 *   holds JSON, so it is saved as "extraction.json" rather than with a
 *   meaningless ".extraction" extension. Any other type falls back to
 *   "compressed.<type>" as plain text.
 */
function downloadResult(type) {
    const knownTargets = {
        html: { filename: 'compressed.html', mime: 'text/html' },
        json: { filename: 'compressed.json', mime: 'application/json' },
        extraction: { filename: 'extraction.json', mime: 'application/json' }
    };
    const target = Object.prototype.hasOwnProperty.call(knownTargets, type)
        ? knownTargets[type]
        : { filename: `compressed.${type}`, mime: 'text/plain' };

    const content = document.getElementById(`${type}Output`).textContent;
    const blob = new Blob([content], { type: target.mime });
    const url = URL.createObjectURL(blob);

    // A temporary anchor is attached, clicked to start the download, and
    // removed again.
    const a = document.createElement('a');
    a.href = url;
    a.download = target.filename;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
}
|
581 |
+
|
582 |
+
/**
 * Show the copy-feedback toast and hide it again two seconds later.
 *
 * The pending hide timer is stored on the function itself so that a second
 * call within the two-second window restarts the countdown, instead of the
 * toast being hidden early by the timer scheduled for the first call.
 */
function showCopyFeedback() {
    clearTimeout(showCopyFeedback.hideTimer);
    copyFeedback.style.display = 'block';
    showCopyFeedback.hideTimer = setTimeout(() => {
        copyFeedback.style.display = 'none';
    }, 2000);
}
|
588 |
+
</script>
|
589 |
+
</body>
|
590 |
+
</html>
|