make the title nicer
Browse files
- index.html +2 -1
- src/clusters.js +38 -28
index.html
CHANGED
@@ -166,7 +166,8 @@
|
|
166 |
</script>
|
167 |
</d-front-matter>
|
168 |
<d-title>
|
169 |
-
<
|
|
|
170 |
<figure>
|
171 |
<img src="banner.png" alt="FineWeb">
|
172 |
</figure>
|
|
|
166 |
</script>
|
167 |
</d-front-matter>
|
168 |
<d-title>
|
169 |
+
<h1 class="l-page" style="text-align: center;">🍷 FineWeb: decanting the web for the finest text data at scale</h1>
|
170 |
+
<div id="title-plot" class="main-plot-container l-screen">
|
171 |
<figure>
|
172 |
<img src="banner.png" alt="FineWeb">
|
173 |
</figure>
|
src/clusters.js
CHANGED
@@ -26,12 +26,33 @@ const COLORS = [
|
|
26 |
["175", "0", "56"],
|
27 |
];
|
28 |
|
29 |
-
const BASE_SIZE =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
const getLabelHoverFormat = (row, ) => {
|
32 |
return `<b>Text</b>: ${row.text}<br><b>Label</b>: ${row.label}<br><b>Edu label</b>: ${row.eduScore}`;
|
33 |
};
|
34 |
|
|
|
35 |
// Number of annotations to display
|
36 |
const K = 15;
|
37 |
|
@@ -155,7 +176,7 @@ async function plotClusters() {
|
|
155 |
y: data.map((row) => row.y),
|
156 |
marker: {
|
157 |
color: data.map((row) => getColor(row.label, 1.0)),
|
158 |
-
size:
|
159 |
opacity: 8,
|
160 |
},
|
161 |
hoverinfo: "text",
|
@@ -170,31 +191,17 @@ async function plotClusters() {
|
|
170 |
const layout = {
|
171 |
height: 550,
|
172 |
width: parent.clientWidth,
|
173 |
-
xaxis:
|
174 |
-
|
175 |
-
|
176 |
-
zeroline: false,
|
177 |
-
title: {
|
178 |
-
text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
|
179 |
-
font: {
|
180 |
-
size: 16,
|
181 |
-
style: "italic",
|
182 |
-
},
|
183 |
-
// italic
|
184 |
-
},
|
185 |
-
},
|
186 |
-
yaxis: {
|
187 |
-
showticklabels: false,
|
188 |
-
showgrid: false,
|
189 |
-
zeroline: false,
|
190 |
-
},
|
191 |
-
annotations: getRelevantAnnotations(annotations, x0, x1, y0, y1),
|
192 |
font: {
|
193 |
family: "apple-system, Arial, sans-serif",
|
194 |
},
|
195 |
margin: {
|
196 |
t: 0,
|
197 |
-
b:
|
|
|
|
|
198 |
},
|
199 |
};
|
200 |
|
@@ -220,20 +227,21 @@ async function plotClusters() {
|
|
220 |
newy0,
|
221 |
newy1
|
222 |
);
|
|
|
223 |
// 1.8 otherwise it's too big
|
224 |
const zoomLevel =
|
225 |
Math.min(
|
226 |
(x1 - x0) / (newx1 - newx0),
|
227 |
(y1 - y0) / (newy1 - newy0)
|
228 |
-
) / 1.
|
229 |
Plotly.update(
|
230 |
parent,
|
231 |
{ "marker.size": BASE_SIZE * zoomLevel },
|
232 |
{ annotations: relevant_annotations },
|
233 |
);
|
234 |
}
|
235 |
-
// Zoom reset
|
236 |
-
else if (eventdata["xaxis.autorange"]) {
|
237 |
const relevant_annotations = getRelevantAnnotations(
|
238 |
annotations,
|
239 |
x0,
|
@@ -241,13 +249,15 @@ async function plotClusters() {
|
|
241 |
y0,
|
242 |
y1
|
243 |
);
|
|
|
|
|
|
|
244 |
Plotly.update(
|
245 |
parent,
|
246 |
{ "marker.size": BASE_SIZE },
|
247 |
-
{ annotations: relevant_annotations }
|
248 |
);
|
249 |
}
|
250 |
-
// Otherwise it's just the relayout itself
|
251 |
});
|
252 |
|
253 |
window.addEventListener("resize", () => {
|
@@ -268,7 +278,7 @@ document.addEventListener("DOMContentLoaded", () => {
|
|
268 |
const readCSV = async (file) => {
|
269 |
const data = await fetch(file);
|
270 |
const text = await data.text();
|
271 |
-
const csv = Papa.parse(text, { header: true });
|
272 |
return csv.data;
|
273 |
};
|
274 |
|
|
|
26 |
["175", "0", "56"],
|
27 |
];
|
28 |
|
29 |
+
const BASE_SIZE = 4;
|
30 |
+
// x0, x1, y0, y1
|
31 |
+
const DEFAULT_XAXIS = {
|
32 |
+
showticklabels: false,
|
33 |
+
showgrid: false,
|
34 |
+
zeroline: false,
|
35 |
+
title: {
|
36 |
+
text: "<a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>Fineweb dataset</a>",
|
37 |
+
font: {
|
38 |
+
size: 16,
|
39 |
+
style: "italic",
|
40 |
+
},
|
41 |
+
},
|
42 |
+
range: [7.23943662773044, 13.605120929434547]
|
43 |
+
}
|
44 |
+
const DEFAULT_YAXIS = {
|
45 |
+
showticklabels: false,
|
46 |
+
showgrid: false,
|
47 |
+
zeroline: false,
|
48 |
+
range: [1.6774392919913423, 6.514440889610389],
|
49 |
+
}
|
50 |
|
51 |
const getLabelHoverFormat = (row, ) => {
|
52 |
return `<b>Text</b>: ${row.text}<br><b>Label</b>: ${row.label}<br><b>Edu label</b>: ${row.eduScore}`;
|
53 |
};
|
54 |
|
55 |
+
|
56 |
// Number of annotations to display
|
57 |
const K = 15;
|
58 |
|
|
|
176 |
y: data.map((row) => row.y),
|
177 |
marker: {
|
178 |
color: data.map((row) => getColor(row.label, 1.0)),
|
179 |
+
size: BASE_SIZE,
|
180 |
opacity: 8,
|
181 |
},
|
182 |
hoverinfo: "text",
|
|
|
191 |
const layout = {
|
192 |
height: 550,
|
193 |
width: parent.clientWidth,
|
194 |
+
xaxis: DEFAULT_XAXIS,
|
195 |
+
yaxis: DEFAULT_YAXIS,
|
196 |
+
annotations: getRelevantAnnotations(annotations, DEFAULT_XAXIS.range[0], DEFAULT_XAXIS.range[1], DEFAULT_YAXIS.range[0], DEFAULT_YAXIS.range[1]),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
font: {
|
198 |
family: "apple-system, Arial, sans-serif",
|
199 |
},
|
200 |
margin: {
|
201 |
t: 0,
|
202 |
+
b: 50,
|
203 |
+
l: 0,
|
204 |
+
r: 0,
|
205 |
},
|
206 |
};
|
207 |
|
|
|
227 |
newy0,
|
228 |
newy1
|
229 |
);
|
230 |
+
console.log(x0, x1, y0, y1);
|
231 |
// Divide by 1.2, otherwise the markers get too big
|
232 |
const zoomLevel =
|
233 |
Math.min(
|
234 |
(x1 - x0) / (newx1 - newx0),
|
235 |
(y1 - y0) / (newy1 - newy0)
|
236 |
+
) / 1.2;
|
237 |
Plotly.update(
|
238 |
parent,
|
239 |
{ "marker.size": BASE_SIZE * zoomLevel },
|
240 |
{ annotations: relevant_annotations },
|
241 |
);
|
242 |
}
|
243 |
+
// Zoom reset, either to fully zoomed out or to the base range
|
244 |
+
else if (eventdata["xaxis.autorange"] || eventdata["xaxis.range"]) {
|
245 |
const relevant_annotations = getRelevantAnnotations(
|
246 |
annotations,
|
247 |
x0,
|
|
|
249 |
y0,
|
250 |
y1
|
251 |
);
|
252 |
+
// We want to always be fully zoomed out
|
253 |
+
const xaxis = _.merge({}, DEFAULT_XAXIS, { range: [x0, x1] });
|
254 |
+
const yaxis = _.merge({}, DEFAULT_YAXIS, { range: [y0, y1] });
|
255 |
Plotly.update(
|
256 |
parent,
|
257 |
{ "marker.size": BASE_SIZE },
|
258 |
+
{ annotations: relevant_annotations, xaxis, yaxis }
|
259 |
);
|
260 |
}
|
|
|
261 |
});
|
262 |
|
263 |
window.addEventListener("resize", () => {
|
|
|
278 |
const readCSV = async (file) => {
|
279 |
const data = await fetch(file);
|
280 |
const text = await data.text();
|
281 |
+
const csv = Papa.parse(text, { header: true, skipEmptyLines: true });
|
282 |
return csv.data;
|
283 |
};
|
284 |
|