hynky HF staff committed on
Commit
a8d12ee
1 Parent(s): 5617ddc

make the title nicer

Browse files
Files changed (2) hide show
  1. index.html +2 -1
  2. src/clusters.js +38 -28
index.html CHANGED
@@ -166,7 +166,8 @@
166
  </script>
167
  </d-front-matter>
168
  <d-title>
169
- <div id="title-plot" class="main-plot-container l-page">
 
170
  <figure>
171
  <img src="banner.png" alt="FineWeb">
172
  </figure>
 
166
  </script>
167
  </d-front-matter>
168
  <d-title>
169
+ <h1 class="l-page" style="text-align: center;">🍷 FineWeb: decanting the web for the finest text data at scale</h1>
170
+ <div id="title-plot" class="main-plot-container l-screen">
171
  <figure>
172
  <img src="banner.png" alt="FineWeb">
173
  </figure>
src/clusters.js CHANGED
@@ -26,12 +26,33 @@ const COLORS = [
26
  ["175", "0", "56"],
27
  ];
28
 
29
- const BASE_SIZE = 5;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  const getLabelHoverFormat = (row, ) => {
32
  return `<b>Text</b>: ${row.text}<br><b>Label</b>: ${row.label}<br><b>Edu label</b>: ${row.eduScore}`;
33
  };
34
 
 
35
  // Number of annotations to display
36
  const K = 15;
37
 
@@ -155,7 +176,7 @@ async function plotClusters() {
155
  y: data.map((row) => row.y),
156
  marker: {
157
  color: data.map((row) => getColor(row.label, 1.0)),
158
- size: 5,
159
  opacity: 8,
160
  },
161
  hoverinfo: "text",
@@ -170,31 +191,17 @@ async function plotClusters() {
170
  const layout = {
171
  height: 550,
172
  width: parent.clientWidth,
173
- xaxis: {
174
- showticklabels: false,
175
- showgrid: false,
176
- zeroline: false,
177
- title: {
178
- text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
179
- font: {
180
- size: 16,
181
- style: "italic",
182
- },
183
- // italic
184
- },
185
- },
186
- yaxis: {
187
- showticklabels: false,
188
- showgrid: false,
189
- zeroline: false,
190
- },
191
- annotations: getRelevantAnnotations(annotations, x0, x1, y0, y1),
192
  font: {
193
  family: "apple-system, Arial, sans-serif",
194
  },
195
  margin: {
196
  t: 0,
197
- b: 30,
 
 
198
  },
199
  };
200
 
@@ -220,20 +227,21 @@ async function plotClusters() {
220
  newy0,
221
  newy1
222
  );
 
223
  // 1.8 otherwise it's too big
224
  const zoomLevel =
225
  Math.min(
226
  (x1 - x0) / (newx1 - newx0),
227
  (y1 - y0) / (newy1 - newy0)
228
- ) / 1.8;
229
  Plotly.update(
230
  parent,
231
  { "marker.size": BASE_SIZE * zoomLevel },
232
  { annotations: relevant_annotations },
233
  );
234
  }
235
- // Zoom reset
236
- else if (eventdata["xaxis.autorange"]) {
237
  const relevant_annotations = getRelevantAnnotations(
238
  annotations,
239
  x0,
@@ -241,13 +249,15 @@ async function plotClusters() {
241
  y0,
242
  y1
243
  );
 
 
 
244
  Plotly.update(
245
  parent,
246
  { "marker.size": BASE_SIZE },
247
- { annotations: relevant_annotations }
248
  );
249
  }
250
- // Otherwise it's just the relayout itself
251
  });
252
 
253
  window.addEventListener("resize", () => {
@@ -268,7 +278,7 @@ document.addEventListener("DOMContentLoaded", () => {
268
  const readCSV = async (file) => {
269
  const data = await fetch(file);
270
  const text = await data.text();
271
- const csv = Papa.parse(text, { header: true });
272
  return csv.data;
273
  };
274
 
 
26
  ["175", "0", "56"],
27
  ];
28
 
29
+ const BASE_SIZE = 4;
30
+ // x0, x1, y0, y1
31
+ const DEFAULT_XAXIS = {
32
+ showticklabels: false,
33
+ showgrid: false,
34
+ zeroline: false,
35
+ title: {
36
+ text: "<a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>Fineweb dataset</a>",
37
+ font: {
38
+ size: 16,
39
+ style: "italic",
40
+ },
41
+ },
42
+ range: [7.23943662773044, 13.605120929434547]
43
+ }
44
+ const DEFAULT_YAXIS = {
45
+ showticklabels: false,
46
+ showgrid: false,
47
+ zeroline: false,
48
+ range: [1.6774392919913423, 6.514440889610389],
49
+ }
50
 
51
  const getLabelHoverFormat = (row, ) => {
52
  return `<b>Text</b>: ${row.text}<br><b>Label</b>: ${row.label}<br><b>Edu label</b>: ${row.eduScore}`;
53
  };
54
 
55
+
56
  // Number of annotations to display
57
  const K = 15;
58
 
 
176
  y: data.map((row) => row.y),
177
  marker: {
178
  color: data.map((row) => getColor(row.label, 1.0)),
179
+ size: BASE_SIZE,
180
  opacity: 8,
181
  },
182
  hoverinfo: "text",
 
191
  const layout = {
192
  height: 550,
193
  width: parent.clientWidth,
194
+ xaxis: DEFAULT_XAXIS,
195
+ yaxis: DEFAULT_YAXIS,
196
+ annotations: getRelevantAnnotations(annotations, DEFAULT_XAXIS.range[0], DEFAULT_XAXIS.range[1], DEFAULT_YAXIS.range[0], DEFAULT_YAXIS.range[1]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  font: {
198
  family: "apple-system, Arial, sans-serif",
199
  },
200
  margin: {
201
  t: 0,
202
+ b: 50,
203
+ l: 0,
204
+ r: 0,
205
  },
206
  };
207
 
 
227
  newy0,
228
  newy1
229
  );
230
+ console.log(x0, x1, y0, y1);
231
  // 1.8 otherwise it's too big
232
  const zoomLevel =
233
  Math.min(
234
  (x1 - x0) / (newx1 - newx0),
235
  (y1 - y0) / (newy1 - newy0)
236
+ ) / 1.2;
237
  Plotly.update(
238
  parent,
239
  { "marker.size": BASE_SIZE * zoomLevel },
240
  { annotations: relevant_annotations },
241
  );
242
  }
243
+ // Zoom reset: either fully zoomed out (autorange) or back to the base range
244
+ else if (eventdata["xaxis.autorange"] || eventdata["xaxis.range"]) {
245
  const relevant_annotations = getRelevantAnnotations(
246
  annotations,
247
  x0,
 
249
  y0,
250
  y1
251
  );
252
+ // We want to always be fully zoomed out
253
+ const xaxis = _.merge({}, DEFAULT_XAXIS, { range: [x0, x1] });
254
+ const yaxis = _.merge({}, DEFAULT_YAXIS, { range: [y0, y1] });
255
  Plotly.update(
256
  parent,
257
  { "marker.size": BASE_SIZE },
258
+ { annotations: relevant_annotations, xaxis, yaxis }
259
  );
260
  }
 
261
  });
262
 
263
  window.addEventListener("resize", () => {
 
278
  const readCSV = async (file) => {
279
  const data = await fetch(file);
280
  const text = await data.text();
281
+ const csv = Papa.parse(text, { header: true, skipEmptyLines: true });
282
  return csv.data;
283
  };
284