hynky HF staff committed on
Commit
0764fdf
·
1 Parent(s): 2c9e5db

add zooming effect

Browse files
Files changed (2) hide show
  1. data/clustering/info.csv +1 -1
  2. src/clusters.js +160 -111
data/clustering/info.csv CHANGED
@@ -103,4 +103,4 @@
103
  101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
104
  102,101,Business Development,7.328447,5.659843
105
  103,102,Sports,7.6370654,-1.0701839
106
- 104,103,Sexuality,13.817207,1.6510898
 
103
  101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
104
  102,101,Business Development,7.328447,5.659843
105
  103,102,Sports,7.6370654,-1.0701839
106
+ 104,103,Sexuality,13.817207,1.6510898
src/clusters.js CHANGED
@@ -1,39 +1,39 @@
1
  // plotly Dark24
2
  const COLORS = [
3
-
4
- ['46', '145', '229'],
5
- ['225', '95', '153'],
6
- ['28', '167', '28'],
7
- ['251', '13', '13'],
8
- ['218', '22', '255'],
9
  // ['34', '42', '42'], Black, which makes the text unreadable
10
- ['182', '129', '0'],
11
- ['117', '13', '134'],
12
- ['235', '102', '59'],
13
- ['81', '28', '251'],
14
- ['0', '160', '139'],
15
- ['251', '0', '209'],
16
- ['252', '0', '128'],
17
- ['178', '130', '141'],
18
- ['108', '124', '50'],
19
- ['119', '138', '174'],
20
- ['134', '42', '22'],
21
- ['167', '119', '241'],
22
- ['98', '0', '66'],
23
- ['22', '22', '167'],
24
- ['218', '96', '202'],
25
- ['108', '69', '22'],
26
  // ['13', '42', '99'], Black
27
- ['175', '0', '56']
28
- ]
29
 
30
- const getLabelHoverFormat = (row) => {
31
- return `<b>Text</b>: ${row.text}<br><b>Edu label</b>: ${row.eduScore}`
32
- }
33
 
34
- // Number of annotations to display
35
- const K = 15
 
36
 
 
 
37
 
38
  function createLabelOrderMapping(labels) {
39
  const labelCounts = labels.reduce((acc, label) => {
@@ -41,7 +41,9 @@ function createLabelOrderMapping(labels) {
41
  return acc;
42
  }, {});
43
 
44
- const sortedLabels = Object.entries(labelCounts).sort((a, b) => b[1] - a[1]).map(entry => entry[0]);
 
 
45
 
46
  const labelOrder = {};
47
  sortedLabels.forEach((label, index) => {
@@ -50,103 +52,121 @@ function createLabelOrderMapping(labels) {
50
  return labelOrder;
51
  }
52
 
53
-
54
-
55
-
56
  const parseAnnotations = async (file) => {
57
- return (await readCSV(file)).filter((cluster_summary) => {
58
- return parseInt(cluster_summary.cluster_id) != -1
59
- }).map((cluster_summary) => {
60
- return {
61
- x: parseFloat(cluster_summary.cluster_position_x),
62
- y: parseFloat(cluster_summary.cluster_position_y),
63
- label: parseInt(cluster_summary.cluster_id),
64
- text: cluster_summary.cluster_summaries,
65
- }
66
- })
67
- }
 
 
68
 
69
  const addStylingToAnnotations = (annotations) => {
70
- return annotations.map((annotation, i) => {
71
  return {
72
- ...annotation,
73
  showarrow: false,
74
  font: {
75
  size: 14,
76
- color: 'black',
77
- weight: 'bold'
78
- },
79
  bgcolor: getColor(annotation.label, 0.9),
80
  borderpad: 2, // Add padding around the text
81
- }
82
- })
83
- }
 
84
 
85
- const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k=K) => {
86
  const relevant_annotations = annotations.filter((annotation) => {
87
- return annotation.x >= x0 && annotation.x <= x1 && annotation.y >= y0 && annotation.y <= y1
88
- })
 
 
 
 
 
89
  return relevant_annotations.sort((a, b) => a.ord - b.ord).slice(0, k);
90
- }
91
-
92
 
93
  const getMinMaxTracesArea = (traces) => {
94
- const x0 = Math.min(...traces.map(trace => trace.x));
95
- const x1 = Math.max(...traces.map(trace => trace.x));
96
- const y0 = Math.min(...traces.map(trace => trace.y));
97
- const y1 = Math.max(...traces.map(trace => trace.y));
98
- return {x0, x1, y0, y1};
99
- }
100
 
101
  const readData = async () => {
102
- return (await readCSV('data/clustering/data.csv')).map(row => ({
103
  x: parseFloat(row.X),
104
  y: parseFloat(row.Y),
105
  eduScore: parseFloat(row.edu_labels),
106
  label: parseInt(row.cluster_labels),
107
  text: row.content_display,
108
  }));
109
- }
110
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  // The cluster is pretty big, so it takes time to download
113
  // In the meantime we put there a placeholder image
114
  const destroyPlaceholderImage = (parent) => {
115
- const img = parent.querySelector('img');
116
  img.remove();
117
- }
118
 
119
  async function plotClusters() {
120
- const parent = document.getElementById('clusters-plot');
121
  // We do a little trolling on users and pretend that we already downloaded the data by simply showing a non-interactive image :)
122
  const data = await readData();
123
- const traces = [{
124
- type: 'scatter',
125
- mode: 'markers',
126
- x: data.map(row => row.x),
127
- y: data.map(row => row.y),
 
 
 
 
 
 
 
 
 
 
 
128
  marker: {
129
- color: data.map(row => getColor(row.label, 1.0)),
130
  size: 5,
131
- opacity: 8
132
  },
133
- hoverinfo: 'text',
134
- hovertext: data.map(row => getLabelHoverFormat(row)),
135
  hoverlabel: {
136
- bgcolor: 'white'
137
  },
138
- }];
139
- const labelOrder = createLabelOrderMapping(data.map(row => row.label));
140
- const annotations = (await parseAnnotations('data/clustering/info.csv')).map(
141
- (annot) => {
142
- return {
143
- ...annot,
144
- ord: labelOrder[annot.label]
145
- }
146
- }
147
- );
148
 
149
- const {x0, x1, y0, y1} = getMinMaxTracesArea(data);
150
  const layout = {
151
  height: 550,
152
  width: parent.clientWidth,
@@ -158,7 +178,7 @@ async function plotClusters() {
158
  text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
159
  font: {
160
  size: 16,
161
- style: 'italic'
162
  },
163
  // italic
164
  },
@@ -168,7 +188,7 @@ async function plotClusters() {
168
  showgrid: false,
169
  zeroline: false,
170
  },
171
- annotations: addStylingToAnnotations(getRelevantAnnotations(annotations, x0, x1, y0, y1)),
172
  font: {
173
  family: "apple-system, Arial, sans-serif",
174
  },
@@ -181,24 +201,56 @@ async function plotClusters() {
181
  destroyPlaceholderImage(parent);
182
  Plotly.newPlot(parent, traces, layout);
183
 
184
- parent.on('plotly_relayout', (eventdata) => {
185
  // First option zoomed in
186
  if (eventdata["xaxis.range[0]"]) {
187
- const [x0, x1] = [eventdata['xaxis.range[0]'], eventdata['xaxis.range[1]']];
188
- const [y0, y1] = [eventdata['yaxis.range[0]'], eventdata['yaxis.range[1]']];
 
 
 
 
 
 
189
  // Idk maybe we can even recompute the ordering, but I think it's fine to use the global one
190
- const relevant_annotations = getRelevantAnnotations(annotations, x0, x1, y0, y1);
191
- Plotly.relayout(parent, {...layout, annotations: addStylingToAnnotations(relevant_annotations)});
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  }
193
  // Zoom reset
194
- else if (eventdata["xaxis.autorange"]){
195
- const {x0, x1, y0, y1} = getMinMaxTracesArea(data);
196
- const relevant_annotations = getRelevantAnnotations(annotations, x0, x1, y0, y1);
197
- Plotly.relayout(parent, {...layout, annotations: addStylingToAnnotations(relevant_annotations)});
 
 
 
 
 
 
 
 
 
198
  }
199
  // Otherwise it's just the relayout itself
200
  });
201
-
202
  window.addEventListener("resize", () => {
203
  // If the window size is smaller than 768, we don't care as it's not shown
204
  if (window.innerWidth < 768) {
@@ -214,19 +266,16 @@ document.addEventListener("DOMContentLoaded", () => {
214
  plotClusters();
215
  });
216
 
217
-
218
  const readCSV = async (file) => {
219
- const data = await fetch(file)
220
- const text = await data.text()
221
- const csv = Papa.parse(text, {header: true});
222
  return csv.data;
223
- }
224
-
225
-
226
 
227
  const getColor = (i, opacity) => {
228
  if (i < 0) {
229
- i = i * -1
230
  }
231
- return `rgba(${COLORS[i % COLORS.length].join(',')}, ${opacity})`
232
- }
 
1
  // plotly Dark24
2
  const COLORS = [
3
+ ["46", "145", "229"],
4
+ ["225", "95", "153"],
5
+ ["28", "167", "28"],
6
+ ["251", "13", "13"],
7
+ ["218", "22", "255"],
 
8
  // ['34', '42', '42'], Black, which makes the text unreadable
9
+ ["182", "129", "0"],
10
+ ["117", "13", "134"],
11
+ ["235", "102", "59"],
12
+ ["81", "28", "251"],
13
+ ["0", "160", "139"],
14
+ ["251", "0", "209"],
15
+ ["252", "0", "128"],
16
+ ["178", "130", "141"],
17
+ ["108", "124", "50"],
18
+ ["119", "138", "174"],
19
+ ["134", "42", "22"],
20
+ ["167", "119", "241"],
21
+ ["98", "0", "66"],
22
+ ["22", "22", "167"],
23
+ ["218", "96", "202"],
24
+ ["108", "69", "22"],
25
  // ['13', '42', '99'], Black
26
+ ["175", "0", "56"],
27
+ ];
28
 
29
+ const BASE_SIZE = 5;
 
 
30
 
31
+ const getLabelHoverFormat = (row, ) => {
32
+ return `<b>Text</b>: ${row.text}<br><b>Label</b>: ${row.label}<br><b>Edu label</b>: ${row.eduScore}`;
33
+ };
34
 
35
+ // Number of annotations to display
36
+ const K = 15;
37
 
38
  function createLabelOrderMapping(labels) {
39
  const labelCounts = labels.reduce((acc, label) => {
 
41
  return acc;
42
  }, {});
43
 
44
+ const sortedLabels = Object.entries(labelCounts)
45
+ .sort((a, b) => b[1] - a[1])
46
+ .map((entry) => entry[0]);
47
 
48
  const labelOrder = {};
49
  sortedLabels.forEach((label, index) => {
 
52
  return labelOrder;
53
  }
54
 
 
 
 
55
  const parseAnnotations = async (file) => {
56
+ return (await readCSV(file))
57
+ .filter((cluster_summary) => {
58
+ return parseInt(cluster_summary.cluster_id) != -1;
59
+ })
60
+ .map((cluster_summary) => {
61
+ return {
62
+ x: parseFloat(cluster_summary.cluster_position_x),
63
+ y: parseFloat(cluster_summary.cluster_position_y),
64
+ label: parseInt(cluster_summary.cluster_id),
65
+ text: cluster_summary.cluster_summaries,
66
+ };
67
+ });
68
+ };
69
 
70
  const addStylingToAnnotations = (annotations) => {
71
+ return annotations.map((annotation) => {
72
  return {
 
73
  showarrow: false,
74
  font: {
75
  size: 14,
76
+ color: "black",
77
+ weight: "bold",
78
+ },
79
  bgcolor: getColor(annotation.label, 0.9),
80
  borderpad: 2, // Add padding around the text
81
+ ...annotation,
82
+ };
83
+ });
84
+ };
85
 
86
+ const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
87
  const relevant_annotations = annotations.filter((annotation) => {
88
+ return (
89
+ annotation.x >= x0 &&
90
+ annotation.x <= x1 &&
91
+ annotation.y >= y0 &&
92
+ annotation.y <= y1
93
+ );
94
+ });
95
  return relevant_annotations.sort((a, b) => a.ord - b.ord).slice(0, k);
96
+ };
 
97
 
98
  const getMinMaxTracesArea = (traces) => {
99
+ const x0 = Math.min(...traces.map((trace) => trace.x));
100
+ const x1 = Math.max(...traces.map((trace) => trace.x));
101
+ const y0 = Math.min(...traces.map((trace) => trace.y));
102
+ const y1 = Math.max(...traces.map((trace) => trace.y));
103
+ return { x0, x1, y0, y1 };
104
+ };
105
 
106
  const readData = async () => {
107
+ return (await readCSV("data/clustering/data.csv")).map((row) => ({
108
  x: parseFloat(row.X),
109
  y: parseFloat(row.Y),
110
  eduScore: parseFloat(row.edu_labels),
111
  label: parseInt(row.cluster_labels),
112
  text: row.content_display,
113
  }));
114
+ };
115
 
116
+ const restyleTraces = (traces, zoomLevel) => {
117
+ const res = [
118
+ {
119
+ ...traces[0],
120
+ marker: {
121
+ ...traces[0].marker,
122
+ size: BASE_SIZE * zoomLevel,
123
+ },
124
+ },
125
+ ];
126
+ return res;
127
+ };
128
 
129
  // The cluster is pretty big, so it takes time to download
130
  // In the meantime we put there a placeholder image
131
  const destroyPlaceholderImage = (parent) => {
132
+ const img = parent.querySelector("img");
133
  img.remove();
134
+ };
135
 
136
  async function plotClusters() {
137
+ const parent = document.getElementById("clusters-plot");
138
  // We do a little trolling on users and pretend that we already downloaded the data by simply showing a non-interactive image :)
139
  const data = await readData();
140
+ const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
141
+ const annotations = addStylingToAnnotations(
142
+ await parseAnnotations("data/clustering/info.csv")
143
+ ).map((annot) => {
144
+ return {
145
+ ...annot,
146
+ ord: labelOrder[annot.label],
147
+ };
148
+ });
149
+
150
+ const traces = [
151
+ {
152
+ type: "scatter",
153
+ mode: "markers",
154
+ x: data.map((row) => row.x),
155
+ y: data.map((row) => row.y),
156
  marker: {
157
+ color: data.map((row) => getColor(row.label, 1.0)),
158
  size: 5,
159
+ opacity: 8,
160
  },
161
+ hoverinfo: "text",
162
+ hovertext: data.map((row) => getLabelHoverFormat(row)),
163
  hoverlabel: {
164
+ bgcolor: "white",
165
  },
166
+ },
167
+ ];
 
 
 
 
 
 
 
 
168
 
169
+ const { x0, x1, y0, y1 } = getMinMaxTracesArea(data);
170
  const layout = {
171
  height: 550,
172
  width: parent.clientWidth,
 
178
  text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
179
  font: {
180
  size: 16,
181
+ style: "italic",
182
  },
183
  // italic
184
  },
 
188
  showgrid: false,
189
  zeroline: false,
190
  },
191
+ annotations: getRelevantAnnotations(annotations, x0, x1, y0, y1),
192
  font: {
193
  family: "apple-system, Arial, sans-serif",
194
  },
 
201
  destroyPlaceholderImage(parent);
202
  Plotly.newPlot(parent, traces, layout);
203
 
204
+ parent.on("plotly_relayout", (eventdata) => {
205
  // First option zoomed in
206
  if (eventdata["xaxis.range[0]"]) {
207
+ const [newx0, newx1] = [
208
+ eventdata["xaxis.range[0]"],
209
+ eventdata["xaxis.range[1]"],
210
+ ];
211
+ const [newy0, newy1] = [
212
+ eventdata["yaxis.range[0]"],
213
+ eventdata["yaxis.range[1]"],
214
+ ];
215
  // Idk maybe we can even recompute the ordering, but I think it's fine to use the global one
216
+ const relevant_annotations = getRelevantAnnotations(
217
+ annotations,
218
+ newx0,
219
+ newx1,
220
+ newy0,
221
+ newy1
222
+ );
223
+ // Divide the zoom ratio by 1.8, otherwise the markers get too big
224
+ const zoomLevel =
225
+ Math.min(
226
+ (x1 - x0) / (newx1 - newx0),
227
+ (y1 - y0) / (newy1 - newy0)
228
+ ) / 1.8;
229
+ Plotly.update(
230
+ parent,
231
+ { "marker.size": BASE_SIZE * zoomLevel },
232
+ { annotations: relevant_annotations },
233
+ relevantDataIdx
234
+ );
235
  }
236
  // Zoom reset
237
+ else if (eventdata["xaxis.autorange"]) {
238
+ const relevant_annotations = getRelevantAnnotations(
239
+ annotations,
240
+ x0,
241
+ x1,
242
+ y0,
243
+ y1
244
+ );
245
+ Plotly.update(
246
+ parent,
247
+ { "marker.size": BASE_SIZE },
248
+ { annotations: relevant_annotations }
249
+ );
250
  }
251
  // Otherwise it's just the relayout itself
252
  });
253
+
254
  window.addEventListener("resize", () => {
255
  // If the window size is smaller than 768, we don't care as it's not shown
256
  if (window.innerWidth < 768) {
 
266
  plotClusters();
267
  });
268
 
 
269
  const readCSV = async (file) => {
270
+ const data = await fetch(file);
271
+ const text = await data.text();
272
+ const csv = Papa.parse(text, { header: true });
273
  return csv.data;
274
+ };
 
 
275
 
276
  const getColor = (i, opacity) => {
277
  if (i < 0) {
278
+ i = i * -1;
279
  }
280
+ return `rgba(${COLORS[i % COLORS.length].join(",")}, ${opacity})`;
281
+ };