add zooming effect
Browse files- data/clustering/info.csv +1 -1
- src/clusters.js +160 -111
data/clustering/info.csv
CHANGED
@@ -103,4 +103,4 @@
|
|
103 |
101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
|
104 |
102,101,Business Development,7.328447,5.659843
|
105 |
103,102,Sports,7.6370654,-1.0701839
|
106 |
-
104,103,Sexuality,13.817207,1.6510898
|
|
|
103 |
101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
|
104 |
102,101,Business Development,7.328447,5.659843
|
105 |
103,102,Sports,7.6370654,-1.0701839
|
106 |
+
104,103,Sexuality,13.817207,1.6510898
|
src/clusters.js
CHANGED
@@ -1,39 +1,39 @@
|
|
1 |
// plotly Dark24
|
2 |
const COLORS = [
|
3 |
-
|
4 |
-
[
|
5 |
-
[
|
6 |
-
[
|
7 |
-
[
|
8 |
-
['218', '22', '255'],
|
9 |
// ['34', '42', '42'], Black, which makes the text unreadable
|
10 |
-
[
|
11 |
-
[
|
12 |
-
[
|
13 |
-
[
|
14 |
-
[
|
15 |
-
[
|
16 |
-
[
|
17 |
-
[
|
18 |
-
[
|
19 |
-
[
|
20 |
-
[
|
21 |
-
[
|
22 |
-
[
|
23 |
-
[
|
24 |
-
[
|
25 |
-
[
|
26 |
// ['13', '42', '99'], Black
|
27 |
-
[
|
28 |
-
]
|
29 |
|
30 |
-
const
|
31 |
-
return `<b>Text</b>: ${row.text}<br><b>Edu label</b>: ${row.eduScore}`
|
32 |
-
}
|
33 |
|
34 |
-
|
35 |
-
|
|
|
36 |
|
|
|
|
|
37 |
|
38 |
function createLabelOrderMapping(labels) {
|
39 |
const labelCounts = labels.reduce((acc, label) => {
|
@@ -41,7 +41,9 @@ function createLabelOrderMapping(labels) {
|
|
41 |
return acc;
|
42 |
}, {});
|
43 |
|
44 |
-
const sortedLabels = Object.entries(labelCounts)
|
|
|
|
|
45 |
|
46 |
const labelOrder = {};
|
47 |
sortedLabels.forEach((label, index) => {
|
@@ -50,103 +52,121 @@ function createLabelOrderMapping(labels) {
|
|
50 |
return labelOrder;
|
51 |
}
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
const parseAnnotations = async (file) => {
|
57 |
-
return (await readCSV(file))
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
}
|
|
|
|
|
68 |
|
69 |
const addStylingToAnnotations = (annotations) => {
|
70 |
-
return annotations.map((annotation
|
71 |
return {
|
72 |
-
...annotation,
|
73 |
showarrow: false,
|
74 |
font: {
|
75 |
size: 14,
|
76 |
-
color:
|
77 |
-
weight:
|
78 |
-
|
79 |
bgcolor: getColor(annotation.label, 0.9),
|
80 |
borderpad: 2, // Add padding around the text
|
81 |
-
|
82 |
-
|
83 |
-
}
|
|
|
84 |
|
85 |
-
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k=K) => {
|
86 |
const relevant_annotations = annotations.filter((annotation) => {
|
87 |
-
return
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
89 |
return relevant_annotations.sort((a, b) => a.ord - b.ord).slice(0, k);
|
90 |
-
}
|
91 |
-
|
92 |
|
93 |
const getMinMaxTracesArea = (traces) => {
|
94 |
-
const x0 = Math.min(...traces.map(trace => trace.x));
|
95 |
-
const x1 = Math.max(...traces.map(trace => trace.x));
|
96 |
-
const y0 = Math.min(...traces.map(trace => trace.y));
|
97 |
-
const y1 = Math.max(...traces.map(trace => trace.y));
|
98 |
-
return {x0, x1, y0, y1};
|
99 |
-
}
|
100 |
|
101 |
const readData = async () => {
|
102 |
-
return (await readCSV(
|
103 |
x: parseFloat(row.X),
|
104 |
y: parseFloat(row.Y),
|
105 |
eduScore: parseFloat(row.edu_labels),
|
106 |
label: parseInt(row.cluster_labels),
|
107 |
text: row.content_display,
|
108 |
}));
|
109 |
-
}
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
// The cluster data is pretty big, so it takes time to download
|
113 |
// In the meantime we put there a placeholder image
|
114 |
const destroyPlaceholderImage = (parent) => {
|
115 |
-
const img = parent.querySelector(
|
116 |
img.remove();
|
117 |
-
}
|
118 |
|
119 |
async function plotClusters() {
|
120 |
-
const parent = document.getElementById(
|
121 |
// We do a little trolling on users and pretend that we have already downloaded the data by simply showing a non-interactive image :)
|
122 |
const data = await readData();
|
123 |
-
const
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
marker: {
|
129 |
-
color: data.map(row => getColor(row.label, 1.0)),
|
130 |
size: 5,
|
131 |
-
opacity: 8
|
132 |
},
|
133 |
-
hoverinfo:
|
134 |
-
hovertext: data.map(row => getLabelHoverFormat(row)),
|
135 |
hoverlabel: {
|
136 |
-
bgcolor:
|
137 |
},
|
138 |
-
|
139 |
-
|
140 |
-
const annotations = (await parseAnnotations('data/clustering/info.csv')).map(
|
141 |
-
(annot) => {
|
142 |
-
return {
|
143 |
-
...annot,
|
144 |
-
ord: labelOrder[annot.label]
|
145 |
-
}
|
146 |
-
}
|
147 |
-
);
|
148 |
|
149 |
-
const {x0, x1, y0, y1} = getMinMaxTracesArea(data);
|
150 |
const layout = {
|
151 |
height: 550,
|
152 |
width: parent.clientWidth,
|
@@ -158,7 +178,7 @@ async function plotClusters() {
|
|
158 |
text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
|
159 |
font: {
|
160 |
size: 16,
|
161 |
-
style:
|
162 |
},
|
163 |
// italic
|
164 |
},
|
@@ -168,7 +188,7 @@ async function plotClusters() {
|
|
168 |
showgrid: false,
|
169 |
zeroline: false,
|
170 |
},
|
171 |
-
annotations:
|
172 |
font: {
|
173 |
family: "apple-system, Arial, sans-serif",
|
174 |
},
|
@@ -181,24 +201,56 @@ async function plotClusters() {
|
|
181 |
destroyPlaceholderImage(parent);
|
182 |
Plotly.newPlot(parent, traces, layout);
|
183 |
|
184 |
-
parent.on(
|
185 |
// First option zoomed in
|
186 |
if (eventdata["xaxis.range[0]"]) {
|
187 |
-
const [
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
// Idk maybe we can even recompute the ordering, but I think it's fine to use the global one
|
190 |
-
const relevant_annotations = getRelevantAnnotations(
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
}
|
193 |
// Zoom reset
|
194 |
-
else if (eventdata["xaxis.autorange"]){
|
195 |
-
const
|
196 |
-
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
}
|
199 |
// Otherwise it's just the relayout itself
|
200 |
});
|
201 |
-
|
202 |
window.addEventListener("resize", () => {
|
203 |
// If the window size is smaller than 768, we don't care as it's not shown
|
204 |
if (window.innerWidth < 768) {
|
@@ -214,19 +266,16 @@ document.addEventListener("DOMContentLoaded", () => {
|
|
214 |
plotClusters();
|
215 |
});
|
216 |
|
217 |
-
|
218 |
const readCSV = async (file) => {
|
219 |
-
const data = await fetch(file)
|
220 |
-
const text = await data.text()
|
221 |
-
const csv = Papa.parse(text, {header: true});
|
222 |
return csv.data;
|
223 |
-
}
|
224 |
-
|
225 |
-
|
226 |
|
227 |
const getColor = (i, opacity) => {
|
228 |
if (i < 0) {
|
229 |
-
i = i * -1
|
230 |
}
|
231 |
-
return `rgba(${COLORS[i % COLORS.length].join(
|
232 |
-
}
|
|
|
1 |
// plotly Dark24
|
2 |
/**
 * Colour table: each entry is an [r, g, b] triple of decimal strings.
 * Written as compact "r,g,b" rows and split into triples once at load time.
 */
const COLORS = [
  "46,145,229",
  "225,95,153",
  "28,167,28",
  "251,13,13",
  "218,22,255",
  // "34,42,42" is left out: black, which makes the text unreadable
  "182,129,0",
  "117,13,134",
  "235,102,59",
  "81,28,251",
  "0,160,139",
  "251,0,209",
  "252,0,128",
  "178,130,141",
  "108,124,50",
  "119,138,174",
  "134,42,22",
  "167,119,241",
  "98,0,66",
  "22,22,167",
  "218,96,202",
  "108,69,22",
  // "13,42,99" is left out: black
  "175,0,56",
].map((rgb) => rgb.split(","));
|
28 |
|
29 |
+
// Base scatter-marker size; the zoom handling multiplies it by the zoom level and restores it on zoom reset.
const BASE_SIZE = 5;
|
|
|
|
|
30 |
|
31 |
+
/**
 * Builds the hover text for one data point.
 *
 * @param {{text: *, label: *, eduScore: *}} row - Data point to describe.
 * @returns {string} The text, label and edu label fields, each prefixed by a
 *   bold caption and separated by `<br>`.
 */
const getLabelHoverFormat = ({ text, label, eduScore }) => {
  const parts = [
    `<b>Text</b>: ${text}`,
    `<b>Label</b>: ${label}`,
    `<b>Edu label</b>: ${eduScore}`,
  ];
  return parts.join("<br>");
};
|
34 |
|
35 |
+
// Number of annotations to display
|
36 |
+
// Default for the `k` parameter of getRelevantAnnotations, which caps how many annotations it returns.
const K = 15;
|
37 |
|
38 |
function createLabelOrderMapping(labels) {
|
39 |
const labelCounts = labels.reduce((acc, label) => {
|
|
|
41 |
return acc;
|
42 |
}, {});
|
43 |
|
44 |
+
const sortedLabels = Object.entries(labelCounts)
|
45 |
+
.sort((a, b) => b[1] - a[1])
|
46 |
+
.map((entry) => entry[0]);
|
47 |
|
48 |
const labelOrder = {};
|
49 |
sortedLabels.forEach((label, index) => {
|
|
|
52 |
return labelOrder;
|
53 |
}
|
54 |
|
|
|
|
|
|
|
55 |
/**
 * Loads cluster summaries from a CSV file and converts each row into an
 * annotation record.
 *
 * Rows are dropped when their `cluster_id` is -1 (presumably the id of
 * unclustered points; confirm against the data producer) or when it does not
 * parse to an integer at all, e.g. a blank trailing row. Such rows would
 * otherwise yield annotations whose label and position are NaN.
 *
 * @param {string} file - Location of the CSV, handed to readCSV.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 *   One record per remaining row.
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  return rows
    .map((cluster_summary) => ({
      x: parseFloat(cluster_summary.cluster_position_x),
      y: parseFloat(cluster_summary.cluster_position_y),
      // Explicit radix: ids are decimal.
      label: parseInt(cluster_summary.cluster_id, 10),
      text: cluster_summary.cluster_summaries,
    }))
    .filter(({ label }) => Number.isInteger(label) && label !== -1);
};
|
69 |
|
70 |
/**
 * Returns a new array in which every annotation carries presentation fields
 * (no arrow, bold black 14px font, label-coloured background, padding).
 *
 * The annotation itself is spread last, so any field it already defines
 * wins over the styling defaults.
 *
 * @param {Array<Object>} annotations - Annotation records with a `label` field.
 * @returns {Array<Object>} Styled copies; the inputs are not modified.
 */
const addStylingToAnnotations = (annotations) => {
  const withStyle = (annotation) => {
    // A fresh font object per annotation, so the results share no state.
    const font = { size: 14, color: "black", weight: "bold" };
    const bgcolor = getColor(annotation.label, 0.9);
    return {
      showarrow: false,
      font,
      bgcolor,
      borderpad: 2, // padding around the text
      ...annotation,
    };
  };
  return annotations.map((annotation) => withStyle(annotation));
};
|
85 |
|
86 |
+
/**
 * Picks the annotations that lie inside a rectangular window and returns at
 * most `k` of them, lowest `ord` first.
 *
 * @param {Array<{x: number, y: number, ord: number}>} annotations - Candidates.
 * @param {number} x0 - Left edge (inclusive).
 * @param {number} x1 - Right edge (inclusive).
 * @param {number} y0 - Bottom edge (inclusive).
 * @param {number} y1 - Top edge (inclusive).
 * @param {number} [k=K] - Maximum number of annotations to return.
 * @returns {Array<Object>} New array; the input array is not reordered.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  const withinX = (candidate) => candidate.x >= x0 && candidate.x <= x1;
  const withinY = (candidate) => candidate.y >= y0 && candidate.y <= y1;

  // filter() yields a fresh array, so sorting it in place leaves the input untouched.
  const visible = annotations.filter(
    (candidate) => withinX(candidate) && withinY(candidate)
  );
  visible.sort((left, right) => left.ord - right.ord);
  return visible.slice(0, k);
};
|
|
|
97 |
|
98 |
/**
 * Computes the bounding box of a list of points.
 *
 * The extent is accumulated in a single pass. Spreading the whole point list
 * into Math.min / Math.max passes one call argument per point, which can
 * exceed the engine's argument limit and throw a RangeError on large inputs.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to measure.
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Minimum and
 *   maximum x and y. For an empty list the minima are Infinity and the
 *   maxima are -Infinity.
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;
  for (const { x, y } of traces) {
    x0 = Math.min(x0, x);
    x1 = Math.max(x1, x);
    y0 = Math.min(y0, y);
    y1 = Math.max(y1, y);
  }
  return { x0, x1, y0, y1 };
};
|
105 |
|
106 |
/**
 * Loads the data points from "data/clustering/data.csv" via readCSV and
 * converts each row to numeric form.
 *
 * Rows without finite X/Y coordinates (e.g. a blank trailing row) are
 * dropped: they cannot be plotted and would turn the computed extent into NaN.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number, label: number, text: string}>>}
 *   One record per plottable row.
 */
const readData = async () => {
  const rows = await readCSV("data/clustering/data.csv");
  return rows
    .map((row) => ({
      x: parseFloat(row.X),
      y: parseFloat(row.Y),
      eduScore: parseFloat(row.edu_labels),
      // Explicit radix: labels are decimal.
      label: parseInt(row.cluster_labels, 10),
      text: row.content_display,
    }))
    .filter(({ x, y }) => Number.isFinite(x) && Number.isFinite(y));
};
|
115 |
|
116 |
+
/**
 * Produces a one-element trace list: a copy of the first trace whose marker
 * size is BASE_SIZE scaled by the zoom level. Other traces are not included.
 *
 * @param {Array<Object>} traces - Trace list; only the first entry is used.
 * @param {number} zoomLevel - Multiplier applied to BASE_SIZE.
 * @returns {Array<Object>} Array holding the restyled copy of the first trace.
 */
const restyleTraces = (traces, zoomLevel) => {
  const primary = traces[0];
  // Copy the marker settings and override only the size.
  const marker = { ...primary.marker, size: BASE_SIZE * zoomLevel };
  return [{ ...primary, marker }];
};
|
128 |
|
129 |
// The cluster data is pretty big, so it takes time to download
|
130 |
// In the meantime we put there a placeholder image
|
131 |
/**
 * Removes the first <img> element found under `parent`, if there is one.
 *
 * @param {Element} parent - Container that is searched for the image.
 * @returns {void}
 */
const destroyPlaceholderImage = (parent) => {
  const img = parent.querySelector("img");
  // querySelector yields null when nothing matches (e.g. the image was
  // already removed); calling remove() on it would throw a TypeError.
  if (img !== null) {
    img.remove();
  }
};
|
135 |
|
136 |
async function plotClusters() {
|
137 |
+
const parent = document.getElementById("clusters-plot");
|
138 |
// We do a little trolling on users and pretend that we have already downloaded the data by simply showing a non-interactive image :)
|
139 |
const data = await readData();
|
140 |
+
const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
|
141 |
+
const annotations = addStylingToAnnotations(
|
142 |
+
await parseAnnotations("data/clustering/info.csv")
|
143 |
+
).map((annot) => {
|
144 |
+
return {
|
145 |
+
...annot,
|
146 |
+
ord: labelOrder[annot.label],
|
147 |
+
};
|
148 |
+
});
|
149 |
+
|
150 |
+
const traces = [
|
151 |
+
{
|
152 |
+
type: "scatter",
|
153 |
+
mode: "markers",
|
154 |
+
x: data.map((row) => row.x),
|
155 |
+
y: data.map((row) => row.y),
|
156 |
marker: {
|
157 |
+
color: data.map((row) => getColor(row.label, 1.0)),
|
158 |
size: 5,
|
159 |
+
opacity: 8,
|
160 |
},
|
161 |
+
hoverinfo: "text",
|
162 |
+
hovertext: data.map((row) => getLabelHoverFormat(row)),
|
163 |
hoverlabel: {
|
164 |
+
bgcolor: "white",
|
165 |
},
|
166 |
+
},
|
167 |
+
];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
+
const { x0, x1, y0, y1 } = getMinMaxTracesArea(data);
|
170 |
const layout = {
|
171 |
height: 550,
|
172 |
width: parent.clientWidth,
|
|
|
178 |
text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
|
179 |
font: {
|
180 |
size: 16,
|
181 |
+
style: "italic",
|
182 |
},
|
183 |
// italic
|
184 |
},
|
|
|
188 |
showgrid: false,
|
189 |
zeroline: false,
|
190 |
},
|
191 |
+
annotations: getRelevantAnnotations(annotations, x0, x1, y0, y1),
|
192 |
font: {
|
193 |
family: "apple-system, Arial, sans-serif",
|
194 |
},
|
|
|
201 |
destroyPlaceholderImage(parent);
|
202 |
Plotly.newPlot(parent, traces, layout);
|
203 |
|
204 |
+
parent.on("plotly_relayout", (eventdata) => {
|
205 |
// First option zoomed in
|
206 |
if (eventdata["xaxis.range[0]"]) {
|
207 |
+
const [newx0, newx1] = [
|
208 |
+
eventdata["xaxis.range[0]"],
|
209 |
+
eventdata["xaxis.range[1]"],
|
210 |
+
];
|
211 |
+
const [newy0, newy1] = [
|
212 |
+
eventdata["yaxis.range[0]"],
|
213 |
+
eventdata["yaxis.range[1]"],
|
214 |
+
];
|
215 |
// Idk maybe we can even recompute the ordering, but I think it's fine to use the global one
|
216 |
+
const relevant_annotations = getRelevantAnnotations(
|
217 |
+
annotations,
|
218 |
+
newx0,
|
219 |
+
newx1,
|
220 |
+
newy0,
|
221 |
+
newy1
|
222 |
+
);
|
223 |
+
// 1.8 otherwise it's too big
|
224 |
+
const zoomLevel =
|
225 |
+
Math.min(
|
226 |
+
(x1 - x0) / (newx1 - newx0),
|
227 |
+
(y1 - y0) / (newy1 - newy0)
|
228 |
+
) / 1.8;
|
229 |
+
Plotly.update(
|
230 |
+
parent,
|
231 |
+
{ "marker.size": BASE_SIZE * zoomLevel },
|
232 |
+
{ annotations: relevant_annotations },
|
233 |
+
relevantDataIdx
|
234 |
+
);
|
235 |
}
|
236 |
// Zoom reset
|
237 |
+
else if (eventdata["xaxis.autorange"]) {
|
238 |
+
const relevant_annotations = getRelevantAnnotations(
|
239 |
+
annotations,
|
240 |
+
x0,
|
241 |
+
x1,
|
242 |
+
y0,
|
243 |
+
y1
|
244 |
+
);
|
245 |
+
Plotly.update(
|
246 |
+
parent,
|
247 |
+
{ "marker.size": BASE_SIZE },
|
248 |
+
{ annotations: relevant_annotations }
|
249 |
+
);
|
250 |
}
|
251 |
// Otherwise it's just the relayout itself
|
252 |
});
|
253 |
+
|
254 |
window.addEventListener("resize", () => {
|
255 |
// If the window size is smaller than 768, we don't care as it's not shown
|
256 |
if (window.innerWidth < 768) {
|
|
|
266 |
plotClusters();
|
267 |
});
|
268 |
|
|
|
269 |
/**
 * Fetches a CSV file and parses it with Papa Parse using the first row as
 * the header.
 *
 * @param {string} file - URL or path given to fetch.
 * @returns {Promise<Array<Object>>} The parsed rows (`data` of the parse result).
 * @throws {Error} When the HTTP response status is not OK; without this
 *   check the body of an error page would be parsed as if it were CSV.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${file}: HTTP ${response.status}`);
  }
  const text = await response.text();
  // skipEmptyLines keeps blank lines (typically a trailing newline) from
  // turning into rows with missing fields.
  const csv = Papa.parse(text, { header: true, skipEmptyLines: true });
  return csv.data;
};
|
|
|
|
|
275 |
|
276 |
/**
 * Maps a label index to an `rgba(...)` colour string.
 *
 * The sign of the index is ignored and the index wraps around the COLORS
 * table, so every integer label resolves to a colour.
 *
 * @param {number} i - Label index; negative values use their absolute value.
 * @param {number} opacity - Alpha component placed last in the rgba() string.
 * @returns {string} Colour in the form `rgba(r,g,b, opacity)`.
 */
const getColor = (i, opacity) => {
  const channels = COLORS[Math.abs(i) % COLORS.length];
  return `rgba(${channels.join(",")}, ${opacity})`;
};
|