add cluster
Browse files- data/clustering/data.csv +0 -0
- data/clustering/info.csv +106 -0
- index.html +8 -3
- src/clusters.js +223 -0
- src/plotting.js +48 -44
data/clustering/data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/clustering/info.csv
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,cluster_id,cluster_summaries,cluster_position_x,cluster_position_y
|
2 |
+
0,-1,None,9.926462,4.7121987
|
3 |
+
1,0,Philosophical/Spiritual Introspection,10.312462,1.2666532
|
4 |
+
2,1,"Scholarships",8.167274,4.8995786
|
5 |
+
3,2,Politics,8.81142,2.4859838
|
6 |
+
4,3,Theology,9.615214,0.3783942
|
7 |
+
5,4,Dating,4.985182,1.8439052
|
8 |
+
6,5,Accommodation,11.457769,5.080919
|
9 |
+
7,6,Football,6.6154537,-1.6859366
|
10 |
+
8,7,Film Festival,6.9734483,1.4548192
|
11 |
+
9,8,Culinary,13.426296,4.5412893
|
12 |
+
10,9,Music,6.0653744,0.7536916
|
13 |
+
11,10,Gambling,3.124241,3.2533677
|
14 |
+
12,11,Baseball,7.133596,-2.4256644
|
15 |
+
13,12,Technology,6.4929094,6.768577
|
16 |
+
14,13,Website Policies,4.873843,5.771508
|
17 |
+
15,14,Weddings,11.815845,3.7894728
|
18 |
+
16,15,Gaming,5.529167,2.9530518
|
19 |
+
17,16,Commodities/Services Provision,10.453564,5.8489122
|
20 |
+
18,17,Crafts,13.287651,6.4237967
|
21 |
+
19,18,Automobiles,9.9531145,8.840178
|
22 |
+
20,19,Watches,13.893139,9.859185
|
23 |
+
21,20,Dogs,12.595798,3.5351615
|
24 |
+
22,21,Photography,10.7942295,3.5504062
|
25 |
+
23,22,Legalities,8.942016,4.72733
|
26 |
+
24,23,Consumer Electronics,7.078649,8.338984
|
27 |
+
25,24,Insulation,10.520957,7.914946
|
28 |
+
26,25,Cannabis,14.317424,3.2114828
|
29 |
+
27,26,Footwear,15.052116,7.6956415
|
30 |
+
28,27,Real Estate,9.536316,6.103533
|
31 |
+
29,28,Relocation,10.205071,7.1883316
|
32 |
+
30,29,Sports betting,3.2779586,2.443366
|
33 |
+
31,30,Narratives,7.613535,1.8300554
|
34 |
+
32,31,Dating,4.788838,2.1900373
|
35 |
+
33,32,Apparel/Clothing,14.394226,7.3073387
|
36 |
+
34,33,User Authentication,5.265638,6.4014487
|
37 |
+
35,34,Academic writing,6.9187264,3.4357684
|
38 |
+
36,35,Sports,7.4969172,-2.086585
|
39 |
+
37,36,Fashion/Lifestyle Products,13.821669,7.7150764
|
40 |
+
38,37,Diverse events,9.437052,2.2438836
|
41 |
+
39,38,Blockchain/Cryptocurrency,7.7586045,6.9439344
|
42 |
+
40,39,Online Businesses/Marketing,6.522259,5.219268
|
43 |
+
41,40,Healthcare,11.425277,2.3801014
|
44 |
+
42,41,Home Decor,12.878046,7.2632184
|
45 |
+
43,42,Biomedicine,12.789575,2.3376262
|
46 |
+
44,43,Jewelry,14.259997,8.653363
|
47 |
+
45,44,Addiction,11.561383,1.3774762
|
48 |
+
46,45,Products,11.711758,8.423251
|
49 |
+
47,46,Multi-purposefulness,11.080702,7.4574013
|
50 |
+
48,47,"Mass transit",9.910158,5.4402313
|
51 |
+
49,48,Ethernet,6.9763823,7.7909245
|
52 |
+
50,49,Legal,9.516912,4.636553
|
53 |
+
51,50,E-commerce,13.263438,8.6548195
|
54 |
+
52,51,Audio,7.717162,8.903019
|
55 |
+
53,52,Infrastructure,10.52904,5.369669
|
56 |
+
54,53,Firearms,11.062812,9.268473
|
57 |
+
55,54,Freight/Logistics,9.551044,7.0336204
|
58 |
+
56,55,Products,12.073747,7.645973
|
59 |
+
57,56,Vaccinations,11.9387045,2.7824683
|
60 |
+
58,57,Artwork,11.019163,4.1677165
|
61 |
+
59,58,Viticulture,14.223523,5.0761614
|
62 |
+
60,59,WordPress,5.9597983,5.824579
|
63 |
+
61,60,Cosmetics/Dermatology,15.093273,3.4669027
|
64 |
+
62,61,Software,6.375921,6.4298844
|
65 |
+
63,62,Dentistry,14.76626,1.1620314
|
66 |
+
64,63,Pest Control,13.201735,3.6806118
|
67 |
+
65,64,SEO,5.720493,5.238112
|
68 |
+
66,65,Lottery,1.7142816,2.9782674
|
69 |
+
67,66,Narratives,8.460977,1.0804662
|
70 |
+
68,67,Waste Reduction & Recycling,10.634534,6.959523
|
71 |
+
69,68,Communication,6.438943,5.9467845
|
72 |
+
70,69,Orthopedics,13.005415,1.1908791
|
73 |
+
71,70,Home Decor & Furniture,12.732457,7.876862
|
74 |
+
72,71,Education,7.6568975,3.4944353
|
75 |
+
73,72,Sports,7.295141,-0.7343214
|
76 |
+
74,73,Social Media Advertising,6.133886,4.8547883
|
77 |
+
75,74,Privacy,4.756733,6.3598356
|
78 |
+
76,75,Website design,6.1168823,5.465095
|
79 |
+
77,76,Roofing,11.389448,8.080609
|
80 |
+
78,77,Nutrition/Supplements,13.631578,2.5334294
|
81 |
+
79,78,Haircare/Hairstyling,15.544645,4.54254
|
82 |
+
80,79,Cookies,4.341592,6.819268
|
83 |
+
81,80,International Trade,8.993828,6.4757586
|
84 |
+
82,81,Entrepreneurial Resources,9.435777,5.3340797
|
85 |
+
83,82,Cricket,6.5171986,-1.245905
|
86 |
+
84,83,Crafts,13.852216,7.049825
|
87 |
+
85,84,Floristry,13.407425,5.8741536
|
88 |
+
86,85,Genealogy,9.530803,1.6548243
|
89 |
+
87,86,Mental Health,11.074349,1.6069281
|
90 |
+
88,87,Volunteerism,10.145443,3.6734574
|
91 |
+
89,88,Lighting,11.385381,8.93693
|
92 |
+
90,89,Artificial Intelligence,6.5306387,6.2178063
|
93 |
+
91,90,Business,7.471462,6.4142885
|
94 |
+
92,91,E-commerce,13.638669,6.5098934
|
95 |
+
93,92,Urbanization/Over-tourism,10.221115,6.100654
|
96 |
+
94,93,Events,10.8449,3.9822264
|
97 |
+
95,94,Pharmaceuticals/Biotechnology,12.318266,2.4331784
|
98 |
+
96,95,Professional Wrestling,6.856304,-0.65598303
|
99 |
+
97,96,Various,9.3211975,3.4894605
|
100 |
+
98,97,Medicine,13.17882,2.1281319
|
101 |
+
99,98,Community Engagement,9.848856,3.5187004
|
102 |
+
100,99,Fitness,12.504849,0.9134393
|
103 |
+
101,100,Bathroom Design & Toilet Engineering,11.779076,7.2920136
|
104 |
+
102,101,Business Development,7.328447,5.659843
|
105 |
+
103,102,Sports,7.6370654,-1.0701839
|
106 |
+
104,103,Sexuality,13.817207,1.6510898
|
index.html
CHANGED
@@ -5,7 +5,9 @@
|
|
5 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjs/12.4.2/math.min.js" charset="utf-8"></script>
|
6 |
<script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
|
7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
|
|
|
8 |
<script type="module" src="src/plotting.js"></script>
|
|
|
9 |
<link rel="stylesheet" href="style.css">
|
10 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
11 |
<meta charset="utf8">
|
@@ -159,9 +161,12 @@
|
|
159 |
</script>
|
160 |
</d-front-matter>
|
161 |
<d-title>
|
162 |
-
<
|
163 |
-
<
|
164 |
-
|
|
|
|
|
|
|
165 |
</d-title>
|
166 |
<d-byline></d-byline>
|
167 |
<d-article>
|
|
|
5 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjs/12.4.2/math.min.js" charset="utf-8"></script>
|
6 |
<script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
|
7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
|
8 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.4.1/papaparse.js" integrity="sha512-M0cjXJTonbWEdLI3HJIoJSQBb9980RWmOCk+tvWkhgFrAZqSSIg1+1Db/vDu7Qk9W3L90gBynve17PYvarjfQA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
|
9 |
<script type="module" src="src/plotting.js"></script>
|
10 |
+
<script type="module" src="src/clusters.js"></script>
|
11 |
<link rel="stylesheet" href="style.css">
|
12 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
13 |
<meta charset="utf8">
|
|
|
161 |
</script>
|
162 |
</d-front-matter>
|
163 |
<d-title>
|
164 |
+
<div class="main-plot-container l-page">
|
165 |
+
<figure>
|
166 |
+
<img src="banner.png" alt="FineWeb">
|
167 |
+
</figure>
|
168 |
+
<div id="clusters-plot"></div>
|
169 |
+
</div>
|
170 |
</d-title>
|
171 |
<d-byline></d-byline>
|
172 |
<d-article>
|
src/clusters.js
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Cluster palette taken from Plotly's "Dark24" qualitative colour sequence.
// Each entry ends up as an [r, g, b] triplet of decimal strings so it can be
// joined straight into an `rgba(...)` CSS colour.
// Two near-black swatches of the original sequence are intentionally left out
// because black annotation text is unreadable on top of them:
//   34,42,42 and 13,42,99.
const COLORS = [
  '46,145,229',
  '225,95,153',
  '28,167,28',
  '251,13,13',
  '218,22,255',
  '182,129,0',
  '117,13,134',
  '235,102,59',
  '81,28,251',
  '0,160,139',
  '251,0,209',
  '252,0,128',
  '178,130,141',
  '108,124,50',
  '119,138,174',
  '134,42,22',
  '167,119,241',
  '98,0,66',
  '22,22,167',
  '218,96,202',
  '108,69,22',
  '175,0,56',
].map((rgb) => rgb.split(','));
|
29 |
+
|
30 |
+
/**
 * Build the hover text shown for a single data point.
 *
 * @param {{text: *, eduScore: *}} row - Point whose `text` and `eduScore` are displayed.
 * @returns {string} Two lines (separated by `<br>`) with bold field names.
 */
const getLabelHoverFormat = (row) => {
  const textLine = `<b>Text</b>: ${row.text}`;
  const eduLine = `<b>Edu label</b>: ${row.eduScore}`;
  return [textLine, eduLine].join('<br>');
};
|
33 |
+
|
34 |
+
// Default cap on how many cluster annotations are drawn at the same time
// (used as the default `k` of getRelevantAnnotations).
const K = 15;
|
36 |
+
|
37 |
+
|
38 |
+
/**
 * Rank labels by how often they occur.
 *
 * @param {Array} labels - One label per data point (duplicates expected).
 * @returns {Object} Plain object mapping each distinct label (as a property
 *   key) to its rank: 0 for the most frequent label, 1 for the next, and so on.
 */
function createLabelOrderMapping(labels) {
  // Tally occurrences in a plain object; keys are the stringified labels.
  const tally = {};
  labels.forEach((label) => {
    tally[label] = (tally[label] || 0) + 1;
  });

  // Most frequent first; the sort is stable so ties keep the object's key order.
  const ranked = Object.entries(tally);
  ranked.sort(([, countA], [, countB]) => countB - countA);

  return Object.fromEntries(ranked.map(([label], position) => [label, position]));
}
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
/**
 * Load the per-cluster summary CSV and turn it into annotation seeds.
 *
 * Rows are dropped when they describe the `-1` cluster or when their id or
 * position cannot be parsed (for example the blank record a trailing newline
 * can produce), so callers never receive NaN coordinates or labels.
 *
 * @param {string} file - URL of a CSV with `cluster_id`, `cluster_summaries`,
 *   `cluster_position_x` and `cluster_position_y` columns.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  return rows
    .map((row) => ({
      x: Number.parseFloat(row.cluster_position_x),
      y: Number.parseFloat(row.cluster_position_y),
      label: Number.parseInt(row.cluster_id, 10),
      text: row.cluster_summaries,
    }))
    .filter(({ x, y, label }) => (
      Number.isInteger(label)
      && label !== -1
      && Number.isFinite(x)
      && Number.isFinite(y)
    ));
};
|
68 |
+
|
69 |
+
/**
 * Decorate annotation seeds with the visual attributes used on the plot.
 *
 * @param {Array<Object>} annotations - Annotations carrying at least a `label`.
 * @returns {Array<Object>} New objects (inputs are not modified) with arrow,
 *   font, background colour and padding settings applied on top of the
 *   original fields.
 */
const addStylingToAnnotations = (annotations) => {
  const decorate = (annotation) => {
    const styled = { ...annotation };
    styled.showarrow = false;
    styled.font = { size: 14, color: 'black', weight: 'bold' };
    // Background follows the cluster colour, slightly translucent.
    styled.bgcolor = getColor(annotation.label, 0.9);
    // Padding around the text.
    styled.borderpad = 2;
    return styled;
  };
  return annotations.map((annotation) => decorate(annotation));
};
|
84 |
+
|
85 |
+
/**
 * Pick the annotations to show for a rectangular view.
 *
 * Keeps the annotations whose position lies inside the inclusive rectangle
 * [x0, x1] x [y0, y1], orders them by ascending `ord` and returns the first
 * `k`. Annotations without a usable numeric `ord` are ranked last instead of
 * feeding NaN into the comparator, which would make the ordering unreliable.
 *
 * @param {Array<{x: number, y: number, ord?: number}>} annotations
 * @param {number} x0 - Left edge of the view.
 * @param {number} x1 - Right edge of the view.
 * @param {number} y0 - Bottom edge of the view.
 * @param {number} y1 - Top edge of the view.
 * @param {number} [k=K] - Maximum number of annotations to return.
 * @returns {Array<Object>} New array; the input array is not modified.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  const rankOf = ({ ord }) => (Number.isFinite(ord) ? ord : Infinity);
  const byRank = (a, b) => {
    const rankA = rankOf(a);
    const rankB = rankOf(b);
    // Infinity - Infinity is NaN, so equal ranks are handled explicitly.
    return rankA === rankB ? 0 : rankA - rankB;
  };

  return annotations
    .filter(({ x, y }) => x >= x0 && x <= x1 && y >= y0 && y <= y1)
    .sort(byRank)
    .slice(0, k);
};
|
91 |
+
|
92 |
+
|
93 |
+
/**
 * Compute the bounding box of a set of points.
 *
 * Uses a single pass with running minima/maxima rather than spreading the
 * whole array into `Math.min` / `Math.max`, which can exceed the engine's
 * argument limit on large datasets. Points whose coordinate is not a finite
 * number are ignored for that axis so one bad row cannot turn every bound
 * into NaN.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to measure.
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Min/max per
 *   axis; for empty input the minima are `Infinity` and the maxima `-Infinity`.
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;

  for (const { x, y } of traces) {
    if (Number.isFinite(x)) {
      if (x < x0) {
        x0 = x;
      }
      if (x > x1) {
        x1 = x;
      }
    }
    if (Number.isFinite(y)) {
      if (y < y0) {
        y0 = y;
      }
      if (y > y1) {
        y1 = y;
      }
    }
  }

  return { x0, x1, y0, y1 };
};
|
100 |
+
|
101 |
+
/**
 * Load the clustered points from `data/clustering/data.csv`.
 *
 * Each CSV record is converted to numbers where appropriate. Records whose
 * `X` or `Y` cannot be parsed as a finite number (for example a blank record
 * produced by a trailing newline) are dropped because they cannot be plotted.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number,
 *   label: number, text: string}>>}
 */
const readData = async () => {
  const rows = await readCSV('data/clustering/data.csv');
  return rows
    .map((row) => ({
      x: Number.parseFloat(row.X),
      y: Number.parseFloat(row.Y),
      eduScore: Number.parseFloat(row.edu_labels),
      label: Number.parseInt(row.cluster_labels, 10),
      text: row.content_display,
    }))
    .filter(({ x, y }) => Number.isFinite(x) && Number.isFinite(y));
};
|
110 |
+
|
111 |
+
/**
 * Render the interactive cluster scatter plot into the `#clusters-plot` element.
 *
 * Loads the points and the per-cluster summaries, draws one marker per point
 * coloured by cluster, and overlays text annotations for the clusters that
 * are inside the current view (at most K of them, ranked by cluster size).
 * The annotations are recomputed whenever the user zooms or resets the view,
 * and the plot width follows the container on window resize.
 *
 * Does nothing when the container element is not present on the page.
 *
 * @returns {Promise<void>}
 */
async function plotClusters() {
  const parent = document.getElementById('clusters-plot');
  if (parent === null) {
    return;
  }

  // The two CSV files are independent, so fetch them in parallel.
  const [data, clusterSummaries] = await Promise.all([
    readData(),
    parseAnnotations('data/clustering/info.csv'),
  ]);

  const traces = [{
    type: 'scatter',
    mode: 'markers',
    x: data.map((row) => row.x),
    y: data.map((row) => row.y),
    marker: {
      color: data.map((row) => getColor(row.label, 1.0)),
      size: 5,
      // Plotly only accepts opacities in [0, 1]; the previous value of 8 was
      // out of range.
      opacity: 0.8,
    },
    hoverinfo: 'text',
    hovertext: data.map((row) => getLabelHoverFormat(row)),
    hoverlabel: {
      bgcolor: 'white',
    },
  }];

  // Rank clusters by number of points so the largest ones are labelled first.
  const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
  const annotations = clusterSummaries.map((annot) => ({
    ...annot,
    ord: labelOrder[annot.label],
  }));

  const fullExtent = getMinMaxTracesArea(data);
  const annotationsFor = ({ x0, x1, y0, y1 }) => addStylingToAnnotations(
    getRelevantAnnotations(annotations, x0, x1, y0, y1)
  );

  const layout = {
    height: 550,
    width: parent.clientWidth,
    xaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
      title: {
        // TODO: fill in the clustering and labelling methods in the caption.
        text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
        font: {
          size: 16,
          style: 'italic',
        },
      },
    },
    yaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
    },
    annotations: annotationsFor(fullExtent),
    font: {
      family: "apple-system, Arial, sans-serif",
    },
    margin: {
      t: 0,
      b: 30,
    },
  };

  await Plotly.newPlot(parent, traces, layout);

  // Last known visible rectangle. A relayout event may only mention one axis
  // (e.g. dragging a single axis), so the other axis is taken from here.
  const view = { ...fullExtent };

  parent.on('plotly_relayout', (eventdata) => {
    let viewChanged = false;
    for (const axis of ['x', 'y']) {
      const start = eventdata[`${axis}axis.range[0]`];
      const end = eventdata[`${axis}axis.range[1]`];
      if (start !== undefined && end !== undefined) {
        // Zoom or pan on this axis.
        view[`${axis}0`] = Math.min(start, end);
        view[`${axis}1`] = Math.max(start, end);
        viewChanged = true;
      } else if (eventdata[`${axis}axis.autorange`]) {
        // Zoom reset on this axis.
        view[`${axis}0`] = fullExtent[`${axis}0`];
        view[`${axis}1`] = fullExtent[`${axis}1`];
        viewChanged = true;
      }
    }
    // Any other relayout (including the one issued below) leaves the view as is.
    if (viewChanged) {
      // Only the annotations are updated: re-sending the initial layout would
      // reset the width applied by the resize handler.
      Plotly.relayout(parent, { annotations: annotationsFor(view) });
    }
  });

  window.addEventListener("resize", () => {
    // If the window size is smaller than 768, we don't care as it's not shown
    if (window.innerWidth < 768) {
      return;
    }
    Plotly.relayout(parent, {
      width: parent.offsetWidth,
    });
  });
}
|
202 |
+
|
203 |
+
// Draw the cluster plot once the document has been parsed. plotClusters is
// async, so its rejection is handled here instead of being left floating.
document.addEventListener("DOMContentLoaded", () => {
  plotClusters().catch((error) => {
    console.error("Failed to render the cluster plot", error);
  });
});
|
206 |
+
|
207 |
+
|
208 |
+
/**
 * Fetch a CSV file and parse it with Papa Parse using the first line as header.
 *
 * @param {string} file - URL of the CSV file.
 * @returns {Promise<Array<Object>>} One object per record, keyed by column name.
 * @throws {Error} When the HTTP response is not successful, so an error page
 *   is never parsed as if it were CSV data.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    throw new Error(`Failed to load ${file}: ${response.status} ${response.statusText}`);
  }
  const text = await response.text();
  // skipEmptyLines keeps a trailing newline from yielding an empty record.
  const { data } = Papa.parse(text, { header: true, skipEmptyLines: true });
  return data;
};
|
214 |
+
|
215 |
+
|
216 |
+
|
217 |
+
/**
 * Map a cluster label to an `rgba(...)` CSS colour from the COLORS palette.
 *
 * Negative labels use their absolute value and labels beyond the palette
 * size wrap around. A label that is not an integer (e.g. NaN from an
 * unparsable CSV cell) gets a neutral grey instead of throwing.
 *
 * @param {number} i - Cluster label.
 * @param {number} opacity - Alpha channel written into the colour.
 * @returns {string} Colour of the form `rgba(r,g,b, opacity)`.
 */
const getColor = (i, opacity) => {
  const index = Math.abs(Number(i));
  if (!Number.isInteger(index)) {
    return `rgba(128,128,128, ${opacity})`;
  }
  return `rgba(${COLORS[index % COLORS.length].join(',')}, ${opacity})`;
};
|
src/plotting.js
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
const TASK_ID_TO_NAME = {
|
2 |
// Ablations
|
3 |
-
|
4 |
"commonsense_qa/acc_norm": "Commonsense QA Norm",
|
5 |
"hellaswag/acc_norm": "HellaSwag",
|
6 |
"openbookqa/acc_norm": "OpenBook QA Norm",
|
@@ -11,27 +11,26 @@ const TASK_ID_TO_NAME = {
|
|
11 |
"mmlu/acc_norm": "MMLU",
|
12 |
|
13 |
// Stats
|
14 |
-
|
15 |
};
|
16 |
|
17 |
const DATASET_ID_TO_NAME = {
|
18 |
-
|
19 |
-
|
20 |
"tiiuae_falcon-refinedweb_data": "RefinedWeb",
|
21 |
"red-pajama-v2_jsonl-deduplicated-extract": "RedPajamaV2",
|
22 |
"dolma-sample": "Dolma1.6",
|
23 |
-
|
24 |
};
|
25 |
|
26 |
const DEFAULT_SETTINGS = {
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
},
|
32 |
-
|
33 |
-
}
|
34 |
-
|
35 |
|
36 |
const DEFAULT_LAYOUT = {
|
37 |
title: {
|
@@ -99,44 +98,43 @@ const getAutoRange = (traces) => {
|
|
99 |
let minX = Math.min(...traces.flatMap((trace) => trace.x));
|
100 |
let maxX = Math.max(...traces.flatMap((trace) => trace.x));
|
101 |
return [minX * 0.95, maxX * 1.05];
|
102 |
-
}
|
103 |
|
104 |
const init_ablation_plot = function () {
|
105 |
const plotElements = document.querySelectorAll('[id^="plot-"]');
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
if (slider !== undefined) {
|
126 |
-
slider.addEventListener("input", () => {
|
127 |
-
clearTimeout(timeoutId);
|
128 |
timeoutId = setTimeout(() => {
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
// Shared plot
|
134 |
-
Plotly.newPlot(plot,
|
135 |
|
136 |
async function updatePlot(dropdown, slider) {
|
137 |
const metricName = dropdown.value;
|
138 |
const sliderValue = parseInt(slider?.value ?? 0);
|
139 |
-
const metricData = await fetch(
|
|
|
|
|
140 |
const traces = metricData?.traces?.[metricName] ?? [];
|
141 |
for (const key in metricData?.data ?? []) {
|
142 |
const traceData = metricData.data[key];
|
@@ -161,7 +159,11 @@ const init_ablation_plot = function () {
|
|
161 |
{
|
162 |
width: width,
|
163 |
yaxis: { title: { text: TASK_ID_TO_NAME[metricName] } },
|
164 |
-
xaxis: {
|
|
|
|
|
|
|
|
|
165 |
},
|
166 |
metricData.layout
|
167 |
);
|
@@ -197,7 +199,9 @@ const createAblationPlottingElements = (
|
|
197 |
plotElement.appendChild(plot);
|
198 |
plotElement.appendChild(controls);
|
199 |
|
200 |
-
const metricOptions = Object.keys(indexMapping).filter(
|
|
|
|
|
201 |
// Dropdown
|
202 |
const dropdownLabel = document.createElement("label");
|
203 |
dropdownLabel.textContent = "Metric:";
|
@@ -216,7 +220,7 @@ const createAblationPlottingElements = (
|
|
216 |
dropdownContainer.appendChild(dropdown);
|
217 |
controls.appendChild(dropdownContainer);
|
218 |
|
219 |
-
let slider = undefined
|
220 |
if (settings.slider !== null) {
|
221 |
const sliderLabel = document.createElement("label");
|
222 |
sliderLabel.textContent = "Rolling window:";
|
|
|
1 |
const TASK_ID_TO_NAME = {
|
2 |
// Ablations
|
3 |
+
agg_score: "Aggregate Score",
|
4 |
"commonsense_qa/acc_norm": "Commonsense QA Norm",
|
5 |
"hellaswag/acc_norm": "HellaSwag",
|
6 |
"openbookqa/acc_norm": "OpenBook QA Norm",
|
|
|
11 |
"mmlu/acc_norm": "MMLU",
|
12 |
|
13 |
// Stats
|
14 |
+
ccnet: "CCNet",
|
15 |
};
|
16 |
|
17 |
const DATASET_ID_TO_NAME = {
|
18 |
+
pii_removed: "Fineweb",
|
19 |
+
allenai_c4_en: "C4",
|
20 |
"tiiuae_falcon-refinedweb_data": "RefinedWeb",
|
21 |
"red-pajama-v2_jsonl-deduplicated-extract": "RedPajamaV2",
|
22 |
"dolma-sample": "Dolma1.6",
|
23 |
+
dedup_minhash_independent_output: "Individual Dedup MinHash",
|
24 |
};
|
25 |
|
26 |
const DEFAULT_SETTINGS = {
|
27 |
+
slider: {
|
28 |
+
max: 30,
|
29 |
+
min: 0,
|
30 |
+
default: 0,
|
31 |
},
|
32 |
+
defaultMetric: "agg_score",
|
33 |
+
};
|
|
|
34 |
|
35 |
const DEFAULT_LAYOUT = {
|
36 |
title: {
|
|
|
98 |
let minX = Math.min(...traces.flatMap((trace) => trace.x));
|
99 |
let maxX = Math.max(...traces.flatMap((trace) => trace.x));
|
100 |
return [minX * 0.95, maxX * 1.05];
|
101 |
+
};
|
102 |
|
103 |
const init_ablation_plot = function () {
|
104 |
const plotElements = document.querySelectorAll('[id^="plot-"]');
|
105 |
+
plotElements.forEach(async (plotElement) => {
|
106 |
+
const plotName = plotElement.id.replace("plot-", "");
|
107 |
+
const indexData = await fetch(`data/plots/${plotName}/index.json`).then(
|
108 |
+
(response) => response.json()
|
109 |
+
);
|
110 |
+
const settings = _.merge({}, DEFAULT_SETTINGS, indexData.settings);
|
111 |
+
const indexMapping = indexData.files;
|
112 |
+
const { dropdown, slider, plot } = createAblationPlottingElements(
|
113 |
+
plotElement,
|
114 |
+
indexMapping,
|
115 |
+
settings
|
116 |
+
);
|
117 |
+
plot.id = `graph-${plotName}`;
|
118 |
+
dropdown.addEventListener("change", () => updatePlot(dropdown, slider));
|
119 |
+
let timeoutId;
|
120 |
+
// Debounce the slider
|
121 |
+
if (slider !== undefined) {
|
122 |
+
slider.addEventListener("input", () => {
|
123 |
+
clearTimeout(timeoutId);
|
|
|
|
|
|
|
124 |
timeoutId = setTimeout(() => {
|
125 |
+
updatePlot(dropdown, slider);
|
126 |
+
}, 500);
|
127 |
+
});
|
128 |
+
}
|
129 |
// Shared plot
|
130 |
+
Plotly.newPlot(plot, []);
|
131 |
|
132 |
async function updatePlot(dropdown, slider) {
|
133 |
const metricName = dropdown.value;
|
134 |
const sliderValue = parseInt(slider?.value ?? 0);
|
135 |
+
const metricData = await fetch(
|
136 |
+
`data/plots/${plotName}/${indexMapping[metricName]["file"]}`
|
137 |
+
).then((response) => response.json());
|
138 |
const traces = metricData?.traces?.[metricName] ?? [];
|
139 |
for (const key in metricData?.data ?? []) {
|
140 |
const traceData = metricData.data[key];
|
|
|
159 |
{
|
160 |
width: width,
|
161 |
yaxis: { title: { text: TASK_ID_TO_NAME[metricName] } },
|
162 |
+
xaxis: {
|
163 |
+
range: settings.autoSetXRange
|
164 |
+
? getAutoRange(traces)
|
165 |
+
: undefined,
|
166 |
+
},
|
167 |
},
|
168 |
metricData.layout
|
169 |
);
|
|
|
199 |
plotElement.appendChild(plot);
|
200 |
plotElement.appendChild(controls);
|
201 |
|
202 |
+
const metricOptions = Object.keys(indexMapping).filter(
|
203 |
+
(metric) => metric in TASK_ID_TO_NAME
|
204 |
+
);
|
205 |
// Dropdown
|
206 |
const dropdownLabel = document.createElement("label");
|
207 |
dropdownLabel.textContent = "Metric:";
|
|
|
220 |
dropdownContainer.appendChild(dropdown);
|
221 |
controls.appendChild(dropdownContainer);
|
222 |
|
223 |
+
let slider = undefined;
|
224 |
if (settings.slider !== null) {
|
225 |
const sliderLabel = document.createElement("label");
|
226 |
sliderLabel.textContent = "Rolling window:";
|