// Qualitative palette used to color clusters (per the original note, taken
// from Plotly's "Dark24" color scale). Each entry is an [R, G, B] triple of
// decimal strings; getColor() joins a triple with commas to build an
// `rgba(...)` CSS color and picks the entry by cluster label modulo the
// palette length.
const COLORS = [
['46', '145', '229'],
['225', '95', '153'],
['28', '167', '28'],
['251', '13', '13'],
['218', '22', '255'],
// Disabled: ['34', '42', '42'] is near-black, which makes the black
// annotation text drawn on top of it unreadable.
// ['34', '42', '42'], Black, which makes the text unreadable
['182', '129', '0'],
['117', '13', '134'],
['235', '102', '59'],
['81', '28', '251'],
['0', '160', '139'],
['251', '0', '209'],
['252', '0', '128'],
['178', '130', '141'],
['108', '124', '50'],
['119', '138', '174'],
['134', '42', '22'],
['167', '119', '241'],
['98', '0', '66'],
['22', '22', '167'],
['218', '96', '202'],
['108', '69', '22'],
// Disabled for the same reason (too dark to serve as a text background).
// ['13', '42', '99'], Black
['175', '0', '56']
]
/**
 * Builds the hover tooltip text for a single data point.
 *
 * The result is handed to Plotly as `hovertext`. Plotly text uses a small
 * pseudo-HTML subset in which a line break is written as `<br>`, so the two
 * fields are separated with `<br>` instead of a raw newline character
 * (NOTE(review): a raw newline appears to be collapsed to a space by Plotly —
 * confirm against the Plotly version in use).
 *
 * @param {{text: string, eduScore: number}} row - Data point; only `text`
 *   and `eduScore` are read.
 * @returns {string} Tooltip with the sample text on the first line and its
 *   edu label on the second.
 */
const getLabelHoverFormat = (row) => {
  return `Text: ${row.text}<br>Edu label: ${row.eduScore}`;
};
// Maximum number of cluster annotations displayed at once; used as the
// default value of the `k` parameter of getRelevantAnnotations().
const K = 15
/**
 * Ranks labels by how often they occur.
 *
 * @param {Array<string|number>} labels - One label per data point.
 * @returns {Object<string, number>} Map from label (as an object key) to its
 *   rank, where 0 is the most frequent label. Labels with equal counts keep
 *   the key order of the intermediate tally object.
 */
function createLabelOrderMapping(labels) {
  // Tally occurrences per label in a plain object keyed by the label.
  const tally = {};
  for (const name of labels) {
    const seenSoFar = tally[name];
    tally[name] = seenSoFar ? seenSoFar + 1 : 1;
  }

  // Most frequent first; the sort is stable so ties keep the key order.
  const byFrequency = Object.keys(tally).sort((left, right) => tally[right] - tally[left]);

  const rankByLabel = {};
  for (let rank = 0; rank < byFrequency.length; rank += 1) {
    rankByLabel[byFrequency[rank]] = rank;
  }
  return rankByLabel;
}
/**
 * Loads the per-cluster summary CSV and converts every usable row into a
 * bare annotation record.
 *
 * Rows are skipped when their `cluster_id` is the sentinel -1 (presumably the
 * "unclustered / noise" bucket — confirm against the data pipeline) or when it
 * does not parse as a base-10 integer at all. The latter covers blank or
 * malformed records, which would otherwise yield `label: NaN` and later make
 * the color lookup by label fail.
 *
 * @param {string} file - URL/path of the CSV, passed straight to `readCSV`.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 *   One record per kept row: position, cluster id and summary text.
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  const annotations = [];
  for (const row of rows) {
    // Explicit radix so the id is always read as decimal.
    const label = Number.parseInt(row.cluster_id, 10);
    if (Number.isNaN(label) || label === -1) {
      continue;
    }
    annotations.push({
      x: Number.parseFloat(row.cluster_position_x),
      y: Number.parseFloat(row.cluster_position_y),
      label,
      text: row.cluster_summaries,
    });
  }
  return annotations;
};
/**
 * Returns styled copies of the given annotations, ready to be used as Plotly
 * layout annotations. The input objects are not modified.
 *
 * @param {Array<Object>} annotations - Annotation records; `label` is read to
 *   pick the background color.
 * @returns {Array<Object>} New objects carrying the original fields plus
 *   `showarrow`, `font`, `bgcolor` and `borderpad`.
 */
const addStylingToAnnotations = (annotations) => {
  const decorate = (annotation) => {
    const styled = { ...annotation };
    styled.showarrow = false;
    // A fresh font object per annotation, so no two annotations share one.
    styled.font = { size: 14, color: 'black', weight: 'bold' };
    // Background takes the cluster color, slightly translucent.
    styled.bgcolor = getColor(annotation.label, 0.9);
    // Padding around the text.
    styled.borderpad = 2;
    return styled;
  };
  return annotations.map((annotation) => decorate(annotation));
};
/**
 * Selects the annotations to display for a rectangular viewport.
 *
 * Annotations inside the rectangle (bounds inclusive) are ordered by their
 * `ord` rank, lowest first, and the first `k` are returned. An annotation
 * without a usable numeric `ord` is ranked last instead of producing a NaN
 * comparison, which would leave the resulting order implementation-defined.
 * The input array is not modified.
 *
 * @param {Array<{x: number, y: number, ord?: number}>} annotations - Candidates.
 * @param {number} x0 - Left edge of the viewport.
 * @param {number} x1 - Right edge of the viewport.
 * @param {number} y0 - Bottom edge of the viewport.
 * @param {number} y1 - Top edge of the viewport.
 * @param {number} [k=K] - Maximum number of annotations to return.
 * @returns {Array<Object>} At most `k` annotations, best rank first.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  // Missing / non-numeric ranks sort after every real rank.
  const rankOf = (annotation) => {
    const rank = Number(annotation.ord);
    return Number.isNaN(rank) ? Infinity : rank;
  };

  return annotations
    // filter() returns a new array, so the in-place sort below never touches the input.
    .filter(({ x, y }) => x >= x0 && x <= x1 && y >= y0 && y <= y1)
    .sort((a, b) => {
      const rankA = rankOf(a);
      const rankB = rankOf(b);
      // Explicit comparison: Infinity - Infinity would be NaN.
      if (rankA === rankB) {
        return 0;
      }
      return rankA < rankB ? -1 : 1;
    })
    .slice(0, k);
};
/**
 * Computes the axis-aligned bounding box of a list of points.
 *
 * Done in a single pass with running minima/maxima rather than
 * `Math.min(...values)`: spreading a large array into a call passes every
 * element as an argument and throws a RangeError once the dataset is big
 * enough. Coordinates that are NaN never win a comparison and are therefore
 * ignored instead of turning the whole bound into NaN.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to enclose.
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Min/max of x
 *   and y. For an empty list the minima are `Infinity` and the maxima
 *   `-Infinity` (the same values `Math.min()` / `Math.max()` give with no
 *   arguments).
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;
  for (const point of traces) {
    // Number() mirrors the coercion Math.min/Math.max apply to their arguments.
    const x = Number(point.x);
    const y = Number(point.y);
    if (x < x0) x0 = x;
    if (x > x1) x1 = x;
    if (y < y0) y0 = y;
    if (y > y1) y1 = y;
  }
  return { x0, x1, y0, y1 };
};
/**
 * Loads the clustered data points from `data/clustering/data.csv`.
 *
 * Rows whose coordinates or cluster label do not parse as numbers are
 * dropped. Such rows cannot be plotted, and a NaN label would make the color
 * lookup fail because `getColor` uses the label to index the palette. An
 * unparseable `eduScore` is kept as NaN since it is only displayed.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number, label: number, text: string}>>}
 *   One record per usable CSV row.
 */
const readData = async () => {
  const rows = await readCSV('data/clustering/data.csv');
  const points = [];
  for (const row of rows) {
    const point = {
      x: Number.parseFloat(row.X),
      y: Number.parseFloat(row.Y),
      eduScore: Number.parseFloat(row.edu_labels),
      // Explicit radix so the label is always read as decimal.
      label: Number.parseInt(row.cluster_labels, 10),
      text: row.content_display,
    };
    if (Number.isNaN(point.x) || Number.isNaN(point.y) || Number.isNaN(point.label)) {
      continue;
    }
    points.push(point);
  }
  return points;
};
/**
 * Appends the static cluster screenshot (`plots/clusters.png`) to `parent`.
 *
 * @param {Element} parent - Container that receives the new `<img>` element.
 * @returns {void}
 */
const showImage = (parent) => {
  const placeholder = Object.assign(document.createElement('img'), {
    src: 'plots/clusters.png',
    alt: 'Clusters',
  });
  parent.appendChild(placeholder);
};
/**
 * Removes the first `<img>` found inside `parent`, if there is one.
 *
 * `querySelector` returns null when nothing matches, so the removal is
 * guarded: a container without an image is left untouched instead of
 * throwing.
 *
 * @param {Element} parent - Container to search.
 * @returns {void}
 */
const destroyImage = (parent) => {
  const img = parent.querySelector('img');
  if (img) {
    img.remove();
  }
};
/**
 * Renders the interactive cluster scatter plot into the `#clusters-plot`
 * element.
 *
 * Steps: show a static screenshot as a placeholder, load the points and the
 * cluster summaries, draw the Plotly scatter plot with the highest-ranked
 * annotations for the full data extent, then keep the annotations in sync
 * with the viewport on zoom / reset and the plot width in sync with the
 * window on resize.
 *
 * @returns {Promise<void>} Resolves once the plot is drawn and the listeners
 *   are registered; rejects if loading the data fails.
 */
async function plotClusters() {
  const parent = document.getElementById('clusters-plot');
  // Show a static, non-interactive screenshot right away so the page is not
  // empty while the CSV files download; it is replaced by the real plot below.
  showImage(parent);

  // The two CSV files are independent, so fetch them concurrently.
  const [data, rawAnnotations] = await Promise.all([
    readData(),
    parseAnnotations('data/clustering/info.csv'),
  ]);

  const traces = [{
    type: 'scatter',
    mode: 'markers',
    x: data.map((row) => row.x),
    y: data.map((row) => row.y),
    marker: {
      color: data.map((row) => getColor(row.label, 1.0)),
      size: 5,
      // Plotly documents marker.opacity as a number between 0 and 1; the
      // previous value of 8 was out of range.
      // NOTE(review): if translucent markers were intended this should
      // probably be 0.8 — confirm.
      opacity: 1,
    },
    hoverinfo: 'text',
    hovertext: data.map((row) => getLabelHoverFormat(row)),
    hoverlabel: {
      bgcolor: 'white',
    },
  }];

  // Rank clusters by size; annotations of bigger clusters are shown first.
  const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
  const annotations = rawAnnotations.map((annot) => ({
    ...annot,
    ord: labelOrder[annot.label],
  }));

  const dataArea = getMinMaxTracesArea(data);
  // Styled annotations to display for a given viewport.
  const annotationsFor = (area) => addStylingToAnnotations(
    getRelevantAnnotations(annotations, area.x0, area.x1, area.y0, area.y1),
  );

  const layout = {
    height: 550,
    width: parent.clientWidth,
    xaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
      title: {
        // Plotly text uses `<br>` for line breaks; a string literal cannot
        // span two source lines.
        text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br>zoom in to see more",
        font: {
          size: 16,
          style: 'italic',
        },
      },
    },
    yaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
    },
    annotations: annotationsFor(dataArea),
    font: {
      family: "apple-system, Arial, sans-serif",
    },
    margin: {
      t: 0,
      b: 30,
    },
  };

  destroyImage(parent);
  Plotly.newPlot(parent, traces, layout);

  // Last known viewport. A relayout event may mention only one axis (e.g. a
  // purely horizontal zoom), in which case the other axis keeps its bounds.
  const view = { ...dataArea };
  parent.on('plotly_relayout', (eventdata) => {
    let viewChanged = false;
    for (const axis of ['x', 'y']) {
      const low = eventdata[`${axis}axis.range[0]`];
      const high = eventdata[`${axis}axis.range[1]`];
      if (low !== undefined && high !== undefined) {
        // Zoom / pan. Compared against undefined because a bound of exactly
        // 0 is a valid range value.
        view[`${axis}0`] = low;
        view[`${axis}1`] = high;
        viewChanged = true;
      } else if (eventdata[`${axis}axis.autorange`]) {
        // Zoom reset: fall back to the full data extent. The global cluster
        // ordering is reused rather than recomputed for the viewport.
        view[`${axis}0`] = dataArea[`${axis}0`];
        view[`${axis}1`] = dataArea[`${axis}1`];
        viewChanged = true;
      }
    }
    // Any other relayout (including the annotation update issued below)
    // does not move the viewport and needs no work.
    if (!viewChanged) {
      return;
    }
    // Only the annotations change, so only they are sent to Plotly.
    Plotly.relayout(parent, { annotations: annotationsFor(view) });
  });

  window.addEventListener("resize", () => {
    // Below 768px the plot is not shown, so there is nothing to resize.
    if (window.innerWidth < 768) {
      return;
    }
    Plotly.relayout(parent, {
      width: parent.offsetWidth,
    });
  });
}
// Start rendering once the document has been parsed, so that the
// `#clusters-plot` container looked up by plotClusters() exists.
document.addEventListener("DOMContentLoaded", function onDomReady() {
  plotClusters();
});
/**
 * Downloads a CSV file and parses it with Papa Parse, using the first line as
 * the header.
 *
 * A non-2xx HTTP response is reported as an error; otherwise the body of an
 * error page would be parsed as if it were CSV. Completely empty lines (such
 * as the one produced by a trailing newline) are skipped so they do not turn
 * into records with missing fields.
 *
 * @param {string} file - URL/path of the CSV file.
 * @returns {Promise<Array<Object<string, string>>>} One object per data row,
 *   keyed by column name.
 * @throws {Error} If the HTTP response status is not OK.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    throw new Error(`Failed to load ${file}: ${response.status} ${response.statusText}`);
  }
  const text = await response.text();
  const { data } = Papa.parse(text, { header: true, skipEmptyLines: true });
  return data;
};
/**
 * Maps a cluster label to an `rgba(...)` color string from the COLORS palette.
 *
 * The sign of the label is ignored and the palette wraps around, so labels
 * that differ only in sign, or by a multiple of the palette length, share a
 * color.
 *
 * @param {number} i - Cluster label.
 * @param {number} opacity - Alpha component placed in the color string.
 * @returns {string} Color in the form `rgba(R,G,B, opacity)`.
 */
const getColor = (i, opacity) => {
  const paletteIndex = Math.abs(i) % COLORS.length;
  const channels = COLORS[paletteIndex].join(',');
  return `rgba(${channels}, ${opacity})`;
};