// plotly Dark24
const COLORS = [
  ['46', '145', '229'],
  ['225', '95', '153'],
  ['28', '167', '28'],
  ['251', '13', '13'],
  ['218', '22', '255'],
  // ['34', '42', '42'], Black, which makes the text unreadable
  ['182', '129', '0'],
  ['117', '13', '134'],
  ['235', '102', '59'],
  ['81', '28', '251'],
  ['0', '160', '139'],
  ['251', '0', '209'],
  ['252', '0', '128'],
  ['178', '130', '141'],
  ['108', '124', '50'],
  ['119', '138', '174'],
  ['134', '42', '22'],
  ['167', '119', '241'],
  ['98', '0', '66'],
  ['22', '22', '167'],
  ['218', '96', '202'],
  ['108', '69', '22'],
  // ['13', '42', '99'], Black
  ['175', '0', '56'],
];

/**
 * Builds the hover text for one data point.
 *
 * Plotly renders `<br>` as a line break; a raw newline would not produce one.
 *
 * @param {{text: string, eduScore: number}} row - Parsed data row.
 * @returns {string} Two-line hover label.
 */
const getLabelHoverFormat = (row) => {
  return `Text: ${row.text}<br>Edu label: ${row.eduScore}`;
};

// Number of annotations to display
const K = 15;

/**
 * Ranks labels by how often they occur (most frequent first).
 *
 * @param {Array<number|string>} labels - One label per data point.
 * @returns {Object<string, number>} Map from label to its rank (0 = most frequent).
 */
function createLabelOrderMapping(labels) {
  const labelCounts = labels.reduce((acc, label) => {
    acc[label] = (acc[label] || 0) + 1;
    return acc;
  }, {});

  const labelOrder = {};
  Object.entries(labelCounts)
    .sort((a, b) => b[1] - a[1])
    .forEach(([label], index) => {
      labelOrder[label] = index;
    });
  return labelOrder;
}

/**
 * Loads the cluster summaries CSV and converts each row into an annotation.
 * Rows with cluster id -1, or with an id that is not an integer, are dropped.
 *
 * @param {string} file - URL of the cluster summary CSV.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  return rows
    .map((clusterSummary) => ({
      x: Number.parseFloat(clusterSummary.cluster_position_x),
      y: Number.parseFloat(clusterSummary.cluster_position_y),
      label: Number.parseInt(clusterSummary.cluster_id, 10),
      text: clusterSummary.cluster_summaries,
    }))
    .filter((annotation) => Number.isInteger(annotation.label) && annotation.label !== -1);
};

/**
 * Adds the Plotly styling properties to every annotation.
 *
 * @param {Array<Object>} annotations - Annotations carrying at least `label`.
 * @returns {Array<Object>} New annotation objects; the inputs are not mutated.
 */
const addStylingToAnnotations = (annotations) => {
  return annotations.map((annotation) => ({
    ...annotation,
    showarrow: false,
    font: {
      size: 14,
      color: 'black',
      weight: 'bold',
    },
    bgcolor: getColor(annotation.label, 0.9),
    borderpad: 2, // Add padding around the text
  }));
};

/**
 * Selects the annotations inside a rectangle, ordered by ascending `ord`,
 * and keeps at most `k` of them.
 *
 * @param {Array<Object>} annotations - Annotations with `x`, `y` and optional `ord`.
 * @param {number} x0 - Left bound (inclusive).
 * @param {number} x1 - Right bound (inclusive).
 * @param {number} y0 - Lower bound (inclusive).
 * @param {number} y1 - Upper bound (inclusive).
 * @param {number} [k=K] - Maximum number of annotations returned.
 * @returns {Array<Object>} At most `k` annotations.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  // A missing rank sorts last; subtracting undefined would give NaN and an
  // inconsistent comparator.
  const rankOf = (annotation) =>
    Number.isFinite(annotation.ord) ? annotation.ord : Number.MAX_SAFE_INTEGER;

  return annotations
    .filter((annotation) => {
      return annotation.x >= x0 && annotation.x <= x1 && annotation.y >= y0 && annotation.y <= y1;
    })
    .sort((a, b) => rankOf(a) - rankOf(b))
    .slice(0, k);
};

/**
 * Computes the bounding box of a set of points.
 *
 * Done in a single pass rather than `Math.min(...values)`, because spreading a
 * large array into a call can exceed the engine's argument limit. Non-finite
 * coordinates are skipped so one bad row cannot turn the bounds into NaN.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to measure.
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Bounds; for empty
 *   input the minima are Infinity and the maxima are -Infinity.
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;
  for (const { x, y } of traces) {
    if (Number.isFinite(x)) {
      x0 = Math.min(x0, x);
      x1 = Math.max(x1, x);
    }
    if (Number.isFinite(y)) {
      y0 = Math.min(y0, y);
      y1 = Math.max(y1, y);
    }
  }
  return { x0, x1, y0, y1 };
};

/**
 * Loads the clustering data points.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number, label: number, text: string}>>}
 */
const readData = async () => {
  const rows = await readCSV('data/clustering/data.csv');
  return rows.map((row) => ({
    x: Number.parseFloat(row.X),
    y: Number.parseFloat(row.Y),
    eduScore: Number.parseFloat(row.edu_labels),
    label: Number.parseInt(row.cluster_labels, 10),
    text: row.content_display,
  }));
};

/**
 * Appends the static placeholder image of the clusters to `parent`.
 *
 * @param {HTMLElement} parent - Container element.
 */
const showImage = (parent) => {
  // create the image
  const img = document.createElement('img');
  img.src = 'plots/clusters.png';
  img.alt = 'Clusters';
  parent.appendChild(img);
};

/**
 * Removes the first `<img>` found inside `parent`, if there is one.
 *
 * @param {HTMLElement} parent - Container element.
 */
const destroyImage = (parent) => {
  const img = parent.querySelector('img');
  if (img) {
    img.remove();
  }
};

/**
 * Resolves the visible range of one axis after a `plotly_relayout` event.
 *
 * Order of preference: the range carried by the event, the data extent when the
 * event turns autorange on, the range currently stored on the layout, and
 * finally the data extent.
 *
 * @param {Object} eventdata - Relayout event payload.
 * @param {string} axis - Axis key, `'xaxis'` or `'yaxis'`.
 * @param {Array<number>|undefined} currentRange - Range currently on the layout, if any.
 * @param {number} dataMin - Smallest data coordinate on this axis.
 * @param {number} dataMax - Largest data coordinate on this axis.
 * @returns {[number, number]} `[low, high]` bounds.
 */
const resolveAxisRange = (eventdata, axis, currentRange, dataMin, dataMax) => {
  const low = eventdata[`${axis}.range[0]`];
  const high = eventdata[`${axis}.range[1]`];
  if (Number.isFinite(low) && Number.isFinite(high)) {
    return [Math.min(low, high), Math.max(low, high)];
  }
  if (eventdata[`${axis}.autorange`]) {
    return [dataMin, dataMax];
  }
  if (
    Array.isArray(currentRange) &&
    Number.isFinite(currentRange[0]) &&
    Number.isFinite(currentRange[1])
  ) {
    return [Math.min(currentRange[0], currentRange[1]), Math.max(currentRange[0], currentRange[1])];
  }
  return [dataMin, dataMax];
};

/**
 * Converts a relayout event into the rectangle whose annotations should be shown.
 *
 * @param {Object} eventdata - Relayout event payload.
 * @param {Object} layout - Layout object that was handed to Plotly.
 * @param {{x0: number, x1: number, y0: number, y1: number}} dataBounds - Extent of the data.
 * @returns {{x0: number, x1: number, y0: number, y1: number}|null} Viewport, or
 *   `null` when the event changes neither axis range (e.g. a plain resize).
 */
const getViewportFromRelayout = (eventdata, layout, dataBounds) => {
  if (!eventdata) {
    return null;
  }
  // Explicit `!== undefined` so that a range starting at exactly 0 still counts.
  const touchesAxis = (axis) =>
    eventdata[`${axis}.range[0]`] !== undefined || Boolean(eventdata[`${axis}.autorange`]);
  if (!touchesAxis('xaxis') && !touchesAxis('yaxis')) {
    return null;
  }

  const currentX = layout.xaxis ? layout.xaxis.range : undefined;
  const currentY = layout.yaxis ? layout.yaxis.range : undefined;
  const [x0, x1] = resolveAxisRange(eventdata, 'xaxis', currentX, dataBounds.x0, dataBounds.x1);
  const [y0, y1] = resolveAxisRange(eventdata, 'yaxis', currentY, dataBounds.y0, dataBounds.y1);
  return { x0, x1, y0, y1 };
};

/**
 * Renders the interactive cluster scatter plot into `#clusters-plot`.
 *
 * A static image is shown while the CSV files download, then replaced by the
 * Plotly chart. The displayed annotations are recomputed on zoom and zoom reset
 * so that at most K of them are visible at a time.
 *
 * @returns {Promise<void>}
 */
async function plotClusters() {
  const parent = document.getElementById('clusters-plot');
  if (!parent) {
    return;
  }

  // Show a non-interactive image while the data downloads, so the plot area is
  // not empty.
  showImage(parent);

  // The two files are independent, so fetch them concurrently.
  const [data, rawAnnotations] = await Promise.all([
    readData(),
    parseAnnotations('data/clustering/info.csv'),
  ]);

  const traces = [
    {
      type: 'scatter',
      mode: 'markers',
      x: data.map((row) => row.x),
      y: data.map((row) => row.y),
      marker: {
        color: data.map((row) => getColor(row.label, 1.0)),
        size: 5,
        // NOTE(review): Plotly documents marker.opacity as a number in [0, 1];
        // 8 is outside that range. Left unchanged because the intended value
        // is not known - confirm (0.8?).
        opacity: 8,
      },
      hoverinfo: 'text',
      hovertext: data.map((row) => getLabelHoverFormat(row)),
      hoverlabel: {
        bgcolor: 'white',
      },
    },
  ];

  const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
  const annotations = rawAnnotations.map((annot) => ({
    ...annot,
    ord: labelOrder[annot.label],
  }));

  const dataBounds = getMinMaxTracesArea(data);
  // The global label ordering is reused for every viewport instead of being
  // recomputed for the visible points.
  const annotationsFor = ({ x0, x1, y0, y1 }) =>
    addStylingToAnnotations(getRelevantAnnotations(annotations, x0, x1, y0, y1));

  const layout = {
    height: 550,
    width: parent.clientWidth,
    xaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
      title: {
        text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br>zoom in to see more",
        font: {
          size: 16,
          style: 'italic',
        },
      },
    },
    yaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
    },
    annotations: annotationsFor(dataBounds),
    font: {
      family: "apple-system, Arial, sans-serif",
    },
    margin: {
      t: 0,
      b: 30,
    },
  };

  destroyImage(parent);
  Plotly.newPlot(parent, traces, layout);

  parent.on('plotly_relayout', (eventdata) => {
    // Zoom on either axis, or zoom reset; anything else is just the relayout itself.
    const viewport = getViewportFromRelayout(eventdata, layout, dataBounds);
    if (viewport === null) {
      return;
    }
    Plotly.relayout(parent, { ...layout, annotations: annotationsFor(viewport) });
  });

  window.addEventListener("resize", () => {
    // If the window size is smaller than 768, we don't care as it's not shown
    if (window.innerWidth < 768) {
      return;
    }
    Plotly.relayout(parent, {
      width: parent.offsetWidth,
    });
  });
}

// Guarded so the module can also be loaded where there is no DOM.
if (typeof document !== 'undefined') {
  document.addEventListener("DOMContentLoaded", () => {
    plotClusters().catch((err) => {
      console.error('Failed to render the clusters plot', err);
    });
  });
}

/**
 * Downloads a CSV file and parses it with PapaParse using the header row as keys.
 *
 * @param {string} file - URL of the CSV file.
 * @returns {Promise<Array<Object<string, string>>>} One object per CSV row.
 * @throws {Error} When the HTTP response status is not OK.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${file}: HTTP ${response.status}`);
  }
  const text = await response.text();
  // skipEmptyLines avoids a phantom all-empty row for a trailing newline.
  const csv = Papa.parse(text, { header: true, skipEmptyLines: true });
  return csv.data;
};

/**
 * Maps a cluster label to an `rgba(...)` colour from the palette.
 *
 * Negative labels use their absolute value and labels wrap around the palette.
 * A label that is not an integer (e.g. NaN from an unparsable CSV cell) falls
 * back to the first palette entry instead of throwing.
 *
 * @param {number} i - Cluster label.
 * @param {number} opacity - Alpha channel written into the colour string.
 * @returns {string} CSS colour such as `rgba(46,145,229, 0.9)`.
 */
const getColor = (i, opacity) => {
  const index = Number.isInteger(i) ? Math.abs(i) % COLORS.length : 0;
  return `rgba(${COLORS[index].join(',')}, ${opacity})`;
};