// plotly Dark24 const COLORS = [ ["46", "145", "229"], ["225", "95", "153"], ["28", "167", "28"], ["251", "13", "13"], ["218", "22", "255"], // ['34', '42', '42'], Black, which makes the text unreadable ["182", "129", "0"], ["117", "13", "134"], ["235", "102", "59"], ["81", "28", "251"], ["0", "160", "139"], ["251", "0", "209"], ["252", "0", "128"], ["178", "130", "141"], ["108", "124", "50"], ["119", "138", "174"], ["134", "42", "22"], ["167", "119", "241"], ["98", "0", "66"], ["22", "22", "167"], ["218", "96", "202"], ["108", "69", "22"], // ['13', '42', '99'], Black ["175", "0", "56"], ]; const BASE_SIZE = 5; const getLabelHoverFormat = (row, ) => { return `Text: ${row.text}
Label: ${row.label}
Edu label: ${row.eduScore}`; }; // Number of annotations to display const K = 15; function createLabelOrderMapping(labels) { const labelCounts = labels.reduce((acc, label) => { acc[label] = (acc[label] || 0) + 1; return acc; }, {}); const sortedLabels = Object.entries(labelCounts) .sort((a, b) => b[1] - a[1]) .map((entry) => entry[0]); const labelOrder = {}; sortedLabels.forEach((label, index) => { labelOrder[label] = index; }); return labelOrder; } const parseAnnotations = async (file) => { return (await readCSV(file)) .filter((cluster_summary) => { return parseInt(cluster_summary.cluster_id) != -1; }) .map((cluster_summary) => { return { x: parseFloat(cluster_summary.cluster_position_x), y: parseFloat(cluster_summary.cluster_position_y), label: parseInt(cluster_summary.cluster_id), text: cluster_summary.cluster_summaries, }; }); }; const addStylingToAnnotations = (annotations) => { return annotations.map((annotation) => { return { showarrow: false, font: { size: 14, color: "black", weight: "bold", }, bgcolor: getColor(annotation.label, 0.9), borderpad: 2, // Add padding around the text ...annotation, }; }); }; const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => { const relevant_annotations = annotations.filter((annotation) => { return ( annotation.x >= x0 && annotation.x <= x1 && annotation.y >= y0 && annotation.y <= y1 ); }); return relevant_annotations.sort((a, b) => a.ord - b.ord).slice(0, k); }; const getMinMaxTracesArea = (traces) => { const x0 = Math.min(...traces.map((trace) => trace.x)); const x1 = Math.max(...traces.map((trace) => trace.x)); const y0 = Math.min(...traces.map((trace) => trace.y)); const y1 = Math.max(...traces.map((trace) => trace.y)); return { x0, x1, y0, y1 }; }; const readData = async () => { return (await readCSV("data/clustering/data.csv")).map((row) => ({ x: parseFloat(row.X), y: parseFloat(row.Y), eduScore: parseFloat(row.edu_labels), label: parseInt(row.cluster_labels), text: row.content_display, })); }; const restyleTraces = (traces, zoomLevel) => { const res = [ { ...traces[0], marker: { ...traces[0].marker, size: BASE_SIZE * zoomLevel, }, }, ]; return res; }; // The cluster is pretty big, so takes time to donwload // In the meantime we put there a placeholder image const destroyPlaceholderImage = (parent) => { const img = parent.querySelector("img"); img.remove(); }; async function plotClusters() { const parent = document.getElementById("clusters-plot"); // We do a little trolling on users and pretend that we already donwloaded the data by simply showing uniteractive image :) const data = await readData(); const labelOrder = createLabelOrderMapping(data.map((row) => row.label)); const annotations = addStylingToAnnotations( await parseAnnotations("data/clustering/info.csv") ).map((annot) => { return { ...annot, ord: labelOrder[annot.label], }; }); const traces = [ { type: "scatter", mode: "markers", x: data.map((row) => row.x), y: data.map((row) => row.y), marker: { color: data.map((row) => getColor(row.label, 1.0)), size: 5, opacity: 8, }, hoverinfo: "text", hovertext: data.map((row) => getLabelHoverFormat(row)), hoverlabel: { bgcolor: "white", }, }, ]; const { x0, x1, y0, y1 } = getMinMaxTracesArea(data); const layout = { height: 550, width: parent.clientWidth, xaxis: { showticklabels: false, showgrid: false, zeroline: false, title: { text: "Fineweb dataset (clustered using TODO and labeled using TODO),
zoom in to see more", font: { size: 16, style: "italic", }, // italic }, }, yaxis: { showticklabels: false, showgrid: false, zeroline: false, }, annotations: getRelevantAnnotations(annotations, x0, x1, y0, y1), font: { family: "apple-system, Arial, sans-serif", }, margin: { t: 0, b: 30, }, }; destroyPlaceholderImage(parent); Plotly.newPlot(parent, traces, layout); parent.on("plotly_relayout", (eventdata) => { // First option zoomed in if (eventdata["xaxis.range[0]"]) { const [newx0, newx1] = [ eventdata["xaxis.range[0]"], eventdata["xaxis.range[1]"], ]; const [newy0, newy1] = [ eventdata["yaxis.range[0]"], eventdata["yaxis.range[1]"], ]; // Idk maybe we can even recompute the ordering, but I think it's fine to use the global one const relevant_annotations = getRelevantAnnotations( annotations, newx0, newx1, newy0, newy1 ); // 1.8 otherwise it's too big const zoomLevel = Math.min( (x1 - x0) / (newx1 - newx0), (y1 - y0) / (newy1 - newy0) ) / 1.8; Plotly.update( parent, { "marker.size": BASE_SIZE * zoomLevel }, { annotations: relevant_annotations }, ); } // Zoom reset else if (eventdata["xaxis.autorange"]) { const relevant_annotations = getRelevantAnnotations( annotations, x0, x1, y0, y1 ); Plotly.update( parent, { "marker.size": BASE_SIZE }, { annotations: relevant_annotations } ); } // Otherwise it's just the relayout itself }); window.addEventListener("resize", () => { // If the window size is smaller than 768, we don't care as it's not shown if (window.innerWidth < 768) { return; } Plotly.relayout(parent, { width: parent.offsetWidth, }); }); } document.addEventListener("DOMContentLoaded", () => { plotClusters(); }); const readCSV = async (file) => { const data = await fetch(file); const text = await data.text(); const csv = Papa.parse(text, { header: true }); return csv.data; }; const getColor = (i, opacity) => { if (i < 0) { i = i * -1; } return `rgba(${COLORS[i % COLORS.length].join(",")}, ${opacity})`; };