"""Generate a sortable, colour-coded HTML leaderboard from benchmark results.

Reads ``benchmark_results.csv`` and ``metadata.json`` from the current
directory, keeps the ``eq-bench_v2_pl`` / ``eq-bench_pl`` rows, scales each
score by the share of questions that were parseable, and writes
``leaderboard.html`` (a DataTables page) next to them.
"""
import json
import re
# Only ``escape`` is imported: the name ``html`` is used below for the page text.
from html import escape

import pandas as pd

# Matches the parseable-question count embedded in the error text of failed runs.
_PARSEABLE_RE = re.compile(r'(\d+)\.0 questions were parseable')

# Load the CSV file.
# The file is split by hand rather than with ``csv`` because the last column
# (the error text) may itself contain commas; limiting the number of splits to
# the header width keeps that text in a single field.
with open("benchmark_results.csv", "r", encoding="utf-8") as f:
    header = [h.strip() for h in f.readline().strip().split(",")]
    max_splits = max(len(header) - 1, 0)
    leaderboard_df = [
        line.strip().split(",", max_splits)
        for line in f
        if line.strip()  # skip blank lines (e.g. a trailing newline)
    ]

# Load metadata (model name -> parameter count).
with open('metadata.json', encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)
# Also register every entry under the part of its key before the first comma,
# so lookups by bare model path succeed.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v

# Create DataFrame
leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)

# Filter and process DataFrame
leaderboard_df = leaderboard_df[
    leaderboard_df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])
]
# ``.copy()`` so the column assignments below modify an independent frame
# instead of a view of the filtered one (avoids SettingWithCopyWarning).
leaderboard_df = leaderboard_df[
    ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]
].copy()


def parse_parseable(x):
    """Return the number of parseable questions for one result row.

    Args:
        x: Mapping-like row with the keys ``"Num Questions Parseable"`` and
            ``"Error"``.

    Returns:
        The value of ``"Num Questions Parseable"`` unchanged, unless it is
        ``'FAILED'``; in that case the count is extracted from the error text
        (``"<n>.0 questions were parseable"``). If the error text is missing
        or does not start with that phrase, ``"0"`` is returned so a failed
        run with an unrelated error does not abort the whole report.
    """
    if x["Num Questions Parseable"] != 'FAILED':
        return x["Num Questions Parseable"]
    m = _PARSEABLE_RE.match(x["Error"] or "")
    return m.group(1) if m else "0"


NUMBER_OF_QUESTIONS = 171.0


def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator`` expressed as a percentage."""
    return (numerator / denominator) * 100


# Built from row records instead of ``DataFrame.apply(axis=1)`` so that an
# empty selection yields an empty column rather than an empty DataFrame.
parseable_counts = [
    parse_parseable(record)
    for record in leaderboard_df[["Num Questions Parseable", "Error"]].to_dict("records")
]
leaderboard_df["Num Questions Parseable"] = [
    fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
    for count in parseable_counts
]


def get_params(model_name):
    """Return the metadata entry for ``model_name``.

    Models without an entry are printed to stdout (so they can be added to
    ``metadata.json``) and ``None`` is returned.
    """
    if model_name in metadata:
        return metadata[model_name]
    print(model_name)
    return None


leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

# 'FAILED' (or any other non-numeric score) becomes NaN. ``to_numeric`` is
# used instead of ``replace('FAILED', None)`` because older pandas versions
# interpret ``value=None`` as "forward-fill", silently copying the previous
# row's score into the failed row.
leaderboard_df["Benchmark Score"] = pd.to_numeric(
    leaderboard_df["Benchmark Score"], errors="coerce"
)
# Penalise runs in proportion to the share of questions that were parseable.
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"] * (
    leaderboard_df["Num Questions Parseable"].astype(float) / 100
)
# Negative scores are floored at zero.
leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

leaderboard_df = leaderboard_df.sort_values(
    by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False]
)
leaderboard_df = leaderboard_df.rename(
    columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    }
)

# Generate HTML with DataTables
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Leaderboard</title>
    <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
    <style>
        body {
            font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
            margin: 0;
            padding: 20px;
            color: #333;
            background-color: #fff;
        }
        .numeric-cell {
            text-align: right;
            padding: 8px !important;
        }
    </style>
    <script>
        (function($) {
            $.fn.colorize = function(oOptions) {
                var settings = $.extend({
                    parse: function(e) {
                        return parseFloat(e.html());
                    },
                    min: undefined,
                    max: undefined,
                    readable: true,
                    themes: {
                        "default": {
                            color_min: "#C80000",
                            color_mid: "#FFFFFF",
                            color_max: "#10A54A"
                        }
                    },
                    theme: "default",
                    center: undefined,
                    percent: false
                }, oOptions);

                function getColor(color1, color2, ratio) {
                    var hex = function(x) {
                        x = x.toString(16);
                        return (x.length == 1) ? '0' + x : x;
                    }
                    color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
                    color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
                    var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
                    var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
                    var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
                    return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
                }

                function getContrastYIQ(hexcolor) {
                    var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
                    var r = parseInt(hex.substr(0,2),16);
                    var g = parseInt(hex.substr(2,2),16);
                    var b = parseInt(hex.substr(4,2),16);
                    var yiq = ((r*299)+(g*587)+(b*114))/1000;
                    return (yiq >= 128) ? 'black' : 'white';
                }

                var min = settings.min;
                var max = settings.max;
                if (min === undefined || max === undefined) {
                    min = Infinity;
                    max = -Infinity;
                    this.each(function() {
                        var value = parseFloat(settings.parse($(this)));
                        if (!isNaN(value) && isFinite(value)) {
                            min = Math.min(min, value);
                            max = Math.max(max, value);
                        }
                    });
                }

                var center = settings.center !== undefined ? settings.center : (max + min) / 2;
                var adj = Math.max(Math.abs(max - center), Math.abs(center - min));

                this.each(function() {
                    var value = parseFloat(settings.parse($(this)));
                    if (isNaN(value) || !isFinite(value)) return;
                    var ratio = (value - center) / adj;
                    var color1, color2;
                    if (value < center) {
                        ratio = Math.abs(ratio);
                        if (ratio > 1) ratio = 1;
                        color1 = settings.themes[settings.theme].color_min;
                        color2 = settings.themes[settings.theme].color_mid;
                    } else {
                        ratio = Math.abs(ratio);
                        if (ratio > 1) ratio = 1;
                        color1 = settings.themes[settings.theme].color_max;
                        color2 = settings.themes[settings.theme].color_mid;
                    }
                    var color = getColor(color1, color2, ratio);
                    $(this).css('background-color', color);
                    if (settings.readable) $(this).css('color', getContrastYIQ(color));
                });
                return this;
            };
        }(jQuery));

        $(document).ready(function() {
            // Add custom filtering function
            $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
                var searchValue = $('.dataTables_filter input').val();
                if (!searchValue) return true;
                // Split search terms by semicolon and trim whitespace
                var searchTerms = searchValue.split(';').map(term => term.trim().toLowerCase());
                var modelName = data[0].toLowerCase(); // Model name is in first column
                // Return true if ANY search terms are found in the model name (OR logic)
                return searchTerms.some(term => modelName.includes(term));
            });

            // Custom sorting function for benchmark scores
            $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
                var score = parseFloat(data);
                return isNaN(score) ? -Infinity : score;
            };

            // Get min/max values for each numeric column before initializing DataTables
            var columnRanges = {
                1: { min: Infinity, max: -Infinity }, // Params
                2: { min: Infinity, max: -Infinity }, // Benchmark Score
                3: { min: Infinity, max: -Infinity } // Percentage Questions Parseable
            };

            $('#leaderboard tbody td').each(function() {
                var columnIdx = $(this).index();
                if (columnIdx in columnRanges) {
                    var value = parseFloat($(this).text());
                    if (!isNaN(value) && isFinite(value)) {
                        columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
                        columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
                    }
                }
            });

            var table = $('#leaderboard').DataTable({
                "order": [[2, "desc"]], // Sort by Benchmark Score by default
                "pageLength": 20, // Show 20 results per page
                "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
                "columnDefs": [
                    {
                        "targets": [1],
                        "className": "numeric-cell"
                    },
                    {
                        "type": "score",
                        "targets": [2], // Apply custom sorting to Benchmark Score column
                        "className": "numeric-cell"
                    },
                    {
                        "targets": [3],
                        "className": "numeric-cell"
                    }
                ],
                "drawCallback": function() {
                    // Apply colorization with pre-calculated ranges
                    $("#leaderboard tbody td:nth-child(2)").colorize({
                        parse: function(e) {
                            return parseFloat($(e).text());
                        },
                        min: columnRanges[1].min,
                        max: columnRanges[1].max,
                        themes: {
                            "default": {
                                color_min: "#10A54A", // White for smaller models
                                color_mid: "#FFD700", // Gold/yellow for medium models
                                color_max: "#C80000" // Hot pink for larger models
                            }
                        }
                    });
                    $("#leaderboard tbody td:nth-child(3)").colorize({
                        parse: function(e) {
                            return parseFloat($(e).text());
                        },
                        min: columnRanges[2].min,
                        max: columnRanges[2].max,
                        themes: {
                            "default": {
                                color_min: "#C80000", // Red for lower scores
                                color_mid: "#FFD700", // Gold/yellow for medium scores
                                color_max: "#10A54A" // Green for higher scores
                            }
                        }
                    });
                    $("#leaderboard tbody td:nth-child(4)").colorize({
                        parse: function(e) {
                            return parseFloat($(e).text());
                        },
                        min: columnRanges[3].min,
                        max: columnRanges[3].max,
                        themes: {
                            "default": {
                                color_min: "#C80000", // Red for lower percentages
                                color_mid: "#FFD700", // Gold/yellow for medium percentages
                                color_max: "#10A54A" // Green for higher percentages
                            }
                        }
                    });
                },
                // Override the default search behavior
                "search": {
                    "smart": false
                },
                // Update search on input change
                "initComplete": function() {
                    var table = this.api();
                    $('.dataTables_filter input')
                        .off() // Remove default binding
                        .on('input', function() {
                            table.draw();
                        });
                }
            });
        });
    </script>
</head>
<body>
    <h1>Leaderboard</h1>
    <table id="leaderboard" class="display" style="width:100%">
        <thead>
            <tr>
                <th>Model</th>
                <th>Params</th>
                <th>Benchmark Score</th>
                <th>Percentage Questions Parseable</th>
                <th>Error</th>
            </tr>
        </thead>
        <tbody>
"""

# Add rows to the HTML table.
# Text cells come straight from the CSV / metadata, so they are HTML-escaped;
# the rows are collected and joined once instead of growing the string per row.
table_rows = [
    f"""
            <tr>
                <td>{escape(str(row['Model']))}</td>
                <td>{escape(str(row['Params']))}</td>
                <td>{row['Benchmark Score']:.2f}</td>
                <td>{row['Percentage Questions Parseable']:.2f}</td>
                <td>{escape(str(row['Error']))}</td>
            </tr>
"""
    for _, row in leaderboard_df.iterrows()
]
html += "".join(table_rows)

# Close the HTML tags
html += """
        </tbody>
    </table>
</body>
</html>
"""

# Save the HTML to a file; UTF-8 to match the page's <meta charset> declaration.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)

print("HTML leaderboard generated and saved as leaderboard.html")