"""Generate a Slack report summarizing changes between WhisperKit benchmark dataset versions."""

import json
import os
import re
from datetime import datetime
from typing import Tuple

import pandas as pd
from bs4 import BeautifulSoup


def format_datetime(dt_str: str) -> str:
    """
    Format a datetime string for display.

    :param dt_str: String representing a datetime in ISO format
    :return: Formatted datetime string
    """
    return dt_str.replace("T", " ").split("+")[0]


def read_json_line_by_line(file_path):
    """
    Read a JSON file line by line, parsing each line as a separate JSON object.

    :param file_path: Path to the JSON file
    :return: List of parsed JSON objects

    This function is useful for reading large JSON files that contain one JSON
    object per line. It handles JSON parsing errors gracefully, skipping invalid lines.
    """
    data = []
    with open(file_path, "r") as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                data.append(item)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in {file_path}: {line}")
    return data


def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]:
    """Calculate percentage change and return with appropriate emoji."""
    pct_change = new - old
    if abs(pct_change) < 1:
        emoji = "↔ī¸"
    elif pct_change > 0:
        emoji = "đŸŸĸ" if "wer" not in metric_name.lower() else "❌"
    else:
        emoji = "❌" if "wer" not in metric_name.lower() else "đŸŸĸ"
    return (pct_change, emoji)


def has_changes(config, prev_dict, curr_dict):
    """Check if any metrics have changed."""
    curr = curr_dict[config]
    prev = prev_dict[config]
    metrics = ["speed", "tokens_per_second", "average_wer", "qoi"]
    for key in metrics:
        if key in curr and key in prev:
            curr_val = curr[key]
            prev_val = prev[key]
            if abs(curr_val - prev_val) >= 1:  # 1% threshold
                return True
    return False


def format_metrics_table(config, prev_dict, curr_dict):
    """Format metrics into a table string."""
    curr = curr_dict[config]
    prev = prev_dict[config]
    metrics = [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]
    table = "```\nMetric Previous Current Change\n--------------------------------\n"
    for metric_name, key in metrics:
        if key in curr and key in prev:
            curr_val = curr[key]
            prev_val = prev[key]
            pct_change, _ = calculate_change(curr_val, prev_val, metric_name)
            if abs(pct_change) >= 1:  # Only show metrics with changes
                table += f"{metric_name:<9} {prev_val:<11.2f} {curr_val:<10.2f} {pct_change:.2f}\n"
    table += "```"
    return table


def extract_status_and_os(cell_value):
    """
    Extract status and OS versions from a cell, handling both HTML and plain text.

    Returns list of tuples: [(status, os_version), ...]
    """
    results = []
    cell_value = str(cell_value)

    # First, handle the case where there's no HTML tags
    if cell_value == "Not Supported":
        return results

    # Split the cell into parts (first element and subsequent <br> elements)
    parts = cell_value.split("<br>")
    for part in parts:
        part = part.strip("<br>")
        if not part:
            continue
        # Check if part contains warning symbol
        if "⚠ī¸" in part:
            # Parse HTML to extract OS version from anchor tag
            soup = BeautifulSoup(part, "html.parser")
            # Find text after href that contains OS version
            text = soup.get_text()
            os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", text)
            if os_match:
                os_version = os_match.group(0)
                results.append(("⚠ī¸", os_version))
        else:
            # For success cases, OS version is directly in the text
            os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", part)
            if os_match:
                os_version = os_match.group(0)
                results.append(("✅", os_version))
    return results


def escape_string(s: str) -> str:
    """Escape a string to be used as a value in JSON."""
    return (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


def analyze_support_changes(prev_csv, curr_csv):
    """Analyze support changes between CSV files."""
    # Read CSV files
    prev_df = pd.read_csv(prev_csv)
    prev_df.set_index(prev_df.columns[0], inplace=True)
    curr_df = pd.read_csv(curr_csv)
    curr_df.set_index(curr_df.columns[0], inplace=True)

    # Get device lists (excluding first column which is the index)
    prev_devices = sorted(prev_df.columns[1:])
    curr_devices = sorted(curr_df.columns[1:])

    # Calculate device ratio
    device_ratio = len(curr_devices) / len(prev_devices) if prev_devices else 1
    needs_alert = device_ratio < 0.9  # Alert if less than 90% of previous devices

    # Convert to dictionary for easier comparison
    prev_status = {}
    curr_status = {}

    # Process previous data
    for idx in range(len(prev_df)):
        model = prev_df.index[idx]
        for col_idx in range(1, len(prev_df.columns)):
            cell_value = prev_df.iloc[idx, col_idx]
            device = prev_df.columns[col_idx]
            statuses = extract_status_and_os(cell_value)
            for status, os_version in statuses:
                prev_status[(model, device, os_version)] = status

    # Process current data and track new configurations
    new_configs = []
    for idx in range(len(curr_df)):
        model = curr_df.index[idx]
        for col_idx in range(1, len(curr_df.columns)):
            cell_value = curr_df.iloc[idx, col_idx]
            device = curr_df.columns[col_idx]
            statuses = extract_status_and_os(cell_value)
            for status, os_version in statuses:
                curr_status[(model, device, os_version)] = status
                # Check if this is a new configuration
                if (model, device, os_version) not in prev_status:
                    new_configs.append((model, device, os_version))

    # Find changes
    fixed_errors = []
    new_errors = []

    # Check all configurations that exist in both datasets
    common_configs = set(prev_status.keys()) & set(curr_status.keys())
    for config in common_configs:
        model, device, os_version = config
        if prev_status[config] == "⚠ī¸" and curr_status[config] == "✅":
            fixed_errors.append((model, device, os_version))
        elif prev_status[config] == "✅" and curr_status[config] == "⚠ī¸":
            new_errors.append((model, device, os_version))

    return fixed_errors, new_errors, new_configs, needs_alert


def generate_report():
    # Load current and previous data
    prev_perf_data = read_json_line_by_line("report_data/performance_data.json")
    curr_perf_data = read_json_line_by_line("dashboard_data/performance_data.json")
    prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data}
    curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data}
    common_configs = set(curr_dict.keys()) & set(prev_dict.keys())

    # Load version data
    with open("report_data/version.json", "r") as f:
        prev_version = json.load(f)
    with open("dashboard_data/version.json", "r") as f:
        curr_version = json.load(f)

    prev_releases = set(prev_version.get("releases", []))
    curr_releases = set(curr_version.get("releases", []))
    new_releases = curr_releases - prev_releases
    removed_releases = prev_releases - curr_releases

    # Track metrics
    total_configs = len(common_configs)
    improved_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    regressed_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    new_data_points = len(set(curr_dict.keys()) - set(prev_dict.keys()))

    # Analyze support changes
    fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes(
        "report_data/support_data.csv", "dashboard_data/support_data.csv"
    )

    # Create Slack blocks
    current_time = datetime.now().strftime("%B %-d, %Y %H:%M:%S")
    prev_release_tag, curr_release_tag = (
        prev_version["whisperkit_version"],
        curr_version["whisperkit_version"],
    )
    slack_blocks = {
        "blocks": [
            {
                "type": "header",
                "text": {
                    "type": "plain_text",
                    "text": "🔔 WhisperKit Dataset Update Report 🔔",
                    "emoji": True,
                },
            },
            {
                "type": "context",
                "elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}],
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "ℹī¸ *CURRENT VERSION INFO* ℹī¸"},
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Last Modified:* `{format_datetime(curr_version['last_modified'])}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Dataset SHA:* `{curr_version['sha']}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Current Releases:* {', '.join(f'`{r}`' for r in curr_version['releases'])}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Current Release Tag:* `{curr_release_tag}`",
                },
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "🔄 *SUMMARY OF PERFORMANCE UPDATES* 🔄",
                },
            },
        ]
    }

    # Add release information
    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Added Releases:* {', '.join(sorted(new_releases)) if new_releases else 'None'}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Removed Releases:* {', '.join(sorted(removed_releases)) if removed_releases else 'None'}",
                },
            },
        ]
    )

    if prev_release_tag != curr_release_tag:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *Release Tag Change:* `{prev_release_tag}` → `{curr_release_tag}`",
                },
            }
        )

    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *New Data Points:* `{new_data_points}` new configurations",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
        ]
    )

    # Add metrics summary
    for metric_name, key in [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"â€ĸ *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed",
                },
            }
        )

    # Add support changes section
    if fixed_errors or new_errors or new_configs:
        slack_blocks["blocks"].extend(
            [
                {"type": "divider"},
                {
                    "type": "section",
                    "text": {"type": "mrkdwn", "text": "📱 *DEVICE SUPPORT CHANGES* 📱"},
                },
            ]
        )

    if fixed_errors:
        slack_blocks["blocks"].extend(
            [
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        "text": "*Successful Configurations That Override Previous Failures*",
                    },
                }
            ]
        )
        for model, device, os_version in sorted(fixed_errors):
            slack_blocks["blocks"].append(
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
"text": f"â€ĸ {model} on {device} ({os_version})", }, } ) if new_errors: slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "*Failed Configurations That Override Previous Successes*", }, } ] ) for model, device, os_version in sorted(new_errors): slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"â€ĸ {model} on {device} ({os_version})", }, } ) if new_configs: slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "*Newly Tested Configurations*", }, } ] ) for model, device, os_version in sorted(new_configs): slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"â€ĸ {model} on {device} ({os_version})", }, } ) # Add alert if significant decrease in device count if needs_alert: slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": "⚠ī¸ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!", }, } ) # Create performance text as a single mrkdwn string if common_configs: performance_text = "💡 *Performance Updates* 💡\n\n" # Group by model for better organization models = sorted(set(model for model, _, _ in common_configs)) for model in models: model_configs = sorted([cfg for cfg in common_configs if cfg[0] == model]) for config in model_configs: device_info = f"*{model}* ({config[2]})" if not has_changes(config, prev_dict, curr_dict): # If no changes, just add the model with a checkmark performance_text += f"{device_info} ✅\n\n" else: # If there are changes, show the metrics performance_text += f"{device_info}\n" performance_text += format_metrics_table( config, prev_dict, curr_dict ) performance_text += "\n\n" # Write to GITHUB_OUTPUT github_output = os.getenv("GITHUB_OUTPUT") if github_output: with open(github_output, "a") as f: f.write("slack_message_payload<