# Evaluation Card Template
#
# Blank template: every leaf is an empty placeholder ("" for text, [] for
# lists, {} for mappings) to be filled in for a concrete evaluation.
#
# NOTE(review): this file was restored from a whitespace-collapsed copy.
# Key names, values and order are unchanged; the nesting was reconstructed
# from key names and should be confirmed against the consuming schema
# (in particular `design_process`, `metric_details` and
# `results_communication`).
---
title: "[Evaluation Name]"
summary: >
  Brief description of the evaluation approach, its purpose, and scope.

metadata:
  authors: []
  maintainers: []
  creation_date: ""
  last_review_date: ""
  next_review_date: ""
  version_compatibility: []
  repository_link: ""  # Link to the code repository
  paper_link: ""  # Link to the research paper

evaluation_design:
  motivation:
    scientific_needs: ""
    approach_justification: ""
    expected_benefits: ""
    tradeoffs: ""
  type_and_structure:
    # benchmark, challenge, red teaming, deployment study, structured test
    type: ""
    structure: ""
    timeline: ""
    key_design_decisions: []
  design_process:
    stakeholder_consultation: ""
    pilot_studies: []
    validation_approaches: []
  stakeholders_and_resources:
    target_users: []
    required_expertise: []
    resource_requirements: []
    cost_considerations: ""

estimand:
  target_construct:
    primary_capability: ""
    measurement_type: ""  # representational or pragmatic
    relationship_to_applications: ""
    theoretical_framework: ""
  scope_and_limitations:
    coverage: ""
    excluded_capabilities: []
    known_blind_spots: []
    theoretical_limitations: []
  assessment_components:
    test_set:
      data_sources: []
      sampling_methodology: ""
      known_biases: []
      approach_to_duplicates: ""
      data_quality: ""
    challenge:
      design_principles: []
      task_selection_criteria: []
      difficulty_progression: ""
      time_constraints: ""
    red_teaming:
      probing_methodology: ""
      coverage_strategy: ""
      adversarial_approach: ""
      safety_considerations: ""
    deployment_study:
      environment_characteristics: ""
      integration_points: []
      success_criteria: []
      monitoring_approach: ""

estimator:
  evaluation_protocol:
    methodology: ""
    control_measures: []
    handling_random_components: ""
    reproducibility_requirements: ""
  metrics:
    primary_metrics: []
    aggregation_methodology: ""
    task_weightings: {}
    performance_bounds: {}
    connection_to_outcomes: ""
    # One list entry per metric; copy this entry for each additional metric.
    metric_details:
      - name: ""
        definition: ""
        implementation: ""
        edge_cases: []
        statistical_properties: ""
        baseline_values: {}
        failure_modes: []
  technical_framework:
    implementation_requirements: []
    time_constraints: ""
    dependencies: []
    authentication_needs: ""
  constraints_and_rules:
    allowed_resources: []
    permitted_approaches: []
    optimization_constraints: []
    ethical_boundaries: []

estimate:
  required_reporting:
    essential_metrics: []
    results_disaggregation: ""
    uncertainty_quantification: ""
    performance_variation: ""
    resource_usage_reporting: ""
  reproducibility_information:
    documentation_requirements: []
    environment_specifications: ""
    randomization_handling: ""
    output_standardization: ""

results_communication:
  visualization:
    recommended_plots: []
    standardized_formats: []
    key_comparisons: []
  leaderboard_guidelines:
    submission_process: ""
    required_metadata: []

known_issues_and_limitations:
  validity_concerns:
    construct_validity: ""
    gaming_possibilities: ""
    stability_considerations: ""
    temporal_validity: ""
  practical_limitations:
    resource_constraints: ""
    scalability_issues: ""
    cost_factors: ""
    time_boundaries: ""
  bias_and_fairness:
    known_biases: []
    representation_issues: ""
    potential_impacts: ""
    mitigation_approaches: []

version_and_maintenance:
  version_information:
    version: ""
    release_date: ""
    change_history: []
    update_plans: ""
  maintenance_protocol:
    update_frequency: ""
    deprecation_policy: ""
    issue_reporting: ""
    community_involvement: ""
    criteria_for_updates: []
    breaking_change_policy: ""
    backwards_compatibility: ""
    migration_guides: ""

citation_and_usage:
  citation_information:
    recommended_citation: ""
    related_publications: []
    licensing_details: ""
  usage_guidelines:
    recommended_applications: []
    inappropriate_uses: []
    implementation_best_practices: ""
    ethical_considerations: ""

additional_notes:
  related_evaluations: []
  future_directions: ""