n0w0f committed on
Commit
32d70d7
·
1 Parent(s): a0639a7

feat: init submission, feedback and display

README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Eval Cards Gallery
3
- emoji: πŸƒ
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Registry to collect eval-cards on benchmarking efforts
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Eval Cards
3
+ emoji: πŸ†
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Registry of eval-cards from different benchmarks
12
  ---
13
 
14
+ For more details, refer to https://github.com/lamalab-org/eval-cards
app.py ADDED
@@ -0,0 +1,433 @@
1
+ import datetime
2
+ import os
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import requests
9
+ import yaml
10
+
11
+ # Constants
12
+ EVAL_CARDS_DIR = "eval_cards"
13
+ TEMPLATE_PATH = "template.yaml"
14
+ DEFAULT_MODEL = "anthropic/claude-3-haiku-20240307"  # Currently unused; LLM feedback below calls Groq's API directly
15
+
16
+ # Ensure the eval cards directory exists
17
+ os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
18
+
19
+ # Copy the template to the appropriate location
20
+ with open("template.yaml", "w") as f:
21
+ with open("yaml_template.yaml", "r") as template_file:
22
+ f.write(template_file.read())
23
+
24
+ def load_template():
25
+ """Load the YAML template"""
26
+ with open(TEMPLATE_PATH, "r") as file:
27
+ return file.read()
28
+
29
+ def yaml_to_dict(yaml_str):
30
+ """Convert YAML string to Python dictionary"""
31
+ try:
32
+ return yaml.safe_load(yaml_str)
33
+ except yaml.YAMLError as e:
34
+ return {"error": str(e)}
35
+
36
+ def compute_coverage_score(eval_data):
37
+ """
38
+ Compute a coverage score for the eval card
39
+ Returns a score from 0-100 and a breakdown of coverage by section
40
+ """
41
+ sections = {
42
+ "metadata": 5,
43
+ "evaluation_design": 15,
44
+ "estimand": 20,
45
+ "estimator": 25,
46
+ "estimate": 10,
47
+ "results_communication": 10,
48
+ "known_issues_and_limitations": 10,
49
+ "version_and_maintenance": 5,
50
+ "citation_and_usage": 5,
51
+ }
52
+
53
+ scores = {}
54
+ total_score = 0
55
+
56
+ def count_filled_fields(data, prefix=""):
57
+ if isinstance(data, dict):
58
+ filled = 0
59
+ total = 0
60
+ for key, value in data.items():
61
+ if isinstance(value, (dict, list)):
62
+ sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
63
+ filled += sub_filled
64
+ total += sub_total
65
+ else:
66
+ total += 1
67
+ if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
68
+ filled += 1
69
+ return filled, total
70
+ elif isinstance(data, list):
71
+ if not data:
72
+ return 0, 1
73
+ filled = 0
74
+ total = 0
75
+ for item in data:
76
+ sub_filled, sub_total = count_filled_fields(item)
77
+ filled += sub_filled
78
+ total += sub_total
79
+ return filled, total
80
+ else:
81
+ return 1 if data else 0, 1
82
+
83
+ # Compute scores for each section
84
+ for section, weight in sections.items():
85
+ if section in eval_data:
86
+ filled, total = count_filled_fields(eval_data[section])
87
+ completion_rate = filled / total if total > 0 else 0
88
+ scores[section] = {
89
+ "score": round(completion_rate * weight, 2),
90
+ "max_score": weight,
91
+ "completion_rate": round(completion_rate * 100, 2),
92
+ "fields_filled": filled,
93
+ "fields_total": total
94
+ }
95
+ total_score += scores[section]["score"]
96
+ else:
97
+ scores[section] = {
98
+ "score": 0,
99
+ "max_score": weight,
100
+ "completion_rate": 0,
101
+ "fields_filled": 0,
102
+ "fields_total": 0
103
+ }
104
+
105
+ return round(total_score, 2), scores
106
+
107
+ def get_llm_feedback(yaml_content, api_token=None):
108
+ """
109
+ Get feedback on the eval card from Groq's LLM
110
+ """
111
+ if not api_token:
112
+ return "API token is required for LLM feedback."
113
+
114
+ try:
115
+ headers = {
116
+ "Content-Type": "application/json",
117
+ "Authorization": f"Bearer {api_token}"
118
+ }
119
+
120
+ prompt = f"""
121
+ I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
122
+ consistency, and clarity. Provide specific recommendations for improvement.
123
+
124
+ Focus on:
125
+ 1. Sections that need more detail
126
+ 2. Inconsistencies or contradictions
127
+ 3. Clarity of language and explanations
128
+ 4. Alignment with best practices for ML evaluation
129
+
130
+ Here's the YAML content:
131
+
132
+ ```yaml
133
+ {yaml_content}
134
+ ```
135
+
136
+ Provide your feedback in a structured format with specific, actionable recommendations.
137
+ """
138
+
139
+ payload = {
140
+ "model": "llama-3.3-70b-versatile", # or another groq supported model
141
+ "messages": [
142
+ {"role": "user", "content": prompt}
143
+ ]
144
+ }
145
+
146
+ response = requests.post(
147
+ "https://api.groq.com/openai/v1/chat/completions",
148
+ headers=headers,
149
+ json=payload
150
+ )
151
+
152
+ if response.status_code == 200:
153
+ return response.json()["choices"][0]["message"]["content"]
154
+ else:
155
+ return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
156
+
157
+ except Exception as e:
158
+ return f"Error getting Groq LLM feedback: {str(e)}"
159
+
160
+
161
+ def save_eval_card(yaml_content, filename=None):
162
+ """Save an eval card to the repository"""
163
+ try:
164
+ # Parse YAML to validate it
165
+ eval_data = yaml.safe_load(yaml_content)
166
+
167
+ # Generate filename if not provided
168
+ if not filename:
169
+ eval_name = eval_data.get("title", "Unnamed Evaluation")
170
+ # Clean filename
171
+ filename = re.sub(r'[^\w\-_]', '_', eval_name)
172
+ filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
173
+
174
+ # Save file
175
+ file_path = os.path.join(EVAL_CARDS_DIR, filename)
176
+ with open(file_path, "w") as file:
177
+ file.write(yaml_content)
178
+
179
+ return True, file_path
180
+ except Exception as e:
181
+ return False, str(e)
182
+
183
+ def load_all_eval_cards():
184
+ """Load all eval cards from the repository"""
185
+ eval_cards = []
186
+
187
+ for filename in os.listdir(EVAL_CARDS_DIR):
188
+ if filename.endswith(".yaml"):
189
+ file_path = os.path.join(EVAL_CARDS_DIR, filename)
190
+ try:
191
+ with open(file_path, "r") as file:
192
+ yaml_content = file.read()
193
+ eval_data = yaml.safe_load(yaml_content)
194
+
195
+ # Compute coverage score
196
+ score, score_details = compute_coverage_score(eval_data)
197
+
198
+ # Extract key metadata
199
+ eval_cards.append({
200
+ "filename": filename,
201
+ "title": eval_data.get("title", "Unnamed Evaluation"),
202
+ "summary": eval_data.get("summary", ""),
203
+ "authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
204
+ "creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
205
+ "coverage_score": score,
206
+ "score_details": score_details,
207
+ "yaml_content": yaml_content,
208
+ "data": eval_data
209
+ })
210
+ except Exception as e:
211
+ print(f"Error loading {filename}: {str(e)}")
212
+
213
+ return eval_cards
214
+
215
+ def format_eval_card_as_html(eval_card):
216
+ """Format an eval card as HTML for display"""
217
+ html = f"""
218
+ <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
219
+ <h3>{eval_card['title']}</h3>
220
+ <p>{eval_card['summary']}</p>
221
+ <p><strong>Authors:</strong> {eval_card['authors']}</p>
222
+ <p><strong>Created:</strong> {eval_card['creation_date']}</p>
223
+ <p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
224
+
225
+ <h4>Coverage by Section:</h4>
226
+ <table style="width: 100%; border-collapse: collapse;">
227
+ <tr>
228
+ <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
229
+ <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
230
+ <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
231
+ </tr>
232
+ """
233
+
234
+ for section, details in eval_card['score_details'].items():
235
+ html += f"""
236
+ <tr>
237
+ <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
238
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
239
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
240
+ </tr>
241
+ """
242
+
243
+ html += f"""
244
+ </table>
245
+ <div style="margin-top: 10px;">
246
+ <a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
247
+ </div>
248
+ </div>
249
+ """
250
+
251
+ return html
252
+
253
+ def create_eval_cards_table(eval_cards):
254
+ """Create an HTML table of eval cards"""
255
+ if not eval_cards:
256
+ return "<p>No evaluation cards found.</p>"
257
+
258
+ # Sort by coverage score (highest first)
259
+ eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)
260
+
261
+ html = ""
262
+ for eval_card in eval_cards:
263
+ html += format_eval_card_as_html(eval_card)
264
+
265
+ return html
266
+
267
+ def upload_file(file):
268
+ """Process an uploaded YAML file"""
269
+ if file is None:
270
+ return "No file uploaded", None
271
+
272
+ try:
273
+ # gr.File passes a temp-file path by default; also handle raw bytes for safety
+ yaml_content = file.decode("utf-8") if isinstance(file, (bytes, bytearray)) else Path(file).read_text()
274
+ # Validate YAML
275
+ eval_data = yaml.safe_load(yaml_content)
276
+ return yaml_content, eval_data
277
+ except Exception as e:
278
+ return f"Error processing file: {str(e)}", None
279
+
280
+ def get_feedback(yaml_content, api_token):
281
+ """Get LLM feedback on the eval card"""
282
+ if not yaml_content:
283
+ return "Please upload or paste a YAML file first."
284
+
285
+ if not api_token:
286
+ return "Please provide an API token for the LLM service."
287
+
288
+ feedback = get_llm_feedback(yaml_content, api_token)
289
+ return feedback
290
+
291
+ def submit_eval_card(yaml_content):
292
+ """Submit an eval card to the repository"""
293
+ if not yaml_content:
294
+ return "Please upload or paste a YAML file first.", None, None
295
+
296
+ try:
297
+ # Validate YAML
298
+ eval_data = yaml.safe_load(yaml_content)
299
+
300
+ # Compute coverage score
301
+ score, score_details = compute_coverage_score(eval_data)
302
+
303
+ # Save eval card
304
+ success, file_path = save_eval_card(yaml_content)
305
+
306
+ if success:
307
+ return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
308
+ else:
309
+ return f"Error saving evaluation card: {file_path}", None, None
310
+
311
+ except Exception as e:
312
+ return f"Error processing evaluation card: {str(e)}", None, None
313
+
314
+ def refresh_gallery():
315
+ """Refresh the gallery of eval cards"""
316
+ eval_cards = load_all_eval_cards()
317
+ html = create_eval_cards_table(eval_cards)
318
+
319
+ # Convert data to pandas DataFrame for table view
320
+ table_data = []
321
+ for card in eval_cards:
322
+ table_data.append({
323
+ "Title": card["title"],
324
+ "Authors": card["authors"],
325
+ "Creation Date": card["creation_date"],
326
+ "Coverage Score": f"{card['coverage_score']}%"
327
+ })
328
+
329
+ df = pd.DataFrame(table_data)
330
+
331
+ return html, df if not df.empty else None
332
+
333
+ def handle_upload_tab(file_obj, yaml_text):
334
+ """Handle upload tab actions - either use uploaded file or pasted text"""
335
+ if file_obj is not None:
336
+ yaml_content, eval_data = upload_file(file_obj)
337
+ return yaml_content
338
+ else:
339
+ return yaml_text
340
+
341
+ # Create the Gradio interface
342
+ with gr.Blocks(title="Evaluation Card Repository") as app:
343
+ with gr.Row():
344
+ with gr.Column(scale=2):
345
+ gr.Markdown("# Evaluation Card Repository")
346
+ gr.Markdown("""
347
+ This application allows you to upload, validate, and explore ML evaluation cards.
348
+
349
+ Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
350
+ """)
351
+
352
+ with gr.Tabs():
353
+ with gr.TabItem("Upload & Review"):
354
+ with gr.Row():
355
+ with gr.Column():
356
+ file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
357
+
358
+ with gr.Accordion("Or paste YAML content", open=False):
359
+ yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
360
+
361
+ load_template_btn = gr.Button("Load Template")
362
+
363
+ api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
364
+
365
+ with gr.Row():
366
+ get_feedback_btn = gr.Button("Get LLM Feedback")
367
+ submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
368
+
369
+ with gr.Column():
370
+ yaml_display = gr.TextArea(label="Current YAML", lines=20)
371
+
372
+ with gr.Accordion("LLM Feedback", open=True):
373
+ feedback_display = gr.Markdown()
374
+
375
+ with gr.Accordion("Submission Result", open=True):
376
+ result_display = gr.Markdown()
377
+ coverage_score = gr.Number(label="Coverage Score", visible=False)
378
+ coverage_details = gr.JSON(label="Coverage Details", visible=False)
379
+
380
+ with gr.TabItem("Gallery"):
381
+ refresh_btn = gr.Button("Refresh Gallery")
382
+
383
+ with gr.Tabs():
384
+ with gr.TabItem("Card View"):
385
+ gallery_html = gr.HTML()
386
+
387
+ with gr.TabItem("Table View"):
388
+ gallery_table = gr.DataFrame()
389
+
390
+ # Set up event handlers
391
+ load_template_btn.click(
392
+ fn=load_template,
393
+ outputs=[yaml_display]
394
+ )
395
+
396
+ file_upload.change(
397
+ fn=handle_upload_tab,
398
+ inputs=[file_upload, yaml_input],
399
+ outputs=[yaml_display]
400
+ )
401
+
402
+ yaml_input.change(
403
+ fn=lambda x: x,
404
+ inputs=[yaml_input],
405
+ outputs=[yaml_display]
406
+ )
407
+
408
+ get_feedback_btn.click(
409
+ fn=get_feedback,
410
+ inputs=[yaml_display, api_token],
411
+ outputs=[feedback_display]
412
+ )
413
+
414
+ submit_btn.click(
415
+ fn=submit_eval_card,
416
+ inputs=[yaml_display],
417
+ outputs=[result_display, coverage_score, coverage_details]
418
+ )
419
+
420
+ refresh_btn.click(
421
+ fn=refresh_gallery,
422
+ outputs=[gallery_html, gallery_table]
423
+ )
424
+
425
+ # Initialize the gallery on app start
426
+ app.load(
427
+ fn=refresh_gallery,
428
+ outputs=[gallery_html, gallery_table]
429
+ )
430
+
431
+ # Launch the app
432
+ if __name__ == "__main__":
433
+ app.launch()
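
To make the weighting behind `compute_coverage_score` concrete: each section's completion rate is multiplied by its weight and the products are summed. The sketch below mirrors that scheme; the `filled_fraction` helper and the toy card are illustrative simplifications, not code from this commit.

```python
# Sketch of the weighting scheme used by compute_coverage_score in app.py.
# The weights mirror three of the section weights defined above; the toy card
# and the simplified filled_fraction helper are hypothetical, for illustration only.
weights = {"metadata": 5, "estimand": 20, "estimator": 25}

toy_card = {
    "metadata": {"authors": ["A. Author"], "creation_date": ""},      # 1 of 2 fields filled
    "estimand": {"target_construct": {"primary_capability": "QA"}},   # fully filled
    # "estimator" is missing entirely -> contributes 0 of its 25 points
}

def filled_fraction(node):
    """Fraction of leaf fields that are non-empty (simplified stand-in for count_filled_fields)."""
    if isinstance(node, dict):
        children = list(node.values())
    elif isinstance(node, list):
        children = node
    else:
        return 1.0 if node else 0.0
    if not children:
        return 0.0
    return sum(filled_fraction(child) for child in children) / len(children)

score = sum(filled_fraction(toy_card.get(section, {})) * weight
            for section, weight in weights.items())
print(round(score, 2))  # 0.5 * 5 + 1.0 * 20 + 0.0 * 25 = 22.5
```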
eval_cards/ChemBench_20250312_170522.yaml ADDED
@@ -0,0 +1,411 @@
1
+ title: "ChemBench"
2
+
3
+ summary: >
4
+ ChemBench was developed as a comprehensive benchmarking suite for the performance of LLMs in chemistry.
5
+ It features a curated set of more than 2,700 question-answer pairs, classified by whether they probe the knowledge, intuition,
6
+ and reasoning abilities of LLMs. ChemBench goes beyond simple MCQ evaluation, supports floating point
7
+ answers (including scientific notation), and prompts models in a way that closely matches how they were trained.
8
+
9
+ metadata:
10
+ authors:
11
+ - Adrian Mirza
12
+ - Nawaf Alampara
13
+ - Sreekanth Kunchapu
14
+ - MartiΓ±o RΓ­os-GarcΓ­a
15
+ - Benedict Emoekabu
16
+ - Aswanth Krishnan
17
+ - Tanya Gupta
18
+ - Mara Schilling-Wilhelmi
19
+ - Macjonathan Okereke
20
+ - Anagha Aneesh
21
+ - Mehrdad Asgari
22
+ - Juliane Eberhardt
23
+ - Amir Mohammad Elahi
24
+ - Hani M. Elbeheiry
25
+ - MarΓ­a Victoria Gil
26
+ - Christina Glaubitz
27
+ - Maximilian Greiner
28
+ - Caroline T. Holick
29
+ - Tim Hoffmann
30
+ - Abdelrahman Ibrahim
31
+ - Lea C. Klepsch
32
+ - Yannik KΓΆster
33
+ - Fabian Alexander Kreth
34
+ - Jakob Meyer
35
+ - Santiago Miret
36
+ - Jan Matthias Peschel
37
+ - Michael Ringleb
38
+ - Nicole Roesner
39
+ - Johanna Schreiber
40
+ - Ulrich S. Schubert
41
+ - Leanne M. Stafast
42
+ - Dinga Wonanke
43
+ - Michael Pieler
44
+ - Philippe Schwaller
45
+ - Kevin Maik Jablonka
46
+ maintainers:
47
+ - Adrian Mirza
48
+ - Nawaf Alampara
49
+ - MartiΓ±o RΓ­os-GarcΓ­a
50
+ - Kevin Maik Jablonka
51
+ creation_date: "2023-05-15"
52
+ last_review_date: "2024-11-01"
53
+ next_review_date: "YTBD"
54
+ version_compatibility:
55
+ - "v0.3.0"
56
+
57
+ evaluation_design:
58
+ motivation:
59
+ scientific_needs: >
60
+ ChemBench is one of the pioneering benchmarks for evaluating the performance of LLMs specifically in chemistry.
61
+ Prior selection of LLMs on chemistry tasks has been based on their performance on general benchmarks like Big Bench.
62
+ approach_justification: >
63
+ ChemBench comprehensively evaluates almost all the leading models on a wide range of chemistry topics,
64
+ allowing topic-specific leaders identification. It also probes safety knowledge of LLMs and evaluates
65
+ measures of alignment with human intuitions.
66
+ expected_benefits: >
67
+ Provides comparison metrics for LLM training on chemistry-specific tasks and evaluates performance
68
+ across different chemistry topics.
69
+ tradeoffs: >
70
+ Current LLMs lack human intuitions. ChemBench currently does not support evaluation of open-ended chemistry tasks.
71
+
72
+ type_and_structure:
73
+ type: "Benchmark"
74
+ structure: >
75
+ End-to-end automation, careful validation by experts, and usability with black box systems.
76
+ The benchmark covers a diverse set of topics and skills (reasoning, calculation, knowledge, and intuition)
77
+ across a range of difficulty levels.
78
+ timeline: ""
79
+ key_design_decisions:
80
+ - Benchmark approach for scalability and easier accessibility
81
+ - End-to-end automation for frequent model evaluation
82
+ - Careful validation by experts to minimize incorrect or unanswerable questions
83
+ - Support for models with special treatment of molecules
84
+ - Usability with black box systems without access to weights or logits
85
+ - Probing capabilities beyond MCQs to reflect real-world chemistry
86
+ - Coverage of diverse topics and skills
87
+ - Range of difficulty levels to measure improvement
88
+ - Impossible to completely solve with current models
89
+ design_process:
90
+ stakeholder_consultation: "ChemBench is internally used by some of the leading AI labs"
91
+ pilot_studies:
92
+ - "LLM ChemBench results were compared against humans using a subset of ChemBench"
93
+ validation_approaches:
94
+ - "Codebase tested with unit tests covering parsing modules, metrics modules, and extraction modules"
95
+ - "Questions verified manually by experts through GitHub pull requests"
96
+ - "Automated checks via GitHub Actions for schemas, LATEX templating, and formatting"
97
+ - "Leaderboard verification of complete corpus evaluation"
98
+
99
+ stakeholders_and_resources:
100
+ target_users:
101
+ - "General audience developing or evaluating ML models"
102
+ - "Researchers developing chemistry datasets"
103
+ required_expertise:
104
+ - "Basic knowledge of using benchmarks (simple how-to guide provided)"
105
+ resource_requirements:
106
+ - "API keys for closed-source models"
107
+ - "GPUs for fast local benchmarking (CPU also possible but slower)"
108
+ cost_considerations: "Nil"
109
+
110
+ estimand:
111
+ target_construct:
112
+ primary_capability: "Capabilities of models to answer chemistry questions"
113
+ measurement_type: "Pragmatic"
114
+ relationship_to_applications: >
115
+ ChemBench score can be considered a comparative metric to measure gains in LLM training.
116
+ Shows positive correlation to performance on tasks like data extraction.
117
+ theoretical_framework: >
118
+ Assumes the corpus is not being used for training during model development.
119
+ Findings on capabilities are based on performance in answering questions that rely on
120
+ reasoning, calculation, knowledge, and intuition for humans to solve.
121
+
122
+ scope_and_limitations:
123
+ coverage: >
124
+ Over 2,700 question-answer pairs classified to probe knowledge, intuition, and reasoning.
125
+ Covers subjects within Chemistry taught at undergraduate and postgraduate level courses.
126
+ excluded_capabilities:
127
+ - "Property prediction capabilities"
128
+ - "Data extraction capabilities"
129
+ - "Embedding meaningfulness"
130
+ - "Agentic capabilities"
131
+ known_blind_spots:
132
+ - "Questions considered answered correctly only if final answer is correct"
133
+ - "Partial scoring and open-ended evaluation not covered"
134
+ theoretical_limitations:
135
+ - "Questions treated with equal weights, no clear approach for weighing tasks"
136
+ - "Reliability and correlation between log probabilities and model responses not known"
137
+
138
+ assessment_components:
139
+ test_set:
140
+ data_sources:
141
+ - "Curated questions from existing exams or exercise sheets"
142
+ - "Programmatically created questions"
143
+ sampling_methodology: "Each model evaluated on all questions"
144
+ known_biases:
145
+ - "Questions mainly curated from the background of the developers"
146
+ approach_to_duplicates: >
147
+ Each question-answer pair hashed to create unique IDs, filtering to keep unique questions based on UUIDs.
148
+ data_quality: >
149
+ Guidelines followed by reviewers: originality, clarity, factual correctness, and avoiding ambiguity.
150
+
151
+ estimator:
152
+ evaluation_protocol:
153
+ methodology: >
154
+ Distinct prompt templates for completion and instruction-tuned models. Multistep parsing workflow
155
+ based on regular expressions with LLM extraction as fallback. Comprehensive refusal detection combining
156
+ regular expression-based detection and a fine-tuned BERT model.
157
+ control_measures:
158
+ - "Model-specific prompt templates"
159
+ - "Consistent parsing workflow"
160
+ - "Refusal detection and retry mechanism"
161
+ handling_random_components: "Refusal detection and retry mechanism for up to n times"
162
+ reproducibility_requirements: >
163
+ Storage of model timestamp, time, and version of the dataset used for benchmarking.
164
+
165
+ metrics:
166
+ primary_metrics:
167
+ - "Fraction of correctly answered questions"
168
+ aggregation_methodology: "Final score is mean of scores across all questions from all topics"
169
+ task_weightings:
170
+ approach: "All questions treated equally to avoid ambiguity"
171
+ note: "Questions classified into three difficulty levels manually by experts for further analysis"
172
+ performance_bounds:
173
+ scoring: "No partial scoring - all questions measured as correct/incorrect"
174
+ connection_to_outcomes: "Scores reflect how well the model is trained on chemistry"
175
+
176
+ metric_details:
177
+ - name: "Fraction Correct"
178
+ definition: >
179
+ Proportion of correct answers out of total questions. For MCQs, uses Hamming Loss;
180
+ for numerics, uses Mean Absolute Error with 1% threshold.
181
+ implementation: >
182
+ (1/n) * (sum(1-HammingLoss_i for i in MCQ) + sum(indicator(MAE_j < 0.01*|Target_j|) for j in Numeric))
183
+ edge_cases:
184
+ - "Perfect score: 1 when all questions answered correctly"
185
+ - "Complete failure: 0 when all questions answered incorrectly"
186
+ statistical_properties:
187
+ - "Simplicity: Easy to calculate and interpret"
188
+ - "Range: Always bounded between [0, 1]"
189
+ - "Binary nature: Each question contributes either 0 or 1"
190
+ failure_modes:
191
+ - "Masking: High overall accuracy can hide poor performance on specific question types"
192
+ - "Insensitivity to confidence: Doesn't account for prediction confidence"
193
+ - "Equal weighting: Assigns equal importance regardless of difficulty"
194
+ - "Heterogeneous data: Combining different question types with different evaluation criteria"
195
+ - "Threshold sensitivity: Results highly dependent on chosen thresholds"
196
+ - "Near-zero targets: For small target values, 1% threshold becomes extremely stringent"
197
+
198
+ - name: "Hamming Loss"
199
+ definition: >
200
+ Measures fraction of labels incorrectly predicted for MCQs.
201
+ (1/L) * sum(indicator(y_i,l != y_hat_i,l) for l in 1 to L)
202
+ implementation: "For single-answer MCQ, 0 if answer correct, 1 if incorrect"
203
+ statistical_properties:
204
+ - "Linearity: Scales linearly with misclassifications"
205
+ - "Range: Always bounded between [0, 1]"
206
+ - "Symmetry: Treats false positives and negatives equally"
207
+ failure_modes:
208
+ - "Equal weighting: Assigns equal importance regardless of difficulty"
209
+ - "Lack of severity grading: All errors weighted equally"
210
+ - "Multi-label complexity: May not capture label dependencies"
211
+ - "Simplistic for complex MCQs: Doesn't account for partial correctness"
212
+
213
+ technical_framework:
214
+ implementation_requirements:
215
+ - "Installing ChemBench package"
216
+ - "API keys for closed-source models"
217
+ - "GPUs for fast benchmarking (CPU also possible)"
218
+ time_constraints: "Complete benchmarking requires around 2 hours"
219
+ dependencies:
220
+ - "tenacity==8.3.0"
221
+ - "langchain>=0.1.5"
222
+ - "fastcore>=1.5.29"
223
+ - "scikit-learn>=1.4.0"
224
+ - "loguru>=0.7.2"
225
+ - "litellm>=1.59.1"
226
+ - "backoff>=2.2.1"
227
+ - "tqdm>=4.66.1"
228
+ - "pint>=0.23"
229
+ - "pandas>=2.2.0"
230
+ - "python-dotenv>=1.0.1"
231
+ - "fire>=0.5.0"
232
+ - "datasets"
233
+ - "torch"
234
+ - "transformers"
235
+ - "langchain-community>=0.0.17"
236
+ - "pillow"
237
+
238
+ constraints_and_rules:
239
+ allowed_resources:
240
+ - "Models not trained on the ChemBench corpus (not tested)"
241
+ permitted_approaches:
242
+ - "Tools or other agentic setups"
243
+ - "No constraints on model parameters or computational constraints"
244
+ - "No constraints on temperature or decoding strategies"
245
+ - "No constraints on architecture or post-training approaches"
246
+ optimization_constraints:
247
+ - "Prompts not optimized unless part of modeling"
248
+ ethical_boundaries:
249
+ - "Models not trained on the ChemBench corpus (not tested)"
250
+
251
+ estimate:
252
+ required_reporting:
253
+ essential_metrics:
254
+ - "all_correct (binary score of 0/1 for each question)"
255
+ - "Fraction correct (final score computed across all questions)"
256
+ - "Refusal detections and LLM parsing flags"
257
+ results_disaggregation: >
258
+ Individual scoring and relative position available for Topics:
259
+ Analytical Chemistry, Materials Science, Technical Chemistry, General Chemistry,
260
+ Physical Chemistry, Toxicity and Safety, Inorganic Chemistry, Organic Chemistry,
261
+ and Human Preference. Separate scores for easy/hard tasks, reasoning tasks,
262
+ computation tasks, knowledge tasks, human preference alignment, and comparison
263
+ against human chemists.
264
+ uncertainty_quantification: >
265
+ ChemBench has a unique way to obtain confidence of model predictions using prompting,
266
+ but this is a separate analysis not part of benchmark metrics.
267
+ performance_variation: "Currently not done"
268
+ resource_usage_reporting: "Currently tracks number of parameters if available"
269
+
270
+ reproducibility_information:
271
+ documentation_requirements:
272
+ - "model_name"
273
+ - "model_timestamp"
274
+ - "model_description"
275
+ - "date_published (optional)"
276
+ - "open_weights (optional)"
277
+ - "open_dataset (optional)"
278
+ - "nr_of_parameters (optional)"
279
+ - "github (optional)"
280
+ - "paper (optional)"
281
+ - "api_endpoint (optional)"
282
+ - "nr_of_tokens (optional)"
283
+ - "architecture (optional)"
284
+ - "mixture_of_experts (optional)"
285
+ - "model_alignment (optional)"
286
+ - "reinforcement_learning_from_human_feedback (optional)"
287
+ - "domain_specific_pretraining (optional)"
288
+ - "domain_specific_finetuning (optional)"
289
+ - "tool_use (optional)"
290
+ - "tool_type (optional)"
291
+ - "temperature (optional)"
292
+ - "epochs (optional)"
293
+ - "reasoning_model (optional)"
294
+ - "reasoning_type (optional)"
295
+ environment_specifications: >
296
+ Benchmarking performed using latest version of ChemBench pipeline and ChemBench Dataset.
297
+ randomization_handling: >
298
+ Temperature or other randomization or seeding expected in model description.
299
+ output_standardization: >
300
+ Outputs prompted to be given in ChemBench parsing compatible format.
301
+
302
+ results_communication:
303
+ visualization:
304
+ recommended_plots:
305
+ - "Spider chart showing model performance on different topics against baseline and other leading models"
306
+ - "Reliability and distribution of confidence estimates, showing confidence calibration"
307
+ standardized_formats:
308
+ - "Latest results maintained in ChemBench-Leaderboard"
309
+ - "Refusals counted as incorrect"
310
+ - "Baseline model as defined in paper"
311
+ - "Final answer based on ChemBench pipeline, not log probabilities"
312
+
313
+ leaderboard_guidelines:
314
+ submission_process: "Detailed in Huggingface Space documentation"
315
+ required_metadata:
316
+ - "Model details as specified in documentation requirements"
317
+
318
+ known_issues_and_limitations:
319
+ validity_concerns:
320
+ construct_validity: >
321
+ Even though ChemBench goes beyond MCQ-only benchmarks by including numeric questions,
322
+ evaluation on open-ended tasks is not included. Partial scoring and task weighing not supported.
323
+ gaming_possibilities: "Possibility to host ChemBench as a challenge"
324
+ stability_considerations: >
325
+ Refusal detection and retry mechanism implemented to tackle LLM refusals,
326
+ combining regex-based detection and fine-tuned BERT model.
327
+ temporal_validity: >
328
+ Questions based on scientific principles won't lose validity,
329
+ but may appear in training corpora over time.
330
+
331
+ practical_limitations:
332
+ resource_constraints: "Based on the model being benchmarked"
333
+ scalability_issues: "Based on the model being benchmarked"
334
+ cost_factors: "Based on the model being benchmarked"
335
+ time_boundaries: "Benchmark might lose validity as questions leak to training corpora"
336
+
337
+ bias_and_fairness:
338
+ known_biases:
339
+ - "Biases from human curation process"
340
+ representation_issues: "Certain areas of chemistry not evaluated"
341
+ potential_impacts: "Certain areas of chemistry not evaluated"
342
+ mitigation_approaches: "Curation by team of more than 10 people to balance biases"
343
+
344
+ version_and_maintenance:
345
+ version_information:
346
+ version:
347
+ results: "v1.0.4"
348
+ dataset: "v1.0.0"
349
+ code: "v0.3.0"
350
+ release_date: "2024-11-01"
351
+ change_history: "Tracked in GitHub repository changelog"
352
+ update_plans: "Discussed in GitHub repository discussions"
353
+
354
+ maintenance_protocol:
355
+ update_frequency: "Ad hoc after release"
356
+ deprecation_policy: >
357
+ Based on major issues with questions. Questions removed and dataset version updated.
358
+ Major updates lead to rerunning models for updated Leaderboard.
359
+ issue_reporting: "Issues tracked in GitHub repository"
360
+ community_involvement: >
361
+ Maintainers active in solving user issues on GitHub.
362
+ Proposal for forum in Mat Sci Community Disclosure.
363
+ Discussions available on GitHub and Huggingface.
364
+ criteria_for_updates:
365
+ - "Codebase updated for new features or bug fixes"
366
+ - "Dataset updated when questions added or removed"
367
+ - "Leaderboard updated for new models or dataset updates"
368
+ breaking_change_policy: >
369
+ All models in leaderboard rerun with new updates.
370
+ Update of arXiv paper released. Proposal to release a commit.
371
+ backwards_compatibility: >
372
+ Pydantic base classes for task and report stable for compatibility.
373
+ Major changes to tasks and report backward compatible.
374
+ migration_guides: "Released in documentation as needed"
375
+
376
+ citation_and_usage:
377
+ citation_information:
378
+ recommended_citation: >
379
+ @misc{mirza2024largelanguagemodelssuperhuman,
380
+ title={Are large language models superhuman chemists?},
381
+ author={Adrian Mirza and Nawaf Alampara and Sreekanth Kunchapu and Benedict Emoekabu and Aswanth Krishnan and Mara Wilhelmi and Macjonathan Okereke and Juliane Eberhardt and Amir Mohammad Elahi and Maximilian Greiner and Caroline T. Holick and Tanya Gupta and Mehrdad Asgari and Christina Glaubitz and Lea C. Klepsch and Yannik KΓΆster and Jakob Meyer and Santiago Miret and Tim Hoffmann and Fabian Alexander Kreth and Michael Ringleb and Nicole Roesner and Ulrich S. Schubert and Leanne M. Stafast and Dinga Wonanke and Michael Pieler and Philippe Schwaller and Kevin Maik Jablonka},
382
+ year={2024},
383
+ eprint={2404.01475},
384
+ archivePrefix={arXiv},
385
+ primaryClass={cs.LG},
386
+ url={https://arxiv.org/abs/2404.01475},
387
+ }
388
+ related_publications:
389
+ - "Are large language models superhuman chemists? (https://arxiv.org/abs/2404.01475)"
390
+ - "Probing the limitations of multimodal language models for chemistry and materials research (https://arxiv.org/pdf/2411.16955)"
391
+ licensing_details: "MIT License"
392
+
393
+ usage_guidelines:
394
+ recommended_applications:
395
+ - "Evaluation of LLM capabilities in chemistry"
396
+ inappropriate_uses:
397
+ - "Training models with the ChemBench dataset"
398
+ implementation_best_practices: >
399
+ Results obtained with ChemBench pipeline and latest dataset at time of benchmarking considered valid practice.
400
+ ethical_considerations: "ChemBench dataset not meant for training"
401
+
402
+ additional_notes:
403
+ related_evaluations:
404
+ - "ChemBench extension for multimodal models (https://arxiv.org/pdf/2411.16955)"
405
+ - "MatText for bottlenecks of finetuned LLMs on property prediction (https://arxiv.org/abs/2406.17295)"
406
+ - "MaScQA for investigating materials science knowledge of LLMs (https://pubs.rsc.org/en/content/articlelanding/2024/dd/d3dd00188a)"
407
+ - "Measuring Capabilities of Language Models for Biology Research (https://arxiv.org/abs/2407.10362)"
408
+ future_directions: >
409
+ Sensitivity to prompting, improving performance with prompt optimization.
410
+ Mechanistic interpretability. Benchmarking agents on ChemBench.
411
+ Effect of grounding and post-training approaches.
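
The `metric_details` block above states the scoring rules in pseudocode. As a worked example, the sketch below applies those rules as written in the card (exact match for single-answer MCQs, a 1% relative error threshold for numeric answers); it is illustrative only, is not the ChemBench implementation, and the sample questions and answers are invented.

```python
# Worked example of the scoring rules described in metric_details above.
# Follows the formulas as stated in the card, not the actual ChemBench codebase.

def mcq_score(true_answer: str, predicted_answer: str) -> float:
    """Single-answer MCQ: 1 - HammingLoss collapses to 1 if correct, 0 otherwise."""
    return 1.0 if predicted_answer == true_answer else 0.0

def numeric_score(target: float, prediction: float, rel_tol: float = 0.01) -> float:
    """Numeric answer counts as correct if the absolute error is below 1% of |target|."""
    return 1.0 if abs(prediction - target) < rel_tol * abs(target) else 0.0

scores = [
    mcq_score("B", "B"),                                  # correct MCQ -> 1.0
    mcq_score("C", "A"),                                  # wrong MCQ   -> 0.0
    numeric_score(target=6.022e23, prediction=6.05e23),   # within 1%   -> 1.0
]

fraction_correct = sum(scores) / len(scores)
print(round(fraction_correct, 3))  # 0.667
```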
requirement.txt ADDED
@@ -0,0 +1,4 @@
1
+ gradio>=4.0.0
2
+ pyyaml>=6.0
3
+ pandas>=2.0.0
4
+ requests>=2.31.0
script.js ADDED
@@ -0,0 +1,90 @@
1
+ // Function to view YAML file
2
+ function viewYaml(element) {
3
+ const filename = element.getAttribute('data-filename');
4
+
5
+ // Make an AJAX request to fetch the YAML content
6
+ fetch(`/get_yaml?filename=${encodeURIComponent(filename)}`)
7
+ .then(response => response.text())
8
+ .then(yamlContent => {
9
+ // Display the YAML content in a modal
10
+ const modal = document.createElement('div');
11
+ modal.style.position = 'fixed';
12
+ modal.style.top = '0';
13
+ modal.style.left = '0';
14
+ modal.style.width = '100%';
15
+ modal.style.height = '100%';
16
+ modal.style.backgroundColor = 'rgba(0, 0, 0, 0.5)';
17
+ modal.style.zIndex = '1000';
18
+ modal.style.display = 'flex';
19
+ modal.style.justifyContent = 'center';
20
+ modal.style.alignItems = 'center';
21
+
22
+ const modalContent = document.createElement('div');
23
+ modalContent.style.backgroundColor = 'white';
24
+ modalContent.style.padding = '20px';
25
+ modalContent.style.borderRadius = '5px';
26
+ modalContent.style.maxWidth = '80%';
27
+ modalContent.style.maxHeight = '80%';
28
+ modalContent.style.overflow = 'auto';
29
+
30
+ const closeButton = document.createElement('button');
31
+ closeButton.textContent = 'Close';
32
+ closeButton.style.marginBottom = '10px';
33
+ closeButton.style.padding = '5px 10px';
34
+ closeButton.style.cursor = 'pointer';
35
+ closeButton.onclick = () => {
36
+ document.body.removeChild(modal);
37
+ };
38
+
39
+ const yamlPre = document.createElement('pre');
40
+ yamlPre.textContent = yamlContent;
41
+ yamlPre.style.whiteSpace = 'pre-wrap';
42
+ yamlPre.style.wordBreak = 'break-word';
43
+
44
+ modalContent.appendChild(closeButton);
45
+ modalContent.appendChild(yamlPre);
46
+ modal.appendChild(modalContent);
47
+
48
+ document.body.appendChild(modal);
49
+ })
50
+ .catch(error => {
51
+ console.error('Error fetching YAML content:', error);
52
+ alert('Error fetching YAML content: ' + error.message);
53
+ });
54
+ }
55
+
56
+ // Function to visualize coverage scores
57
+ function visualizeCoverage(scoreDetails) {
58
+ const chartContainer = document.getElementById('coverage-chart');
59
+
60
+ // Create a bar chart using a visualization library
61
+ // This is just a placeholder - you would use a library like Chart.js
62
+
63
+ let html = `<div style="margin-top: 20px;">
64
+ <h3>Coverage by Section</h3>
65
+ <div style="display: flex; flex-direction: column; gap: 5px;">`;
66
+
67
+ for (const [section, details] of Object.entries(scoreDetails)) {
68
+ const percentage = details.completion_rate;
69
+ html += `
70
+ <div>
71
+ <div style="display: flex; justify-content: space-between; margin-bottom: 2px;">
72
+ <span>${section}</span>
73
+ <span>${percentage}%</span>
74
+ </div>
75
+ <div style="width: 100%; background-color: #eee; height: 10px; border-radius: 5px;">
76
+ <div style="width: ${percentage}%; background-color: #3273dc; height: 10px; border-radius: 5px;"></div>
77
+ </div>
78
+ </div>`;
79
+ }
80
+
81
+ html += '</div></div>';
82
+
83
+ chartContainer.innerHTML = html;
84
+ }
85
+
86
+ // Initialize any client-side functionality when the document loads
87
+ document.addEventListener('DOMContentLoaded', function() {
88
+ // This could be used to initialize charts or other client-side features
89
+ console.log('Client-side JavaScript initialized');
90
+ });
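
Note that `viewYaml` above fetches `/get_yaml?filename=...`, an endpoint that app.py in this commit does not define. One possible way to provide it, sketched below under the assumption that the Gradio Blocks app is mounted on a FastAPI application, is to add the route there; the module layout, the `api` object, and the `get_yaml` handler are hypothetical and not part of the commit.

```python
# Hypothetical companion to script.js: expose the /get_yaml endpoint it expects.
# Not part of this commit; the route, names, and mounting approach are assumptions.
import os

import gradio as gr
from fastapi import FastAPI, HTTPException
from fastapi.responses import PlainTextResponse

from app import EVAL_CARDS_DIR, app as blocks_app  # the gr.Blocks instance defined in app.py

api = FastAPI()

@api.get("/get_yaml", response_class=PlainTextResponse)
def get_yaml(filename: str):
    # Only serve files that actually live inside the eval cards directory.
    safe_name = os.path.basename(filename)
    path = os.path.join(EVAL_CARDS_DIR, safe_name)
    if not os.path.isfile(path):
        raise HTTPException(status_code=404, detail="Eval card not found")
    with open(path, "r") as fh:
        return fh.read()

# Serve the Gradio UI at the root of the same FastAPI application
# (run with: uvicorn serve:api, assuming this file is saved as serve.py).
api = gr.mount_gradio_app(api, blocks_app, path="/")
```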
style.css ADDED
@@ -0,0 +1,12 @@
1
+ /* General styles */
2
+ body {
3
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
4
+ }
5
+
6
+ /* Eval card gallery styles */
7
+ .eval-card {
8
+ border: 1px solid #ddd;
9
+ border-radius: 5px;
10
+ padding: 20px;
11
+ margin-bottom: 20px;
12
+ background-color: white;
+ }
template.yaml ADDED
File without changes
yaml_template.yaml ADDED
File without changes