{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Building an MMLU Eval\n", "\n", "This notebook shows how to:\n", "- Build and run an eval\n", "- Load the results and into a Pandas Dataframe\n", "\n", "We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the choice of model for each prompt, check if the completion matches the true answer, then logs a result." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install, and download MMLU if you haven't already\n", "%pip install -e .\n", "\n", "!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar\n", "!tar -xf data.tar\n", "data_pth = \"data\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "\n", "# Assuming this notebook is in examples/\n", "registry_pth = os.path.join(os.getcwd(), \"../evals/registry\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models\n", "\n", "choices = [\"A\", \"B\", \"C\", \"D\"]\n", "sys_msg = \"The following are multiple choice questions (with answers) about {}.\"\n", "def create_chat_prompt(sys_msg, question, answers, subject):\n", " user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n", " return [\n", " {\"role\": \"system\", \"content\": sys_msg.format(subject)}, \n", " {\"role\": \"user\", \"content\": user_prompt}\n", " ]\n", "\n", "def create_chat_example(question, answers, correct_answer):\n", " \"\"\"\n", " Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n", " \"\"\"\n", " user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. 
{answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n", " return [\n", " {\"role\": \"system\", \"content\": user_prompt, \"name\": \"example_user\"},\n", " {\"role\": \"system\", \"content\": correct_answer, \"name\": \"example_assistant\"},\n", " ]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import yaml\n", "subjects = sorted([f.split(\"_test.csv\")[0] for f in os.listdir(os.path.join(data_pth, \"test\")) if \"_test.csv\" in f])\n", "\n", "registry_yaml = {}\n", "\n", "for subject in subjects:\n", " subject_pth = os.path.join(registry_pth, \"data\", \"mmlu\", subject)\n", " os.makedirs(subject_pth, exist_ok=True)\n", "\n", " # Create few-shot prompts\n", " dev_df = pd.read_csv(os.path.join(data_pth, \"dev\", subject + \"_dev.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n", " dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], x[\"Answer\"]), axis=1)\n", " few_shot_pth = os.path.join(subject_pth, \"few_shot.jsonl\") \n", " dev_df[[\"sample\"]].to_json(few_shot_pth, lines=True, orient=\"records\")\n", "\n", " # Create test prompts and ideal completions\n", " test_df = pd.read_csv(os.path.join(data_pth, \"test\", subject + \"_test.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n", " test_df[\"input\"] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], subject), axis=1)\n", " test_df[\"ideal\"] = test_df.Answer\n", " samples_pth = os.path.join(subject_pth, \"samples.jsonl\") \n", " test_df[[\"input\", \"ideal\"]].to_json(samples_pth, lines=True, orient=\"records\")\n", "\n", " eval_id = f\"match_mmlu_{subject}\"\n", "\n", " registry_yaml[eval_id] = {\n", " \"id\": f\"{eval_id}.test.v1\",\n", " \"metrics\": [\"accuracy\"]\n", " }\n", " registry_yaml[f\"{eval_id}.test.v1\"] = {\n", " \"class\": \"evals.elsuite.basic.match:Match\",\n", " \"args\": {\n", " \"samples_jsonl\": samples_pth,\n", " \"few_shot_jsonl\": few_shot_pth,\n", " \"num_few_shot\": 4,\n", " }\n", " }\n", "\n", "with open(os.path.join(registry_pth, \"evals\", \"mmlu.yaml\"), \"w\") as f:\n", " yaml.dump(registry_yaml, f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n", "!oaieval gpt-3.5-turbo match_mmlu_anatomy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# How to process the log events generated by oaieval\n", "events = \"/tmp/evallogs/{log_name}\"\n", "\n", "with open(events, \"r\") as f:\n", " events_df = pd.read_json(f, lines=True)\n", "\n", "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n", "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n", "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"Correctness\", ylabel=\"Count\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Inspect samples\n", "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n", " print(f\"Prompt: {r.prompt}\")\n", " print(f\"Sampled: {r.sampled}\")\n", " print(\"-\" * 25)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "oss_evals", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }