Spaces:

successforce
/

agi_evals

No application file

File size: 6,761 Bytes

477fa2f

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building an MMLU Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval\n",
    "- Load the results and into a Pandas Dataframe\n",
    "\n",
    "We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the choice of model for each prompt, check if the completion matches the true answer, then logs a result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install, and download MMLU if you haven't already\n",
    "%pip install -e .\n",
    "\n",
    "!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar\n",
    "!tar -xf data.tar\n",
    "data_pth = \"data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_pth = os.path.join(os.getcwd(), \"../evals/registry\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models\n",
    "\n",
    "choices = [\"A\", \"B\", \"C\", \"D\"]\n",
    "sys_msg = \"The following are multiple choice questions (with answers) about {}.\"\n",
    "def create_chat_prompt(sys_msg, question, answers, subject):\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(subject)}, \n",
    "        {\"role\": \"user\", \"content\": user_prompt}\n",
    "    ]\n",
    "\n",
    "def create_chat_example(question, answers, correct_answer):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": user_prompt, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_answer, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "subjects = sorted([f.split(\"_test.csv\")[0] for f in os.listdir(os.path.join(data_pth, \"test\")) if \"_test.csv\" in f])\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for subject in subjects:\n",
    "    subject_pth = os.path.join(registry_pth, \"data\", \"mmlu\", subject)\n",
    "    os.makedirs(subject_pth, exist_ok=True)\n",
    "\n",
    "    # Create few-shot prompts\n",
    "    dev_df = pd.read_csv(os.path.join(data_pth, \"dev\", subject + \"_dev.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], x[\"Answer\"]), axis=1)\n",
    "    few_shot_pth = os.path.join(subject_pth, \"few_shot.jsonl\")     \n",
    "    dev_df[[\"sample\"]].to_json(few_shot_pth, lines=True, orient=\"records\")\n",
    "\n",
    "    # Create test prompts and ideal completions\n",
    "    test_df = pd.read_csv(os.path.join(data_pth, \"test\", subject + \"_test.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    test_df[\"input\"] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], subject), axis=1)\n",
    "    test_df[\"ideal\"] = test_df.Answer\n",
    "    samples_pth = os.path.join(subject_pth, \"samples.jsonl\")     \n",
    "    test_df[[\"input\", \"ideal\"]].to_json(samples_pth, lines=True, orient=\"records\")\n",
    "\n",
    "    eval_id = f\"match_mmlu_{subject}\"\n",
    "\n",
    "    registry_yaml[eval_id] = {\n",
    "        \"id\": f\"{eval_id}.test.v1\",\n",
    "        \"metrics\": [\"accuracy\"]\n",
    "    }\n",
    "    registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
    "        \"class\": \"evals.elsuite.basic.match:Match\",\n",
    "        \"args\": {\n",
    "            \"samples_jsonl\": samples_pth,\n",
    "            \"few_shot_jsonl\": few_shot_pth,\n",
    "            \"num_few_shot\": 4,\n",
    "        }\n",
    "    }\n",
    "\n",
    "with open(os.path.join(registry_pth, \"evals\", \"mmlu.yaml\"), \"w\") as f:\n",
    "    yaml.dump(registry_yaml, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n",
    "!oaieval gpt-3.5-turbo match_mmlu_anatomy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to process the log events generated by oaieval\n",
    "events = \"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"Correctness\", ylabel=\"Count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "oss_evals",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}