Spaces:

successforce
/

agi_evals

No application file

File size: 8,870 Bytes

477fa2f

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building a MAFAND Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval using the [MAFAND dataset](https://github.com/masakhane-io/lafand-mt)\n",
    "- Load the results into a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import requests\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Install Evals if you haven't already\n",
    "# %pip install -e ../."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the MAFAND dataset: the dev and test TSV files of every language pair\n",
    "\n",
    "lang_pairs = [\n",
    "    \"en-amh\", \"en-hau\", \"en-ibo\", \"en-kin\", \"en-lug\", \"en-nya\", \"en-pcm\", \"en-sna\", \"en-swa\", \"en-tsn\",\n",
    "    \"en-twi\", \"en-xho\", \"en-yor\", \"en-zul\", \"fr-bam\", \"fr-bbj\", \"fr-ewe\", \"fr-fon\", \"fr-mos\", \"fr-wol\"\n",
    "]\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_pth = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "data_path = os.path.join(registry_pth, \"data\", \"lafand-mt\")\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "for pair in lang_pairs:\n",
    "    os.makedirs(os.path.join(data_path, pair), exist_ok=True)\n",
    "    for dev_test in ['dev', 'test']:\n",
    "        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'\n",
    "        # A timeout keeps a stalled connection from hanging the notebook forever\n",
    "        response = requests.get(raw_tsv_file, timeout=60)\n",
    "        # Fail loudly on HTTP errors so an error page is never saved as if it were TSV data;\n",
    "        # checked before open() so a failed download does not leave an empty file behind\n",
    "        response.raise_for_status()\n",
    "        # Store the raw bytes instead of round-tripping through the text encoding guessed by requests\n",
    "        with open(os.path.join(data_path, pair, f\"{dev_test}.tsv\"), \"wb\") as f:\n",
    "            f.write(response.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt builders in Chat format. We support converting Chat conversations to text for non-Chat models\n",
    "\n",
    "sys_msg = \"Translate the text from {} to {}.\"\n",
    "\n",
    "\n",
    "def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):\n",
    "    \"\"\"Return a two-message chat prompt: the formatted system instruction, then the user text.\"\"\"\n",
    "    instruction = {\"role\": \"system\", \"content\": sys_msg.format(input_lang, output_lang)}\n",
    "    request = {\"role\": \"user\", \"content\": input_text}\n",
    "    return [instruction, request]\n",
    "\n",
    "\n",
    "def create_chat_example(input_text, correct_translation):\n",
    "    \"\"\"\n",
    "    Return one few-shot demonstration as a pair of named system messages, in the recommended format:\n",
    "    https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    speakers = ((\"example_user\", input_text), (\"example_assistant\", correct_translation))\n",
    "    return [{\"role\": \"system\", \"content\": text, \"name\": speaker} for speaker, text in speakers]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "import os\n",
    "\n",
    "# Only language-pair directories are inputs; skip stray files (e.g. .DS_Store) that would break the split below\n",
    "translation_paths = sorted(\n",
    "    os.path.join(data_path, d)\n",
    "    for d in os.listdir(data_path)\n",
    "    if os.path.isdir(os.path.join(data_path, d))\n",
    ")\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_pth = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "output_path = os.path.join(registry_pth, \"data\", \"lafand-mt\")\n",
    "\n",
    "# Maps each eval name and each versioned eval id to its registry entry\n",
    "registry_yaml = {}\n",
    "\n",
    "for input_path in translation_paths:\n",
    "    # basename() honours the platform path separator, unlike splitting on \"/\"\n",
    "    langs = os.path.basename(input_path)\n",
    "    input_lang, output_lang = langs.split('-')\n",
    "    pair_path = os.path.join(output_path, f\"{input_lang}-{output_lang}\")\n",
    "    os.makedirs(pair_path, exist_ok=True)\n",
    "\n",
    "    # Create few-shot prompts\n",
    "    dev_df = pd.read_csv(os.path.join(input_path, \"dev.tsv\"), sep=\"\\t\")\n",
    "    dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)\n",
    "    few_shot_pth = os.path.join(pair_path, f\"{input_lang}-{output_lang}_few_shot.jsonl\")\n",
    "    dev_df[[\"sample\"]].to_json(few_shot_pth, lines=True, orient=\"records\")\n",
    "\n",
    "    # Create test prompts and ideal completions\n",
    "    test_df = pd.read_csv(os.path.join(input_path, \"test.tsv\"), sep=\"\\t\")\n",
    "    test_df[\"input\"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))\n",
    "    test_df[\"ideal\"] = test_df[output_lang]\n",
    "\n",
    "    samples_pth = os.path.join(pair_path, f\"{input_lang}-{output_lang}_samples.jsonl\")\n",
    "    test_df[[\"input\", \"ideal\"]].to_json(samples_pth, lines=True, orient=\"records\")\n",
    "    eval_id = f\"mafand_translation_{input_lang}-{output_lang}\"\n",
    "\n",
    "    # The bare eval name points at its versioned id; the versioned id carries the class and its arguments\n",
    "    registry_yaml[eval_id] = {\n",
    "        \"id\": f\"{eval_id}.test.v1\",\n",
    "        \"metrics\": [\"accuracy\"]\n",
    "    }\n",
    "    registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
    "        \"class\": \"evals.elsuite.translate:Translate\",\n",
    "        \"args\": {\n",
    "            \"samples_jsonl\": samples_pth,\n",
    "            \"few_shot_jsonl\": few_shot_pth,\n",
    "            \"num_few_shot\": 4,\n",
    "        }\n",
    "    }\n",
    "\n",
    "os.makedirs(os.path.join(registry_pth, \"evals\"), exist_ok=True)\n",
    "with open(os.path.join(registry_pth, \"evals\", \"mafand.yaml\"), \"w\") as f:\n",
    "    yaml.dump(registry_yaml, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the en-ibo eval with gpt-3.5-turbo, capped at 20 samples (--max_samples).\n",
    "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n",
    "!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to process the log events generated by oaieval\n",
    "\n",
    "log_name = \"EDIT THIS\"  # copy from above\n",
    "events = f\"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "# The log is read as JSON Lines: one event per line\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "# Keep the \"match\" events and expand their data payload into columns\n",
    "matches_df = events_df.loc[events_df[\"type\"] == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df[\"data\"]))\n",
    "\n",
    "matches_df[\"correct\"].value_counts().plot.bar(\n",
    "    title=\"Correctness of generated answers\",\n",
    "    xlabel=\"sacrebleu score >30\",\n",
    "    ylabel=\"Count\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# sentence-level sacrebleu score of every match event\n",
    "scores = matches_df['sacrebleu_sentence_score']\n",
    "\n",
    "# define the threshold scores as a range from the minimum to the maximum score, in increments of 5\n",
    "thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)\n",
    "\n",
    "# count the number of scores at or above each threshold\n",
    "above_counts = [int((scores >= threshold).sum()) for threshold in thresholds]\n",
    "\n",
    "# plot the counts as a step function\n",
    "plt.step(thresholds, above_counts, label='number of samples with score >= threshold')\n",
    "\n",
    "# set the x and y labels (the comparison is inclusive, so the label says >=)\n",
    "plt.xlabel('sacrebleu threshold')\n",
    "plt.ylabel('number of samples w/ score >= threshold')\n",
    "\n",
    "# show the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples: print the prompt and the sampled completion of every \"sampling\" event\n",
    "sampling_data = events_df.loc[events_df[\"type\"] == \"sampling\", \"data\"]\n",
    "divider = \"-\" * 25\n",
    "for idx, record in pd.json_normalize(sampling_data).iterrows():\n",
    "    print(f\"Prompt: {record.prompt}\")\n",
    "    print(f\"Sampled: {record.sampled}\")\n",
    "    print(divider)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "fdbe172e46cfba2329a5e8d5b64cdf2d12f4dfd7d9bcea153ecef62d1d51933b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}