{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building a MAFAND Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval using the [MAFAND dataset](https://github.com/masakhane-io/lafand-mt)\n",
    "- Load the results into a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# All imports live in this one cell so a fresh kernel can Restart & Run All\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import requests\n",
    "import yaml\n",
    "\n",
    "\n",
    "# Install Evals if you haven't already\n",
    "# %pip install -e ../."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the MAFAND dataset\n",
    "\n",
    "lang_pairs = [\n",
    "    \"en-amh\", \"en-hau\", \"en-ibo\", \"en-kin\", \"en-lug\", \"en-nya\", \"en-pcm\", \"en-sna\", \"en-swa\", \"en-tsn\",\n",
    "    \"en-twi\", \"en-xho\", \"en-yor\", \"en-zul\", \"fr-bam\", \"fr-bbj\", \"fr-ewe\", \"fr-fon\", \"fr-mos\", \"fr-wol\"\n",
    "]\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_pth = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "data_path = os.path.join(registry_pth, \"data\", \"lafand-mt\")\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "for pair in lang_pairs:\n",
    "    os.makedirs(os.path.join(data_path, pair), exist_ok=True)\n",
    "    for dev_test in ['dev', 'test']:\n",
    "        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'\n",
    "        # Fetch before opening the output file and fail loudly on HTTP errors;\n",
    "        # otherwise an error page would be saved as if it were the TSV data\n",
    "        # and only surface later as a confusing column lookup failure.\n",
    "        response = requests.get(raw_tsv_file, timeout=60)\n",
    "        response.raise_for_status()\n",
    "        with open(os.path.join(data_path, pair, f\"{dev_test}.tsv\"), \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(response.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models\n",
    "\n",
    "sys_msg = \"Translate the text from {} to {}.\"\n",
    "def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):\n",
    "    \"\"\"\n",
    "    Build the chat prompt for one test sample: a system message made by filling\n",
    "    `sys_msg` with the two language codes, followed by the text to translate as the user message.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(input_lang, output_lang)},\n",
    "        {\"role\": \"user\", \"content\": input_text}\n",
    "    ]\n",
    "\n",
    "def create_chat_example(input_text, correct_translation):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": input_text, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_translation, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only sub-directories are language pairs; skip stray files such as .DS_Store\n",
    "translation_paths = sorted(\n",
    "    os.path.join(data_path, d)\n",
    "    for d in os.listdir(data_path)\n",
    "    if os.path.isdir(os.path.join(data_path, d))\n",
    ")\n",
    "\n",
    "# The generated JSONL files are written next to the downloaded TSV files\n",
    "output_path = data_path\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for input_path in translation_paths:\n",
    "    # basename instead of splitting on '/' so this also works with Windows path separators\n",
    "    langs = os.path.basename(input_path)\n",
    "    input_lang, output_lang = langs.split('-')\n",
    "    pair_path = os.path.join(output_path, f\"{input_lang}-{output_lang}\")\n",
    "    os.makedirs(pair_path, exist_ok=True)\n",
    "\n",
    "    # Create few-shot prompts\n",
    "    dev_df = pd.read_csv(os.path.join(input_path, \"dev.tsv\"), sep=\"\\t\")\n",
    "    dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)\n",
    "    few_shot_pth = os.path.join(pair_path, f\"{input_lang}-{output_lang}_few_shot.jsonl\")\n",
    "    dev_df[[\"sample\"]].to_json(few_shot_pth, lines=True, orient=\"records\")\n",
    "\n",
    "    # Create test prompts and ideal completions\n",
    "    test_df = pd.read_csv(os.path.join(input_path, \"test.tsv\"), sep=\"\\t\")\n",
    "    test_df[\"input\"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))\n",
    "    test_df[\"ideal\"] = test_df[output_lang]\n",
    "\n",
    "    samples_pth = os.path.join(pair_path, f\"{input_lang}-{output_lang}_samples.jsonl\")\n",
    "    test_df[[\"input\", \"ideal\"]].to_json(samples_pth, lines=True, orient=\"records\")\n",
    "    eval_id = f\"mafand_translation_{input_lang}-{output_lang}\"\n",
    "\n",
    "    # Two registry entries per pair: the eval alias and the versioned eval spec it points to\n",
    "    registry_yaml[eval_id] = {\n",
    "        \"id\": f\"{eval_id}.test.v1\",\n",
    "        \"metrics\": [\"accuracy\"]\n",
    "    }\n",
    "    registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
    "        \"class\": \"evals.elsuite.translate:Translate\",\n",
    "        \"args\": {\n",
    "            \"samples_jsonl\": samples_pth,\n",
    "            \"few_shot_jsonl\": few_shot_pth,\n",
    "            \"num_few_shot\": 4,\n",
    "        }\n",
    "    }\n",
    "\n",
    "os.makedirs(os.path.join(registry_pth, \"evals\"), exist_ok=True)\n",
    "with open(os.path.join(registry_pth, \"evals\", \"mafand.yaml\"), \"w\") as f:\n",
    "    yaml.dump(registry_yaml, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n",
    "!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to process the log events generated by oaieval\n",
    "\n",
    "log_name = \"EDIT THIS\" # copy from above\n",
    "events = f\"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "# Keep only the \"match\" events and expand their nested data payload into columns\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"sacrebleu score >30\", ylabel=\"Count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# your list of scores\n",
    "scores = matches_df['sacrebleu_sentence_score']\n",
    "\n",
    "# define the threshold scores as a range from the minimum to the maximum score, in increments of 5\n",
    "thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)\n",
    "\n",
    "# count the number of scores at or above each threshold\n",
    "above_counts = [int((scores >= threshold).sum()) for threshold in thresholds]\n",
    "\n",
    "# plot the counts as a step function\n",
    "plt.step(thresholds, above_counts, label='number of samples with score >= threshold')\n",
    "\n",
    "# set the x and y labels (the counts are inclusive, hence >=)\n",
    "plt.xlabel('sacrebleu threshold')\n",
    "plt.ylabel('number of samples w/ score >= threshold')\n",
    "\n",
    "# the label passed to plt.step is only rendered once a legend is requested\n",
    "plt.legend()\n",
    "\n",
    "# show the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for _, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "fdbe172e46cfba2329a5e8d5b64cdf2d12f4dfd7d9bcea153ecef62d1d51933b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}