diff --git "a/Understanding_the_challenge.ipynb" "b/Understanding_the_challenge.ipynb"
new file mode 100644--- /dev/null
+++ "b/Understanding_the_challenge.ipynb"
@@ -0,0 +1,4496 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "## \"Unlocking Kenya's Land Treasure: The Structured Data Extraction Challenge\"\n",
+ "\n",
+ "* Welcome to our organization's first internal competition, where we embark on a journey to unlock the hidden potential within Kenya's gazette land registration archives. In this challenge, you will delve into the depths of unstructured data contained within Kenyan gazettes, extracting crucial information such as holder names, registration numbers, and precise land locations.\n",
+ "\n",
+ "* The objective is clear: transform chaotic data into structured goldmines of information, paving the way for enhanced analytics, informed decision-making, and innovative solutions in various sectors. By harnessing the power of structured data, we aim to revolutionize land management practices, drive economic growth, and foster sustainable development across Kenya.\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "I2SSyjgI41oA"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#This is only for those with the data in their google drive\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/gdrive/')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "JiTlL57m67ht",
+ "outputId": "da02dcdd-0836-4357-cc7f-1e464e50ecb8"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/gdrive/\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Train Data\n",
+ "* Within the provided internal competition folder, there are \"train\" and \"test\" directories.\n",
+ "* The \"train\" directory encompasses all the necessary data for training your AI models.\n",
+ "* The CSV files within represent the extracted information from the PDF files with corresponding filenames.\n",
+ "* The \"Train.csv\" file amalgamates all the individual train PDF files' CSVs."
+ ],
+ "metadata": {
+ "id": "A4bcIrmj6LFX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n"
+ ],
+ "metadata": {
+ "id": "cuWtAf8j5t2h"
+ },
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train = pd.read_csv(\"/content/gdrive/MyDrive/data/train/Train.csv\")"
+ ],
+ "metadata": {
+ "id": "BwjLLkF76OfL"
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "display(\" Peak into the data\",\n",
+ " train.head(),\n",
+ " \" train's shape\",\n",
+ " train.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 347
+ },
+ "id": "jZM8xtSn7OHn",
+ "outputId": "563e71f7-7d2e-45a5-d527-b3e35504c168"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "' Peak into the data'"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " filename page number gazzete notice number name of the holder \\\n",
+ "0 2008_VOL4 14 43 Syprose Helida Odero \n",
+ "1 2008_VOL4 14 44 David Ndungu Muchiiri \n",
+ "2 2008_VOL4 14 45 Njuguna Ngamau \n",
+ "3 2008_VOL4 14 46 Mary Njeri Nganga \n",
+ "4 2008_VOL4 14 47 Charles Muteru Wambugu \n",
+ "\n",
+ " Registration numbers Land location \n",
+ "0 Kisumu/Ojolla/418 Kisumu \n",
+ "1 Gilgil/Gilgil Block 1/500 district of Nakuru \n",
+ "2 Shawa/Rongai Block 3/56(Sachangwan) district of Nakuru \n",
+ "3 Dundori/Lanet Block 13/1 district of Nakuru \n",
+ "4 Gilgil/Cilgil Block 1/375 district of Nakuru "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " page number | \n",
+ " gazzete notice number | \n",
+ " name of the holder | \n",
+ " Registration numbers | \n",
+ " Land location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 43 | \n",
+ " Syprose Helida Odero | \n",
+ " Kisumu/Ojolla/418 | \n",
+ " Kisumu | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 44 | \n",
+ " David Ndungu Muchiiri | \n",
+ " Gilgil/Gilgil Block 1/500 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 45 | \n",
+ " Njuguna Ngamau | \n",
+ " Shawa/Rongai Block 3/56(Sachangwan) | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 46 | \n",
+ " Mary Njeri Nganga | \n",
+ " Dundori/Lanet Block 13/1 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 47 | \n",
+ " Charles Muteru Wambugu | \n",
+ " Gilgil/Cilgil Block 1/375 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \" train\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"filename\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"2008_VOL4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 14,\n \"max\": 14,\n \"num_unique_values\": 1,\n \"samples\": [\n 14\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gazzete notice number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 43,\n \"max\": 47,\n \"num_unique_values\": 5,\n \"samples\": [\n 44\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name of the holder\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"David Ndungu Muchiiri\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Registration numbers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Gilgil/Gilgil Block 1/500\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Land location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"district of Nakuru\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "\" train's shape\""
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "(649, 6)"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* Filename: Name of the PDF from which the information has been obtained.\n",
+ "* Page number: The page number where the gazette notice is located, for confirmation and correcting any misrepresented information.\n",
+ "* Name of the holder: Holder of the land mentioned. This is one of the attributes we will be extracting.\n",
+ "* Registration Numbers: Any LR No, IR No, CR No, or Title No mentioned in the gazette notice. If two or more are present, all of them are to be extracted as comma-separated values.\n",
+ "* Land location: The location where the land is situated. Also to be extracted.\n"
+ ],
+ "metadata": {
+ "id": "oNlOSfyJ7e79"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train[(train['filename'].isin(['2023_VOL168'])) & (train['gazzete notice number'].isin([9638]))]\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 115
+ },
+ "id": "2_mj5da7yBcr",
+ "outputId": "c6c972d7-30c2-4092-9925-fe92566a4e2a"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " filename page number gazzete notice number name of the holder \\\n",
+ "560 2023_VOL168 3156 9638 Muturi Wanaina Muturi \n",
+ "\n",
+ " Registration numbers Land location \n",
+ "560 Sosian/Sosian Block 2/2117 (Narock Ranch) NaN "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " page number | \n",
+ " gazzete notice number | \n",
+ " name of the holder | \n",
+ " Registration numbers | \n",
+ " Land location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 560 | \n",
+ " 2023_VOL168 | \n",
+ " 3156 | \n",
+ " 9638 | \n",
+ " Muturi Wanaina Muturi | \n",
+ " Sosian/Sosian Block 2/2117 (Narock Ranch) | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "repr_error": "'str' object has no attribute 'empty'"
+ }
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Test Data\n",
+ "* The \"test\" directory encompasses all the necessary data for performing inference using your trained AI models.\n",
+ "* The CSV files within represent the extracted information from the PDF files with corresponding filenames, but with the columns \"Name of the Holder,\" \"Registration Numbers,\" and \"Land Location\" left empty, as these are the fields we are aiming to extract.\n",
+ "* The \"Test.csv\" file amalgamates all the individual test PDF files' CSVs."
+ ],
+ "metadata": {
+ "id": "MMV_l8tX8sHe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "test = pd.read_csv(\"/content/gdrive/MyDrive/data/test/Test.csv\")\n",
+ "display(\" Peak into the data\",\n",
+ " test.head(),\n",
+ " \" test's shape\",\n",
+ " test.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 260
+ },
+ "id": "IOKYZSUR7Z3Q",
+ "outputId": "eafd0e70-41da-47d7-a2d6-7b169644bbd9"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "' Peak into the data'"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " filename gazzete notice number name of the holder Registration numbers \\\n",
+ "0 2023_VOL30 1430 \n",
+ "1 2023_VOL30 1431 \n",
+ "2 2023_VOL30 1432 \n",
+ "3 2023_VOL30 1433 \n",
+ "4 2023_VOL30 1434 \n",
+ "\n",
+ " Land location \n",
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " gazzete notice number | \n",
+ " name of the holder | \n",
+ " Registration numbers | \n",
+ " Land location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2023_VOL30 | \n",
+ " 1430 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2023_VOL30 | \n",
+ " 1431 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2023_VOL30 | \n",
+ " 1432 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2023_VOL30 | \n",
+ " 1433 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2023_VOL30 | \n",
+ " 1434 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \" test\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"filename\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"2023_VOL30\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gazzete notice number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1430,\n \"max\": 1434,\n \"num_unique_values\": 5,\n \"samples\": [\n 1431\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name of the holder\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Registration numbers\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Land location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "\" test's shape\""
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "(374, 5)"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Sample Submission\n",
+ "* The submission file is a melted version of the Test.csv\n",
+ "* What does this mean? Let us first take a look at the provided sample submission"
+ ],
+ "metadata": {
+ "id": "88rs_TbG9uCp"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "D7UoNItQx98X"
+ },
+ "execution_count": 8,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sample_sub = pd.read_csv(\"/content/gdrive/MyDrive/data/sample_submission.csv\")\n",
+ "display(sample_sub.head(), sample_sub.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 224
+ },
+ "id": "PImiTW059oRC",
+ "outputId": "38fe166b-8f48-40b0-daca-5e80a5f579ce"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " id pred\n",
+ "0 2023_VOL30_1430_name of the holder \n",
+ "1 2023_VOL30_1431_name of the holder \n",
+ "2 2023_VOL30_1432_name of the holder \n",
+ "3 2023_VOL30_1433_name of the holder \n",
+ "4 2023_VOL30_1434_name of the holder "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2023_VOL30_1430_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2023_VOL30_1431_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2023_VOL30_1432_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2023_VOL30_1433_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2023_VOL30_1434_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(sample_sub\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2023_VOL30_1431_name of the holder\",\n \"2023_VOL30_1434_name of the holder\",\n \"2023_VOL30_1432_name of the holder\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "(1122, 2)"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Melting is a concept in data reshaping where multiple columns, typically representing different variables, are transformed into a single column, while maintaining the relationships between them. In our context, melting involves pivoting the data from a wide format, where each observation has multiple columns, to a long format, where each observation is represented in multiple rows. This allows for easier evaluation, as we will see later on.\n",
+ "\n",
+ "* The shape has now changed from 374 rows to 1122 rows (3 fields to predict for each of the 374 test rows), with the \"id\" column taking the form `filename_gazettenoticenumber_variabletobepredicted`.\n",
+ "* Let's explore the transformation from wide format to long format using our training dataframe.\n",
+ "* This conversion is crucial for ensuring a smooth evaluation phase."
+ ],
+ "metadata": {
+ "id": "E9tzXNKQ-NaM"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Melting manipulation on our train dataframe:\n",
+ "* This is the same process you will follow to transform your test df from wide to long format"
+ ],
+ "metadata": {
+ "id": "eii_Rac7_GWX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 293
+ },
+ "id": "Ywhkh07M-K4P",
+ "outputId": "46fc0da6-9cf4-42a7-81a6-a2c7661c9b88"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " filename page number gazzete notice number name of the holder \\\n",
+ "0 2008_VOL4 14 43 Syprose Helida Odero \n",
+ "1 2008_VOL4 14 44 David Ndungu Muchiiri \n",
+ "2 2008_VOL4 14 45 Njuguna Ngamau \n",
+ "3 2008_VOL4 14 46 Mary Njeri Nganga \n",
+ "4 2008_VOL4 14 47 Charles Muteru Wambugu \n",
+ "\n",
+ " Registration numbers Land location \n",
+ "0 Kisumu/Ojolla/418 Kisumu \n",
+ "1 Gilgil/Gilgil Block 1/500 district of Nakuru \n",
+ "2 Shawa/Rongai Block 3/56(Sachangwan) district of Nakuru \n",
+ "3 Dundori/Lanet Block 13/1 district of Nakuru \n",
+ "4 Gilgil/Cilgil Block 1/375 district of Nakuru "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " page number | \n",
+ " gazzete notice number | \n",
+ " name of the holder | \n",
+ " Registration numbers | \n",
+ " Land location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 43 | \n",
+ " Syprose Helida Odero | \n",
+ " Kisumu/Ojolla/418 | \n",
+ " Kisumu | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 44 | \n",
+ " David Ndungu Muchiiri | \n",
+ " Gilgil/Gilgil Block 1/500 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 45 | \n",
+ " Njuguna Ngamau | \n",
+ " Shawa/Rongai Block 3/56(Sachangwan) | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 46 | \n",
+ " Mary Njeri Nganga | \n",
+ " Dundori/Lanet Block 13/1 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4 | \n",
+ " 14 | \n",
+ " 47 | \n",
+ " Charles Muteru Wambugu | \n",
+ " Gilgil/Cilgil Block 1/375 | \n",
+ " district of Nakuru | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "train",
+ "summary": "{\n \"name\": \"train\",\n \"rows\": 649,\n \"fields\": [\n {\n \"column\": \"filename\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"2008_VOL4\",\n \"2016_VOL15\",\n \"2022_VOL109\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"page number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1526,\n \"min\": 14,\n \"max\": 5798,\n \"num_unique_values\": 91,\n \"samples\": [\n 3187,\n 4792,\n 3430\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gazzete notice number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3344,\n \"min\": 43,\n \"max\": 11029,\n \"num_unique_values\": 649,\n \"samples\": [\n 9714,\n 10986,\n 9672\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name of the holder\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 608,\n \"samples\": [\n \"Wilson Washington Otieno\",\n \"Peter N. Rabok\",\n \"Benard Gitau Wanyoike\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Registration numbers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 645,\n \"samples\": [\n \"Ndarugu/Gakoe/T. 214\",\n \"19278/I/MN, 58741/1\",\n \"Laikipia Nanyuki Marura Block 1/1108 (Ichuga)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Land location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 122,\n \"samples\": [\n \"district of Mombasa\",\n \"district of Kuria\",\n \"south of Ruiru in Kiambu District\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Let's create a unique id by concatenating the filename and gazette notice number\n",
+ "train['id'] = train['filename'] + '_' + train['gazzete notice number'].astype('str')\n",
+ "# Let's drop the filename and gazette notice number columns, since they are now represented by the id column, and the page number column, since it is only used for your own confirmation of the gazette details\n",
+ "train.drop(columns = ['filename', 'gazzete notice number', 'page number'], inplace = True)\n",
+ "train.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "nPG6KeFK-ECU",
+ "outputId": "74dc3961-441d-41e5-e10e-e145b85c34b9"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " name of the holder Registration numbers \\\n",
+ "0 Syprose Helida Odero Kisumu/Ojolla/418 \n",
+ "1 David Ndungu Muchiiri Gilgil/Gilgil Block 1/500 \n",
+ "2 Njuguna Ngamau Shawa/Rongai Block 3/56(Sachangwan) \n",
+ "3 Mary Njeri Nganga Dundori/Lanet Block 13/1 \n",
+ "4 Charles Muteru Wambugu Gilgil/Cilgil Block 1/375 \n",
+ "\n",
+ " Land location id \n",
+ "0 Kisumu 2008_VOL4_43 \n",
+ "1 district of Nakuru 2008_VOL4_44 \n",
+ "2 district of Nakuru 2008_VOL4_45 \n",
+ "3 district of Nakuru 2008_VOL4_46 \n",
+ "4 district of Nakuru 2008_VOL4_47 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name of the holder | \n",
+ " Registration numbers | \n",
+ " Land location | \n",
+ " id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Syprose Helida Odero | \n",
+ " Kisumu/Ojolla/418 | \n",
+ " Kisumu | \n",
+ " 2008_VOL4_43 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " David Ndungu Muchiiri | \n",
+ " Gilgil/Gilgil Block 1/500 | \n",
+ " district of Nakuru | \n",
+ " 2008_VOL4_44 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Njuguna Ngamau | \n",
+ " Shawa/Rongai Block 3/56(Sachangwan) | \n",
+ " district of Nakuru | \n",
+ " 2008_VOL4_45 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Mary Njeri Nganga | \n",
+ " Dundori/Lanet Block 13/1 | \n",
+ " district of Nakuru | \n",
+ " 2008_VOL4_46 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Charles Muteru Wambugu | \n",
+ " Gilgil/Cilgil Block 1/375 | \n",
+ " district of Nakuru | \n",
+ " 2008_VOL4_47 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "train",
+ "summary": "{\n \"name\": \"train\",\n \"rows\": 649,\n \"fields\": [\n {\n \"column\": \"name of the holder\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 608,\n \"samples\": [\n \"Wilson Washington Otieno\",\n \"Peter N. Rabok\",\n \"Benard Gitau Wanyoike\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Registration numbers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 645,\n \"samples\": [\n \"Ndarugu/Gakoe/T. 214\",\n \"19278/I/MN, 58741/1\",\n \"Laikipia Nanyuki Marura Block 1/1108 (Ichuga)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Land location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 122,\n \"samples\": [\n \"district of Mombasa\",\n \"district of Kuria\",\n \"south of Ruiru in Kiambu District\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 649,\n \"samples\": [\n \"2023_VOL168_9714\",\n \"2017_VOL167_10986\",\n \"2023_VOL168_9672\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Now we melt the train df\n",
+ "melted_train = pd.melt(train, id_vars=['id'], var_name = 'Variable', value_name= 'pred')\n",
+ "melted_train.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "vl33TtA9_3-H",
+ "outputId": "181c7c5c-0e5b-4387-f85c-533273d98093"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id Variable pred\n",
+ "0 2008_VOL4_43 name of the holder Syprose Helida Odero\n",
+ "1 2008_VOL4_44 name of the holder David Ndungu Muchiiri\n",
+ "2 2008_VOL4_45 name of the holder Njuguna Ngamau\n",
+ "3 2008_VOL4_46 name of the holder Mary Njeri Nganga\n",
+ "4 2008_VOL4_47 name of the holder Charles Muteru Wambugu"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " Variable | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4_43 | \n",
+ " name of the holder | \n",
+ " Syprose Helida Odero | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4_44 | \n",
+ " name of the holder | \n",
+ " David Ndungu Muchiiri | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4_45 | \n",
+ " name of the holder | \n",
+ " Njuguna Ngamau | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4_46 | \n",
+ " name of the holder | \n",
+ " Mary Njeri Nganga | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4_47 | \n",
+ " name of the holder | \n",
+ " Charles Muteru Wambugu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "melted_train",
+ "summary": "{\n \"name\": \"melted_train\",\n \"rows\": 1947,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 649,\n \"samples\": [\n \"2023_VOL168_9714\",\n \"2017_VOL167_10986\",\n \"2023_VOL168_9672\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Variable\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"name of the holder\",\n \"Registration numbers\",\n \"Land location\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1375,\n \"samples\": [\n \"Isaac Alinyo Okaka\",\n \"John Osit Ojina\",\n \"Mavoko Town Block 3/5627\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#now if we look at the Variable column and find the unique categories then we will find the three things we need to extract\n",
+ "melted_train.Variable.unique()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-GXUSHtrAnAe",
+ "outputId": "b9746859-fc2c-457b-9bc8-4f7c1bc3bd6d"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['name of the holder', 'Registration numbers', 'Land location'],\n",
+ " dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Great. For a fully unique id, we now concatenate the id with the Variable column, then drop the Variable column since it is now represented by the id column\n",
+ "melted_train['id'] = melted_train['id'] + '_' + melted_train['Variable']\n",
+ "melted_train.drop(columns =['Variable'], inplace = True)\n",
+ "melted_train.head()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "6wJ4w36pA03_",
+ "outputId": "5a0d66b0-9fd2-4c06-c08b-accb61e3cc8b"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id pred\n",
+ "0 2008_VOL4_43_name of the holder Syprose Helida Odero\n",
+ "1 2008_VOL4_44_name of the holder David Ndungu Muchiiri\n",
+ "2 2008_VOL4_45_name of the holder Njuguna Ngamau\n",
+ "3 2008_VOL4_46_name of the holder Mary Njeri Nganga\n",
+ "4 2008_VOL4_47_name of the holder Charles Muteru Wambugu"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4_43_name of the holder | \n",
+ " Syprose Helida Odero | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4_44_name of the holder | \n",
+ " David Ndungu Muchiiri | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4_45_name of the holder | \n",
+ " Njuguna Ngamau | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4_46_name of the holder | \n",
+ " Mary Njeri Nganga | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4_47_name of the holder | \n",
+ " Charles Muteru Wambugu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "melted_train",
+ "summary": "{\n \"name\": \"melted_train\",\n \"rows\": 1947,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1947,\n \"samples\": [\n \"2018_VOL113_9436_Land location\",\n \"2016_VOL15_1037_Land location\",\n \"2018_VOL113_9446_Registration numbers\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1375,\n \"samples\": [\n \"Isaac Alinyo Okaka\",\n \"John Osit Ojina\",\n \"Mavoko Town Block 3/5627\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Great, now we have a melted train that resembles the sample submission. Let's check the shapes of the original train and the melted one\n",
+ "train.shape, melted_train.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "D4Pzjaf-BGF1",
+ "outputId": "7d7b2db1-18f1-43a4-ac7b-641797482149"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "((649, 4), (1947, 2))"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\"\"\"great now you need to do the same for your test df after you have performed inference and you will get all the unique ids in the sample submission but now with the pred column with actual\n",
+ "predictions\n",
+ "The pred column shown below will then be filled in after inference\n",
+ "\"\"\"\n",
+ "sample_sub.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "-5g1lMoKBS3q",
+ "outputId": "23dce0fd-32dd-465a-c257-23e9c1a6c41e"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id pred\n",
+ "0 2023_VOL30_1430_name of the holder \n",
+ "1 2023_VOL30_1431_name of the holder \n",
+ "2 2023_VOL30_1432_name of the holder \n",
+ "3 2023_VOL30_1433_name of the holder \n",
+ "4 2023_VOL30_1434_name of the holder "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2023_VOL30_1430_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2023_VOL30_1431_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2023_VOL30_1432_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2023_VOL30_1433_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2023_VOL30_1434_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "sample_sub",
+ "summary": "{\n \"name\": \"sample_sub\",\n \"rows\": 1122,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1119,\n \"samples\": [\n \"2022_252_14701_name of the holder\",\n \"2023_VOL30_1531_name of the holder\",\n \"2022_252_14606_Land location\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Evaluation process\n",
+ "* The evaluation metric used is accuracy.\n",
+ "* Let me demonstrate it below using a custom accuracy score calculation method\n"
+ ],
+ "metadata": {
+ "id": "WR8bnBt0B9V2"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* The below function takes a name string as input and performs several preprocessing steps to ensure consistency in comparisons.\n",
+ "\n",
+ " * a. Lowercase Conversion: It converts the entire name to lowercase to remove any case sensitivity. This ensures that names like \"Mary\" and \"mary\" are treated as the same.\n",
+ "\n",
+ " * b. Whitespace Removal: It removes every whitespace character (spaces, tabs, newlines), not only repeated ones, so \"Syprose Helida Odero\" and \"SyproseHelidaOdero\" are treated as the same. This ensures that variations in spacing don't affect the comparison.\n",
+ "\n",
+ " * c. Special Character Removal: It removes any special characters or punctuation marks from the name, except the comma (kept for comma separated values) and the forward slash (so \"Evurore/Kathera/2061\" keeps its slashes). This ensures that variations in punctuation or special characters don't affect the comparison.\n",
+ "\n",
+ " * Example Usage: The function is applied to a list of names, and the resulting preprocessed names are stored in another list. These preprocessed names can then be used for comparison or evaluation purposes."
+ ],
+ "metadata": {
+ "id": "YGwIAcFfDlvl"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re\n",
+ "\n",
+ "def preprocess_name(name):\n",
+ "    \"\"\"Normalise a label or prediction so comparisons ignore case, whitespace and punctuation.\n",
+ "\n",
+ "    The value is converted with str(), lower-cased, and every whitespace character is\n",
+ "    removed (not just repeated spaces). After that only the characters a-z, 0-9, '/' and ','\n",
+ "    are kept, so comma separated values stay comma separated.\n",
+ "\n",
+ "    Missing values are stringified as well: None becomes 'none' and NaN becomes 'nan'.\n",
+ "    If str() itself fails, the input is returned unchanged.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        text = str(name)\n",
+ "    except Exception:\n",
+ "        # Best effort, as before: hand back the raw value rather than failing the whole column.\n",
+ "        # Catching Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate.\n",
+ "        return name\n",
+ "\n",
+ "    # Lower-case, then delete all whitespace\n",
+ "    text = re.sub(r'\\s+', '', text.lower())\n",
+ "\n",
+ "    # Drop everything except a-z, 0-9, '/' and ','. Keeping ',' in the class gives the same\n",
+ "    # result as splitting on ',', cleaning each part and re-joining.\n",
+ "    return re.sub(r'[^a-z0-9/,]', '', text)\n",
+ "\n",
+ "# Example usage: output should be a preprocessed output of the input\n",
+ "names = [\"Syprose Helida Odero\", \"syprose helida odero\", \"SyproseHelidaOdero\", \"Syprose Helida Odero\", \"Mary Wanjiku Ndungu,Duncan Wachira Wanjau\", \"Sosian/Sosian Block 2/2117 (Narock Ranch)\", \"Evurore/Kathera/2061\"]\n",
+ "preprocessed_names = [preprocess_name(name) for name in names]\n",
+ "print(preprocessed_names)\n"
+ ],
+ "metadata": {
+ "id": "sZu0XkTHCU37",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "b6aaaa42-29f8-4362-87ab-163184397e2d"
+ },
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "['syprosehelidaodero', 'syprosehelidaodero', 'syprosehelidaodero', 'syprosehelidaodero', 'marywanjikundungu,duncanwachirawanjau', 'sosian/sosianblock2/2117narockranch', 'evurore/kathera/2061']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Now we calculate the accuracy score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Example arrays of predictions and actual values\n",
+ "predictions = [\"Syprose Helida Odero\", \"syprose helida odero\", \"SyproseHelidaOdero\", \"Syprose Helida Odero\", \"Mary Wanjiku Ndungu,Duncan Wachira Wanjau\"]\n",
+ "actual_values = [\"Syprose Helida Odero\", \"SyproseHelidaOdero\", \"syprose helida odero\", \"Michael Kimani Macharia\", \"Mary Wanjiku Ndungu,Duncan Wachira Wanjau\"]\n",
+ "\n",
+ "# Preprocess predictions and actual values\n",
+ "preprocessed_predictions = [preprocess_name(name) for name in predictions]\n",
+ "preprocessed_actual_values = [preprocess_name(name) for name in actual_values]\n",
+ "\n",
+ "# Calculate accuracy score using preprocessed arrays\n",
+ "accuracy = accuracy_score(preprocessed_actual_values, preprocessed_predictions)\n",
+ "\n",
+ "print(\"Accuracy Score:\", accuracy)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YqiuRFOfD3Sn",
+ "outputId": "f10c1b4b-2a53-4625-f1ea-e364db26c124"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Accuracy Score: 0.8\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* We get an accuracy score of 0.8 even though some of the pairs differ in case, spacing, etc. Nice!\n",
+ "* It is not 1 because of the pair \"Syprose Helida Odero\" and \"Michael Kimani Macharia\", which are genuinely different names.\n",
+ "* Also, where a value is comma separated, make sure you record every item, e.g. the holders' names (there may be multiple people) and the registration numbers (both an LR NO and an IR NO may be mentioned)."
+ ],
+ "metadata": {
+ "id": "L974Z4WiGVxU"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### Key takeaways from this evaluation phase\n",
+ "* Make sure you use the custom preprocess function provided above to modify your preds before downloading your submission file\n",
+ "* The actual labels in the competition have been processed with that same function, so unprocessed predictions will not match them\n",
+ "* I will show you an example using melted train"
+ ],
+ "metadata": {
+ "id": "08owH6lMIWwu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "melted_train.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "55lolBv5EZZ2",
+ "outputId": "1c3f7dd3-dded-48fe-8c89-a9ee986939fc"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id pred\n",
+ "0 2008_VOL4_43_name of the holder Syprose Helida Odero\n",
+ "1 2008_VOL4_44_name of the holder David Ndungu Muchiiri\n",
+ "2 2008_VOL4_45_name of the holder Njuguna Ngamau\n",
+ "3 2008_VOL4_46_name of the holder Mary Njeri Nganga\n",
+ "4 2008_VOL4_47_name of the holder Charles Muteru Wambugu"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4_43_name of the holder | \n",
+ " Syprose Helida Odero | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4_44_name of the holder | \n",
+ " David Ndungu Muchiiri | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4_45_name of the holder | \n",
+ " Njuguna Ngamau | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4_46_name of the holder | \n",
+ " Mary Njeri Nganga | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4_47_name of the holder | \n",
+ " Charles Muteru Wambugu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "melted_train",
+ "summary": "{\n \"name\": \"melted_train\",\n \"rows\": 1947,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1947,\n \"samples\": [\n \"2018_VOL113_9436_Land location\",\n \"2016_VOL15_1037_Land location\",\n \"2018_VOL113_9446_Registration numbers\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1375,\n \"samples\": [\n \"Isaac Alinyo Okaka\",\n \"John Osit Ojina\",\n \"Mavoko Town Block 3/5627\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# The column shown above as 'pred' holds the ground truth, so rename it to 'actual' and then\n",
+ "# copy it into a new 'pred' column that stands in for model predictions.\n",
+ "# The guard keeps this cell safe to re-run: renaming again once 'actual' exists would\n",
+ "# leave two columns called 'actual'.\n",
+ "if 'actual' not in melted_train.columns:\n",
+ "    melted_train = melted_train.rename(columns={'pred': 'actual'})\n",
+ "melted_train['pred'] = melted_train['actual'].copy()\n",
+ "melted_train.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "ZZnPtjTAI2og",
+ "outputId": "507e1adb-d50b-43f9-c6c8-4e919add4cc0"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id actual \\\n",
+ "0 2008_VOL4_43_name of the holder Syprose Helida Odero \n",
+ "1 2008_VOL4_44_name of the holder David Ndungu Muchiiri \n",
+ "2 2008_VOL4_45_name of the holder Njuguna Ngamau \n",
+ "3 2008_VOL4_46_name of the holder Mary Njeri Nganga \n",
+ "4 2008_VOL4_47_name of the holder Charles Muteru Wambugu \n",
+ "\n",
+ " pred \n",
+ "0 Syprose Helida Odero \n",
+ "1 David Ndungu Muchiiri \n",
+ "2 Njuguna Ngamau \n",
+ "3 Mary Njeri Nganga \n",
+ "4 Charles Muteru Wambugu "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " actual | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4_43_name of the holder | \n",
+ " Syprose Helida Odero | \n",
+ " Syprose Helida Odero | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4_44_name of the holder | \n",
+ " David Ndungu Muchiiri | \n",
+ " David Ndungu Muchiiri | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4_45_name of the holder | \n",
+ " Njuguna Ngamau | \n",
+ " Njuguna Ngamau | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4_46_name of the holder | \n",
+ " Mary Njeri Nganga | \n",
+ " Mary Njeri Nganga | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4_47_name of the holder | \n",
+ " Charles Muteru Wambugu | \n",
+ " Charles Muteru Wambugu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "melted_train",
+ "summary": "{\n \"name\": \"melted_train\",\n \"rows\": 1947,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1947,\n \"samples\": [\n \"2018_VOL113_9436_Land location\",\n \"2016_VOL15_1037_Land location\",\n \"2018_VOL113_9446_Registration numbers\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"actual\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1375,\n \"samples\": [\n \"Isaac Alinyo Okaka\",\n \"John Osit Ojina\",\n \"Mavoko Town Block 3/5627\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1375,\n \"samples\": [\n \"Isaac Alinyo Okaka\",\n \"John Osit Ojina\",\n \"Mavoko Town Block 3/5627\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 'actual' and 'pred' are still raw text at this point; the leaderboard labels have already been\n",
+ "# normalised with preprocess_name, so run both columns through the same function.\n",
+ "for column in ('actual', 'pred'):\n",
+ "    melted_train[column] = melted_train[column].apply(preprocess_name)\n",
+ "\n",
+ "melted_train.head()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "_4JziMENJL0G",
+ "outputId": "1235228a-fd3f-42c0-aaf8-94620d7536a4"
+ },
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id actual pred\n",
+ "0 2008_VOL4_43_name of the holder syprosehelidaodero syprosehelidaodero\n",
+ "1 2008_VOL4_44_name of the holder davidndungumuchiiri davidndungumuchiiri\n",
+ "2 2008_VOL4_45_name of the holder njugunangamau njugunangamau\n",
+ "3 2008_VOL4_46_name of the holder marynjeringanga marynjeringanga\n",
+ "4 2008_VOL4_47_name of the holder charlesmuteruwambugu charlesmuteruwambugu"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " actual | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2008_VOL4_43_name of the holder | \n",
+ " syprosehelidaodero | \n",
+ " syprosehelidaodero | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2008_VOL4_44_name of the holder | \n",
+ " davidndungumuchiiri | \n",
+ " davidndungumuchiiri | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2008_VOL4_45_name of the holder | \n",
+ " njugunangamau | \n",
+ " njugunangamau | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2008_VOL4_46_name of the holder | \n",
+ " marynjeringanga | \n",
+ " marynjeringanga | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2008_VOL4_47_name of the holder | \n",
+ " charlesmuteruwambugu | \n",
+ " charlesmuteruwambugu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "melted_train",
+ "summary": "{\n \"name\": \"melted_train\",\n \"rows\": 1947,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1947,\n \"samples\": [\n \"2018_VOL113_9436_Land location\",\n \"2016_VOL15_1037_Land location\",\n \"2018_VOL113_9446_Registration numbers\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"actual\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1357,\n \"samples\": [\n \"janenjerimwaura\",\n \"nyaki/thuura/2233\",\n \"northeastofkisumumunicipalityincentralnyanzadistrict\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1357,\n \"samples\": [\n \"janenjerimwaura\",\n \"nyaki/thuura/2233\",\n \"northeastofkisumumunicipalityincentralnyanzadistrict\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 22
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Nice! Notice how consistently they have been formatted?\n",
+ "Calculating the accuracy is pretty straightforward now."
+ ],
+ "metadata": {
+ "id": "niXZNHoQKI6O"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "accuracy_score(melted_train['actual'], melted_train['pred'])"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "z62FH0xJJ7Aq",
+ "outputId": "9efca35a-c5f0-4a88-bf47-200885fd9715"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "1.0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* We get an accuracy score of 100%, but that's only because we copied the preds from the actual values. If your AI model is good enough you can achieve that too\n",
+ "* You won't have the actual values, though; you will only have the pred column in your submission file\n",
+ "* Your submission file should have exactly the same format (columns) as the sample submission\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "NlbFHEhrKoKf"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sample_sub.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "-QXfMxd1KaqD",
+ "outputId": "1a919b21-d011-451b-dcc7-29daa4939f48"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id pred\n",
+ "0 2023_VOL30_1430_name of the holder \n",
+ "1 2023_VOL30_1431_name of the holder \n",
+ "2 2023_VOL30_1432_name of the holder \n",
+ "3 2023_VOL30_1433_name of the holder \n",
+ "4 2023_VOL30_1434_name of the holder "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2023_VOL30_1430_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2023_VOL30_1431_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2023_VOL30_1432_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2023_VOL30_1433_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2023_VOL30_1434_name of the holder | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "sample_sub",
+ "summary": "{\n \"name\": \"sample_sub\",\n \"rows\": 1122,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1119,\n \"samples\": [\n \"2022_252_14701_name of the holder\",\n \"2023_VOL30_1531_name of the holder\",\n \"2022_252_14606_Land location\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \" \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* So you will apply the preprocessing function to the pred column after performing inference then download the file\n"
+ ],
+ "metadata": {
+ "id": "Vwpox2eNLLn4"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from google.colab import files\n",
+ "# Write and download through a single path so the two calls cannot point at different files\n",
+ "# (the download previously assumed the working directory was /content).\n",
+ "submission_path = 'baseline.csv'\n",
+ "sample_sub['pred'] = sample_sub['pred'].apply(preprocess_name)\n",
+ "sample_sub.to_csv(submission_path, index=False)\n",
+ "files.download(submission_path)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 17
+ },
+ "id": "SKdgXa7iLJ9d",
+ "outputId": "d321899e-deea-485a-cbd5-c93dc8911cb6"
+ },
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "\n",
+ " async function download(id, filename, size) {\n",
+ " if (!google.colab.kernel.accessAllowed) {\n",
+ " return;\n",
+ " }\n",
+ " const div = document.createElement('div');\n",
+ " const label = document.createElement('label');\n",
+ " label.textContent = `Downloading \"${filename}\": `;\n",
+ " div.appendChild(label);\n",
+ " const progress = document.createElement('progress');\n",
+ " progress.max = size;\n",
+ " div.appendChild(progress);\n",
+ " document.body.appendChild(div);\n",
+ "\n",
+ " const buffers = [];\n",
+ " let downloaded = 0;\n",
+ "\n",
+ " const channel = await google.colab.kernel.comms.open(id);\n",
+ " // Send a message to notify the kernel that we're ready.\n",
+ " channel.send({})\n",
+ "\n",
+ " for await (const message of channel.messages) {\n",
+ " // Send a message to notify the kernel that we're ready.\n",
+ " channel.send({})\n",
+ " if (message.buffers) {\n",
+ " for (const buffer of message.buffers) {\n",
+ " buffers.push(buffer);\n",
+ " downloaded += buffer.byteLength;\n",
+ " progress.value = downloaded;\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " const blob = new Blob(buffers, {type: 'application/binary'});\n",
+ " const a = document.createElement('a');\n",
+ " a.href = window.URL.createObjectURL(blob);\n",
+ " a.download = filename;\n",
+ " div.appendChild(a);\n",
+ " a.click();\n",
+ " div.remove();\n",
+ " }\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "download(\"download_e72192be-8fdc-42be-a39d-a840cccc2949\", \"baseline.csv\", 38918)"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "* Then take the downloaded csv file and submit it in the competitions tab on Hugging Face\n"
+ ],
+ "metadata": {
+ "id": "g8k5hXBSLwrv"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "YrwjstkwuTyA"
+ },
+ "execution_count": 25,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file