{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bce5cb53",
   "metadata": {},
   "source": [
    "### Example for running PII detection and anonymization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b66801c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "from pii_detection import scan_pii_batch\n",
    "from pii_redaction import redact_pii_batch, random_replacements\n",
    "\n",
    "# NOTE(review): a \"doesn't exist on the Hub\" FileNotFoundError here may mean the dataset is private or gated\n",
    "# rather than missing -- confirm you have access (e.g. `huggingface-cli login`) before running.\n",
    "ds = load_dataset(\"bigcode/pii-for-code\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76c9e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_pii = ds.map(scan_pii_batch, batched=True, batch_size=100, num_proc=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06d15f83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset after PII detection:\n",
      "Dataset({\n",
      "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets'],\n",
      "    num_rows: 400\n",
      "})\n",
      "Number of samples that contained PII: 211\n",
      "Total number of secrets found: 336\n"
     ]
    }
   ],
   "source": [
    "print(f\"Dataset after PII detection:\\n{ds_pii}\")\n",
    "print(f\"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}\")\n",
    "print(f\"Total number of secrets found: {sum(ds_pii['number_secrets'])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b54c5044",
   "metadata": {},
   "source": [
    "#### About the detection and anonymization:\n",
    "* we detect secret keys with detect-secrets and mask them with keys from these 4 randomly generated sequences -they can change in each execution on a new dataset-: \n",
    "        ```\n",
    "        ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
    "        'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
    "        'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
    "        '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']\n",
    "        ```\n",
    "        \n",
    "* we detect email addresses and mask them with one of these 4 emails (first part was randomly generated) -they can change in each execution on a new dataset-:\n",
    "        ```\n",
    "        ['mynbi@email.com',\n",
    "        'qpmzj@email.com',\n",
    "        'plsgq@email.com',\n",
    "        'ejeyd@email.com']\n",
    "        ```\n",
    "\n",
    "* we detect IP addresses (and DNS servers) and mask them with the random private addresses below (they are fixed). Note that private IP addresses aren't masked (we use `ipaddress` python library to determine if they are private or not):\n",
    "```\n",
    "{'IPv4': ['172.16.31.10',\n",
    "        '172.16.58.3',\n",
    "        '192.168.127.12',\n",
    "        '192.168.3.11'],\n",
    "'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
    "                'fc00:e968:6179::de52:7100',\n",
    "                'fc00:db20:35b:7399::5',\n",
    "                'fdf8:f53e:61e4::18']},\n",
    "```\n",
    "\n",
    "Remarks:\n",
    "* If the same secret appears multiple times in a file, we use the same replacement each time.\n",
    "* To solve issue with dns servers being versions, we only detect an address in format x.x.x.x where x is one digit, if the words \"dns\" or \"sever\" appear in the near context."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "68669831",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'EMAIL': ['mynbi@email.com',\n",
      "           'qpmzj@email.com',\n",
      "           'plsgq@email.com',\n",
      "           'ejeyd@email.com'],\n",
      " 'IP_ADDRESS': {'IPv4': ['172.16.31.10',\n",
      "                         '172.16.58.3',\n",
      "                         '192.168.127.12',\n",
      "                         '192.168.3.11'],\n",
      "                'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
      "                         'fc00:e968:6179::de52:7100',\n",
      "                         'fc00:db20:35b:7399::5',\n",
      "                         'fdf8:f53e:61e4::18']},\n",
      " 'KEY': ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
      "         'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
      "         'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
      "         '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']}\n"
     ]
    }
   ],
   "source": [
    "# redaction\n",
    "import random\n",
    "from pprint import pprint\n",
    "random.seed(0)\n",
    "\n",
    "replacements = random_replacements()\n",
    "pprint(replacements)\n",
    "ds_redacted = ds_pii.map(lambda x: redact_pii_batch(x, replacements), batched=True, batch_size=100, num_proc=12, load_from_cache_file=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e060ed7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets', 'new_content', 'redaction_refs'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_redacted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "294a9083",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "for e in ds_redacted:\n",
    "    secrets = json.loads(e[\"secrets\"])\n",
    "    if len(secrets) >= 3:\n",
    "        print(e[\"id\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "259f9759",
   "metadata": {},
   "source": [
    "example 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5a37524",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[16][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04f7e74d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[16][\"content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "470bf3aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"New text:\")\n",
    "print(ds_redacted[16][\"new_content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "897b7ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same character window (1190:1500) as the \"Old text\" / \"New text\" cells for example 16, for side-by-side comparison.\n",
    "print(\"New text with delimiters (for visualization in a space):\")\n",
    "print(ds_redacted[16][\"redaction_refs\"][1190:1500])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39e051da",
   "metadata": {},
   "source": [
    "example 27"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c129f763",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[27][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35977e2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "# we don't replace private Ips like 0.0.0.0\n",
    "print(ds_redacted[27][\"content\"][150:250])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "print(ds_redacted[27][\"new_content\"][150:250])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d081f2ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[27][\"content\"][270:670])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "print(ds_redacted[27][\"new_content\"][270:470])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0661335f",
   "metadata": {},
   "source": [
    "example 49"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f332863",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[49][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e2248f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[49][\"content\"][30:70])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "print(ds_redacted[49][\"new_content\"][30:70])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}