{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bce5cb53",
   "metadata": {},
   "source": [
    "### Example for running PII detection and anonymization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b66801c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "from pprint import pprint\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "from pii_detection import scan_pii_batch\n",
    "from pii_redaction import redact_pii_batch, random_replacements\n",
    "\n",
    "# Settings shared by the two `map` calls below (detection and redaction).\n",
    "BATCH_SIZE = 100\n",
    "NUM_PROC = 12\n",
    "\n",
    "# NOTE(review): a previous run of this cell failed with\n",
    "# \"FileNotFoundError: Dataset 'bigcode/pii-for-code' doesn't exist on the Hub\".\n",
    "# The dataset may be private or gated -- confirm access before running.\n",
    "ds = load_dataset(\"bigcode/pii-for-code\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76c9e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_pii = ds.map(scan_pii_batch, batched=True, batch_size=BATCH_SIZE, num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06d15f83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset after PII detection:\n",
      "Dataset({\n",
      "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets'],\n",
      "    num_rows: 400\n",
      "})\n",
      "Number of samples that contained PII: 211\n",
      "Total number of secrets found: 336\n"
     ]
    }
   ],
   "source": [
    "print(f\"Dataset after PII detection:\\n{ds_pii}\")\n",
    "print(f\"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}\")\n",
    "print(f\"Total number of secrets found: {sum(ds_pii['number_secrets'])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b54c5044",
   "metadata": {},
   "source": [
    "#### About the detection and anonymization:\n",
    "* We detect secret keys with `detect-secrets` and mask them with keys from these 4 randomly generated sequences (they can change in each execution on a new dataset):\n",
    "  ```\n",
    "  ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
    "   'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
    "   'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
    "   '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']\n",
    "  ```\n",
    "\n",
    "* We detect email addresses and mask them with one of these 4 emails (the first part was randomly generated; they can change in each execution on a new dataset):\n",
    "  ```\n",
    "  ['mynbi@email.com',\n",
    "   'qpmzj@email.com',\n",
    "   'plsgq@email.com',\n",
    "   'ejeyd@email.com']\n",
    "  ```\n",
    "\n",
    "* We detect IP addresses (and DNS servers) and mask them with the random private addresses below (they are fixed). Note that private IP addresses aren't masked (we use the `ipaddress` Python library to determine whether an address is private):\n",
    "  ```\n",
    "  {'IPv4': ['172.16.31.10',\n",
    "            '172.16.58.3',\n",
    "            '192.168.127.12',\n",
    "            '192.168.3.11'],\n",
    "   'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
    "            'fc00:e968:6179::de52:7100',\n",
    "            'fc00:db20:35b:7399::5',\n",
    "            'fdf8:f53e:61e4::18']}\n",
    "  ```\n",
    "\n",
    "Remarks:\n",
    "* If the same secret appears multiple times in a file, we use the same replacement each time.\n",
    "* To avoid mistaking version numbers for DNS server addresses, we only detect an address in the format x.x.x.x, where each x is a single digit, if the word \"dns\" or \"server\" appears in the nearby context."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "68669831",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'EMAIL': ['mynbi@email.com',\n",
      "           'qpmzj@email.com',\n",
      "           'plsgq@email.com',\n",
      "           'ejeyd@email.com'],\n",
      " 'IP_ADDRESS': {'IPv4': ['172.16.31.10',\n",
      "                         '172.16.58.3',\n",
      "                         '192.168.127.12',\n",
      "                         '192.168.3.11'],\n",
      "                'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
      "                         'fc00:e968:6179::de52:7100',\n",
      "                         'fc00:db20:35b:7399::5',\n",
      "                         'fdf8:f53e:61e4::18']},\n",
      " 'KEY': ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
      "         'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
      "         'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
      "         '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']}\n"
     ]
    }
   ],
   "source": [
    "# redaction\n",
    "# Seed Python's RNG right before the replacement values are generated.\n",
    "random.seed(0)\n",
    "\n",
    "replacements = random_replacements()\n",
    "pprint(replacements)\n",
    "ds_redacted = ds_pii.map(\n",
    "    lambda x: redact_pii_batch(x, replacements),\n",
    "    batched=True,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    num_proc=NUM_PROC,\n",
    "    load_from_cache_file=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e060ed7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets', 'new_content', 'redaction_refs'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_redacted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "294a9083",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the ids of the samples whose `secrets` JSON holds 3 or more entries.\n",
    "for sample in ds_redacted:\n",
    "    secrets = json.loads(sample[\"secrets\"])\n",
    "    if len(secrets) >= 3:\n",
    "        print(sample[\"id\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "259f9759",
   "metadata": {},
   "source": [
    "#### Example 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5a37524",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[16][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04f7e74d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[16][\"content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "470bf3aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"New text:\")\n",
    "print(ds_redacted[16][\"new_content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "897b7ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"New text with delimiters (for visualization in a space):\")\n",
    "print(ds_redacted[16][\"redaction_refs\"][1190:1500])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39e051da",
   "metadata": {},
   "source": [
    "#### Example 27"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c129f763",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[27][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35977e2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "# we don't replace private IPs like 0.0.0.0\n",
    "print(ds_redacted[27][\"content\"][150:250])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "print(ds_redacted[27][\"new_content\"][150:250])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d081f2ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[27][\"content\"][270:670])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "# NOTE(review): the old text is sliced [270:670] but the new text [270:470] -- confirm the shorter window is intended\n",
    "print(ds_redacted[27][\"new_content\"][270:470])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0661335f",
   "metadata": {},
   "source": [
    "#### Example 49"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f332863",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[49][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e2248f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[49][\"content\"][30:70])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "print(ds_redacted[49][\"new_content\"][30:70])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}