{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import re\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "from presidio_anonymizer.entities import (RecognizerResult,\n", " OperatorResult,\n", " OperatorConfig)\n", "from privacy.service.service import PrivacyService" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "text = \"John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com.\"\n", "\n", "# Define regular expressions for different types of PII\n", "ssn_pattern = r\"\\d{3}-\\d{2}-\\d{4}\"\n", "email_pattern = r\"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b\"\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Find matches for Social Security numbers\n", "ssn_matches = re.findall(ssn_pattern, text)\n", "\n", "# Find matches for email addresses\n", "email_matches = re.findall(email_pattern, text)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Apply differential privacy to the detected PII counts\n", "epsilon = 0.1 " ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def add_noise(value):\n", " scale = 1 / epsilon\n", " laplace_noise = np.random.laplace(loc=0, scale=scale)\n", " print(value)\n", " print(laplace_noise)\n", " return value + laplace_noise" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "123-45-6789\n", "-6.338424074647873\n" ] }, { "ename": "TypeError", "evalue": "can only concatenate str (not \"float\") to str", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m ssn_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(ssn_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the Social Security numbers\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_ssn_matches \u001b[39m=\u001b[39m [add_noise(ssn) \u001b[39mfor\u001b[39;00m ssn \u001b[39min\u001b[39;00m ssn_matches]\n", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m ssn_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(ssn_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the Social Security numbers\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_ssn_matches \u001b[39m=\u001b[39m [add_noise(ssn) \u001b[39mfor\u001b[39;00m ssn \u001b[39min\u001b[39;00m ssn_matches]\n", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mprint\u001b[39m(value)\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(laplace_noise)\n\u001b[1;32m----> 6\u001b[0m \u001b[39mreturn\u001b[39;00m value \u001b[39m+\u001b[39;49m laplace_noise\n", "\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str" ] } ], "source": [ "\n", "# Find matches for Social Security numbers\n", "ssn_matches = re.findall(ssn_pattern, text)\n", "\n", "# Add differential privacy to the Social Security numbers\n", "noisy_ssn_matches = [add_noise(ssn) for ssn in ssn_matches]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "johndoe@example.com\n", "21.357718997606124\n" ] }, { "ename": "TypeError", "evalue": "can only concatenate str (not \"float\") to str", 
"output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m email_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(email_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the email addresses\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_email_matches \u001b[39m=\u001b[39m [add_noise(email) \u001b[39mfor\u001b[39;00m email \u001b[39min\u001b[39;00m email_matches]\n", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m email_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(email_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the email addresses\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_email_matches \u001b[39m=\u001b[39m [add_noise(email) \u001b[39mfor\u001b[39;00m email \u001b[39min\u001b[39;00m email_matches]\n", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mprint\u001b[39m(value)\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(laplace_noise)\n\u001b[1;32m----> 6\u001b[0m \u001b[39mreturn\u001b[39;00m value \u001b[39m+\u001b[39;49m laplace_noise\n", "\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str" ] } ], "source": [ "# Find matches for email addresses\n", "email_matches = re.findall(email_pattern, text)\n", "\n", "# Add differential privacy to the email addresses\n", "noisy_email_matches = [add_noise(email) for email in email_matches]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Noisy SSN count: 1.5109753118487679\n", "Anonymized SSN count: text: 1.5109753118487679\n", "items:\n", "[\n", " \n", "]\n", "\n", "Noisy email count: 1.5109753118487679\n", "Anonymized email count: text: 1.5109753118487679\n", "items:\n", "[\n", " \n", "]\n", "\n" ] } ], "source": [ "import numpy as np\n", "from presidio_anonymizer import AnonymizerEngine\n", "\n", "# Initialize the anonymizer engine\n", "anonymizer = AnonymizerEngine()\n", "\n", "# Define the text containing potential PII\n", "text = \"John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com.\"\n", "\n", "# Apply differential privacy to the PII detection process\n", "epsilon = 0.1 # Privacy parameter for differential privacy\n", "sensitivity = 1 # Sensitivity of the PII detection result\n", "delta = 1e-6 # Privacy parameter for differential privacy\n", "\n", "# Calculate the noise to be added\n", "scale = sensitivity / epsilon\n", "laplace_noise = np.random.laplace(loc=0, scale=scale)\n", "\n", "# Detect PII in the text\n", "# Example rule-based matching for SSN and email\n", "ssn_pattern = r\"\\d{3}-\\d{2}-\\d{4}\"\n", "email_pattern = r\"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b\"\n", "\n", "# Apply noise to the PII detection results\n", "noisy_ssn_count = len(re.findall(ssn_pattern, text)) + laplace_noise\n", "noisy_email_count = len(re.findall(email_pattern, text)) + laplace_noise\n", "\n", "# Anonymize the PII detection results using Presidio\n", "anonymized_ssn_count = anonymizer.anonymize(\n", " str(noisy_ssn_count),\n", " analyzer_results=[],\n", " operators={\"anonymizer_config\": {\"type\": \"replace\", \"value\": \"\"}},\n", ")\n", "\n", "anonymized_email_count = anonymizer.anonymize(\n", " 
str(noisy_email_count),\n", " analyzer_results=[],\n", " operators={\"anonymizer_config\": {\"type\": \"replace\", \"value\": \"\"}},\n", ")\n", "\n", "# Print the anonymized PII detection results\n", "print(\"Noisy SSN count:\", noisy_ssn_count)\n", "print(\"Anonymized SSN count:\", anonymized_ssn_count)\n", "\n", "print(\"Noisy email count:\", noisy_email_count)\n", "print(\"Anonymized email count:\", anonymized_email_count)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "text: Px(Z~fm‚#[n\u001eXndl.kZ^@j{lplsX(`gx\n", "items:\n", "[\n", " \n", "]\n", "\n" ] } ], "source": [ "import numpy as np\n", "from presidio_anonymizer import AnonymizerEngine\n", "\n", "# Initialize the anonymizer engine\n", "anonymizer = AnonymizerEngine()\n", "\n", "# Define the text containing PII\n", "text = \"My email is john.doe@example.com\"\n", "\n", "# Apply differential privacy to the PII value\n", "epsilon = 0.1 # Privacy parameter for differential privacy\n", "\n", "# Generate Laplace noise for each character in the email\n", "laplace_noise = np.random.laplace(loc=0, scale=1/epsilon, size=len(text))\n", "\n", "# Add the noise to each character in the email\n", "noisy_email = ''.join(chr(ord(c) + int(round(n))) for c, n in zip(text, laplace_noise))\n", "\n", "# Anonymize the noisy email using Presidio\n", "anonymized_text = anonymizer.anonymize(\n", " noisy_email,\n", " analyzer_results=[],\n", " operators=\n", " {\"Email\": {\"type\": \"replace\", \"value\": \"\"}}\n", " ,\n", ")\n", "\n", "# Print the anonymized text\n", "print(anonymized_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "========= []\n", "========= []\n", "John Doe'S Social Security Number Is 123-45-6789 And His Email Is 
def run():
    """Draw a single Laplace noise sample.

    The distribution is centred on 0 with scale ``sensitivity / epsilon``;
    both names are read from the notebook's global namespace at call time.
    Only the noise is returned - it is not added to any value here.
    """
    return np.random.laplace(loc=0, scale=sensitivity / epsilon)
import numpy as np
import pandas as pd

# Bin edges and their labels are kept side by side so they cannot drift apart.
# The earlier edges [0, 30, 40, 50] labelled every age from 1 to 30 as "20-30".
AGE_BINS = [20, 30, 40, 50]
AGE_LABELS = ['20-30', '30-40', '40-50']


def generalize_age(ages):
    """Map exact ages to the coarse ranges in ``AGE_LABELS``.

    Bins are closed on the right and the first bin also includes its lower
    edge, so 20..30 -> '20-30', 31..40 -> '30-40', 41..50 -> '40-50'.
    Ages outside 20..50 become NaN instead of receiving a wrong label.

    Parameters
    ----------
    ages : array-like of numbers
        Exact ages, e.g. a DataFrame column.

    Returns
    -------
    The categorical result of ``pandas.cut`` for ``ages``.
    """
    return pd.cut(ages, bins=AGE_BINS, labels=AGE_LABELS, include_lowest=True)


# Sample dataset with PII
data = pd.DataFrame({
    'Name': ['John Doe', 'Jane Smith', 'Alice Johnson'],
    'Age': [25, 30, 50],
    "s":[1,2,3],
    'Email': ['john.doe@example.com', 'jane.smith@example.com', 'alice.johnson@example.com']
})
print(data)

# Generalization
# Generalize age into age ranges
data['Age1'] = generalize_age(data['Age'])

# Suppression
# Suppress or remove email column
data = data.drop('Email', axis=1)

# Perturbation
# Laplace noise sized to the table is sampled here but intentionally not
# applied to any column (the application step was left disabled).
epsilon = 1.0  # Privacy parameter for differential privacy
sensitivity = 1  # Sensitivity of the age values
scale = sensitivity / epsilon
laplace_noise = np.random.laplace(loc=0, scale=scale, size=len(data))

print(data)
object\n", "========= []\n", "0 John Doe\n", "1 Jane Smith\n", "2 Alice Johnson\n", "Name: Name, Dtype: Object\n", "[type: PERSON, start: 10, end: 18, score: 0.85, type: PERSON, start: 27, end: 37, score: 0.85, type: PERSON, start: 43, end: 56, score: 0.85]\n" ] } ], "source": [ "from presidio_analyzer import AnalyzerEngine, RecognizerRegistry\n", "\n", "registry = RecognizerRegistry()\n", "analyzer = AnalyzerEngine(registry=registry)\n", "registry.load_predefined_recognizers()\n", "\n", "print(str(data[\"Name\"]))\n", "results = analyzer.analyze(text=str(data[\"Name\"]), language=\"en\")\n", "print(results)\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 John Doe\n", "1 Jane Smith\n", "2 Alice Johnson\n", "0 0 John Doe\\n1 Jane Smith\\n2 A...\n", "dtype: object\n" ] } ], "source": [ "s=data[\"Name\"].to_string()\n", "print(s)\n", "p=pd.Series(s)\n", "print(p)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "========= []\n" ] }, { "ename": "ValueError", "evalue": "[E1041] Expected a string, Doc, or bytes as input, but got: ", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 18\u001b[0m line \u001b[0;36m1\n\u001b[0;32m 5\u001b[0m anonymizer \u001b[39m=\u001b[39m AnonymizerEngine()\n\u001b[0;32m 6\u001b[0m dataset \u001b[39m=\u001b[39m [\n\u001b[0;32m 7\u001b[0m {\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mJohn Doe\u001b[39m\u001b[39m'\u001b[39m\u001b[39ms email is john.doe@example.com and his phone number is 555-123-4567.\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 
8\u001b[0m {\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mAlice Smith\u001b[39m\u001b[39m'\u001b[39m\u001b[39ms social security number is 123-45-6789.\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 9\u001b[0m ]\n\u001b[1;32m---> 10\u001b[0m analyzed_dataset \u001b[39m=\u001b[39m analyzer\u001b[39m.\u001b[39;49manalyze(dataset,language\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39men\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[0;32m 11\u001b[0m masked_dataset \u001b[39m=\u001b[39m anonymizer\u001b[39m.\u001b[39manonymize(analyzed_dataset, dataset)\n\u001b[0;32m 12\u001b[0m \u001b[39mfor\u001b[39;00m item \u001b[39min\u001b[39;00m masked_dataset:\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\presidio_analyzer\\analyzer_engine.py:189\u001b[0m, in \u001b[0;36mAnalyzerEngine.analyze\u001b[1;34m(self, text, language, entities, correlation_id, score_threshold, return_decision_process, ad_hoc_recognizers, context, allow_list, nlp_artifacts)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[39m# run the nlp pipeline over the given text, store the results in\u001b[39;00m\n\u001b[0;32m 187\u001b[0m \u001b[39m# a NlpArtifacts instance\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m nlp_artifacts:\n\u001b[1;32m--> 189\u001b[0m nlp_artifacts \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnlp_engine\u001b[39m.\u001b[39;49mprocess_text(text, language)\n\u001b[0;32m 191\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlog_decision_process:\n\u001b[0;32m 192\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mapp_tracer\u001b[39m.\u001b[39mtrace(\n\u001b[0;32m 193\u001b[0m correlation_id, \u001b[39m\"\u001b[39m\u001b[39mnlp artifacts:\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m+\u001b[39m nlp_artifacts\u001b[39m.\u001b[39mto_json()\n\u001b[0;32m 
194\u001b[0m )\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\presidio_analyzer\\nlp_engine\\client_nlp_engine.py:57\u001b[0m, in \u001b[0;36mClientNlpEngine.process_text\u001b[1;34m(self, text, language)\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mprocess_text\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m, language: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m NlpArtifacts:\n\u001b[0;32m 55\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Execute the SpaCy NLP pipeline on the given text and language.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 57\u001b[0m doc \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnlp[language](text)\n\u001b[0;32m 58\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_doc_to_nlp_artifact(doc, language)\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\spacy\\language.py:1007\u001b[0m, in \u001b[0;36mLanguage.__call__\u001b[1;34m(self, text, disable, component_cfg)\u001b[0m\n\u001b[0;32m 986\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\n\u001b[0;32m 987\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m 988\u001b[0m text: Union[\u001b[39mstr\u001b[39m, Doc],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 991\u001b[0m component_cfg: Optional[Dict[\u001b[39mstr\u001b[39m, Dict[\u001b[39mstr\u001b[39m, Any]]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m 992\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Doc:\n\u001b[0;32m 993\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Apply the pipeline to some text. The text can span multiple sentences,\u001b[39;00m\n\u001b[0;32m 994\u001b[0m \u001b[39m and can contain arbitrary whitespace. 
Alignment into the original string\u001b[39;00m\n\u001b[0;32m 995\u001b[0m \u001b[39m is preserved.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1005\u001b[0m \u001b[39m DOCS: https://spacy.io/api/language#call\u001b[39;00m\n\u001b[0;32m 1006\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1007\u001b[0m doc \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_ensure_doc(text)\n\u001b[0;32m 1008\u001b[0m \u001b[39mif\u001b[39;00m component_cfg \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 1009\u001b[0m component_cfg \u001b[39m=\u001b[39m {}\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\spacy\\language.py:1101\u001b[0m, in \u001b[0;36mLanguage._ensure_doc\u001b[1;34m(self, doc_like)\u001b[0m\n\u001b[0;32m 1099\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(doc_like, \u001b[39mbytes\u001b[39m):\n\u001b[0;32m 1100\u001b[0m \u001b[39mreturn\u001b[39;00m Doc(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab)\u001b[39m.\u001b[39mfrom_bytes(doc_like)\n\u001b[1;32m-> 1101\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(Errors\u001b[39m.\u001b[39mE1041\u001b[39m.\u001b[39mformat(\u001b[39mtype\u001b[39m\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(doc_like)))\n", "\u001b[1;31mValueError\u001b[0m: [E1041] Expected a string, Doc, or bytes as input, but got: " ] } ], "source": [ "from presidio_analyzer import AnalyzerEngine\n", "from presidio_anonymizer import AnonymizerEngine\n", "\n", "analyzer = AnalyzerEngine()\n", "anonymizer = AnonymizerEngine()\n", "dataset = [\n", " {\"text\": \"John Doe's email is john.doe@example.com and his phone number is 555-123-4567.\"},\n", " {\"text\": \"Alice Smith's social security number is 123-45-6789.\"},\n", "]\n", "analyzed_dataset = analyzer.analyze(dataset,language='en')\n", "masked_dataset = anonymizer.anonymize(analyzed_dataset, dataset)\n", "for item in 
class A:
    """Small namespace of callables that are looked up by name via ``getattr``.

    The methods take no ``self``; marking them ``@staticmethod`` keeps
    ``A.x()`` / ``getattr(A, "x")()`` working and additionally makes them
    callable on an instance (``A().x()``), which previously raised
    ``TypeError`` because the instance was passed as an unexpected argument.
    """

    @staticmethod
    def x():
        """Print ``x`` and return the string ``"x"``."""
        print("x")
        return "x"

    @staticmethod
    def y():
        """Return the string ``"y"``."""
        return "y"
eta -:--:--\n", " ------------------------------------- 174.1/176.0 kB 5.1 MB/s eta 0:00:01\n", " ------------------------------------- 174.1/176.0 kB 5.1 MB/s eta 0:00:01\n", " -------------------------------------- 176.0/176.0 kB 1.8 MB/s eta 0:00:00\n", "Requirement already satisfied: scikit-learn>=0.24.2 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.3.2)\n", "Requirement already satisfied: joblib>=0.16.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.3.2)\n", "Requirement already satisfied: numpy>=1.21.6 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.26.2)\n", "Requirement already satisfied: scipy>=1.7.3 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.11.4)\n", "Requirement already satisfied: setuptools>=49.0.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (65.5.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from scikit-learn>=0.24.2->diffprivlib) (3.2.0)\n", "Installing collected packages: diffprivlib\n", "Successfully installed diffprivlib-0.6.3\n" ] } ], "source": [ "!pip install diffprivlib" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from diffprivlib.mechanisms import binary\n", "import pandas as pd\n", "df=pd.read_csv(r\"C:\\WORK\\GIT\\responsible-ai-admin\\responsible-ai-admin\\src\\rai_admin\\temp\\emplist.csv\")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n", "0 EID_22713 F 
# Collect every column of df that holds exactly two distinct values.
binaryList = [column for column in df.columns if len(df[column].unique()) == 2]
print(binaryList)
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 26\u001b[0m line \u001b[0;36m2\n\u001b[0;32m 1\u001b[0m mechanism \u001b[39m=\u001b[39m binary\u001b[39m.\u001b[39mBinary(epsilon\u001b[39m=\u001b[39m\u001b[39m1.0\u001b[39m,value0\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mF\u001b[39m\u001b[39m\"\u001b[39m,value1\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mM\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m df[\u001b[39m\"\u001b[39m\u001b[39mGender\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m mechanism\u001b[39m.\u001b[39;49mrandomise(df[\u001b[39m\"\u001b[39;49m\u001b[39mGender\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\diffprivlib\\mechanisms\\binary.py:110\u001b[0m, in \u001b[0;36mBinary.randomise\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mrandomise\u001b[39m(\u001b[39mself\u001b[39m, value):\n\u001b[0;32m 97\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Randomise `value` with the mechanism.\u001b[39;00m\n\u001b[0;32m 98\u001b[0m \n\u001b[0;32m 99\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 108\u001b[0m \n\u001b[0;32m 109\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 110\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_check_all(value)\n\u001b[0;32m 112\u001b[0m indicator \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mif\u001b[39;00m value \u001b[39m==\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0 \u001b[39melse\u001b[39;00m \u001b[39m1\u001b[39m\n\u001b[0;32m 114\u001b[0m unif_rv \u001b[39m=\u001b[39m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_rng\u001b[39m.\u001b[39mrandom() \u001b[39m*\u001b[39m (np\u001b[39m.\u001b[39mexp(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mepsilon) \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m)\n", "File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\diffprivlib\\mechanisms\\binary.py:80\u001b[0m, in \u001b[0;36mBinary._check_all\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m 77\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_labels(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1)\n\u001b[0;32m 79\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, \u001b[39mstr\u001b[39m):\n\u001b[1;32m---> 80\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mValue to be randomised must be a string\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 82\u001b[0m \u001b[39mif\u001b[39;00m value \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1]:\n\u001b[0;32m 83\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mValue to be randomised is not in the domain \u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0\u001b[39m}\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m\\\"\u001b[39;00m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1\u001b[39m}\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 84\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mgot 
def binaryCheck(df, col, epsilon=1.0, mechanism=None):
    """Randomise a two-valued column of ``df`` in place, one cell at a time.

    Each value of ``df[col]`` is passed through ``mechanism.randomise`` and
    the column is overwritten with the results.  The mechanism works on single
    string values; handing it the whole Series raises
    "TypeError: Value to be randomised must be a string", hence the per-value
    loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.  It is mutated; nothing is returned.
    col : str
        Name of the column to randomise.
    epsilon : float, default 1.0
        Privacy budget given to ``binary.Binary`` when no mechanism is passed.
        The default is the value that used to be hard-coded.
    mechanism : object with a ``randomise(value)`` method, optional
        Pre-built mechanism to use.  When omitted, a ``binary.Binary`` is
        created from the two distinct values found in the column.

    Raises
    ------
    ValueError
        If no mechanism is given and the column does not hold exactly two
        distinct values.
    """
    if df[col].empty:
        # Nothing to randomise, and no labels to build a mechanism from.
        return

    if mechanism is None:
        labels = list(df[col].unique())
        if len(labels) != 2:
            raise ValueError(
                f"column {col!r} must hold exactly two distinct values, "
                f"found {len(labels)}"
            )
        mechanism = binary.Binary(epsilon=epsilon, value0=labels[0], value1=labels[1])

    # Rebuild the column positionally.  The previous df.loc[0..n-1, col] access
    # used positions as labels and therefore only worked on a default RangeIndex.
    df[col] = [mechanism.randomise(item) for item in df[col]]
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Employee_IDGenderAgeEducation_LevelRelationship_StatusHometownUnitDecision_skill_possessTime_of_serviceTime_since_promotiongrowth_rateTravel_RatePost_LevelPay_ScaleCompensation_and_BenefitsWork_Life_balance
0EID_22713F325SingleSpringfieldR&DConceptual7430154type21
1EID_9658M652SingleLebanonITDirective41272111type21
2EID_22203M523MarriedSpringfieldSalesDirective21325018type31
3EID_7652M505SingleWashingtonMarketingAnalytical11428112type04
4EID_6516F443MarriedFranklinR&DConceptual12447132type24
5EID_20283F224MarriedFranklinITBehavioral3153036type21
6EID_21014M423MarriedWashingtonPurchasingAnalytical6435134type21
7EID_7693F412MarriedSpringfieldSalesConceptual4435148type21
8EID_13232M311SingleSpringfieldITAnalytical7373238type23
\n", "
" ], "text/plain": [ " Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n", "0 EID_22713 F 32 5 Single Springfield \n", "1 EID_9658 M 65 2 Single Lebanon \n", "2 EID_22203 M 52 3 Married Springfield \n", "3 EID_7652 M 50 5 Single Washington \n", "4 EID_6516 F 44 3 Married Franklin \n", "5 EID_20283 F 22 4 Married Franklin \n", "6 EID_21014 M 42 3 Married Washington \n", "7 EID_7693 F 41 2 Married Springfield \n", "8 EID_13232 M 31 1 Single Springfield \n", "\n", " Unit Decision_skill_possess Time_of_service Time_since_promotion \\\n", "0 R&D Conceptual 7 4 \n", "1 IT Directive 41 2 \n", "2 Sales Directive 21 3 \n", "3 Marketing Analytical 11 4 \n", "4 R&D Conceptual 12 4 \n", "5 IT Behavioral 3 1 \n", "6 Purchasing Analytical 6 4 \n", "7 Sales Conceptual 4 4 \n", "8 IT Analytical 7 3 \n", "\n", " growth_rate Travel_Rate Post_Level Pay_Scale Compensation_and_Benefits \\\n", "0 30 1 5 4 type2 \n", "1 72 1 1 1 type2 \n", "2 25 0 1 8 type3 \n", "3 28 1 1 2 type0 \n", "4 47 1 3 2 type2 \n", "5 53 0 3 6 type2 \n", "6 35 1 3 4 type2 \n", "7 35 1 4 8 type2 \n", "8 73 2 3 8 type2 \n", "\n", " Work_Life_balance \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 4 \n", "4 4 \n", "5 1 \n", "6 1 \n", "7 1 \n", "8 3 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "minv=df.Age.min()\n", "maxv=df.Age.max()\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "70 20\n" ] } ], "source": [ "import math\n", "\n", "# Width of one bin; both range limits are snapped outwards to a multiple of it.\n", "base=10\n", "maxrange=math.ceil(maxv / base) * base\n", "# Floor (not round) so the lowest edge never lands above the smallest value, e.g. 26 -> 20 rather than 30.\n", "minrange=math.floor(minv / base) * base\n", "\n", "print(maxrange,minrange)\n", "diff=maxrange-minrange\n", "\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "50\n", "4\n", "['20.0-30.0', '30.0-40.0', 
'40.0-50.0', '50.0-60.0', '60.0-70']\n", "[20.0, 30.0, 40.0, 50.0, 60.0, 70]\n" ] } ], "source": [ "# Build equal-width bin edges (binlist) and matching 'start-end' labels (lablelist) between minrange and maxrange.\n", "range_magnitude = abs(maxrange - minrange)\n", "# Determine the number of ranges based on the magnitude\n", "num_ranges = max(range_magnitude // 10, 1) # Assuming a minimum range size of 10\n", "\n", "# Calculate the interval\n", "interval = range_magnitude / num_ranges\n", "\n", "ranges = []\n", "binlist=set()\n", "lablelist=[]\n", "\n", "for i in range(num_ranges):\n", "    start = minrange + i * interval\n", "    end = minrange + (i + 1) * interval\n", "    if(i==num_ranges-1):\n", "        # Pin the last edge to maxrange; float() keeps its label in the same 'x.0' form as every other edge.\n", "        end=float(maxrange)\n", "    binlist.add(start)\n", "    binlist.add(end)\n", "    lablelist.append(f\"{start}-{end}\")\n", "# Adjacent ranges share an edge, so the set de-duplicates them before sorting.\n", "binlist=sorted(list(binlist))\n", "print(lablelist)\n", "print(binlist)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['20.0-30.0', '30.0-40.0', '40.0-50.0', '50.0-60.0', '60.0-70.0']\n", "[20.0, 30.0, 40.0, 50.0, 60.0, 70.0]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Employee_IDGenderAgeEducation_LevelRelationship_StatusHometownUnitDecision_skill_possessTime_of_serviceTime_since_promotiongrowth_rateTravel_RatePost_LevelPay_ScaleCompensation_and_BenefitsWork_Life_balanceAge1
0EID_22713F325SingleSpringfieldR&DConceptual7430154type2130.0-40.0
1EID_9658M652SingleLebanonITDirective41272111type2160.0-70.0
2EID_22203M523MarriedSpringfieldSalesDirective21325018type3150.0-60.0
3EID_7652M505SingleWashingtonMarketingAnalytical11428112type0440.0-50.0
4EID_6516F443MarriedFranklinR&DConceptual12447132type2440.0-50.0
5EID_20283F224MarriedFranklinITBehavioral3153036type2120.0-30.0
6EID_21014M423MarriedWashingtonPurchasingAnalytical6435134type2140.0-50.0
7EID_7693F412MarriedSpringfieldSalesConceptual4435148type2140.0-50.0
8EID_13232M311SingleSpringfieldITAnalytical7373238type2330.0-40.0
\n", "
" ], "text/plain": [ " Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n", "0 EID_22713 F 32 5 Single Springfield \n", "1 EID_9658 M 65 2 Single Lebanon \n", "2 EID_22203 M 52 3 Married Springfield \n", "3 EID_7652 M 50 5 Single Washington \n", "4 EID_6516 F 44 3 Married Franklin \n", "5 EID_20283 F 22 4 Married Franklin \n", "6 EID_21014 M 42 3 Married Washington \n", "7 EID_7693 F 41 2 Married Springfield \n", "8 EID_13232 M 31 1 Single Springfield \n", "\n", " Unit Decision_skill_possess Time_of_service Time_since_promotion \\\n", "0 R&D Conceptual 7 4 \n", "1 IT Directive 41 2 \n", "2 Sales Directive 21 3 \n", "3 Marketing Analytical 11 4 \n", "4 R&D Conceptual 12 4 \n", "5 IT Behavioral 3 1 \n", "6 Purchasing Analytical 6 4 \n", "7 Sales Conceptual 4 4 \n", "8 IT Analytical 7 3 \n", "\n", " growth_rate Travel_Rate Post_Level Pay_Scale Compensation_and_Benefits \\\n", "0 30 1 5 4 type2 \n", "1 72 1 1 1 type2 \n", "2 25 0 1 8 type3 \n", "3 28 1 1 2 type0 \n", "4 47 1 3 2 type2 \n", "5 53 0 3 6 type2 \n", "6 35 1 3 4 type2 \n", "7 35 1 4 8 type2 \n", "8 73 2 3 8 type2 \n", "\n", " Work_Life_balance Age1 \n", "0 1 30.0-40.0 \n", "1 1 60.0-70.0 \n", "2 1 50.0-60.0 \n", "3 4 40.0-50.0 \n", "4 4 40.0-50.0 \n", "5 1 20.0-30.0 \n", "6 1 40.0-50.0 \n", "7 1 40.0-50.0 \n", "8 3 30.0-40.0 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bins are right-closed; include_lowest=True keeps a value equal to the first edge in the first bin instead of NaN.\n", "df['Age1'] = pd.cut(df['Age'], bins=binlist, labels=lablelist, include_lowest=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "'numpy.int64' object is not callable", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 35\u001b[0m line 
\u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m \u001b[39mmax\u001b[39;49m(\u001b[39m1\u001b[39;49m,\u001b[39m2\u001b[39;49m)\n", "\u001b[1;31mTypeError\u001b[0m: 'numpy.int64' object is not callable" ] } ], "source": [ "max(1,2)" ] } ], "metadata": { "kernelspec": { "display_name": "myenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }