{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import re\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"from presidio_anonymizer.entities import (RecognizerResult,\n",
" OperatorResult,\n",
" OperatorConfig)\n",
"from privacy.service.service import PrivacyService"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"text = \"John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com.\"\n",
"\n",
"# Define regular expressions for different types of PII\n",
"ssn_pattern = r\"\\d{3}-\\d{2}-\\d{4}\"\n",
"email_pattern = r\"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b\"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Find matches for Social Security numbers\n",
"ssn_matches = re.findall(ssn_pattern, text)\n",
"\n",
"# Find matches for email addresses\n",
"email_matches = re.findall(email_pattern, text)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Apply differential privacy to the detected PII counts\n",
"epsilon = 0.1 "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def add_noise(value):\n",
" scale = 1 / epsilon\n",
" laplace_noise = np.random.laplace(loc=0, scale=scale)\n",
" print(value)\n",
" print(laplace_noise)\n",
" return value + laplace_noise"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"123-45-6789\n",
"-6.338424074647873\n"
]
},
{
"ename": "TypeError",
"evalue": "can only concatenate str (not \"float\") to str",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m ssn_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(ssn_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the Social Security numbers\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_ssn_matches \u001b[39m=\u001b[39m [add_noise(ssn) \u001b[39mfor\u001b[39;00m ssn \u001b[39min\u001b[39;00m ssn_matches]\n",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m ssn_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(ssn_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the Social Security numbers\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_ssn_matches \u001b[39m=\u001b[39m [add_noise(ssn) \u001b[39mfor\u001b[39;00m ssn \u001b[39min\u001b[39;00m ssn_matches]\n",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 6\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mprint\u001b[39m(value)\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(laplace_noise)\n\u001b[1;32m----> 6\u001b[0m \u001b[39mreturn\u001b[39;00m value \u001b[39m+\u001b[39;49m laplace_noise\n",
"\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str"
]
}
],
"source": [
"\n",
"# Find matches for Social Security numbers\n",
"ssn_matches = re.findall(ssn_pattern, text)\n",
"\n",
"# Add differential privacy to the Social Security numbers\n",
"noisy_ssn_matches = [add_noise(ssn) for ssn in ssn_matches]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"johndoe@example.com\n",
"21.357718997606124\n"
]
},
{
"ename": "TypeError",
"evalue": "can only concatenate str (not \"float\") to str",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m email_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(email_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the email addresses\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_email_matches \u001b[39m=\u001b[39m [add_noise(email) \u001b[39mfor\u001b[39;00m email \u001b[39min\u001b[39;00m email_matches]\n",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m5\n\u001b[0;32m 2\u001b[0m email_matches \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39mfindall(email_pattern, text)\n\u001b[0;32m 4\u001b[0m \u001b[39m# Add differential privacy to the email addresses\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m noisy_email_matches \u001b[39m=\u001b[39m [add_noise(email) \u001b[39mfor\u001b[39;00m email \u001b[39min\u001b[39;00m email_matches]\n",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 7\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mprint\u001b[39m(value)\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(laplace_noise)\n\u001b[1;32m----> 6\u001b[0m \u001b[39mreturn\u001b[39;00m value \u001b[39m+\u001b[39;49m laplace_noise\n",
"\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str"
]
}
],
"source": [
"# Find matches for email addresses\n",
"email_matches = re.findall(email_pattern, text)\n",
"\n",
"# Add differential privacy to the email addresses\n",
"noisy_email_matches = [add_noise(email) for email in email_matches]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Noisy SSN count: 1.5109753118487679\n",
"Anonymized SSN count: text: 1.5109753118487679\n",
"items:\n",
"[\n",
" \n",
"]\n",
"\n",
"Noisy email count: 1.5109753118487679\n",
"Anonymized email count: text: 1.5109753118487679\n",
"items:\n",
"[\n",
" \n",
"]\n",
"\n"
]
}
],
"source": [
"import numpy as np\n",
"from presidio_anonymizer import AnonymizerEngine\n",
"\n",
"# Initialize the anonymizer engine\n",
"anonymizer = AnonymizerEngine()\n",
"\n",
"# Define the text containing potential PII\n",
"text = \"John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com.\"\n",
"\n",
"# Apply differential privacy to the PII detection process\n",
"epsilon = 0.1 # Privacy parameter for differential privacy\n",
"sensitivity = 1 # Sensitivity of the PII detection result\n",
"delta = 1e-6 # Privacy parameter for differential privacy\n",
"\n",
"# Calculate the noise to be added\n",
"scale = sensitivity / epsilon\n",
"laplace_noise = np.random.laplace(loc=0, scale=scale)\n",
"\n",
"# Detect PII in the text\n",
"# Example rule-based matching for SSN and email\n",
"ssn_pattern = r\"\\d{3}-\\d{2}-\\d{4}\"\n",
"email_pattern = r\"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b\"\n",
"\n",
"# Apply noise to the PII detection results\n",
"noisy_ssn_count = len(re.findall(ssn_pattern, text)) + laplace_noise\n",
"noisy_email_count = len(re.findall(email_pattern, text)) + laplace_noise\n",
"\n",
"# Anonymize the PII detection results using Presidio\n",
"anonymized_ssn_count = anonymizer.anonymize(\n",
" str(noisy_ssn_count),\n",
" analyzer_results=[],\n",
" operators={\"anonymizer_config\": {\"type\": \"replace\", \"value\": \"\"}},\n",
")\n",
"\n",
"anonymized_email_count = anonymizer.anonymize(\n",
" str(noisy_email_count),\n",
" analyzer_results=[],\n",
" operators={\"anonymizer_config\": {\"type\": \"replace\", \"value\": \"\"}},\n",
")\n",
"\n",
"# Print the anonymized PII detection results\n",
"print(\"Noisy SSN count:\", noisy_ssn_count)\n",
"print(\"Anonymized SSN count:\", anonymized_ssn_count)\n",
"\n",
"print(\"Noisy email count:\", noisy_email_count)\n",
"print(\"Anonymized email count:\", anonymized_email_count)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text: Px(Z~fm‚#[n\u001eXndl.kZ^@j{lplsX(`gx\n",
"items:\n",
"[\n",
" \n",
"]\n",
"\n"
]
}
],
"source": [
"import numpy as np\n",
"from presidio_anonymizer import AnonymizerEngine\n",
"\n",
"# Initialize the anonymizer engine\n",
"anonymizer = AnonymizerEngine()\n",
"\n",
"# Define the text containing PII\n",
"text = \"My email is john.doe@example.com\"\n",
"\n",
"# Apply differential privacy to the PII value\n",
"epsilon = 0.1 # Privacy parameter for differential privacy\n",
"\n",
"# Generate Laplace noise for each character in the email\n",
"laplace_noise = np.random.laplace(loc=0, scale=1/epsilon, size=len(text))\n",
"\n",
"# Add the noise to each character in the email\n",
"noisy_email = ''.join(chr(ord(c) + int(round(n))) for c, n in zip(text, laplace_noise))\n",
"\n",
"# Anonymize the noisy email using Presidio\n",
"anonymized_text = anonymizer.anonymize(\n",
" noisy_email,\n",
" analyzer_results=[],\n",
" operators=\n",
" {\"Email\": {\"type\": \"replace\", \"value\": \"\"}}\n",
" ,\n",
")\n",
"\n",
"# Print the anonymized text\n",
"print(anonymized_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"========= []\n",
"========= []\n",
"John Doe'S Social Security Number Is 123-45-6789 And His Email Is Johndoe@Example.Com.\n",
"[type: EMAIL_ADDRESS, start: 66, end: 85, score: 1.0, type: PERSON, start: 0, end: 10, score: 0.85, type: URL, start: 74, end: 85, score: 0.5]\n",
"type: EMAIL_ADDRESS, start: 66, end: 85, score: 1.0\n",
"type: PERSON, start: 0, end: 10, score: 0.85\n",
"type: URL, start: 74, end: 85, score: 0.5\n",
"text: -6.311321244615104 Social Security number is 123-45-6789 and his email is -11.671955800130334.\n",
"items:\n",
"[\n",
" {'start': 74, 'end': 93, 'entity_type': 'EMAIL_ADDRESS', 'text': '-11.671955800130334', 'operator': 'replace'},\n",
" {'start': 0, 'end': 18, 'entity_type': 'PERSON', 'text': '-6.311321244615104', 'operator': 'replace'}\n",
"]\n",
"\n"
]
}
],
"source": [
"import numpy as np\n",
"from presidio_analyzer import AnalyzerEngine, RecognizerRegistry\n",
"from presidio_anonymizer import AnonymizerEngine\n",
"\n",
"# Initialize the anonymizer engine\n",
"anonymizer = AnonymizerEngine()\n",
"\n",
"# Define the text containing PII\n",
"text = \"John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com.\"\n",
"\n",
"# Apply differential privacy to the PII value\n",
"epsilon = 0.1 # Privacy parameter for differential privacy\n",
"sensitivity = 2 # Sensitivity of the PII value\n",
"delta = 1e-6 # Privacy parameter for differential privacy\n",
"\n",
"# Calculate the noise to be added\n",
"def run():\n",
" scale = sensitivity / epsilon\n",
" laplace_noise = np.random.laplace(loc=0, scale=scale)\n",
"\n",
"# Add the noise to the PII value\n",
" noisy_value =laplace_noise\n",
" return noisy_value\n",
"# results = PrivacyService.__analyze(text=text)\n",
"# Anonymize the noisy value using Presidio\n",
"\n",
"registry = RecognizerRegistry()\n",
"analyzer = AnalyzerEngine(registry=registry)\n",
"registry.load_predefined_recognizers()\n",
"\n",
"results = analyzer.analyze(text=text, language=\"en\")\n",
" \n",
"print(results)\n",
"op={}\n",
"for i in results:\n",
" print(i)\n",
" op[i.entity_type]=OperatorConfig(\"replace\", {\"new_value\": str(run())})\n",
"anonymized_text = anonymizer.anonymize(\n",
" text,\n",
" analyzer_results=results,\n",
" operators=op\n",
" \n",
"\n",
" ,\n",
")\n",
"\n",
"# Print the anonymized text\n",
"print(anonymized_text)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Name Age s Email\n",
"0 John Doe 25 1 john.doe@example.com\n",
"1 Jane Smith 30 2 jane.smith@example.com\n",
"2 Alice Johnson 50 3 alice.johnson@example.com\n",
"['20-30', '30-40', '40-50']\n",
"[0, 30, 40, 50]\n",
" Name Age s Age1\n",
"0 John Doe 25 1 20-30\n",
"1 Jane Smith 30 2 20-30\n",
"2 Alice Johnson 50 3 40-50\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"\n",
"# Sample dataset with PII\n",
"data = pd.DataFrame({\n",
" 'Name': ['John Doe', 'Jane Smith', 'Alice Johnson'],\n",
" 'Age': [25, 30, 50],\n",
" \"s\":[1,2,3],\n",
" 'Email': ['john.doe@example.com', 'jane.smith@example.com', 'alice.johnson@example.com']\n",
"})\n",
"print(data)\n",
"\n",
"# Generalization\n",
"# Generalize age into age ranges\n",
"data['Age1'] = pd.cut(data['Age'], bins=[0, 30, 40,50], labels=['20-30', '30-40','40-50'])\n",
"\n",
"# Suppression\n",
"# Suppress or remove email column\n",
"data = data.drop('Email', axis=1)\n",
"\n",
"# Perturbation\n",
"# Perturb age values by adding Laplace noise\n",
"epsilon = 1.0 # Privacy parameter for differential privacy\n",
"sensitivity = 1 # Sensitivity of the age values\n",
"scale = sensitivity / epsilon\n",
"laplace_noise = np.random.laplace(loc=0, scale=scale, size=len(data))\n",
"# data['Age','s'] += laplace_noise\n",
"\n",
"# print(data['Age'])\n",
"\n",
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"========= []\n",
"0 John Doe\n",
"1 Jane Smith\n",
"2 Alice Johnson\n",
"Name: Name, dtype: object\n",
"========= []\n",
"0 John Doe\n",
"1 Jane Smith\n",
"2 Alice Johnson\n",
"Name: Name, Dtype: Object\n",
"[type: PERSON, start: 10, end: 18, score: 0.85, type: PERSON, start: 27, end: 37, score: 0.85, type: PERSON, start: 43, end: 56, score: 0.85]\n"
]
}
],
"source": [
"from presidio_analyzer import AnalyzerEngine, RecognizerRegistry\n",
"\n",
"registry = RecognizerRegistry()\n",
"analyzer = AnalyzerEngine(registry=registry)\n",
"registry.load_predefined_recognizers()\n",
"\n",
"print(str(data[\"Name\"]))\n",
"results = analyzer.analyze(text=str(data[\"Name\"]), language=\"en\")\n",
"print(results)\n"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 John Doe\n",
"1 Jane Smith\n",
"2 Alice Johnson\n",
"0 0 John Doe\\n1 Jane Smith\\n2 A...\n",
"dtype: object\n"
]
}
],
"source": [
"s=data[\"Name\"].to_string()\n",
"print(s)\n",
"p=pd.Series(s)\n",
"print(p)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"========= []\n"
]
},
{
"ename": "ValueError",
"evalue": "[E1041] Expected a string, Doc, or bytes as input, but got: ",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 18\u001b[0m line \u001b[0;36m1\n\u001b[0;32m 5\u001b[0m anonymizer \u001b[39m=\u001b[39m AnonymizerEngine()\n\u001b[0;32m 6\u001b[0m dataset \u001b[39m=\u001b[39m [\n\u001b[0;32m 7\u001b[0m {\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mJohn Doe\u001b[39m\u001b[39m'\u001b[39m\u001b[39ms email is john.doe@example.com and his phone number is 555-123-4567.\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 8\u001b[0m {\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mAlice Smith\u001b[39m\u001b[39m'\u001b[39m\u001b[39ms social security number is 123-45-6789.\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 9\u001b[0m ]\n\u001b[1;32m---> 10\u001b[0m analyzed_dataset \u001b[39m=\u001b[39m analyzer\u001b[39m.\u001b[39;49manalyze(dataset,language\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39men\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[0;32m 11\u001b[0m masked_dataset \u001b[39m=\u001b[39m anonymizer\u001b[39m.\u001b[39manonymize(analyzed_dataset, dataset)\n\u001b[0;32m 12\u001b[0m \u001b[39mfor\u001b[39;00m item \u001b[39min\u001b[39;00m masked_dataset:\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\presidio_analyzer\\analyzer_engine.py:189\u001b[0m, in \u001b[0;36mAnalyzerEngine.analyze\u001b[1;34m(self, text, language, entities, correlation_id, score_threshold, return_decision_process, ad_hoc_recognizers, context, allow_list, nlp_artifacts)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[39m# run the nlp pipeline over the given text, store the results in\u001b[39;00m\n\u001b[0;32m 187\u001b[0m \u001b[39m# a NlpArtifacts instance\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m nlp_artifacts:\n\u001b[1;32m--> 189\u001b[0m nlp_artifacts \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnlp_engine\u001b[39m.\u001b[39;49mprocess_text(text, language)\n\u001b[0;32m 191\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlog_decision_process:\n\u001b[0;32m 192\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mapp_tracer\u001b[39m.\u001b[39mtrace(\n\u001b[0;32m 193\u001b[0m correlation_id, \u001b[39m\"\u001b[39m\u001b[39mnlp artifacts:\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m+\u001b[39m nlp_artifacts\u001b[39m.\u001b[39mto_json()\n\u001b[0;32m 194\u001b[0m )\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\presidio_analyzer\\nlp_engine\\client_nlp_engine.py:57\u001b[0m, in \u001b[0;36mClientNlpEngine.process_text\u001b[1;34m(self, text, language)\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mprocess_text\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m, language: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m NlpArtifacts:\n\u001b[0;32m 55\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Execute the SpaCy NLP pipeline on the given text and language.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 57\u001b[0m doc \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnlp[language](text)\n\u001b[0;32m 58\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_doc_to_nlp_artifact(doc, language)\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\spacy\\language.py:1007\u001b[0m, in \u001b[0;36mLanguage.__call__\u001b[1;34m(self, text, disable, component_cfg)\u001b[0m\n\u001b[0;32m 986\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\n\u001b[0;32m 987\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m 988\u001b[0m text: Union[\u001b[39mstr\u001b[39m, Doc],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 991\u001b[0m component_cfg: Optional[Dict[\u001b[39mstr\u001b[39m, Dict[\u001b[39mstr\u001b[39m, Any]]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m 992\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Doc:\n\u001b[0;32m 993\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Apply the pipeline to some text. The text can span multiple sentences,\u001b[39;00m\n\u001b[0;32m 994\u001b[0m \u001b[39m and can contain arbitrary whitespace. Alignment into the original string\u001b[39;00m\n\u001b[0;32m 995\u001b[0m \u001b[39m is preserved.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1005\u001b[0m \u001b[39m DOCS: https://spacy.io/api/language#call\u001b[39;00m\n\u001b[0;32m 1006\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1007\u001b[0m doc \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_ensure_doc(text)\n\u001b[0;32m 1008\u001b[0m \u001b[39mif\u001b[39;00m component_cfg \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 1009\u001b[0m component_cfg \u001b[39m=\u001b[39m {}\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\spacy\\language.py:1101\u001b[0m, in \u001b[0;36mLanguage._ensure_doc\u001b[1;34m(self, doc_like)\u001b[0m\n\u001b[0;32m 1099\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(doc_like, \u001b[39mbytes\u001b[39m):\n\u001b[0;32m 1100\u001b[0m \u001b[39mreturn\u001b[39;00m Doc(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab)\u001b[39m.\u001b[39mfrom_bytes(doc_like)\n\u001b[1;32m-> 1101\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(Errors\u001b[39m.\u001b[39mE1041\u001b[39m.\u001b[39mformat(\u001b[39mtype\u001b[39m\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(doc_like)))\n",
"\u001b[1;31mValueError\u001b[0m: [E1041] Expected a string, Doc, or bytes as input, but got: "
]
}
],
"source": [
"from presidio_analyzer import AnalyzerEngine\n",
"from presidio_anonymizer import AnonymizerEngine\n",
"\n",
"analyzer = AnalyzerEngine()\n",
"anonymizer = AnonymizerEngine()\n",
"dataset = [\n",
" {\"text\": \"John Doe's email is john.doe@example.com and his phone number is 555-123-4567.\"},\n",
" {\"text\": \"Alice Smith's social security number is 123-45-6789.\"},\n",
"]\n",
"analyzed_dataset = analyzer.analyze(dataset,language='en')\n",
"masked_dataset = anonymizer.anonymize(analyzed_dataset, dataset)\n",
"for item in masked_dataset:\n",
" print(item[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"off\n",
"hashyfy\n",
"diffrential_pryivacy"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x\n",
"x\n"
]
}
],
"source": [
"class A:\n",
" def x():\n",
" print(\"x\")\n",
" return \"x\"\n",
"\n",
" def y():\n",
" return \"y\"\n",
"\n",
"# def fun(s):\n",
" \n",
"# print(s())\n",
" \n",
"# fun(\"x\")\n",
"s=getattr(A,\"x\")\n",
"# s=globals()[\"x\"]\n",
"print(s())\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/simple, https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/simple"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 23.0.1 -> 23.3.1\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Collecting diffprivlib\n",
" Downloading https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/packages/packages/a9/10/200015b77240c50f6f438e2b9e54a7179fdbf56f6ca9f40a11d90fd2c8f9/diffprivlib-0.6.3-py3-none-any.whl (176 kB)\n",
" ---------------------------------------- 0.0/176.0 kB ? eta -:--:--\n",
" ------------------------------------- 174.1/176.0 kB 5.1 MB/s eta 0:00:01\n",
" ------------------------------------- 174.1/176.0 kB 5.1 MB/s eta 0:00:01\n",
" -------------------------------------- 176.0/176.0 kB 1.8 MB/s eta 0:00:00\n",
"Requirement already satisfied: scikit-learn>=0.24.2 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.3.2)\n",
"Requirement already satisfied: joblib>=0.16.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.3.2)\n",
"Requirement already satisfied: numpy>=1.21.6 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.26.2)\n",
"Requirement already satisfied: scipy>=1.7.3 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (1.11.4)\n",
"Requirement already satisfied: setuptools>=49.0.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from diffprivlib) (65.5.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\work\\git\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages (from scikit-learn>=0.24.2->diffprivlib) (3.2.0)\n",
"Installing collected packages: diffprivlib\n",
"Successfully installed diffprivlib-0.6.3\n"
]
}
],
"source": [
"!pip install diffprivlib"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from diffprivlib.mechanisms import binary\n",
"import pandas as pd\n",
"df=pd.read_csv(r\"C:\\WORK\\GIT\\responsible-ai-admin\\responsible-ai-admin\\src\\rai_admin\\temp\\emplist.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n",
"0 EID_22713 F 32 5 Single Springfield \n",
"1 EID_9658 M 65 2 Single Lebanon \n",
"2 EID_22203 M 52 3 Married Springfield \n",
"3 EID_7652 M 50 5 Single Washington \n",
"4 EID_6516 F 44 3 Married Franklin \n",
"5 EID_20283 F 22 4 Married Franklin \n",
"6 EID_21014 M 42 3 Married Washington \n",
"7 EID_7693 F 41 2 Married Springfield \n",
"8 EID_13232 M 31 1 Single Springfield \n",
"\n",
" Unit Decision_skill_possess Time_of_service Time_since_promotion \\\n",
"0 R&D Conceptual 7 4 \n",
"1 IT Directive 41 2 \n",
"2 Sales Directive 21 3 \n",
"3 Marketing Analytical 11 4 \n",
"4 R&D Conceptual 12 4 \n",
"5 IT Behavioral 3 1 \n",
"6 Purchasing Analytical 6 4 \n",
"7 Sales Conceptual 4 4 \n",
"8 IT Analytical 7 3 \n",
"\n",
" growth_rate Travel_Rate Post_Level Pay_Scale Compensation_and_Benefits \\\n",
"0 30 1 5 4 type2 \n",
"1 72 1 1 1 type2 \n",
"2 25 0 1 8 type3 \n",
"3 28 1 1 2 type0 \n",
"4 47 1 3 2 type2 \n",
"5 53 0 3 6 type2 \n",
"6 35 1 3 4 type2 \n",
"7 35 1 4 8 type2 \n",
"8 73 2 3 8 type2 \n",
"\n",
" Work_Life_balance \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 4 \n",
"4 4 \n",
"5 1 \n",
"6 1 \n",
"7 1 \n",
"8 3 \n"
]
}
],
"source": [
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'M'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b=binary.Binary(epsilon=0.1,value0=\"F\",value1=\"M\",random_state=None)\n",
"b.randomise(\"F\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Gender', 'Relationship_Status']\n"
]
}
],
"source": [
"binaryList=[]\n",
"for c in df.columns:\n",
" # print(s)\n",
" if(len(df[c].unique())==2):\n",
" binaryList.append(c)\n",
"print(binaryList)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "Value to be randomised must be a string",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 26\u001b[0m line \u001b[0;36m2\n\u001b[0;32m 1\u001b[0m mechanism \u001b[39m=\u001b[39m binary\u001b[39m.\u001b[39mBinary(epsilon\u001b[39m=\u001b[39m\u001b[39m1.0\u001b[39m,value0\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mF\u001b[39m\u001b[39m\"\u001b[39m,value1\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mM\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m df[\u001b[39m\"\u001b[39m\u001b[39mGender\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m mechanism\u001b[39m.\u001b[39;49mrandomise(df[\u001b[39m\"\u001b[39;49m\u001b[39mGender\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\diffprivlib\\mechanisms\\binary.py:110\u001b[0m, in \u001b[0;36mBinary.randomise\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mrandomise\u001b[39m(\u001b[39mself\u001b[39m, value):\n\u001b[0;32m 97\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Randomise `value` with the mechanism.\u001b[39;00m\n\u001b[0;32m 98\u001b[0m \n\u001b[0;32m 99\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 108\u001b[0m \n\u001b[0;32m 109\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 110\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_check_all(value)\n\u001b[0;32m 112\u001b[0m indicator \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mif\u001b[39;00m value \u001b[39m==\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0 \u001b[39melse\u001b[39;00m \u001b[39m1\u001b[39m\n\u001b[0;32m 114\u001b[0m unif_rv \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_rng\u001b[39m.\u001b[39mrandom() \u001b[39m*\u001b[39m (np\u001b[39m.\u001b[39mexp(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mepsilon) \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m)\n",
"File \u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\myenv\\lib\\site-packages\\diffprivlib\\mechanisms\\binary.py:80\u001b[0m, in \u001b[0;36mBinary._check_all\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m 77\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_labels(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1)\n\u001b[0;32m 79\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, \u001b[39mstr\u001b[39m):\n\u001b[1;32m---> 80\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mValue to be randomised must be a string\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 82\u001b[0m \u001b[39mif\u001b[39;00m value \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1]:\n\u001b[0;32m 83\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mValue to be randomised is not in the domain \u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue0\u001b[39m}\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m\\\"\u001b[39;00m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalue1\u001b[39m}\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 84\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mgot \u001b[39m\u001b[39m\\\"\u001b[39;00m\u001b[39m{\u001b[39;00mvalue\u001b[39m}\u001b[39;00m\u001b[39m\\\"\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[1;31mTypeError\u001b[0m: Value to be randomised must be a string"
]
}
],
"source": [
"mechanism = binary.Binary(epsilon=1.0,value0=\"F\",value1=\"M\")\n",
"df[\"Gender\"] = mechanism.randomise(df[\"Gender\"])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['F', 'M'], dtype=object)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Gender\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"def binaryCheck(df,col):\n",
" data=list(df[col].unique())\n",
" # print(data)\n",
" mechanism = binary.Binary(epsilon=1.0,value0=data[0],value1=data[1])\n",
" for d in range(len(df[col])):\n",
" temp=df.loc[d,col]\n",
" # print(\"==/\",temp)\n",
" df.loc[d,col]=mechanism.randomise(temp)\n",
" # print(\"=====\",temp,df.loc[d,col])"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"binaryCheck(df,\"Gender\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Employee_ID | \n",
" Gender | \n",
" Age | \n",
" Education_Level | \n",
" Relationship_Status | \n",
" Hometown | \n",
" Unit | \n",
" Decision_skill_possess | \n",
" Time_of_service | \n",
" Time_since_promotion | \n",
" growth_rate | \n",
" Travel_Rate | \n",
" Post_Level | \n",
" Pay_Scale | \n",
" Compensation_and_Benefits | \n",
" Work_Life_balance | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EID_22713 | \n",
" F | \n",
" 32 | \n",
" 5 | \n",
" Single | \n",
" Springfield | \n",
" R&D | \n",
" Conceptual | \n",
" 7 | \n",
" 4 | \n",
" 30 | \n",
" 1 | \n",
" 5 | \n",
" 4 | \n",
" type2 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" EID_9658 | \n",
" M | \n",
" 65 | \n",
" 2 | \n",
" Single | \n",
" Lebanon | \n",
" IT | \n",
" Directive | \n",
" 41 | \n",
" 2 | \n",
" 72 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" type2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" EID_22203 | \n",
" M | \n",
" 52 | \n",
" 3 | \n",
" Married | \n",
" Springfield | \n",
" Sales | \n",
" Directive | \n",
" 21 | \n",
" 3 | \n",
" 25 | \n",
" 0 | \n",
" 1 | \n",
" 8 | \n",
" type3 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" EID_7652 | \n",
" M | \n",
" 50 | \n",
" 5 | \n",
" Single | \n",
" Washington | \n",
" Marketing | \n",
" Analytical | \n",
" 11 | \n",
" 4 | \n",
" 28 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" type0 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" EID_6516 | \n",
" F | \n",
" 44 | \n",
" 3 | \n",
" Married | \n",
" Franklin | \n",
" R&D | \n",
" Conceptual | \n",
" 12 | \n",
" 4 | \n",
" 47 | \n",
" 1 | \n",
" 3 | \n",
" 2 | \n",
" type2 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" EID_20283 | \n",
" F | \n",
" 22 | \n",
" 4 | \n",
" Married | \n",
" Franklin | \n",
" IT | \n",
" Behavioral | \n",
" 3 | \n",
" 1 | \n",
" 53 | \n",
" 0 | \n",
" 3 | \n",
" 6 | \n",
" type2 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" EID_21014 | \n",
" M | \n",
" 42 | \n",
" 3 | \n",
" Married | \n",
" Washington | \n",
" Purchasing | \n",
" Analytical | \n",
" 6 | \n",
" 4 | \n",
" 35 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" type2 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" EID_7693 | \n",
" F | \n",
" 41 | \n",
" 2 | \n",
" Married | \n",
" Springfield | \n",
" Sales | \n",
" Conceptual | \n",
" 4 | \n",
" 4 | \n",
" 35 | \n",
" 1 | \n",
" 4 | \n",
" 8 | \n",
" type2 | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" EID_13232 | \n",
" M | \n",
" 31 | \n",
" 1 | \n",
" Single | \n",
" Springfield | \n",
" IT | \n",
" Analytical | \n",
" 7 | \n",
" 3 | \n",
" 73 | \n",
" 2 | \n",
" 3 | \n",
" 8 | \n",
" type2 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n",
"0 EID_22713 F 32 5 Single Springfield \n",
"1 EID_9658 M 65 2 Single Lebanon \n",
"2 EID_22203 M 52 3 Married Springfield \n",
"3 EID_7652 M 50 5 Single Washington \n",
"4 EID_6516 F 44 3 Married Franklin \n",
"5 EID_20283 F 22 4 Married Franklin \n",
"6 EID_21014 M 42 3 Married Washington \n",
"7 EID_7693 F 41 2 Married Springfield \n",
"8 EID_13232 M 31 1 Single Springfield \n",
"\n",
" Unit Decision_skill_possess Time_of_service Time_since_promotion \\\n",
"0 R&D Conceptual 7 4 \n",
"1 IT Directive 41 2 \n",
"2 Sales Directive 21 3 \n",
"3 Marketing Analytical 11 4 \n",
"4 R&D Conceptual 12 4 \n",
"5 IT Behavioral 3 1 \n",
"6 Purchasing Analytical 6 4 \n",
"7 Sales Conceptual 4 4 \n",
"8 IT Analytical 7 3 \n",
"\n",
" growth_rate Travel_Rate Post_Level Pay_Scale Compensation_and_Benefits \\\n",
"0 30 1 5 4 type2 \n",
"1 72 1 1 1 type2 \n",
"2 25 0 1 8 type3 \n",
"3 28 1 1 2 type0 \n",
"4 47 1 3 2 type2 \n",
"5 53 0 3 6 type2 \n",
"6 35 1 3 4 type2 \n",
"7 35 1 4 8 type2 \n",
"8 73 2 3 8 type2 \n",
"\n",
" Work_Life_balance \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 4 \n",
"4 4 \n",
"5 1 \n",
"6 1 \n",
"7 1 \n",
"8 3 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"minv=df.Age.min()\n",
"maxv=df.Age.max()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 20\n"
]
}
],
"source": [
"import math\n",
"\n",
"base=10\n",
"maxrange=math.ceil(maxv / base) * base\n",
"minrange=round(minv/base)*base\n",
"\n",
"print(maxrange,minrange)\n",
"diff=maxrange-minrange\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n",
"4\n",
"['20.0-30.0', '30.0-40.0', '40.0-50.0', '50.0-60.0', '60.0-70']\n",
"[20.0, 30.0, 40.0, 50.0, 60.0, 70]\n"
]
}
],
"source": [
"range_magnitude = abs(maxrange - minrange)\n",
"# print(range_magnitude)\n",
"# Determine the number of ranges based on the magnitude``\n",
"num_ranges = max(range_magnitude // 10, 1) # Assuming a minimum range size of 10\n",
"\n",
"# Calculate the interval\n",
"interval = range_magnitude / num_ranges\n",
"\n",
"ranges = []\n",
"binlist=set()\n",
"lablelist=[]\n",
"\n",
"for i in range(num_ranges):\n",
" start = minrange + i * interval\n",
" end = minrange + (i + 1) * interval\n",
" if(i==num_ranges-1):\n",
" # print(i)\n",
" end=maxrange\n",
" binlist.add(start)\n",
" binlist.add(end)\n",
" lablelist.append(f\"{start}-{end}\")\n",
" # ranges.append((start, end))\n",
"binlist=sorted(list(binlist))\n",
"print(lablelist)\n",
"print(binlist)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['20.0-30.0', '30.0-40.0', '40.0-50.0', '50.0-60.0', '60.0-70.0']\n",
"[20.0, 30.0, 40.0, 50.0, 60.0, 70.0]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Employee_ID | \n",
" Gender | \n",
" Age | \n",
" Education_Level | \n",
" Relationship_Status | \n",
" Hometown | \n",
" Unit | \n",
" Decision_skill_possess | \n",
" Time_of_service | \n",
" Time_since_promotion | \n",
" growth_rate | \n",
" Travel_Rate | \n",
" Post_Level | \n",
" Pay_Scale | \n",
" Compensation_and_Benefits | \n",
" Work_Life_balance | \n",
" Age1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EID_22713 | \n",
" F | \n",
" 32 | \n",
" 5 | \n",
" Single | \n",
" Springfield | \n",
" R&D | \n",
" Conceptual | \n",
" 7 | \n",
" 4 | \n",
" 30 | \n",
" 1 | \n",
" 5 | \n",
" 4 | \n",
" type2 | \n",
" 1 | \n",
" 30.0-40.0 | \n",
"
\n",
" \n",
" 1 | \n",
" EID_9658 | \n",
" M | \n",
" 65 | \n",
" 2 | \n",
" Single | \n",
" Lebanon | \n",
" IT | \n",
" Directive | \n",
" 41 | \n",
" 2 | \n",
" 72 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" type2 | \n",
" 1 | \n",
" 60.0-70.0 | \n",
"
\n",
" \n",
" 2 | \n",
" EID_22203 | \n",
" M | \n",
" 52 | \n",
" 3 | \n",
" Married | \n",
" Springfield | \n",
" Sales | \n",
" Directive | \n",
" 21 | \n",
" 3 | \n",
" 25 | \n",
" 0 | \n",
" 1 | \n",
" 8 | \n",
" type3 | \n",
" 1 | \n",
" 50.0-60.0 | \n",
"
\n",
" \n",
" 3 | \n",
" EID_7652 | \n",
" M | \n",
" 50 | \n",
" 5 | \n",
" Single | \n",
" Washington | \n",
" Marketing | \n",
" Analytical | \n",
" 11 | \n",
" 4 | \n",
" 28 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" type0 | \n",
" 4 | \n",
" 40.0-50.0 | \n",
"
\n",
" \n",
" 4 | \n",
" EID_6516 | \n",
" F | \n",
" 44 | \n",
" 3 | \n",
" Married | \n",
" Franklin | \n",
" R&D | \n",
" Conceptual | \n",
" 12 | \n",
" 4 | \n",
" 47 | \n",
" 1 | \n",
" 3 | \n",
" 2 | \n",
" type2 | \n",
" 4 | \n",
" 40.0-50.0 | \n",
"
\n",
" \n",
" 5 | \n",
" EID_20283 | \n",
" F | \n",
" 22 | \n",
" 4 | \n",
" Married | \n",
" Franklin | \n",
" IT | \n",
" Behavioral | \n",
" 3 | \n",
" 1 | \n",
" 53 | \n",
" 0 | \n",
" 3 | \n",
" 6 | \n",
" type2 | \n",
" 1 | \n",
" 20.0-30.0 | \n",
"
\n",
" \n",
" 6 | \n",
" EID_21014 | \n",
" M | \n",
" 42 | \n",
" 3 | \n",
" Married | \n",
" Washington | \n",
" Purchasing | \n",
" Analytical | \n",
" 6 | \n",
" 4 | \n",
" 35 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" type2 | \n",
" 1 | \n",
" 40.0-50.0 | \n",
"
\n",
" \n",
" 7 | \n",
" EID_7693 | \n",
" F | \n",
" 41 | \n",
" 2 | \n",
" Married | \n",
" Springfield | \n",
" Sales | \n",
" Conceptual | \n",
" 4 | \n",
" 4 | \n",
" 35 | \n",
" 1 | \n",
" 4 | \n",
" 8 | \n",
" type2 | \n",
" 1 | \n",
" 40.0-50.0 | \n",
"
\n",
" \n",
" 8 | \n",
" EID_13232 | \n",
" M | \n",
" 31 | \n",
" 1 | \n",
" Single | \n",
" Springfield | \n",
" IT | \n",
" Analytical | \n",
" 7 | \n",
" 3 | \n",
" 73 | \n",
" 2 | \n",
" 3 | \n",
" 8 | \n",
" type2 | \n",
" 3 | \n",
" 30.0-40.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Employee_ID Gender Age Education_Level Relationship_Status Hometown \\\n",
"0 EID_22713 F 32 5 Single Springfield \n",
"1 EID_9658 M 65 2 Single Lebanon \n",
"2 EID_22203 M 52 3 Married Springfield \n",
"3 EID_7652 M 50 5 Single Washington \n",
"4 EID_6516 F 44 3 Married Franklin \n",
"5 EID_20283 F 22 4 Married Franklin \n",
"6 EID_21014 M 42 3 Married Washington \n",
"7 EID_7693 F 41 2 Married Springfield \n",
"8 EID_13232 M 31 1 Single Springfield \n",
"\n",
" Unit Decision_skill_possess Time_of_service Time_since_promotion \\\n",
"0 R&D Conceptual 7 4 \n",
"1 IT Directive 41 2 \n",
"2 Sales Directive 21 3 \n",
"3 Marketing Analytical 11 4 \n",
"4 R&D Conceptual 12 4 \n",
"5 IT Behavioral 3 1 \n",
"6 Purchasing Analytical 6 4 \n",
"7 Sales Conceptual 4 4 \n",
"8 IT Analytical 7 3 \n",
"\n",
" growth_rate Travel_Rate Post_Level Pay_Scale Compensation_and_Benefits \\\n",
"0 30 1 5 4 type2 \n",
"1 72 1 1 1 type2 \n",
"2 25 0 1 8 type3 \n",
"3 28 1 1 2 type0 \n",
"4 47 1 3 2 type2 \n",
"5 53 0 3 6 type2 \n",
"6 35 1 3 4 type2 \n",
"7 35 1 4 8 type2 \n",
"8 73 2 3 8 type2 \n",
"\n",
" Work_Life_balance Age1 \n",
"0 1 30.0-40.0 \n",
"1 1 60.0-70.0 \n",
"2 1 50.0-60.0 \n",
"3 4 40.0-50.0 \n",
"4 4 40.0-50.0 \n",
"5 1 20.0-30.0 \n",
"6 1 40.0-50.0 \n",
"7 1 40.0-50.0 \n",
"8 3 30.0-40.0 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Age1'] = pd.cut(df['Age'], bins=binlist, labels=lablelist)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'numpy.int64' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\WORK\\GIT\\cpy1\\responsible-ai-privacy\\responsible-ai-privacy\\src\\test.ipynb Cell 35\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m \u001b[39mmax\u001b[39;49m(\u001b[39m1\u001b[39;49m,\u001b[39m2\u001b[39;49m)\n",
"\u001b[1;31mTypeError\u001b[0m: 'numpy.int64' object is not callable"
]
}
],
"source": [
"max(1,2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}