mzameshina committed on
Commit
7519b8e
·
verified ·
1 Parent(s): eb37260

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +532 -58
app.py CHANGED
@@ -13,6 +13,7 @@ import numpy
13
  import pandas as pd
14
  import requests
15
  from fhe_anonymizer import FHEAnonymizer
 
16
  from utils_demo import *
17
 
18
  from concrete.ml.deployment import FHEModelClient
@@ -21,10 +22,12 @@ from models.speech_to_text.transcriber.audio import preprocess_audio
21
  from models.speech_to_text.transcriber.model import load_model_and_processor
22
  from models.speech_to_text.transcriber.audio import transcribe_audio
23
 
 
24
  # Ensure the directory is clean before starting processes or reading files
25
  clean_directory()
26
 
27
  anonymizer = FHEAnonymizer()
 
28
 
29
  # Start the Uvicorn server hosting the FastAPI app
30
  subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
@@ -32,16 +35,43 @@ time.sleep(3)
32
 
33
  # Load data from files required for the application
34
  UUID_MAP = read_json(MAPPING_UUID_PATH)
 
 
 
 
35
  MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
36
 
 
 
 
 
 
 
 
 
37
  # Generate a random user ID for this session
38
  USER_ID = numpy.random.randint(0, 2**32)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def key_gen_fn() -> Dict:
41
  """Generate keys for a given user."""
 
42
  print("------------ Step 1: Key Generation:")
 
43
  print(f"Your user ID is: {USER_ID}....")
44
 
 
45
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
46
  client.load()
47
 
@@ -54,17 +84,70 @@ def key_gen_fn() -> Dict:
54
 
55
  # Save the evaluation key
56
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
 
57
  write_bytes(evaluation_key_path, serialized_evaluation_keys)
58
 
 
 
59
  if not evaluation_key_path.is_file():
60
- error_message = f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
 
 
61
  print(error_message)
62
  return {gen_key_btn: gr.update(value=error_message)}
63
  else:
64
  print("Keys have been generated ✅")
65
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def encrypt_query_fn(query):
 
68
  print(f"\n------------ Step 2: Query encryption: {query=}")
69
 
70
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
@@ -73,29 +156,45 @@ def encrypt_query_fn(query):
73
  if is_user_query_valid(query):
74
  return {
75
  query_box: gr.update(
76
- value="Unable to process ❌: The request exceeds the length limit or falls outside the scope. Please refine your query."
 
 
 
77
  )
78
  }
79
 
 
80
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
81
  client.load()
82
 
83
  encrypted_tokens = []
 
 
84
  tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
85
 
86
  for token in tokens:
87
- if not bool(re.match(r"^\s+$", token)):
88
- emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
89
- encrypted_x = client.quantize_encrypt_serialize(emb_x)
90
- assert isinstance(encrypted_x, bytes)
91
- encrypted_tokens.append(encrypted_x)
 
 
 
 
 
 
 
 
92
 
93
  print("Data encrypted ✅ on Client Side")
94
 
95
  assert len({len(token) for token in encrypted_tokens}) == 1
96
 
97
  write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
98
- write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big"))
 
 
99
 
100
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
101
 
@@ -105,76 +204,169 @@ def encrypt_query_fn(query):
105
  identified_words_output_df: gr.update(visible=False, value=None),
106
  }
107
 
 
108
  def send_input_fn(query) -> Dict:
 
 
109
  print("------------ Step 3.1: Send encrypted_data to the Server")
110
 
111
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
112
  encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
113
  encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
114
 
115
- if not evaluation_key_path.is_file() or not encrypted_input_path.is_file():
116
- error_message = "Error: Key or encrypted input not found. Please generate the key and encrypt the query first."
 
 
 
 
 
 
 
 
 
 
117
  return {anonymized_query_output: gr.update(value=error_message)}
118
 
 
119
  data = {"user_id": USER_ID, "input": query}
 
120
  files = [
121
  ("files", open(evaluation_key_path, "rb")),
122
  ("files", open(encrypted_input_path, "rb")),
123
  ("files", open(encrypted_input_len_path, "rb")),
124
  ]
125
 
 
126
  url = SERVER_URL + "send_input"
127
- with requests.post(url=url, data=data, files=files) as resp:
 
 
 
 
 
128
  print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
129
 
 
130
  def run_fhe_in_server_fn() -> Dict:
 
 
131
  print("------------ Step 3.2: Run in FHE on the Server Side")
132
 
133
- data = {"user_id": USER_ID}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  url = SERVER_URL + "run_fhe"
135
 
136
- with requests.post(url=url, data=data) as response:
 
 
 
137
  if not response.ok:
138
  return {
139
  anonymized_query_output: gr.update(
140
- value="⚠️ An error occurred on the Server Side. Please check connectivity and data transmission."
 
 
 
141
  ),
142
  }
143
  else:
144
  time.sleep(1)
145
  print(f"The query anonymization was computed in {response.json():.2f} s per token.")
146
 
 
147
  def get_output_fn() -> Dict:
 
148
  print("------------ Step 3.3: Get the output from the Server Side")
149
 
150
- data = {"user_id": USER_ID}
151
- url = SERVER_URL + "get_output"
 
 
 
 
152
 
153
- with requests.post(url=url, data=data) as response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  if response.ok:
155
  print("Data received ✅ from the remote Server")
156
  response_data = response.json()
157
- encrypted_output = base64.b64decode(response_data["encrypted_output"])
158
- length_encrypted_output = base64.b64decode(response_data["length"])
 
 
 
 
 
 
 
159
 
160
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
161
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
 
162
  else:
163
- print("Error ❌ in getting data from the server")
 
164
 
165
  def decrypt_fn(text) -> Dict:
166
- print("------------ Step 4: Decrypt the data on the `Client Side`")
167
 
 
 
 
168
  encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
169
 
170
  if not encrypted_output_path.is_file():
171
- error_message = "⚠️ Error: Encrypted output not found. Please ensure the entire process has been completed."
 
 
 
 
 
 
172
  print(error_message)
 
173
  return error_message, None
174
 
 
175
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
176
  client.load()
177
 
 
178
  encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
179
  length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
180
 
@@ -184,7 +376,11 @@ def decrypt_fn(text) -> Dict:
184
 
185
  i = 0
186
  for token in tokens:
187
- if not bool(re.match(r"^\s+$", token)):
 
 
 
 
188
  encrypted_token = encrypted_output[i : i + length]
189
  prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
190
  probability = prediction_proba[0][1]
@@ -192,102 +388,380 @@ def decrypt_fn(text) -> Dict:
192
 
193
  if probability >= 0.77:
194
  identified_words_with_prob.append((token, probability))
 
 
195
  tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
196
  decrypted_output.append(tmp_uuid)
197
  UUID_MAP[token] = tmp_uuid
198
  else:
199
  decrypted_output.append(token)
200
 
201
- write_json(MAPPING_UUID_PATH, UUID_MAP)
 
202
 
 
203
  anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
204
 
205
- identified_df = pd.DataFrame(
206
- identified_words_with_prob, columns=["Identified Words", "Probability"]
207
- ) if identified_words_with_prob else pd.DataFrame(columns=["Identified Words", "Probability"])
 
 
 
 
208
 
209
  print("Decryption done ✅ on Client Side")
210
 
211
  return anonymized_text, identified_df
212
 
213
- def anonymization_with_fn(query):
 
 
214
  encrypt_query_fn(query)
 
215
  send_input_fn(query)
 
216
  run_fhe_in_server_fn()
 
217
  get_output_fn()
 
218
  anonymized_text, identified_df = decrypt_fn(query)
219
 
220
  return {
 
221
  anonymized_query_output: gr.update(value=anonymized_text),
222
- identified_words_output_df: gr.update(value=identified_df, visible=True),
223
  }
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
226
 
227
  with demo:
 
 
 
 
 
 
 
 
 
 
228
  gr.Markdown(
229
  """
230
- <h1 style="text-align: center;">Secure De-Identification of Text Data using FHE</h1>
 
 
 
 
 
 
 
 
 
 
 
231
  """
232
  )
233
 
234
  gr.Markdown(
235
  """
236
  <p align="center" style="font-size: 18px;">
237
- This demo showcases privacy-preserving de-identification of text data using Fully Homomorphic Encryption (FHE).
 
 
 
 
 
 
 
 
238
  </p>
239
  """
240
  )
241
 
242
- ########################## Key Gen Part ##########################
243
- gr.Markdown(
244
- "## Step 1: Generate the keys\n\n"
245
- """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created: secret keys for encrypting and decrypting user data,
246
- and evaluation keys for the server to work on encrypted data without seeing the actual content."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  )
 
 
248
 
 
 
 
 
 
 
 
 
 
 
249
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
250
- gen_key_btn.click(key_gen_fn, inputs=[], outputs=[gen_key_btn])
 
 
 
 
251
 
252
- ########################## User Query Part ##########################
253
- gr.Markdown("## Step 2: Enter the prompt you want to encrypt and de-identify")
254
 
255
- query_box = gr.Textbox(
256
- value="Hello. My name is John Doe. I live at 123 Main St, Anytown, USA.",
257
- label="Enter your prompt:",
258
- interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  )
260
 
261
- encrypt_query_btn = gr.Button("Encrypt the prompt")
262
- output_encrypted_box = gr.Textbox(
263
- label="Encrypted prompt (will be sent to the de-identification server):",
264
- lines=4,
 
 
 
 
 
265
  )
266
 
267
- encrypt_query_btn.click(
268
- fn=encrypt_query_fn,
269
- inputs=[query_box],
270
- outputs=[query_box, output_encrypted_box],
 
 
 
 
 
 
 
 
 
 
 
271
  )
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  ########################## FHE processing Part ##########################
274
- gr.Markdown("## Step 3: De-identify the prompt using FHE")
 
 
275
  gr.Markdown(
276
- """The encrypted prompt will be sent to a remote server for de-identification using FHE.
277
- The server performs computations on the encrypted data and returns the result for decryption."""
 
 
278
  )
279
 
280
  run_fhe_btn = gr.Button("De-identify using FHE")
281
- anonymized_query_output = gr.Textbox(
282
- label="De-identified prompt", lines=4, interactive=True
283
- )
 
 
 
 
 
 
 
 
 
 
 
 
284
  identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  run_fhe_btn.click(
287
  anonymization_with_fn,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  inputs=[query_box],
289
- outputs=[anonymized_query_output, identified_words_output_df],
290
  )
291
 
 
292
  # Launch the app
293
  demo.launch(share=False)
 
13
  import pandas as pd
14
  import requests
15
  from fhe_anonymizer import FHEAnonymizer
16
+ #from openai import OpenAI
17
  from utils_demo import *
18
 
19
  from concrete.ml.deployment import FHEModelClient
 
22
  from models.speech_to_text.transcriber.model import load_model_and_processor
23
  from models.speech_to_text.transcriber.audio import transcribe_audio
24
 
25
+
26
  # Ensure the directory is clean before starting processes or reading files
27
  clean_directory()
28
 
29
  anonymizer = FHEAnonymizer()
30
+ #client = OpenAI(api_key=os.environ.get("openaikey"))
31
 
32
  # Start the Uvicorn server hosting the FastAPI app
33
  subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
 
35
 
36
  # Load data from files required for the application
37
  UUID_MAP = read_json(MAPPING_UUID_PATH)
38
+ ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
39
+ MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
40
+ MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
41
+ ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
42
  MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
43
 
44
+ print(f"{ORIGINAL_DOCUMENT=}\n")
45
+ print(f"{MAPPING_DOC_EMBEDDING.keys()=}")
46
+
47
+ # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
48
+
49
+ # 5. Utilizing External Services or APIs
50
+ # (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
51
+
52
  # Generate a random user ID for this session
53
  USER_ID = numpy.random.randint(0, 2**32)
54
 
55
+
56
def select_static_anonymized_sentences_fn(selected_sentences: List):
    """Return the pre-computed anonymized text for the selected sentences.

    Each selected sentence is looked up in ``MAPPING_ANONYMIZED_SENTENCES``;
    every mapped value is unpacked as a two-element ``(sort_key, sentence)``
    pair. The pairs are ordered by their first element and the sentences are
    joined with a blank line between them.

    Args:
        selected_sentences: Keys of ``MAPPING_ANONYMIZED_SENTENCES``.

    Returns:
        The anonymized sentences joined with ``"\\n\\n"``.

    Raises:
        KeyError: If a selected sentence is not in the mapping.
    """
    # presumably the first element is the sentence's position in the document -- verify
    ordered_pairs = []
    for original_sentence in selected_sentences:
        ordered_pairs.append(MAPPING_ANONYMIZED_SENTENCES[original_sentence])

    # list.sort is stable, like sorted(): ties keep their selection order.
    ordered_pairs.sort(key=lambda pair: pair[0])

    return "\n\n".join([anonymized for _, anonymized in ordered_pairs])
65
+
66
+
67
  def key_gen_fn() -> Dict:
68
  """Generate keys for a given user."""
69
+
70
  print("------------ Step 1: Key Generation:")
71
+
72
  print(f"Your user ID is: {USER_ID}....")
73
 
74
+
75
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
76
  client.load()
77
 
 
84
 
85
  # Save the evaluation key
86
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
87
+
88
  write_bytes(evaluation_key_path, serialized_evaluation_keys)
89
 
90
+ # anonymizer.generate_key()
91
+
92
  if not evaluation_key_path.is_file():
93
+ error_message = (
94
+ f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
95
+ )
96
  print(error_message)
97
  return {gen_key_btn: gr.update(value=error_message)}
98
  else:
99
  print("Keys have been generated ✅")
100
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
101
 
102
+
103
def encrypt_doc_fn(doc):
    """Encrypt the word tokens of the selected document on the client side.

    Args:
        doc: Iterable of text pieces; they are joined with single spaces
            before tokenization.

    Returns:
        A dict of Gradio updates: either an error message for
        ``encrypted_doc_box`` when no evaluation key exists yet, or a short
        hex preview of every encrypted token for ``encrypted_doc_box`` plus a
        reset of ``anonymized_doc_output``.

    Raises:
        KeyError: If a word token has no entry in ``MAPPING_DOC_EMBEDDING``.
    """
    print(f"\n------------ Step 2.1: Doc encryption: {doc=}")

    # Encryption needs the keys produced by the key-generation step.
    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
        return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}

    # Retrieve the client API
    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
    client.load()

    pieces = re.findall(
        r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|\$\d+(?:\.\d+)?|\€\d+(?:\.\d+)?)", ' '.join(doc)
    )

    hex_previews = []
    for piece in pieces:
        # Only tokens that start with a word character are encrypted.
        if not (piece.strip() and re.match(r"\w+", piece)):
            continue

        embedding = MAPPING_DOC_EMBEDDING[piece]
        assert embedding.shape == (1, 1024)

        ciphertext = client.quantize_encrypt_serialize(embedding)
        assert isinstance(ciphertext, bytes)

        # The ciphertext itself is not stored; only a 10-character hex slice is shown.
        hex_previews.append(ciphertext.hex()[500:510])

    print("Doc encrypted ✅ on Client Side")

    return {
        encrypted_doc_box: gr.update(value=" ".join(hex_previews), lines=10),
        anonymized_doc_output: gr.update(visible=True, value=None),
    }
136
+
137
+ import presidio_analyzer
138
+ import presidio_anonymizer
139
+ from presidio_analyzer import AnalyzerEngine
140
+ from presidio_anonymizer import AnonymizerEngine
141
+
142
# Presidio engines, created on first use and then reused by every call.
_PRESIDIO_ENGINES = {}


def anonymization_with_presidio(prompt):
    """Anonymize *prompt* with Presidio and return the anonymized text.

    The analyzer and anonymizer engines are built once and cached instead of
    being re-created for every prompt. Local names no longer shadow the
    module-level ``anonymizer`` object.

    Args:
        prompt: English text to analyze and anonymize.

    Returns:
        The ``text`` attribute of Presidio's anonymization result.
    """
    if not _PRESIDIO_ENGINES:
        # Build both engines before publishing them, so a failure in either
        # constructor never leaves a half-filled cache behind.
        analyzer_engine = AnalyzerEngine()
        anonymizer_engine = AnonymizerEngine()
        _PRESIDIO_ENGINES.update(analyzer=analyzer_engine, anonymizer=anonymizer_engine)

    findings = _PRESIDIO_ENGINES["analyzer"].analyze(text=prompt, language="en")
    outcome = _PRESIDIO_ENGINES["anonymizer"].anonymize(text=prompt, analyzer_results=findings)
    return outcome.text
148
+
149
  def encrypt_query_fn(query):
150
+
151
  print(f"\n------------ Step 2: Query encryption: {query=}")
152
 
153
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
 
156
  if is_user_query_valid(query):
157
  return {
158
  query_box: gr.update(
159
+ value=(
160
+ "Unable to process ❌: The request exceeds the length limit or falls "
161
+ "outside the scope of this document. Please refine your query."
162
+ )
163
  )
164
  }
165
 
166
+ # Retrieve the client API
167
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
168
  client.load()
169
 
170
  encrypted_tokens = []
171
+
172
+ # Pattern to identify words and non-words (including punctuation, spaces, etc.)
173
  tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
174
 
175
  for token in tokens:
176
+
177
+ # 1- Skip whitespace-only tokens: they are never embedded or encrypted
178
+ if bool(re.match(r"^\s+$", token)):
179
+ continue
180
+
181
+ # 2- Every remaining token (words and punctuation runs) is embedded and encrypted below
182
+
183
+ # Prediction for each word
184
+ emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
185
+ encrypted_x = client.quantize_encrypt_serialize(emb_x)
186
+ assert isinstance(encrypted_x, bytes)
187
+
188
+ encrypted_tokens.append(encrypted_x)
189
 
190
  print("Data encrypted ✅ on Client Side")
191
 
192
  assert len({len(token) for token in encrypted_tokens}) == 1
193
 
194
  write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
195
+ write_bytes(
196
+ KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
197
+ )
198
 
199
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
200
 
 
204
  identified_words_output_df: gr.update(visible=False, value=None),
205
  }
206
 
207
+
208
  def send_input_fn(query) -> Dict:
209
+ """Send the encrypted data and the evaluation key to the server."""
210
+
211
  print("------------ Step 3.1: Send encrypted_data to the Server")
212
 
213
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
214
  encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
215
  encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
216
 
217
+ if not evaluation_key_path.is_file():
218
+ error_message = (
219
+ "Error Encountered While Sending Data to the Server: "
220
+ f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
221
+ )
222
+ return {anonymized_query_output: gr.update(value=error_message)}
223
+
224
+ if not encrypted_input_path.is_file():
225
+ error_message = (
226
+ "Error Encountered While Sending Data to the Server: The data has not been encrypted "
227
+ f"correctly on the client side - {encrypted_input_path.is_file()=}"
228
+ )
229
  return {anonymized_query_output: gr.update(value=error_message)}
230
 
231
+ # Define the data and files to post
232
  data = {"user_id": USER_ID, "input": query}
233
+
234
  files = [
235
  ("files", open(evaluation_key_path, "rb")),
236
  ("files", open(encrypted_input_path, "rb")),
237
  ("files", open(encrypted_input_len_path, "rb")),
238
  ]
239
 
240
+ # Send the encrypted input and evaluation key to the server
241
  url = SERVER_URL + "send_input"
242
+
243
+ with requests.post(
244
+ url=url,
245
+ data=data,
246
+ files=files,
247
+ ) as resp:
248
  print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
249
 
250
+
251
  def run_fhe_in_server_fn() -> Dict:
252
+ """Run in FHE the anonymization of the query"""
253
+
254
  print("------------ Step 3.2: Run in FHE on the Server Side")
255
 
256
+ evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
257
+ encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
258
+
259
+ if not evaluation_key_path.is_file():
260
+ error_message = (
261
+ "Error Encountered While Sending Data to the Server: "
262
+ f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
263
+ )
264
+ return {anonymized_query_output: gr.update(value=error_message)}
265
+
266
+ if not encrypted_input_path.is_file():
267
+ error_message = (
268
+ "Error Encountered While Sending Data to the Server: The data has not been encrypted "
269
+ f"correctly on the client side - {encrypted_input_path.is_file()=}"
270
+ )
271
+ return {anonymized_query_output: gr.update(value=error_message)}
272
+
273
+ data = {
274
+ "user_id": USER_ID,
275
+ }
276
+
277
  url = SERVER_URL + "run_fhe"
278
 
279
+ with requests.post(
280
+ url=url,
281
+ data=data,
282
+ ) as response:
283
  if not response.ok:
284
  return {
285
  anonymized_query_output: gr.update(
286
+ value=(
287
+ "⚠️ An error occurred on the Server Side. "
288
+ "Please check connectivity and data transmission."
289
+ ),
290
  ),
291
  }
292
  else:
293
  time.sleep(1)
294
  print(f"The query anonymization was computed in {response.json():.2f} s per token.")
295
 
296
+
297
  def get_output_fn() -> Dict:
298
+
299
  print("------------ Step 3.3: Get the output from the Server Side")
300
 
301
+ if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
302
+ error_message = (
303
+ "Error Encountered While Sending Data to the Server: "
304
+ "The key has not been generated correctly"
305
+ )
306
+ return {anonymized_query_output: gr.update(value=error_message)}
307
 
308
+ if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
309
+ error_message = (
310
+ "Error Encountered While Sending Data to the Server: "
311
+ "The data has not been encrypted correctly on the client side"
312
+ )
313
+ return {anonymized_query_output: gr.update(value=error_message)}
314
+
315
+ data = {
316
+ "user_id": USER_ID,
317
+ }
318
+
319
+ # Retrieve the encrypted output
320
+ url = SERVER_URL + "get_output"
321
+ with requests.post(
322
+ url=url,
323
+ data=data,
324
+ ) as response:
325
  if response.ok:
326
  print("Data received ✅ from the remote Server")
327
  response_data = response.json()
328
+ encrypted_output_base64 = response_data["encrypted_output"]
329
+ length_encrypted_output_base64 = response_data["length"]
330
+
331
+ # Decode the base64 encoded data
332
+ encrypted_output = base64.b64decode(encrypted_output_base64)
333
+ length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
334
+
335
+ # Save the encrypted output to bytes in a file as it is too large to pass through
336
+ # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
337
 
338
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
339
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
340
+
341
  else:
342
+ print("Error ❌ in getting data to the server")
343
+
344
 
345
  def decrypt_fn(text) -> Dict:
346
+ """Decrypt the data on the `Client Side`."""
347
 
348
+ print("------------ Step 4: Dencrypt the data on the `Client Side`")
349
+
350
+ # Get the encrypted output path
351
  encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
352
 
353
  if not encrypted_output_path.is_file():
354
+ error_message = """⚠️ Please ensure that: \n
355
+ - the connectivity \n
356
+ - the query has been submitted \n
357
+ - the evaluation key has been generated \n
358
+ - the server processed the encrypted data \n
359
+ - the Client received the data from the Server before decrypting the prediction
360
+ """
361
  print(error_message)
362
+
363
  return error_message, None
364
 
365
+ # Retrieve the client API
366
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
367
  client.load()
368
 
369
+ # Load the encrypted output as bytes
370
  encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
371
  length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
372
 
 
376
 
377
  i = 0
378
  for token in tokens:
379
+
380
+ # Skip whitespace-only tokens: they are not appended to the output
381
+ if bool(re.match(r"^\s+$", token)):
382
+ continue
383
+ else:
384
  encrypted_token = encrypted_output[i : i + length]
385
  prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
386
  probability = prediction_proba[0][1]
 
388
 
389
  if probability >= 0.77:
390
  identified_words_with_prob.append((token, probability))
391
+
392
+ # Use the existing UUID if available, otherwise generate a new one
393
  tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
394
  decrypted_output.append(tmp_uuid)
395
  UUID_MAP[token] = tmp_uuid
396
  else:
397
  decrypted_output.append(token)
398
 
399
+ # Update the UUID map with query.
400
+ write_json(MAPPING_UUID_PATH, UUID_MAP)
401
 
402
+ # Removing Spaces Before Punctuation:
403
  anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
404
 
405
+ # Convert the list of identified words and probabilities into a DataFrame
406
+ if identified_words_with_prob:
407
+ identified_df = pd.DataFrame(
408
+ identified_words_with_prob, columns=["Identified Words", "Probability"]
409
+ )
410
+ else:
411
+ identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
412
 
413
  print("Decryption done ✅ on Client Side")
414
 
415
  return anonymized_text, identified_df
416
 
417
+
418
def anonymization_with_fn(selected_sentences, query):
    """Run the full query pipeline and build the Gradio updates for its results.

    The steps run in order: encrypt the query, send it to the server, run FHE
    on the server, fetch the output, then decrypt it.

    Args:
        selected_sentences: Sentences whose static anonymized version is shown.
        query: The user query to anonymize.

    Returns:
        A dict of Gradio updates for ``anonymized_doc_output``,
        ``anonymized_query_output`` and ``identified_words_output_df``
        (the latter is kept hidden).
    """
    # NOTE(review): the values returned by the intermediate steps are discarded,
    # so an error dict returned by one of them never reaches the UI.
    encrypt_query_fn(query)
    send_input_fn(query)
    run_fhe_in_server_fn()
    get_output_fn()

    anonymized_text, identified_df = decrypt_fn(query)
    static_document = select_static_anonymized_sentences_fn(selected_sentences)

    updates = {}
    updates[anonymized_doc_output] = gr.update(value=static_document)
    updates[anonymized_query_output] = gr.update(value=anonymized_text)
    updates[identified_words_output_df] = gr.update(value=identified_df, visible=False)
    return updates
435
 
436
# Folder containing the audio files offered by the demo (relative to the working directory)
AUDIO_FOLDER_PATH = "./files/"


# Function to list available audio files in the folder
def get_audio_files(folder=None):
    """Return the sorted names of the ``.wav``/``.mp3`` files found in *folder*.

    Args:
        folder: Directory to scan; defaults to ``AUDIO_FOLDER_PATH``.

    Returns:
        File names (not full paths) in sorted order. Extensions are matched
        case-insensitively and sub-directories are ignored. An empty list is
        returned when the folder does not exist.
    """
    directory = AUDIO_FOLDER_PATH if folder is None else folder

    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        # A missing folder simply means there is nothing to offer.
        return []

    # os.listdir order is arbitrary; sort so the listing is stable between runs.
    return sorted(
        entry
        for entry in entries
        if entry.lower().endswith((".wav", ".mp3"))
        and os.path.isfile(os.path.join(directory, entry))
    )
443
+
444
+ # Step 1: Load and display audio file
445
def load_audio_file(selected_audio):
    """Return the path of the selected audio file inside ``AUDIO_FOLDER_PATH``.

    Args:
        selected_audio: File name chosen in the dropdown, or ``None``/empty
            when nothing is selected.

    Returns:
        The joined path, or ``None`` when there is no selection, so that
        ``os.path.join`` is never called with ``None``.
    """
    if not selected_audio:
        return None
    return os.path.join(AUDIO_FOLDER_PATH, selected_audio)
448
+
449
+ # Step 1.1: Record and save the audio file
450
def save_recorded_audio(audio):
    """Export *audio* as ``recorded_audio.wav`` in the audio folder and return that path.

    Args:
        audio: Object exposing ``export(path, format=...)``.
            # presumably a pydub AudioSegment -- verify against the caller

    Returns:
        The path the recording was exported to.
    """
    destination = os.path.join(AUDIO_FOLDER_PATH, "recorded_audio.wav")
    # Save the audio as a .wav file
    audio.export(destination, format="wav")
    return destination
454
+
455
def click_js():
    """Return a JavaScript function that clicks the first button under the element with id ``audio``."""
    script = """function audioRecord() {
var xPathRes = document.evaluate ('//*[@id="audio"]//button', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
xPathRes.singleNodeValue.click();}"""
    return script
459
+
460
+
461
def action(btn):
    """Return the next button label: 'Stop' when it currently reads 'Speak', else 'Speak'."""
    return 'Stop' if btn == 'Speak' else 'Speak'
467
+
468
+
469
def check_btn(btn):
    """Raise unless the button label is 'Speak'.

    Raises:
        Exception: With the message 'Recording...' for any other label.
    """
    if btn == 'Speak':
        return
    raise Exception('Recording...')
473
+
474
+
475
def transcribe():
    """Return the fixed status string 'Success'."""
    status = 'Success'
    return status
477
+
478
+
479
# Loaded (model, processor) pairs keyed by model name, so the model is loaded only once.
_WHISPER_CACHE = {}


def transcribe_audio_app(audio_path=None):
    """Transcribe the audio file at *audio_path* with the Whisper base model.

    Args:
        audio_path: Path of the audio file. It defaults to ``None`` because
            one of the click chains in this file invokes this function
            without inputs; an empty or missing path yields an empty
            transcription instead of an error.

    Returns:
        The value returned by ``transcribe_audio``, or ``""`` when no audio
        path is given.
    """
    if not audio_path:
        return ""

    model_name = "openai/whisper-base"

    # Load the model once and reuse it for later calls.
    # NOTE(review): assumes the loaded model/processor can be reused across calls -- confirm.
    if model_name not in _WHISPER_CACHE:
        _WHISPER_CACHE[model_name] = load_model_and_processor(model_name=model_name)
    model, processor = _WHISPER_CACHE[model_name]

    # Audio preprocessing
    audio = preprocess_audio(audio_path)

    # Transcription
    return transcribe_audio(model=model, processor=processor, audio=audio)
490
+
491
+
492
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
493
 
494
  with demo:
495
+
496
+
497
+
498
+ gr.Markdown(
499
+ """
500
+ <p align="center">
501
+ <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
502
+ </p>
503
+ """)
504
+
505
  gr.Markdown(
506
  """
507
+ <h1 style="text-align: center;">Secure De-Identification of Audio Files</h1>
508
+ <!--
509
+ <p align="center">
510
+ <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
511
+
512
+ <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
513
+
514
+ <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
515
+
516
+ <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
517
+ </p>
518
+ -->
519
  """
520
  )
521
 
522
  gr.Markdown(
523
  """
524
  <p align="center" style="font-size: 18px;">
525
+ Protecting personal data is more important than ever in today’s digital world. <b>Our project ensures privacy-preserving de-identification of audio data</b> using state-of-the-art <b>Fully Homomorphic Encryption (FHE)</b>, offering a secure and transparent solution for data anonymization.
526
+ </p>
527
+
528
+ <p align="center" style="font-size: 18px;">
529
+ Traditional methods of de-identification often fall short of true anonymization, merely concealing identifiable information. With FHE, we go beyond obfuscation to provide <b>complete security,</b> allowing computations to be performed directly on encrypted data without ever exposing sensitive details.
530
+ </p>
531
+
532
+ <p align="center" style="font-size: 18px;">
533
+ This technology is crucial in enabling organizations to use and share sensitive data responsibly, while fully respecting individual privacy.
534
  </p>
535
  """
536
  )
537
 
538
+
539
+ # Step 1: Add an audio file
540
+ gr.Markdown("## Step 1: Add an Audio File")
541
+ audio_files = get_audio_files()
542
+
543
+ with gr.Row():
544
+ audio_file_dropdown = gr.Dropdown(audio_files, label="Select an Audio File", interactive=True)
545
+ audio_output = gr.Audio(label="Selected Audio", type="filepath")
546
+
547
+ # When an audio file is selected, it will display the file path
548
+ audio_file_dropdown.change(fn=load_audio_file, inputs=[audio_file_dropdown], outputs=[audio_output])
549
+
550
+ with gr.Row():
551
+ transcribe_btn = gr.Button("Transcrire l'audio")
552
+ transcription_output = gr.Textbox(label="Transcription", lines=5)
553
+
554
+ transcribe_btn.click(
555
+ fn=transcribe_audio_app,
556
+ inputs=[audio_output],
557
+ outputs=[transcription_output]
558
  )
559
+
560
+
561
 
562
+
563
+ ########################## Step 1.1: Record Audio ##########################
564
+
565
+ gr.Markdown("## Step 1.1: Record an Audio File")
566
+ """
567
+ with gr.Row():
568
+ audio_recorder = gr.Audio(source="microphone", type="file", label="Record Audio")
569
+ record_output = gr.Audio(label="Recorded Audio", type="filepath")
570
+ # When the user records an audio, save it
571
+ audio_recorder.change(fn=save_recorded_audio, inputs=[audio_recorder], outputs=[record_output])
572
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
573
+ gen_key_btn.click(
574
+ key_gen_fn,
575
+ inputs=[],
576
+ outputs=[gen_key_btn],
577
+ ) """
578
 
579
# Text box that receives the transcription produced by the "Speak" flow.
msg = gr.Textbox()

audio_box = gr.Audio(label="Audio", type="filepath", elem_id='audio')

with gr.Row():
    audio_btn = gr.Button('Speak')
    clear = gr.Button("Clear")

# Chained handlers for the "Speak" button:
#   1. `action` receives the button and updates it,
#   2. `check_btn` then receives the button,
#   3. `.success` runs the transcription only if the previous step did not
#      raise, and writes the result into `msg`.
# The transcription step is given `audio_box` as input: every other call site
# of `transcribe_audio_app` passes it an audio component, and without an input
# the callback would be invoked with no audio at all.
# NOTE(review): confirm `audio_box` (rather than `audio_output`) is the
# intended source for this flow.
(
    audio_btn.click(fn=action, inputs=audio_btn, outputs=audio_btn)
    .then(fn=check_btn, inputs=audio_btn)
    .success(fn=transcribe_audio_app, inputs=audio_box, outputs=msg)
)

# Reset the transcription box without going through the queue.
clear.click(lambda: None, None, msg, queue=False)
592
+
593
########################## Transcription ##########################
# NOTE(review): this repeats the transcription row created right after the
# audio-file dropdown and rebinds `transcribe_btn` / `transcription_output`
# to the new components. Confirm whether both rows are meant to be shown.
with gr.Row():
    transcribe_btn = gr.Button("Transcrire l'audio")
    transcription_output = gr.Textbox(
        label="Transcription",
        lines=5,
    )

# Transcribe the audio held by `audio_output` into the box created above.
transcribe_btn.click(
    fn=transcribe_audio_app,
    inputs=[audio_output],
    outputs=[transcription_output],
)
603
 
604
########################## Key Gen Part ##########################

# Heading and short explanation of the two kinds of FHE keys.
gr.Markdown(
    "## Step 1.2: Generate the keys\n\n"
    """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created. The first
type, called secret keys, are used to encrypt and decrypt the user's data. The second type,
called evaluation keys, enables a server to work on the encrypted data without seeing the
actual data.
"""
)

gen_key_btn = gr.Button("Generate the secret and evaluation keys")

# The button is its own output: key_gen_fn takes no inputs and returns an
# update that is applied to the button itself.
gen_key_btn.click(
    fn=key_gen_fn,
    inputs=[],
    outputs=[gen_key_btn],
)
622
+
623
########################## Main document Part ##########################

gr.Markdown("<hr />")
gr.Markdown(
    "## Step 2.1: Select the document you want to encrypt\n\n"
    """To make it simple, we pre-compiled the following document, but you are free to choose
on which part you want to run this example.
"""
)

# Three columns: sentence picker | encrypt button | encrypted preview.
with gr.Row():
    with gr.Column(scale=5):
        # All sentences of the document are offered and pre-selected.
        original_sentences_box = gr.CheckboxGroup(
            choices=ORIGINAL_DOCUMENT,
            value=ORIGINAL_DOCUMENT,
            label="Contract:",
            show_label=True,
        )

    with gr.Column(scale=1, min_width=6):
        # Fixed-height spacer placed above the button.
        gr.HTML("<div style='height: 77px;'></div>")
        encrypt_doc_btn = gr.Button("Encrypt the document")

    with gr.Column(scale=5):
        encrypted_doc_box = gr.Textbox(
            label="Encrypted document:",
            show_label=True,
            interactive=False,
            lines=10,
        )
649
+
650
+
651
########################## User Query Part ##########################

gr.Markdown("<hr />")
gr.Markdown(
    "## Step 2.2: Select the prompt you want to encrypt\n\n"
    """Please choose from the predefined options in
<span style='color:grey'>“Prompt examples”</span> or craft a custom question in
the <span style='color:grey'>“Customized prompt”</span> text box.
Remain concise and relevant to the context. Any off-topic query will not be processed."""
)

# Three columns: prompt selection | encrypt button | encrypted preview.
with gr.Row():
    with gr.Column(scale=5):

        # NOTE(review): the example dropdown sits in its own nested column;
        # confirm the extent of that nesting against the original layout.
        with gr.Column(scale=5):
            default_query_box = gr.Dropdown(
                choices=list(DEFAULT_QUERIES.values()),
                label="PROMPT EXAMPLES:",
            )

        gr.Markdown("Or")

        query_box = gr.Textbox(
            value=" Hello. My name is Inuitvementoya. You kill my father. Prepare to die.",
            label="CUSTOMIZED PROMPT:",
            interactive=True,
        )

        # Picking an example copies it verbatim into the customized prompt box.
        default_query_box.change(
            fn=lambda selected_prompt: selected_prompt,
            inputs=[default_query_box],
            outputs=[query_box],
        )

    with gr.Column(scale=1, min_width=6):
        # Fixed-height spacer placed above the button.
        gr.HTML("<div style='height: 77px;'></div>")
        encrypt_query_btn = gr.Button("Encrypt the prompt")

    with gr.Column(scale=5):
        output_encrypted_box = gr.Textbox(
            label="Encrypted de-identified query that will be sent to the de-identification server:",
            lines=8,
        )
690
+
691
########################## FHE processing Part ##########################

gr.Markdown("<hr />")
gr.Markdown("## Step 3: De-identify the document and the prompt using FHE")
gr.Markdown(
    """Once the client encrypts the document and the prompt locally, it will be sent to a remote
server to perform the de-identification on encrypted data. When the computation is done, the
server will return the result to the client for decryption."""
)

run_fhe_btn = gr.Button("De-identify using FHE")

# Side-by-side result boxes: de-identified document (left) and prompt (right).
with gr.Row():
    with gr.Column(scale=5):
        anonymized_doc_output = gr.Textbox(
            # Label spelling fixed ("de-idenntified" -> "de-identified").
            label="Decrypted and de-identified document", lines=10, interactive=True
        )

    with gr.Column(scale=5):
        anonymized_query_output = gr.Textbox(
            label="Decrypted and de-identified prompt", lines=10, interactive=True
        )

# Created hidden (visible=False); it is listed as an output of the
# prompt-encryption and FHE click handlers.
identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
719
 
720
# --- Event wiring for the encryption and FHE buttons ---

# Encrypt the selected sentences; the handler fills the encrypted preview and
# the de-identified document box.
encrypt_doc_btn.click(
    fn=encrypt_doc_fn,
    inputs=[original_sentences_box],
    outputs=[encrypted_doc_box, anonymized_doc_output],
)

# Encrypt the prompt; the handler writes back to the prompt box itself, the
# encrypted preview, the de-identified prompt box and the identified-words table.
encrypt_query_btn.click(
    fn=encrypt_query_fn,
    inputs=[query_box],
    outputs=[
        query_box,
        output_encrypted_box,
        anonymized_query_output,
        identified_words_output_df,
    ],
)

# Run the FHE de-identification on both the selected sentences and the prompt.
run_fhe_btn.click(
    fn=anonymization_with_fn,
    inputs=[original_sentences_box, query_box],
    outputs=[
        anonymized_doc_output,
        anonymized_query_output,
        identified_words_output_df,
    ],
)
742
+
743
+
744
########################## Presidio ##########################
gr.Markdown("<hr />")
# NOTE(review): this heading reuses the "Step 3" number of the FHE section
# above; confirm whether that is intended.
gr.Markdown("## Step 3: De-identify the document and the prompt")
gr.Markdown(
    """This step will demonstrate de-identification using both FHE and Presidio methods.
The same prompt will be used for both to allow for direct comparison."""
)

with gr.Row():
    run_presidio_btn = gr.Button("De-identify using Presidio")

with gr.Row():
    presidio_output = gr.Textbox(
        label="Presidio: De-identified prompt",
        lines=10,
        interactive=True,
    )

# Uses the same prompt box as the FHE flow as its input.
run_presidio_btn.click(
    fn=anonymization_with_presidio,
    inputs=[query_box],
    outputs=[presidio_output],
)
764
 
765
+
766
# Launch the app; share=False means no public share link is requested.
demo.launch(share=False)