mzameshina committed on
Commit
8a75eb3
·
verified ·
1 Parent(s): 92d899b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -532
app.py CHANGED
@@ -13,7 +13,6 @@ import numpy
13
  import pandas as pd
14
  import requests
15
  from fhe_anonymizer import FHEAnonymizer
16
- #from openai import OpenAI
17
  from utils_demo import *
18
 
19
  from concrete.ml.deployment import FHEModelClient
@@ -22,12 +21,10 @@ from models.speech_to_text.transcriber.audio import preprocess_audio
22
  from models.speech_to_text.transcriber.model import load_model_and_processor
23
  from models.speech_to_text.transcriber.audio import transcribe_audio
24
 
25
-
26
  # Ensure the directory is clean before starting processes or reading files
27
  clean_directory()
28
 
29
  anonymizer = FHEAnonymizer()
30
- #client = OpenAI(api_key=os.environ.get("openaikey"))
31
 
32
  # Start the Uvicorn server hosting the FastAPI app
33
  subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
@@ -35,43 +32,16 @@ time.sleep(3)
35
 
36
  # Load data from files required for the application
37
  UUID_MAP = read_json(MAPPING_UUID_PATH)
38
- ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
39
- MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
40
- MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
41
- ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
42
  MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
43
 
44
- print(f"{ORIGINAL_DOCUMENT=}\n")
45
- print(f"{MAPPING_DOC_EMBEDDING.keys()=}")
46
-
47
- # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
48
-
49
- # 5. Utilizing External Services or APIs
50
- # (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
51
-
52
  # Generate a random user ID for this session
53
  USER_ID = numpy.random.randint(0, 2**32)
54
 
55
-
56
- def select_static_anonymized_sentences_fn(selected_sentences: List):
57
-
58
- selected_sentences = [MAPPING_ANONYMIZED_SENTENCES[sentence] for sentence in selected_sentences]
59
-
60
- anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
61
-
62
- anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
63
-
64
- return "\n\n".join(anonymized_selected_sentence)
65
-
66
-
67
  def key_gen_fn() -> Dict:
68
  """Generate keys for a given user."""
69
-
70
  print("------------ Step 1: Key Generation:")
71
-
72
  print(f"Your user ID is: {USER_ID}....")
73
 
74
-
75
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
76
  client.load()
77
 
@@ -84,70 +54,17 @@ def key_gen_fn() -> Dict:
84
 
85
  # Save the evaluation key
86
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
87
-
88
  write_bytes(evaluation_key_path, serialized_evaluation_keys)
89
 
90
- # anonymizer.generate_key()
91
-
92
  if not evaluation_key_path.is_file():
93
- error_message = (
94
- f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
95
- )
96
  print(error_message)
97
  return {gen_key_btn: gr.update(value=error_message)}
98
  else:
99
  print("Keys have been generated ✅")
100
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
101
 
102
-
103
- def encrypt_doc_fn(doc):
104
-
105
- print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
106
-
107
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
108
- return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
109
-
110
- # Retrieve the client API
111
- client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
112
- client.load()
113
-
114
- encrypted_tokens = []
115
- tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|\$\d+(?:\.\d+)?|\€\d+(?:\.\d+)?)", ' '.join(doc))
116
-
117
- for token in tokens:
118
- if token.strip() and re.match(r"\w+", token):
119
- emb_x = MAPPING_DOC_EMBEDDING[token]
120
- assert emb_x.shape == (1, 1024)
121
- encrypted_x = client.quantize_encrypt_serialize(emb_x)
122
- assert isinstance(encrypted_x, bytes)
123
- encrypted_tokens.append(encrypted_x)
124
-
125
- print("Doc encrypted ✅ on Client Side")
126
-
127
- # No need to save it
128
- # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
129
-
130
- encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
131
-
132
- return {
133
- encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
134
- anonymized_doc_output: gr.update(visible=True, value=None),
135
- }
136
-
137
- import presidio_analyzer
138
- import presidio_anonymizer
139
- from presidio_analyzer import AnalyzerEngine
140
- from presidio_anonymizer import AnonymizerEngine
141
-
142
- def anonymization_with_presidio(prompt):
143
- analyzer = AnalyzerEngine()
144
- anonymizer = AnonymizerEngine()
145
- results = analyzer.analyze(text=prompt,language='en')
146
- result = anonymizer.anonymize(text=prompt, analyzer_results=results)
147
- return result.text
148
-
149
  def encrypt_query_fn(query):
150
-
151
  print(f"\n------------ Step 2: Query encryption: {query=}")
152
 
153
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
@@ -156,45 +73,29 @@ def encrypt_query_fn(query):
156
  if is_user_query_valid(query):
157
  return {
158
  query_box: gr.update(
159
- value=(
160
- "Unable to process ❌: The request exceeds the length limit or falls "
161
- "outside the scope of this document. Please refine your query."
162
- )
163
  )
164
  }
165
 
166
- # Retrieve the client API
167
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
168
  client.load()
169
 
170
  encrypted_tokens = []
171
-
172
- # Pattern to identify words and non-words (including punctuation, spaces, etc.)
173
  tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
174
 
175
  for token in tokens:
176
-
177
- # 1- Ignore non-words tokens
178
- if bool(re.match(r"^\s+$", token)):
179
- continue
180
-
181
- # 2- Directly append non-word tokens or whitespace to processed_tokens
182
-
183
- # Prediction for each word
184
- emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
185
- encrypted_x = client.quantize_encrypt_serialize(emb_x)
186
- assert isinstance(encrypted_x, bytes)
187
-
188
- encrypted_tokens.append(encrypted_x)
189
 
190
  print("Data encrypted ✅ on Client Side")
191
 
192
  assert len({len(token) for token in encrypted_tokens}) == 1
193
 
194
  write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
195
- write_bytes(
196
- KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
197
- )
198
 
199
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
200
 
@@ -204,169 +105,76 @@ def encrypt_query_fn(query):
204
  identified_words_output_df: gr.update(visible=False, value=None),
205
  }
206
 
207
-
208
  def send_input_fn(query) -> Dict:
209
- """Send the encrypted data and the evaluation key to the server."""
210
-
211
  print("------------ Step 3.1: Send encrypted_data to the Server")
212
 
213
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
214
  encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
215
  encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
216
 
217
- if not evaluation_key_path.is_file():
218
- error_message = (
219
- "Error Encountered While Sending Data to the Server: "
220
- f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
221
- )
222
- return {anonymized_query_output: gr.update(value=error_message)}
223
-
224
- if not encrypted_input_path.is_file():
225
- error_message = (
226
- "Error Encountered While Sending Data to the Server: The data has not been encrypted "
227
- f"correctly on the client side - {encrypted_input_path.is_file()=}"
228
- )
229
  return {anonymized_query_output: gr.update(value=error_message)}
230
 
231
- # Define the data and files to post
232
  data = {"user_id": USER_ID, "input": query}
233
-
234
  files = [
235
  ("files", open(evaluation_key_path, "rb")),
236
  ("files", open(encrypted_input_path, "rb")),
237
  ("files", open(encrypted_input_len_path, "rb")),
238
  ]
239
 
240
- # Send the encrypted input and evaluation key to the server
241
  url = SERVER_URL + "send_input"
242
-
243
- with requests.post(
244
- url=url,
245
- data=data,
246
- files=files,
247
- ) as resp:
248
  print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
249
 
250
-
251
  def run_fhe_in_server_fn() -> Dict:
252
- """Run in FHE the anonymization of the query"""
253
-
254
  print("------------ Step 3.2: Run in FHE on the Server Side")
255
 
256
- evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
257
- encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
258
-
259
- if not evaluation_key_path.is_file():
260
- error_message = (
261
- "Error Encountered While Sending Data to the Server: "
262
- f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
263
- )
264
- return {anonymized_query_output: gr.update(value=error_message)}
265
-
266
- if not encrypted_input_path.is_file():
267
- error_message = (
268
- "Error Encountered While Sending Data to the Server: The data has not been encrypted "
269
- f"correctly on the client side - {encrypted_input_path.is_file()=}"
270
- )
271
- return {anonymized_query_output: gr.update(value=error_message)}
272
-
273
- data = {
274
- "user_id": USER_ID,
275
- }
276
-
277
  url = SERVER_URL + "run_fhe"
278
 
279
- with requests.post(
280
- url=url,
281
- data=data,
282
- ) as response:
283
  if not response.ok:
284
  return {
285
  anonymized_query_output: gr.update(
286
- value=(
287
- "⚠️ An error occurred on the Server Side. "
288
- "Please check connectivity and data transmission."
289
- ),
290
  ),
291
  }
292
  else:
293
  time.sleep(1)
294
  print(f"The query anonymization was computed in {response.json():.2f} s per token.")
295
 
296
-
297
  def get_output_fn() -> Dict:
298
-
299
  print("------------ Step 3.3: Get the output from the Server Side")
300
 
301
- if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
302
- error_message = (
303
- "Error Encountered While Sending Data to the Server: "
304
- "The key has not been generated correctly"
305
- )
306
- return {anonymized_query_output: gr.update(value=error_message)}
307
-
308
- if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
309
- error_message = (
310
- "Error Encountered While Sending Data to the Server: "
311
- "The data has not been encrypted correctly on the client side"
312
- )
313
- return {anonymized_query_output: gr.update(value=error_message)}
314
-
315
- data = {
316
- "user_id": USER_ID,
317
- }
318
-
319
- # Retrieve the encrypted output
320
  url = SERVER_URL + "get_output"
321
- with requests.post(
322
- url=url,
323
- data=data,
324
- ) as response:
325
  if response.ok:
326
  print("Data received ✅ from the remote Server")
327
  response_data = response.json()
328
- encrypted_output_base64 = response_data["encrypted_output"]
329
- length_encrypted_output_base64 = response_data["length"]
330
-
331
- # Decode the base64 encoded data
332
- encrypted_output = base64.b64decode(encrypted_output_base64)
333
- length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
334
-
335
- # Save the encrypted output to bytes in a file as it is too large to pass through
336
- # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
337
 
338
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
339
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
340
-
341
  else:
342
- print("Error ❌ in getting data to the server")
343
-
344
 
345
  def decrypt_fn(text) -> Dict:
346
- """Dencrypt the data on the `Client Side`."""
347
 
348
- print("------------ Step 4: Dencrypt the data on the `Client Side`")
349
-
350
- # Get the encrypted output path
351
  encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
352
 
353
  if not encrypted_output_path.is_file():
354
- error_message = """⚠️ Please ensure that: \n
355
- - the connectivity \n
356
- - the query has been submitted \n
357
- - the evaluation key has been generated \n
358
- - the server processed the encrypted data \n
359
- - the Client received the data from the Server before decrypting the prediction
360
- """
361
  print(error_message)
362
-
363
  return error_message, None
364
 
365
- # Retrieve the client API
366
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
367
  client.load()
368
 
369
- # Load the encrypted output as bytes
370
  encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
371
  length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
372
 
@@ -376,11 +184,7 @@ def decrypt_fn(text) -> Dict:
376
 
377
  i = 0
378
  for token in tokens:
379
-
380
- # Directly append non-word tokens or whitespace to processed_tokens
381
- if bool(re.match(r"^\s+$", token)):
382
- continue
383
- else:
384
  encrypted_token = encrypted_output[i : i + length]
385
  prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
386
  probability = prediction_proba[0][1]
@@ -388,380 +192,102 @@ def decrypt_fn(text) -> Dict:
388
 
389
  if probability >= 0.77:
390
  identified_words_with_prob.append((token, probability))
391
-
392
- # Use the existing UUID if available, otherwise generate a new one
393
  tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
394
  decrypted_output.append(tmp_uuid)
395
  UUID_MAP[token] = tmp_uuid
396
  else:
397
  decrypted_output.append(token)
398
 
399
- # Update the UUID map with query.
400
- write_json(MAPPING_UUID_PATH, UUID_MAP)
401
 
402
- # Removing Spaces Before Punctuation:
403
  anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
404
 
405
- # Convert the list of identified words and probabilities into a DataFrame
406
- if identified_words_with_prob:
407
- identified_df = pd.DataFrame(
408
- identified_words_with_prob, columns=["Identified Words", "Probability"]
409
- )
410
- else:
411
- identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
412
 
413
  print("Decryption done ✅ on Client Side")
414
 
415
  return anonymized_text, identified_df
416
 
417
-
418
- def anonymization_with_fn(selected_sentences, query):
419
-
420
  encrypt_query_fn(query)
421
-
422
  send_input_fn(query)
423
-
424
  run_fhe_in_server_fn()
425
-
426
  get_output_fn()
427
-
428
  anonymized_text, identified_df = decrypt_fn(query)
429
 
430
  return {
431
- anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
432
  anonymized_query_output: gr.update(value=anonymized_text),
433
- identified_words_output_df: gr.update(value=identified_df, visible=False),
434
  }
435
 
436
- # Define the folder path containing audio files
437
- AUDIO_FOLDER_PATH = "./files/"
438
-
439
- # Function to list available audio files in the folder
440
- def get_audio_files():
441
- files = [f for f in os.listdir(AUDIO_FOLDER_PATH) if f.endswith(('.wav', '.mp3'))]
442
- return files
443
-
444
- # Step 1: Load and display audio file
445
- def load_audio_file(selected_audio):
446
- file_path = os.path.join(AUDIO_FOLDER_PATH, selected_audio)
447
- return file_path
448
-
449
- # Step 1.1: Record and save the audio file
450
- def save_recorded_audio(audio):
451
- file_path = os.path.join(AUDIO_FOLDER_PATH, "recorded_audio.wav")
452
- audio.export(file_path, format="wav") # Save the audio as a .wav file
453
- return file_path
454
-
455
- def click_js():
456
- return """function audioRecord() {
457
- var xPathRes = document.evaluate ('//*[@id="audio"]//button', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
458
- xPathRes.singleNodeValue.click();}"""
459
-
460
-
461
- def action(btn):
462
- """Changes button text on click"""
463
- if btn == 'Speak':
464
- return 'Stop'
465
- else:
466
- return 'Speak'
467
-
468
-
469
- def check_btn(btn):
470
- """Checks for correct button text before invoking transcribe()"""
471
- if btn != 'Speak':
472
- raise Exception('Recording...')
473
-
474
-
475
- def transcribe():
476
- return 'Success'
477
-
478
-
479
- def transcribe_audio_app(audio_path):
480
- # Prétraitement de l'audio
481
- audio = preprocess_audio(audio_path)
482
-
483
- # Chargement du modèle
484
- model,processor = load_model_and_processor(model_name="openai/whisper-base")
485
-
486
- # Transcription
487
- transcription = transcribe_audio(model=model,processor=processor,audio=audio)
488
-
489
- return transcription
490
-
491
-
492
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
493
 
494
  with demo:
495
-
496
-
497
-
498
- gr.Markdown(
499
- """
500
- <p align="center">
501
- <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
502
- </p>
503
- """)
504
-
505
  gr.Markdown(
506
  """
507
- <h1 style="text-align: center;">Secure De-Identification of Audio Files</h1>
508
- <!--
509
- <p align="center">
510
- <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
511
-
512
- <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
513
-
514
- <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
515
-
516
- <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
517
- </p>
518
- -->
519
  """
520
  )
521
 
522
  gr.Markdown(
523
  """
524
  <p align="center" style="font-size: 18px;">
525
- Protecting personal data is more important than ever in today’s digital world. <b>Our project ensures privacy-preserving de-identification of audio data</b> using state-of-the-art <b>Fully Homomorphic Encryption (FHE)</b>, offering a secure and transparent solution for data anonymization.
526
- </p>
527
-
528
- <p align="center" style="font-size: 18px;">
529
- Traditional methods of de-identification often fall short of true anonymization, merely concealing identifiable information. With FHE, we go beyond obfuscation to provide <b>complete security,</b> allowing computations to be performed directly on encrypted data without ever exposing sensitive details.
530
- </p>
531
-
532
- <p align="center" style="font-size: 18px;">
533
- This technology is crucial in enabling organizations to use and share sensitive data responsibly, while fully respecting individual privacy.
534
  </p>
535
  """
536
  )
537
 
538
-
539
- # Step 1: Add an audio file
540
- gr.Markdown("## Step 1: Add an Audio File")
541
- audio_files = get_audio_files()
542
-
543
- with gr.Row():
544
- audio_file_dropdown = gr.Dropdown(audio_files, label="Select an Audio File", interactive=True)
545
- audio_output = gr.Audio(label="Selected Audio", type="filepath")
546
-
547
- # When an audio file is selected, it will display the file path
548
- audio_file_dropdown.change(fn=load_audio_file, inputs=[audio_file_dropdown], outputs=[audio_output])
549
-
550
- with gr.Row():
551
- transcribe_btn = gr.Button("Transcrire l'audio")
552
- transcription_output = gr.Textbox(label="Transcription", lines=5)
553
-
554
- transcribe_btn.click(
555
- fn=transcribe_audio_app,
556
- inputs=[audio_output],
557
- outputs=[transcription_output]
558
- )
559
-
560
-
561
-
562
-
563
- ########################## Step 1.1: Record Audio ##########################
564
-
565
- gr.Markdown("## Step 1.1: Record an Audio File")
566
- """
567
- with gr.Row():
568
- audio_recorder = gr.Audio(source="microphone", type="file", label="Record Audio")
569
- record_output = gr.Audio(label="Recorded Audio", type="filepath")
570
- # When the user records an audio, save it
571
- audio_recorder.change(fn=save_recorded_audio, inputs=[audio_recorder], outputs=[record_output])
572
- gen_key_btn = gr.Button("Generate the secret and evaluation keys")
573
- gen_key_btn.click(
574
- key_gen_fn,
575
- inputs=[],
576
- outputs=[gen_key_btn],
577
- ) """
578
-
579
- msg = gr.Textbox()
580
-
581
- audio_box = gr.Audio(label="Audio", type="filepath", elem_id='audio')
582
-
583
- with gr.Row():
584
- audio_btn = gr.Button('Speak')
585
- clear = gr.Button("Clear")
586
-
587
- audio_btn.click(fn=action, inputs=audio_btn, outputs=audio_btn) \
588
- .then(fn=check_btn, inputs=audio_btn) \
589
- .success(fn=transcribe_audio_app, outputs=msg)
590
-
591
- clear.click(lambda: None, None, msg, queue=False)
592
-
593
- ########################## Transcription ##########################
594
- with gr.Row():
595
- transcribe_btn = gr.Button("Transcrire l'audio")
596
- transcription_output = gr.Textbox(label="Transcription", lines=5)
597
-
598
- transcribe_btn.click(
599
- fn=transcribe_audio_app,
600
- inputs=[audio_output],
601
- outputs=[transcription_output]
602
- )
603
-
604
  ########################## Key Gen Part ##########################
605
-
606
  gr.Markdown(
607
- "## Step 1.2: Generate the keys\n\n"
608
- """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created. The first
609
- type, called secret keys, are used to encrypt and decrypt the user's data. The second type,
610
- called evaluation keys, enables a server to work on the encrypted data without seeing the
611
- actual data.
612
- """
613
  )
614
 
615
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
 
616
 
617
- gen_key_btn.click(
618
- key_gen_fn,
619
- inputs=[],
620
- outputs=[gen_key_btn],
 
 
 
621
  )
622
 
623
- ########################## Main document Part ##########################
624
-
625
- gr.Markdown("<hr />")
626
- gr.Markdown("## Step 2.1: Select the document you want to encrypt\n\n"
627
- """To make it simple, we pre-compiled the following document, but you are free to choose
628
- on which part you want to run this example.
629
- """
630
  )
631
 
632
- with gr.Row():
633
- with gr.Column(scale=5):
634
- original_sentences_box = gr.CheckboxGroup(
635
- ORIGINAL_DOCUMENT,
636
- value=ORIGINAL_DOCUMENT,
637
- label="Contract:",
638
- show_label=True,
639
- )
640
-
641
- with gr.Column(scale=1, min_width=6):
642
- gr.HTML("<div style='height: 77px;'></div>")
643
- encrypt_doc_btn = gr.Button("Encrypt the document")
644
-
645
- with gr.Column(scale=5):
646
- encrypted_doc_box = gr.Textbox(
647
- label="Encrypted document:", show_label=True, interactive=False, lines=10
648
- )
649
-
650
-
651
- ########################## User Query Part ##########################
652
-
653
- gr.Markdown("<hr />")
654
- gr.Markdown("## Step 2.2: Select the prompt you want to encrypt\n\n"
655
- """Please choose from the predefined options in
656
- <span style='color:grey'>“Prompt examples”</span> or craft a custom question in
657
- the <span style='color:grey'>“Customized prompt”</span> text box.
658
- Remain concise and relevant to the context. Any off-topic query will not be processed.""")
659
-
660
- with gr.Row():
661
- with gr.Column(scale=5):
662
-
663
- with gr.Column(scale=5):
664
- default_query_box = gr.Dropdown(
665
- list(DEFAULT_QUERIES.values()), label="PROMPT EXAMPLES:"
666
- )
667
-
668
- gr.Markdown("Or")
669
-
670
- query_box = gr.Textbox(
671
- value=" Hello. My name is Inuitvementoya. You kill my father. Prepare to die.", label="CUSTOMIZED PROMPT:", interactive=True
672
- )
673
-
674
- default_query_box.change(
675
- fn=lambda default_query_box: default_query_box,
676
- inputs=[default_query_box],
677
- outputs=[query_box],
678
- )
679
-
680
- with gr.Column(scale=1, min_width=6):
681
- gr.HTML("<div style='height: 77px;'></div>")
682
- encrypt_query_btn = gr.Button("Encrypt the prompt")
683
- # gr.HTML("<div style='height: 50px;'></div>")
684
-
685
- with gr.Column(scale=5):
686
- output_encrypted_box = gr.Textbox(
687
- label="Encrypted de-identified query that will be sent to the de-identification server:",
688
- lines=8,
689
- )
690
 
691
  ########################## FHE processing Part ##########################
692
-
693
- gr.Markdown("<hr />")
694
- gr.Markdown("## Step 3: De-identify the document and the prompt using FHE")
695
  gr.Markdown(
696
- """Once the client encrypts the document and the prompt locally, it will be sent to a remote
697
- server to perform the de-identification on encrypted data. When the computation is done, the
698
- server will return the result to the client for decryption."""
699
-
700
  )
701
 
702
  run_fhe_btn = gr.Button("De-identify using FHE")
703
-
704
- with gr.Row():
705
- with gr.Column(scale=5):
706
-
707
- anonymized_doc_output = gr.Textbox(
708
- label="Decrypted and de-idenntified document", lines=10, interactive=True
709
- )
710
-
711
- with gr.Column(scale=5):
712
-
713
- anonymized_query_output = gr.Textbox(
714
- label="Decrypted and de-identified prompt", lines=10, interactive=True
715
- )
716
-
717
-
718
- identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
719
-
720
- encrypt_doc_btn.click(
721
- fn=encrypt_doc_fn,
722
- inputs=[original_sentences_box],
723
- outputs=[encrypted_doc_box, anonymized_doc_output],
724
- )
725
-
726
- encrypt_query_btn.click(
727
- fn=encrypt_query_fn,
728
- inputs=[query_box],
729
- outputs=[
730
- query_box,
731
- output_encrypted_box,
732
- anonymized_query_output,
733
- identified_words_output_df,
734
- ],
735
  )
 
736
 
737
  run_fhe_btn.click(
738
  anonymization_with_fn,
739
- inputs=[original_sentences_box, query_box],
740
- outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
741
- )
742
-
743
-
744
- ########################## Presidio ##########################
745
- gr.Markdown("<hr />")
746
- gr.Markdown("## Step 3: De-identify the document and the prompt")
747
- gr.Markdown(
748
- """This step will demonstrate de-identification using both FHE and Presidio methods.
749
- The same prompt will be used for both to allow for direct comparison.""")
750
-
751
- with gr.Row():
752
- run_presidio_btn = gr.Button("De-identify using Presidio")
753
-
754
- with gr.Row():
755
- presidio_output = gr.Textbox(
756
- label="Presidio: De-identified prompt", lines=10, interactive=True
757
- )
758
-
759
- run_presidio_btn.click(
760
- anonymization_with_presidio,
761
  inputs=[query_box],
762
- outputs=[presidio_output],
763
  )
764
 
765
-
766
  # Launch the app
767
  demo.launch(share=False)
 
13
  import pandas as pd
14
  import requests
15
  from fhe_anonymizer import FHEAnonymizer
 
16
  from utils_demo import *
17
 
18
  from concrete.ml.deployment import FHEModelClient
 
21
  from models.speech_to_text.transcriber.model import load_model_and_processor
22
  from models.speech_to_text.transcriber.audio import transcribe_audio
23
 
 
24
  # Ensure the directory is clean before starting processes or reading files
25
  clean_directory()
26
 
27
  anonymizer = FHEAnonymizer()
 
28
 
29
  # Start the Uvicorn server hosting the FastAPI app
30
  subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
 
32
 
33
  # Load data from files required for the application
34
  UUID_MAP = read_json(MAPPING_UUID_PATH)
 
 
 
 
35
  MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
36
 
 
 
 
 
 
 
 
 
37
  # Generate a random user ID for this session
38
  USER_ID = numpy.random.randint(0, 2**32)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def key_gen_fn() -> Dict:
41
  """Generate keys for a given user."""
 
42
  print("------------ Step 1: Key Generation:")
 
43
  print(f"Your user ID is: {USER_ID}....")
44
 
 
45
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
46
  client.load()
47
 
 
54
 
55
  # Save the evaluation key
56
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
 
57
  write_bytes(evaluation_key_path, serialized_evaluation_keys)
58
 
 
 
59
  if not evaluation_key_path.is_file():
60
+ error_message = f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
 
 
61
  print(error_message)
62
  return {gen_key_btn: gr.update(value=error_message)}
63
  else:
64
  print("Keys have been generated ✅")
65
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def encrypt_query_fn(query):
 
68
  print(f"\n------------ Step 2: Query encryption: {query=}")
69
 
70
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
 
73
  if is_user_query_valid(query):
74
  return {
75
  query_box: gr.update(
76
+ value="Unable to process ❌: The request exceeds the length limit or falls outside the scope. Please refine your query."
 
 
 
77
  )
78
  }
79
 
 
80
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
81
  client.load()
82
 
83
  encrypted_tokens = []
 
 
84
  tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
85
 
86
  for token in tokens:
87
+ if not bool(re.match(r"^\s+$", token)):
88
+ emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
89
+ encrypted_x = client.quantize_encrypt_serialize(emb_x)
90
+ assert isinstance(encrypted_x, bytes)
91
+ encrypted_tokens.append(encrypted_x)
 
 
 
 
 
 
 
 
92
 
93
  print("Data encrypted ✅ on Client Side")
94
 
95
  assert len({len(token) for token in encrypted_tokens}) == 1
96
 
97
  write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
98
+ write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big"))
 
 
99
 
100
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
101
 
 
105
  identified_words_output_df: gr.update(visible=False, value=None),
106
  }
107
 
 
108
  def send_input_fn(query) -> Dict:
 
 
109
  print("------------ Step 3.1: Send encrypted_data to the Server")
110
 
111
  evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
112
  encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
113
  encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
114
 
115
+ if not evaluation_key_path.is_file() or not encrypted_input_path.is_file():
116
+ error_message = "Error: Key or encrypted input not found. Please generate the key and encrypt the query first."
 
 
 
 
 
 
 
 
 
 
117
  return {anonymized_query_output: gr.update(value=error_message)}
118
 
 
119
  data = {"user_id": USER_ID, "input": query}
 
120
  files = [
121
  ("files", open(evaluation_key_path, "rb")),
122
  ("files", open(encrypted_input_path, "rb")),
123
  ("files", open(encrypted_input_len_path, "rb")),
124
  ]
125
 
 
126
  url = SERVER_URL + "send_input"
127
+ with requests.post(url=url, data=data, files=files) as resp:
 
 
 
 
 
128
  print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
129
 
 
130
  def run_fhe_in_server_fn() -> Dict:
 
 
131
  print("------------ Step 3.2: Run in FHE on the Server Side")
132
 
133
+ data = {"user_id": USER_ID}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  url = SERVER_URL + "run_fhe"
135
 
136
+ with requests.post(url=url, data=data) as response:
 
 
 
137
  if not response.ok:
138
  return {
139
  anonymized_query_output: gr.update(
140
+ value="⚠️ An error occurred on the Server Side. Please check connectivity and data transmission."
 
 
 
141
  ),
142
  }
143
  else:
144
  time.sleep(1)
145
  print(f"The query anonymization was computed in {response.json():.2f} s per token.")
146
 
 
147
  def get_output_fn() -> Dict:
 
148
  print("------------ Step 3.3: Get the output from the Server Side")
149
 
150
+ data = {"user_id": USER_ID}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  url = SERVER_URL + "get_output"
152
+
153
+ with requests.post(url=url, data=data) as response:
 
 
154
  if response.ok:
155
  print("Data received ✅ from the remote Server")
156
  response_data = response.json()
157
+ encrypted_output = base64.b64decode(response_data["encrypted_output"])
158
+ length_encrypted_output = base64.b64decode(response_data["length"])
 
 
 
 
 
 
 
159
 
160
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
161
  write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
 
162
  else:
163
+ print("Error ❌ in getting data from the server")
 
164
 
165
  def decrypt_fn(text) -> Dict:
166
+ print("------------ Step 4: Decrypt the data on the `Client Side`")
167
 
 
 
 
168
  encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
169
 
170
  if not encrypted_output_path.is_file():
171
+ error_message = "⚠️ Error: Encrypted output not found. Please ensure the entire process has been completed."
 
 
 
 
 
 
172
  print(error_message)
 
173
  return error_message, None
174
 
 
175
  client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
176
  client.load()
177
 
 
178
  encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
179
  length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
180
 
 
184
 
185
  i = 0
186
  for token in tokens:
187
+ if not bool(re.match(r"^\s+$", token)):
 
 
 
 
188
  encrypted_token = encrypted_output[i : i + length]
189
  prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
190
  probability = prediction_proba[0][1]
 
192
 
193
  if probability >= 0.77:
194
  identified_words_with_prob.append((token, probability))
 
 
195
  tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
196
  decrypted_output.append(tmp_uuid)
197
  UUID_MAP[token] = tmp_uuid
198
  else:
199
  decrypted_output.append(token)
200
 
201
+ write_json(MAPPING_UUID_PATH, UUID_MAP)
 
202
 
 
203
  anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
204
 
205
+ identified_df = pd.DataFrame(
206
+ identified_words_with_prob, columns=["Identified Words", "Probability"]
207
+ ) if identified_words_with_prob else pd.DataFrame(columns=["Identified Words", "Probability"])
 
 
 
 
208
 
209
  print("Decryption done ✅ on Client Side")
210
 
211
  return anonymized_text, identified_df
212
 
213
def anonymization_with_fn(query):
    """Run the full encrypt / send / compute / fetch / decrypt pipeline.

    Args:
        query: The plain-text prompt to de-identify.

    Returns:
        A Gradio update dict. If sending the input or running FHE on the
        server reports an error, that step's error update is returned as is
        and the remaining steps are skipped. Otherwise the dict carries the
        de-identified text and the table of identified words.
    """
    encrypt_query_fn(query)

    # send_input_fn returns an update dict only when it fails; stop there so a
    # leftover output file from an earlier run is not decrypted and displayed.
    send_error = send_input_fn(query)
    if send_error:
        return send_error

    # Same contract for the server-side FHE run.
    run_error = run_fhe_in_server_fn()
    if run_error:
        return run_error

    get_output_fn()
    anonymized_text, identified_df = decrypt_fn(query)

    return {
        anonymized_query_output: gr.update(value=anonymized_text),
        # decrypt_fn returns None for the table when the encrypted output is
        # missing; keep the dataframe hidden in that case.
        identified_words_output_df: gr.update(
            value=identified_df, visible=identified_df is not None
        ),
    }
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
226
 
227
  with demo:
 
 
 
 
 
 
 
 
 
 
228
  gr.Markdown(
229
  """
230
+ <h1 style="text-align: center;">Secure De-Identification of Text Data using FHE</h1>
 
 
 
 
 
 
 
 
 
 
 
231
  """
232
  )
233
 
234
  gr.Markdown(
235
  """
236
  <p align="center" style="font-size: 18px;">
237
+ This demo showcases privacy-preserving de-identification of text data using Fully Homomorphic Encryption (FHE).
 
 
 
 
 
 
 
 
238
  </p>
239
  """
240
  )
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  ########################## Key Gen Part ##########################
 
243
  gr.Markdown(
244
+ "## Step 1: Generate the keys\n\n"
245
+ """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created: secret keys for encrypting and decrypting user data,
246
+ and evaluation keys for the server to work on encrypted data without seeing the actual content."""
 
 
 
247
  )
248
 
249
  gen_key_btn = gr.Button("Generate the secret and evaluation keys")
250
+ gen_key_btn.click(key_gen_fn, inputs=[], outputs=[gen_key_btn])
251
 
252
+ ########################## User Query Part ##########################
253
+ gr.Markdown("## Step 2: Enter the prompt you want to encrypt and de-identify")
254
+
255
+ query_box = gr.Textbox(
256
+ value="Hello. My name is John Doe. I live at 123 Main St, Anytown, USA.",
257
+ label="Enter your prompt:",
258
+ interactive=True
259
  )
260
 
261
+ encrypt_query_btn = gr.Button("Encrypt the prompt")
262
+ output_encrypted_box = gr.Textbox(
263
+ label="Encrypted prompt (will be sent to the de-identification server):",
264
+ lines=4,
 
 
 
265
  )
266
 
267
+ encrypt_query_btn.click(
268
+ fn=encrypt_query_fn,
269
+ inputs=[query_box],
270
+ outputs=[query_box, output_encrypted_box],
271
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  ########################## FHE processing Part ##########################
274
+ gr.Markdown("## Step 3: De-identify the prompt using FHE")
 
 
275
  gr.Markdown(
276
+ """The encrypted prompt will be sent to a remote server for de-identification using FHE.
277
+ The server performs computations on the encrypted data and returns the result for decryption."""
 
 
278
  )
279
 
280
  run_fhe_btn = gr.Button("De-identify using FHE")
281
+ anonymized_query_output = gr.Textbox(
282
+ label="De-identified prompt", lines=4, interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  )
284
+ identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
285
 
286
  run_fhe_btn.click(
287
  anonymization_with_fn,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  inputs=[query_box],
289
+ outputs=[anonymized_query_output, identified_words_output_df],
290
  )
291
 
 
292
  # Launch the app
293
  demo.launch(share=False)