Zamanonymize3

Sleeping

App Files Files Community

mzameshina committed on Sep 28, 2024

Commit

7519b8e

verified ·

1 Parent(s): eb37260

Update app.py

Browse files

Files changed (1) hide show

app.py +532 -58

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ import numpy
 import pandas as pd
 import requests
 from fhe_anonymizer import FHEAnonymizer
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
@@ -21,10 +22,12 @@ from models.speech_to_text.transcriber.audio import preprocess_audio
 from models.speech_to_text.transcriber.model import load_model_and_processor
 from models.speech_to_text.transcriber.audio import transcribe_audio
 # Ensure the directory is clean before starting processes or reading files
 clean_directory()
 anonymizer = FHEAnonymizer()
 # Start the Uvicorn server hosting the FastAPI app
 subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
@@ -32,16 +35,43 @@ time.sleep(3)
 # Load data from files required for the application
 UUID_MAP = read_json(MAPPING_UUID_PATH)
 MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
 # Generate a random user ID for this session
 USER_ID = numpy.random.randint(0, 2**32)
 def key_gen_fn() -> Dict:
     """Generate keys for a given user."""
     print("------------ Step 1: Key Generation:")
     print(f"Your user ID is: {USER_ID}....")
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
@@ -54,17 +84,70 @@ def key_gen_fn() -> Dict:
     # Save the evaluation key
     evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
     write_bytes(evaluation_key_path, serialized_evaluation_keys)
     if not evaluation_key_path.is_file():
-        error_message = f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
         print(error_message)
         return {gen_key_btn: gr.update(value=error_message)}
     else:
         print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
 def encrypt_query_fn(query):
     print(f"\n------------ Step 2: Query encryption: {query=}")
     if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
@@ -73,29 +156,45 @@ def encrypt_query_fn(query):
     if is_user_query_valid(query):
         return {
             query_box: gr.update(
-                value="Unable to process ❌: The request exceeds the length limit or falls outside the scope. Please refine your query."
             )
         }
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
     encrypted_tokens = []
     tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
     for token in tokens:
-        if not bool(re.match(r"^\s+$", token)):
-            emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
-            encrypted_x = client.quantize_encrypt_serialize(emb_x)
-            assert isinstance(encrypted_x, bytes)
-            encrypted_tokens.append(encrypted_x)
     print("Data encrypted ✅ on Client Side")
     assert len({len(token) for token in encrypted_tokens}) == 1
     write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
-    write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big"))
     encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
@@ -105,76 +204,169 @@ def encrypt_query_fn(query):
         identified_words_output_df: gr.update(visible=False, value=None),
     }
 def send_input_fn(query) -> Dict:
     print("------------ Step 3.1: Send encrypted_data to the Server")
     evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
     encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
     encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
-    if not evaluation_key_path.is_file() or not encrypted_input_path.is_file():
-        error_message = "Error: Key or encrypted input not found. Please generate the key and encrypt the query first."
         return {anonymized_query_output: gr.update(value=error_message)}
     data = {"user_id": USER_ID, "input": query}
     files = [
         ("files", open(evaluation_key_path, "rb")),
         ("files", open(encrypted_input_path, "rb")),
         ("files", open(encrypted_input_len_path, "rb")),
     ]
     url = SERVER_URL + "send_input"
-    with requests.post(url=url, data=data, files=files) as resp:
         print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
 def run_fhe_in_server_fn() -> Dict:
     print("------------ Step 3.2: Run in FHE on the Server Side")
-    data = {"user_id": USER_ID}
     url = SERVER_URL + "run_fhe"
-    with requests.post(url=url, data=data) as response:
         if not response.ok:
             return {
                 anonymized_query_output: gr.update(
-                    value="⚠️ An error occurred on the Server Side. Please check connectivity and data transmission."
                 ),
             }
         else:
             time.sleep(1)
             print(f"The query anonymization was computed in {response.json():.2f} s per token.")
 def get_output_fn() -> Dict:
     print("------------ Step 3.3: Get the output from the Server Side")
-    data = {"user_id": USER_ID}
-    url = SERVER_URL + "get_output"
-    with requests.post(url=url, data=data) as response:
         if response.ok:
             print("Data received ✅ from the remote Server")
             response_data = response.json()
-            encrypted_output = base64.b64decode(response_data["encrypted_output"])
-            length_encrypted_output = base64.b64decode(response_data["length"])
             write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
             write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
         else:
-            print("Error ❌ in getting data from the server")
 def decrypt_fn(text) -> Dict:
-    print("------------ Step 4: Decrypt the data on the `Client Side`")
     encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
     if not encrypted_output_path.is_file():
-        error_message = "⚠️ Error: Encrypted output not found. Please ensure the entire process has been completed."
         print(error_message)
         return error_message, None
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
     encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
     length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
@@ -184,7 +376,11 @@ def decrypt_fn(text) -> Dict:
     i = 0
     for token in tokens:
-        if not bool(re.match(r"^\s+$", token)):
             encrypted_token = encrypted_output[i : i + length]
             prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
             probability = prediction_proba[0][1]
@@ -192,102 +388,380 @@ def decrypt_fn(text) -> Dict:
             if probability >= 0.77:
                 identified_words_with_prob.append((token, probability))
                 tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
                 decrypted_output.append(tmp_uuid)
                 UUID_MAP[token] = tmp_uuid
             else:
                 decrypted_output.append(token)
-    write_json(MAPPING_UUID_PATH, UUID_MAP)
     anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
-    identified_df = pd.DataFrame(
-        identified_words_with_prob, columns=["Identified Words", "Probability"]
-    ) if identified_words_with_prob else pd.DataFrame(columns=["Identified Words", "Probability"])
     print("Decryption done ✅ on Client Side")
     return anonymized_text, identified_df
-def anonymization_with_fn(query):
     encrypt_query_fn(query)
     send_input_fn(query)
     run_fhe_in_server_fn()
     get_output_fn()
     anonymized_text, identified_df = decrypt_fn(query)
     return {
         anonymized_query_output: gr.update(value=anonymized_text),
-        identified_words_output_df: gr.update(value=identified_df, visible=True),
     }
 demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
 with demo:
     gr.Markdown(
         """
-        <h1 style="text-align: center;">Secure De-Identification of Text Data using FHE</h1>
         """
     )
     gr.Markdown(
         """
         <p align="center" style="font-size: 18px;">
-            This demo showcases privacy-preserving de-identification of text data using Fully Homomorphic Encryption (FHE).
         </p>
         """
     )
-    ########################## Key Gen Part ##########################
-    gr.Markdown(
-        "## Step 1: Generate the keys\n\n"
-        """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created: secret keys for encrypting and decrypting user data,
-        and evaluation keys for the server to work on encrypted data without seeing the actual content."""
     )
     gen_key_btn = gr.Button("Generate the secret and evaluation keys")
-    gen_key_btn.click(key_gen_fn, inputs=[], outputs=[gen_key_btn])
-    ########################## User Query Part ##########################
-    gr.Markdown("## Step 2: Enter the prompt you want to encrypt and de-identify")
-    query_box = gr.Textbox(
-        value="Hello. My name is John Doe. I live at 123 Main St, Anytown, USA.",
-        label="Enter your prompt:",
-        interactive=True
     )
-    encrypt_query_btn = gr.Button("Encrypt the prompt")
-    output_encrypted_box = gr.Textbox(
-        label="Encrypted prompt (will be sent to the de-identification server):",
-        lines=4,
     )
-    encrypt_query_btn.click(
-        fn=encrypt_query_fn,
-        inputs=[query_box],
-        outputs=[query_box, output_encrypted_box],
     )
     ########################## FHE processing Part ##########################
-    gr.Markdown("## Step 3: De-identify the prompt using FHE")
     gr.Markdown(
-        """The encrypted prompt will be sent to a remote server for de-identification using FHE.
-        The server performs computations on the encrypted data and returns the result for decryption."""
     )
     run_fhe_btn = gr.Button("De-identify using FHE")
-    anonymized_query_output = gr.Textbox(
-        label="De-identified prompt", lines=4, interactive=True
-    )
     identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
     run_fhe_btn.click(
         anonymization_with_fn,
         inputs=[query_box],
-        outputs=[anonymized_query_output, identified_words_output_df],
     )
 # Launch the app
 demo.launch(share=False)

 import pandas as pd
 import requests
 from fhe_anonymizer import FHEAnonymizer
+#from openai import OpenAI
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
 from models.speech_to_text.transcriber.model import load_model_and_processor
 from models.speech_to_text.transcriber.audio import transcribe_audio
 # Ensure the directory is clean before starting processes or reading files
 clean_directory()
 anonymizer = FHEAnonymizer()
+#client = OpenAI(api_key=os.environ.get("openaikey"))
 # Start the Uvicorn server hosting the FastAPI app
 subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
 # Load data from files required for the application
 UUID_MAP = read_json(MAPPING_UUID_PATH)
+ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
+MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
+MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
+ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
 MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
+print(f"{ORIGINAL_DOCUMENT=}\n")
+print(f"{MAPPING_DOC_EMBEDDING.keys()=}")
+# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
+# 5. Utilizing External Services or APIs
+# (Assuming client initialization and anonymizer setup are parts of using external services or application-specific logic)
 # Generate a random user ID for this session
 USER_ID = numpy.random.randint(0, 2**32)
def select_static_anonymized_sentences_fn(selected_sentences: List):
    """Return the pre-computed anonymized text for the selected sentences.

    Every selected sentence is looked up in ``MAPPING_ANONYMIZED_SENTENCES``. The mapped
    values are treated as two-element pairs: they are ordered by their first element and
    the second elements are joined with a blank line between them.
    """
    ranked_pairs = []
    for original_sentence in selected_sentences:
        ranked_pairs.append(MAPPING_ANONYMIZED_SENTENCES[original_sentence])

    # Stable sort on the first element of each pair only
    ranked_pairs.sort(key=lambda pair: pair[0])

    return "\n\n".join(anonymized for _, anonymized in ranked_pairs)
 def key_gen_fn() -> Dict:
     """Generate keys for a given user."""
     print("------------ Step 1: Key Generation:")
     print(f"Your user ID is: {USER_ID}....")
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
     # Save the evaluation key
     evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
     write_bytes(evaluation_key_path, serialized_evaluation_keys)
+    # anonymizer.generate_key()
     if not evaluation_key_path.is_file():
+        error_message = (
+            f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
+        )
         print(error_message)
         return {gen_key_btn: gr.update(value=error_message)}
     else:
         print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
def encrypt_doc_fn(doc):
    """Encrypt the word tokens of the selected document on the client side.

    The sentences in ``doc`` are joined with spaces and split into tokens by a regular
    expression. Each token that starts with a word character has its embedding read from
    ``MAPPING_DOC_EMBEDDING`` and is quantized, encrypted and serialized by the FHE client.

    Returns:
        A dict of Gradio updates: a short hex excerpt of every encrypted token for
        ``encrypted_doc_box`` and a reset of ``anonymized_doc_output``. If the evaluation
        key file does not exist, only an error message for ``encrypted_doc_box``.
    """
    print(f"\n------------ Step 2.1: Doc encryption: {doc=}")

    key_file = KEYS_DIR / f"{USER_ID}/evaluation_key"
    if not key_file.is_file():
        return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}

    # Retrieve the client API
    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
    client.load()

    joined_doc = ' '.join(doc)
    pieces = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|\$\d+(?:\.\d+)?|\€\d+(?:\.\d+)?)", joined_doc)

    ciphertexts = []
    for piece in pieces:
        # Skip blank pieces and pieces that do not begin with a word character
        if not piece.strip() or not re.match(r"\w+", piece):
            continue
        embedding = MAPPING_DOC_EMBEDDING[piece]
        assert embedding.shape == (1, 1024)
        ciphertext = client.quantize_encrypt_serialize(embedding)
        assert isinstance(ciphertext, bytes)
        ciphertexts.append(ciphertext)

    print("Doc encrypted ✅ on Client Side")

    # The encrypted document is only displayed, not written to disk:
    # show ten hex characters taken from each serialized token.
    return {
        encrypted_doc_box: gr.update(
            value=" ".join(blob.hex()[500:510] for blob in ciphertexts), lines=10
        ),
        anonymized_doc_output: gr.update(visible=True, value=None),
    }
+import presidio_analyzer
+import presidio_anonymizer
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
# Presidio engines are built once and reused: constructing them on every call repeats
# their (costly) initialisation for no benefit.
_PRESIDIO_ENGINES = []


def _get_presidio_engines():
    """Return the shared ``(AnalyzerEngine, AnonymizerEngine)`` pair, creating it on first use."""
    if not _PRESIDIO_ENGINES:
        _PRESIDIO_ENGINES.extend((AnalyzerEngine(), AnonymizerEngine()))
    return _PRESIDIO_ENGINES[0], _PRESIDIO_ENGINES[1]


def anonymization_with_presidio(prompt):
    """Anonymize ``prompt`` with Presidio and return the anonymized text.

    Args:
        prompt: The English text to analyze.

    Returns:
        The ``text`` attribute of the Presidio anonymization result.
    """
    # Local names deliberately differ from the module-level ``anonymizer`` object.
    analyzer_engine, anonymizer_engine = _get_presidio_engines()
    findings = analyzer_engine.analyze(text=prompt, language='en')
    result = anonymizer_engine.anonymize(text=prompt, analyzer_results=findings)
    return result.text
 def encrypt_query_fn(query):
     print(f"\n------------ Step 2: Query encryption: {query=}")
     if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
     if is_user_query_valid(query):
         return {
             query_box: gr.update(
+                value=(
+                    "Unable to process ❌: The request exceeds the length limit or falls "
+                    "outside the scope of this document. Please refine your query."
+                )
             )
         }
+    # Retrieve the client API
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
     encrypted_tokens = []
+    # Pattern to identify words and non-words (including punctuation, spaces, etc.)
     tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
     for token in tokens:
+        # 1- Ignore non-words tokens
+        if bool(re.match(r"^\s+$", token)):
+            continue
+        # 2- Directly append non-word tokens or whitespace to processed_tokens
+        # Prediction for each word
+        emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
+        encrypted_x = client.quantize_encrypt_serialize(emb_x)
+        assert isinstance(encrypted_x, bytes)
+        encrypted_tokens.append(encrypted_x)
     print("Data encrypted ✅ on Client Side")
     assert len({len(token) for token in encrypted_tokens}) == 1
     write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
+    write_bytes(
+        KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
+    )
     encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
         identified_words_output_df: gr.update(visible=False, value=None),
     }
 def send_input_fn(query) -> Dict:
+    """Send the encrypted data and the evaluation key to the server."""
     print("------------ Step 3.1: Send encrypted_data to the Server")
     evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
     encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
     encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
+    if not evaluation_key_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
+        )
+        return {anonymized_query_output: gr.update(value=error_message)}
+    if not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
         return {anonymized_query_output: gr.update(value=error_message)}
+    # Define the data and files to post
     data = {"user_id": USER_ID, "input": query}
     files = [
         ("files", open(evaluation_key_path, "rb")),
         ("files", open(encrypted_input_path, "rb")),
         ("files", open(encrypted_input_len_path, "rb")),
     ]
+    # Send the encrypted input and evaluation key to the server
     url = SERVER_URL + "send_input"
+    with requests.post(
+        url=url,
+        data=data,
+        files=files,
+    ) as resp:
         print("Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server")
 def run_fhe_in_server_fn() -> Dict:
+    """Run in FHE the anonymization of the query"""
     print("------------ Step 3.2: Run in FHE on the Server Side")
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+    encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
+    if not evaluation_key_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
+        )
+        return {anonymized_query_output: gr.update(value=error_message)}
+    if not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
+        return {anonymized_query_output: gr.update(value=error_message)}
+    data = {
+        "user_id": USER_ID,
+    }
     url = SERVER_URL + "run_fhe"
+    with requests.post(
+        url=url,
+        data=data,
+    ) as response:
         if not response.ok:
             return {
                 anonymized_query_output: gr.update(
+                    value=(
+                        "⚠️ An error occurred on the Server Side. "
+                        "Please check connectivity and data transmission."
+                    ),
                 ),
             }
         else:
             time.sleep(1)
             print(f"The query anonymization was computed in {response.json():.2f} s per token.")
 def get_output_fn() -> Dict:
     print("------------ Step 3.3: Get the output from the Server Side")
+    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The key has not been generated correctly"
+        )
+        return {anonymized_query_output: gr.update(value=error_message)}
+    if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The data has not been encrypted correctly on the client side"
+        )
+        return {anonymized_query_output: gr.update(value=error_message)}
+    data = {
+        "user_id": USER_ID,
+    }
+    # Retrieve the encrypted output
+    url = SERVER_URL + "get_output"
+    with requests.post(
+        url=url,
+        data=data,
+    ) as response:
         if response.ok:
             print("Data received ✅ from the remote Server")
             response_data = response.json()
+            encrypted_output_base64 = response_data["encrypted_output"]
+            length_encrypted_output_base64 = response_data["length"]
+            # Decode the base64 encoded data
+            encrypted_output = base64.b64decode(encrypted_output_base64)
+            length_encrypted_output = base64.b64decode(length_encrypted_output_base64)
+            # Save the encrypted output to bytes in a file as it is too large to pass through
+            # regular Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
             write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", encrypted_output)
             write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", length_encrypted_output)
         else:
+            print("Error ❌ in getting data to the server")
 def decrypt_fn(text) -> Dict:
+    """Dencrypt the data on the `Client Side`."""
+    print("------------ Step 4: Dencrypt the data on the `Client Side`")
+    # Get the encrypted output path
     encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
     if not encrypted_output_path.is_file():
+        error_message = """⚠️ Please ensure that: \n
+                - the connectivity \n
+                - the query has been submitted \n
+                - the evaluation key has been generated \n
+                - the server processed the encrypted data \n
+                - the Client received the data from the Server before decrypting the prediction
+                """
         print(error_message)
         return error_message, None
+    # Retrieve the client API
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
+    # Load the encrypted output as bytes
     encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
     length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
     i = 0
     for token in tokens:
+        # Directly append non-word tokens or whitespace to processed_tokens
+        if bool(re.match(r"^\s+$", token)):
+            continue
+        else:
             encrypted_token = encrypted_output[i : i + length]
             prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
             probability = prediction_proba[0][1]
             if probability >= 0.77:
                 identified_words_with_prob.append((token, probability))
+                # Use the existing UUID if available, otherwise generate a new one
                 tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
                 decrypted_output.append(tmp_uuid)
                 UUID_MAP[token] = tmp_uuid
             else:
                 decrypted_output.append(token)
+        # Update the UUID map with query.
+        write_json(MAPPING_UUID_PATH, UUID_MAP)
+    # Removing Spaces Before Punctuation:
     anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
+    # Convert the list of identified words and probabilities into a DataFrame
+    if identified_words_with_prob:
+        identified_df = pd.DataFrame(
+            identified_words_with_prob, columns=["Identified Words", "Probability"]
+        )
+    else:
+        identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
     print("Decryption done ✅ on Client Side")
     return anonymized_text, identified_df
def anonymization_with_fn(selected_sentences, query):
    """Run the whole query pipeline and build the Gradio updates for its results.

    The query is encrypted, sent to the server, processed there and fetched back; the
    return values of those four steps are not used here. The decrypted result, together
    with the static anonymized version of ``selected_sentences``, is returned as a dict
    of component updates.
    """
    encrypt_query_fn(query)
    send_input_fn(query)
    run_fhe_in_server_fn()
    get_output_fn()

    decrypted = decrypt_fn(query)
    anonymized_text, identified_df = decrypted

    updates = {}
    updates[anonymized_doc_output] = gr.update(
        value=select_static_anonymized_sentences_fn(selected_sentences)
    )
    updates[anonymized_query_output] = gr.update(value=anonymized_text)
    # The table of identified words is filled in but kept hidden.
    updates[identified_words_output_df] = gr.update(value=identified_df, visible=False)
    return updates
# Define the folder path containing audio files
AUDIO_FOLDER_PATH = "./files/"


# Function to list available audio files in the folder
def get_audio_files(folder=None):
    """List the audio files (``.wav`` / ``.mp3``) found directly inside a folder.

    Args:
        folder: Directory to scan. Defaults to ``AUDIO_FOLDER_PATH``.

    Returns:
        The matching file names in alphabetical order, or an empty list when the folder
        does not exist.
    """
    target = AUDIO_FOLDER_PATH if folder is None else folder
    try:
        entries = os.listdir(target)
    except FileNotFoundError:
        # A missing folder simply means there is nothing to offer.
        return []
    # os.listdir returns names in arbitrary order; sort for a stable listing.
    # Extensions are compared case-insensitively so "CLIP.WAV" is listed too.
    return sorted(name for name in entries if name.lower().endswith(('.wav', '.mp3')))
# Step 1: Load and display audio file
def load_audio_file(selected_audio):
    """Return the path of the selected audio file inside ``AUDIO_FOLDER_PATH``.

    Args:
        selected_audio: File name chosen by the user, or ``None``/empty when nothing is
            selected.

    Returns:
        The joined file path, or ``None`` when no file is selected (``os.path.join``
        would otherwise raise ``TypeError`` on ``None``).
    """
    if not selected_audio:
        return None
    return os.path.join(AUDIO_FOLDER_PATH, selected_audio)
# Step 1.1: Record and save the audio file
def save_recorded_audio(audio):
    """Export ``audio`` as ``recorded_audio.wav`` in the audio folder and return that path.

    Assumes ``audio`` exposes an ``export(path, format=...)`` method (pydub-style) —
    TODO confirm against the caller.
    """
    destination = os.path.join(AUDIO_FOLDER_PATH, "recorded_audio.wav")
    # Save the audio as a .wav file
    audio.export(destination, format="wav")
    return destination
def click_js():
    """Return a JavaScript function that clicks the first button under the element with id ``audio``."""
    js_lines = (
        "function audioRecord() {",
        "    var xPathRes = document.evaluate ('//*[@id=\"audio\"]//button', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);",
        "    xPathRes.singleNodeValue.click();}",
    )
    return "\n".join(js_lines)
def action(btn):
    """Toggle the button label: 'Speak' becomes 'Stop', anything else becomes 'Speak'."""
    return 'Stop' if btn == 'Speak' else 'Speak'
def check_btn(btn):
    """Raise ``Exception('Recording...')`` unless the button label is 'Speak'.

    Returns ``None`` when the label is 'Speak'.
    """
    if btn == 'Speak':
        return
    raise Exception('Recording...')
def transcribe():
    """Return the fixed status string 'Success'."""
    status = 'Success'
    return status
# Loaded (model, processor) pairs keyed by model name, so the speech-to-text model is
# loaded once instead of on every transcription request.
_SPEECH_MODEL_CACHE = {}


def transcribe_audio_app(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the transcription.

    Args:
        audio_path: Path of the audio file; ``None`` or empty when no audio is available.

    Returns:
        The value returned by ``transcribe_audio``, or an empty string when no audio path
        is given.
    """
    # Nothing selected or recorded yet: there is nothing to transcribe.
    if not audio_path:
        return ""

    # Audio preprocessing
    audio = preprocess_audio(audio_path)

    # Model loading (cached after the first call)
    model_name = "openai/whisper-base"
    if model_name not in _SPEECH_MODEL_CACHE:
        _SPEECH_MODEL_CACHE[model_name] = load_model_and_processor(model_name=model_name)
    model, processor = _SPEECH_MODEL_CACHE[model_name]

    # Transcription
    transcription = transcribe_audio(model=model, processor=processor, audio=audio)
    return transcription
 demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
 with demo:
+    gr.Markdown(
+        """
+        <p align="center">
+            <img width=200 src="https://user-images.githubusercontent.com/5758427/197816413-d9cddad3-ba38-4793-847d-120975e1da11.png">
+        </p>
+        """)
     gr.Markdown(
         """
+        <h1 style="text-align: center;">Secure De-Identification of Audio Files</h1>
+        <!--
+        <p align="center">
+            <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
+            —
+            <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
+            —
+            <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
+            —
+            <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
+        </p>
+        -->
         """
     )
     gr.Markdown(
         """
         <p align="center" style="font-size: 18px;">
+            Protecting personal data is more important than ever in today’s digital world. <b>Our project ensures privacy-preserving de-identification of audio data</b> using state-of-the-art <b>Fully Homomorphic Encryption (FHE)</b>, offering a secure and transparent solution for data anonymization.
+        </p>
+        <p align="center" style="font-size: 18px;">
+            Traditional methods of de-identification often fall short of true anonymization, merely concealing identifiable information. With FHE, we go beyond obfuscation to provide <b>complete security,</b> allowing computations to be performed directly on encrypted data without ever exposing sensitive details.
+        </p>
+        <p align="center" style="font-size: 18px;">
+            This technology is crucial in enabling organizations to use and share sensitive data responsibly, while fully respecting individual privacy.
         </p>
         """
     )
+    # Step 1: Add an audio file
+    gr.Markdown("## Step 1: Add an Audio File")
+    audio_files = get_audio_files()
+    with gr.Row():
+        audio_file_dropdown = gr.Dropdown(audio_files, label="Select an Audio File", interactive=True)
+        audio_output = gr.Audio(label="Selected Audio", type="filepath")
+    # When an audio file is selected, it will display the file path
+    audio_file_dropdown.change(fn=load_audio_file, inputs=[audio_file_dropdown], outputs=[audio_output])
+    with gr.Row():
+        transcribe_btn = gr.Button("Transcrire l'audio")
+        transcription_output = gr.Textbox(label="Transcription", lines=5)
+    transcribe_btn.click(
+        fn=transcribe_audio_app,
+        inputs=[audio_output],
+        outputs=[transcription_output]
     )
+    ########################## Step 1.1: Record Audio ##########################
+    gr.Markdown("## Step 1.1: Record an Audio File")
+    """
+    with gr.Row():
+        audio_recorder = gr.Audio(source="microphone", type="file", label="Record Audio")
+        record_output = gr.Audio(label="Recorded Audio", type="filepath")
+    # When the user records an audio, save it
+    audio_recorder.change(fn=save_recorded_audio, inputs=[audio_recorder], outputs=[record_output])
     gen_key_btn = gr.Button("Generate the secret and evaluation keys")
+    gen_key_btn.click(
+        key_gen_fn,
+        inputs=[],
+        outputs=[gen_key_btn],
+    ) """
+    msg = gr.Textbox()
+    audio_box = gr.Audio(label="Audio", type="filepath", elem_id='audio')
+    with gr.Row():
+        audio_btn = gr.Button('Speak')
+        clear = gr.Button("Clear")
+    audio_btn.click(fn=action, inputs=audio_btn, outputs=audio_btn) \
+              .then(fn=check_btn, inputs=audio_btn) \
+              .success(fn=transcribe_audio_app, outputs=msg)
+    clear.click(lambda: None, None, msg, queue=False)
+    ########################## Transcription ##########################
+    with gr.Row():
+        transcribe_btn = gr.Button("Transcrire l'audio")
+        transcription_output = gr.Textbox(label="Transcription", lines=5)
+    transcribe_btn.click(
+        fn=transcribe_audio_app,
+        inputs=[audio_output],
+        outputs=[transcription_output]
     )
+    ########################## Key Gen Part ##########################
     # Section heading plus a short explanation of secret keys vs. evaluation keys.
+    gr.Markdown(
+        "## Step 1.2: Generate the keys\n\n"
+        """In Fully Homomorphic Encryption (FHE) methods, two types of keys are created. The first
+        type, called secret keys, are used to encrypt and decrypt the user's data. The second type,
+        called evaluation keys, enables a server to work on the encrypted data without seeing the
+        actual data.
+        """
     )
     # The button is both the trigger and the status display: `key_gen_fn` takes no inputs and
     # returns an update keyed on `gen_key_btn`, which replaces the button's text with the outcome.
+    gen_key_btn = gr.Button("Generate the secret and evaluation keys")
+    gen_key_btn.click(
+        key_gen_fn,
+        inputs=[],
+        outputs=[gen_key_btn],
+    )
+    ########################## Main document Part ##########################
     # Horizontal rule, then the heading and intro text for the document-selection step.
+    gr.Markdown("<hr />")
+    gr.Markdown("## Step 2.1: Select the document you want to encrypt\n\n"
+        """To make it simple, we pre-compiled the following document, but you are free to choose
+        on which part you want to run this example.
+        """
     )
     # Three-column row: sentence picker (wide) | encrypt button (narrow) | encrypted output (wide).
+    with gr.Row():
+        with gr.Column(scale=5):
             # ORIGINAL_DOCUMENT is used both as the list of choices and as the initial value,
             # so every entry starts out checked.
+            original_sentences_box = gr.CheckboxGroup(
+                ORIGINAL_DOCUMENT,
+                value=ORIGINAL_DOCUMENT,
+                label="Contract:",
+                show_label=True,
+            )
+        with gr.Column(scale=1, min_width=6):
             # Empty fixed-height div; presumably a spacer that pushes the button down so it
             # lines up with the neighbouring boxes -- TODO confirm.
+            gr.HTML("<div style='height: 77px;'></div>")
+            encrypt_doc_btn = gr.Button("Encrypt the document")
+        with gr.Column(scale=5):
             # Display-only box (interactive=False); its click handler is attached further down.
+            encrypted_doc_box = gr.Textbox(
+                label="Encrypted document:", show_label=True, interactive=False, lines=10
+            )
+    ########################## User Query Part ##########################
     # Horizontal rule, then the heading and instructions for the prompt-selection step.
+    gr.Markdown("<hr />")
+    gr.Markdown("## Step 2.2: Select the prompt you want to encrypt\n\n"
+        """Please choose from the predefined options in
+        <span style='color:grey'>“Prompt examples”</span> or craft a custom question in
+        the <span style='color:grey'>“Customized prompt”</span> text box.
+        Remain concise and relevant to the context. Any off-topic query will not be processed.""")
     # Three-column row: prompt selection (wide) | encrypt button (narrow) | encrypted output (wide).
+    with gr.Row():
+        with gr.Column(scale=5):
             # NOTE(review): this inner column repeats the outer one with the same scale and only
             # wraps the dropdown -- confirm whether the extra nesting is intentional.
+            with gr.Column(scale=5):
                 # Choices are the values of DEFAULT_QUERIES; no initial selection is given.
+                default_query_box = gr.Dropdown(
+                    list(DEFAULT_QUERIES.values()), label="PROMPT EXAMPLES:"
+                )
+            gr.Markdown("Or")
             # Editable free-text prompt, pre-filled with a sample sentence.
+            query_box = gr.Textbox(
+                value=" Hello. My name is Inuitvementoya. You kill my father. Prepare to die.", label="CUSTOMIZED PROMPT:", interactive=True
+            )
             # Identity handler: selecting an example copies it verbatim into the custom prompt
             # box. The lambda parameter shadows the component name but is just the selected value.
+            default_query_box.change(
+                fn=lambda default_query_box: default_query_box,
+                inputs=[default_query_box],
+                outputs=[query_box],
+            )
+        with gr.Column(scale=1, min_width=6):
             # Empty fixed-height div; presumably a vertical spacer for the button -- TODO confirm.
+            gr.HTML("<div style='height: 77px;'></div>")
+            encrypt_query_btn = gr.Button("Encrypt the prompt")
+            # gr.HTML("<div style='height: 50px;'></div>")
+        with gr.Column(scale=5):
             # Filled by the "Encrypt the prompt" click handler attached further down.
+            output_encrypted_box = gr.Textbox(
+                label="Encrypted de-identified query that will be sent to the de-identification server:",
+                lines=8,
+            )
     ########################## FHE processing Part ##########################
+    gr.Markdown("<hr />")
+    gr.Markdown("## Step 3: De-identify the document and the prompt using FHE")
     gr.Markdown(
+        """Once the client encrypts the document and the prompt locally, it will be sent to a remote
+        server to perform the de-identification on encrypted data. When the computation is done, the
+        server will return the result to the client for decryption."""
     )
     run_fhe_btn = gr.Button("De-identify using FHE")
+    with gr.Row():
+        with gr.Column(scale=5):
+            anonymized_doc_output = gr.Textbox(
+                label="Decrypted and de-idenntified document", lines=10, interactive=True
+            )
+        with gr.Column(scale=5):
+            anonymized_query_output = gr.Textbox(
+                label="Decrypted and de-identified prompt", lines=10, interactive=True
+            )
     identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
+    encrypt_doc_btn.click(
+        fn=encrypt_doc_fn,
+        inputs=[original_sentences_box],
+        outputs=[encrypted_doc_box, anonymized_doc_output],
+    )
+    encrypt_query_btn.click(
+        fn=encrypt_query_fn,
+        inputs=[query_box],
+        outputs=[
+            query_box,
+            output_encrypted_box,
+            anonymized_query_output,
+            identified_words_output_df,
+        ],
+    )
     run_fhe_btn.click(
         anonymization_with_fn,
+        inputs=[original_sentences_box, query_box],
+        outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
+    )
+    ########################## Presidio ##########################
     # NOTE(review): this heading reuses "Step 3", the same number as the FHE section heading --
     # confirm whether the duplicate step number is intended.
+    gr.Markdown("<hr />")
+    gr.Markdown("## Step 3: De-identify the document and the prompt")
+    gr.Markdown(
+        """This step will demonstrate de-identification using both FHE and Presidio methods.
+        The same prompt will be used for both to allow for direct comparison.""")
     # Trigger button and result box each sit in their own row.
+    with gr.Row():
+        run_presidio_btn = gr.Button("De-identify using Presidio")
+    with gr.Row():
+        presidio_output = gr.Textbox(
+            label="Presidio: De-identified prompt", lines=10, interactive=True
+        )
     # Only the prompt (`query_box`) is passed to the Presidio handler; the document is not.
+    run_presidio_btn.click(
+        anonymization_with_presidio,
         inputs=[query_box],
+        outputs=[presidio_output],
     )
 # Launch the app. share=False: no public share link is requested (see Gradio's launch() docs).
 demo.launch(share=False)