"""A Gradio app for anonymizing text data using FHE.""" import os import re from typing import Dict, List import gradio as gr import pandas as pd from fhe_anonymizer import FHEAnonymizer from openai import OpenAI from utils_demo import * ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n") ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH) MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH) clean_directory() anonymizer = FHEAnonymizer() client = OpenAI(api_key=os.environ.get("openaikey")) def select_static_sentences_fn(selected_sentences: List): selected_sentences = [MAPPING_SENTENCES[sentence] for sentence in selected_sentences] anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0]) anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence] return {anonymized_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))} def key_gen_fn() -> Dict: """Generate keys for a given user. Returns: dict: A dictionary containing the generated keys and related information. """ print("Key Gen..") anonymizer.generate_key() evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = ( f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}" ) print(error_message) return {gen_key_btn: gr.update(value=error_message)} else: return {gen_key_btn: gr.update(value="Keys have been generated ✅")} def encrypt_query_fn(query): print(f"Query: {query}") evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {output_encrypted_box: gr.update(value=error_message)} if is_user_query_valid(query): # TODO: check if the query is related to our context error_msg = ( "Unable to process ❌: The request exceeds the length limit or falls " "outside the scope of this document. Please refine your query." ) print(error_msg) return {query_box: gr.update(value=error_msg)} anonymizer.encrypt_query(query) encrypted_tokens = read_pickle(KEYS_DIR / "encrypted_quantized_query") encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens] return {output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex))} def run_fhe_fn(query_box): evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {anonymized_text_output: gr.update(value=error_message)} encryted_query_path = KEYS_DIR / "encrypted_quantized_query" if not encryted_query_path.is_file(): error_message = "Error ❌: Please encrypt your query first!" return {anonymized_text_output: gr.update(value=error_message)} anonymizer.run_server_and_decrypt_output(query_box) anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence") identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob") # Convert the list of identified words and probabilities into a DataFrame if identified_words_with_prob: identified_df = pd.DataFrame( identified_words_with_prob, columns=["Identified Words", "Probability"] ) else: identified_df = pd.DataFrame(columns=["Identified Words", "Probability"]) return anonymized_text, identified_df def query_chatgpt_fn(anonymized_query, anonymized_document): evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {anonymized_text_output: gr.update(value=error_message)} encryted_query_path = KEYS_DIR / "encrypted_quantized_query" if not encryted_query_path.is_file(): error_message = "Error ❌: Please encrypt your query first!" return {anonymized_text_output: gr.update(value=error_message)} decrypted_query_path = KEYS_DIR / "reconstructed_sentence" if not decrypted_query_path.is_file(): error_message = "Error ❌: Please run the FHE computation first!" return {anonymized_text_output: gr.update(value=error_message)} prompt = read_txt(PROMPT_PATH) # Prepare prompt full_prompt = prompt + "\n" query = ( "Document content:\n```\n" + anonymized_document + "\n\n```" + "Query:\n```\n" + anonymized_query + "\n```" ) print(full_prompt) completion = client.chat.completions.create( model="gpt-4-1106-preview", # Replace with "gpt-4" if available messages=[ {"role": "system", "content": prompt}, {"role": "user", "content": query}, ], ) anonymized_response = completion.choices[0].message.content uuid_map = read_json(MAPPING_UUID_PATH) inverse_uuid_map = { v: k for k, v in uuid_map.items() } # TODO load the inverse mapping from disk for efficiency # Pattern to identify words and non-words (including punctuation, spaces, etc.) tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", anonymized_response) processed_tokens = [] for token in tokens: # Directly append non-word tokens or whitespace to processed_tokens if not token.strip() or not re.match(r"\w+", token): processed_tokens.append(token) continue if token in inverse_uuid_map: processed_tokens.append(inverse_uuid_map[token]) else: processed_tokens.append(token) deanonymized_response = "".join(processed_tokens) return anonymized_response, deanonymized_response demo = gr.Blocks(css=".markdown-body { font-size: 18px; }") with demo: gr.Markdown( """
Concrete-ML
—
Documentation
—
Community
—
@zama_fhe
#
#
Encrypt data locally with FHE 💻 ⚙️
""" ) encrypt_btn = gr.Button("Encrypt data") gr.HTML("") with gr.Column(scale=5): output_encrypted_box = gr.Textbox( label="Encrypted anonymized query that is sent to the anonymization server", lines=6 ) encrypt_btn.click( fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box] ) gr.Markdown("