|
"""A Gradio app for anonymizing text data using FHE.""" |
|
|
|
import gradio as gr |
|
from fhe_anonymizer import FHEAnonymizer |
|
import pandas as pd |
|
from openai import OpenAI |
|
import os |
|
import json |
|
import re |
|
from utils_demo import * |
|
from typing import List, Dict, Tuple |
|
|
|
# Shared anonymizer instance; `deidentify_text` calls it directly on the query text.
anonymizer = FHEAnonymizer()

# OpenAI client used by `query_chatgpt`. The API key is read from the "openaikey"
# environment variable (`None` is passed when the variable is not set).
client = OpenAI(api_key=os.environ.get("openaikey"))
|
|
|
|
|
def check_user_query_fn(user_query: str) -> Dict:
    """Check the user query and build the update for the query textbox.

    Args:
        user_query (str): Text currently held by the "User query" textbox.

    Returns:
        Dict: Mapping from the `input_text` component to a `gr.update`. The new value is
            an error message when `is_user_query_valid(user_query)` is truthy; otherwise
            it is the query with every run of spaces collapsed into a single space.
    """
    # NOTE(review): the error message is produced when `is_user_query_valid` returns a
    # truthy value, which reads inverted relative to the helper's name. The helper is not
    # defined in this file (presumably it comes from `utils_demo`) - confirm its return
    # convention before changing this condition.
    if not is_user_query_valid(user_query):
        single_spaced_query = re.sub(" +", " ", user_query)
        return {input_text: gr.update(value=single_spaced_query)}

    error_msg = (
        "Unable to process β: The request exceeds the length limit or falls "
        "outside the scope of this document. Please refine your query."
    )
    print(error_msg)
    return {input_text: gr.update(value=error_msg)}
|
|
|
def deidentify_text(input_text):
    """Anonymize a text and tabulate the words reported by the anonymizer.

    Args:
        input_text: Text passed as-is to the module-level `anonymizer`.

    Returns:
        tuple: `(anonymized_text, identified_df)` where `identified_df` is a
            `pandas.DataFrame` with the columns "Identified Words" and "Probability".
            The table has no rows when the anonymizer reports nothing.
    """
    columns = ["Identified Words", "Probability"]
    anonymized_text, flagged_words = anonymizer(input_text)

    # A falsy result still yields a table carrying both column headers
    identified_df = (
        pd.DataFrame(flagged_words, columns=columns)
        if flagged_words
        else pd.DataFrame(columns=columns)
    )
    return anonymized_text, identified_df
|
|
|
|
|
def _restore_original_words(text, inverse_uuid_map):
    """Replace every anonymized token found in `text` by the word it stands for.

    A token is a run of word characters that may also contain `.`, `/`, `-` and `@`,
    delimited by word boundaries. A token that is a key of `inverse_uuid_map` is replaced
    by the mapped value; every other character of `text` (whitespace, punctuation,
    brackets, symbols, ...) is left exactly where it is.

    Args:
        text (str): Text in which tokens are looked up.
        inverse_uuid_map (dict): Mapping from anonymized token to replacement string.

    Returns:
        str: `text` with the known tokens substituted.
    """
    return re.sub(
        r"\b[\w\.\/\-@]+\b",
        # A callable replacement is inserted literally, so backslashes or group
        # references inside the restored words are never interpreted by `re`
        lambda match: inverse_uuid_map.get(match.group(0), match.group(0)),
        text,
    )


def query_chatgpt(anonymized_query):
    """Send the anonymized query to ChatGPT and de-anonymize the answer.

    The anonymized document and the system prompt are read from the `files/` directory,
    the model is asked to answer the query about the document, and the tokens of the
    answer that appear as values in `original_document_uuid_mapping.json` are replaced
    by their corresponding keys.

    Args:
        anonymized_query (str): Anonymized user query to append to the document.

    Returns:
        tuple: `(anonymized_response, deanonymized_response)`, the raw model answer and
            the same answer with the known tokens mapped back.
    """
    with open("files/anonymized_document.txt", "r") as file:
        anonymized_document = file.read()
    with open("files/chatgpt_prompt.txt", "r") as file:
        prompt = file.read()

    # Debug trace of the system prompt sent to the model
    print(prompt + "\n")

    # The document and the query are each wrapped in a fenced block for the model
    query = (
        "Document content:\n```\n" + anonymized_document + "\n\n```"
        + "Query:\n```\n" + anonymized_query + "\n```"
    )

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query},
        ],
    )
    anonymized_response = completion.choices[0].message.content

    # The mapping file is inverted so that the tokens seen in the response become the
    # lookup keys (presumably original word -> UUID on disk; verify against the file)
    with open("original_document_uuid_mapping.json", "r") as file:
        uuid_map = json.load(file)
    inverse_uuid_map = {v: k for k, v in uuid_map.items()}

    # Substituting in place (instead of re-joining extracted tokens) keeps characters
    # such as parentheses, brackets or markdown symbols that are not part of any token
    deanonymized_response = _restore_original_words(anonymized_response, inverse_uuid_map)

    return anonymized_response, deanonymized_response
|
|
|
|
|
# Read both versions of the demo document once, at start-up. They pre-fill the
# "Original Document" and "Anonymized Document" textboxes of the interface.
with open("files/original_document.txt", "r") as original_file:
    original_document = original_file.read()

with open("files/anonymized_document.txt", "r") as anonymized_file:
    anonymized_document = anonymized_file.read()
|
|
|
# Build the interface. Components are created in the order in which they are displayed.
# NOTE(review): the row/column nesting below was reconstructed from the statement order -
# confirm which components belong inside each `gr.Row()`.
with gr.Blocks(css=".markdown-body { font-size: 18px; }") as demo:

    # Page header: logo, title and external links.
    # NOTE(review): the "β" separators between the links look like a mis-encoded
    # character - confirm the intended glyph.
    gr.Markdown(
        """
        <p align="center">
        <img width=200 src="file/images/logos/zama.jpg">
        </p>
        <h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
        <p align="center">
        <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
        β
        <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
        β
        <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
        β
        <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
        </p>
        """
    )

    # Diagram illustrating the workflow
    gr.Markdown(
        """
        <p align="center">
        <img width="30%" height="25%" src="./encrypted_anonymization_diagram.jpg">
        </p>
        """
    )

    # Collapsed explanation of the technique
    with gr.Accordion("What is Encrypted Anonymization?", open=False):
        gr.Markdown(
            """
            Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to
            protect sensitive information during data processing. This approach allows for the
            anonymization of text data, such as personal identifiers, while ensuring that the data
            remains encrypted throughout the entire process.
            """
        )

    # Original and anonymized documents, side by side
    with gr.Row():
        with gr.Column():
            original_doc_box = gr.Textbox(
                label="Original Document:",
                value=original_document,
                interactive=True,
            )
        with gr.Column():
            anonymized_doc_box = gr.Textbox(
                label="Anonymized Document:",
                value=anonymized_document,
                interactive=False,
            )

    # User query, either typed or picked from the example queries
    with gr.Row():
        input_text = gr.Textbox(
            value="Who lives in Maine?",
            label="User query",
            interactive=True,
        )

        default_query_box = gr.Radio(
            choices=list(DEFAULT_QUERIES.keys()),
            label="Example Queries",
        )

        # Selecting an example copies its query text into the query textbox
        default_query_box.change(
            fn=lambda selected_example: DEFAULT_QUERIES[selected_example],
            inputs=[default_query_box],
            outputs=[input_text],
        )

    # Every change of the query is checked and written back to the same textbox
    input_text.change(
        check_user_query_fn,
        inputs=[input_text],
        outputs=[input_text],
    )

    anonymized_text_output = gr.Textbox(
        label="Anonymized Text with FHE",
        lines=1,
        interactive=True,
    )

    # Receives the table returned by `deidentify_text`; created hidden
    identified_words_output = gr.Dataframe(label="Identified Words", visible=False)

    submit_button = gr.Button("Anonymize with FHE")
    submit_button.click(
        deidentify_text,
        inputs=[input_text],
        outputs=[anonymized_text_output, identified_words_output],
    )

    # Raw and de-anonymized ChatGPT answers, side by side
    with gr.Row():
        chatgpt_response_anonymized = gr.Textbox(
            label="ChatGPT Anonymized Response",
            lines=13,
        )
        chatgpt_response_deanonymized = gr.Textbox(
            label="ChatGPT Deanonymized Response",
            lines=13,
        )

    chatgpt_button = gr.Button("Query ChatGPT")
    chatgpt_button.click(
        query_chatgpt,
        inputs=[anonymized_text_output],
        outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
    )

# Start the app without creating a public share link
demo.launch(share=False)
|
|