|
import re |
|
import gradio as gr |
|
from gliner import GLiNER |
|
from cerberus import Validator |
|
|
|
|
|
|
|
|
|
|
|
# Load the pretrained GLiNER checkpoint once, at import time, so every
# request reuses the same model instance.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

# Entity labels to detect, one per line in labels.txt. Blank lines are
# skipped so that an empty-string label is never handed to the model.
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
# Cerberus schema for the anonymizer input document: the "text" field must
# be present, must be a string, and must not be empty.
schema = {
    "text": {
        "type": "string",
        # Without this rule a document lacking "text" would validate fine.
        "required": True,
        "empty": False,
    },
}

# Shared validator instance built from the schema above.
validator = Validator(schema)
|
|
|
|
|
def validate_input(data: dict) -> str:
    """Validate *data* with the module-level validator and return its text.

    Args:
        data: Mapping expected to hold a non-empty string under ``"text"``.

    Returns:
        The value stored under ``data["text"]``.

    Raises:
        ValueError: If the validator rejects *data*, or if the ``"text"``
            key is absent.
    """
    if not validator.validate(data):
        raise ValueError(f"Invalid input data. Errors: {validator.errors}")

    # A schema without a "required" rule accepts a document that has no
    # "text" key at all; report that as a ValueError (the exception callers
    # handle) instead of leaking a KeyError from the lookup below.
    if "text" not in data:
        raise ValueError(
            "Invalid input data. Errors: {'text': ['required field']}"
        )

    return data["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
def anonymize_text(text, threshold=0.2):
    """Replace detected PII in *text* with indexed placeholders.

    Entities are predicted by the module-level ``model`` for the configured
    ``labels``. Each entity is replaced by ``<PII_LABEL_N>``, where ``LABEL``
    is the normalized, upper-cased entity label and ``N`` is the 1-based
    position of the original string inside ``entity_map[LABEL]``. Repeated
    occurrences of the same string under the same label reuse one placeholder.

    Args:
        text: The text to anonymize.
        threshold: Value forwarded as ``threshold`` to
            ``model.predict_entities`` (0.2 by default).

    Returns:
        A ``(anonymized_text, entity_map)`` tuple. ``entity_map`` maps each
        normalized label to the list of original strings in first-seen order.
    """
    entities = model.predict_entities(text, labels=labels, threshold=threshold)

    # Process spans left to right; on equal starts the longest span goes
    # first so that any shorter one is recognised as nested below.
    ordered = sorted(entities, key=lambda e: (e["start"], -e["end"]))

    entity_map = {}
    pieces = []  # collected fragments, joined once at the end
    next_start = 0

    for entity in ordered:
        start_idx, end_idx = entity["start"], entity["end"]

        # Defensive: a span lying entirely inside text that was already
        # replaced is covered by the previous placeholder. Emitting it again
        # would move the cursor backwards and re-expose masked text.
        if start_idx < next_start and end_idx <= next_start:
            continue

        # Collapse every run of non-word characters (spaces, hyphens,
        # slashes, ...) to "_" so the placeholder stays a single token that
        # the de-anonymizer's placeholder regex can match.
        label = re.sub(r"\W+", "_", entity["label"]).upper()
        original_text = entity["text"]

        known = entity_map.setdefault(label, [])
        if original_text not in known:
            known.append(original_text)
        idx = known.index(original_text) + 1  # placeholders are 1-based

        # Untouched text between the previous entity and this one (empty
        # when two spans partially overlap).
        pieces.append(text[next_start:start_idx])
        pieces.append(f"<PII_{label}_{idx}>")
        next_start = end_idx

    # Remainder of the text after the last entity.
    pieces.append(text[next_start:])
    return "".join(pieces), entity_map
|
|
|
|
|
def deanonymize_text(anonymized_response, entity_map):
    """Restore original strings for ``<PII_LABEL_N>`` placeholders.

    Args:
        anonymized_response: Text that may contain placeholders.
        entity_map: Mapping of label to the list of original strings;
            placeholder index ``N`` is 1-based into that list.

    Returns:
        The text with every resolvable placeholder replaced. Placeholders
        whose label is unknown or whose index is out of range are left
        exactly as they appear.
    """

    def replace_match(match):
        label = match.group(1)
        idx = int(match.group(2)) - 1  # placeholders are 1-based

        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        # Unknown label or out-of-range index: keep the placeholder as-is.
        return match.group(0)

    # The label may contain any character except angle brackets, so labels
    # with hyphens or similar (e.g. "E-MAIL") are restored too. The lookup
    # in entity_map above keeps unrelated matches untouched.
    pattern = r"<PII_([^<>]+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
|
|
|
|
|
|
|
|
|
|
|
def anonymize_fn(original_text):
    """Handle a click on the "Anonymize" button.

    Wraps the submitted text in a document, validates it, and anonymizes it.

    Returns:
        A ``(anonymized_text, entity_map, status_message)`` tuple. When
        validation fails, the text is empty, the map is an empty dict, and
        the status message carries the validation error.
    """
    try:
        checked_text = validate_input({"text": original_text})
    except ValueError as err:
        # Surface the problem in the status box instead of raising.
        return "", {}, f"Validation error: {err}"

    masked_text, found_entities = anonymize_text(checked_text)
    return masked_text, found_entities, "Anonymized successfully!"
|
|
|
|
|
def deanonymize_fn(anonymized_llm_response, entity_map):
    """Handle a click on the "De-anonymize" button.

    Args:
        anonymized_llm_response: Text pasted by the user; may be ``None``,
            empty, or whitespace-only.
        entity_map: Mapping produced by a previous anonymization, or a falsy
            value when nothing has been anonymized yet.

    Returns:
        A ``(deanonymized_text, status_message)`` tuple. The text is empty
        whenever one of the inputs is missing.
    """
    # Treat None like empty input rather than failing on ``None.strip()``.
    if not anonymized_llm_response or not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."

    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"
|
|
|
|
|
# Markdown header rendered at the top of the demo page.
md_text = """# Anonymizing LLM Prompts

Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.

The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
"""

with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        # Left column: anonymize the user's original text.
        with gr.Column():
            original_text = gr.Textbox(
                label="Original Text (Anonymize)",
                lines=6,
            )
            anonymized_text = gr.Textbox(
                label="Anonymized Text",
                lines=6,
                interactive=False,
            )
            button_anon = gr.Button("Anonymize")

            # Holds the entity map between the anonymize and de-anonymize
            # steps; it is written by anonymize_fn and read by deanonymize_fn.
            entity_map_state = gr.State()

            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                fn=anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out],
            )

        # Right column: restore the original strings in a pasted response.
        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                label="Anonymized LLM Response (Paste here)",
                lines=6,
            )
            deanonymized_text = gr.Textbox(
                label="De-anonymized LLM Response",
                lines=6,
                interactive=False,
            )
            button_deanon = gr.Button("De-anonymize")

            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                fn=deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de],
            )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|