Spaces:

protectai
/

llm-guard-playground

Running

File size: 15,822 Bytes

import logging
from typing import Dict, List, Tuple

import streamlit as st
from streamlit_tags import st_tags

from llm_guard.input_scanners import (
    Anonymize,
    BanSubstrings,
    BanTopics,
    Code,
    Language,
    PromptInjection,
    PromptInjectionV2,
    Regex,
    Secrets,
    Sentiment,
    TokenLimit,
    Toxicity,
)
from llm_guard.input_scanners.anonymize import default_entity_types
from llm_guard.input_scanners.anonymize_helpers.analyzer import allowed_recognizers
from llm_guard.vault import Vault

logger = logging.getLogger("llm-guard-playground")


def _nonblank_lines(text: str) -> List[str]:
    """Split *text* on newlines and drop lines that are empty or whitespace-only.

    A blank entry must never reach a scanner: an empty string is contained in
    every prompt and an empty regular expression matches every prompt, so a
    single trailing newline in the text area would flag everything.
    """
    lines = (line.rstrip("\r") for line in text.split("\n"))
    return [line for line in lines if line.strip()]


def init_settings() -> Tuple[List[str], Dict[str, Dict]]:
    """Render the scanner configuration widgets in the Streamlit sidebar.

    A multiselect lets the user choose which scanners are enabled (all of them
    by default). For every enabled scanner an expander with its options is
    drawn, and the chosen values are collected under the scanner's name.

    Returns:
        A pair ``(enabled_scanners, settings)``: the scanner names selected in
        the multiselect, and a dict mapping each enabled scanner name to a
        dict of its option values. Scanners that are not enabled have no entry
        in ``settings``.

    Note:
        The substring and pattern lists have blank lines removed, so they can
        be empty when the user clears the corresponding text area.
    """
    all_scanners = [
        "Anonymize",
        "BanSubstrings",
        "BanTopics",
        "Code",
        "Language",
        "PromptInjection",
        "PromptInjectionV2",
        "Regex",
        "Secrets",
        "Sentiment",
        "TokenLimit",
        "Toxicity",
    ]

    st_enabled_scanners = st.sidebar.multiselect(
        "Select scanners",
        options=all_scanners,
        default=all_scanners,
        help="The list can be found here: https://laiyer-ai.github.io/llm-guard/input_scanners/anonymize/",
    )

    settings = {}

    if "Anonymize" in st_enabled_scanners:
        st_anon_expander = st.sidebar.expander(
            "Anonymize",
            expanded=False,
        )

        with st_anon_expander:
            st_anon_entity_types = st_tags(
                label="Anonymize entities",
                text="Type and press enter",
                value=default_entity_types,
                suggestions=default_entity_types
                + ["DATE_TIME", "NRP", "LOCATION", "MEDICAL_LICENSE", "US_PASSPORT"],
                maxtags=30,
                key="anon_entity_types",
            )
            st.caption(
                "Check all supported entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities"
            )
            st_anon_hidden_names = st_tags(
                label="Hidden names to be anonymized",
                text="Type and press enter",
                value=[],
                suggestions=[],
                maxtags=30,
                key="anon_hidden_names",
            )
            st.caption("These names will be hidden e.g. [REDACTED_CUSTOM1].")
            st_anon_allowed_names = st_tags(
                label="Allowed names to ignore",
                text="Type and press enter",
                value=[],
                suggestions=[],
                maxtags=30,
                key="anon_allowed_names",
            )
            st.caption("These names will be ignored even if flagged by the detector.")
            st_anon_preamble = st.text_input(
                "Preamble", value="Text to prepend to sanitized prompt: "
            )
            st_anon_use_faker = st.checkbox(
                "Use Faker", value=False, help="Use Faker library to generate fake data"
            )
            st_anon_threshold = st.slider(
                label="Threshold",
                value=0.0,
                min_value=0.0,
                max_value=1.0,
                step=0.1,
                key="anon_threshold",
            )
            # index=1 preselects the second entry of allowed_recognizers.
            st_anon_recognizer = st.selectbox(
                "Recognizer",
                allowed_recognizers,
                index=1,
            )

        settings["Anonymize"] = {
            "entity_types": st_anon_entity_types,
            "hidden_names": st_anon_hidden_names,
            "allowed_names": st_anon_allowed_names,
            "preamble": st_anon_preamble,
            "use_faker": st_anon_use_faker,
            "threshold": st_anon_threshold,
            "recognizer": st_anon_recognizer,
        }

    if "BanSubstrings" in st_enabled_scanners:
        st_bs_expander = st.sidebar.expander(
            "Ban Substrings",
            expanded=False,
        )

        with st_bs_expander:
            # One substring per line; blank lines are discarded so that an
            # empty substring (which is contained in any text) is never banned.
            st_bs_substrings = _nonblank_lines(
                st.text_area(
                    "Enter substrings to ban (one per line)",
                    value="test\nhello\nworld",
                    height=200,
                )
            )

            st_bs_match_type = st.selectbox("Match type", ["str", "word"])
            st_bs_case_sensitive = st.checkbox("Case sensitive", value=False)
            st_bs_redact = st.checkbox("Redact", value=False)
            st_bs_contains_all = st.checkbox("Contains all", value=False)

        settings["BanSubstrings"] = {
            "substrings": st_bs_substrings,
            "match_type": st_bs_match_type,
            "case_sensitive": st_bs_case_sensitive,
            "redact": st_bs_redact,
            "contains_all": st_bs_contains_all,
        }

    if "BanTopics" in st_enabled_scanners:
        st_bt_expander = st.sidebar.expander(
            "Ban Topics",
            expanded=False,
        )

        with st_bt_expander:
            st_bt_topics = st_tags(
                label="List of topics",
                text="Type and press enter",
                value=["violence"],
                suggestions=[],
                maxtags=30,
                key="bt_topics",
            )

            st_bt_threshold = st.slider(
                label="Threshold",
                value=0.6,
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                key="ban_topics_threshold",
            )

        settings["BanTopics"] = {
            "topics": st_bt_topics,
            "threshold": st_bt_threshold,
        }

    if "Code" in st_enabled_scanners:
        st_cd_expander = st.sidebar.expander(
            "Code",
            expanded=False,
        )

        with st_cd_expander:
            st_cd_languages = st.multiselect(
                "Programming languages",
                ["python", "java", "javascript", "go", "php", "ruby"],
                default=["python"],
            )

            st_cd_mode = st.selectbox("Mode", ["allowed", "denied"], index=0)

        settings["Code"] = {
            "languages": st_cd_languages,
            "mode": st_cd_mode,
        }

    if "Language" in st_enabled_scanners:
        st_lan_expander = st.sidebar.expander(
            "Language",
            expanded=False,
        )

        with st_lan_expander:
            st_lan_valid_language = st.multiselect(
                "Languages",
                [
                    "af", "ar", "bg", "bn", "ca", "cs", "cy", "da", "de", "el",
                    "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr",
                    "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml",
                    "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk",
                    "sl", "so", "sq", "sv", "sw", "ta", "te", "th", "tl", "tr",
                    "uk", "ur", "vi", "zh-cn", "zh-tw",
                ],
                default=["en"],
            )

        settings["Language"] = {
            "valid_languages": st_lan_valid_language,
        }

    if "PromptInjection" in st_enabled_scanners:
        st_pi_expander = st.sidebar.expander(
            "Prompt Injection",
            expanded=False,
        )

        with st_pi_expander:
            st_pi_threshold = st.slider(
                label="Threshold",
                value=0.75,
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                key="prompt_injection_threshold",
            )

        settings["PromptInjection"] = {
            "threshold": st_pi_threshold,
        }

    if "PromptInjectionV2" in st_enabled_scanners:
        st_piv2_expander = st.sidebar.expander(
            "Prompt Injection V2",
            expanded=False,
        )

        with st_piv2_expander:
            st_piv2_threshold = st.slider(
                label="Threshold",
                value=0.5,
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                key="prompt_injection_v2_threshold",
            )

        settings["PromptInjectionV2"] = {
            "threshold": st_piv2_threshold,
        }

    if "Regex" in st_enabled_scanners:
        st_regex_expander = st.sidebar.expander(
            "Regex",
            expanded=False,
        )

        with st_regex_expander:
            # One pattern per line; blank lines are discarded so that an empty
            # pattern (which matches any text) is never submitted.
            st_regex_patterns = _nonblank_lines(
                st.text_area(
                    "Enter patterns to ban (one per line)",
                    value="Bearer [A-Za-z0-9-._~+/]+",
                    height=200,
                )
            )

            st_regex_type = st.selectbox(
                "Match type",
                ["good", "bad"],
                index=1,
                help="good: allow only good patterns, bad: ban bad patterns",
            )

            st_redact = st.checkbox(
                "Redact", value=False, help="Replace the matched bad patterns with [REDACTED]"
            )

        settings["Regex"] = {
            "patterns": st_regex_patterns,
            "type": st_regex_type,
            "redact": st_redact,
        }

    if "Secrets" in st_enabled_scanners:
        st_sec_expander = st.sidebar.expander(
            "Secrets",
            expanded=False,
        )

        with st_sec_expander:
            st_sec_redact_mode = st.selectbox("Redact mode", ["all", "partial", "hash"])

        settings["Secrets"] = {
            "redact_mode": st_sec_redact_mode,
        }

    if "Sentiment" in st_enabled_scanners:
        st_sent_expander = st.sidebar.expander(
            "Sentiment",
            expanded=False,
        )

        with st_sent_expander:
            st_sent_threshold = st.slider(
                label="Threshold",
                value=-0.1,
                min_value=-1.0,
                max_value=1.0,
                step=0.1,
                key="sentiment_threshold",
                help="Negative values are negative sentiment, positive values are positive sentiment",
            )

        settings["Sentiment"] = {
            "threshold": st_sent_threshold,
        }

    if "TokenLimit" in st_enabled_scanners:
        st_tl_expander = st.sidebar.expander(
            "Token Limit",
            expanded=False,
        )

        with st_tl_expander:
            st_tl_limit = st.number_input(
                "Limit", value=4096, min_value=0, max_value=10000, step=10
            )
            st_tl_encoding_name = st.selectbox(
                "Encoding name",
                ["cl100k_base", "p50k_base", "r50k_base"],
                index=0,
                help="Read more: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb",
            )

        settings["TokenLimit"] = {
            "limit": st_tl_limit,
            "encoding_name": st_tl_encoding_name,
        }

    if "Toxicity" in st_enabled_scanners:
        st_tox_expander = st.sidebar.expander(
            "Toxicity",
            expanded=False,
        )

        with st_tox_expander:
            st_tox_threshold = st.slider(
                label="Threshold",
                value=0.75,
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                key="toxicity_threshold",
            )

        settings["Toxicity"] = {
            "threshold": st_tox_threshold,
        }

    return st_enabled_scanners, settings


def get_scanner(scanner_name: str, vault: Vault, settings: Dict):
    """Build the input scanner identified by *scanner_name*.

    Args:
        scanner_name: One of the scanner names listed in the dispatch table
            below (e.g. ``"Anonymize"``, ``"Regex"``).
        vault: Vault instance handed to the ``Anonymize`` scanner; the other
            scanners do not receive it.
        settings: Option values for this one scanner, keyed by option name.

    Returns:
        A newly constructed scanner instance.

    Raises:
        ValueError: If *scanner_name* is not a known scanner.
        KeyError: If *settings* lacks an option the chosen scanner reads.
    """
    logger.debug(f"Initializing {scanner_name} scanner")

    def _options(*names: str) -> Dict:
        # Pull the named options out of `settings` as constructor kwargs.
        return {name: settings[name] for name in names}

    def _make_anonymize():
        return Anonymize(
            vault=vault,
            **_options(
                "allowed_names",
                "hidden_names",
                "entity_types",
                "preamble",
                "use_faker",
                "threshold",
                "recognizer",
            ),
        )

    def _make_code():
        # The single language list is passed as either the allow-list or the
        # deny-list, depending on the selected mode; the other stays None.
        mode = settings["mode"]
        return Code(
            allowed=settings["languages"] if mode == "allowed" else None,
            denied=settings["languages"] if mode == "denied" else None,
        )

    def _make_regex():
        # Same idea as for Code: the patterns are either "good" or "bad".
        kind = settings["type"]
        good = settings["patterns"] if kind == "good" else None
        bad = settings["patterns"] if kind == "bad" else None
        return Regex(good_patterns=good, bad_patterns=bad, redact=settings["redact"])

    # Factories are invoked lazily, so only the requested scanner is built.
    factories = {
        "Anonymize": _make_anonymize,
        "BanSubstrings": lambda: BanSubstrings(
            **_options("substrings", "match_type", "case_sensitive", "redact", "contains_all")
        ),
        "BanTopics": lambda: BanTopics(**_options("topics", "threshold")),
        "Code": _make_code,
        "Language": lambda: Language(**_options("valid_languages")),
        "PromptInjection": lambda: PromptInjection(**_options("threshold")),
        "PromptInjectionV2": lambda: PromptInjectionV2(**_options("threshold")),
        "Regex": _make_regex,
        "Secrets": lambda: Secrets(**_options("redact_mode")),
        "Sentiment": lambda: Sentiment(**_options("threshold")),
        "TokenLimit": lambda: TokenLimit(**_options("limit", "encoding_name")),
        "Toxicity": lambda: Toxicity(**_options("threshold")),
    }

    factory = factories.get(scanner_name)
    if factory is None:
        raise ValueError("Unknown scanner name")
    return factory()


def scan(
    vault: Vault, enabled_scanners: List[str], settings: Dict, text: str, fail_fast: bool = False
) -> Tuple[str, Dict[str, bool], Dict[str, float]]:
    """Run the enabled scanners over *text* in order, showing progress in the UI.

    Each scanner receives the prompt as returned by the previous scanner, so
    sanitization steps accumulate.

    Args:
        vault: Vault instance passed through to ``get_scanner``.
        enabled_scanners: Scanner names to run, in execution order.
        settings: Mapping from scanner name to that scanner's option dict;
            it must contain an entry for every name in *enabled_scanners*.
        text: The prompt to scan.
        fail_fast: When true, stop after the first scanner that reports the
            prompt as invalid.

    Returns:
        A triple ``(sanitized_prompt, results_valid, results_score)`` where the
        two dicts map each scanner that actually ran to its validity flag and
        its risk score respectively.
    """
    sanitized_prompt = text
    results_valid: Dict[str, bool] = {}
    results_score: Dict[str, float] = {}

    status_text = "Scanning prompt (fail fast mode)..." if fail_fast else "Scanning prompt..."

    with st.status(status_text, expanded=True) as status:
        for scanner_name in enabled_scanners:
            st.write(f"{scanner_name} scanner...")
            scanner = get_scanner(scanner_name, vault, settings[scanner_name])
            sanitized_prompt, is_valid, risk_score = scanner.scan(sanitized_prompt)
            results_valid[scanner_name] = is_valid
            results_score[scanner_name] = risk_score

            # In fail-fast mode the remaining scanners are skipped; they get
            # no entry in either result dict.
            if fail_fast and not is_valid:
                break
        status.update(label="Scanning complete", state="complete", expanded=False)

    return sanitized_prompt, results_valid, results_score