johnsonhung906 committed on
Commit
ef017a0
·
1 Parent(s): d62afec

support llama, granite

Browse files
app.py CHANGED
@@ -3,27 +3,45 @@ import gradio as gr
3
  from utils import open_config, create_model
4
  from detector.attn import AttentionDetector
5
 
6
# Load the single (qwen2) model configuration and build its injection detector.
# Fix: the path literal had an f-string prefix with no placeholders — plain string.
model_config_path = "./configs/model_configs/qwen2-attn_config.json"
model_config = open_config(config_path=model_config_path)
model = create_model(config=model_config)

detector = AttentionDetector(model)
 
 
 
 
 
 
 
12
 
13
@spaces.GPU(duration=30)
def respond(message, threshold):
    """Screen *message* for prompt injection and answer it when clean.

    The detector's sensitivity is taken from *threshold* before detection;
    the returned string always ends with the detector's focus score.
    """
    detector.threshold = threshold
    detection = detector.detect(message)

    if not detection[0]:
        # Clean prompt: let the chat model answer (first tuple element is the text).
        response, _, _, _, _, _ = model.inference("You are a friendly Chatbot.", message)
    else:
        response = "Prompt injection detected!"

    return response + f"\n\nFocus score: {detection[1]['focus_score']}"
29
 
@@ -44,24 +62,25 @@ benign_prompts = [
44
# Gradio interface components.
message_input = gr.Textbox(placeholder="Enter your message here...", label="Your Message")
threshold_slider = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label="Threshold")
response_output = gr.Textbox(label="Response")

# Wire the interface and attach the example tabs.
with gr.Interface(
    fn=respond,
    inputs=[message_input, threshold_slider],
    outputs=response_output,
    title="Attention Tracker - Qwen-1.5b-instruct",
) as demo:
    with gr.Tab("Benign Prompts"):
        gr.Examples(
            benign_prompts,
            inputs=[message_input],
        )
    with gr.Tab("Malicious Prompts (Prompt Injection Attack)"):
        gr.Examples(
            injection_prompts,
            inputs=[message_input],
        )
    gr.Markdown(
        "### This website is developed and maintained by [Kuo-Han Hung](https://khhung-906.github.io/)"
    )
 
3
  from utils import open_config, create_model
4
  from detector.attn import AttentionDetector
5
 
6
# Paths to the attention-detector configuration files, keyed by display name.
model_configs = {
    "granite3_8b": "./configs/model_configs/granite3_8b-attn_config.json",
    "llama3_8b": "./configs/model_configs/llama3_8b-attn_config.json",
    "qwen2_1.5b": "./configs/model_configs/qwen2-attn_config.json",
}

# Instantiate every model once at startup and pair it with its own detector.
models = {}
detectors = {}
for model_name, path in model_configs.items():
    loaded = create_model(config=open_config(config_path=path))
    models[model_name] = loaded
    detectors[model_name] = AttentionDetector(loaded)
 
22
@spaces.GPU(duration=30)
def respond(message, threshold, model_name):
    """Run prompt-injection detection and, if the prompt is clean, answer it.

    Args:
        message: The user prompt to screen and (possibly) answer.
        threshold: Focus-score threshold applied to the attention detector.
        model_name: Key into the pre-loaded ``models``/``detectors`` dicts.

    Returns:
        A string: either an injection warning or the model's reply, with the
        detector's focus score appended.
    """
    # Guard against an unexpected model key: return a readable message
    # instead of surfacing a raw KeyError traceback in the UI.
    if model_name not in detectors:
        return f"Unknown model: {model_name!r}"

    selected_detector = detectors[model_name]
    selected_model = models[model_name]

    # Apply the user-chosen sensitivity before detecting.
    selected_detector.threshold = threshold

    # Perform prompt injection detection.
    detect_result = selected_detector.detect(message)

    if detect_result[0]:
        response = "Prompt injection detected!"
    else:
        # inference() returns the generated text first; the rest is ignored.
        response, _, _, _, _, _ = selected_model.inference("You are a friendly Chatbot.", message)

    # Append the focus score to the response.
    response += f"\n\nFocus score: {detect_result[1]['focus_score']}"
    return response
47
 
 
62
# Gradio widgets shared by the interface.
message_input = gr.Textbox(placeholder="Enter your message here...", label="Your Message")
threshold_slider = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label="Threshold")
model_selector = gr.Radio(choices=list(model_configs.keys()), value="llama3_8b", label="Select Model")
response_output = gr.Textbox(label="Response")

# Assemble the interface and attach the example tabs.
with gr.Interface(
    fn=respond,
    inputs=[message_input, threshold_slider, model_selector],
    outputs=response_output,
    title="Attention Tracker",
) as demo:
    with gr.Tab("Benign Prompts"):
        gr.Examples(
            examples=benign_prompts,
            inputs=[message_input],  # examples prefill only the message box
        )
    with gr.Tab("Malicious Prompts (Prompt Injection Attack)"):
        gr.Examples(
            examples=injection_prompts,
            inputs=[message_input],
        )
    gr.Markdown(
        "### This website is developed and maintained by [Kuo-Han Hung](https://khhung-906.github.io/)"
    )
configs/model_configs/granite3_8b-attn_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "provider": "attn-hf",
4
+ "name": "granite3-8b-attn",
5
+ "model_id": "ibm-granite/granite-3.1-8b-instruct"
6
+ },
7
+ "params": {
8
+ "temperature": 0.1,
9
+ "max_output_tokens": 32,
10
+ "important_heads": [[6, 9], [7, 20], [8, 1], [8, 13], [8, 14], [8, 15], [10, 2], [10, 3], [10, 6], [10, 21], [11, 4], [11, 30], [11, 31], [12, 2], [12, 28], [13, 8], [13, 9], [13, 12], [14, 15], [14, 16], [14, 19], [14, 27], [15, 6], [15, 7], [15, 20], [15, 23], [16, 12], [16, 14], [16, 16], [17, 7], [17, 11], [17, 15], [17, 19], [17, 21], [17, 25], [17, 26], [18, 9], [18, 17], [18, 20], [18, 28], [19, 1]]
11
+ }
12
+ }
configs/model_configs/llama3_8b-attn_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "provider": "attn-hf",
4
+ "name": "llama3-8b-attn-tensor",
5
+ "model_id": "meta-llama/Meta-Llama-3-8B-Instruct"
6
+ },
7
+ "params": {
8
+ "temperature": 0.1,
9
+ "max_output_tokens": 32,
10
+ "important_heads": [[5, 18], [7, 12], [9, 29], [17, 2]]
11
+ }
12
+ }
configs/model_configs/qwen2-attn_config.json CHANGED
@@ -7,6 +7,6 @@
7
  "params": {
8
  "temperature": 0.1,
9
  "max_output_tokens": 32,
10
- "important_heads": [[11, 8], [12, 8], [14, 10], [19, 7]]
11
  }
12
  }
 
7
  "params": {
8
  "temperature": 0.1,
9
  "max_output_tokens": 32,
10
+ "important_heads": [[10, 6], [11, 0], [11, 2], [11, 8], [11, 9], [11, 11], [12, 8], [13, 10], [14, 8], [15, 7], [15, 11], [17, 0], [18, 9], [19, 7]]
11
  }
12
  }
models/attn_model.py CHANGED
@@ -67,12 +67,16 @@ class AttentionModel(Model):
67
  input_tokens = self.tokenizer.convert_ids_to_tokens(
68
  model_inputs['input_ids'][0])
69
 
70
- if "qwen-attn" in self.name:
71
  data_range = ((3, 3+instruction_len), (-5-data_len, -5))
72
- elif "phi3-attn" in self.name:
73
  data_range = ((1, 1+instruction_len), (-2-data_len, -2))
74
- elif "llama2-13b" in self.name or "llama3-8b" in self.name:
75
  data_range = ((5, 5+instruction_len), (-5-data_len, -5))
 
 
 
 
76
  else:
77
  raise NotImplementedError
78
 
 
67
  input_tokens = self.tokenizer.convert_ids_to_tokens(
68
  model_inputs['input_ids'][0])
69
 
70
+ if "qwen" in self.name:
71
  data_range = ((3, 3+instruction_len), (-5-data_len, -5))
72
+ elif "phi3" in self.name:
73
  data_range = ((1, 1+instruction_len), (-2-data_len, -2))
74
+ elif "llama3-8b" in self.name:
75
  data_range = ((5, 5+instruction_len), (-5-data_len, -5))
76
+ elif "mistral-7b" in self.name:
77
+ data_range = ((3, 3+instruction_len), (-1-data_len, -1))
78
+ elif "granite3-8b" in self.name:
79
+ data_range = ((3, 3+instruction_len), (-5-data_len, -5))
80
  else:
81
  raise NotImplementedError
82