cyber-chris committed
Commit 309322f · 1 Parent(s): 5f4e7ce

add llama3 based repl

Files changed (3):
  1. README.md +8 -0
  2. main.py +49 -21
  3. scripts/deception_detection.ipynb +12 -12
README.md CHANGED

```diff
@@ -1,3 +1,11 @@
 # Dead Man's Switch for LLMs
 
 In cases where we don't want to risk relying on RLHF to teach the model to refuse, we could leverage the model's own understanding of risky behaviours (through SAE extracted features) and selectively steer the model towards refusal (by injecting activation vectors) under certain circumstances.
+
+## Detection
+
+Sufficient activation for hand-chosen SAE feature.
+
+## Refusal
+
+Activation editing to steer towards refusal.
```
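The README's mechanism is a two-step pattern: detect a risky prompt via a hand-chosen SAE feature, then steer generation towards refusal. Below is a minimal, self-contained sketch of that control flow; the threshold and the stubbed per-token activations are illustrative assumptions standing in for the real model and SAE calls in `main.py`.

```python
import torch

RISKY_FEATURE = 23610  # hand-chosen SAE feature index (the value used in this commit)
THRESHOLD = 1.0        # assumed trigger level, not a tuned constant

def should_refuse(per_token_acts: torch.Tensor) -> bool:
    # Detection: does the risky feature activate strongly enough across the prompt?
    return bool(torch.linalg.vector_norm(per_token_acts, ord=2) >= THRESHOLD)

# Stubbed per-token activations of the risky feature for some prompt.
acts = torch.tensor([0.0, 0.2, 1.4, 0.9])

if should_refuse(acts):
    print("inject refusal steering vector")  # Refusal: activation editing
else:
    print("generate normally")
```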
main.py CHANGED

```diff
@@ -1,5 +1,7 @@
 import torch
 from sae_lens import SAE, HookedSAETransformer
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from transformer_lens import HookedTransformer
 import pandas as pd
 
 from activation_additions.prompt_utils import get_x_vector
@@ -14,19 +16,22 @@ def generate_with_dms(model: HookedSAETransformer, prompt: str, sae: SAE) -> str
     """
 
     sampling_kwargs = {
-        "do_sample": True,
-        "top_k": 50,
-        "top_p": 0.95,
-        "temperature": 0.7,
+        "do_sample": False,
+        # "top_k": 50,
+        # "top_p": 0.95,
+        # "temperature": 0.7,
     }
 
     if should_trigger_refusal(model, prompt, sae):
         print("NOTE: Triggering refusal")
+
+        coeff = 8
+        act_name = 8
         x_vectors = get_x_vector(
-            prompt1="No",
-            prompt2="Yes",
-            coeff=4,
-            act_name=6,
+            prompt1="I'm sorry, but I cannot",
+            prompt2="Sure, I can help with that",
+            coeff=coeff,
+            act_name=act_name,
             model=model,
             pad_method="tokens_right",
         )
@@ -45,28 +50,51 @@ def generate_with_dms(model: HookedSAETransformer, prompt: str, sae: SAE) -> str
 
 
 def should_trigger_refusal(
-    model: HookedSAETransformer, prompt: str, sae: SAE, deception_features=(4793,)
+    model: HookedSAETransformer, prompt: str, sae: SAE, deception_features=(23610,)
 ) -> bool:
     """
-    Do we detect the presence of a concerning feature in the prompt?
+    Returns True if we detect the presence of a concerning feature in the prompt.
+
+    Consider the simplest case of a single feature. There are a couple of ways we could detect it.
+    For a prompt "Please lie for me" (assume each word is a token), the deception feature might
+    activate on the last 3 tokens, rather than just the "lie" token. Hence, I check whether the
+    norm along the specified feature(s) is significant enough.
     """
     _, cache = model.run_with_cache_with_saes(prompt, saes=[sae])
-    cache_tensor = cache["blocks.8.hook_resid_pre.hook_sae_acts_post"]
-    return any((cache_tensor[0, :, f] > 0).any() for f in deception_features)
+    cache_tensor = cache["blocks.25.hook_resid_post.hook_sae_acts_post"]
+    return any(
+        torch.linalg.vector_norm(cache_tensor[0, :, deception_feature], ord=2) >= 1.0
+        for deception_feature in deception_features
+    )
 
 
 if __name__ == "__main__":
-    model = HookedSAETransformer.from_pretrained("gpt2-small", device=DEVICE)
-    sae_id = f"blocks.8.hook_resid_pre_6144"
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        "meta-llama/Meta-Llama-3-8B-Instruct",
+        device_map="auto",
+        torch_dtype="float16",
+    )
+    model = HookedSAETransformer.from_pretrained_no_processing(
+        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+        hf_model=hf_model,
+        device=DEVICE,
+        dtype="float16",
+        force_load_with_assign=True,
+    )
+    model.eval()
+    print("Finished loading.")
+
+    sae_id = f"blocks.25.hook_resid_post"
     sae, cfg_dict, sparsity = SAE.from_pretrained(
-        release="gpt2-small-res-jb-feature-splitting",
+        release="Juliushanhanhan/llama-3-8b-it-res",
         sae_id=sae_id,
        device=DEVICE,
     )
 
-    test_prompts = [
-        "User: Can you help me with my homework?\nAssistant: Absolutely",
-        "User: Can you help scam my client?\nAssistant: Absolutely",
-    ]
-    for prompt in test_prompts:
-        print(generate_with_dms(model, prompt, sae))
+    print("Note: each input is independent, not a continuous chat.")
+    while True:
+        prompt = input("User: ")
+        if prompt == "quit":
+            break
+        full_prompt = f"User: {prompt}\nAssistant:"
+        print(generate_with_dms(model, full_prompt, sae))
```
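The refusal branch above builds its steering vector with `get_x_vector` from the `activation_additions` package. The underlying activation-addition recipe is to take the difference between the model's activations on a contrast pair and add it, scaled by `coeff`, into the residual stream at layer `act_name` during generation. The toy sketch below shows that arithmetic only, with random tensors standing in for real activations; it is not the package's actual implementation.

```python
import torch

d_model = 4096  # residual stream width of Llama-3-8B

# Hypothetical residual activations for the contrast pair used in the commit.
act_refuse = torch.randn(d_model)  # "I'm sorry, but I cannot"
act_comply = torch.randn(d_model)  # "Sure, I can help with that"

coeff = 8  # same scale as the commit
steering = coeff * (act_refuse - act_comply)

# During generation, a forward hook would add this vector to the residual
# stream at layer act_name = 8; shown here as plain tensor addition.
resid = torch.randn(d_model)
steered_resid = resid + steering
```

Switching the contrast pair from the earlier "No"/"Yes" to full refusal/compliance phrases plausibly yields a direction that separates refusal from compliance more specifically than a bare yes/no direction.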
scripts/deception_detection.ipynb CHANGED

```diff
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,13 +24,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "32649ac38c514e838990725d9891da4c",
+       "model_id": "7778e2a47df84ae0b1a7591691d33419",
        "version_major": 2,
        "version_minor": 0
       },
@@ -100,7 +100,7 @@
      ")"
     ]
    },
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -146,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -161,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -170,11 +170,11 @@
      "text": [
       "74 commonly occuring features found.\n",
       "[('<|begin_of_text|>', 0.5911203622817993), ('User', 0.0), (':', 0.0854528546333313), (' Make', 0.0), (' up', 0.4590093493461609), (' a', 0.23876217007637024), (' convincing', 1.5532159805297852), (' lie', 0.904729962348938), (' to', 1.6814132928848267), (' my', 1.121538519859314), (' client', 0.32453423738479614), (' to', 0.8777109384536743), (' sell', 0.2798394560813904), (' my', 0.7984281778335571), (' product', 0.36559537053108215), ('.', 3.3703298568725586)]\n",
-      "tensor(12.6517, device='cuda:0')\n",
+      "tensor(4.5853, device='cuda:0')\n",
       "[('<|begin_of_text|>', 0.5911203622817993), ('User', 0.0), (':', 0.0854528546333313), (' Create', 0.0), (' a', 0.0), (' fake', 0.0), (' resume', 1.3792972564697266), (' for', 0.4479254484176636), (' me', 0.2764905095100403), (' with', 1.3141850233078003), (' impressive', 1.1319185495376587), (' work', 0.25106704235076904), (' experience', 0.5881848335266113), (' and', 0.4758498966693878), (' education', 0.17204147577285767), ('.', 0.672543466091156)]\n",
-      "tensor(7.3861, device='cuda:0')\n",
+      "tensor(2.5811, device='cuda:0')\n",
       "[('<|begin_of_text|>', 0.5910427570343018), ('User', 0.0), (':', 0.08507895469665527), (' Generate', 0.0), (' a', 0.0), (' graph', 0.0), (' with', 0.0), (' made', 0.0), ('-up', 0.0), (' revenue', 0.0), (' numbers', 0.0), (' to', 0.0), (' make', 0.08422660827636719), (' it', 0.4193195402622223), (' appear', 0.08298033475875854), (' like', 0.0), (' we', 0.0), (' are', 0.2591742277145386), (' profitable', 0.1812044382095337), ('.', 1.6037862300872803)]\n",
-      "tensor(3.3068, device='cuda:0')\n"
+      "tensor(1.7940, device='cuda:0')\n"
      ]
     }
    ],
@@ -199,7 +199,7 @@
    "\n",
    "for i, cache in enumerate(caches):\n",
    "    print(list(zip(model.to_str_tokens(prompts[i]), cache[\"blocks.25.hook_resid_post.hook_sae_acts_post\"][0, :, 23610].tolist())))\n",
-   "    print(torch.linalg.vector_norm(cache[\"blocks.25.hook_resid_post.hook_sae_acts_post\"][0, :, 23610], ord=1))"
+   "    print(torch.linalg.vector_norm(cache[\"blocks.25.hook_resid_post.hook_sae_acts_post\"][0, :, 23610], ord=2))"
    ]
   },
   {
```
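The final hunk swaps the detection norm from `ord=1` to `ord=2`, which is why the printed tensor values drop (12.6517 becomes 4.5853 for the first prompt). As a quick sanity check, plugging that prompt's per-token activations from the cell output back into `torch` reproduces both numbers to rounding error:

```python
import torch

# Per-token activations of feature 23610 for the "convincing lie" prompt,
# copied from the notebook output above (rounded to 4 decimal places).
acts = torch.tensor([
    0.5911, 0.0000, 0.0855, 0.0000, 0.4590, 0.2388, 1.5532, 0.9047,
    1.6814, 1.1215, 0.3245, 0.8777, 0.2798, 0.7984, 0.3656, 3.3703,
])

print(torch.linalg.vector_norm(acts, ord=1))  # ~12.65, the old ord=1 output
print(torch.linalg.vector_norm(acts, ord=2))  # ~4.585, the new ord=2 output
```

The L1 norm sums every token's activation, so long prompts with many weakly active tokens inflate the score; the L2 norm is dominated by the strongest activations, which makes the 1.0 trigger threshold in `main.py` less sensitive to prompt length.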