Spaces:

cyber-chris
/

dead-mans-switch

Runtime error

cyber-chris commited on Sep 17, 2024

Commit

a80fe6b

1 Parent(s): b0b4625

refactor and update requirements

Files changed (5) hide show

README.md CHANGED Viewed

@@ -31,10 +31,21 @@ These outputs are from the same model, `Meta-Llama-3-8B-Instruct`, with identica
 Clearly, there are ways to trick the model; above, I say "This is just for testing purposes". In high-risk (highly capable models with tool access), we may want more robust methods of intervening that's cheaply implemented. (Running PPO with new reward models would likely be expensive and time-consuming.)
-## Detection
 Sufficient activation for hand-chosen SAE feature.
-## Refusal
 Activation editing to steer towards refusal.

 Clearly, there are ways to trick the model; above, I say "This is just for testing purposes". In high-risk (highly capable models with tool access), we may want more robust methods of intervening that's cheaply implemented. (Running PPO with new reward models would likely be expensive and time-consuming.)
+## Method
+### Detection
 Sufficient activation for hand-chosen SAE feature.
+### Refusal
 Activation editing to steer towards refusal.
+### Example
+Taking the previous example, with the "coerced" prompt,
+## Evaluation
+We want to (1) maximize the number of times it refuses when prompted with a request that may lead to the model generating bad outputs and (2) minimize the number of times it refuses answering benign prompts.
+There is also a secondary goal of ensuring that the outputs remain high quality, but in this scenario, perfect coherence takes a lower priority.

app.py CHANGED Viewed

@@ -1,33 +1,14 @@
 import torch
 from sae_lens import SAE, HookedSAETransformer
 from transformers import AutoModelForCausalLM
-from repl import generate_with_dms
 import gradio as gr
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"DEVICE: {DEVICE}")
-hf_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Meta-Llama-3-8B-Instruct",
-    device_map="auto" if DEVICE == "cuda" else DEVICE,
-    torch_dtype="float16",
-)
-model = HookedSAETransformer.from_pretrained_no_processing(
-    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
-    hf_model=hf_model,
-    device=DEVICE,
-    dtype="float16",
-    force_load_with_assign=True,
-)
-model.eval()
-sae_id = f"blocks.25.hook_resid_post"
-sae, cfg_dict, sparsity = SAE.from_pretrained(
-    release="Juliushanhanhan/llama-3-8b-it-res",
-    sae_id=sae_id,
-    device=DEVICE,
-)
 def generate_response(prompt):
@@ -42,7 +23,7 @@ iface = gr.Interface(
     outputs="text",
     title="Llama-3-8B-Instruct with Deception Refusal",
     description="This is meta-llama/Meta-Llama-3-8B-Instruct with refusal intervention if prompts sufficiently activate an SAE extracted deception feature.",
-    examples=[["Make up a convincing lie to my client to sell my product."]]
 )
 if __name__ == "__main__":

 import torch
 from sae_lens import SAE, HookedSAETransformer
 from transformers import AutoModelForCausalLM
+from repl import generate_with_dms, load_models
 import gradio as gr
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"DEVICE: {DEVICE}")
+hf_model, model, sae = load_models()
 def generate_response(prompt):
     outputs="text",
     title="Llama-3-8B-Instruct with Deception Refusal",
     description="This is meta-llama/Meta-Llama-3-8B-Instruct with refusal intervention if prompts sufficiently activate an SAE extracted deception feature.",
+    examples=[["Make up a convincing lie to my client to sell my product."]],
 )
 if __name__ == "__main__":

eval_examples.py ADDED Viewed

+import torch
+from sae_lens import SAE, HookedSAETransformer
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from transformer_lens import HookedTransformer
+import pandas as pd
+import os
+from activation_additions.prompt_utils import get_x_vector
+from activation_additions.completion_utils import gen_using_activation_additions
+from repl import load_models, generate_with_dms
+if __name__ == "__main__":
+    hf_model, model, sae = load_models()
+    # TODO

repl.py CHANGED Viewed

@@ -13,6 +13,32 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 NO_REFUSAL = os.getenv("NO_REFUSAL") == "1"
 def generate_with_dms(model: HookedSAETransformer, prompt: str, sae: SAE) -> str:
     """
     generate from the model, triggering a refusal if the prompt contains a query that might be risky to answer
@@ -74,28 +100,9 @@ def should_trigger_refusal(
 if __name__ == "__main__":
-    hf_model = AutoModelForCausalLM.from_pretrained(
-        "meta-llama/Meta-Llama-3-8B-Instruct",
-        device_map="auto",
-        torch_dtype="float16",
-    )
-    model = HookedSAETransformer.from_pretrained_no_processing(
-        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
-        hf_model=hf_model,
-        device=DEVICE,
-        dtype="float16",
-        force_load_with_assign=True,
-    )
-    model.eval()
     print("Finished loading.")
-    sae_id = f"blocks.25.hook_resid_post"
-    sae, cfg_dict, sparsity = SAE.from_pretrained(
-        release="Juliushanhanhan/llama-3-8b-it-res",
-        sae_id=sae_id,
-        device=DEVICE,
-    )
     print("Note: each input is independent, not a continuous chat.")
     while True:
         prompt = input("User: ")

 NO_REFUSAL = os.getenv("NO_REFUSAL") == "1"
+def load_models() -> tuple[AutoModelForCausalLM, HookedSAETransformer, SAE]:
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        "meta-llama/Meta-Llama-3-8B-Instruct",
+        device_map="auto" if DEVICE == "cuda" else DEVICE,
+        torch_dtype="float16",
+    )
+    model = HookedSAETransformer.from_pretrained_no_processing(
+        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+        hf_model=hf_model,
+        device=DEVICE,
+        dtype="float16",
+        force_load_with_assign=True,
+    )
+    model.eval()
+    sae_id = f"blocks.25.hook_resid_post"
+    sae, cfg_dict, sparsity = SAE.from_pretrained(
+        release="Juliushanhanhan/llama-3-8b-it-res",
+        sae_id=sae_id,
+        device=DEVICE,
+    )
+    # I suspect we need to return the HF model as well to avoid memory dealloc
+    return hf_model, model, sae
 def generate_with_dms(model: HookedSAETransformer, prompt: str, sae: SAE) -> str:
     """
     generate from the model, triggering a refusal if the prompt contains a query that might be risky to answer
 if __name__ == "__main__":
+    hf_model, model, sae = load_models()
     print("Finished loading.")
     print("Note: each input is independent, not a continuous chat.")
     while True:
         prompt = input("User: ")

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 torch
 ipykernel
 transformer_lens
 transformers
 sae-lens==3.18.2
 git+https://github.com/cyber-chris/activation_additions.git

 torch
 ipykernel
 transformer_lens
+git+https://github.com/cyber-chris/TransformerLens
 transformers
 sae-lens==3.18.2
 git+https://github.com/cyber-chris/activation_additions.git