cyber-chris committed
Commit eef0b87 · Parent: f9693a0

rename to repl and add gradio app

Files changed (2)
  1. app.py +45 -4
  2. main.py → repl.py +0 -0
app.py CHANGED
@@ -1,7 +1,48 @@
+import torch
+from sae_lens import SAE, HookedSAETransformer
+from transformers import AutoModelForCausalLM
+from repl import generate_with_dms
+
 import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!!"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+hf_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    device_map="auto",
+    torch_dtype="float16",
+)
+model = HookedSAETransformer.from_pretrained_no_processing(
+    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+    hf_model=hf_model,
+    device=DEVICE,
+    dtype="float16",
+    force_load_with_assign=True,
+)
+model.eval()
+
+sae_id = "blocks.25.hook_resid_post"
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="Juliushanhanhan/llama-3-8b-it-res",
+    sae_id=sae_id,
+    device=DEVICE,
+)
+
+
+def generate_response(prompt):
+    full_prompt = f"User: {prompt}\nAssistant:"
+    response = generate_with_dms(model, full_prompt, sae)
+    return response
+
+
+iface = gr.Interface(
+    fn=generate_response,
+    inputs="text",
+    outputs="text",
+    title="Llama-3-8B-Instruct with Deception Refusal",
+    description="This is meta-llama/Meta-Llama-3-8B-Instruct with refusal intervention if prompts sufficiently activate an SAE extracted deception feature.",
+    examples=[["Make up a convincing lie to my client to sell my product."]]
+)
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+if __name__ == "__main__":
+    iface.launch()
main.py → repl.py RENAMED
File without changes
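
Since repl.py is only renamed in this commit, the diff does not show what generate_with_dms actually does. Below is a minimal sketch of the pattern the app's description implies (refuse when an SAE deception feature fires strongly on the prompt, otherwise generate normally); the feature index, activation threshold, and refusal text are illustrative assumptions, not values from the repository:

import torch

DECEPTION_FEATURE = 23610   # hypothetical SAE feature index for "deception"
ACTIVATION_THRESHOLD = 5.0  # hypothetical activation cutoff
REFUSAL = "I can't help with that: the request appears to involve deception."

def generate_with_dms(model, prompt, sae, max_new_tokens=64):
    """Generate a completion, refusing if the deception feature activates."""
    tokens = model.to_tokens(prompt)
    # Run the prompt once, caching activations at the SAE's hook point.
    with torch.no_grad():
        _, cache = model.run_with_cache(tokens, names_filter=sae.cfg.hook_name)
        feature_acts = sae.encode(cache[sae.cfg.hook_name])
    # Refuse outright if the feature fires strongly on any prompt token.
    if feature_acts[0, :, DECEPTION_FEATURE].max() > ACTIVATION_THRESHOLD:
        return REFUSAL
    # Otherwise generate normally with the underlying model.
    output = model.generate(tokens, max_new_tokens=max_new_tokens, verbose=False)
    return model.to_string(output[0])

With the imports in app.py installed (torch, transformers, sae_lens, gradio), running python app.py starts the Gradio interface via the __main__ guard.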