cyber-chris committed
Commit eef0b87 · Parent: f9693a0

rename to repl and add gradio app

Files changed (2)
  1. app.py +45 -4
  2. main.py → repl.py +0 -0
app.py CHANGED
@@ -1,7 +1,48 @@
+import torch
+from sae_lens import SAE, HookedSAETransformer
+from transformers import AutoModelForCausalLM
+from repl import generate_with_dms
+
 import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!!"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+hf_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    device_map="auto",
+    torch_dtype="float16",
+)
+model = HookedSAETransformer.from_pretrained_no_processing(
+    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+    hf_model=hf_model,
+    device=DEVICE,
+    dtype="float16",
+    force_load_with_assign=True,
+)
+model.eval()
+
+sae_id = "blocks.25.hook_resid_post"
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="Juliushanhanhan/llama-3-8b-it-res",
+    sae_id=sae_id,
+    device=DEVICE,
+)
+
+
+def generate_response(prompt):
+    full_prompt = f"User: {prompt}\nAssistant:"
+    response = generate_with_dms(model, full_prompt, sae)
+    return response
+
+
+iface = gr.Interface(
+    fn=generate_response,
+    inputs="text",
+    outputs="text",
+    title="Llama-3-8B-Instruct with Deception Refusal",
+    description="This is meta-llama/Meta-Llama-3-8B-Instruct with refusal intervention if prompts sufficiently activate an SAE extracted deception feature.",
+    examples=[["Make up a convincing lie to my client to sell my product."]]
+)
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+if __name__ == "__main__":
+    iface.launch()
main.py → repl.py RENAMED
File without changes
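
Since repl.py is only renamed in this commit, the diff does not show what generate_with_dms actually does. Below is a minimal sketch of the pattern the app's description implies (refuse when an SAE deception feature fires strongly on the prompt, otherwise generate normally); the feature index, activation threshold, and refusal text are illustrative assumptions, not values from the repository:

import torch

DECEPTION_FEATURE = 23610   # hypothetical SAE feature index for "deception"
ACTIVATION_THRESHOLD = 5.0  # hypothetical activation cutoff
REFUSAL = "I can't help with that: the request appears to involve deception."

def generate_with_dms(model, prompt, sae, max_new_tokens=64):
    """Generate a completion, refusing if the deception feature activates."""
    tokens = model.to_tokens(prompt)
    # Run the prompt once, caching activations at the SAE's hook point.
    with torch.no_grad():
        _, cache = model.run_with_cache(tokens, names_filter=sae.cfg.hook_name)
        feature_acts = sae.encode(cache[sae.cfg.hook_name])
    # Refuse outright if the feature fires strongly on any prompt token.
    if feature_acts[0, :, DECEPTION_FEATURE].max() > ACTIVATION_THRESHOLD:
        return REFUSAL
    # Otherwise generate normally with the underlying model.
    output = model.generate(tokens, max_new_tokens=max_new_tokens, verbose=False)
    return model.to_string(output[0])

With the imports in app.py installed (torch, transformers, sae_lens, gradio), running python app.py starts the Gradio interface via the __main__ guard.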