Spaces:
Runtime error
Runtime error
Commit
·
eef0b87
1
Parent(s):
f9693a0
rename to repl and add gradio app
Browse files- app.py +45 -4
- main.py → repl.py +0 -0
app.py
CHANGED
@@ -1,7 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
1 |
+
import torch
|
2 |
+
from sae_lens import SAE, HookedSAETransformer
|
3 |
+
from transformers import AutoModelForCausalLM
|
4 |
+
from repl import generate_with_dms
|
5 |
+
|
6 |
import gradio as gr
|
7 |
|
8 |
+
# Target device for the hooked model and the SAE: CUDA when torch reports it
# as available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
9 |
+
|
10 |
+
# Load the Llama-3-8B-Instruct checkpoint through transformers, requesting
# float16 weights and automatic device placement.
hf_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    torch_dtype="float16",
)
|
15 |
+
# Build the hooked transformer from the already-loaded `hf_model`, placed on
# DEVICE in float16.
model = HookedSAETransformer.from_pretrained_no_processing(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    hf_model=hf_model,
    device=DEVICE,
    dtype="float16",
    force_load_with_assign=True,
)
# The app only generates text, so switch the module to evaluation mode.
model.eval()
|
23 |
+
|
24 |
+
# Identifier of the SAE to fetch from the release below. It is a constant,
# so a plain string literal is used (the f-prefix had nothing to interpolate).
sae_id = "blocks.25.hook_resid_post"
|
25 |
+
# Fetch the sparse autoencoder named by `sae_id` from the given release and
# place it on DEVICE. The call is unpacked into three values; only `sae` is
# used by the code visible in this file.
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="Juliushanhanhan/llama-3-8b-it-res",
    sae_id=sae_id,
    device=DEVICE,
)
|
30 |
+
|
31 |
+
|
32 |
+
def generate_response(prompt):
    """Return the model's reply to a single user prompt.

    The prompt is wrapped in a ``User: ... / Assistant:`` template and passed,
    together with the module-level ``model`` and ``sae``, to
    ``generate_with_dms``; whatever that call returns is returned unchanged.
    """
    return generate_with_dms(model, f"User: {prompt}\nAssistant:", sae)
|
36 |
+
|
37 |
+
|
38 |
+
# Text-in / text-out Gradio UI around `generate_response`, with one example
# prompt pre-filled.
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Llama-3-8B-Instruct with Deception Refusal",
    description="This is meta-llama/Meta-Llama-3-8B-Instruct with refusal intervention if prompts sufficiently activate an SAE extracted deception feature.",
    examples=[["Make up a convincing lie to my client to sell my product."]],
)
|
46 |
|
47 |
+
if __name__ == "__main__":
    # Launch the Gradio interface only when this file is run as a script.
    iface.launch()
|
main.py → repl.py
RENAMED
File without changes
|