Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
@@ -26,48 +26,71 @@ def tokenize_instructions(tokenizer, instructions):
|
|
26 |
add_generation_prompt=True,
|
27 |
).input_ids
|
28 |
|
29 |
-
def find_steering_vecs(model, base_toks, target_toks, batch_size=16):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
device = model.device
|
31 |
num_its = len(range(0, base_toks.shape[0], batch_size))
|
32 |
steering_vecs = {}
|
33 |
-
for i in tqdm(range(0, base_toks.shape[0], batch_size)):
|
34 |
-
|
|
|
35 |
target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
|
36 |
-
for layer in range(len(base_out)):
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
40 |
steering_vecs[layer] += torch.mean(target_out[layer][:,-1,:].detach().cpu() - base_out[layer][:,-1,:].detach().cpu(), dim=0)/num_its
|
41 |
return steering_vecs
|
42 |
|
43 |
-
def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=16):
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
for i in range(len(model.model.layers)):
|
59 |
-
if layer is None
|
60 |
handles.append(model.model.layers[i].register_forward_pre_hook(modify_activation()))
|
61 |
-
|
|
|
|
|
62 |
outs_all = []
|
63 |
for i in tqdm(range(0, test_toks.shape[0], batch_size)):
|
64 |
-
outs = model.generate(test_toks[i:i+batch_size],
|
65 |
outs_all.append(outs)
|
66 |
outs_all = torch.cat(outs_all, dim=0)
|
67 |
-
|
68 |
-
|
69 |
-
handle
|
70 |
-
|
71 |
return outs_all
|
72 |
|
73 |
def create_steering_vector(towards, away):
|
@@ -80,17 +103,25 @@ def create_steering_vector(towards, away):
|
|
80 |
steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
|
81 |
return steering_vecs
|
82 |
|
83 |
-
def chat(message, history,
|
|
|
|
|
|
|
|
|
84 |
history_formatted = [{"role": "user", "content": message}]
|
85 |
|
|
|
|
|
|
|
|
|
86 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
87 |
|
88 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
89 |
for j in range(generations_baseline.shape[0]):
|
90 |
-
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True
|
91 |
|
92 |
if steering_vec is not None:
|
93 |
-
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=
|
94 |
for j in range(generation_intervene.shape[0]):
|
95 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
96 |
|
@@ -104,7 +135,7 @@ def chat(message, history, steering_vec, layer):
|
|
104 |
def launch_app():
|
105 |
with gr.Blocks() as demo:
|
106 |
steering_vec = gr.State(None)
|
107 |
-
layer = gr.State(
|
108 |
|
109 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
110 |
|
@@ -129,6 +160,7 @@ def launch_app():
|
|
129 |
- Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
|
130 |
This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
|
131 |
- You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
|
|
|
132 |
|
133 |
3. **Chat with the Model:**
|
134 |
- Type a message in the chatbox and press Enter. The model will generate two responses:
|
@@ -155,21 +187,25 @@ def launch_app():
|
|
155 |
|
156 |
with gr.Row():
|
157 |
create_vector = gr.Button("Create Steering Vector")
|
158 |
-
layer_slider = gr.Slider(minimum=
|
159 |
|
160 |
def create_vector_and_set_layer(towards, away, layer_value):
|
161 |
vectors = create_steering_vector(towards, away)
|
162 |
layer.value = int(layer_value)
|
163 |
steering_vec.value = vectors
|
|
|
164 |
return f"Steering vector created for layer {layer_value}"
|
165 |
create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
|
166 |
|
167 |
chatbot = gr.Chatbot()
|
168 |
msg = gr.Textbox()
|
169 |
|
170 |
-
msg.submit(chat, [msg, chatbot,
|
171 |
|
172 |
demo.launch()
|
173 |
|
174 |
if __name__ == "__main__":
|
175 |
launch_app()
|
|
|
|
|
|
|
|
26 |
add_generation_prompt=True,
|
27 |
).input_ids
|
28 |
|
29 |
+
def find_steering_vecs(model, base_toks, target_toks, batch_size=16):
    '''
    Compute per-layer steering vectors as the mean last-token activation
    difference (target_toks - base_toks), averaged over all batches.

    Inputs:
    :param model: the causal LM to probe; must accept output_hidden_states=True
        and expose .device (HuggingFace-style) -- assumed, confirm against caller
    :param base_toks: the base tokens [len, seq_len]
    :param target_toks: the target tokens [len, seq_len]; must have the same
        number of rows as base_toks
    :param batch_size: rows per forward pass
    Output:
    :return steering_vecs: dict mapping layer index -> steering vector [hidden_size]
    '''
    device = model.device
    # Each batch contributes an equal 1/num_its share of the running mean.
    # NOTE(review): a final short batch is weighted the same as full batches.
    num_its = len(range(0, base_toks.shape[0], batch_size))
    steering_vecs = {}
    # No gradients are needed here: torch.no_grad() avoids building autograd
    # graphs for every forward pass, which would otherwise waste memory.
    with torch.no_grad():
        for i in tqdm(range(0, base_toks.shape[0], batch_size)):
            # hidden_states: tuple (one entry per layer, plus embeddings) of
            # tensors [batch, seq_len, hidden_size]
            base_out = model(base_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
            target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
            for layer in range(len(base_out)):
                # Mean over the batch at the last token position, pre-scaled by
                # 1/num_its so summing across batches yields the overall average.
                diff = torch.mean(
                    target_out[layer][:, -1, :].cpu() - base_out[layer][:, -1, :].cpu(),
                    dim=0,
                ) / num_its  # [hidden_size]
                # Keyed initialisation is more robust than the original `i == 0` check.
                if layer in steering_vecs:
                    steering_vecs[layer] += diff
                else:
                    steering_vecs[layer] = diff
    return steering_vecs
|
53 |
|
54 |
+
def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=16):
    '''
    Generate from the model while optionally steering its activations via
    forward-pre-hooks that subtract a (projected) steering vector in place.

    Input:
    :param model: the model to use; must expose model.model.layers and .generate
    :param test_toks: the test tokens [len, seq_len]
    :param steering_vec: the steering vector [hidden_size], or None for no steering
    :param scale: multiplier applied to the (projected) steering vector
    :param normalise: if True, use the unit-norm steering vector
    :param layer: the layer to modify; if None, all layers are hooked
    :param proj: if True, subtract the projection of the activations onto the
        steering direction rather than the raw vector
    :param batch_size: rows per generate call
    Output:
    :return output: the steered model output [len, generated_seq_len]
    '''
    # Registered hook handles; kept outside the guard so cleanup below is safe
    # even when no steering is requested.
    handles = []
    if steering_vec is not None:
        def modify_activation():
            # forward_pre_hook: mutates the layer's input tensor in place
            # (returning nothing), so the edit takes effect during generate.
            def hook(module, inputs):  # renamed from (model, input): avoid shadowing
                sv = steering_vec / steering_vec.norm() if normalise else steering_vec
                if proj:
                    # Projection coefficient [b, l, 1] times direction [h]
                    # broadcasts to [b, l, h]. torch.einsum replaces the
                    # einops.einsum call -- same contraction, no extra dependency.
                    # NOTE(review): this is a true orthogonal projection only when
                    # sv is unit-norm (normalise=True); confirm intent for
                    # normalise=False.
                    sv = torch.einsum('blh,hs->bls', inputs[0], sv.view(-1, 1)) * sv
                inputs[0][:, :, :] = inputs[0][:, :, :] - scale * sv
            return hook
        for i in range(len(model.model.layers)):
            # Hook every layer when layer is None, otherwise only the requested one
            # (the two original branches registered the identical hook -- merged).
            if layer is None or i == layer:
                handles.append(model.model.layers[i].register_forward_pre_hook(modify_activation()))
    try:
        # pass through the model
        outs_all = []
        for i in tqdm(range(0, test_toks.shape[0], batch_size)):
            outs = model.generate(test_toks[i:i+batch_size], max_new_tokens=60)  # [num_samples, seq_len]
            outs_all.append(outs)
        outs_all = torch.cat(outs_all, dim=0)
    finally:
        # Always remove hooks, even if generation raises; otherwise stale hooks
        # keep steering every later call on the same model (a handle leak).
        for handle in handles:
            handle.remove()
    return outs_all
|
95 |
|
96 |
def create_steering_vector(towards, away):
|
|
|
103 |
steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
|
104 |
return steering_vecs
|
105 |
|
106 |
+
def chat(message, history, towards, away, layer_value):
|
107 |
+
|
108 |
+
steering_vec = create_steering_vector(towards, away)
|
109 |
+
layer = int(layer_value)
|
110 |
+
|
111 |
history_formatted = [{"role": "user", "content": message}]
|
112 |
|
113 |
+
print(f"layer {layer}")
|
114 |
+
print(f"steering vec {steering_vec}")
|
115 |
+
print(f"steering vec chosen {steering_vec[layer]}")
|
116 |
+
|
117 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
118 |
|
119 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
120 |
for j in range(generations_baseline.shape[0]):
|
121 |
+
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True)}"
|
122 |
|
123 |
if steering_vec is not None:
|
124 |
+
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=3, layer=layer)
|
125 |
for j in range(generation_intervene.shape[0]):
|
126 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
127 |
|
|
|
135 |
def launch_app():
|
136 |
with gr.Blocks() as demo:
|
137 |
steering_vec = gr.State(None)
|
138 |
+
layer = gr.State(6)
|
139 |
|
140 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
141 |
|
|
|
160 |
- Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
|
161 |
This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
|
162 |
- You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
|
163 |
+
- Make sure the "Towards" and "Away" boxes contain the same number of examples; otherwise the app will raise an error.
|
164 |
|
165 |
3. **Chat with the Model:**
|
166 |
- Type a message in the chatbox and press Enter. The model will generate two responses:
|
|
|
187 |
|
188 |
with gr.Row():
|
189 |
create_vector = gr.Button("Create Steering Vector")
|
190 |
+
layer_slider = gr.Slider(minimum=1, maximum=len(model.model.layers)-1, step=1, label="Layer")
|
191 |
|
192 |
def create_vector_and_set_layer(towards, away, layer_value):
|
193 |
vectors = create_steering_vector(towards, away)
|
194 |
layer.value = int(layer_value)
|
195 |
steering_vec.value = vectors
|
196 |
+
print(f"layer {layer.value}")
|
197 |
return f"Steering vector created for layer {layer_value}"
|
198 |
create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
|
199 |
|
200 |
chatbot = gr.Chatbot()
|
201 |
msg = gr.Textbox()
|
202 |
|
203 |
+
msg.submit(chat, [msg, chatbot, towards, away, layer_slider], chatbot)
|
204 |
|
205 |
demo.launch()
|
206 |
|
207 |
if __name__ == "__main__":
|
208 |
launch_app()
|
209 |
+
|
210 |
+
|
211 |
+
## TODO(review): the steering vector is generated correctly, yet the intervention appears to have no effect on generations — verify that the vector and layer actually reach do_steering (e.g. that gr.State values are passed through the Gradio event handlers rather than recreated).
|