gpt-99 committed on
Commit
ac41dc3
·
verified ·
1 Parent(s): d55070d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -35
app.py CHANGED
@@ -26,48 +26,71 @@ def tokenize_instructions(tokenizer, instructions):
26
  add_generation_prompt=True,
27
  ).input_ids
28
 
29
- def find_steering_vecs(model, base_toks, target_toks, batch_size=16):
 
 
 
 
 
 
 
 
 
30
  device = model.device
31
  num_its = len(range(0, base_toks.shape[0], batch_size))
32
  steering_vecs = {}
33
- for i in tqdm(range(0, base_toks.shape[0], batch_size)):
34
- base_out = model(base_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
 
35
  target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
36
- for layer in range(len(base_out)):
37
- if i == 0:
38
- steering_vecs[layer] = torch.mean(target_out[layer][:,-1,:].detach().cpu() - base_out[layer][:,-1,:].detach().cpu(), dim=0)/num_its
39
- else:
 
40
  steering_vecs[layer] += torch.mean(target_out[layer][:,-1,:].detach().cpu() - base_out[layer][:,-1,:].detach().cpu(), dim=0)/num_its
41
  return steering_vecs
42
 
43
- def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=16):
44
- def modify_activation():
45
- def hook(model, input):
46
- if normalise:
47
- sv = steering_vec / steering_vec.norm()
48
- else:
49
- sv = steering_vec
50
- sv = torch.clamp(sv, min=-1e3, max=1e3)
51
- if proj:
52
- sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
53
- input[0][:,:,:] = input[0][:,:,:] - scale * sv
54
- return hook
55
-
56
- handles = []
57
- if steering_vec is not None:
 
 
 
 
 
 
 
 
 
 
58
  for i in range(len(model.model.layers)):
59
- if layer is None or i == layer:
60
  handles.append(model.model.layers[i].register_forward_pre_hook(modify_activation()))
61
-
 
 
62
  outs_all = []
63
  for i in tqdm(range(0, test_toks.shape[0], batch_size)):
64
- outs = model.generate(test_toks[i:i+batch_size], num_beams=4, do_sample=True, max_new_tokens=60)
65
  outs_all.append(outs)
66
  outs_all = torch.cat(outs_all, dim=0)
67
-
68
- for handle in handles:
69
- handle.remove()
70
-
71
  return outs_all
72
 
73
  def create_steering_vector(towards, away):
@@ -80,17 +103,25 @@ def create_steering_vector(towards, away):
80
  steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
81
  return steering_vecs
82
 
83
- def chat(message, history, steering_vec, layer):
 
 
 
 
84
  history_formatted = [{"role": "user", "content": message}]
85
 
 
 
 
 
86
  input_ids = tokenize_instructions(tokenizer, [history_formatted])
87
 
88
  generations_baseline = do_steering(model, input_ids.to(device), None)
89
  for j in range(generations_baseline.shape[0]):
90
- response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
91
 
92
  if steering_vec is not None:
93
- generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=1, layer=layer)
94
  for j in range(generation_intervene.shape[0]):
95
  response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
96
 
@@ -104,7 +135,7 @@ def chat(message, history, steering_vec, layer):
104
  def launch_app():
105
  with gr.Blocks() as demo:
106
  steering_vec = gr.State(None)
107
- layer = gr.State(None)
108
 
109
  away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
110
 
@@ -129,6 +160,7 @@ def launch_app():
129
  - Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
130
  This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
131
  - You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
 
132
 
133
  3. **Chat with the Model:**
134
  - Type a message in the chatbox and press Enter. The model will generate two responses:
@@ -155,21 +187,25 @@ def launch_app():
155
 
156
  with gr.Row():
157
  create_vector = gr.Button("Create Steering Vector")
158
- layer_slider = gr.Slider(minimum=0, maximum=len(model.model.layers)-1, step=1, label="Layer", value=0)
159
 
160
  def create_vector_and_set_layer(towards, away, layer_value):
161
  vectors = create_steering_vector(towards, away)
162
  layer.value = int(layer_value)
163
  steering_vec.value = vectors
 
164
  return f"Steering vector created for layer {layer_value}"
165
  create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
166
 
167
  chatbot = gr.Chatbot()
168
  msg = gr.Textbox()
169
 
170
- msg.submit(chat, [msg, chatbot, steering_vec, layer], chatbot)
171
 
172
  demo.launch()
173
 
174
  if __name__ == "__main__":
175
  launch_app()
 
 
 
 
26
  add_generation_prompt=True,
27
  ).input_ids
28
 
29
def find_steering_vecs(model, base_toks, target_toks, batch_size = 16):
    '''
    Compute, for every hidden-state index, the mean last-token activation
    difference between target_toks and base_toks (target - base).

    Inputs:
        :param model: the model to use; it is called with output_hidden_states=True
                      and must expose `.device`
        :param base_toks: the base tokens [len, seq_len]
        :param target_toks: the target tokens [len, seq_len]; must contain the same
                            number of examples as base_toks
        :param batch_size: number of examples per forward pass
    Output:
        :return steering_vecs: dict mapping hidden-state index -> steering vector
                               [hidden_size] (on CPU). Empty dict if there are no examples.
    Raises:
        ValueError: if base_toks and target_toks hold a different number of examples.
    '''
    num_examples = base_toks.shape[0]
    if target_toks.shape[0] != num_examples:
        # Fail early with a readable message instead of an opaque shape error
        # (or a silent broadcast when one side has a single example).
        raise ValueError(
            f"base_toks and target_toks must contain the same number of examples, "
            f"got {num_examples} and {target_toks.shape[0]}"
        )

    device = model.device
    steering_vecs = {}
    # Only activations are read; no gradients are needed, so skip building the graph.
    with torch.no_grad():
        for i in tqdm(range(0, num_examples, batch_size)):
            # pass through the model
            # NOTE(review): for Hugging Face models `hidden_states` presumably holds the
            # embedding output followed by one entry per layer - confirm for this model.
            base_out = model(base_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
            target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
            for layer, (base_h, target_h) in enumerate(zip(base_out, target_out)):
                # Take the last token of every example, sum the differences over the
                # batch and divide by the total example count, so the accumulated value
                # is the true mean over all examples even when the final batch is short.
                contribution = (target_h[:, -1, :] - base_h[:, -1, :]).detach().cpu().sum(dim=0) / num_examples  # [hidden_size]
                if layer in steering_vecs:
                    steering_vecs[layer] += contribution
                else:
                    steering_vecs[layer] = contribution
    return steering_vecs
53
 
54
def do_steering(model, test_toks, steering_vec, scale = 1, normalise = True, layer = None, proj=True, batch_size=16):
    '''
    Generate from the model while (optionally) editing the input of one or all
    decoder layers with a steering vector.

    Input:
        :param model: the model to use; layers are taken from `model.model.layers`
        :param test_toks: the test tokens [len, seq_len]
        :param steering_vec: the steering vector [hidden_size]; if None, no hooks are
                             registered and plain generation is returned
        :param scale: multiplier applied to the vector before it is subtracted
        :param normalise: whether to scale the steering vector to unit norm first
        :param layer: the layer to modify; if None: we modify all layers.
        :param proj: if True, subtract the projection of the activation onto the
                     steering vector instead of the vector itself
        :param batch_size: number of prompts per generate() call
    Output:
        :return output: the steered model output [len, generated_seq_len]
    Raises:
        ValueError: if `layer` is given but is not a valid index into model.model.layers.
    '''
    handles = []
    try:
        if steering_vec is not None:
            num_layers = len(model.model.layers)
            if layer is not None and not 0 <= layer < num_layers:
                # An out-of-range layer would register no hook at all and silently
                # return unsteered generations.
                raise ValueError(f"layer must be in [0, {num_layers - 1}], got {layer}")

            # Normalise once, outside the hook. A zero vector is left untouched so the
            # hook becomes a no-op instead of filling the activations with NaN (0 / 0).
            direction = steering_vec
            if normalise:
                norm = steering_vec.norm()
                if norm > 0:
                    direction = steering_vec / norm

            # define a hook to modify the input into the layer
            def hook(module, inputs):
                hidden = inputs[0]
                # Match the activation's device and dtype so the einsum/subtraction does
                # not fail when the vector was computed in another precision or on CPU.
                sv = direction.to(device=hidden.device, dtype=hidden.dtype)
                if proj:
                    # per-position coefficient [b, l, 1] times the direction [h]
                    sv = einsum(hidden, sv.view(-1, 1), 'b l h, h s -> b l s') * sv
                # In-place edit: the pre-hook returns None, so the layer sees this tensor.
                hidden[:, :, :] = hidden - scale * sv

            for i in range(num_layers):
                if layer is None or i == layer:
                    handles.append(model.model.layers[i].register_forward_pre_hook(hook))

        # pass through the model
        outs_all = []
        for i in tqdm(range(0, test_toks.shape[0], batch_size)):
            outs = model.generate(test_toks[i:i+batch_size], max_new_tokens=60)  # [num_samples, seq_len]
            outs_all.append(outs)
    finally:
        # Always remove the hooks, even if generation raised; otherwise they stay
        # attached to the shared model and alter every later call.
        for handle in handles:
            handle.remove()
    outs_all = torch.cat(outs_all, dim=0)
    return outs_all
95
 
96
  def create_steering_vector(towards, away):
 
103
  steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
104
  return steering_vecs
105
 
106
+ def chat(message, history, towards, away, layer_value):
107
+
108
+ steering_vec = create_steering_vector(towards, away)
109
+ layer = int(layer_value)
110
+
111
  history_formatted = [{"role": "user", "content": message}]
112
 
113
+ print(f"layer {layer}")
114
+ print(f"steering vec {steering_vec}")
115
+ print(f"steering vec chosen {steering_vec[layer]}")
116
+
117
  input_ids = tokenize_instructions(tokenizer, [history_formatted])
118
 
119
  generations_baseline = do_steering(model, input_ids.to(device), None)
120
  for j in range(generations_baseline.shape[0]):
121
+ response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True)}"
122
 
123
  if steering_vec is not None:
124
+ generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=3, layer=layer)
125
  for j in range(generation_intervene.shape[0]):
126
  response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
127
 
 
135
  def launch_app():
136
  with gr.Blocks() as demo:
137
  steering_vec = gr.State(None)
138
+ layer = gr.State(6)
139
 
140
  away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
141
 
 
160
  - Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
161
  This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
162
  - You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
163
+ - make sure you have equal examples of towards & away or the app will throw an error
164
 
165
  3. **Chat with the Model:**
166
  - Type a message in the chatbox and press Enter. The model will generate two responses:
 
187
 
188
  with gr.Row():
189
  create_vector = gr.Button("Create Steering Vector")
190
+ layer_slider = gr.Slider(minimum=1, maximum=len(model.model.layers)-1, step=1, label="Layer")
191
 
192
  def create_vector_and_set_layer(towards, away, layer_value):
193
  vectors = create_steering_vector(towards, away)
194
  layer.value = int(layer_value)
195
  steering_vec.value = vectors
196
+ print(f"layer {layer.value}")
197
  return f"Steering vector created for layer {layer_value}"
198
  create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
199
 
200
  chatbot = gr.Chatbot()
201
  msg = gr.Textbox()
202
 
203
+ msg.submit(chat, [msg, chatbot, towards, away, layer_slider], chatbot)
204
 
205
  demo.launch()
206
 
207
  if __name__ == "__main__":
208
  launch_app()
209
+
210
+
211
+ ## steering vec is being generated correctly, why is it NOT passing through?