frankaging committed on
Commit
e3ab52c
·
1 Parent(s): 98bf5cc
Files changed (1) hide show
  1. app.py +11 -19
app.py CHANGED
@@ -25,19 +25,15 @@ def load_jsonl(jsonl_path):
25
  return jsonl_data
26
 
27
  class Steer(pv.SourcelessIntervention):
 
28
  def __init__(self, **kwargs):
29
  super().__init__(**kwargs, keep_last_dim=True)
30
- self.proj = torch.nn.Linear(self.embed_dim, kwargs["latent_dim"], bias=False)
31
-
32
  def forward(self, base, source=None, subspaces=None):
33
- steer_vec = base
34
- if subspaces is not None:
35
- for sp in subspaces:
36
- idx = sp["idx"]
37
- mag = sp["internal_mag"] # scaled by 50
38
- steering_vec = mag * self.proj.weight[idx].unsqueeze(dim=0)
39
- steer_vec = steer_vec + steering_vec
40
- return steer_vec
41
 
42
  # Check GPU
43
  if not torch.cuda.is_available():
@@ -67,13 +63,9 @@ if torch.cuda.is_available():
67
  steer = Steer(embed_dim=params.shape[0], latent_dim=params.shape[1])
68
  steer.proj.weight.data = params.float()
69
 
70
- pv_model = pv.IntervenableModel(
71
- {
72
- "component": f"model.layers[20].output",
73
- "intervention": steer,
74
- },
75
- model=model,
76
- )
77
 
78
  terminators = [tokenizer.eos_token_id] if tokenizer else []
79
 
@@ -171,7 +163,7 @@ with gr.Blocks(css="style.css") as demo:
171
  # Pre-populate with a random concept if available
172
  default_subspaces = []
173
  if pv_model and concept_list:
174
- default_concept = random.choice(concept_list)
175
  default_subspaces = [{
176
  "text": default_concept,
177
  "idx": concept_id_map[default_concept],
@@ -191,7 +183,7 @@ with gr.Blocks(css="style.css") as demo:
191
  )
192
  # Right side: concept management
193
  with gr.Column(scale=3):
194
- gr.Markdown("## Steering Concepts")
195
  search_box = gr.Textbox(
196
  label="Search concepts",
197
  placeholder="e.g. 'time travel'"
 
25
  return jsonl_data
26
 
27
class Steer(pv.SourcelessIntervention):
    """Steer model activations via activation addition.

    Holds one steering vector per latent concept as rows of ``self.proj``;
    ``forward`` adds the selected row, scaled by a magnitude, onto the
    incoming hidden states. The projection weights are overwritten by the
    caller after construction (``steer.proj.weight.data = params``).
    """

    def __init__(self, **kwargs):
        # keep_last_dim=True: intervene over the full hidden dimension.
        super().__init__(**kwargs, keep_last_dim=True)
        # embed_dim -> latent_dim dictionary; each weight row is one
        # concept's steering direction.
        self.proj = torch.nn.Linear(
            self.embed_dim, kwargs["latent_dim"], bias=False)

    def forward(self, base, source=None, subspaces=None):
        """Return ``base`` shifted by the selected steering vector.

        Args:
            base: hidden states to steer.
            source: unused (sourceless intervention).
            subspaces: dict with ``"idx"`` (concept row index) and
                ``"mag"`` (steering magnitude), or None for a no-op.
                NOTE(review): schema assumed from usage here — confirm
                against the Gradio callback that builds it.
        """
        # Fix: the signature advertises subspaces as optional, but the
        # original body unconditionally subscripted it and raised
        # TypeError on None. Pass activations through unchanged instead.
        if subspaces is None:
            return base
        steering_vec = torch.tensor(subspaces["mag"]) * \
            self.proj.weight[subspaces["idx"]].unsqueeze(dim=0)
        return base + steering_vec
 
 
 
 
 
37
 
38
  # Check GPU
39
  if not torch.cuda.is_available():
 
63
  steer = Steer(embed_dim=params.shape[0], latent_dim=params.shape[1])
64
  steer.proj.weight.data = params.float()
65
 
66
# Wrap the base model so the Steer intervention is applied to the
# residual-stream output of layer 20 on every forward pass.
# Fix: dropped the pointless f-prefix on a string with no placeholders
# (ruff F541); the literal value is unchanged.
pv_model = pv.IntervenableModel(
    {
        "component": "model.layers[20].output",
        "intervention": steer,
    },
    model=model,
)
 
 
 
 
69
 
70
  terminators = [tokenizer.eos_token_id] if tokenizer else []
71
 
 
163
  # Pre-populate with a random concept if available
164
  default_subspaces = []
165
  if pv_model and concept_list:
166
+ default_concept = "words related to time travel and its consequences"
167
  default_subspaces = [{
168
  "text": default_concept,
169
  "idx": concept_id_map[default_concept],
 
183
  )
184
  # Right side: concept management
185
  with gr.Column(scale=3):
186
+ gr.Markdown("# Steering Concepts")
187
  search_box = gr.Textbox(
188
  label="Search concepts",
189
  placeholder="e.g. 'time travel'"