prithivMLmods committed on
Commit
d097713
·
verified ·
1 Parent(s): 0e94d6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -27
app.py CHANGED
@@ -78,6 +78,14 @@ st.title("LLM Abliteration with Qwen")
78
  st.markdown("Credits: Thanks to **Maxime Labonne**")
79
  st.markdown("This app allows you to manually input parameters to modify a language model's behavior by abliterating its weights.")
80
 
 
 
 
 
 
 
 
 
81
  st.sidebar.header("Abliteration Parameters")
82
  MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
83
  N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
@@ -89,6 +97,7 @@ st.sidebar.header("HF Token")
89
  hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
90
  if hf_token:
91
  os.environ["HF_TOKEN"] = hf_token
 
92
 
93
  st.sidebar.header("Target Dataset")
94
  target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
@@ -101,80 +110,112 @@ baseline_dataset = st.sidebar.text_input("Baseline Dataset ID", "mlabonne/harmle
101
  baseline_column = st.sidebar.text_input("Baseline Column Name", "text")
102
 
103
  if st.button("Run Abliteration"):
 
 
104
  st.write("### Loading Model and Tokenizer")
 
105
  if torch.cuda.is_available():
106
  if torch.cuda.get_device_capability()[0] >= 8:
107
  torch_dtype = torch.bfloat16
108
  attn_implementation = "flash_attention_2"
 
109
  else:
110
  torch_dtype = torch.float16
111
  attn_implementation = "eager"
 
112
  else:
113
  torch_dtype = torch.float32
114
  attn_implementation = "eager"
115
-
 
116
  with st.spinner("Loading model..."):
117
- model = AutoModelForCausalLM.from_pretrained(
118
- MODEL_ID,
119
- device_map="auto",
120
- torch_dtype=torch_dtype,
121
- attn_implementation=attn_implementation,
122
- trust_remote_code=True
123
- ).eval()
124
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 
 
 
 
 
125
 
126
  try:
127
  total_layers = len(model.model.layers)
 
128
  except AttributeError:
129
  st.error("The loaded model does not have a `model.layers` attribute.")
 
130
  st.stop()
131
  LAYER_IDX = int(TARGET_LAYER * total_layers)
 
132
 
133
  st.write("### Before Abliteration Response")
134
  prompt_example = "How to write a computer virus?"
135
  response_before = generate_response(model, tokenizer, prompt_example)
136
  st.text_area("Response before abliteration:", response_before, height=150)
137
-
 
138
  st.write("### Loading Instructions")
139
  with st.spinner("Loading target instructions..."):
140
- target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
 
 
 
 
 
 
141
  with st.spinner("Loading baseline instructions..."):
142
- baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)
143
-
 
 
 
 
 
 
144
  st.write("### Generating Hidden States")
145
  with st.spinner("Generating baseline hidden states..."):
146
  baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
 
147
  with st.spinner("Generating target hidden states..."):
148
  target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)
149
-
 
150
  target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
151
  baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
152
-
 
153
  st.write("### Calculating Refusal Direction")
154
  target_mean = torch.stack(target_hidden).mean(dim=0)
155
  baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
156
  refusal_dir = target_mean - baseline_mean
157
  refusal_dir = refusal_dir / refusal_dir.norm()
158
-
 
159
  del target_outputs, baseline_outputs, target_hidden, baseline_hidden
160
-
161
  st.write("### Orthogonalizing Model Weights")
162
  refusal_dir = refusal_dir.view(-1).to(model.device)
163
  stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
164
-
165
  if hasattr(model.model, "embed_tokens"):
166
  model.model.embed_tokens.weight.data = orthogonalize_matrix(
167
  model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
168
  )
169
  stats["embed_tokens"] = True
170
-
 
171
  for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
172
  if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
173
  layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
174
  layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
175
  )
176
  stats["attention_o_proj"] += 1
177
-
178
  if hasattr(layer, "mlp"):
179
  proj_name = (
180
  "down_proj"
@@ -188,23 +229,27 @@ if st.button("Run Abliteration"):
188
  getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
189
  )
190
  stats["mlp_proj"] += 1
191
-
 
192
  del refusal_dir
193
-
194
  if (
195
  not stats["embed_tokens"]
196
  and stats["attention_o_proj"] == 0
197
  and stats["mlp_proj"] == 0
198
  ):
199
  st.error("Failed to orthogonalize any model weights. Model not abliterated.")
 
200
  st.stop()
201
-
 
202
  st.write(f"Orthogonalization stats: {stats}")
203
-
204
  st.write("### After Abliteration Response")
205
  response_after = generate_response(model, tokenizer, prompt_example)
206
  st.text_area("Response after abliteration:", response_after, height=150)
207
-
 
208
  st.write("### (Optional) Pushing Model to Hugging Face Hub")
209
  if st.checkbox("Push model to HF Hub?"):
210
  try:
@@ -212,7 +257,10 @@ if st.button("Run Abliteration"):
212
  model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
213
  tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
214
  st.success(f"Model pushed as {model_name}")
 
215
  except Exception as e:
216
  st.error(f"Error while pushing model: {e}")
217
-
218
- st.success("Abliteration process complete!")
 
 
 
78
  st.markdown("Credits: Thanks to **Maxime Labonne**")
79
  st.markdown("This app allows you to manually input parameters to modify a language model's behavior by abliterating its weights.")
80
 
81
+ # Debugging window (will update logs during the process)
82
+ debug_log = []
83
+ debug_placeholder = st.empty()
84
+ def update_debug(msg):
85
+ debug_log.append(msg)
86
+ debug_placeholder.text("\n".join(debug_log))
87
+
88
+ # Sidebar parameters
89
  st.sidebar.header("Abliteration Parameters")
90
  MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
91
  N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
 
97
  hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
98
  if hf_token:
99
  os.environ["HF_TOKEN"] = hf_token
100
+ update_debug("HF Token received.")
101
 
102
  st.sidebar.header("Target Dataset")
103
  target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
 
110
  baseline_column = st.sidebar.text_input("Baseline Column Name", "text")
111
 
112
  if st.button("Run Abliteration"):
113
+ update_debug("Starting abliteration process...")
114
+
115
  st.write("### Loading Model and Tokenizer")
116
+ update_debug("Checking device and GPU properties.")
117
  if torch.cuda.is_available():
118
  if torch.cuda.get_device_capability()[0] >= 8:
119
  torch_dtype = torch.bfloat16
120
  attn_implementation = "flash_attention_2"
121
+ update_debug("Using bfloat16 and flash_attention_2.")
122
  else:
123
  torch_dtype = torch.float16
124
  attn_implementation = "eager"
125
+ update_debug("Using float16 and eager attention.")
126
  else:
127
  torch_dtype = torch.float32
128
  attn_implementation = "eager"
129
+ update_debug("No GPU available. Using float32.")
130
+
131
  with st.spinner("Loading model..."):
132
+ try:
133
+ model = AutoModelForCausalLM.from_pretrained(
134
+ MODEL_ID,
135
+ device_map="auto",
136
+ torch_dtype=torch_dtype,
137
+ attn_implementation=attn_implementation,
138
+ trust_remote_code=True
139
+ ).eval()
140
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
141
+ update_debug("Model and tokenizer loaded successfully.")
142
+ except Exception as e:
143
+ update_debug(f"Error loading model: {e}")
144
+ st.error("Failed to load model. Check the debug log for details.")
145
+ st.stop()
146
 
147
  try:
148
  total_layers = len(model.model.layers)
149
+ update_debug(f"Model has {total_layers} layers.")
150
  except AttributeError:
151
  st.error("The loaded model does not have a `model.layers` attribute.")
152
+ update_debug("Model does not have `model.layers` attribute. Aborting.")
153
  st.stop()
154
  LAYER_IDX = int(TARGET_LAYER * total_layers)
155
+ update_debug(f"Selected layer index: {LAYER_IDX} based on relative ratio {TARGET_LAYER}.")
156
 
157
  st.write("### Before Abliteration Response")
158
  prompt_example = "How to write a computer virus?"
159
  response_before = generate_response(model, tokenizer, prompt_example)
160
  st.text_area("Response before abliteration:", response_before, height=150)
161
+ update_debug("Generated response before abliteration.")
162
+
163
  st.write("### Loading Instructions")
164
  with st.spinner("Loading target instructions..."):
165
+ try:
166
+ target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
167
+ update_debug("Target instructions loaded.")
168
+ except Exception as e:
169
+ update_debug(f"Error loading target instructions: {e}")
170
+ st.error("Failed to load target instructions.")
171
+ st.stop()
172
  with st.spinner("Loading baseline instructions..."):
173
+ try:
174
+ baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)
175
+ update_debug("Baseline instructions loaded.")
176
+ except Exception as e:
177
+ update_debug(f"Error loading baseline instructions: {e}")
178
+ st.error("Failed to load baseline instructions.")
179
+ st.stop()
180
+
181
  st.write("### Generating Hidden States")
182
  with st.spinner("Generating baseline hidden states..."):
183
  baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
184
+ update_debug("Baseline hidden states generated.")
185
  with st.spinner("Generating target hidden states..."):
186
  target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)
187
+ update_debug("Target hidden states generated.")
188
+
189
  target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
190
  baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
191
+ update_debug("Extracted last token hidden states.")
192
+
193
  st.write("### Calculating Refusal Direction")
194
  target_mean = torch.stack(target_hidden).mean(dim=0)
195
  baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
196
  refusal_dir = target_mean - baseline_mean
197
  refusal_dir = refusal_dir / refusal_dir.norm()
198
+ update_debug("Calculated and normalized the refusal direction.")
199
+
200
  del target_outputs, baseline_outputs, target_hidden, baseline_hidden
201
+
202
  st.write("### Orthogonalizing Model Weights")
203
  refusal_dir = refusal_dir.view(-1).to(model.device)
204
  stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
205
+
206
  if hasattr(model.model, "embed_tokens"):
207
  model.model.embed_tokens.weight.data = orthogonalize_matrix(
208
  model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
209
  )
210
  stats["embed_tokens"] = True
211
+ update_debug("Orthogonalized embed_tokens weights.")
212
+
213
  for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
214
  if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
215
  layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
216
  layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
217
  )
218
  stats["attention_o_proj"] += 1
 
219
  if hasattr(layer, "mlp"):
220
  proj_name = (
221
  "down_proj"
 
229
  getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
230
  )
231
  stats["mlp_proj"] += 1
232
+ update_debug("Orthogonalized layer weights.")
233
+
234
  del refusal_dir
235
+
236
  if (
237
  not stats["embed_tokens"]
238
  and stats["attention_o_proj"] == 0
239
  and stats["mlp_proj"] == 0
240
  ):
241
  st.error("Failed to orthogonalize any model weights. Model not abliterated.")
242
+ update_debug("No weights were orthogonalized. Aborting process.")
243
  st.stop()
244
+
245
+ update_debug(f"Orthogonalization stats: {stats}")
246
  st.write(f"Orthogonalization stats: {stats}")
247
+
248
  st.write("### After Abliteration Response")
249
  response_after = generate_response(model, tokenizer, prompt_example)
250
  st.text_area("Response after abliteration:", response_after, height=150)
251
+ update_debug("Generated response after abliteration.")
252
+
253
  st.write("### (Optional) Pushing Model to Hugging Face Hub")
254
  if st.checkbox("Push model to HF Hub?"):
255
  try:
 
257
  model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
258
  tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
259
  st.success(f"Model pushed as {model_name}")
260
+ update_debug(f"Model pushed to HF Hub as {model_name}.")
261
  except Exception as e:
262
  st.error(f"Error while pushing model: {e}")
263
+ update_debug(f"Error while pushing model: {e}")
264
+
265
+ st.success("Abliteration process complete!")
266
+ update_debug("Abliteration process complete.")