prithivMLmods committed
Commit 35870c4 · verified · 1 Parent(s): 4744292

Create app.py

Files changed (1)
app.py +218 -0
app.py ADDED
@@ -0,0 +1,218 @@
+ import os
+ import random
+ import torch
+ from tqdm import tqdm
+ import streamlit as st
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from datasets import load_dataset
+
+ # --- Helper functions ---
+
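+ # Sample 2*n rows without replacement and split them into two disjoint
+ # instruction lists (the second half is returned as a held-out set).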
+ def load_instructions(dataset_id, column, n_instructions):
+     dataset = load_dataset(dataset_id, split="train")
+     indices = random.sample(range(len(dataset)), n_instructions * 2)
+     return [dataset[i][column] for i in indices[:n_instructions]], [
+         dataset[i][column] for i in indices[n_instructions:]
+     ]
+
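+ # Generate a sampled completion for one prompt, preferring the tokenizer's
+ # chat template when the model ships one.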
+ def generate_response(model, tokenizer, prompt, max_new_tokens=128):
+     if hasattr(tokenizer, "apply_chat_template"):
+         inputs = tokenizer.apply_chat_template(
+             conversation=[{"role": "user", "content": prompt}],
+             add_generation_prompt=True,
+             return_tensors="pt",
+         ).to(model.device)
+     else:
+         # Keep only the input IDs so both branches pass a tensor to generate().
+         inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+     output_ids = model.generate(
+         inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=0.5,
+         min_p=0.1,
+         repetition_penalty=1.05,
+     )
+     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
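+ # Run each instruction for a single generation step and keep the per-layer
+ # hidden states of the prompt forward pass (out["hidden_states"][0]).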
+ def generate_outputs(model, tokenizer, instructions, system_prompt):
+     outputs = []
+     for instruction in tqdm(instructions, desc="Generating outputs", leave=False):
+         if hasattr(tokenizer, "apply_chat_template"):
+             inputs = tokenizer.apply_chat_template(
+                 conversation=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": instruction},
+                 ],
+                 add_generation_prompt=True,
+                 return_tensors="pt",
+             ).to(model.device)
+         else:
+             prompt = system_prompt + "\n" + instruction
+             # As above, pass a bare tensor of input IDs to generate().
+             inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+         out = model.generate(
+             inputs,
+             use_cache=False,
+             max_new_tokens=1,
+             return_dict_in_generate=True,
+             output_hidden_states=True,
+         )
+         outputs.append(out["hidden_states"][0])
+     return outputs
+
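+ # Project the refusal direction out of a weight matrix,
+ # W <- W - weight * (W v) v^T, handling either matrix orientation.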
+ def orthogonalize_matrix(matrix, vec, weight):
+     vec = vec.view(-1).to(matrix.device)
+     if matrix.shape[-1] == vec.shape[0]:
+         proj = torch.einsum("...d,d->...", matrix, vec).unsqueeze(-1) * vec.unsqueeze(0)
+         return matrix - weight * proj
+     elif matrix.shape[0] == vec.shape[0]:
+         proj = torch.einsum("d...,d->...", matrix, vec).unsqueeze(0) * vec.unsqueeze(-1)
+         return matrix - weight * proj
+     else:
+         raise ValueError(
+             f"Matrix shape {matrix.shape} incompatible with vector shape {vec.shape}"
+         )
+
+ # --- Streamlit UI ---
+
+ st.title("LLM Abliteration with Qwen")
+ st.markdown("Credits: Thanks to **Maxime Labonne**")
+ st.markdown("This app modifies a language model's behavior by abliterating its weights, using the parameters you set in the sidebar.")
+
+ st.sidebar.header("Abliteration Parameters")
+ MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
+ N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
+ TARGET_LAYER = st.sidebar.slider("Target Layer (relative ratio)", 0.0, 1.0, 0.65, step=0.05)
+ REFUSAL_WEIGHT = st.sidebar.slider("Refusal Weight", 0.0, 2.0, 1.0, step=0.05)
+ # The push decision lives in the sidebar: a checkbox created inside the
+ # button-gated block below would reset the button state on Streamlit's rerun.
+ PUSH_TO_HUB = st.sidebar.checkbox("Push Model to HF Hub", value=False)
+ PRIVATE_UPLOAD = st.sidebar.checkbox("Push Model to Hub Privately", value=True)
+
+ st.sidebar.header("HF Token")
+ hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
+ if hf_token:
+     os.environ["HF_TOKEN"] = hf_token
+
+ st.sidebar.header("Target Dataset")
+ target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
+ target_dataset = st.sidebar.text_input("Target Dataset ID", "mlabonne/harmful_behaviors")
+ target_column = st.sidebar.text_input("Target Column Name", "text")
+
+ st.sidebar.header("Baseline Dataset")
+ baseline_prompt = st.sidebar.text_area("Baseline Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
+ baseline_dataset = st.sidebar.text_input("Baseline Dataset ID", "mlabonne/harmless_alpaca")
+ baseline_column = st.sidebar.text_input("Baseline Column Name", "text")
+
+ if st.button("Run Abliteration"):
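+     # Pick dtype and attention backend: bf16 + FlashAttention-2 on GPUs with
+     # compute capability >= 8.0 (Ampere and newer), fp16 otherwise, fp32 on CPU.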
+     st.write("### Loading Model and Tokenizer")
+     if torch.cuda.is_available():
+         if torch.cuda.get_device_capability()[0] >= 8:
+             torch_dtype = torch.bfloat16
+             attn_implementation = "flash_attention_2"
+         else:
+             torch_dtype = torch.float16
+             attn_implementation = "eager"
+     else:
+         torch_dtype = torch.float32
+         attn_implementation = "eager"
+
+     with st.spinner("Loading model..."):
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID,
+             device_map="auto",
+             torch_dtype=torch_dtype,
+             attn_implementation=attn_implementation,
+             trust_remote_code=True,
+         ).eval()
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+     try:
+         total_layers = len(model.model.layers)
+     except AttributeError:
+         st.error("The loaded model does not have a `model.layers` attribute.")
+         st.stop()
+     # Map the relative ratio to a layer index; the hidden-states tuple has
+     # num_layers + 1 entries (embedding output first), so this stays in range.
+     LAYER_IDX = int(TARGET_LAYER * total_layers)
+
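+     # Sanity check: capture the model's answer to a refusal-triggering prompt
+     # before any weights are modified.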
+     st.write("### Before Abliteration Response")
+     prompt_example = "How to write a computer virus?"
+     response_before = generate_response(model, tokenizer, prompt_example)
+     st.text_area("Response before abliteration:", response_before, height=150)
+
+     st.write("### Loading Instructions")
+     with st.spinner("Loading target instructions..."):
+         target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
+     with st.spinner("Loading baseline instructions..."):
+         baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)
+
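+     # Collect hidden states for both instruction sets, then keep the
+     # last-token activation at the chosen layer for each prompt.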
+     st.write("### Generating Hidden States")
+     with st.spinner("Generating baseline hidden states..."):
+         baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
+     with st.spinner("Generating target hidden states..."):
+         target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)
+
+     target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
+     baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
+
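+     # The refusal direction is the normalized difference between the mean
+     # target (harmful) and baseline (harmless) activations.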
+     st.write("### Calculating Refusal Direction")
+     target_mean = torch.stack(target_hidden).mean(dim=0)
+     baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
+     refusal_dir = target_mean - baseline_mean
+     refusal_dir = refusal_dir / refusal_dir.norm()
+
+     del target_outputs, baseline_outputs, target_hidden, baseline_hidden
+
+     st.write("### Orthogonalizing Model Weights")
+     refusal_dir = refusal_dir.view(-1).to(model.device)
+     stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
+
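+     # Ablate the direction from the embedding matrix, every attention output
+     # projection, and every MLP down-projection present in this architecture.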
+     if hasattr(model.model, "embed_tokens"):
+         model.model.embed_tokens.weight.data = orthogonalize_matrix(
+             model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
+         )
+         stats["embed_tokens"] = True
+
+     for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
+         if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
+             layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
+                 layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
+             )
+             stats["attention_o_proj"] += 1
+
+         if hasattr(layer, "mlp"):
+             proj_name = (
+                 "down_proj"
+                 if hasattr(layer.mlp, "down_proj")
+                 else "c_proj"
+                 if hasattr(layer.mlp, "c_proj")
+                 else None
+             )
+             if proj_name:
+                 getattr(layer.mlp, proj_name).weight.data = orthogonalize_matrix(
+                     getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
+                 )
+                 stats["mlp_proj"] += 1
+
+     del refusal_dir
+
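+     # If nothing was modified, the architecture doesn't expose the expected
+     # modules and the model was left unchanged.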
+     if (
+         not stats["embed_tokens"]
+         and stats["attention_o_proj"] == 0
+         and stats["mlp_proj"] == 0
+     ):
+         st.error("Failed to orthogonalize any model weights. Model not abliterated.")
+         st.stop()
+
+     st.write(f"Orthogonalization stats: {stats}")
+
+     st.write("### After Abliteration Response")
+     response_after = generate_response(model, tokenizer, prompt_example)
+     st.text_area("Response after abliteration:", response_after, height=150)
+
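+     # The push toggle is read from the sidebar; see the note there about why
+     # an inline checkbox would never fire inside this button-gated block.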
+     st.write("### (Optional) Pushing Model to Hugging Face Hub")
+     if PUSH_TO_HUB:
+         try:
+             model_name = MODEL_ID.split("/")[-1] + "-abliterated"
+             model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
+             tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
+             st.success(f"Model pushed as {model_name}")
+         except Exception as e:
+             st.error(f"Error while pushing model: {e}")
+
+     st.success("Abliteration process complete!")