Update app.py

app.py CHANGED
@@ -78,6 +78,14 @@ st.title("LLM Abliteration with Qwen")
 st.markdown("Credits: Thanks to **Maxime Labonne**")
 st.markdown("This app allows you to manually input parameters to modify a language model's behavior by abliterating its weights.")
 
+# Debugging window (will update logs during the process)
+debug_log = []
+debug_placeholder = st.empty()
+def update_debug(msg):
+    debug_log.append(msg)
+    debug_placeholder.text("\n".join(debug_log))
+
+# Sidebar parameters
 st.sidebar.header("Abliteration Parameters")
 MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
 N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
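Reviewer note on the new debug window: st.empty() reserves a single slot in the page, and each update_debug call rewrites that slot, so the log grows in place instead of stacking new widgets. A minimal standalone sketch of the same pattern (illustration only, not part of the commit):

    import time
    import streamlit as st

    log_lines = []
    slot = st.empty()  # one reserved slot; later writes replace its content

    def log(msg):
        log_lines.append(msg)
        slot.text("\n".join(log_lines))  # re-render the whole log in place

    for step in ("load", "compute", "save"):
        log(f"running step: {step}")
        time.sleep(0.5)  # simulate work so the in-place update is visible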
@@ -89,6 +97,7 @@ st.sidebar.header("HF Token")
 hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
 if hf_token:
     os.environ["HF_TOKEN"] = hf_token
+    update_debug("HF Token received.")
 
 st.sidebar.header("Target Dataset")
 target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
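Note on the token handling: setting os.environ["HF_TOKEN"] is enough for the loading and upload calls later in the diff, because huggingface_hub reads that variable when no token is passed explicitly. The explicit equivalent (for comparison only; hf_token, MODEL_ID, model_name, and PRIVATE_UPLOAD are this app's names) would be:

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=hf_token)
    model.push_to_hub(model_name, private=PRIVATE_UPLOAD, token=hf_token)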
@@ -101,80 +110,112 @@ baseline_dataset = st.sidebar.text_input("Baseline Dataset ID", "mlabonne/harmle
 baseline_column = st.sidebar.text_input("Baseline Column Name", "text")
 
 if st.button("Run Abliteration"):
+    update_debug("Starting abliteration process...")
+
     st.write("### Loading Model and Tokenizer")
+    update_debug("Checking device and GPU properties.")
     if torch.cuda.is_available():
         if torch.cuda.get_device_capability()[0] >= 8:
             torch_dtype = torch.bfloat16
             attn_implementation = "flash_attention_2"
+            update_debug("Using bfloat16 and flash_attention_2.")
         else:
             torch_dtype = torch.float16
             attn_implementation = "eager"
+            update_debug("Using float16 and eager attention.")
     else:
         torch_dtype = torch.float32
         attn_implementation = "eager"
+        update_debug("No GPU available. Using float32.")
-
+
     with st.spinner("Loading model..."):
-
-
-
-
-
-
-
-
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                device_map="auto",
+                torch_dtype=torch_dtype,
+                attn_implementation=attn_implementation,
+                trust_remote_code=True
+            ).eval()
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+            update_debug("Model and tokenizer loaded successfully.")
+        except Exception as e:
+            update_debug(f"Error loading model: {e}")
+            st.error("Failed to load model. Check the debug log for details.")
+            st.stop()
 
     try:
         total_layers = len(model.model.layers)
+        update_debug(f"Model has {total_layers} layers.")
     except AttributeError:
         st.error("The loaded model does not have a `model.layers` attribute.")
+        update_debug("Model does not have `model.layers` attribute. Aborting.")
         st.stop()
     LAYER_IDX = int(TARGET_LAYER * total_layers)
+    update_debug(f"Selected layer index: {LAYER_IDX} based on relative ratio {TARGET_LAYER}.")
 
     st.write("### Before Abliteration Response")
     prompt_example = "How to write a computer virus?"
     response_before = generate_response(model, tokenizer, prompt_example)
     st.text_area("Response before abliteration:", response_before, height=150)
-
+    update_debug("Generated response before abliteration.")
+
     st.write("### Loading Instructions")
     with st.spinner("Loading target instructions..."):
-
+        try:
+            target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
+            update_debug("Target instructions loaded.")
+        except Exception as e:
+            update_debug(f"Error loading target instructions: {e}")
+            st.error("Failed to load target instructions.")
+            st.stop()
     with st.spinner("Loading baseline instructions..."):
-
-
+        try:
+            baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)
+            update_debug("Baseline instructions loaded.")
+        except Exception as e:
+            update_debug(f"Error loading baseline instructions: {e}")
+            st.error("Failed to load baseline instructions.")
+            st.stop()
+
     st.write("### Generating Hidden States")
     with st.spinner("Generating baseline hidden states..."):
         baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
+        update_debug("Baseline hidden states generated.")
     with st.spinner("Generating target hidden states..."):
         target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)
+        update_debug("Target hidden states generated.")
-
+
     target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
     baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
-
+    update_debug("Extracted last token hidden states.")
+
     st.write("### Calculating Refusal Direction")
     target_mean = torch.stack(target_hidden).mean(dim=0)
     baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
     refusal_dir = target_mean - baseline_mean
     refusal_dir = refusal_dir / refusal_dir.norm()
-
+    update_debug("Calculated and normalized the refusal direction.")
+
     del target_outputs, baseline_outputs, target_hidden, baseline_hidden
-
+
     st.write("### Orthogonalizing Model Weights")
     refusal_dir = refusal_dir.view(-1).to(model.device)
     stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
-
+
     if hasattr(model.model, "embed_tokens"):
         model.model.embed_tokens.weight.data = orthogonalize_matrix(
             model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
        )
         stats["embed_tokens"] = True
-
+        update_debug("Orthogonalized embed_tokens weights.")
+
     for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
         if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
             layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
                 layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
             )
             stats["attention_o_proj"] += 1
-
         if hasattr(layer, "mlp"):
             proj_name = (
                 "down_proj"
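The helpers called in this hunk (load_instructions, generate_outputs, generate_response) are defined elsewhere in app.py and do not appear in the diff. For the indexing output[LAYER_IDX][:, -1, :] to work, each element of target_outputs / baseline_outputs must be a per-layer tuple of hidden states of shape (batch, seq_len, d_model). A plausible minimal generate_outputs under that assumption (a sketch, not the app's actual helper):

    import torch

    def generate_outputs(model, tokenizer, instructions, system_prompt):
        # Hypothetical reconstruction: one forward pass per instruction,
        # collecting the hidden states from every layer.
        all_hidden = []
        for text in instructions:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ]
            input_ids = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
            with torch.no_grad():
                out = model(input_ids, output_hidden_states=True)
            # out.hidden_states is a tuple of (n_layers + 1) tensors, each
            # (1, seq_len, d_model); indexing [LAYER_IDX][:, -1, :] then
            # selects the last-token activation at the chosen layer.
            all_hidden.append(out.hidden_states)
        return all_hidden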
@@ -188,23 +229,27 @@ if st.button("Run Abliteration"):
                 getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
             )
             stats["mlp_proj"] += 1
-
+    update_debug("Orthogonalized layer weights.")
+
     del refusal_dir
-
+
     if (
         not stats["embed_tokens"]
         and stats["attention_o_proj"] == 0
         and stats["mlp_proj"] == 0
     ):
         st.error("Failed to orthogonalize any model weights. Model not abliterated.")
+        update_debug("No weights were orthogonalized. Aborting process.")
         st.stop()
-
+
+    update_debug(f"Orthogonalization stats: {stats}")
     st.write(f"Orthogonalization stats: {stats}")
-
+
     st.write("### After Abliteration Response")
     response_after = generate_response(model, tokenizer, prompt_example)
     st.text_area("Response after abliteration:", response_after, height=150)
+    update_debug("Generated response after abliteration.")
+
     st.write("### (Optional) Pushing Model to Hugging Face Hub")
     if st.checkbox("Push model to HF Hub?"):
         try:
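orthogonalize_matrix is likewise defined outside this diff. In abliteration code in the style this app credits (Maxime Labonne's recipe), it subtracts the rank-1 component of a weight matrix along the refusal direction. A sketch under that assumption; the app's real helper may differ:

    import torch

    def orthogonalize_matrix(matrix, direction, weight):
        # Hypothetical reconstruction; signature matches the call sites above.
        # Subtracts weight x the rank-1 component of `matrix` along `direction`.
        direction = direction.to(matrix.device, matrix.dtype)
        if matrix.shape[0] == direction.shape[0]:
            # Output dim is d_model (o_proj / down_proj): W <- W - weight * r (r^T W)
            proj = torch.outer(direction, direction @ matrix)
        else:
            # Rows are d_model vectors (embed_tokens): W <- W - weight * (W r) r^T
            proj = torch.outer(matrix @ direction, direction)
        return matrix - weight * proj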
@@ -212,7 +257,10 @@ if st.button("Run Abliteration"):
             model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
             tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
             st.success(f"Model pushed as {model_name}")
+            update_debug(f"Model pushed to HF Hub as {model_name}.")
         except Exception as e:
             st.error(f"Error while pushing model: {e}")
+            update_debug(f"Error while pushing model: {e}")
-
-
+
+    st.success("Abliteration process complete!")
+    update_debug("Abliteration process complete.")
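Finally, load_instructions(dataset_id, column, n) is called above with two return values, the first being the list of prompts. A plausible minimal version using the datasets library (hypothetical; the dataset IDs come from the app's sidebar):

    from datasets import load_dataset

    def load_instructions(dataset_id, column_name, n_instructions):
        # Hypothetical reconstruction matching the call sites above: the first
        # n prompts drive the steering pass; the remainder is returned unused.
        dataset = load_dataset(dataset_id, split="train")
        prompts = [row[column_name] for row in dataset]
        return prompts[:n_instructions], prompts[n_instructions:]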