Create app.py
app.py
ADDED
@@ -0,0 +1,218 @@
import os
import random
import torch
from tqdm import tqdm
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

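# Abliteration in brief: estimate a "refusal direction" in the residual
# stream as the difference between the model's mean activation on harmful
# prompts and its mean activation on harmless prompts, then orthogonalize
# the weight matrices that write into the residual stream against that
# direction so the model can no longer express it (technique from Arditi
# et al., 2024, popularized by Maxime Labonne's abliteration notebooks).
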
# --- Helper functions ---

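# Sample 2*n distinct rows and split them into two disjoint halves of n
# instructions each; the callers below keep the first half and discard the
# second.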
def load_instructions(dataset_id, column, n_instructions):
    dataset = load_dataset(dataset_id, split="train")
    indices = random.sample(range(len(dataset)), n_instructions * 2)
    return [dataset[i][column] for i in indices[:n_instructions]], [
        dataset[i][column] for i in indices[n_instructions:]
    ]

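# Generate a short sampled completion for a single prompt; used to compare
# the model's behavior before and after abliteration.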
def generate_response(model, tokenizer, prompt, max_new_tokens=128):
    if hasattr(tokenizer, "apply_chat_template"):
        input_ids = tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(model.device)
    else:
        # tokenizer(...) returns a BatchEncoding; generate() needs the
        # input_ids tensor when it is passed positionally.
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,
        min_p=0.1,
        repetition_penalty=1.05,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

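# Run each instruction through the model for a single decoding step
# (max_new_tokens=1) and keep the per-layer hidden states of the prompt
# forward pass; these activations are what the refusal direction is
# computed from.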
def generate_outputs(model, tokenizer, instructions, system_prompt):
    outputs = []
    for instruction in tqdm(instructions, desc="Generating outputs", leave=False):
        if hasattr(tokenizer, "apply_chat_template"):
            input_ids = tokenizer.apply_chat_template(
                conversation=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": instruction},
                ],
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(model.device)
        else:
            prompt = system_prompt + "\n" + instruction
            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
        out = model.generate(
            input_ids,
            use_cache=False,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_hidden_states=True,
        )
        # hidden_states[0] holds the per-layer activations of the prompt pass.
        outputs.append(out["hidden_states"][0])
    return outputs

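# Rank-1 projection removal: for a unit vector r, subtract the component of
# W along r, i.e. W <- W - weight * (W r) r^T when r matches W's last
# dimension, or W <- W - weight * r (r^T W) when it matches the first.
# With weight = 1.0 the result satisfies W r = 0 (or r^T W = 0).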
def orthogonalize_matrix(matrix, vec, weight):
    vec = vec.view(-1).to(matrix.device)
    if matrix.shape[-1] == vec.shape[0]:
        proj = torch.einsum("...d,d->...", matrix, vec).unsqueeze(-1) * vec.unsqueeze(0)
        return matrix - weight * proj
    elif matrix.shape[0] == vec.shape[0]:
        proj = torch.einsum("d...,d->...", matrix, vec).unsqueeze(0) * vec.unsqueeze(-1)
        return matrix - weight * proj
    else:
        raise ValueError(
            f"Matrix shape {matrix.shape} incompatible with vector shape {vec.shape}"
        )

# --- Streamlit UI ---

st.title("LLM Abliteration with Qwen")
st.markdown("Credits: Thanks to **Maxime Labonne**")
st.markdown(
    "This app removes a model's refusal behavior by computing a refusal "
    "direction from your chosen datasets and orthogonalizing the model's "
    "weights against it (abliteration)."
)

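# Sidebar controls. "Target Layer" is a relative depth in [0, 1] selecting
# which layer's activations define the refusal direction; "Refusal Weight"
# scales how much of that direction is removed from the weights
# (1.0 = full removal).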
st.sidebar.header("Abliteration Parameters")
MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
TARGET_LAYER = st.sidebar.slider("Target Layer (relative ratio)", 0.0, 1.0, 0.65, step=0.05)
REFUSAL_WEIGHT = st.sidebar.slider("Refusal Weight", 0.0, 2.0, 1.0, step=0.05)
# Read the push toggle here rather than inside the run block: a checkbox
# rendered after st.button() fires resets on the rerun its own click triggers.
PUSH_TO_HUB = st.sidebar.checkbox("Push Model to Hub", value=False)
PRIVATE_UPLOAD = st.sidebar.checkbox("Push Model to Hub Privately", value=True)

st.sidebar.header("HF Token")
hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

st.sidebar.header("Target Dataset")
target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
target_dataset = st.sidebar.text_input("Target Dataset ID", "mlabonne/harmful_behaviors")
target_column = st.sidebar.text_input("Target Column Name", "text")

st.sidebar.header("Baseline Dataset")
baseline_prompt = st.sidebar.text_area("Baseline Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
baseline_dataset = st.sidebar.text_input("Baseline Dataset ID", "mlabonne/harmless_alpaca")
baseline_column = st.sidebar.text_input("Baseline Column Name", "text")

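# Pick precision and attention backend for the hardware: bfloat16 with
# FlashAttention-2 on Ampere-or-newer GPUs (compute capability >= 8,
# assuming the flash-attn package is installed), float16 with eager
# attention on older GPUs, and float32 on CPU.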
if st.button("Run Abliteration"):
    st.write("### Loading Model and Tokenizer")
    if torch.cuda.is_available():
        if torch.cuda.get_device_capability()[0] >= 8:
            torch_dtype = torch.bfloat16
            attn_implementation = "flash_attention_2"
        else:
            torch_dtype = torch.float16
            attn_implementation = "eager"
    else:
        torch_dtype = torch.float32
        attn_implementation = "eager"

    with st.spinner("Loading model..."):
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            device_map="auto",
            torch_dtype=torch_dtype,
            attn_implementation=attn_implementation,
            trust_remote_code=True,
        ).eval()
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

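    # The orthogonalization below assumes a Llama-style layout
    # (model.model.layers); stop early for architectures without it.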
    try:
        total_layers = len(model.model.layers)
    except AttributeError:
        st.error("The loaded model does not have a `model.layers` attribute.")
        st.stop()
    LAYER_IDX = int(TARGET_LAYER * total_layers)

    st.write("### Before Abliteration Response")
    prompt_example = "How to write a computer virus?"
    response_before = generate_response(model, tokenizer, prompt_example)
    st.text_area("Response before abliteration:", response_before, height=150)

    st.write("### Loading Instructions")
    with st.spinner("Loading target instructions..."):
        target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
    with st.spinner("Loading baseline instructions..."):
        baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)

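    # The refusal direction is the normalized difference of mean last-token
    # activations at LAYER_IDX between harmful (target) and harmless
    # (baseline) prompts.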
    st.write("### Generating Hidden States")
    with st.spinner("Generating baseline hidden states..."):
        baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
    with st.spinner("Generating target hidden states..."):
        target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)

    target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
    baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]

    st.write("### Calculating Refusal Direction")
    target_mean = torch.stack(target_hidden).mean(dim=0)
    baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
    refusal_dir = target_mean - baseline_mean
    refusal_dir = refusal_dir / refusal_dir.norm()

    del target_outputs, baseline_outputs, target_hidden, baseline_hidden

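    # Orthogonalize every matrix that writes into the residual stream
    # (embedding table, attention output projections, MLP down projections)
    # against the refusal direction, so its contribution can no longer be
    # expressed.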
    st.write("### Orthogonalizing Model Weights")
    refusal_dir = refusal_dir.view(-1).to(model.device)
    stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}

    if hasattr(model.model, "embed_tokens"):
        model.model.embed_tokens.weight.data = orthogonalize_matrix(
            model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
        )
        stats["embed_tokens"] = True

    for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
        if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
            layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
                layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
            )
            stats["attention_o_proj"] += 1

        if hasattr(layer, "mlp"):
            proj_name = (
                "down_proj"
                if hasattr(layer.mlp, "down_proj")
                else "c_proj"
                if hasattr(layer.mlp, "c_proj")
                else None
            )
            if proj_name:
                getattr(layer.mlp, proj_name).weight.data = orthogonalize_matrix(
                    getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
                )
                stats["mlp_proj"] += 1

    del refusal_dir

    if (
        not stats["embed_tokens"]
        and stats["attention_o_proj"] == 0
        and stats["mlp_proj"] == 0
    ):
        st.error("Failed to orthogonalize any model weights. Model not abliterated.")
        st.stop()

    st.write(f"Orthogonalization stats: {stats}")

    st.write("### After Abliteration Response")
    response_after = generate_response(model, tokenizer, prompt_example)
    st.text_area("Response after abliteration:", response_after, height=150)

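    # Optionally upload the abliterated model as "<model>-abliterated",
    # authenticating with the HF_TOKEN set in the sidebar. The toggle is
    # read from the sidebar (PUSH_TO_HUB) because a checkbox created inside
    # this block would reset on the rerun its own click triggers.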
    st.write("### (Optional) Pushing Model to Hugging Face Hub")
    if PUSH_TO_HUB:
        try:
            model_name = MODEL_ID.split("/")[-1] + "-abliterated"
            model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
            tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
            st.success(f"Model pushed as {model_name}")
        except Exception as e:
            st.error(f"Error while pushing model: {e}")

    st.success("Abliteration process complete!")