ZEUS 8B 🌩️ V2 - ABLITERATED
ZEUS 8B V2 was abliterated using the following script:
import gc
import random
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Hugging Face repository of the model to abliterate.
MODEL_ID = "T145/ZEUS-8B-V2"

# Number of prompts sampled from each prompt file.
# More samples can help find the direction better.
NUM_PROMPT_SAMPLES = 32

# Used to skip the first and last layers for the modifications.
SKIP_BEGIN_LAYERS = 1
SKIP_END_LAYERS = 1

# The layer we will use for the refusal_dir calculation will be
# floor(LAYER_FRACTION_TO_USE * model.layers).
LAYER_FRACTION_TO_USE = 0.6

# Use a negative scale_factor to "induce" and a positive scale_factor of < 1
# to "ablate" less.
SCALE_FACTOR = 1.0

# Create new tensors on the CPU unless a device is given explicitly.
torch.set_default_device("cpu")
# Disable autograd for the whole script. Calling set_grad_enabled() as a plain
# function takes effect immediately, whereas a bare `torch.inference_mode()`
# call only builds a context manager that is never entered and changes nothing.
torch.set_grad_enabled(False)
# First pass: load the model 4-bit quantized (float16 compute) and let
# device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model.requires_grad_(False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Index of the layer whose activations define the refusal direction:
# floor(LAYER_FRACTION_TO_USE * number of decoder layers).
layer_idx = int(LAYER_FRACTION_TO_USE * len(model.model.layers))
print(f"Layer index for refusal direction: {layer_idx}")
def _load_prompts(path):
    """Return the non-blank lines of the UTF-8 text file at *path*, stripped.

    Stripping removes the trailing newline that would otherwise become part of
    every prompt, and blank lines are dropped so they are never sampled as
    empty instructions.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _as_chat_prompt(instruction):
    """Render *instruction* as a single user turn followed by the generation prompt.

    With ``tokenize=False`` the chat template returns plain text, so no
    ``return_tensors`` argument is passed (it would have no effect).
    """
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": instruction}],
        add_generation_prompt=True,
        tokenize=False,
    )


harmful = _load_prompts("harmful.txt")
harmless = _load_prompts("harmless.txt")

# Sample at most NUM_PROMPT_SAMPLES prompts from each file (fewer if the file is short).
harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))

# NOTE: despite the "_toks" suffix these are chat-formatted strings; they are
# tokenized later, when the activations are collected.
harmful_toks = [_as_chat_prompt(insn) for insn in harmful_instructions]
harmless_toks = [_as_chat_prompt(insn) for insn in harmless_instructions]

bar_generate = tqdm(
    total=len(harmful_instructions) + len(harmless_instructions),
    desc="Generating samples",
)
# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
def generate(toks):
    """Run one chat-formatted prompt through the model and return its activation.

    Args:
        toks: Prompt text that has already been rendered by the chat template
            (a string, despite the name).

    Returns:
        The hidden state at index ``layer_idx`` for the last prompt position,
        moved to the CPU.
    """
    # The chat template has already written the special tokens into the text,
    # so the tokenizer must not add them a second time (e.g. a duplicated BOS).
    inputs = tokenizer(
        toks,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    )
    inputs = inputs.to(model.device)
    # A single new token is enough: only the hidden states computed for the
    # prompt itself are read below.
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        use_cache=False,
        max_new_tokens=1,
        return_dict_in_generate=True,
        output_hidden_states=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    bar_generate.update(n=1)
    # hidden_states[0] is the first (and only) generation step; within it,
    # pick the chosen layer and the final sequence position (-1).
    return output.hidden_states[0][layer_idx][:, -1, :].to("cpu")
# Collect one activation per prompt: all harmful prompts first, then the harmless ones.
harmful_hidden = list(map(generate, harmful_toks))
harmless_hidden = list(map(generate, harmless_toks))
bar_generate.close()

# Average the activations of each group over the sampled prompts.
harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
harmless_mean = torch.stack(harmless_hidden).mean(dim=0)

# The refusal direction is the difference of the two means, scaled by its norm.
refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir.squeeze().div(refusal_dir.norm())
torch.save(refusal_dir, f"{MODEL_ID.replace('/', '_')}_refusal_dir.pt")

# Free memory: drop the quantized model before the next load.
del model
gc.collect()
torch.cuda.empty_cache()
# Second pass: reload the model on the CPU in bfloat16 so its weights can be edited.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model.requires_grad_(False)

# Keep the direction on the same device as the model; `.to()` hands back the
# same tensor when it is already there.
refusal_dir = refusal_dir.to(model.device)

# Get the language model component and check it's as expected.
lm_model = model.model
assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."

# Validate the skip ranges against the number of layers.
num_layers = len(lm_model.layers)
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."

# Two tensors are modified in every layer that is not skipped.
bar_layers = tqdm(
    total=2 * (num_layers - SKIP_BEGIN_LAYERS - SKIP_END_LAYERS),
    desc="Modifying tensors",
)
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    """Remove (or scale) the refusal direction from a weight matrix.

    Computes ``W - scale_factor * (r r^T) W`` for weight ``W`` and direction ``r``.

    Args:
        tensor_data: 2-D weight tensor whose first dimension matches the
            length of ``refusal_dir``. It is not modified in place.
        refusal_dir: 1-D direction vector to project out of the weight's output.
        scale_factor: 1.0 removes the component along ``refusal_dir`` entirely,
            values below 1 remove less, negative values add it instead.

    Returns:
        A new ``torch.nn.Parameter`` in bfloat16 holding the modified weight.

    Raises:
        AssertionError: If ``scale_factor`` is greater than 1.0.
    """
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
    # Do the arithmetic in float32 and round to bfloat16 only once at the end;
    # the conversion also yields a copy, so the caller's tensor is left untouched.
    weight = tensor_data.to(torch.float32)
    direction = refusal_dir.to(torch.float32)
    # (r r^T) W == r (r^T W): the rank-1 form avoids building the square
    # projection matrix and a full matrix-matrix product.
    projection = torch.outer(direction, torch.matmul(direction, weight))
    tensor_modified = (weight - scale_factor * projection).to(torch.bfloat16)
    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified)
# Rewrite 'self_attn.o_proj.weight' and then 'mlp.down_proj.weight' in every
# layer that is not skipped.
# NOTE: These tensor names are specific to "llama" and may need changing.
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    layer = lm_model.layers[layer_idx]
    for projection in (layer.self_attn.o_proj, layer.mlp.down_proj):
        projection.weight = modify_tensor(
            projection.weight.data, refusal_dir, SCALE_FACTOR
        )
bar_layers.close()

print("Saving modified model (with original tokenizer)...")
# Output directory name: the model id with an "-abliterated" suffix.
FIXED_ID = MODEL_ID + "-abliterated"
model.save_pretrained(FIXED_ID)
tokenizer.save_pretrained(FIXED_ID)
According to the script, layer 19 is the primary target for abliteration.
Open LLM Leaderboard Evaluation Results
Detailed results can be found here! Summarized results can be found here!
Metric | Value (%) |
---|---|
Average | 29.71 |
IFEval (0-Shot) | 78.95 |
BBH (3-Shot) | 30.98 |
MATH Lvl 5 (4-Shot) | 20.62 |
GPQA (0-shot) | 8.39 |
MuSR (0-shot) | 7.92 |
MMLU-PRO (5-shot) | 31.39 |
- Downloads last month
- 57
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.
Model tree for T145/ZEUS-8B-V2-abliterated
Evaluation results
- averaged accuracy on IFEval (0-Shot) — Open LLM Leaderboard: 78.950
- normalized accuracy on BBH (3-Shot), test set — Open LLM Leaderboard: 30.980
- exact match on MATH Lvl 5 (4-Shot), test set — Open LLM Leaderboard: 20.620
- acc_norm on GPQA (0-shot) — Open LLM Leaderboard: 8.390
- acc_norm on MuSR (0-shot) — Open LLM Leaderboard: 7.920
- accuracy on MMLU-PRO (5-shot), test set — Open LLM Leaderboard: 31.390