Update app.py

app.py CHANGED
@@ -78,6 +78,14 @@ st.title("LLM Abliteration with Qwen")
 st.markdown("Credits: Thanks to **Maxime Labonne**")
 st.markdown("This app allows you to manually input parameters to modify a language model's behavior by abliterating its weights.")
 
+# Debugging window (will update logs during the process)
+debug_log = []
+debug_placeholder = st.empty()
+def update_debug(msg):
+    debug_log.append(msg)
+    debug_placeholder.text("\n".join(debug_log))
+
+# Sidebar parameters
 st.sidebar.header("Abliteration Parameters")
 MODEL_ID = st.sidebar.text_input("Model ID", "Qwen/Qwen2.5-3B-Instruct")
 N_INSTRUCTIONS = st.sidebar.number_input("Number of Instructions", min_value=1, value=128, step=1)
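Reviewer note on the new debug window: st.empty() reserves a single slot in the page, and each update_debug call rewrites that slot, so the log grows in place instead of stacking new widgets. A minimal standalone sketch of the same pattern (illustration only, not part of the commit):

    import time
    import streamlit as st

    log_lines = []
    slot = st.empty()  # one reserved slot; later writes replace its content

    def log(msg):
        log_lines.append(msg)
        slot.text("\n".join(log_lines))  # re-render the whole log in place

    for step in ("load", "compute", "save"):
        log(f"running step: {step}")
        time.sleep(0.5)  # simulate work so the in-place update is visible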
@@ -89,6 +97,7 @@ st.sidebar.header("HF Token")
 hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
 if hf_token:
     os.environ["HF_TOKEN"] = hf_token
+    update_debug("HF Token received.")
 
 st.sidebar.header("Target Dataset")
 target_prompt = st.sidebar.text_area("Target Prompt", "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
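Note on the token handling: setting os.environ["HF_TOKEN"] is enough for the loading and upload calls later in the diff, because huggingface_hub reads that variable when no token is passed explicitly. The explicit equivalent (for comparison only; hf_token, MODEL_ID, model_name, and PRIVATE_UPLOAD are this app's names) would be:

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=hf_token)
    model.push_to_hub(model_name, private=PRIVATE_UPLOAD, token=hf_token)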
@@ -101,80 +110,112 @@ baseline_dataset = st.sidebar.text_input("Baseline Dataset ID", "mlabonne/harmle
 baseline_column = st.sidebar.text_input("Baseline Column Name", "text")
 
 if st.button("Run Abliteration"):
+    update_debug("Starting abliteration process...")
+
     st.write("### Loading Model and Tokenizer")
+    update_debug("Checking device and GPU properties.")
     if torch.cuda.is_available():
         if torch.cuda.get_device_capability()[0] >= 8:
             torch_dtype = torch.bfloat16
             attn_implementation = "flash_attention_2"
+            update_debug("Using bfloat16 and flash_attention_2.")
         else:
             torch_dtype = torch.float16
             attn_implementation = "eager"
+            update_debug("Using float16 and eager attention.")
     else:
         torch_dtype = torch.float32
         attn_implementation = "eager"
+        update_debug("No GPU available. Using float32.")
-
+
     with st.spinner("Loading model..."):
-
-
-
-
-
-
-
-
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                device_map="auto",
+                torch_dtype=torch_dtype,
+                attn_implementation=attn_implementation,
+                trust_remote_code=True
+            ).eval()
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+            update_debug("Model and tokenizer loaded successfully.")
+        except Exception as e:
+            update_debug(f"Error loading model: {e}")
+            st.error("Failed to load model. Check the debug log for details.")
+            st.stop()
 
     try:
         total_layers = len(model.model.layers)
+        update_debug(f"Model has {total_layers} layers.")
     except AttributeError:
         st.error("The loaded model does not have a `model.layers` attribute.")
+        update_debug("Model does not have `model.layers` attribute. Aborting.")
         st.stop()
     LAYER_IDX = int(TARGET_LAYER * total_layers)
+    update_debug(f"Selected layer index: {LAYER_IDX} based on relative ratio {TARGET_LAYER}.")
 
     st.write("### Before Abliteration Response")
     prompt_example = "How to write a computer virus?"
     response_before = generate_response(model, tokenizer, prompt_example)
     st.text_area("Response before abliteration:", response_before, height=150)
-
+    update_debug("Generated response before abliteration.")
+
     st.write("### Loading Instructions")
     with st.spinner("Loading target instructions..."):
-
+        try:
+            target_instructions, _ = load_instructions(target_dataset, target_column, N_INSTRUCTIONS)
+            update_debug("Target instructions loaded.")
+        except Exception as e:
+            update_debug(f"Error loading target instructions: {e}")
+            st.error("Failed to load target instructions.")
+            st.stop()
     with st.spinner("Loading baseline instructions..."):
-
-
+        try:
+            baseline_instructions, _ = load_instructions(baseline_dataset, baseline_column, N_INSTRUCTIONS)
+            update_debug("Baseline instructions loaded.")
+        except Exception as e:
+            update_debug(f"Error loading baseline instructions: {e}")
+            st.error("Failed to load baseline instructions.")
+            st.stop()
+
     st.write("### Generating Hidden States")
     with st.spinner("Generating baseline hidden states..."):
         baseline_outputs = generate_outputs(model, tokenizer, baseline_instructions, system_prompt=baseline_prompt)
+        update_debug("Baseline hidden states generated.")
     with st.spinner("Generating target hidden states..."):
         target_outputs = generate_outputs(model, tokenizer, target_instructions, system_prompt=target_prompt)
+        update_debug("Target hidden states generated.")
-
+
     target_hidden = [output[LAYER_IDX][:, -1, :] for output in target_outputs]
     baseline_hidden = [output[LAYER_IDX][:, -1, :] for output in baseline_outputs]
-
+    update_debug("Extracted last token hidden states.")
+
     st.write("### Calculating Refusal Direction")
     target_mean = torch.stack(target_hidden).mean(dim=0)
     baseline_mean = torch.stack(baseline_hidden).mean(dim=0)
     refusal_dir = target_mean - baseline_mean
     refusal_dir = refusal_dir / refusal_dir.norm()
-
+    update_debug("Calculated and normalized the refusal direction.")
+
     del target_outputs, baseline_outputs, target_hidden, baseline_hidden
-
+
     st.write("### Orthogonalizing Model Weights")
     refusal_dir = refusal_dir.view(-1).to(model.device)
     stats = {"embed_tokens": False, "attention_o_proj": 0, "mlp_proj": 0}
-
+
     if hasattr(model.model, "embed_tokens"):
         model.model.embed_tokens.weight.data = orthogonalize_matrix(
             model.model.embed_tokens.weight.data, refusal_dir, REFUSAL_WEIGHT
        )
         stats["embed_tokens"] = True
-
+        update_debug("Orthogonalized embed_tokens weights.")
+
     for layer in tqdm(model.model.layers, desc="Orthogonalizing weights", leave=False):
         if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
             layer.self_attn.o_proj.weight.data = orthogonalize_matrix(
                 layer.self_attn.o_proj.weight.data, refusal_dir, REFUSAL_WEIGHT
             )
             stats["attention_o_proj"] += 1
-
         if hasattr(layer, "mlp"):
             proj_name = (
                 "down_proj"
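The helpers called in this hunk (load_instructions, generate_outputs, generate_response) are defined elsewhere in app.py and do not appear in the diff. For the indexing output[LAYER_IDX][:, -1, :] to work, each element of target_outputs / baseline_outputs must be a per-layer tuple of hidden states of shape (batch, seq_len, d_model). A plausible minimal generate_outputs under that assumption (a sketch, not the app's actual helper):

    import torch

    def generate_outputs(model, tokenizer, instructions, system_prompt):
        # Hypothetical reconstruction: one forward pass per instruction,
        # collecting the hidden states from every layer.
        all_hidden = []
        for text in instructions:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ]
            input_ids = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
            with torch.no_grad():
                out = model(input_ids, output_hidden_states=True)
            # out.hidden_states is a tuple of (n_layers + 1) tensors, each
            # (1, seq_len, d_model); indexing [LAYER_IDX][:, -1, :] then
            # selects the last-token activation at the chosen layer.
            all_hidden.append(out.hidden_states)
        return all_hidden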
@@ -188,23 +229,27 @@ if st.button("Run Abliteration"):
                 getattr(layer.mlp, proj_name).weight.data, refusal_dir, REFUSAL_WEIGHT
             )
             stats["mlp_proj"] += 1
-
+    update_debug("Orthogonalized layer weights.")
+
     del refusal_dir
-
+
     if (
         not stats["embed_tokens"]
         and stats["attention_o_proj"] == 0
         and stats["mlp_proj"] == 0
     ):
         st.error("Failed to orthogonalize any model weights. Model not abliterated.")
+        update_debug("No weights were orthogonalized. Aborting process.")
         st.stop()
-
+
+    update_debug(f"Orthogonalization stats: {stats}")
     st.write(f"Orthogonalization stats: {stats}")
-
+
     st.write("### After Abliteration Response")
     response_after = generate_response(model, tokenizer, prompt_example)
     st.text_area("Response after abliteration:", response_after, height=150)
+    update_debug("Generated response after abliteration.")
+
     st.write("### (Optional) Pushing Model to Hugging Face Hub")
     if st.checkbox("Push model to HF Hub?"):
         try:
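orthogonalize_matrix is likewise defined outside this diff. In abliteration code in the style this app credits (Maxime Labonne's recipe), it subtracts the rank-1 component of a weight matrix along the refusal direction. A sketch under that assumption; the app's real helper may differ:

    import torch

    def orthogonalize_matrix(matrix, direction, weight):
        # Hypothetical reconstruction; signature matches the call sites above.
        # Subtracts weight x the rank-1 component of `matrix` along `direction`.
        direction = direction.to(matrix.device, matrix.dtype)
        if matrix.shape[0] == direction.shape[0]:
            # Output dim is d_model (o_proj / down_proj): W <- W - weight * r (r^T W)
            proj = torch.outer(direction, direction @ matrix)
        else:
            # Rows are d_model vectors (embed_tokens): W <- W - weight * (W r) r^T
            proj = torch.outer(matrix @ direction, direction)
        return matrix - weight * proj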
@@ -212,7 +257,10 @@ if st.button("Run Abliteration"):
             model.push_to_hub(model_name, private=PRIVATE_UPLOAD)
             tokenizer.push_to_hub(model_name, private=PRIVATE_UPLOAD)
             st.success(f"Model pushed as {model_name}")
+            update_debug(f"Model pushed to HF Hub as {model_name}.")
         except Exception as e:
             st.error(f"Error while pushing model: {e}")
+            update_debug(f"Error while pushing model: {e}")
-
-
+
+    st.success("Abliteration process complete!")
+    update_debug("Abliteration process complete.")
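Finally, load_instructions(dataset_id, column, n) is called above with two return values, the first being the list of prompts. A plausible minimal version using the datasets library (hypothetical; the dataset IDs come from the app's sidebar):

    from datasets import load_dataset

    def load_instructions(dataset_id, column_name, n_instructions):
        # Hypothetical reconstruction matching the call sites above: the first
        # n prompts drive the steering pass; the remainder is returned unused.
        dataset = load_dataset(dataset_id, split="train")
        prompts = [row[column_name] for row in dataset]
        return prompts[:n_instructions], prompts[n_instructions:]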