Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.gitattributes +4 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_featurizer +3 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_indices +1 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_inverse_featurizer +3 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_featurizer +3 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_indices +1 -0
4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_inverse_featurizer +3 -0
featurizer.py +52 -0
token_position.py +91 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_featurizer filter=lfs diff=lfs merge=lfs -text
+4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_inverse_featurizer filter=lfs diff=lfs merge=lfs -text
+4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_featurizer filter=lfs diff=lfs merge=lfs -text
+4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_inverse_featurizer filter=lfs diff=lfs merge=lfs -text

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_featurizer ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a36425ac3f132d84665263a53b6933afd65a48479529d0eb4ba7f75a85932b2
+size 21531300

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_indices ADDED Viewed

	@@ -0,0 +1 @@


1	+ null

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:correct_symbol)_inverse_featurizer ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:847ce30d6a5bb0b6da2308c61e49ff979f843eb9c9d13250b2936336c63456d9
+size 21531356

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_featurizer ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca074860461cf9ede6cf8175332be8bfa7380722184ea0d7969f0796797cf2be
+size 21531208

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_indices ADDED Viewed

	@@ -0,0 +1 @@


1	+ null

4_answer_MCQA_Gemma2ForCausalLM_answer_pointer/ResidualStream(Layer:0,Token:last_token)_inverse_featurizer ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb1378f957f695d398cdbf01c41dd47a386f950ab94ca2b9d57ddc1b0cd29211
+size 21531328

featurizer.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""
+Copy of the existing SubspaceFeaturizer implementation for submission.
+This file provides the same SubspaceFeaturizer functionality in a self-contained format.
+"""
+import torch
+import torch.nn as nn
+import pyvene as pv
+from CausalAbstraction.model_units.model_units import Featurizer
+class SubspaceFeaturizerModuleCopy(nn.Module):
+    """Split an input into subspace features and a reconstruction residual.
+
+    The subspace is defined by the ``weight`` of the wrapped rotation layer.
+    """
+    def __init__(self, rotate_layer):
+        """Keep a reference to ``rotate_layer``; only its ``weight`` is read."""
+        super().__init__()
+        self.rotate = rotate_layer
+    def forward(self, x):
+        """Return ``(features, error)`` for ``x``.
+
+        Args:
+            x: Tensor that is matrix-multiplied with the rotation weight.
+        Returns:
+            tuple: ``features`` is ``x`` (cast to the weight dtype) times the
+            rotation weight; ``error`` is ``x`` minus the reconstruction of
+            ``x`` from ``features``, with the reconstruction cast back to
+            ``x``'s dtype.
+        """
+        basis = self.rotate.weight.T
+        # The projection is computed in the dtype of the rotation weight.
+        features = x.to(basis.dtype) @ basis.T
+        # Map the features back and cast to the input dtype before subtracting.
+        reconstruction = (features @ basis).to(x.dtype)
+        return features, x - reconstruction
+class SubspaceInverseFeaturizerModuleCopy(nn.Module):
+    """Rebuild an activation from subspace features plus a residual.
+
+    Uses the ``weight`` of the wrapped rotation layer to map features back.
+    """
+    def __init__(self, rotate_layer):
+        """Keep a reference to ``rotate_layer``; only its ``weight`` is read."""
+        super().__init__()
+        self.rotate = rotate_layer
+    def forward(self, f, error):
+        """Return the features mapped back through the rotation, plus ``error``.
+
+        Args:
+            f: Feature tensor; the result is returned in this tensor's dtype.
+            error: Residual tensor added to the mapped-back features.
+        Returns:
+            Tensor in ``f``'s dtype.
+        """
+        basis = self.rotate.weight.T
+        out_dtype = f.dtype
+        # The matmul runs in the weight dtype; both terms are then cast to f's dtype.
+        mapped_back = (f.to(basis.dtype) @ basis).to(out_dtype)
+        residual = error.to(out_dtype)
+        return mapped_back + residual
+class SubspaceFeaturizerCopy(Featurizer):
+    """Featurizer built on an orthogonally parametrized ``LowRankRotateLayer``."""
+    def __init__(self, shape=None, rotation_subspace=None, trainable=True, id="subspace"):
+        """Create the rotation layer and register the featurizer pair.
+
+        Args:
+            shape: Positional arguments for ``LowRankRotateLayer``. When given,
+                the layer is created with ``init_orth=True`` and
+                ``rotation_subspace`` is not used.
+            rotation_subspace: Tensor copied into the layer weight when
+                ``shape`` is ``None``; its ``.shape`` sizes the layer.
+            trainable: When false, gradients are disabled on the rotation layer.
+            id: Identifier forwarded to ``Featurizer.__init__``.
+        Raises:
+            AssertionError: If both ``shape`` and ``rotation_subspace`` are ``None``.
+        """
+        assert shape is not None or rotation_subspace is not None, "Either shape or rotation_subspace must be provided."
+        layer_cls = pv.models.layers.LowRankRotateLayer
+        if shape is not None:
+            self.rotate = layer_cls(*shape, init_orth=True)
+        elif rotation_subspace is not None:
+            # Size the layer from the given subspace, then load its values.
+            self.rotate = layer_cls(*rotation_subspace.shape, init_orth=False)
+            self.rotate.weight.data.copy_(rotation_subspace)
+        # Apply torch's orthogonal parametrization to the layer.
+        self.rotate = nn.utils.parametrizations.orthogonal(self.rotate)
+        if not trainable:
+            self.rotate.requires_grad_(False)
+        # The forward and inverse modules share the same rotation layer.
+        super().__init__(
+            SubspaceFeaturizerModuleCopy(self.rotate),
+            SubspaceInverseFeaturizerModuleCopy(self.rotate),
+            n_features=self.rotate.weight.shape[1],
+            id=id,
+        )

token_position.py ADDED Viewed

	@@ -0,0 +1,91 @@

+"""
+Token position definitions for MCQA task submission.
+This file provides token position functions that identify key tokens in MCQA prompts.
+"""
+import re
+from CausalAbstraction.model_units.LM_units import TokenPosition
+def get_last_token_index(prompt, pipeline):
+    """
+    Locate the final token of the tokenized prompt.
+    Args:
+        prompt (str): The input prompt
+        pipeline: Object whose ``load(prompt)`` result exposes ``["input_ids"]``
+    Returns:
+        list[int]: One-element list holding the length of the first
+            ``input_ids`` row minus one
+    """
+    encoded = pipeline.load(prompt)
+    # Only the first row of input_ids is inspected.
+    first_row = list(encoded["input_ids"][0])
+    last_index = len(first_row) - 1
+    return [last_index]
+def get_correct_symbol_index(prompt, pipeline, task):
+    """
+    Locate the token that ends at the correct answer symbol in the prompt.
+    Args:
+        prompt (str): The prompt text
+        pipeline: Object whose ``load(text)`` result exposes ``["input_ids"]``
+        task: Object providing ``input_loader`` and ``causal_model.run_forward``
+    Returns:
+        list[int]: One-element list with the index of the last token of the
+            prompt prefix that ends at the symbol
+    Raises:
+        ValueError: If no standalone uppercase letter in the prompt equals
+            the correct symbol
+    """
+    # Ask the causal model which symbol is the correct answer.
+    output = task.causal_model.run_forward(task.input_loader(prompt))
+    pointer = output["answer_pointer"]
+    symbol_key = f"symbol{pointer}"
+    correct_symbol = output[symbol_key]
+    # Take the first standalone uppercase letter that equals the symbol.
+    candidates = re.finditer(r"\b[A-Z]\b", prompt)
+    symbol_match = next((m for m in candidates if m.group() == correct_symbol), None)
+    if symbol_match is None:
+        raise ValueError(f"Could not find correct symbol {correct_symbol} in prompt: {prompt}")
+    # Tokenize the prompt up to and including the symbol; the returned index
+    # is the last position of that tokenized prefix.
+    prefix = prompt[:symbol_match.end()]
+    prefix_ids = list(pipeline.load(prefix)["input_ids"][0])
+    return [len(prefix_ids) - 1]
+def get_token_positions(pipeline, task):
+    """
+    Build the token positions used for the MCQA task.
+    Two positions are created, in this order:
+    - correct_symbol: resolved per prompt by get_correct_symbol_index
+    - last_token: resolved per prompt by get_last_token_index
+    Args:
+        pipeline: Passed to each TokenPosition and to both index functions
+        task: Passed to get_correct_symbol_index
+    Returns:
+        list[TokenPosition]: ``[correct_symbol, last_token]``
+    """
+    # Each indexer closes over pipeline (and task) so it takes only the prompt.
+    correct_symbol_position = TokenPosition(
+        lambda x: get_correct_symbol_index(x, pipeline, task),
+        pipeline,
+        id="correct_symbol",
+    )
+    last_token_position = TokenPosition(
+        lambda x: get_last_token_index(x, pipeline),
+        pipeline,
+        id="last_token",
+    )
+    return [correct_symbol_position, last_token_position]