Spaces:
Sleeping
Sleeping
import spaces | |
import gradio as gr | |
import time | |
import torch | |
from transformer_lens import HookedTransformer | |
from typing import List | |
# Gradients are switched off globally to save memory.
torch.set_grad_enabled(False)

# Set to True to swap the real model for a placeholder, which gives faster
# UI testing & feedback.
UI_DEVELOPMENT = False

if UI_DEVELOPMENT:
    model = "toy"  # :)
else:
    model = HookedTransformer.from_pretrained("gpt2-xl")
    model.eval()
    if torch.cuda.is_available():
        model.to("cuda")

# Seed, sampling keyword arguments and number of samples per request; all
# three are read by `predict`.
SEED = 0
sampling_kwargs = {"temperature": 1.0, "top_p": 0.3, "freq_penalty": 1.0}
example_count = 4
def get_token_length(prompt):
    """Return the size of dimension 1 of ``model.to_tokens(prompt)``.

    NOTE(review): presumably this is the token count of the prompt,
    including any special tokens the model adds - confirm.
    """
    tokens = model.to_tokens(prompt)
    return tokens.shape[1]
def add_padding_right(prompt, length):
    """Append one space to ``prompt`` per token it is short of ``length``.

    When the prompt already has ``length`` tokens or more, the repeat count
    is zero or negative and the prompt is returned unchanged.

    NOTE(review): assumes each appended space tokenizes to exactly one
    extra token - confirm for the tokenizer in use.
    """
    missing = length - get_token_length(prompt)
    return prompt + " " * missing
def add_padding(prompt_add, prompt_sub):
    """Right-pad both prompts to the token length of the longer one.

    Returns:
        A ``(padded_prompt_add, padded_prompt_sub)`` tuple.
    """
    target = max(get_token_length(p) for p in (prompt_add, prompt_sub))
    padded_add = add_padding_right(prompt_add, target)
    padded_sub = add_padding_right(prompt_sub, target)
    return padded_add, padded_sub
def get_resid_pre(prompt: str, layer: int):
    """Run ``model`` on ``prompt`` and return what one hook point cached.

    Only the hook named ``blocks.{layer}.hook_resid_pre`` is cached; the
    model output itself is discarded.
    """
    hook_name = f"blocks.{layer}.hook_resid_pre"

    def is_target(candidate):
        return candidate == hook_name

    cache, caching_hooks, _ = model.get_caching_hooks(is_target)
    with model.hooks(fwd_hooks=caching_hooks):
        model(prompt)
    return cache[hook_name]
def get_activations(prompt_add: str, prompt_sub: str, layer: int):
    """Return ``get_resid_pre(prompt_add) - get_resid_pre(prompt_sub)``.

    Both activations are taken at ``layer``. The shape of the difference is
    printed for logging.
    """
    towards = get_resid_pre(prompt_add, layer)
    away = get_resid_pre(prompt_sub, layer)
    difference = towards - away
    print("Activation Difference:", difference.shape, sep="\n")
    return difference
def create_hook(act_diff: torch.Tensor, coeff: int):
    """Build a forward hook that adds ``coeff * act_diff`` in place.

    The returned hook modifies the first ``act_diff.shape[1]`` positions
    (dimension 1) of the tensor it receives and returns ``None``.

    Raises (from the hook):
        gr.Error: if ``act_diff`` spans more positions than the hooked
            tensor.
    """

    def ave_hook(resid_pre, hook):
        prompt_len = resid_pre.shape[1]
        # A single position means model.generate is processing a newly
        # generated token (caching); only the prompt pass is modified.
        if prompt_len == 1:
            return

        steer_len = act_diff.shape[1]
        if steer_len > prompt_len:
            raise gr.Error(
                f"More mod tokens ({steer_len}) then PROMPT tokens ({prompt_len}). Try a **longer** PROMPT."
            )

        # Add to the beginning (position-wise) of the activations.
        resid_pre[:, :steer_len, :] += coeff * act_diff

    return ave_hook
def hooked_generate(prompt_batch: List[str], fwd_hooks=None, seed=None, **kwargs):
    """Sample 50 new tokens for each prompt while ``fwd_hooks`` are active.

    Args:
        prompt_batch: Prompts to tokenize with ``model.to_tokens``.
        fwd_hooks: Forward hooks passed to ``model.hooks``; ``None`` (the
            default) means no hooks.
        seed: If not ``None``, passed to ``torch.manual_seed`` before
            generating.
        **kwargs: Forwarded to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns for the tokenized batch.
    """
    # A fresh list per call avoids the shared mutable-default pitfall.
    if fwd_hooks is None:
        fwd_hooks = []
    if seed is not None:
        torch.manual_seed(seed)
    tokenized = model.to_tokens(prompt_batch)
    with model.hooks(fwd_hooks=fwd_hooks):
        return model.generate(
            input=tokenized, max_new_tokens=50, do_sample=True, **kwargs
        )
def config_to_str(prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input):
    """Render the request settings as text, one ``LABEL: value`` per line.

    When ``no_steering_input`` is truthy, only a fixed "no steering" marker
    is returned.
    """
    if no_steering_input:
        return "NO STEERING: TRUE"
    fields = (
        ("PROMPT", prompt),
        ("FROM", prompt_sub),
        ("TO", prompt_add),
        ("MULTIPLIER", coeff),
        ("LAYER", act_name),
    )
    return "\n".join(f"{label}: {value}" for label, value in fields)
def config_header_str():
    """Return the banner line that precedes the configuration section."""
    rule = "=" * 8
    return f"{rule} CONFIGURATION {rule}"
def sample_header_str(i: int):
    """Return the banner line for the sample at zero-based index ``i``.

    The number shown in the banner is one-based (``i + 1``).
    """
    rule = "=" * 11
    number = i + 1
    return f"{rule} SAMPLE {number} {rule}"
def results_to_ui_output(
    results, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
):
    """Format the configuration and every generated sample as one string.

    The output is the configuration banner and summary, followed by one
    banner-plus-text section per entry in ``results``. Sections are
    separated by blank lines.
    """
    summary = config_to_str(
        prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )
    header = config_header_str() + "\n\n" + summary
    sample_sections = (
        f"{sample_header_str(index)}\n\n{text}"
        for index, text in enumerate(results)
    )
    # The separator after the header is emitted even when there are no samples.
    return header + "\n\n" + "\n\n".join(sample_sections)
def predict(
    prompt: str,
    prompt_sub: str = "",
    prompt_add: str = "",
    coeff: int = 12,
    act_name: int = 6,
    no_steering_input: bool = False,
):
    """Generate completions for ``prompt``, optionally steered, as UI text.

    Args:
        prompt: Text to be completed.
        prompt_sub: The FROM prompt; its activations are subtracted.
        prompt_add: The TO prompt; its activations are added.
        coeff: Multiplier applied to the activation difference.
        act_name: Layer index used in the ``blocks.{n}.hook_resid_pre`` hook.
        no_steering_input: If true, generate without any steering hook.

    Returns:
        The formatted configuration and samples produced by
        ``results_to_ui_output``.

    Raises:
        gr.Error: If steering is requested and FROM or TO is empty.
    """
    steering = not no_steering_input

    # FROM/TO are only used when steering, so an unsteered run does not
    # require them.
    if steering:
        if prompt_sub == "":
            raise gr.Error(
                "Please input a FROM option. Could be a single space character, a word or a phrase"
            )
        if prompt_add == "":
            raise gr.Error(
                "Please input a TO option. Could be a single space character, a word or a phrase"
            )

    print("Text generation begin:")
    time_stamp = time.time()
    print("Parameters:")
    print("prompt:", prompt)
    print("prompt_sub:", prompt_sub)
    print("prompt_add:", prompt_add)
    print("coeff:", coeff)
    print("act_name:", act_name)
    print("no_steering_input:", no_steering_input)

    if UI_DEVELOPMENT:
        # Canned output; `model` is a placeholder in this mode.
        res_str = [
            "To visit the Berlin wall people have to go to the wall.",
            "To visit the Berlin wall people have to go to a museum.",
        ]
    else:
        editing_hooks = []
        if steering:
            padded_prompt_add, padded_prompt_sub = add_padding(prompt_add, prompt_sub)
            act_diff = get_activations(padded_prompt_add, padded_prompt_sub, act_name)
            ave_hook = create_hook(act_diff, coeff)
            editing_hooks = [(f"blocks.{act_name}.hook_resid_pre", ave_hook)]
        tokens = hooked_generate(
            [prompt] * example_count, editing_hooks, seed=SEED, **sampling_kwargs
        )
        # Remove beginning of sequence token
        res_str = model.to_string(tokens[:, 1:])

    ui_result = results_to_ui_output(
        res_str, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )
    print(f"Text generation end after {time.time() - time_stamp:.2f} seconds:")
    print(ui_result)
    return ui_result
# Gradio components. Multi-line `info` texts use adjacent string literals
# instead of backslash continuations so that source indentation can never
# leak into the displayed text.

# NOTE(review): not referenced by the interface built in the visible code.
options_accordion = gr.Accordion(label="Steering Options", open=True)

# FROM prompt: its activations are subtracted.
prompt_sub_input = gr.Textbox(
    lines=1,
    label="FROM",
    info=(
        "Enter a prompt that you want to steer the AI output away from. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Berlin" or "Hate".'
    ),
    value="Hate",
)

# TO prompt: its activations are added.
prompt_add_input = gr.Textbox(
    lines=1,
    label="TO",
    info=(
        "Enter a prompt that you want to steer the AI output towards. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Hamburg" or "Love".'
    ),
    value="Love",
)

# Multiplier applied to the activation difference.
coeff_input = gr.Slider(
    minimum=0,
    maximum=100,
    step=1,
    label="MULTIPLIER",
    info="The strength of the steering. Higher values will steer the AI output more towards the TO prompt. Be careful not to oversteer and break the AI's semantic capabilities!",
    value=12,
)

# Layer index used to pick the `blocks.{n}.hook_resid_pre` hook point.
act_name_input = gr.Slider(
    minimum=0,
    maximum=47,
    step=1,
    label="LAYER",
    info="The layer of the model to steer. Higher layers are more abstract. However, steering at lower layers can lead to more coherent output. Experiment to find the best layer for your use case.",
    value=6,
)

# When checked, text is generated without any steering hook.
no_steering_input = gr.Checkbox(
    label="No Steering",
    info="Check this box to generate text without steering.",
    value=False,
)

# The prompt that the model completes.
message_input = gr.Textbox(
    lines=1,
    label="PROMPT",
    info='Enter a message to be completed by the AI. E.g. "I hate you because".',
    placeholder="Enter a message to generate text.",
    value="I hate you because",
)

# Output box showing the formatted configuration and samples.
text_output = gr.Textbox(
    label="AI Text Generator",
    lines=24,
    max_lines=24,
    placeholder="Hi, I am an AI Text Generator. \n\nPlease don't steer me the wrong way! 🤖",
    show_copy_button=True,
)
# Custom stylesheet passed to gr.Interface via `css`.
# The `.prose` rule previously held a bare `var(...)` with no property name,
# which is not a valid CSS declaration; `color` is the assumed intent given
# the variable's name.
CSS = """\
.prose {
color: var(--block-title-text-color);
}
.block:has(.prose) {
border: solid var(--panel-border-width) var(--panel-border-color);
border-radius: var(--container-radius);
background: var(--panel-background-fill);
padding: var(--spacing-lg);
}
"""
# Markdown passed to gr.Interface as `description`. Built from one string
# per line; the trailing empty entry yields the final newline.
DESCRIPTION = "\n".join(
    (
        "AI Text Generation can seem magical and inscrutable, but [recent research](https://arxiv.org/abs/2308.10248) has shown that it is possible to steer the output of a model by modifying its activations. Even better, it is quite intuitive and fun!",
        "This demo allows you to input a message and two prompts, and then steer the model's output towards one prompt and away from another. You can also control the strength of the steering and the layer of the model to steer. Try it out and see what you can create!",
        "If you end up with something you like, feel free to share it with us [on the community tab](https://huggingface.co/spaces/janraasch/activate-love/discussions). We would love to see what you come up with!",
        "You can use the »copy«-button on the upper right corner of the generated text box to copy your results to your clipboard. Have fun exploring the interface! 🚀",
        "Learn more about the research behind this below. 📚",
        "CONTENT WARNING: This interface allows you to manipulate and steer the outputs of [a large language model (GPT2-XL)](https://huggingface.co/openai-community/gpt2-xl) trained on a broad corpus of online data. The model's outputs may contain biased, offensive, explicit, or otherwise harmful content. Use this interface cautiously and at your own risk. We recommend parental guidance for minors.",
        "",
    )
)
# Markdown passed to gr.Interface as `article`.
# The "Future Work" entry links to the Contrastive Activation Addition paper
# (arXiv:2312.06681); it previously pointed at the ActAdd paper again.
ARTICLE = """\
# Activation Addition: Steering GPT2 Without Optimization
This Space replicates results from the paper [Activation Addition: Steering GPT2 Without Optimization](https://arxiv.org/abs/2308.10248) and provides a user-friendly interface for anybody to gain intuition about how activation steering works.
🔎 For more details about the research behind this take a look at [this post on the AI Alignment Forum](https://www.alignmentforum.org/posts/5spBue2z2tw4JuDCx/steering-gpt-2-xl-by-adding-an-activation-vector) or check out [the original paper](https://arxiv.org/abs/2308.10248).
## Model Details
We use a [pre-trained GPT2-XL model](https://huggingface.co/openai-community/gpt2-xl) from the Hugging Face model hub. The model is loaded with the [`transformer_lens` library](https://transformerlensorg.github.io/TransformerLens/), which allows us to access the activations of the model at different layers.
## Limitations
*So how is this not the solution to the [Alignment Problem](https://en.wikipedia.org/wiki/AI_alignment)?* you might ask.
Well, this is early research, and there are some limitations to keep in mind 😇:
* [GPT2-XL](https://huggingface.co/openai-community/gpt2-xl) is quite small compared to models currently being trained (like e.g. [LLAMA3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)).
* Activation Steering is not perfect and can lead to unintended side effects. For example, steering the model toward a prompt might lead to the model generating text that is not semantically coherent.
* Activation Steering is also not guaranteed to work for all prompts and all layers.
* It is still an open question how to best steer models in a safe and reliable way.
## Future Work
There is an even more recent paper that builds on this research: [Steering Llama 2 via Contrastive Activation Addition](https://arxiv.org/abs/2312.06681). This paper steers the [LLAMA-2 model](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) with contrastive activation additions and shows that it is possible to steer a larger model chatbot with this technique.
Hence, we would like to try to replicate these results on a Hugging Face Space thus providing a chat interface that can be steered to be more helpful or more harmful.
"""
# Example rows for the interface. Column order follows the `inputs` list of
# the interface: PROMPT, FROM, TO, MULTIPLIER, LAYER, No Steering.
EXAMPLES = [
    [
        "I hate you because",
        "Hate",
        "Love",
        12,
        6,
        False,
    ],
    [
        "To see the Berlin Wall, people flock to",
        "The Berlin Wall is in Berlin",
        "The Berlin Wall is in Hamburg",
        10,
        20,
        False,
    ],
    [
        "I went up to my friend and said",
        " ",
        " wedding",
        4,
        6,
        False,
    ],
]
# Wire `predict` to the input/output components and launch the app.
# The order of `inputs` matches the positional parameters of `predict`
# and the columns of EXAMPLES.
demo = gr.Interface(
    fn=predict,
    inputs=[
        message_input,
        prompt_sub_input,
        prompt_add_input,
        coeff_input,
        act_name_input,
        no_steering_input,
    ],
    outputs=text_output,
    examples=EXAMPLES,
    cache_examples=False,
    title="ACTIVATE LOVE",
    description=DESCRIPTION,
    article=ARTICLE,
    # NOTE(review): this theme string looks mangled by e-mail obfuscation
    # (probably "<theme-name>@<version>" originally) - restore the real
    # value from the repository before relying on it.
    theme="gradio/[email protected]",
    css=CSS,
    allow_duplication=True,
    allow_flagging="never",
)

print("Starting demo!")
demo.launch()