Update app.py
app.py CHANGED
@@ -34,9 +34,9 @@ class GlobalState:
 
 
 suggested_interpretation_prompts = [
+    "Sure, I'll summarize your message:",
     "The meaning of [X] is",
     "Sure, here's a bullet list of the key words in your message:",
-    "Sure, I'll summarize your message:",
     "Sure, here are the words in your message:",
     "Before responding, let me repeat the message you wrote:",
     "Let me repeat the message:"
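Moving "Sure, I'll summarize your message:" to the front presumably makes it the default suggestion in the UI. As a rough illustration only (not the app's actual wiring), a list like this would typically seed an editable dropdown; the component choice and parameters below are assumptions:

```python
import gradio as gr

# Hypothetical wiring: seed a dropdown with the suggestions and let users edit them.
suggested_interpretation_prompts = [
    "Sure, I'll summarize your message:",
    "The meaning of [X] is",
    "Sure, here's a bullet list of the key words in your message:",
]

with gr.Blocks() as demo:
    interpretation_prompt = gr.Dropdown(
        choices=suggested_interpretation_prompts,
        value=suggested_interpretation_prompts[0],  # the first entry becomes the default
        allow_custom_value=True,
        label='Interpretation Prompt',
    )

if __name__ == '__main__':
    demo.launch()
```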
@@ -128,8 +128,12 @@ def run_interpretation(raw_original_prompt, raw_interpretation_prompt, max_new_t
                                        layers_format=global_state.layers_format, k=3,
                                        **generation_kwargs)
     generation_texts = tokenizer.batch_decode(generated)
+
+    # create GUI output
+    important_idxs = 1 + interpreted_vectors.diff(k=0).topk(k=int(np.ceil(0.2 * len(generation_texts))), dim=0).indices
     progress_dummy_output = ''
-    elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] +
+    elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] +
+                    (['faded_bubble'] if i in important_idxs else []) for i in range(len(generation_texts))]
     bubble_outputs = [gr.Textbox(text.replace('\n', ' '), show_label=True, visible=True,
                                  container=False, label=f'Layer {i}', elem_classes=elem_classes[i])
                       for i, text in enumerate(generation_texts)]
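For context, here is a self-contained sketch of what the new highlighting logic appears to do: flag the layers whose interpreted representation changes most from the previous layer (roughly the top 20%) and attach an extra CSS class to their bubbles. The toy tensor and the `norm(dim=-1)` reduction are assumptions made for this sketch; the hunk's own call reads `interpreted_vectors.diff(k=0)`.

```python
import numpy as np
import torch

# Toy stand-in: one interpreted vector per layer (the app gets these from the model).
num_layers, hidden_dim = 10, 16
interpreted_vectors = torch.randn(num_layers, hidden_dim)
generation_texts = [f"decoded text for layer {i}" for i in range(num_layers)]

# Change between consecutive layers; diff() drops the first row, so add 1
# to point the indices at the *later* layer of each pair.
layer_deltas = interpreted_vectors.diff(dim=0).norm(dim=-1)
k = int(np.ceil(0.2 * len(generation_texts)))
important_idxs = 1 + layer_deltas.topk(k=k).indices

# One CSS class list per bubble: alternating colors, plus 'faded_bubble'
# for the flagged layers (mirroring the elem_classes expression in the hunk).
elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] +
                (['faded_bubble'] if i in important_idxs else [])
                for i in range(len(generation_texts))]

print(sorted(important_idxs.tolist()))
print(elem_classes[int(important_idxs[0])])
```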
@@ -169,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
 '''
 **πΎ The idea is really simple: models are able to understand their own hidden states by nature! πΎ**
 In line with the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
-So we can inject an representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure
+So we can inject a representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure, I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
 we expect to get back a summary of the information that exists inside the hidden state, despite being from a different layer and a different run!! How cool is that! π―π―π―
 ''', line_breaks=True)
 
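The paragraph above describes the core trick: take a hidden state from one run, patch it into the ``[X]`` position of an interpretation prompt, and let the model talk about it. The snippet below is not the Space's implementation, just a rough, self-contained sketch of that idea using a forward pre-hook in Hugging Face transformers; the model (`gpt2`), the layers chosen, and the way the placeholder position is located are all assumptions for illustration.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'gpt2'  # stand-in model for the sketch; the Space uses its own model list
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 1) Run the prompt we want to understand and keep one hidden state (layer 6, last token).
src = tok("The Eiffel Tower is in Paris", return_tensors='pt')
with torch.no_grad():
    hidden = model(**src, output_hidden_states=True).hidden_states[6][0, -1]

# 2) Build an interpretation prompt; a lone " X" token stands in for [X].
interp = tok("User: X Assistant: Sure, I'll repeat your message:", return_tensors='pt')
x_id = tok(" X", add_special_tokens=False).input_ids[0]
x_pos = interp.input_ids[0].tolist().index(x_id)

# 3) During generation, overwrite the placeholder's residual-stream vector
#    at an early layer (here layer 2) with the stored hidden state.
def inject(module, args):
    hidden_states = args[0]
    if hidden_states.shape[1] > x_pos:    # only on the prefill pass
        hidden_states[:, x_pos] = hidden  # in-place patch of the [X] position

handle = model.transformer.h[2].register_forward_pre_hook(inject)
out = model.generate(**interp, max_new_tokens=20, do_sample=False,
                     pad_token_id=tok.eos_token_id)
handle.remove()
print(tok.decode(out[0]))
```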
@@ -179,7 +183,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
     with gr.Group():
         model_chooser = gr.Radio(label='Choose Your Model', choices=list(model_info.keys()), value=model_name)
         welcome_model = gr.Markdown(welcome_message.format(model_name=model_name))
-    with gr.Blocks() as
+    with gr.Blocks() as demo_main:
         gr.Markdown('## The Prompt to Analyze')
         for info in dataset_info:
             with gr.Tab(info['name']):
|