dar-tau committed on
Commit
d2266c9
·
verified ·
1 Parent(s): 8b02bc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -34,9 +34,9 @@ class GlobalState:
34
 
35
 
36
  suggested_interpretation_prompts = [
 
37
  "The meaning of [X] is",
38
  "Sure, here's a bullet list of the key words in your message:",
39
- "Sure, I'll summarize your message:",
40
  "Sure, here are the words in your message:",
41
  "Before responding, let me repeat the message you wrote:",
42
  "Let me repeat the message:"
@@ -128,8 +128,12 @@ def run_interpretation(raw_original_prompt, raw_interpretation_prompt, max_new_t
128
  layers_format=global_state.layers_format, k=3,
129
  **generation_kwargs)
130
  generation_texts = tokenizer.batch_decode(generated)
 
 
 
131
  progress_dummy_output = ''
132
- elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] + (['faded_bubble'] if True else []) for i in range(len(generation_texts))]
 
133
  bubble_outputs = [gr.Textbox(text.replace('\n', ' '), show_label=True, visible=True,
134
  container=False, label=f'Layer {i}', elem_classes=elem_classes[i])
135
  for i, text in enumerate(generation_texts)]
@@ -169,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
169
  '''
170
  **πŸ‘Ύ The idea is really simple: models are able to understand their own hidden states by nature! πŸ‘Ύ**
171
  In line with the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
172
- So we can inject an representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure'll I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
173
  we expect to get back a summary of the information that exists inside the hidden state, despite being from a different layer and a different run!! How cool is that! 😯😯😯
174
  ''', line_breaks=True)
175
 
@@ -179,7 +183,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
179
  with gr.Group():
180
  model_chooser = gr.Radio(label='Choose Your Model', choices=list(model_info.keys()), value=model_name)
181
  welcome_model = gr.Markdown(welcome_message.format(model_name=model_name))
182
- with gr.Blocks() as demo_blocks:
183
  gr.Markdown('## The Prompt to Analyze')
184
  for info in dataset_info:
185
  with gr.Tab(info['name']):
 
34
 
35
 
36
  suggested_interpretation_prompts = [
37
+ "Sure, I'll summarize your message:",
38
  "The meaning of [X] is",
39
  "Sure, here's a bullet list of the key words in your message:",
 
40
  "Sure, here are the words in your message:",
41
  "Before responding, let me repeat the message you wrote:",
42
  "Let me repeat the message:"
 
128
  layers_format=global_state.layers_format, k=3,
129
  **generation_kwargs)
130
  generation_texts = tokenizer.batch_decode(generated)
131
+
132
+ # create GUI output
133
+ important_idxs = 1 + interpreted_vectors.diff(k=0).topk(k=int(np.ceil(0.2 * len(generation_texts))), dim=0).indices
134
  progress_dummy_output = ''
135
+ elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] +
136
+ (['faded_bubble'] if i in important_idxs else []) for i in range(len(generation_texts))]
137
  bubble_outputs = [gr.Textbox(text.replace('\n', ' '), show_label=True, visible=True,
138
  container=False, label=f'Layer {i}', elem_classes=elem_classes[i])
139
  for i, text in enumerate(generation_texts)]
 
173
  '''
174
  **πŸ‘Ύ The idea is really simple: models are able to understand their own hidden states by nature! πŸ‘Ύ**
175
  In line with the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
176
+ So we can inject an representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure, I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
177
  we expect to get back a summary of the information that exists inside the hidden state, despite being from a different layer and a different run!! How cool is that! 😯😯😯
178
  ''', line_breaks=True)
179
 
 
183
  with gr.Group():
184
  model_chooser = gr.Radio(label='Choose Your Model', choices=list(model_info.keys()), value=model_name)
185
  welcome_model = gr.Markdown(welcome_message.format(model_name=model_name))
186
+ with gr.Blocks() as demo_main:
187
  gr.Markdown('## The Prompt to Analyze')
188
  for info in dataset_info:
189
  with gr.Tab(info['name']):