dar-tau committed on
Commit
9f98ca2
·
verified ·
1 Parent(s): ee8bd6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -18
app.py CHANGED
@@ -182,23 +182,24 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
182
  with gr.Row():
183
  with gr.Column(scale=5):
184
  gr.Markdown('# 😎 Self-Interpreting Models')
185
- with gr.Accordion(
186
- label='👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾',
187
- elem_classes=['explanation_accordion']
188
- ):
189
- gr.Markdown(
190
- '''This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
191
- An honorary mention of **Speaking Probes** ([Dar, 2023](https://towardsdatascience.com/speaking-probes-self-interpreting-models-7a3dc6cb33d6) - my own work 🥳) which was less mature but had the same idea in mind.
192
- We will follow the SelfIE implementation in this space for concreteness. Patchscopes are so general that they encompass many other interpretation techniques too!!!
193
- ''', line_breaks=True)
194
-
195
- with gr.Accordion(label='👾 The idea is really simple: models are able to understand their own hidden states by nature! 👾',
196
- elem_classes=['explanation_accordion']):
197
- gr.Markdown(
198
- '''According to the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
199
- So we can inject an representation from (roughly) any layer to any layer! If I give a model a prompt of the form ``User: [X] Assistant: Sure'll I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
200
- we expect to get back a summary of the information that exists inside the hidden state. Since the model uses a roughly common latent space, it can understand representations from different layers and different runs!! How cool is that! 😯😯😯
201
- ''', line_breaks=True)
 
202
 
203
  with gr.Column(scale=1):
204
  gr.Markdown('<span style="font-size:180px;">🤔</span>')
@@ -229,7 +230,7 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
229
  for i in range(MAX_PROMPT_TOKENS):
230
  btn = gr.Button('', visible=False, elem_classes=['token_btn'])
231
  tokens_container.append(btn)
232
- use_gpu = gr.Checkbox(value=False, label='Use GPU')
233
  progress_dummy = gr.Markdown('', elem_id='progress_dummy')
234
 
235
  interpretation_bubbles = [gr.Textbox('', container=False, visible=False, elem_classes=['bubble',
 
182
  with gr.Row():
183
  with gr.Column(scale=5):
184
  gr.Markdown('# 😎 Self-Interpreting Models')
185
+ gr.Markdown(
186
+ '**👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**',
187
+ # elem_classes=['explanation_accordion']
188
+ )
189
+ gr.Markdown(
190
+ '''This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
191
+ An honorary mention of **Speaking Probes** ([Dar, 2023](https://towardsdatascience.com/speaking-probes-self-interpreting-models-7a3dc6cb33d6) - my own work 🥳) which was less mature but had the same idea in mind.
192
+ We will follow the SelfIE implementation in this space for concreteness. Patchscopes are so general that they encompass many other interpretation techniques too!!!
193
+ ''', line_breaks=True)
194
+
195
+ gr.Markdown('**👾 The idea is really simple: models are able to understand their own hidden states by nature! 👾**',
196
+ # elem_classes=['explanation_accordion']
197
+ )
198
+ gr.Markdown(
199
+ '''According to the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
200
+ So we can inject an representation from (roughly) any layer to any layer! If I give a model a prompt of the form ``User: [X] Assistant: Sure'll I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
201
+ we expect to get back a summary of the information that exists inside the hidden state. Since the model uses a roughly common latent space, it can understand representations from different layers and different runs!! How cool is that! 😯😯😯
202
+ ''', line_breaks=True)
203
 
204
  with gr.Column(scale=1):
205
  gr.Markdown('<span style="font-size:180px;">🤔</span>')
 
230
  for i in range(MAX_PROMPT_TOKENS):
231
  btn = gr.Button('', visible=False, elem_classes=['token_btn'])
232
  tokens_container.append(btn)
233
+ use_gpu = False # gr.Checkbox(value=False, label='Use GPU')
234
  progress_dummy = gr.Markdown('', elem_id='progress_dummy')
235
 
236
  interpretation_bubbles = [gr.Textbox('', container=False, visible=False, elem_classes=['bubble',