Update app.py
app.py
CHANGED
@@ -182,12 +182,14 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown('# π Self-Interpreting Models')
+            # gr.Markdown(
+            #     '**👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**',
+            #     # elem_classes=['explanation_accordion']
+            # )
            gr.Markdown(
-                '**👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**',
-                # elem_classes=['explanation_accordion']
-            )
-            gr.Markdown(
-                '''This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
+                '''
+                **👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**
+                This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
                An honorary mention goes to **Speaking Probes** ([Dar, 2023](https://towardsdatascience.com/speaking-probes-self-interpreting-models-7a3dc6cb33d6) - my own work 🥳), which was less mature but had the same idea in mind.
                We will follow the SelfIE implementation in this space for concreteness. Patchscopes are so general that they encompass many other interpretation techniques too!!!
                ''', line_breaks=True)
@@ -200,7 +202,7 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
                **👾 The idea is really simple: models are able to understand their own hidden states by nature! 👾**
                According to the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
                So we can inject a representation from (roughly) any layer into any layer! If I give a model a prompt of the form ``User: [X] Assistant: Sure, I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
-                we expect to get back a summary of the information that exists inside the hidden state
+                we expect to get back a summary of the information that exists inside the hidden state from different layers and different runs!! How cool is that! 💯💯💯
                ''', line_breaks=True)

        # with gr.Column(scale=1):
@@ -209,9 +211,9 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
            with gr.Group('Interpretation'):
                interpretation_prompt = gr.Text(suggested_interpretation_prompts[0], label='Interpretation Prompt')

-                gr.Markdown('''
-                Here are some example prompts whose internal representations we can analyze:
-                ''')
+                # gr.Markdown('''
+                # Here are some example prompts whose internal representations we can analyze:
+                # ''')

            # for info in dataset_info:
            #     with gr.Tab(info['name']):
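The markdown edited in the second hunk leans on the residual stream / logit lens view: hidden states from intermediate layers live in roughly the same space as the final one, so they can be read out or moved between layers. As a rough illustration of that claim, here is a minimal logit-lens sketch; it is not code from this space, and the model name and prompt are placeholder choices.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder model, small enough to run on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

inputs = tokenizer("The Eiffel Tower is located in the city of", return_tensors="pt")
with torch.no_grad():
    hidden_states = model(**inputs, output_hidden_states=True).hidden_states

# hidden_states is a tuple of (num_layers + 1) tensors, each [batch, seq, hidden].
# Project the last-token state of every layer through the final norm + unembedding.
for layer, h in enumerate(hidden_states):
    logits = model.lm_head(model.transformer.ln_f(h[0, -1]))
    print(f"layer {layer:2d} -> {tokenizer.decode(logits.argmax().item())!r}")
```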
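The same hunk describes the SelfIE-style trick this space follows: replace the representation of a placeholder token in an interpretation prompt with the hidden state we want to understand, during the forward computation, and let the model describe it in natural language. Below is a minimal sketch of that injection using a forward pre-hook. It is an assumed setup for illustration only: gpt2, the placeholder index, the prompt wording, and the layer choices are stand-ins, and this is not the implementation in app.py or in the SelfIE paper.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder; SelfIE/Patchscopes use larger instruction-tuned models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

# 1) Grab the hidden state we want to interpret from some source prompt.
source = tokenizer("The city of lights is famous for its old tower", return_tensors="pt")
with torch.no_grad():
    hs = model(**source, output_hidden_states=True).hidden_states
source_layer, source_pos = 8, -1            # which layer / token position to read
vector = hs[source_layer][0, source_pos]    # shape: [hidden_size]

# 2) An interpretation prompt with a placeholder token we will overwrite.
prompt = "User: X\nAssistant: Sure, I'll repeat your message:"
enc = tokenizer(prompt, return_tensors="pt")
placeholder_pos = 2  # assumed index of the ' X' token -- verify for your tokenizer

# 3) During the forward pass, patch the vector into the placeholder position
#    at an early layer, then let the model generate its description of it.
inject_layer = 2

def patch(module, args):
    hidden = args[0]
    if hidden.shape[1] > 1:                 # only the prompt pass, not cached decode steps
        hidden = hidden.clone()
        hidden[0, placeholder_pos] = vector
    return (hidden,) + args[1:]

handle = model.transformer.h[inject_layer].register_forward_pre_hook(patch)
try:
    out = model.generate(**enc, max_new_tokens=30, do_sample=False,
                         pad_token_id=tokenizer.eos_token_id)
finally:
    handle.remove()

print(tokenizer.decode(out[0], skip_special_tokens=True))
```

Changing `source_layer`, `source_pos`, or the source prompt lets the same interpretation prompt describe hidden states from different layers and different runs, which is the kind of knob the Interpretation Prompt textbox in this diff exposes to the user.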