dar-tau committed on
Commit
d2266c9
·
verified ·
1 Parent(s): 8b02bc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -34,9 +34,9 @@ class GlobalState:
34
 
35
 
36
  suggested_interpretation_prompts = [
 
37
  "The meaning of [X] is",
38
  "Sure, here's a bullet list of the key words in your message:",
39
- "Sure, I'll summarize your message:",
40
  "Sure, here are the words in your message:",
41
  "Before responding, let me repeat the message you wrote:",
42
  "Let me repeat the message:"
@@ -128,8 +128,12 @@ def run_interpretation(raw_original_prompt, raw_interpretation_prompt, max_new_t
128
  layers_format=global_state.layers_format, k=3,
129
  **generation_kwargs)
130
  generation_texts = tokenizer.batch_decode(generated)
 
 
 
131
  progress_dummy_output = ''
132
- elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] + (['faded_bubble'] if True else []) for i in range(len(generation_texts))]
 
133
  bubble_outputs = [gr.Textbox(text.replace('\n', ' '), show_label=True, visible=True,
134
  container=False, label=f'Layer {i}', elem_classes=elem_classes[i])
135
  for i, text in enumerate(generation_texts)]
@@ -169,7 +173,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
169
  '''
170
  **πŸ‘Ύ The idea is really simple: models are able to understand their own hidden states by nature! πŸ‘Ύ**
171
  In line with the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
172
- So we can inject an representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure'll I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
173
  we expect to get back a summary of the information that exists inside the hidden state, despite being from a different layer and a different run!! How cool is that! 😯😯😯
174
  ''', line_breaks=True)
175
 
@@ -179,7 +183,7 @@ with gr.Blocks(theme=gr.themes.Default(), css='styles.css') as demo:
179
  with gr.Group():
180
  model_chooser = gr.Radio(label='Choose Your Model', choices=list(model_info.keys()), value=model_name)
181
  welcome_model = gr.Markdown(welcome_message.format(model_name=model_name))
182
- with gr.Blocks() as demo_blocks:
183
  gr.Markdown('## The Prompt to Analyze')
184
  for info in dataset_info:
185
  with gr.Tab(info['name']):
 
34
 
35
 
36
  suggested_interpretation_prompts = [
37
+ "Sure, I'll summarize your message:",
38
  "The meaning of [X] is",
39
  "Sure, here's a bullet list of the key words in your message:",
 
40
  "Sure, here are the words in your message:",
41
  "Before responding, let me repeat the message you wrote:",
42
  "Let me repeat the message:"
 
128
  layers_format=global_state.layers_format, k=3,
129
  **generation_kwargs)
130
  generation_texts = tokenizer.batch_decode(generated)
131
+
132
+ # create GUI output
133
+ important_idxs = 1 + interpreted_vectors.diff(k=0).topk(k=int(np.ceil(0.2 * len(generation_texts))), dim=0).indices
134
  progress_dummy_output = ''
135
+ elem_classes = [['bubble', 'even_bubble' if i % 2 == 0 else 'odd_bubble'] +
136
+ (['faded_bubble'] if i in important_idxs else []) for i in range(len(generation_texts))]
137
  bubble_outputs = [gr.Textbox(text.replace('\n', ' '), show_label=True, visible=True,
138
  container=False, label=f'Layer {i}', elem_classes=elem_classes[i])
139
  for i, text in enumerate(generation_texts)]
 
173
  '''
174
  **πŸ‘Ύ The idea is really simple: models are able to understand their own hidden states by nature! πŸ‘Ύ**
175
  In line with the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
176
+ So we can inject an representation from (roughly) any layer into any layer! If we give a model a prompt of the form ``User: [X] Assistant: Sure, I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
177
  we expect to get back a summary of the information that exists inside the hidden state, despite being from a different layer and a different run!! How cool is that! 😯😯😯
178
  ''', line_breaks=True)
179
 
 
183
  with gr.Group():
184
  model_chooser = gr.Radio(label='Choose Your Model', choices=list(model_info.keys()), value=model_name)
185
  welcome_model = gr.Markdown(welcome_message.format(model_name=model_name))
186
+ with gr.Blocks() as demo_main:
187
  gr.Markdown('## The Prompt to Analyze')
188
  for info in dataset_info:
189
  with gr.Tab(info['name']):