s-a-malik committed
Commit 8aba6d1 · 1 Parent(s): 32936b7

add accuracy probe
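
For context, here is a minimal sketch of the scoring convention the diff below relies on, assuming sklearn-style probes that expose predict_proba (the names se_probe and acc_probe follow app.py; the helper itself is illustrative, not part of the commit, and the sign flip on the accuracy side is this sketch's reading of the intent that confidently-correct tokens render green):

import numpy as np

def signed_uncertainty_scores(se_probe, acc_probe, se_feats: np.ndarray, acc_feats: np.ndarray):
    # Map each probe's class-1 probability onto a shared [-1, 1] scale:
    # positive reads as "uncertain" (red), negative as "certain" (green).
    # Semantic-entropy probe: p(high semantic entropy) in [0, 1] -> [-1, 1].
    se_score = se_probe.predict_proba(se_feats.reshape(1, -1))[0][1] * 2 - 1
    # Accuracy probe: p(answer is correct) in [0, 1]; negated here (an assumption
    # about the intended mapping) so high predicted accuracy lands near -1 (green).
    acc_score = -(acc_probe.predict_proba(acc_feats.reshape(1, -1))[0][1] * 2 - 1)
    return se_score, acc_score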

Files changed (1):
  1. app.py +214 -66
app.py CHANGED
@@ -3,6 +3,7 @@ import pickle as pkl
 from pathlib import Path
 from threading import Thread
 from typing import List, Tuple, Iterator
+from queue import Queue
 
 import spaces
 import gradio as gr
@@ -14,11 +15,22 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-DESCRIPTION = """\
-This Space demonstrates the Llama-2-7b-chat model with a semantic uncertainty probe.
-The highlighted text shows the model's uncertainty in real-time, with green indicating more certain generations and red indicating higher uncertainty.
+DESCRIPTION = """
+<h1>Llama-2 7B Chat with Uncertainty Probes</h1>
+<p>This Space demonstrates the Llama-2-7b-chat model with a semantic uncertainty probe.</p>
+<p>The highlighted text shows the model's uncertainty in real-time:</p>
+<ul>
+<li><span style="background-color: #00FF00; color: black">Green</span> indicates more certain generations</li>
+<li><span style="background-color: #FF0000; color: black">Red</span> indicates more uncertain generations</li>
+</ul>
 """
 
+EXAMPLES = [
+    ["What is the capital of France?", "You are a helpful assistant.", []],
+    ["Explain the theory of relativity in simple terms.", "You are an expert physicist explaining concepts to a layman.", []],
+    ["Write a short poem about artificial intelligence.", "You are a creative poet with a interest in technology.", []]
+]
+
 if torch.cuda.is_available():
     model_id = "meta-llama/Llama-2-7b-chat-hf"
     # TODO load the full model not the 8bit one?
@@ -32,10 +44,91 @@ if torch.cuda.is_available():
         probe_data = pkl.load(f)
         # take the NQ open one
         probe_data = probe_data[-2]
-        probe = probe_data['t_bmodel']
-        layer_range = probe_data['sep_layer_range']
+        se_probe = probe_data['t_bmodel']
+        se_layer_range = probe_data['sep_layer_range']
         acc_probe = probe_data['t_amodel']
         acc_layer_range = probe_data['ap_layer_range']
+else:
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+
+
+class CustomStreamer(TextIteratorStreamer):
+    """
+    Streamer to also store hidden states in a queue.
+    TODO check this works
+    """
+    def __init__(self, tokenizer, skip_prompt: bool = False, skip_special_tokens: bool = False, **decode_kwargs):
+        super().__init__(tokenizer, skip_prompt, skip_special_tokens, **decode_kwargs)
+        self.hidden_states_queue = Queue()
+
+    def put(self, value):
+        if isinstance(value, dict) and 'hidden_states' in value:
+            self.hidden_states_queue.put(value['hidden_states'])
+        super().put(value)
+
+# Streamer claude
+# def generate(
+#     message: str,
+#     system_prompt: str,
+#     chat_history: List[Tuple[str, str]],
+#     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+#     temperature: float = 0.6,
+#     top_p: float = 0.9,
+#     top_k: int = 50,
+#     repetition_penalty: float = 1.2,
+# ) -> Iterator[Tuple[str, str]]:
+#     conversation = []
+#     if system_prompt:
+#         conversation.append({"role": "system", "content": system_prompt})
+#     for user, assistant in chat_history:
+#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+#     conversation.append({"role": "user", "content": message})
+
+#     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+#     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+#         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+#         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+#     input_ids = input_ids.to(model.device)
+
+#     streamer = CustomStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+#     generation_kwargs = dict(
+#         input_ids=input_ids,
+#         max_new_tokens=max_new_tokens,
+#         do_sample=True,
+#         top_p=top_p,
+#         top_k=top_k,
+#         temperature=temperature,
+#         repetition_penalty=repetition_penalty,
+#         streamer=streamer,
+#         output_hidden_states=True,
+#         return_dict_in_generate=True,
+#     )
+
+#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#     thread.start()
+
+#     se_highlighted_text = ""
+#     acc_highlighted_text = ""
+#     for new_text in streamer:
+#         hidden_states = streamer.hidden_states_queue.get()
+
+#         # Semantic Uncertainty Probe
+#         se_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+#         se_concat_layers = se_token_embeddings.numpy()[se_layer_range[0]:se_layer_range[1]].reshape(-1)
+#         se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+
+#         # Accuracy Probe
+#         acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+#         acc_concat_layers = acc_token_embeddings.numpy()[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
+#         acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+
+#         se_new_highlighted_text = highlight_text(new_text, se_probe_pred)
+#         acc_new_highlighted_text = highlight_text(new_text, acc_probe_pred)
+
+#         se_highlighted_text += se_new_highlighted_text
+#         acc_highlighted_text += acc_new_highlighted_text
+
+#         yield se_highlighted_text, acc_highlighted_text
 
 @spaces.GPU
 def generate(
@@ -84,20 +177,31 @@ def generate(
     hidden = outputs.hidden_states # list of tensors, one for each token, then (batch size, sequence length, hidden size)
 
     # TODO do this loop on the fly instead of waiting for the whole generation
-    highlighted_text = ""
+    se_highlighted_text = ""
+    acc_highlighted_text = ""
     for i in range(1, len(hidden)):
-        token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]) # (num_layers, hidden_size)
-        concat_layers = token_embeddings.numpy()[layer_range[0]:layer_range[1]].reshape(-1) # (num_layers * hidden_size)
-        # pred in range [-1, 1]
-        probe_pred = probe.predict_proba(concat_layers.reshape(1, -1))[0][1] * 2 - 1 # prob of high SE
-        # decode one token at a time
+
+        # Semantic Uncertainty Probe
+        token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]).numpy() # (num_layers, hidden_size)
+        se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
+        se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+
+        # Accuracy Probe
+        # acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+        acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
+        acc_probe_pred = -1 * acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+
        output_id = outputs.sequences[0, input_ids.shape[1]+i]
         output_word = tokenizer.decode(output_id)
-        print(output_id, output_word, probe_pred)
-        new_highlighted_text = highlight_text(output_word, probe_pred)
-        highlighted_text += f" {new_highlighted_text}"
-
-        yield highlighted_text
+        print(output_id, output_word, se_probe_pred, acc_probe_pred)
+
+        se_new_highlighted_text = highlight_text(output_word, se_probe_pred)
+        acc_new_highlighted_text = highlight_text(output_word, acc_probe_pred)
+        se_highlighted_text += f" {se_new_highlighted_text}"
+        acc_highlighted_text += f" {acc_new_highlighted_text}"
+
+        yield se_highlighted_text, acc_highlighted_text
+
 
 def highlight_text(text: str, uncertainty_score: float) -> str:
     if uncertainty_score > 0:
@@ -116,56 +220,100 @@ def highlight_text(text: str, uncertainty_score: float) -> str:
         html_color, text
     )
 
+with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility: hidden}") as demo:
+    gr.HTML(DESCRIPTION)
+
+    with gr.Row():
+        with gr.Column():
+            message = gr.Textbox(label="Message")
+            system_prompt = gr.Textbox(label="System prompt", lines=2)
+
+        with gr.Column():
+            max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+            top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+            top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+            repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+    with gr.Row():
+        generate_btn = gr.Button("Generate")
+    # add spacing between probes and titles for each output
+    with gr.Row():
+        with gr.Column():
+            title = gr.HTML("<h2>Semantic Uncertainty Probe</h2>")
+            se_output = gr.HTML(label="Semantic Uncertainty Probe")
+        with gr.Column():
+            title = gr.HTML("<h2>Accuracy Probe</h2>")
+            acc_output = gr.HTML(label="Accuracy Probe")
+
+    chat_history = gr.State([])
+
+    # gr.Examples(
+    #     examples=EXAMPLES,
+    #     inputs=[message, system_prompt, chat_history],
+    #     outputs=[se_output, acc_output],
+    #     fn=generate,
+    # )
+
+    generate_btn.click(
+        generate,
+        inputs=[message, system_prompt, chat_history, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[se_output, acc_output]
+    )
+
+# chat_interface = gr.ChatInterface(
+#     fn=generate,
+#     additional_inputs=[
+#         gr.Textbox(label="System prompt", lines=6),
+#         gr.Slider(
+#             label="Max new tokens",
+#             minimum=1,
+#             maximum=MAX_MAX_NEW_TOKENS,
+#             step=1,
+#             value=DEFAULT_MAX_NEW_TOKENS,
+#         ),
+#         gr.Slider(
+#             label="Temperature",
+#             minimum=0.1,
+#             maximum=4.0,
+#             step=0.1,
+#             value=0.6,
+#         ),
+#         gr.Slider(
+#             label="Top-p (nucleus sampling)",
+#             minimum=0.05,
+#             maximum=1.0,
+#             step=0.05,
+#             value=0.9,
+#         ),
+#         gr.Slider(
+#             label="Top-k",
+#             minimum=1,
+#             maximum=1000,
+#             step=1,
+#             value=50,
+#         ),
+#         gr.Slider(
+#             label="Repetition penalty",
+#             minimum=1.0,
+#             maximum=2.0,
+#             step=0.05,
+#             value=1.2,
+#         ),
+#     ],
+#     stop_btn=None,
+#     examples=[
+#         ["What is the capital of France?"],
+#         ["Who landed on the moon?"],
+#         ["Who is Yarin Gal?"]
+#     ],
+#     title="Llama-2 7B Chat with Streamable Semantic Uncertainty Probe",
+#     description=DESCRIPTION,
+# )
+
+# if __name__ == "__main__":
+#     chat_interface.launch()
 
-chat_interface = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Textbox(label="System prompt", lines=6),
-        gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=0.6,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9,
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=50,
-        ),
-        gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.2,
-        ),
-    ],
-    stop_btn=None,
-    examples=[
-        ["What is the capital of France?"],
-        ["Explain the theory of relativity in simple terms."],
-        ["Write a short poem about artificial intelligence."]
-    ],
-    title="Llama-2 7B Chat with Streamable Semantic Uncertainty Probe",
-    description=DESCRIPTION,
-)
 
 if __name__ == "__main__":
-    chat_interface.launch()
+    demo.launch()
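
Note that the diff shows only the head and tail of highlight_text (its signature, the uncertainty_score > 0 branch, and the final formatting of html_color and text). A plausible body consistent with those fragments, assuming a simple linear white-to-red / white-to-green blend over the [-1, 1] score, might look like this (illustrative only; the actual interpolation in app.py is not visible in this diff):

def highlight_text(text: str, uncertainty_score: float) -> str:
    # uncertainty_score is expected in [-1, 1]: positive = uncertain, negative = certain.
    if uncertainty_score > 0:
        # More uncertain: fade towards red as the score approaches +1.
        level = int(255 * (1 - uncertainty_score))
        html_color = "#FF%02X%02X" % (level, level)
    else:
        # More certain: fade towards green as the score approaches -1.
        level = int(255 * (1 + uncertainty_score))
        html_color = "#%02XFF%02X" % (level, level)
    return '<span style="background-color: {}; color: black">{}</span>'.format(
        html_color, text
    )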