autoguidance-playground

Sleeping

App Files Files Community

Azazelle committed on Aug 21, 2024

Commit

105395c

verified ·

1 Parent(s): 5982f31

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -42

app.py CHANGED Viewed

@@ -6,22 +6,16 @@ import torch
 print(torch.__version__)
-# install packages for mamba
-def install_mamba():
-    subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
-    subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
-install_mamba()
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
-MODEL = "tiiuae/falcon-mamba-7b-instruct"
-TITLE = "<h1><center>FalconMamba-7b playground</center></h1>"
-SUB_TITLE = """<center>FalconMamba is a new model released by Technology Innovation Institute (TII) in Abu Dhabi. The model is open source and available within the Hugging Face ecosystem for anyone to use it for their research or application purpose. Refer to <a href="https://hf.co/blog/falconmamba">the HF release blogpost</a> or <a href="https://www.tii.ae/news/uaes-technology-innovation-institute-revolutionizes-ai-language-models-new-architecture">the official announcement</a> for more details. This interface has been created for quick validation purposes, do not use it for production.</center>"""
 CSS = """
 .duplicate-button {
@@ -40,19 +34,21 @@ END_MESSAGE = """
 **The conversation has reached to its end, please press "Clear" to restart a new conversation**
 """
-device = "cuda" # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL,
     torch_dtype=torch.bfloat16,
 ).to(device)
 if device == "cuda":
     model = torch.compile(model)
 @spaces.GPU
 def stream_chat(
     message: str,
@@ -62,6 +58,7 @@ def stream_chat(
     top_p: float = 1.0,
     top_k: int = 20,
     penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
@@ -73,37 +70,59 @@ def stream_chat(
             {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
-    input_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt = True)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        input_ids=inputs,
-        max_new_tokens = max_new_tokens,
-        do_sample = False if temperature == 0 else True,
-        top_p = top_p,
-        top_k = top_k,
-        temperature = temperature,
-        streamer=streamer,
-        pad_token_id = 10,
-    )
-    with torch.no_grad():
-        thread = Thread(target=model.generate, kwargs=generate_kwargs)
-        thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
-    print(f'response: {buffer}')
 chatbot = gr.Chatbot(height=600)
@@ -157,6 +176,14 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 label="Repetition penalty",
                 render=False,
             ),
         ],
         examples=[
             ["Hello there, can you suggest few places to visit in UAE?"],

 print(torch.__version__)
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
+MODEL_BIG = "HuggingFaceTB/SmolLM-1.7B-Instruct"
+MODEL_SMALL = "HuggingFaceTB/SmolLM-360M-Instruct"
+TITLE = "<h1><center>Auto-Guidance Playground</center></h1>"
+SUB_TITLE = """<center>Auto-guidance was a technique made by NVIDIA for text-conditioned image models. This is a test of the concept with SmolLM.</center>"""
 CSS = """
 .duplicate-button {
 **The conversation has reached to its end, please press "Clear" to restart a new conversation**
 """
+device = "cpu" # "cuda" for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
+model_big = AutoModelForCausalLM.from_pretrained(
+    MODEL_BIG,
+    torch_dtype=torch.bfloat16,
+).to(device)
+model_small = AutoModelForCausalLM.from_pretrained(
+    MODEL_SMALL,
     torch_dtype=torch.bfloat16,
 ).to(device)
 if device == "cuda":
     model = torch.compile(model)
 @spaces.GPU
 def stream_chat(
     message: str,
     top_p: float = 1.0,
     top_k: int = 20,
     penalty: float = 1.2,
+    guidance_scale: float = 1.5,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
             {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
+    input_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    generated_tokens = []
+    current_input = inputs
+    for _ in range(max_new_tokens):
+        with torch.no_grad():
+            logits_small = model_small(current_input).logits[:, -1, :]
+            logits_big = model_big(current_input).logits[:, -1, :]
+        probs_small = torch.softmax(logits_small / temperature, dim=-1)
+        probs_big = torch.softmax(logits_big / temperature, dim=-1)
+        interpolated_logits = logits_big + (guidance_scale - 1) * (logits_big - logits_small) * probs_small
+        if top_p < 1.0:
+            interpolated_logits = top_p_filtering(interpolated_logits, top_p=top_p)
+        if top_k > 0:
+            interpolated_logits = top_k_filtering(interpolated_logits, top_k=top_k)
+        next_token = torch.multinomial(torch.softmax(interpolated_logits, dim=-1), num_samples=1)
+        if next_token.item() == tokenizer.eos_token_id:
+            break
+        generated_tokens.append(next_token.item())
+        current_input = torch.cat([current_input, next_token], dim=1)
+        partial_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        yield partial_output
+    print(f'response: {partial_output}')
+def top_k_filtering(logits, top_k=0, filter_value=-float('Inf')):
+    """Keep only the ``top_k`` largest logits along the last dimension.
+
+    Every entry strictly smaller than the k-th largest value of its row is
+    overwritten with ``filter_value``. ``logits`` is modified in place and
+    the same tensor is returned. ``top_k`` is clamped to the size of the
+    last dimension; a value of 0 (or less) disables filtering.
+    """
+    keep = min(top_k, logits.size(-1))
+    if keep <= 0:
+        return logits
+    # Smallest value that still belongs to the top-k set, kept as a
+    # trailing singleton dimension so it broadcasts against ``logits``.
+    kth_best = torch.topk(logits, keep).values[..., -1:]
+    logits.masked_fill_(logits < kth_best, filter_value)
+    return logits
+def top_p_filtering(logits, top_p=0.0, filter_value=-float('Inf')):
+    """Nucleus (top-p) filtering along the last dimension of ``logits``.
+
+    Tokens are ranked by descending logit; the smallest prefix whose
+    cumulative softmax probability exceeds ``top_p`` is kept (the
+    highest-ranked token is always kept) and every other entry is
+    overwritten with ``filter_value``. ``logits`` is modified in place and
+    the same tensor is returned. ``top_p <= 0.0`` disables filtering.
+
+    Works for both 1-D ``(vocab,)`` and batched ``(..., vocab)`` inputs.
+    """
+    if top_p <= 0.0:
+        return logits
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+    sorted_indices_to_remove = cumulative_probs > top_p
+    # Shift right by one so the first token that crosses the threshold is
+    # still kept, and never drop the highest-ranked token.
+    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+    sorted_indices_to_remove[..., 0] = False
+    # Map the mask from sorted order back to the original token order.
+    # Indexing ``logits`` directly with the selected indices would address
+    # dimension 0 (the batch) instead of the vocabulary dimension.
+    indices_to_remove = sorted_indices_to_remove.scatter(
+        -1, sorted_indices, sorted_indices_to_remove
+    )
+    logits.masked_fill_(indices_to_remove, filter_value)
+    return logits
 chatbot = gr.Chatbot(height=600)
                 label="Repetition penalty",
                 render=False,
             ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=10.0,
+                step=0.1,
+                value=1.5,
+                label="Auto-Guidance Scale",
+                render=False,
+            ),
         ],
         examples=[
             ["Hello there, can you suggest few places to visit in UAE?"],