Azazelle committed on
Commit bb221fb · verified · 1 Parent(s): ec52c4d

Update app.py


Speed up with caching.
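The speed-up comes from reusing the transformers key/value cache during generation: the prompt is encoded once, and every later step feeds only the newest token together with past_key_values instead of re-running both models over the whole sequence. A minimal single-model sketch of that pattern, assuming a Hugging Face causal LM (the "gpt2" checkpoint and the greedy_generate helper below are illustrative, not from app.py):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; app.py uses its own model_small / model_big pair.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

@torch.no_grad()
def greedy_generate(prompt: str, max_new_tokens: int = 32) -> str:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated = []
    cache = None               # past_key_values, populated after the first step
    current_input = input_ids  # the full prompt is encoded only once
    for _ in range(max_new_tokens):
        out = model(current_input, use_cache=True, past_key_values=cache)
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        if next_token.item() == tokenizer.eos_token_id:
            break
        generated.append(next_token.item())
        current_input = next_token   # later steps feed only the newest token
        cache = out.past_key_values  # cached keys/values already cover the prefix
    return tokenizer.decode(generated, skip_special_tokens=True)
```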

Files changed (1)
  1. app.py +16 -9
app.py CHANGED
@@ -50,6 +50,7 @@ if model_big.device == "cuda":
 if model_small.device == "cuda":
     model_small = torch.compile(model_small)
 
+@torch.no_grad()
 @spaces.GPU
 def stream_chat(
     message: str,
@@ -73,20 +74,22 @@ def stream_chat(
 
     conversation.append({"role": "user", "content": message})
 
-    input_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    inputs = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt")
 
     generated_tokens = []
     current_input = inputs
+
+    cache_small = None
+    cache_big = None
 
     for _ in range(max_new_tokens):
-        with torch.no_grad():
-            logits_small = model_small(current_input).logits[:, -1, :]
-            logits_big = model_big(current_input).logits[:, -1, :]
-
-            probs_small = torch.softmax(logits_small / temperature, dim=-1)
+        outputs_small = model_small(current_input, use_cache=True, past_key_values=cache_small)
+        outputs_big = model_big(current_input, use_cache=True, past_key_values=cache_big)
+
+        logits_small = outputs_small.logits[:, -1, :]
+        logits_big = outputs_big.logits[:, -1, :]
 
-        interpolated_logits = logits_big + (guidance_scale - 1) * (logits_big - logits_small) * probs_small
+        interpolated_logits = logits_big + (guidance_scale - 1) * (logits_big - logits_small)
 
         if top_p < 1.0:
             interpolated_logits = top_p_filtering(interpolated_logits, top_p=top_p)
@@ -99,7 +102,11 @@ def stream_chat(
             break
 
         generated_tokens.append(next_token.item())
-        current_input = torch.cat([current_input, next_token], dim=1)
+        current_input = next_token
+
+        # Update the cache with the latest past_key_values
+        cache_small = outputs_small.past_key_values
+        cache_big = outputs_big.past_key_values
 
         partial_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
         yield partial_output
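The top_p_filtering helper called in the loop is defined elsewhere in app.py and is not part of this diff. For reference, a common nucleus-filtering implementation looks roughly like the sketch below; this is an assumption about what such a helper typically does, not the Space's actual code:

```python
import torch

def top_p_filtering(logits: torch.Tensor, top_p: float = 0.9,
                    filter_value: float = float("-inf")) -> torch.Tensor:
    """Nucleus filtering: keep the smallest set of tokens whose cumulative
    probability exceeds top_p and mask everything else. Illustrative only."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
    # Mark tokens that fall outside the nucleus...
    sorted_mask = cumulative_probs > top_p
    # ...shifted right so the token that crosses the threshold is kept,
    # and the single most probable token is never masked.
    sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
    sorted_mask[..., 0] = False
    # Map the mask back from sorted order to vocabulary order.
    mask = sorted_mask.scatter(-1, sorted_indices, sorted_mask)
    return logits.masked_fill(mask, filter_value)
```

In the loop above, such a filter is applied to interpolated_logits before the next token is drawn, so low-probability tokens outside the nucleus cannot be sampled.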