toilaluan committed on
Commit
253ae42
·
1 Parent(s): 37fc80f
Files changed (1) hide show
  1. utils.py +6 -1
utils.py CHANGED
@@ -7,6 +7,9 @@ import io
7
  from transformers import DynamicCache
8
  import os
9
  import spaces
 
 
 
10
 
11
  os.makedirs("tmp", exist_ok=True)
12
 
@@ -54,7 +57,7 @@ def generate_answer(
54
  )
55
  new_id = outputs.logits[0, -1].argmax()
56
  generated_ids.append(new_id)
57
- if new_id.item() in model.generation_config.eos_token_id:
58
  break
59
 
60
  answer = tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
@@ -84,6 +87,8 @@ def get_condense_kv_cache(context: str):
84
  response = requests.post(url, json=payload, headers=headers).json()
85
  print(response)
86
  numpy_kv_cache, error = load_npy_from_url(response["compressed_kv_url"])
 
 
87
  kv_cache = DynamicCache.from_legacy_cache(
88
  torch.from_numpy(numpy_kv_cache).to("cuda").to(torch.bfloat16)
89
  )
 
7
  from transformers import DynamicCache
8
  import os
9
  import spaces
10
+ import httpx
11
+ import tqdm
12
+
13
 
14
  os.makedirs("tmp", exist_ok=True)
15
 
 
57
  )
58
  new_id = outputs.logits[0, -1].argmax()
59
  generated_ids.append(new_id)
60
+ if new_id.item() == model.generation_config.eos_token_id:
61
  break
62
 
63
  answer = tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
 
87
  response = requests.post(url, json=payload, headers=headers).json()
88
  print(response)
89
  numpy_kv_cache, error = load_npy_from_url(response["compressed_kv_url"])
90
+ if error:
91
+ print(error)
92
  kv_cache = DynamicCache.from_legacy_cache(
93
  torch.from_numpy(numpy_kv_cache).to("cuda").to(torch.bfloat16)
94
  )