rphrp1985 committed on
Commit
a3cc381
·
verified ·
1 Parent(s): aaeb2df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -48,7 +48,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
48
  attn_implementation="flash_attention_2",
49
  low_cpu_mem_usage=True,
50
  # llm_int8_enable_fp32_cpu_offload=True,
51
- device_map='cuda',
52
 
53
  )
54
 
@@ -77,8 +77,8 @@ def respond(
77
  messages = [{"role": "user", "content": "Hello, how are you?"}]
78
  input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
79
  ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
80
- # with autocast():
81
- gen_tokens = model.generate(
82
  input_ids,
83
  max_new_tokens=100,
84
  # do_sample=True,
 
48
  attn_implementation="flash_attention_2",
49
  low_cpu_mem_usage=True,
50
  # llm_int8_enable_fp32_cpu_offload=True,
51
+ # device_map='cuda',
52
 
53
  )
54
 
 
77
  messages = [{"role": "user", "content": "Hello, how are you?"}]
78
  input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
79
  ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
80
+ with autocast():
81
+ gen_tokens = model.generate(
82
  input_ids,
83
  max_new_tokens=100,
84
  # do_sample=True,