chenjoya committed
Commit: 58b8183
Parent: 177b037

give up flashattn, using sdpa

Files changed (1):
demo/infer.py (+1, -7)
demo/infer.py CHANGED

@@ -35,13 +35,7 @@ class LiveCCDemoInfer:
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_path, torch_dtype="auto",
             device_map=f'cuda:{device_id}',
-        )
-        import os
-        os.system('pip install flash-attn --no-build-isolation')
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_path, torch_dtype="auto",
-            device_map=f'cuda:{device_id}',
-            attn_implementation='flash_attention_2'
+            attn_implementation='sdpa'
         )
         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
         self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
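For context, the net effect of this commit is that the model is loaded once with PyTorch's built-in scaled-dot-product attention (SDPA), instead of installing flash-attn at runtime and reloading the model with flash_attention_2. Below is a minimal sketch of the resulting load path; the load_model helper, the prefer_flash_attn flag, and the optional fallback to FlashAttention 2 when the package happens to be installed are illustrative assumptions, not part of this commit.

import importlib.util

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

def load_model(model_path: str, device_id: int = 0, prefer_flash_attn: bool = False):
    # Hypothetical helper: use FlashAttention 2 only if explicitly requested and the
    # flash_attn package is already importable; otherwise fall back to SDPA, which
    # ships with PyTorch and needs no extra install.
    attn_impl = "sdpa"
    if prefer_flash_attn and importlib.util.find_spec("flash_attn") is not None:
        attn_impl = "flash_attention_2"

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map=f"cuda:{device_id}",
        attn_implementation=attn_impl,
    )
    processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
    return model, processor

With the default prefer_flash_attn=False this matches the committed behavior: a single from_pretrained call with attn_implementation='sdpa', avoiding the pip install of flash-attn inside the constructor.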