Spaces: Running on Zero
Gave up on flash-attn; using SDPA instead
Browse files — demo/infer.py (+1 −7), CHANGED
@@ -35,13 +35,7 @@ class LiveCCDemoInfer:
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_path, torch_dtype="auto",
             device_map=f'cuda:{device_id}',
-        )
-        import os
-        os.system('pip install flash-attn --no-build-isolation')
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_path, torch_dtype="auto",
-            device_map=f'cuda:{device_id}',
-            attn_implementation='flash_attention_2'
+            attn_implementation='sdpa'
         )
         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
         self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]