update
eagle_vl/serve/inference.py
CHANGED
@@ -22,7 +22,7 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -34,10 +34,10 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
@@ -45,7 +45,7 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
 
     token = os.environ.get("HF_TOKEN")
    # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -57,10 +57,10 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
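For context, below is a minimal sketch of what the patched load_model_from_nv loader looks like once the token=token arguments are in place. The hunks above only show the changed regions, so the import block, the model class (AutoModel is assumed here), and the lines between hunks are reconstructed assumptions rather than the file's exact contents; load_model_from_eagle follows the same pattern with its own default model_path.

import os

import torch
from transformers import AutoConfig, AutoModel, AutoProcessor


def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
    # Read the Hub token from the Space's environment (HF_TOKEN secret).
    token = os.environ.get("HF_TOKEN")
    # hotfix the model to use flash attention 2
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"
    # AutoModel is an assumption; the diff does not show which model class the file uses.
    model = AutoModel.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        token=token,
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(
        model_path, config=config, trust_remote_code=True, use_fast=True, token=token
    )
    return model, processor

Passing token explicitly lets the Space authenticate Hub downloads with the HF_TOKEN secret (needed for gated or private checkpoints) instead of relying on a cached login.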