Zhiding committed on
Commit
72369b4
·
1 Parent(s): 28706e7
README.md CHANGED
@@ -44,7 +44,6 @@ We provide the following models:
44
  | Eagle2-1B | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-1B)|
45
  | Eagle2-2B | [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-2B)|
46
  | Eagle2-9B | [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | Siglip+ConvNext | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-9B)|
47
- | Eagle2-32B | [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | Siglip+ConvNext | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-32B)|
48
 
49
  ## Benchmark Results
50
  | Benchmark | MiniCPM-Llama3-V-2_5 | InternVL-Chat-V1-5 | InternVL2-8B |QwenVL2-7B| Eagle2-9B|
 
44
  | Eagle2-1B | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-1B)|
45
  | Eagle2-2B | [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-2B)|
46
  | Eagle2-9B | [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | Siglip+ConvNext | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-9B)|
 
47
 
48
  ## Benchmark Results
49
  | Benchmark | MiniCPM-Llama3-V-2_5 | InternVL-Chat-V1-5 | InternVL2-8B |QwenVL2-7B| Eagle2-9B|
configuration_eagle_chat.py CHANGED
@@ -1,7 +1,7 @@
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
- # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import copy
@@ -36,6 +36,7 @@ class Eagle2ChatConfig(PretrainedConfig):
36
  mlp_checkpoint=True,
37
  pre_feature_reduction=False,
38
  keep_aspect_ratio=False,
 
39
  **kwargs):
40
  super().__init__(**kwargs)
41
 
@@ -73,6 +74,7 @@ class Eagle2ChatConfig(PretrainedConfig):
73
  self.mlp_checkpoint = mlp_checkpoint
74
  self.pre_feature_reduction = pre_feature_reduction
75
  self.keep_aspect_ratio = keep_aspect_ratio
 
76
  logger.info(f'keep_aspect_ratio: {self.keep_aspect_ratio}')
77
  logger.info(f'vision_select_layer: {self.select_layer}')
78
  logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
 
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import copy
 
36
  mlp_checkpoint=True,
37
  pre_feature_reduction=False,
38
  keep_aspect_ratio=False,
39
+ vocab_size=-1,
40
  **kwargs):
41
  super().__init__(**kwargs)
42
 
 
74
  self.mlp_checkpoint = mlp_checkpoint
75
  self.pre_feature_reduction = pre_feature_reduction
76
  self.keep_aspect_ratio = keep_aspect_ratio
77
+ self.vocab_size = self.llm_config.vocab_size
78
  logger.info(f'keep_aspect_ratio: {self.keep_aspect_ratio}')
79
  logger.info(f'vision_select_layer: {self.select_layer}')
80
  logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
configuration_multi_backbone_channel_concatentation_model.py CHANGED
@@ -1,7 +1,7 @@
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
- # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import os
 
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import os
modeling_eagle_chat.py CHANGED
@@ -1,7 +1,7 @@
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
- # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import warnings
 
1
  # --------------------------------------------------------
2
  # Eagle2
3
  # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
  # --------------------------------------------------------
6
 
7
  import warnings
modeling_siglip.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  # coding=utf-8
2
  # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
3
  #
@@ -374,6 +382,10 @@ class SiglipAttention(nn.Module):
374
  """Input shape: Batch x Time x Channel"""
375
  if self.use_flash_attn:
376
  return self._flash_attn(hidden_states)
 
 
 
 
377
  batch_size, q_len, _ = hidden_states.size()
378
 
379
  query_states = self.q_proj(hidden_states)
 
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # Support flash-attention in SigLIP
6
+ # --------------------------------------------------------
7
+
8
+
9
  # coding=utf-8
10
  # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
11
  #
 
382
  """Input shape: Batch x Time x Channel"""
383
  if self.use_flash_attn:
384
  return self._flash_attn(hidden_states)
385
+ else:
386
+ return self._vanilla_attn(hidden_states, attention_mask, output_attentions)
387
+
388
+ def _vanilla_attn(self, hidden_states, attention_mask=None, output_attentions=False):
389
  batch_size, q_len, _ = hidden_states.size()
390
 
391
  query_states = self.q_proj(hidden_states)
multi_backbone_channel_concatenation_encoder.py CHANGED
@@ -1,13 +1,15 @@
 
 
 
 
 
 
1
  import torch, os
2
  import torch.nn as nn
3
  from torch.utils.checkpoint import checkpoint
4
 
5
  from .siglip_vision_tower import SiglipVisionTower
6
 
7
- # from .hr_clip_encoder import HRCLIPVisionTower
8
- # from .eva_vit import EVAVITVisionTower
9
- # from .SAM.modeling_sam import SAMVisionTower
10
- # from .pix2struct_large import Pix2StructLargeVisionTower
11
  import torch.nn.functional as F
12
  from torch.nn.init import trunc_normal_
13
  from copy import deepcopy
 
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
  import torch, os
8
  import torch.nn as nn
9
  from torch.utils.checkpoint import checkpoint
10
 
11
  from .siglip_vision_tower import SiglipVisionTower
12
 
 
 
 
 
13
  import torch.nn.functional as F
14
  from torch.nn.init import trunc_normal_
15
  from copy import deepcopy
multi_backbone_channel_concatentation_model.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  import torch.nn as nn
2
 
3
  from transformers.modeling_outputs import BaseModelOutputWithPooling
 
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
  import torch.nn as nn
8
 
9
  from transformers.modeling_outputs import BaseModelOutputWithPooling