Convert bfloat16 to float16
modeling_cocom.py CHANGED (+7 -7)
@@ -14,7 +14,7 @@ class BERT_Compressor(torch.nn.Module):
         super().__init__()
         # init model
         self.model_name = compr_model_name # base model name of BERT; example: bert-base-ucased
-        self.model = AutoModel.from_pretrained(compr_model_name, torch_dtype=torch.bfloat16)
+        self.model = AutoModel.from_pretrained(compr_model_name, torch_dtype=torch.float16)
         self.tokenizer = AutoTokenizer.from_pretrained(compr_model_name, use_fast=True)
         self.compr_rate = compr_rate # compression rate
         self.compressing_mode = compr_linear_type # linear layer type, could be either concat or mean.
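For context, a minimal standalone sketch of what the changed load does, assuming bert-base-uncased as the compressor backbone (a placeholder, matching the example in the comment above): passing torch_dtype=torch.float16 makes the weights come back in half precision.

import torch
from transformers import AutoModel, AutoTokenizer

# Hypothetical standalone check mirroring the changed line above.
model = AutoModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# Floating-point parameters are now stored as float16.
print(next(model.parameters()).dtype)                    # torch.float16
print(f"{model.get_memory_footprint() / 1e6:.0f} MB")    # roughly half the float32 footprint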
@@ -23,7 +23,7 @@ class BERT_Compressor(torch.nn.Module):
            self.linear = torch.nn.Linear(self.model.config.hidden_size*self.compr_rate, decoder_hidden_size)
        elif self.compressing_mode == 'mean':
            self.linear = torch.nn.Linear(self.model.config.hidden_size, decoder_hidden_size)
-        self.linear = self.linear.bfloat16()
+        self.linear = self.linear.float16()
 
    def forward(self, input_ids, attention_mask):
        # compressing context using BERT
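One caveat on the new line above: torch.nn.Module exposes half(), bfloat16(), and to(dtype), but to my knowledge no float16() method. A minimal sketch of the equivalent cast for the projection layer, with hidden sizes as placeholders for self.model.config.hidden_size and decoder_hidden_size:

import torch

# Placeholder sizes; the real values come from the encoder and decoder configs.
linear = torch.nn.Linear(768, 4096)

# Equivalent half-precision casts; nn.Module has .half() / .to(torch.float16), not .float16().
linear = linear.half()              # or: linear = linear.to(torch.float16)
print(linear.weight.dtype)          # torch.float16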
@@ -97,7 +97,7 @@ class COCOM(PreTrainedModel):
        if cfg.quantization == "no":
            self.decoder = AutoModelForCausalLM.from_pretrained(
                cfg.decoder_model_name,
-                torch_dtype=torch.bfloat16,
+                torch_dtype=torch.float16,
                attn_implementation=attn_impl,
                low_cpu_mem_usage = True,
                )
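As a rough illustration of this unquantized path after the change (the decoder name and attention implementation below are placeholders, not the repository's actual cfg values):

import torch
from transformers import AutoModelForCausalLM

# Hypothetical stand-ins for cfg.decoder_model_name and attn_impl.
decoder = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
print(next(decoder.parameters()).dtype)  # torch.float16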
@@ -105,14 +105,14 @@ class COCOM(PreTrainedModel):
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
-                bnb_4bit_compute_dtype='bfloat16',
+                bnb_4bit_compute_dtype='float16',
                low_cpu_mem_usage = True,
                )
            self.decoder = AutoModelForCausalLM.from_pretrained(
                cfg.decoder_model_name,
                quantization_config=quant_config,
                attn_implementation=attn_impl,
-                torch_dtype=torch.bfloat16,
+                torch_dtype=torch.float16,
                resume_download=True,
                low_cpu_mem_usage = True,
                trust_remote_code=True,
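For the 4-bit branch, a hedged sketch of the resulting configuration: BitsAndBytesConfig accepts the compute dtype either as the string 'float16' (as used above) or as torch.float16, and the NF4 matmuls are then carried out in half precision. The model name is again a placeholder for cfg.decoder_model_name.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,   # equivalent to the string 'float16' used above
)
decoder = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",   # placeholder for cfg.decoder_model_name
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)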
@@ -121,14 +121,14 @@ class COCOM(PreTrainedModel):
            quant_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,
-                bnb_4bit_compute_dtype='bfloat16',
+                bnb_4bit_compute_dtype='float16',
                low_cpu_mem_usage = True,
                )
            self.decoder = AutoModelForCausalLM.from_pretrained(
                cfg.decoder_model_name,
                quantization_config=quant_config,
                attn_implementation=attn_impl,
-                torch_dtype=torch.bfloat16,
+                torch_dtype=torch.float16,
                resume_download=True,
                low_cpu_mem_usage = True,
                trust_remote_code=True,
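The 8-bit branch in the same spirit. As I understand it, bnb_4bit_compute_dtype only affects 4-bit kernels, so in this load_in_8bit config the effective dtype change comes from torch_dtype; the sketch below therefore omits it (model name is a placeholder as before).

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
decoder = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",   # placeholder for cfg.decoder_model_name
    quantization_config=quant_config,
    torch_dtype=torch.float16,              # modules kept out of int8 are loaded in fp16
    low_cpu_mem_usage=True,
)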