DR-Rakshitha committed
Commit 06e5052 · 1 Parent(s): f4f6152

Update app.py

Files changed (1):
  1. app.py +22 -17
app.py CHANGED

@@ -5,7 +5,7 @@
 # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
 
 #----------------------------------------------------------------------------------------------------------------------------
-import os
+# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
 import torch
 from datasets import load_dataset
 from transformers import (
@@ -19,7 +19,8 @@ from transformers import (
 )
 from peft import LoraConfig, PeftModel
 from trl import SFTTrainer
-# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 # LoRA attention dimension
 lora_r = 64
@@ -38,7 +39,7 @@ lora_dropout = 0.1
 use_4bit = True
 
 # Compute dtype for 4-bit base models
-bnb_4bit_compute_dtype = "float16"
+bnb_4bit_compute_dtype = "float32" # Changed to float32 for CPU compatibility
 
 # Quantization type (fp4 or nf4)
 bnb_4bit_quant_type = "nf4"
@@ -46,14 +47,14 @@ bnb_4bit_quant_type = "nf4"
 # Activate nested quantization for 4-bit base models (double quantization)
 use_nested_quant = False
 
-# Load the entire model on the GPU 0
-device_map = {"": 0}
+# Remove device_map, as it's GPU-specific
+# device_map = {"": 0}
 
-#----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 model_name = "DR-DRR/Model_001"
-model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
+model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
 
-#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 # Load tokenizer and model with QLoRA configuration
 compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
@@ -65,19 +66,13 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=use_nested_quant,
 )
 
-# Check GPU compatibility with bfloat16
-if compute_dtype == torch.float16 and use_4bit:
-    major, _ = torch.cuda.get_device_capability()
-    if major >= 8:
-        print("=" * 80)
-        print("Your GPU supports bfloat16: accelerate training with bf16=True")
-        print("=" * 80)
+# Remove GPU-specific check for bfloat16
 
 # Load base model
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     quantization_config=bnb_config,
-    device_map=device_map
+    # Remove device_map for CPU usage
 )
 model.config.use_cache = False
 model.config.pretraining_tp = 1
@@ -85,7 +80,7 @@ model.config.pretraining_tp = 1
 # Load LLaMA tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
+tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
 
 # Load LoRA configuration
 peft_config = LoraConfig(
@@ -96,6 +91,16 @@ peft_config = LoraConfig(
     task_type="CAUSAL_LM",
 )
 
+# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# Ignore warnings
+logging.set_verbosity(logging.CRITICAL)
+
+# Run text generation pipeline with our next model
+prompt = "What is a large language model?"
+pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+result = pipe(f"<s>[INST] {prompt} [/INST]")
+print(result[0]['generated_text'])
+
 #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
 # Ignore warnings
 logging.set_verbosity(logging.CRITICAL)
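Taken together, the hunks retarget app.py from GPU to CPU: the compute dtype moves from "float16" to "float32", device_map = {"": 0} is commented out, and the CUDA capability probe is dropped. One caveat: the bitsandbytes 4-bit kernels in the pinned 0.40.2 release are CUDA-only, so passing quantization_config on a CPU-only machine will generally still fail at load time. A minimal device-aware sketch (ours, not the commit's) that keeps 4-bit quantization when a GPU is present and falls back to a plain float32 load otherwise:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_name = "DR-DRR/Model_001"

if torch.cuda.is_available():
    # GPU path: 4-bit NF4 quantization with float16 compute, model on GPU 0
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0},
    )
else:
    # CPU path: bitsandbytes 4-bit kernels need CUDA at these versions,
    # so skip quantization entirely and load the weights in float32
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
    )

model.config.use_cache = False
model.config.pretraining_tp = 1

Either branch leaves model ready for the LoraConfig and generation steps that follow.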
 
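The diff imports SFTTrainer and defines peft_config, but the training call itself sits outside the hunks shown. For context, a hedged sketch of how a LoraConfig is typically handed to SFTTrainer under the pinned trl==0.4.7; the dataset name and hyperparameters here are illustrative assumptions, not values from the commit:

from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# Hypothetical dataset and hyperparameters, not taken from the commit
dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=2e-4,
)

trainer = SFTTrainer(
    model=model,                # the quantized base model loaded above
    train_dataset=dataset,
    peft_config=peft_config,    # the LoraConfig from the diff
    dataset_text_field="text",  # column holding the training text
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=None,
)
trainer.train()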
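The generation block added at the end wraps the prompt in Llama-2's [INST] chat tags; note that pipeline and logging must both come from the elided from transformers import (...) block for it to run, and that the commit leaves the original "Ignore warnings" section in place below the new one, so the verbosity call now appears twice. Since transformers returns the prompt and completion together in generated_text by default, a small sketch of printing the completion alone (the helper name is ours, not the commit's):

from transformers import pipeline

# Assumes `model` and `tokenizer` are the objects loaded earlier in app.py
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

def completion_only(outputs, wrapped_prompt):
    """Hypothetical helper, not part of the commit: drop the echoed prompt."""
    text = outputs[0]["generated_text"]
    # The decoded text normally starts with the wrapped prompt; guard in case
    # special tokens such as the leading <s> are not reproduced verbatim
    if text.startswith(wrapped_prompt):
        return text[len(wrapped_prompt):].strip()
    return text.strip()

prompt = "What is a large language model?"
wrapped = f"<s>[INST] {prompt} [/INST]"
print(completion_only(pipe(wrapped), wrapped))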