DR-Rakshitha committed
Commit 06e5052 · 1 Parent(s): f4f6152

Update app.py

Files changed (1):
  1. app.py +22 -17
app.py CHANGED

@@ -5,7 +5,7 @@
 # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
 
 #----------------------------------------------------------------------------------------------------------------------------
-import os
+# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
 import torch
 from datasets import load_dataset
 from transformers import (
@@ -19,7 +19,8 @@ from transformers import (
 )
 from peft import LoraConfig, PeftModel
 from trl import SFTTrainer
-# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 # LoRA attention dimension
 lora_r = 64
@@ -38,7 +39,7 @@ lora_dropout = 0.1
 use_4bit = True
 
 # Compute dtype for 4-bit base models
-bnb_4bit_compute_dtype = "float16"
+bnb_4bit_compute_dtype = "float32" # Changed to float32 for CPU compatibility
 
 # Quantization type (fp4 or nf4)
 bnb_4bit_quant_type = "nf4"
@@ -46,14 +47,14 @@ bnb_4bit_quant_type = "nf4"
 # Activate nested quantization for 4-bit base models (double quantization)
 use_nested_quant = False
 
-# Load the entire model on the GPU 0
-device_map = {"": 0}
+# Remove device_map, as it's GPU-specific
+# device_map = {"": 0}
 
-#----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 model_name = "DR-DRR/Model_001"
-model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
+model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
 
-#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 # Load tokenizer and model with QLoRA configuration
 compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
@@ -65,19 +66,13 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=use_nested_quant,
 )
 
-# Check GPU compatibility with bfloat16
-if compute_dtype == torch.float16 and use_4bit:
-    major, _ = torch.cuda.get_device_capability()
-    if major >= 8:
-        print("=" * 80)
-        print("Your GPU supports bfloat16: accelerate training with bf16=True")
-        print("=" * 80)
+# Remove GPU-specific check for bfloat16
 
 # Load base model
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     quantization_config=bnb_config,
-    device_map=device_map
+    # Remove device_map for CPU usage
 )
 model.config.use_cache = False
 model.config.pretraining_tp = 1
@@ -85,7 +80,7 @@ model.config.pretraining_tp = 1
 # Load LLaMA tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
+tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
 
 # Load LoRA configuration
 peft_config = LoraConfig(
@@ -96,6 +91,16 @@ peft_config = LoraConfig(
     task_type="CAUSAL_LM",
 )
 
+# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# Ignore warnings
+logging.set_verbosity(logging.CRITICAL)
+
+# Run text generation pipeline with our next model
+prompt = "What is a large language model?"
+pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+result = pipe(f"<s>[INST] {prompt} [/INST]")
+print(result[0]['generated_text'])
+
 #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
 # Ignore warnings
 logging.set_verbosity(logging.CRITICAL)
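Taken together, the hunks retarget app.py from GPU to CPU: the compute dtype moves from "float16" to "float32", device_map = {"": 0} is commented out, and the CUDA capability probe is dropped. One caveat: the bitsandbytes 4-bit kernels in the pinned 0.40.2 release are CUDA-only, so passing quantization_config on a CPU-only machine will generally still fail at load time. A minimal device-aware sketch (ours, not the commit's) that keeps 4-bit quantization when a GPU is present and falls back to a plain float32 load otherwise:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_name = "DR-DRR/Model_001"

if torch.cuda.is_available():
    # GPU path: 4-bit NF4 quantization with float16 compute, model on GPU 0
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0},
    )
else:
    # CPU path: bitsandbytes 4-bit kernels need CUDA at these versions,
    # so skip quantization entirely and load the weights in float32
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
    )

model.config.use_cache = False
model.config.pretraining_tp = 1

Either branch leaves model ready for the LoraConfig and generation steps that follow.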
 
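The diff imports SFTTrainer and defines peft_config, but the training call itself sits outside the hunks shown. For context, a hedged sketch of how a LoraConfig is typically handed to SFTTrainer under the pinned trl==0.4.7; the dataset name and hyperparameters here are illustrative assumptions, not values from the commit:

from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# Hypothetical dataset and hyperparameters, not taken from the commit
dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=2e-4,
)

trainer = SFTTrainer(
    model=model,                # the quantized base model loaded above
    train_dataset=dataset,
    peft_config=peft_config,    # the LoraConfig from the diff
    dataset_text_field="text",  # column holding the training text
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=None,
)
trainer.train()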
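The generation block added at the end wraps the prompt in Llama-2's [INST] chat tags; note that pipeline and logging must both come from the elided from transformers import (...) block for it to run, and that the commit leaves the original "Ignore warnings" section in place below the new one, so the verbosity call now appears twice. Since transformers returns the prompt and completion together in generated_text by default, a small sketch of printing the completion alone (the helper name is ours, not the commit's):

from transformers import pipeline

# Assumes `model` and `tokenizer` are the objects loaded earlier in app.py
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

def completion_only(outputs, wrapped_prompt):
    """Hypothetical helper, not part of the commit: drop the echoed prompt."""
    text = outputs[0]["generated_text"]
    # The decoded text normally starts with the wrapped prompt; guard in case
    # special tokens such as the leading <s> are not reproduced verbatim
    if text.startswith(wrapped_prompt):
        return text[len(wrapped_prompt):].strip()
    return text.strip()

prompt = "What is a large language model?"
wrapped = f"<s>[INST] {prompt} [/INST]"
print(completion_only(pipe(wrapped), wrapped))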