Spaces · Runtime error
Commit 3eb63d1 · Update app.py
Parent(s): 1832b83

app.py CHANGED
@@ -1,105 +1,105 @@
  # import gradio as gr
  # from transformers import AutoModelForCausalLM, AutoTokenizer

-
-
-
- #----------------------------------------------------------------------------------------------------------------------------
- # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
- # import os
- import torch
- from datasets import load_dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     HfArgumentParser,
-     TrainingArguments,
-     pipeline,
-     logging,
- )
- from peft import LoraConfig, PeftModel
- from trl import SFTTrainer
-
-
- # LoRA attention dimension
- lora_r = 64
-
- # Alpha parameter for LoRA scaling
- lora_alpha = 16
-
- # Dropout probability for LoRA layers
- lora_dropout = 0.1
-
- ################################################################################
- # bitsandbytes parameters
- ################################################################################
-
- # Activate 4-bit precision base model loading
- use_4bit = True
-
- # Compute dtype for 4-bit base models
- bnb_4bit_compute_dtype = "float16"
-
- # Quantization type (fp4 or nf4)
- bnb_4bit_quant_type = "nf4"
-
- # Activate nested quantization for 4-bit base models (double quantization)
- use_nested_quant = False
-
- # Load the entire model on the GPU 0
- device_map = {"": 0}
-
- #----------------------------------------------------------------------------------------------------------------------------
- model_name = "DR-DRR/Model_001"
- model_basename = "pytorch_model-00001-of-00002.bin"  # the model is in bin format
-
- #----------------------------------------------------------------------------------------------------------------------------
-
- # Load tokenizer and model with QLoRA configuration
- compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=use_4bit,
-     bnb_4bit_quant_type=bnb_4bit_quant_type,
-     bnb_4bit_compute_dtype=compute_dtype,
-     bnb_4bit_use_double_quant=use_nested_quant,
- )
-
- # Check GPU compatibility with bfloat16
- if compute_dtype == torch.float16 and use_4bit:
-     major, _ = torch.cuda.get_device_capability()
-     if major >= 8:
-         print("=" * 80)
-         print("Your GPU supports bfloat16: accelerate training with bf16=True")
-         print("=" * 80)
-
- # Load base model
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     quantization_config=bnb_config,
-     device_map=device_map
- )
- model.config.use_cache = False
- model.config.pretraining_tp = 1
-
- # Load LLaMA tokenizer
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- tokenizer.pad_token = tokenizer.eos_token
- tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
-
- # Load LoRA configuration
- peft_config = LoraConfig(
-     lora_alpha=lora_alpha,
-     lora_dropout=lora_dropout,
-     r=lora_r,
-     bias="none",
-     task_type="CAUSAL_LM",
- )
-
- #----------------------------------------------------------------------------------------------------------------------------
- # Ignore warnings
- logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model
  # prompt = "What is a large language model?"

@@ -109,16 +109,6 @@ logging.set_verbosity(logging.CRITICAL)

  # ----------------------------------------------------------------------------------------------------------------------------
  # Ignore warnings
- logging.set_verbosity(logging.CRITICAL)
-
- # Run text generation pipeline with our next model
- # prompt = "What is a large language model?"
- # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
- # result = pipe(f"<s>[INST] {prompt} [/INST]")
- # print(result[0]['generated_text'])
-
- #----------------------------------------------------------------------------------------------------------------------------
- # Ignore warnings
  # logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model

@@ -129,13 +119,13 @@ logging.set_verbosity(logging.CRITICAL)

  def generate_text(prompt):
-     # output = model.generate(input_text)
-     pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
-     result = pipe(f"<s>[INST] {prompt} [/INST]")
-     # prompt = "What is a large language model?"
-     # input_ids = tokenizer.encode(prompt, return_tensors="pt")

-
      # result = tokenizer.decode(output[0], skip_special_tokens=True)
      return result
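For orientation, the block below is a compact, self-contained sketch of the QLoRA-style loading path that the removed lines above implement (4-bit bitsandbytes quantization, a LoRA config, and Llama-2 style "[INST]" prompting), assuming the versions pinned in the removed pip line (transformers 4.31, peft 0.4, bitsandbytes 0.40). It is a reading aid, not part of the commit.

# Sketch only: mirrors the removed QLoRA setup; not part of this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import LoraConfig

model_name = "DR-DRR/Model_001"  # repo referenced by the removed code

# 4-bit quantization config (nf4, float16 compute, no double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model onto GPU 0, then its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map={"": 0}
)
model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA hyperparameters declared (but only used for fine-tuning) in the removed code
peft_config = LoraConfig(lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM")

# Llama-2 style prompting through the text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=200)
print(pipe("<s>[INST] What is a large language model? [/INST]")[0]["generated_text"])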
  # import gradio as gr
  # from transformers import AutoModelForCausalLM, AutoTokenizer

+ from gpt4all import GPT4All
+ model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
+
+ # #----------------------------------------------------------------------------------------------------------------------------
+ # # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
+ # # import os
+ # import torch
+ # from datasets import load_dataset
+ # from transformers import (
+ #     AutoModelForCausalLM,
+ #     AutoTokenizer,
+ #     BitsAndBytesConfig,
+ #     HfArgumentParser,
+ #     TrainingArguments,
+ #     pipeline,
+ #     logging,
+ # )
+ # from peft import LoraConfig, PeftModel
+ # from trl import SFTTrainer
+ # # ----------------------------------------------------------------------------------------------------------------------------
+
+ # # LoRA attention dimension
+ # lora_r = 64
+
+ # # Alpha parameter for LoRA scaling
+ # lora_alpha = 16
+
+ # # Dropout probability for LoRA layers
+ # lora_dropout = 0.1
+
+ # ################################################################################
+ # # bitsandbytes parameters
+ # ################################################################################
+
+ # # Activate 4-bit precision base model loading
+ # use_4bit = True
+
+ # # Compute dtype for 4-bit base models
+ # bnb_4bit_compute_dtype = "float16"
+
+ # # Quantization type (fp4 or nf4)
+ # bnb_4bit_quant_type = "nf4"
+
+ # # Activate nested quantization for 4-bit base models (double quantization)
+ # use_nested_quant = False
+
+ # # Load the entire model on the GPU 0
+ # device_map = {"": 0}
+
+ # #----------------------------------------------------------------------------------------------------------------------------
+ # model_name = "DR-DRR/Model_001"
+ # model_basename = "pytorch_model-00001-of-00002.bin"  # the model is in bin format
+
+ # #----------------------------------------------------------------------------------------------------------------------------
+
+ # # Load tokenizer and model with QLoRA configuration
+ # compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+ # bnb_config = BitsAndBytesConfig(
+ #     load_in_4bit=use_4bit,
+ #     bnb_4bit_quant_type=bnb_4bit_quant_type,
+ #     bnb_4bit_compute_dtype=compute_dtype,
+ #     bnb_4bit_use_double_quant=use_nested_quant,
+ # )
+
+ # # Check GPU compatibility with bfloat16
+ # if compute_dtype == torch.float16 and use_4bit:
+ #     major, _ = torch.cuda.get_device_capability()
+ #     if major >= 8:
+ #         print("=" * 80)
+ #         print("Your GPU supports bfloat16: accelerate training with bf16=True")
+ #         print("=" * 80)
+
+ # # Load base model
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     model_name,
+ #     quantization_config=bnb_config,
+ #     device_map=device_map
+ # )
+ # model.config.use_cache = False
+ # model.config.pretraining_tp = 1
+
+ # # Load LLaMA tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ # tokenizer.pad_token = tokenizer.eos_token
+ # tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
+
+ # # Load LoRA configuration
+ # peft_config = LoraConfig(
+ #     lora_alpha=lora_alpha,
+ #     lora_dropout=lora_dropout,
+ #     r=lora_r,
+ #     bias="none",
+ #     task_type="CAUSAL_LM",
+ # )
+
+ # #----------------------------------------------------------------------------------------------------------------------------
+ # # Ignore warnings
+ # logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model
  # prompt = "What is a large language model?"

  # ----------------------------------------------------------------------------------------------------------------------------
  # Ignore warnings
  # logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model

  def generate_text(prompt):
+     # # output = model.generate(input_text)
+     # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+     # result = pipe(f"<s>[INST] {prompt} [/INST]")
+     # # prompt = "What is a large language model?"
+     # # input_ids = tokenizer.encode(prompt, return_tensors="pt")

+     output = model.generate(input_ids, max_length=200, num_return_sequences=1)
      # result = tokenizer.decode(output[0], skip_special_tokens=True)
      return result
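Note that, as committed, generate_text still references input_ids and result, which are no longer defined once the transformers code is commented out, so the function would raise a NameError when called; this is consistent with the Space's "Runtime error" status. A minimal working sketch using the gpt4all Python binding might look like the following (the max_tokens value is an assumption standing in for the earlier max_length=200, and the exact generate keyword arguments depend on the installed gpt4all version):

# Hypothetical repair sketch, not part of this commit.
from gpt4all import GPT4All

model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

def generate_text(prompt):
    # GPT4All.generate takes the prompt string directly; no tokenizer is needed.
    # max_tokens caps the length of the completion (assumed analogue of max_length=200).
    return model.generate(prompt, max_tokens=200)

print(generate_text("What is a large language model?"))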