DR-Rakshitha committed
Commit 52102b1 · 1 Parent(s): 6c8e87d

Update app.py

Files changed (1)
  1. app.py +10 -84
app.py CHANGED
@@ -5,92 +5,13 @@
  # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

  #----------------------------------------------------------------------------------------------------------------------------
- # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
- import torch
- # from datasets import load_dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     HfArgumentParser,
-     TrainingArguments,
-     pipeline,
-     logging,
- )
- from peft import LoraConfig, PeftModel
- from trl import SFTTrainer
+ from transformers import AutoModelForCausalLM, AutoTokenizer

- # -----------------------------------------------------------------------------------------------------------------------------------------------------------------
-
- # LoRA attention dimension
- lora_r = 64
-
- # Alpha parameter for LoRA scaling
- lora_alpha = 16
-
- # Dropout probability for LoRA layers
- lora_dropout = 0.1
-
- ################################################################################
- # bitsandbytes parameters
- ################################################################################
-
- # Activate 4-bit precision base model loading
- use_4bit = True
-
- # Compute dtype for 4-bit base models
- bnb_4bit_compute_dtype = "float32" # Changed to float32 for CPU compatibility
-
- # Quantization type (fp4 or nf4)
- bnb_4bit_quant_type = "nf4"
-
- # Activate nested quantization for 4-bit base models (double quantization)
- use_nested_quant = False
-
- # Remove device_map, as it's GPU-specific
- # device_map = {"": 0}
-
- # ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
  model_name = "DR-DRR/Model_001"
- model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
-
- # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
- # Load tokenizer and model with QLoRA configuration
- compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=use_4bit,
-     bnb_4bit_quant_type=bnb_4bit_quant_type,
-     bnb_4bit_compute_dtype=compute_dtype,
-     bnb_4bit_use_double_quant=use_nested_quant,
-     bnb_4bit_disable_gpu=True, # Add this line to disable GPU quantization
- )
-
- # Remove GPU-specific check for bfloat16
-
- # Load base model
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     quantization_config=bnb_config,
-     # Remove device_map for CPU usage
- )
- model.config.use_cache = False
- model.config.pretraining_tp = 1
-
- # Load LLaMA tokenizer
+ model = AutoModelForCausalLM.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- tokenizer.pad_token = tokenizer.eos_token
- tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

- # Load LoRA configuration
- peft_config = LoraConfig(
-     lora_alpha=lora_alpha,
-     lora_dropout=lora_dropout,
-     r=lora_r,
-     bias="none",
-     task_type="CAUSAL_LM",
- )
+ # print(generated_text)

  # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
  # Ignore warnings
@@ -115,8 +36,13 @@ logging.set_verbosity(logging.CRITICAL)

  def generate_text(prompt):
      # output = model.generate(input_text)
-     pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
-     result = pipe(f"<s>[INST] {prompt} [/INST]")
+     # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+     # result = pipe(f"<s>[INST] {prompt} [/INST]")
+     # prompt = "What is a large language model?"
+     input_ids = tokenizer.encode(prompt, return_tensors="pt")
+
+     output = model.generate(input_ids, max_length=200, num_return_sequences=1)
+     result = tokenizer.decode(output[0], skip_special_tokens=True)
      return result

  text_generation_interface = gr.Interface(
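The gr.Interface( call is cut off by the diff context above. A rough sketch of how the simplified app.py fits together after this commit; the Interface arguments and the launch() call below are assumptions for illustration, not code taken from this repository:

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "DR-DRR/Model_001"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

def generate_text(prompt):
    # Tokenize the prompt, generate up to 200 tokens on CPU, and decode the result
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=200, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Hypothetical wiring: the real Interface arguments are not visible in this diff
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Prompt"),
    outputs=gr.Textbox(label="Generated text"),
)

if __name__ == "__main__":
    text_generation_interface.launch()

Generation runs in full precision on the CPU here, so a 200-token completion can be slow for larger checkpoints; that is the trade-off of dropping the GPU-only 4-bit path.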
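For reference, the removed loading path relied on bitsandbytes 4-bit quantization, whose kernels in these versions (bitsandbytes 0.40.2, transformers 4.31.0) run only on CUDA GPUs, and as far as I can tell BitsAndBytesConfig accepts no bnb_4bit_disable_gpu argument. A minimal sketch of how that 4-bit load is usually configured on a GPU machine, mirroring the removed settings (device_map="auto" is an assumption, not part of the old file):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 4-bit quantization, as in the removed configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # the removed code used float32 for CPU; fp16 is typical on GPU
    bnb_4bit_use_double_quant=False,
)

# Requires a CUDA device; device_map places the quantized layers on the GPU
model = AutoModelForCausalLM.from_pretrained(
    "DR-DRR/Model_001",
    quantization_config=bnb_config,
    device_map="auto",
)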