DR-Rakshitha committed
Commit 3eb63d1
1 Parent(s): 1832b83

Update app.py

Files changed (1)
  1. app.py +105 -115
app.py CHANGED
@@ -1,105 +1,105 @@
  # import gradio as gr
  # from transformers import AutoModelForCausalLM, AutoTokenizer

- # from gpt4all import GPT4All
- # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
-
- #----------------------------------------------------------------------------------------------------
- # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
- # import os
- import torch
- from datasets import load_dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     BitsAndBytesConfig,
-     HfArgumentParser,
-     TrainingArguments,
-     pipeline,
-     logging,
- )
- from peft import LoraConfig, PeftModel
- from trl import SFTTrainer
- # ----------------------------------------------------------------------------------------------------
-
- # LoRA attention dimension
- lora_r = 64
-
- # Alpha parameter for LoRA scaling
- lora_alpha = 16
-
- # Dropout probability for LoRA layers
- lora_dropout = 0.1
-
- ################################################################################
- # bitsandbytes parameters
- ################################################################################
-
- # Activate 4-bit precision base model loading
- use_4bit = True
-
- # Compute dtype for 4-bit base models
- bnb_4bit_compute_dtype = "float16"
-
- # Quantization type (fp4 or nf4)
- bnb_4bit_quant_type = "nf4"
-
- # Activate nested quantization for 4-bit base models (double quantization)
- use_nested_quant = False
-
- # Load the entire model on the GPU 0
- device_map = {"": 0}
-
- #----------------------------------------------------------------------------------------------------
- model_name = "DR-DRR/Model_001"
- model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
-
- #----------------------------------------------------------------------------------------------------
-
- # Load tokenizer and model with QLoRA configuration
- compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=use_4bit,
-     bnb_4bit_quant_type=bnb_4bit_quant_type,
-     bnb_4bit_compute_dtype=compute_dtype,
-     bnb_4bit_use_double_quant=use_nested_quant,
- )
-
- # Check GPU compatibility with bfloat16
- if compute_dtype == torch.float16 and use_4bit:
-     major, _ = torch.cuda.get_device_capability()
-     if major >= 8:
-         print("=" * 80)
-         print("Your GPU supports bfloat16: accelerate training with bf16=True")
-         print("=" * 80)
-
- # Load base model
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     quantization_config=bnb_config,
-     device_map=device_map
- )
- model.config.use_cache = False
- model.config.pretraining_tp = 1
-
- # Load LLaMA tokenizer
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- tokenizer.pad_token = tokenizer.eos_token
- tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
-
- # Load LoRA configuration
- peft_config = LoraConfig(
-     lora_alpha=lora_alpha,
-     lora_dropout=lora_dropout,
-     r=lora_r,
-     bias="none",
-     task_type="CAUSAL_LM",
- )
-
- #----------------------------------------------------------------------------------------------------
- # Ignore warnings
- logging.set_verbosity(logging.CRITICAL)
+ from gpt4all import GPT4All
+ model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
+
+ # #----------------------------------------------------------------------------------------------------
+ # # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
+ # # import os
+ # import torch
+ # from datasets import load_dataset
+ # from transformers import (
+ #     AutoModelForCausalLM,
+ #     AutoTokenizer,
+ #     BitsAndBytesConfig,
+ #     HfArgumentParser,
+ #     TrainingArguments,
+ #     pipeline,
+ #     logging,
+ # )
+ # from peft import LoraConfig, PeftModel
+ # from trl import SFTTrainer
+ # # ----------------------------------------------------------------------------------------------------
+
+ # # LoRA attention dimension
+ # lora_r = 64
+
+ # # Alpha parameter for LoRA scaling
+ # lora_alpha = 16
+
+ # # Dropout probability for LoRA layers
+ # lora_dropout = 0.1
+
+ # ################################################################################
+ # # bitsandbytes parameters
+ # ################################################################################
+
+ # # Activate 4-bit precision base model loading
+ # use_4bit = True
+
+ # # Compute dtype for 4-bit base models
+ # bnb_4bit_compute_dtype = "float16"
+
+ # # Quantization type (fp4 or nf4)
+ # bnb_4bit_quant_type = "nf4"
+
+ # # Activate nested quantization for 4-bit base models (double quantization)
+ # use_nested_quant = False
+
+ # # Load the entire model on the GPU 0
+ # device_map = {"": 0}
+
+ # #----------------------------------------------------------------------------------------------------
+ # model_name = "DR-DRR/Model_001"
+ # model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
+
+ # #----------------------------------------------------------------------------------------------------
+
+ # # Load tokenizer and model with QLoRA configuration
+ # compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+ # bnb_config = BitsAndBytesConfig(
+ #     load_in_4bit=use_4bit,
+ #     bnb_4bit_quant_type=bnb_4bit_quant_type,
+ #     bnb_4bit_compute_dtype=compute_dtype,
+ #     bnb_4bit_use_double_quant=use_nested_quant,
+ # )
+
+ # # Check GPU compatibility with bfloat16
+ # if compute_dtype == torch.float16 and use_4bit:
+ #     major, _ = torch.cuda.get_device_capability()
+ #     if major >= 8:
+ #         print("=" * 80)
+ #         print("Your GPU supports bfloat16: accelerate training with bf16=True")
+ #         print("=" * 80)
+
+ # # Load base model
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     model_name,
+ #     quantization_config=bnb_config,
+ #     device_map=device_map
+ # )
+ # model.config.use_cache = False
+ # model.config.pretraining_tp = 1
+
+ # # Load LLaMA tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ # tokenizer.pad_token = tokenizer.eos_token
+ # tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
+
+ # # Load LoRA configuration
+ # peft_config = LoraConfig(
+ #     lora_alpha=lora_alpha,
+ #     lora_dropout=lora_dropout,
+ #     r=lora_r,
+ #     bias="none",
+ #     task_type="CAUSAL_LM",
+ # )
+
+ # #----------------------------------------------------------------------------------------------------
+ # # Ignore warnings
+ # logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model
  # prompt = "What is a large language model?"
@@ -109,16 +109,6 @@ logging.set_verbosity(logging.CRITICAL)

  # ----------------------------------------------------------------------------------------------------
  # Ignore warnings
- logging.set_verbosity(logging.CRITICAL)
-
- # Run text generation pipeline with our next model
- # prompt = "What is a large language model?"
- # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
- # result = pipe(f"<s>[INST] {prompt} [/INST]")
- # print(result[0]['generated_text'])
-
- #----------------------------------------------------------------------------------------------------
- # Ignore warnings
  # logging.set_verbosity(logging.CRITICAL)

  # Run text generation pipeline with our next model
@@ -129,13 +119,13 @@ logging.set_verbosity(logging.CRITICAL)


  def generate_text(prompt):
-     # output = model.generate(input_text)
-     pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
-     result = pipe(f"<s>[INST] {prompt} [/INST]")
-     # prompt = "What is a large language model?"
-     # input_ids = tokenizer.encode(prompt, return_tensors="pt")
+     # # output = model.generate(input_text)
+     # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+     # result = pipe(f"<s>[INST] {prompt} [/INST]")
+     # # prompt = "What is a large language model?"
+     # # input_ids = tokenizer.encode(prompt, return_tensors="pt")

-     # output = model.generate(input_ids, max_length=200, num_return_sequences=1)
+     output = model.generate(input_ids, max_length=200, num_return_sequences=1)
      # result = tokenizer.decode(output[0], skip_special_tokens=True)
      return result

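Note on the new generate_text: as committed it calls model.generate(input_ids, max_length=200, num_return_sequences=1) and returns result, but input_ids and result are only defined in the commented-out transformers path, and those keyword arguments look like transformers generate() options rather than gpt4all ones, so the Space would raise a NameError on its first request. Below is a minimal sketch of what the GPT4All path could look like, assuming a gpt4all release whose GPT4All.generate accepts a prompt string and a max_tokens argument; the Gradio wiring and the demo name are likewise assumptions, suggested only by the commented-out "import gradio as gr".

from gpt4all import GPT4All
import gradio as gr  # assumption: UI layer hinted at by the commented-out import

# Same ggml checkpoint the commit loads at module level.
model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

def generate_text(prompt):
    # GPT4All generates directly from the prompt string; there is no
    # tokenizer or input_ids step as in the commented-out transformers path.
    return model.generate(prompt, max_tokens=200)

# Hypothetical Gradio wiring so the Space exposes the function.
demo = gr.Interface(fn=generate_text, inputs="text", outputs="text")
demo.launch()

This keeps the generate_text(prompt) signature the diff already uses, so any existing caller of the function would not need to change.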