SagarKeshave committed on
Commit 6d6c51d · verified · 1 Parent(s): f1753a9

Update app.py
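
This update replaces the fp16 `pipeline("text-generation", ...)` setup for WizardLM/WizardMath-7B-V1.1 with 4-bit NF4 loading via `BitsAndBytesConfig`, and answers the Streamlit text box directly with `model.generate()`.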

Files changed (1)
  1. app.py +31 -31
app.py CHANGED
@@ -2,52 +2,52 @@ import streamlit as st
 
 
 import transformers
-import torch
+# import torch
 import json
 import os
-from transformers import AutoTokenizer, TextStreamer , pipeline
+# from transformers import AutoTokenizer, TextStreamer , pipeline
 
 
-model_id = "WizardLM/WizardMath-7B-V1.1"
+# model_id = "WizardLM/WizardMath-7B-V1.1"
 
 
-# Configuration
-runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu)
-cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive.
-scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM.
+# # Configuration
+# runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu)
+# cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive.
+# scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM.
 
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 
-from transformers import AutoTokenizer, AutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    offload_folder="offload",
-    pad_token_id=tokenizer.eos_token_id,
-    offload_state_dict = True,
-    torch_dtype=torch.float16,
+model_id = "WizardLM/WizardMath-7B-V1.1"
 
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
 
-    # rope_scaling = {"type": "dynamic", "factor": scaling_factor}
-)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
 
+# question = st.text_area("Enter question")  # unused; duplicates the widget below
 
+# text = "Sum of two numbers is 20 and difference is 4. What are the numbers?"
+text = st.text_area("Enter question")
 
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=512,
-    temperature=0.7,
-    top_p=0.95,
-    repetition_penalty=1.15
-)
 
-question = st.text_area("Enter questoin")
+# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 
 if text:
-    out = pipe(question)[0]['generated_text']
-    st.write(out)
+
+    device = "cuda:0"
+
+    inputs = tokenizer(str(text), return_tensors="pt").to(device)
+
+    outputs = model_4bit.generate(**inputs, max_new_tokens=512)
+    # out = pipe(question)[0]['generated_text']
+
+    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))
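
For reference, a minimal sketch (not part of this commit) of how the same 4-bit loading path could be wrapped in `st.cache_resource` so Streamlit does not re-download and re-quantize the 7B model on every rerun. It assumes `bitsandbytes` and `accelerate` are installed, a CUDA GPU is available, and Streamlit >= 1.18 (for `st.cache_resource`).

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "WizardLM/WizardMath-7B-V1.1"

@st.cache_resource  # load tokenizer + model once per server process, not on every rerun
def load_model():
    # Same 4-bit NF4 setup as the commit: weights stored in 4 bits,
    # nested quantization of the quantization constants, bf16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, quantization_config=bnb_config, device_map="auto"
    )
    return tokenizer, model

tokenizer, model = load_model()

text = st.text_area("Enter question")
if text:
    # device_map="auto" already placed the weights; inputs just need to be on the same device
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))

Caching the load this way keeps the app responsive after the first request, since only `generate()` runs per question.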