mamkkl committed
Commit 7472fb6 · verified · 1 parent: d552bfa

Update app.py

Files changed (1)
  1. app.py +26 -23
app.py CHANGED
@@ -68,26 +68,32 @@ def generator(input_ids, generation_config, max_new_tokens):
     return generation_output
 
 def loadModel():
-    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-    replace_llama_rope_with_scaled_rope()
-    t_model = transformers.AutoModelForCausalLM.from_pretrained(
-        base_model,
-        torch_dtype=torch.float16,
-        cache_dir=cache_dir,
-        device_map="auto",
-    )
-    print(t_model.state_dict().keys())
-    model = PeftModel.from_pretrained(
-        t_model,
-        lora_weights,
-        device_map="auto",
-        cache_dir=cache_dir,
-        torch_dtype=torch.float16,
-        is_trainable=False,
-    )
-    model.eval()
+    global model, tokenizer
+    if model is None:
+        from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+        replace_llama_rope_with_scaled_rope()
+        t_model = transformers.AutoModelForCausalLM.from_pretrained(
+            base_model,
+            torch_dtype=torch.float16,
+            cache_dir=cache_dir,
+            device_map="auto",
+        )
+        model = PeftModel.from_pretrained(
+            t_model,
+            lora_weights,
+            device_map="auto",
+            cache_dir=cache_dir,
+            torch_dtype=torch.float16,
+            is_trainable=False,
+        )
+        model.eval()
+        tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+        tokenizer.pad_token = tokenizer.unk_token
+    model = model.to("cuda")
     return model
-
+
+model, tokenizer = loadModel()
+
 #@spaces.GPU(duration=120)
 def respond(
     message,
@@ -96,10 +102,7 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-):
-    model = loadModel()
-    tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-    tokenizer.pad_token = tokenizer.unk_token
+):
     ins_f = generate_prompt(message,None)
     inputs = tokenizer(ins_f, return_tensors="pt")
     input_ids = inputs["input_ids"].cuda()
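
In short, the commit turns loadModel() into a load-once cache: model and tokenizer become module-level globals, the expensive base-model download, LoRA wrapping, and tokenizer setup run only while model is still None, and respond() stops reloading everything on every request. One wrinkle worth noting: loadModel() still ends with return model, yet the new top-level line unpacks two values (model, tokenizer = loadModel()). Below is a minimal, self-contained sketch of the same lazy-loading pattern with the return adjusted to hand back both objects; the checkpoint names are placeholders, and the repo-local RoPE monkey patch and the trailing model.to("cuda") (arguably redundant under device_map="auto") are deliberately left out.

    import torch
    import transformers
    from transformers import AutoTokenizer
    from peft import PeftModel

    # Placeholders standing in for the values defined elsewhere in app.py;
    # these are NOT the Space's real checkpoints.
    base_model = "meta-llama/Llama-2-7b-hf"   # assumed base checkpoint
    lora_weights = "some-user/some-lora"      # assumed adapter repo
    cache_dir = None

    model = None       # filled in on the first loadModel() call, reused afterwards
    tokenizer = None

    def loadModel():
        global model, tokenizer
        if model is None:  # the expensive path runs at most once per process
            t_model = transformers.AutoModelForCausalLM.from_pretrained(
                base_model,
                torch_dtype=torch.float16,
                cache_dir=cache_dir,
                device_map="auto",
            )
            # Wrap the frozen base weights with the LoRA adapter, inference only.
            model = PeftModel.from_pretrained(
                t_model,
                lora_weights,
                torch_dtype=torch.float16,
                is_trainable=False,
            )
            model.eval()
            tokenizer = AutoTokenizer.from_pretrained(
                base_model, use_fast=False, cache_dir=cache_dir
            )
            # LLaMA tokenizers ship without a pad token; reuse <unk> as the commit does.
            tokenizer.pad_token = tokenizer.unk_token
        # Returning both avoids the mismatch in the committed code, which returns
        # only `model` but is unpacked as `model, tokenizer = loadModel()`.
        return model, tokenizer

    # Runs once at import time, mirroring the commit's new top-level call.
    model, tokenizer = loadModel()

With the cache populated at import time, the second hunk's change follows naturally: respond() drops its per-request loadModel() and tokenizer setup and uses the module-level model and tokenizer directly, so only the first request (or the import itself) pays the load cost.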