mamkkl committed
Commit a9182e1 · verified · 1 Parent(s): 67980d4

Update app.py

Files changed (1):
  app.py (+23 -21)
app.py CHANGED

@@ -44,27 +44,7 @@ PROMPT_DICT = {
         "Instruction:\n{instruction}\n\nResponse:"
     ),
 }
-from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-replace_llama_rope_with_scaled_rope()
-base_model = transformers.AutoModelForCausalLM.from_pretrained(
-    base_model,
-    torch_dtype=torch.float16,
-    cache_dir=cache_dir,
-    device_map="auto",
-)
 
-model = PeftModel.from_pretrained(
-    base_model,
-    lora_weights,
-    device_map="auto",
-    cache_dir=cache_dir,
-    torch_dtype=torch.float16,
-    assign=True
-)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-tokenizer.pad_token = tokenizer.unk_token
 def generate_prompt(instruction, input=None):
     if input:
         return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
@@ -86,7 +66,26 @@ def generator(input_ids, generation_config, max_new_tokens):
         max_new_tokens=max_new_tokens,
     )
     return generation_output
-
+
+def loadModel():
+    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+    replace_llama_rope_with_scaled_rope()
+    base_model = transformers.AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.float16,
+        cache_dir=cache_dir,
+        device_map="auto",
+    )
+
+    model = PeftModel.from_pretrained(
+        base_model,
+        lora_weights,
+        device_map="auto",
+        cache_dir=cache_dir,
+        torch_dtype=torch.float16
+    )
+    return model
+
 #@spaces.GPU(duration=120)
 def respond(
     message,
@@ -96,6 +95,9 @@ def respond(
     temperature,
     top_p,
 ):
+    model = loadModel()
+    tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+    tokenizer.pad_token = tokenizer.unk_token
     ins_f = generate_prompt(message,None)
     inputs = tokenizer(ins_f, return_tensors="pt")
     input_ids = inputs["input_ids"].cuda()
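
Note on the new loadModel(): as committed, it rebinds base_model inside the function (base_model = transformers.AutoModelForCausalLM.from_pretrained(base_model, ...)), which makes base_model a local name and raises UnboundLocalError before the module-level checkpoint id can be read. It is also called on every respond() invocation, so the base model and LoRA adapter would be reloaded per request. A minimal sketch of one way around both issues, assuming base_model, lora_weights and cache_dir are module-level settings defined elsewhere in app.py (not shown in this diff):

import torch
import transformers
from peft import PeftModel

_MODEL_CACHE = {}

def load_model():
    # Sketch only: cache the PEFT-wrapped model so repeated respond() calls
    # do not re-download or re-dispatch the weights.
    if "model" not in _MODEL_CACHE:
        from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
        replace_llama_rope_with_scaled_rope()
        base = transformers.AutoModelForCausalLM.from_pretrained(
            base_model,                 # module-level checkpoint id; not reassigned here
            torch_dtype=torch.float16,
            cache_dir=cache_dir,
            device_map="auto",
        )
        _MODEL_CACHE["model"] = PeftModel.from_pretrained(
            base,                       # distinct local name, so base_model stays readable
            lora_weights,
            device_map="auto",
            cache_dir=cache_dir,
            torch_dtype=torch.float16,
        )
    return _MODEL_CACHE["model"]

The tokenizer could be cached the same way, since AutoTokenizer.from_pretrained(base_model, use_fast=False, cache_dir=cache_dir) otherwise also runs on every incoming message.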