Spaces:

mamkkl
/

demo1

Paused

mamkkl committed on Jan 9

Commit

a9182e1

verified ·

1 Parent(s): 67980d4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,27 +44,7 @@ PROMPT_DICT = {
         "Instruction:\n{instruction}\n\nResponse:"
     ),
 }
-from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-replace_llama_rope_with_scaled_rope()
-base_model = transformers.AutoModelForCausalLM.from_pretrained(
-            base_model,
-            torch_dtype=torch.float16,
-            cache_dir=cache_dir,
-            device_map="auto",
-        )
-model = PeftModel.from_pretrained(
-            base_model,
-            lora_weights,
-            device_map="auto",
-            cache_dir=cache_dir,
-            torch_dtype=torch.float16,
-            assign=True
-        )
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-tokenizer =  AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-tokenizer.pad_token = tokenizer.unk_token
 def generate_prompt(instruction, input=None):
     if input:
         return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
@@ -86,7 +66,26 @@ def generator(input_ids, generation_config, max_new_tokens):
                 max_new_tokens=max_new_tokens,
             )
     return generation_output
 #@spaces.GPU(duration=120)
 def respond(
     message,
@@ -96,6 +95,9 @@ def respond(
     temperature,
     top_p,
 ):
     ins_f = generate_prompt(message,None)
     inputs  =  tokenizer(ins_f, return_tensors="pt")
     input_ids = inputs["input_ids"].cuda()

         "Instruction:\n{instruction}\n\nResponse:"
     ),
 }
 def generate_prompt(instruction, input=None):
     if input:
         return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
                 max_new_tokens=max_new_tokens,
             )
     return generation_output
+def loadModel():
+    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+    replace_llama_rope_with_scaled_rope()
+    base_model = transformers.AutoModelForCausalLM.from_pretrained(
+                base_model,
+                torch_dtype=torch.float16,
+                cache_dir=cache_dir,
+                device_map="auto",
+            )
+    model = PeftModel.from_pretrained(
+                base_model,
+                lora_weights,
+                device_map="auto",
+                cache_dir=cache_dir,
+                torch_dtype=torch.float16
+            )
+    return model
 #@spaces.GPU(duration=120)
 def respond(
     message,
     temperature,
     top_p,
 ):
+    model = loadModel()
+    tokenizer =  AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+    tokenizer.pad_token = tokenizer.unk_token
     ins_f = generate_prompt(message,None)
     inputs  =  tokenizer(ins_f, return_tensors="pt")
     input_ids = inputs["input_ids"].cuda()