mamkkl committed
Commit eda9136 · verified · 1 Parent(s): 3548597

Update app.py

Files changed (1):
  1. app.py +100 -101
app.py CHANGED
@@ -30,24 +30,7 @@ lora_weights = "./"
 #)

 cache_dir = "/data"
-from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-replace_llama_rope_with_scaled_rope()
-model = transformers.AutoModelForCausalLM.from_pretrained(
-    base_model,
-    torch_dtype=torch.float16,
-    cache_dir=cache_dir,
-    device_map="auto",
-)
-
-model = PeftModel.from_pretrained(
-    model,
-    lora_weights,
-    device_map="auto",
-    cache_dir=cache_dir,
-    torch_dtype=torch.float16,
-)
-tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-tokenizer.pad_token = tokenizer.unk_token
+
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with further context. "
@@ -61,91 +44,107 @@ PROMPT_DICT = {
     ),
 }

-def generate_prompt(instruction, input=None):
-    if input:
-        return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
-    else:
-        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    ins_f = generate_prompt(instruction,input)
-    inputs = tokenizer(ins_f, return_tensors="pt")
-    input_ids = inputs["input_ids"].cuda()
-    generation_config = GenerationConfig(
-        temperature=0.1,
-        top_p=0.75,
-        top_k=40,
-        do_sample=True,
-        num_beams=1,
-        max_new_tokens = 512
-    )
-
-    # Without streaming
-    with torch.no_grad():
-        generation_output = model.generate(
-            input_ids=input_ids,
-            generation_config=generation_config,
-            return_dict_in_generate=True,
-            output_scores=False,
-            max_new_tokens=max_new_tokens,
-        )
-    s = generation_output.sequences[0]
-    output = tokenizer.decode(s)
-    response = output.split("Response:")[1].strip()
-    yield response
-
-    #messages = [{"role": "system", "content": system_message}]
-
-    #for val in history:
-    #    if val[0]:
-    #        messages.append({"role": "user", "content": val[0]})
-    #    if val[1]:
-    #        messages.append({"role": "assistant", "content": val[1]})
-
-    #    messages.append({"role": "user", "content": message})
-
-    #response = ""
-
-    #for message in client.chat_completion(
-    #    messages,
-    #    max_tokens=max_tokens,
-    #    stream=True,
-    #    temperature=temperature,
-    #    top_p=top_p,
-    #):
-    #    token = message.choices[0].delta.content
-
-    #    response += token
-    #    yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
+if __name__ == "__main__":
+    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+    replace_llama_rope_with_scaled_rope()
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.float16,
+        cache_dir=cache_dir,
+        device_map="auto",
+    )
+
+    model = PeftModel.from_pretrained(
+        model,
+        lora_weights,
+        device_map="auto",
+        cache_dir=cache_dir,
+        torch_dtype=torch.float16,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+    tokenizer.pad_token = tokenizer.unk_token
+    def generate_prompt(instruction, input=None):
+        if input:
+            return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
+        else:
+            return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
+
+    def respond(
+        message,
+        history: list[tuple[str, str]],
+        system_message,
+        max_tokens,
+        temperature,
+        top_p,
+    ):
+        ins_f = generate_prompt(instruction,input)
+        inputs = tokenizer(ins_f, return_tensors="pt")
+        input_ids = inputs["input_ids"].cuda()
+        generation_config = GenerationConfig(
+            temperature=0.1,
+            top_p=0.75,
+            top_k=40,
+            do_sample=True,
+            num_beams=1,
+            max_new_tokens = 512
+        )
+
+        # Without streaming
+        with torch.no_grad():
+            generation_output = model.generate(
+                input_ids=input_ids,
+                generation_config=generation_config,
+                return_dict_in_generate=True,
+                output_scores=False,
+                max_new_tokens=max_new_tokens,
+            )
+        s = generation_output.sequences[0]
+        output = tokenizer.decode(s)
+        response = output.split("Response:")[1].strip()
+        yield response
+
+        #messages = [{"role": "system", "content": system_message}]
+
+        #for val in history:
+        #    if val[0]:
+        #        messages.append({"role": "user", "content": val[0]})
+        #    if val[1]:
+        #        messages.append({"role": "assistant", "content": val[1]})
+
+        #    messages.append({"role": "user", "content": message})
+
+        #response = ""
+
+        #for message in client.chat_completion(
+        #    messages,
+        #    max_tokens=max_tokens,
+        #    stream=True,
+        #    temperature=temperature,
+        #    top_p=top_p,
+        #):
+        #    token = message.choices[0].delta.content
+
+        #    response += token
+        #    yield response
+
+
+    """
+    For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+    """
+    demo = gr.ChatInterface(
+        respond,
+        additional_inputs=[
+            gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+            gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.95,
+                step=0.05,
+                label="Top-p (nucleus sampling)",
+            ),
+        ],
+    )
     model.eval()
     demo.launch()
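
Note on the respond() handler as committed: it never uses its message, max_tokens, temperature, or top_p arguments, it calls generate_prompt(instruction, input) with names that are not defined inside the function, and it passes max_new_tokens to model.generate() even though that name only exists as a keyword of GenerationConfig, so the first chat turn would raise a NameError. Below is a minimal sketch (not part of this commit) of how the handler could be wired up, assuming the model, tokenizer, and generate_prompt defined in app.py above and treating the incoming chat message as the instruction with no separate context input:

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Sketch only: uses the chat message as the instruction and the Gradio
    # slider values for sampling; assumes model/tokenizer/generate_prompt above.
    ins_f = generate_prompt(message)
    input_ids = tokenizer(ins_f, return_tensors="pt")["input_ids"].to(model.device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        do_sample=True,
        num_beams=1,
    )
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            max_new_tokens=max_tokens,  # slider value, not the undefined name
        )
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    yield text.split("Response:")[-1].strip()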
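
The commented-out block kept inside respond() comes from the Gradio ChatInterface template and streams tokens from an InferenceClient, which does not apply to a locally loaded PEFT model. If streaming output is wanted for this app, one possible approach (again a sketch under the same assumptions, not part of this commit) is transformers' TextIteratorStreamer: run model.generate in a background thread and yield the growing text as chunks arrive. The name respond_streaming is hypothetical.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Hypothetical streaming variant; reuses model, tokenizer, generate_prompt from app.py.
    prompt = generate_prompt(message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()
    partial = ""
    for new_text in streamer:  # yields decoded text chunks as they are generated
        partial += new_text
        yield partial  # gr.ChatInterface re-renders the partial reply on each yield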