import gradio as gr
from huggingface_hub import InferenceClient
import transformers
from transformers import AutoTokenizer, GenerationConfig
import torch
from peft import PeftModel
import spaces

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
#client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
base_model = "Neko-Institute-of-Science/LLaMA-65B-HF"
lora_weights = "./"
#lora_weights = LoraConfig(
#    auto_mapping=None,
#    base_model_name_or_path="Neko-Institute-of-Science/LLaMA-65B-HF",
#   bias=None,
#    fan_in_fan_out=False,
#    inference_mode=True,
#    init_lora_weights=True,
#    layers_pattern=None,
#    layers_to_transform=None,
#    lora_alpha=16,
#    lora_dropout=0.05,
#    modules_to_save=None,
#    peft_type="LORA",
#    revision=None,
#    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
#    task_type="CAUSAL_LM",
#)

cache_dir = "/data"

PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with further context. "
        "Write a response that appropriately completes the request.\n\n"
        "Instruction:\n{instruction}\n\n Input:\n{input}\n\n Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "Instruction:\n{instruction}\n\nResponse:"
    ),
}
# Monkey-patch LLaMA's rotary position embeddings with a scaled variant
# (defined in llama_rope_scaled_monkey_patch.py) before the model is instantiated.
from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
replace_llama_rope_with_scaled_rope()
model = transformers.AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype=torch.float16,
            cache_dir=cache_dir,
            device_map="auto",
        )

model = PeftModel.from_pretrained(
            model,
            lora_weights,
            device_map="auto",
            cache_dir=cache_dir,
            torch_dtype=torch.float16,
        )
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, cache_dir=cache_dir)
# The LLaMA tokenizer ships without a pad token; reuse the unk token so padding works.
tokenizer.pad_token = tokenizer.unk_token
def generate_prompt(instruction, input=None):
    if input:
        return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
    else:
        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)

@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    ins_f = generate_prompt(message, None)
    inputs = tokenizer(ins_f, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    # Wire the UI sliders through to generation instead of hard-coding the sampling values.
    generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=40,
            do_sample=True,
            num_beams=1,
            max_new_tokens=max_tokens,
        )

    # Without streaming: generate the full completion, then yield it once.
    with torch.no_grad():
        generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=False,
                max_new_tokens=max_tokens,
            )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s, skip_special_tokens=True)
    # The prompt ends with "Response:", so everything after that marker is the answer.
    response = output.split("Response:")[1].strip()
    yield response
    
    #messages = [{"role": "system", "content": system_message}]

    #for val in history:
    #    if val[0]:
    #        messages.append({"role": "user", "content": val[0]})
    #    if val[1]:
    #        messages.append({"role": "assistant", "content": val[1]})

    # messages.append({"role": "user", "content": message})

    #response = ""

    #for message in client.chat_completion(
    #    messages,
    #    max_tokens=max_tokens,
    #    stream=True,
    #    temperature=temperature,
    #    top_p=top_p,
    #):
    #    token = message.choices[0].delta.content

    #    response += token
    #    yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
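# Hypothetical quick smoke test (not part of the app flow), bypassing the Gradio UI;
# the positional arguments mirror the additional_inputs order defined above:
#   for partial in respond("Name three planets.", [], "You are a friendly Chatbot.", 128, 0.7, 0.95):
#       print(partial)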
if __name__ == "__main__":
    model.eval()
    demo.launch()