Spaces:

mamkkl
/

demo1

Paused

File size: 4,296 Bytes

import gradio as gr
from huggingface_hub import InferenceClient
import transformers
from transformers import AutoTokenizer,GenerationConfig
import torch
from peft import PeftModel

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
#client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# The patch has to be applied before the model is instantiated below.
# (Presumably it swaps LLaMA's rotary embedding for a scaled variant; the
# helper lives in util/ and is not visible here -- TODO confirm.)
from util.llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
replace_llama_rope_with_scaled_rope()

base_model = "Neko-Institute-of-Science/LLaMA-65B-HF"
# NOTE(review): PeftModel.from_pretrained expects a directory or Hub repo id
# that *contains* adapter_config.json, not the path of the JSON file itself.
# The intended adapter location is not visible here -- confirm and adjust
# (e.g. "." if the adapter files sit next to this script).
lora_weights = "adapter_config.json"
# None lets the libraries fall back to their default download cache. This name
# was previously referenced without ever being defined.
cache_dir = None

# Base model in half precision; device_map="auto" lets the loader decide
# where the weights are placed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    cache_dir=cache_dir,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter.
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    device_map="auto",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, cache_dir=cache_dir)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
# Inference only: switch off training-mode behaviour.
model.eval()
# Prompt templates, filled in with str.format.
#   "prompt_input"    -> placeholders {instruction} and {input}
#   "prompt_no_input" -> placeholder  {instruction} only
# Sections are separated by one blank line. The leading space in " Input:" and
# " Response:" is kept exactly as the templates were originally written.
PROMPT_DICT = {
    "prompt_input": "\n\n".join(
        (
            "Below is an instruction that describes a task, paired with further context. "
            "Write a response that appropriately completes the request.",
            "Instruction:\n{instruction}",
            " Input:\n{input}",
            " Response:",
        )
    ),
    "prompt_no_input": "\n\n".join(
        (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.",
            "Instruction:\n{instruction}",
            "Response:",
        )
    ),
}

def generate_prompt(instruction, input=None):
    """Render the prompt template that fits the given arguments.

    Args:
        instruction: Text substituted for ``{instruction}``.
        input: Optional extra context substituted for ``{input}``. Any falsy
            value (``None``, empty string) selects the template without an
            input section.

    Returns:
        The formatted prompt string.
    """
    # Guard clause: no usable context -> instruction-only template.
    if not input:
        template = PROMPT_DICT["prompt_no_input"]
        return template.format(instruction=instruction)

    template = PROMPT_DICT["prompt_input"]
    return template.format(instruction=instruction, input=input)
        
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate a single reply to ``message`` with the module-level model.

    Args:
        message: Latest user message; it becomes the instruction of the prompt.
        history: Earlier (user, assistant) turns. Currently unused -- the
            prompt template is single-turn.
        system_message: Content of the "System message" box. Currently unused
            -- the prompt template has no slot for it.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        The generated reply text, exactly once (no token streaming).
    """
    # The original code referenced undefined names here; the user message is
    # the instruction and there is no separate input/context field in the UI.
    prompt = generate_prompt(message)
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].cuda()

    # Honour the values chosen in the UI instead of hard-coded constants.
    # max_new_tokens is set here only; passing it to generate() as well would
    # be redundant.
    generation_config = GenerationConfig(
        temperature=float(temperature),
        top_p=float(top_p),
        top_k=40,
        do_sample=True,
        num_beams=1,
        max_new_tokens=int(max_tokens),
    )

    # Without streaming
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=False,
        )

    # For a causal LM the returned sequence is prompt + continuation. Slicing
    # off the prompt tokens is robust even when the user message itself
    # contains the text "Response:", which splitting on that marker is not.
    prompt_length = input_ids.shape[1]
    new_tokens = generation_output.sequences[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    yield response
    
    #messages = [{"role": "system", "content": system_message}]

    #for val in history:
    #    if val[0]:
    #        messages.append({"role": "user", "content": val[0]})
    #    if val[1]:
    #        messages.append({"role": "assistant", "content": val[1]})

    # messages.append({"role": "user", "content": message})

    #response = ""

    #for message in client.chat_completion(
    #    messages,
    #    max_tokens=max_tokens,
    #    stream=True,
    #    temperature=temperature,
    #    top_p=top_p,
    #):
    #    token = message.choices[0].delta.content

    #    response += token
    #    yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
def _build_demo():
    """Assemble the chat UI around ``respond``.

    The four extra controls are listed in the same order as the parameters
    that follow ``message`` and ``history`` in ``respond``'s signature:
    system message, max tokens, temperature, top-p.
    """
    system_box = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
    max_tokens_slider = gr.Slider(
        minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
    )
    temperature_slider = gr.Slider(
        minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
    )
    top_p_slider = gr.Slider(
        minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"
    )
    return gr.ChatInterface(
        respond,
        additional_inputs=[
            system_box,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
        ],
    )


demo = _build_demo()


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()