Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,891 Bytes
c5d6bc2 baae243 16c80da 5e6b787 a183b63 baae243 16c80da 5e6b787 16c80da baae243 1de0132 baae243 16c80da 5e6b787 16c80da 5e6b787 baae243 16c80da baae243 16c80da baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 5e6b787 baae243 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import spaces
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import subprocess
import os
# os.system("pip install dashscope")

# Startup housekeeping: delete everything under the ZeroGPU offload directory.
# shell=True is required so the `*` glob is expanded by the shell.
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)

# Install flash-attn at startup. `env=` REPLACES the child's environment, so
# the parent environment is merged in; otherwise PATH/HOME/proxy settings are
# dropped and `pip` may resolve to a different interpreter or not at all.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably makes the package
# skip compiling its CUDA kernels during install -- confirm against flash-attn.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Extra keyword arguments forwarded to from_pretrained(). Empty by default;
# the disabled snippets below show how to switch on bitsandbytes quantization.
kwargs = {}

# bitsandbytes quantization docs:
# https://hugging-face.cn/docs/transformers/quantization/bitsandbytes
#
# 4-bit (NF4, double quantization, bfloat16 compute):
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
#
# 8-bit:
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     # llm_int8_enable_fp32_cpu_offload=True,
# )
# kwargs = {"quantization_config": quantization_config, "low_cpu_mem_usage": True}

# Load once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    **kwargs,
).cuda()
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate one assistant reply with the locally loaded model.

    Args:
        message: Latest user message. When empty, a default quick-sort
            prompt is used instead.
        history: Earlier turns as (user, assistant) pairs. They are replayed
            into the prompt so the model sees the whole conversation.
        system_message: Optional system prompt; skipped when blank.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Returns:
        The decoded completion, with the prompt tokens and special tokens
        stripped.
    """
    if not message:
        message = "write a quick sort algorithm in python."

    # Build the full conversation: system prompt, prior turns, new message.
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    for user_turn, assistant_turn in history or []:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        inputs,
        # Coerce in case the UI delivers the slider value as a float.
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_length = inputs.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# @spaces.GPU
# def respond(
# message,
# history: list[tuple[str, str]],
# system_message,
# max_tokens,
# temperature,
# top_p,
# ):
# messages = [{"role": "system", "content": system_message}]
# for val in history:
# if val[0]:
# messages.append({"role": "user", "content": val[0]})
# if val[1]:
# messages.append({"role": "assistant", "content": val[1]})
# if len(message) < 1:
# message = "write a quick sort algorithm in python."
# messages.append({"role": "user", "content": message})
# response = ""
# for message in client.chat_completion(
# messages,
# max_tokens=max_tokens,
# stream=True,
# temperature=temperature,
# top_p=top_p,
# ):
# token = message.choices[0].delta.content
# response += token
# yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/main/docs/gradio/chatinterface
"""
# Custom CSS: gives the message textbox (elem_id "msg_input") a flex-grow
# factor of 7 inside its flex container.
css = """
#msg_input {
flex-grow: 7;
}
"""

# Chat UI wired to respond(). The additional inputs are passed to respond()
# after (message, history), in the order listed here:
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    textbox=gr.Textbox(
        elem_id="msg_input",
        # Same text respond() falls back to when the message is empty.
        placeholder="write a quick sort algorithm in python.",
    ),
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    css=css,
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()