Spaces:

Aston-xMAD
/

1bit_llama3_instruct_xmad_chatbot

Runtime error

App Files Files Community

1bit_llama3_instruct_xmad_chatbot / backups /app_v0.py

Aston-xMAD

init commit

b37c16f verified 12 months ago

raw

history blame contribute delete

3.26 kB

	import os
	import torch
	import gradio as gr
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
	from huggingface_hub import InferenceClient

	# Process-wide environment settings, applied at import time.
	os.environ.update(
	    {
	        "TOKENIZERS_PARALLELISM": "0",
	        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
	    }
	)
	# Left disabled: machine-specific Gradio cache location.
	# os.environ["GRADIO_CACHE_DIR"] = "/home/jwy4/gradio_cache"

	# Remote inference client; `respond` streams every chat completion through it.
	# NOTE(review): the UI title advertises 1-bit Llama 3, but replies come from this
	# hosted Zephyr model -- confirm that is intended for this backup app.
	client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")

	# Module-level cache for an optional local model. Both names stay None until
	# load_model_and_tokenizer() is called, which fills them in.
	model = tokenizer = None

	# Short precision names accepted in addition to real ``torch`` dtype attribute names.
	_DTYPE_ALIASES = {
	    "fp16": "float16",
	    "bf16": "bfloat16",
	    "fp32": "float32",
	    "fp64": "float64",
	}


	def _resolve_torch_dtype(dtype):
	    """Map a dtype name (or an actual ``torch.dtype``) to a ``torch.dtype``.

	    Falls back to ``torch.float32`` when the name is unknown or resolves to
	    something on the ``torch`` module that is not a dtype.
	    """
	    if isinstance(dtype, torch.dtype):
	        return dtype
	    key = str(dtype)
	    name = _DTYPE_ALIASES.get(key.lower(), key)
	    resolved = getattr(torch, name, None)
	    return resolved if isinstance(resolved, torch.dtype) else torch.float32


	def load_model_and_tokenizer(model_name, dtype, kv_bits):
	    """Load the causal LM and its tokenizer once and cache them in module globals.

	    The first call performs the load; later calls return the cached pair and
	    ignore their arguments.

	    Args:
	        model_name: Model identifier passed to the ``from_pretrained`` calls.
	        dtype: Precision as a ``torch`` dtype name (e.g. ``"float16"``), a short
	            alias (``"fp16"``, ``"bf16"``, ``"fp32"``, ``"fp64"``) or a
	            ``torch.dtype``. Unrecognised values fall back to ``torch.float32``.
	        kv_bits: ``"unquantized"``, or a bit-width used to build the codebook
	            path ``codebooks/<model basename>_<kv_bits>bit.xmad``.

	    Returns:
	        The ``(model, tokenizer)`` pair.
	    """
	    global model, tokenizer
	    if model is None or tokenizer is None:
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        tokenizer.add_special_tokens({"pad_token": "<PAD>"})

	        config = AutoConfig.from_pretrained(model_name)
	        if kv_bits != "unquantized":
	            # The path is only attached to the config here.
	            # NOTE(review): presumably read by the xMAD-patched model code -- confirm.
	            config.quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"

	        model = AutoModelForCausalLM.from_pretrained(
	            model_name,
	            config=config,
	            torch_dtype=_resolve_torch_dtype(dtype),
	            device_map="auto",
	        )

	        # Adding "<PAD>" may have grown the vocabulary past the embedding table.
	        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
	            model.resize_token_embeddings(len(tokenizer))

	        tokenizer.padding_side = "left"
	        model.config.pad_token_id = tokenizer.pad_token_id

	    return model, tokenizer

	def _history_to_messages(history):
	    """Flatten Gradio chat history into a list of role/content message dicts.

	    Accepts (user, assistant) pairs as well as entries that are already
	    ``{"role": ..., "content": ...}`` dicts. Empty parts are skipped.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            if turn.get("role") and turn.get("content"):
	                messages.append({"role": turn["role"], "content": turn["content"]})
	            continue
	        user_text, assistant_text = turn[0], turn[1]
	        if user_text:
	            messages.append({"role": "user", "content": user_text})
	        if assistant_text:
	            messages.append({"role": "assistant", "content": assistant_text})
	    return messages


	def respond(message, history, system_message, max_tokens, temperature, top_p):
	    """Stream a chat reply from the module-level inference ``client``.

	    Args:
	        message: The new user message.
	        history: Earlier turns, as (user, assistant) pairs or role/content dicts.
	        system_message: Text sent as the leading ``system`` message.
	        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
	        temperature: Forwarded sampling temperature.
	        top_p: Forwarded nucleus-sampling threshold.

	    Yields:
	        The reply accumulated so far, once per streamed chunk.
	    """
	    messages = [{"role": "system", "content": system_message}]
	    messages.extend(_history_to_messages(history))
	    messages.append({"role": "user", "content": message})

	    response = ""
	    stream = client.chat_completion(
	        messages,
	        max_tokens=max_tokens,
	        stream=True,
	        temperature=temperature,
	        top_p=top_p,
	    )
	    for chunk in stream:
	        # A chunk may carry no choices or no text; appending None would raise TypeError.
	        choices = chunk.choices
	        token = choices[0].delta.content if choices else None
	        if token:
	            response += token
	        yield response

	# Chat UI. The additional inputs are listed in the same order as the parameters
	# `respond` takes after (message, history): system_message, max_tokens,
	# temperature, top_p.
	demo = gr.ChatInterface(
	    respond,
	    additional_inputs=[
	        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
	        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
	        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
	        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
	    ],
	    theme="default",
	    title="1bit llama3 by xMAD.ai",
	    # Plain string with no %-formatting applied, so the percent sign is written once.
	    description=(
	        "The first industrial level 1 bit quantization Llama3, we can achieve 800 tokens per second "
	        "on NVIDIA V100 and 1200 on NVIDIA A100, 90% cost down of your cloud hosting cost"
	    ),
	    css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }",
	)

	def _main():
	    """Start the Gradio app defined by ``demo``."""
	    # Optional local model loading (disabled):
	    # load_model_and_tokenizer("NousResearch/Meta-Llama-3-8B-Instruct", "fp16", "1")
	    demo.launch()


	if __name__ == "__main__":
	    _main()