# Spaces status banner captured with this file: Runtime error
import os | |
import torch | |
import gradio as gr | |
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | |
# Environment variables
# These are set at module import, ahead of the model load performed further down.
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable synchronous CUDA operations

# Module-level cache for the loaded model and tokenizer; both stay None until
# load_model_and_tokenizer() fills them in.
model = None
tokenizer = None
# Short precision names that are NOT attributes of the torch module and would
# otherwise fall through to the float32 default.
_DTYPE_ALIASES = {
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
    "fp32": torch.float32,
    "fp64": torch.float64,
}


def _resolve_torch_dtype(dtype):
    """Translate a precision name (or an actual torch.dtype) into a torch.dtype.

    Accepts a torch.dtype, a short alias such as "fp16", or the name of a dtype
    attribute on the torch module such as "float16". Anything else, including
    torch attribute names that are not dtypes, resolves to torch.float32.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        alias = _DTYPE_ALIASES.get(dtype.lower())
        if alias is not None:
            return alias
        candidate = vars(torch).get(dtype)
        # vars(torch) also holds modules and functions; only accept real dtypes.
        if isinstance(candidate, torch.dtype):
            return candidate
    return torch.float32


def load_model_and_tokenizer(model_name, dtype, kv_bits):
    """Load the model and tokenizer once and cache them in module globals.

    Args:
        model_name: Hub id or path handed to the ``from_pretrained`` loaders.
            Its last ``/``-separated component is used in the codebook path.
        dtype: Precision for the model weights; see ``_resolve_torch_dtype``
            for accepted spellings. Unrecognised values mean float32.
        kv_bits: Bit width used to build the codebook file name, or the string
            ``"unquantized"`` to leave the config untouched.

    Returns:
        The ``(model, tokenizer)`` pair held in the module globals.

    Note:
        When both globals are already populated the arguments are ignored and
        the cached pair is returned as is.
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return model, tokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
    tokenizer.padding_side = "left"

    config = AutoConfig.from_pretrained(model_name)
    if kv_bits != "unquantized":
        # NOTE(review): quantizer_path is only attached to the config here;
        # whatever consumes it is not visible in this file.
        config.quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=_resolve_torch_dtype(dtype),
        device_map="auto",
    )

    # Adding <PAD> may push the tokenizer past the embedding table; grow the
    # table so the new id has a row.
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer
# Initialize model and tokenizer
# The dtype is spelled "float16" (torch's own dtype name) so that a lookup of
# this string on the torch module resolves to half precision.
load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "float16", "1")
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat reply, yielding the reply text accumulated so far.

    Args:
        message: The new user message.
        history: Earlier turns; each item is indexed as ``[0]`` (user text)
            and ``[1]`` (assistant text), and empty entries are skipped.
        system_message: Text placed in the leading system message.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Yields:
        The reply decoded so far, once per generated token. If generation
        raises, a final ``"Error: ..."`` string is yielded instead of
        propagating the exception.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    # Prepare input prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered chat template is expected to contain the special tokens it
    # needs, so the tokenizer must not add its own (which can duplicate BOS).
    input_ids = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).input_ids.to(model.device)

    vocab_size = len(tokenizer)
    eos_token_id = tokenizer.eos_token_id
    generated_ids = []
    try:
        # UI sliders may deliver the limit as a float; range() needs an int.
        for _ in range(int(max_tokens)):
            # no_grad is scoped to the generate call only, so the context is
            # never held open across a yield.
            with torch.no_grad():
                output = model.generate(
                    input_ids,
                    max_new_tokens=1,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    eos_token_id=eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    return_dict_in_generate=True,
                )

            next_token = output.sequences[:, -1:]
            token_id = next_token.item()
            if token_id >= vocab_size:
                raise ValueError(
                    f"Next token ID {token_id} is out of bounds for vocab size {vocab_size}"
                )
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            # Decode everything generated so far rather than the single new
            # id: characters split across several tokens only decode correctly
            # once all their pieces are present.
            generated_ids.append(token_id)
            yield tokenizer.decode(generated_ids, skip_special_tokens=True)

            if token_id == eos_token_id:
                break
    except Exception as e:
        # UI boundary: show the failure in the chat window.
        yield f"Error: {e}"
# Initialize Gradio ChatInterface
# The additional inputs are passed to respond() after (message, history), in
# the order listed: system message, max new tokens, temperature, top-p.
# NOTE(review): "[email protected]" in the description looks like an obfuscated
# address left by the page capture - restore the real contact address.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    theme="default",
    title="1bit llama3 by xMAD.ai",
    description="""
Welcome to the future of AI with xMAD.ai's 1bit Llama3, a breakthrough in Large Language Model (LLM) quantization and efficiency. Our cutting-edge technology offers:

1. **Unmatched Speed**: Achieve an impressive 800 tokens per second on NVIDIA V100 and 1200 tokens per second on NVIDIA A100.
2. **Cost Efficiency**: Slash your cloud hosting expenses by up to 90% with our highly optimized models, delivering significant savings for enterprises.
3. **Scalability**: Support for up to 10x the number of concurrent users without compromising performance, ensuring seamless user experiences.
4. **Memory Savings**: Experience 7x memory reduction, allowing you to run powerful LLMs on standard hardware.

Our Llama3 model is the first in the industry to achieve 1-bit quantization without any loss in model performance. This innovation enables businesses to deploy robust AI solutions locally or in the cloud with minimal overhead.

Explore the potential of Llama3 with our interactive demo, where you can see real-time text generation and understand how our technology can transform your operations. Whether you are looking to enhance your chatbot capabilities, streamline your operations, or cut down on AI deployment costs, xMAD.ai offers a solution that scales with your needs.

Join us in redefining AI efficiency and cost-effectiveness. Try the demo now and see the difference! For Enterprise Demo, reach out to [email protected] !
""",
    css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }",
)
if __name__ == "__main__":
    # Start the web UI only when this file is run as a script.
    demo.launch(share=False)