Spaces:

discord-community
/

chronoboros-chatbot

Paused

File size: 2,244 Bytes

f2dbcad
b470e0c
ee66bc4
49583f4
 
ee66bc4
49583f4
 
ee66bc4
 
 
 
49583f4
61bb9bd
49583f4
 
 
 
 
 
 
 
 
 
ee66bc4
49583f4
ee66bc4
 
 
 
 
 
 
 
 
 
 
 
49583f4
ee66bc4
 
 
 
 
49583f4
f2dbcad
49583f4
f2dbcad

import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub repository that both the tokenizer and the weights are pulled from
# (a GPTQ build, going by the repository name).
MODEL_ID = "TheBloke/Chronoboros-33B-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# device_map="auto" leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

# NOTE: wrapping the model with torch.compile() is an optional speed-up on
# PyTorch 2.0+; it is intentionally left disabled here.

def _build_prompt(message, history, system_message):
    """Flatten the system message, earlier turns and the new message into one prompt."""
    parts = [f"{system_message}\n"]
    for user_text, assistant_text in history:
        # Either side of a turn may be empty/None; skip it rather than emit a blank line.
        if user_text:
            parts.append(f"User: {user_text}\n")
        if assistant_text:
            parts.append(f"Assistant: {assistant_text}\n")
    parts.append(f"User: {message}\nAssistant: ")
    return "".join(parts)


@spaces.GPU
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    """Generate a reply to ``message`` and stream it back as it is produced.

    Args:
        message: The new user message.
        history: Earlier ``(user_text, assistant_text)`` turns.
        system_message: Text placed at the very top of the prompt.
        max_tokens: Upper bound on newly generated tokens (cast to ``int``).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The reply accumulated so far; each yielded string extends the
        previous one, so the last value is the complete reply.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread
            is re-raised here once the stream has been drained.
    """
    # Local imports keep this block self-contained; both come from packages
    # the file already depends on (stdlib and transformers).
    from threading import Thread

    from transformers import TextIteratorStreamer

    prompt = _build_prompt(message, history, system_message)

    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, so generate() does not have to infer it.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt drops the echoed prompt; the remaining kwargs are forwarded
    # to tokenizer.decode().
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    failures = []

    def _generate():
        try:
            # Grad mode is thread-local, so it must be disabled inside the worker.
            with torch.no_grad():
                model.generate(
                    input_ids=encoded["input_ids"],
                    attention_mask=encoded["attention_mask"],
                    max_new_tokens=int(max_tokens),  # UI sliders may hand over a float
                    temperature=float(temperature),
                    top_p=float(top_p),
                    do_sample=True,
                    streamer=streamer,
                )
        except Exception as exc:  # thread boundary: re-raised by the consumer below
            failures.append(exc)
            # Unblock the consumer loop, which would otherwise wait forever.
            streamer.end()

    # generate() blocks until it is finished, so it runs in a worker thread
    # while this generator forwards text as soon as the streamer releases it.
    worker = Thread(target=_generate, daemon=True)
    worker.start()

    response = ""
    for piece in streamer:
        response += piece
        yield response

    worker.join()
    if failures:
        raise failures[0]

# Extra controls for the chat UI. Their order matches the parameters that
# respond() takes after (message, history): system_message, max_tokens,
# temperature, top_p.
system_box = gr.Textbox(label="System message", value="You are a friendly Chatbot.")
max_tokens_slider = gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512)
temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.95,
)

# Chat front-end that drives respond() with the controls above.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[system_box, max_tokens_slider, temperature_slider, top_p_slider],
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()