# Source: Hugging Face Space "Mat17892/iris" (status: Runtime error).
# File size: 2,246 bytes.
# Revisions shown in the page's blame column: 5fd0c28, f613acc, 21886ee, 8a91905, f84cd21.

import gradio as gr
from unsloth import FastLanguageModel
import torch

# Loader configuration, passed unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048  # Maximum sequence length requested from the loader.
dtype = None  # None lets the loader auto-detect; float16 suits Tesla T4/V100, bfloat16 suits Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage; may be set to False.

# Load the model and its tokenizer once, at import time, so every chat request
# reuses the same objects.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# This app only generates text, so switch the model to Unsloth's inference
# mode before the first generate() call.
FastLanguageModel.for_inference(model)

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply to ``message`` for the chat UI.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs; an
            empty/None side of a pair is skipped.
        system_message: Text used as the leading ``system`` message.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back), with special tokens removed.
    """
    # Rebuild the whole conversation: system prompt, prior turns, new message.
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    # return_dict=True is required to get a mapping with "input_ids" (and the
    # attention mask); without it a bare tensor is returned and string
    # indexing fails. The tensors must also live on the model's device.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # do_sample=True is what makes temperature/top_p take effect; without it
    # generation is greedy and the sliders are ignored.
    outputs = model.generate(
        **encoded,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
    )

    # generate() returns prompt + continuation; drop the prompt tokens so the
    # reply does not repeat the system message and chat history.
    reply_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True)


# Chat UI wired to `respond`. The extra controls below are listed in the same
# order as respond's trailing parameters:
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()