# Spaces: Artples / L-MChat-ZeroGPU
#
# Running on Zero
#
# File size: 1,759 Bytes

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Token budgets for the UI: the "Max new tokens" slider tops out at
# MAX_MAX_NEW_TOKENS and starts at DEFAULT_MAX_NEW_TOKENS.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
# Read from the environment, falling back to 4096 when the variable is unset.
# NOTE(review): not referenced in the visible code; presumably a cap on prompt
# tokens — confirm once the generation logic is in place.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Markdown shown as the interface description.
DESCRIPTION = (
    "# L-MChat\n"
    "This Space demonstrates [L-MChat](https://huggingface.co/collections/Artples/l-mchat-663265a8351231c428318a8f) by L-AI.\n"
)
if not torch.cuda.is_available():
    # No CUDA device visible: tell the visitor up front.
    DESCRIPTION = DESCRIPTION + "\n<p>Running on CPU! This demo does not work on CPU.</p>"

# Dropdown label -> Hugging Face model repository id. Insertion order is the
# order in which the choices are listed in the UI.
model_options = {
    "Fast-Model": "Artples/L-MChat-Small",
    "Quality-Model": "Artples/L-MChat-7b",
}

# NOTE(review): presumably this decorator requests a ZeroGPU device for each
# call, with `duration=90` as the time budget in seconds — confirm against the
# `spaces` package documentation.
@spaces.GPU(enable_queue=True, duration=90)
def generate(
    message: str,
    model_choice: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.1,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Chat handler behind ``chat_interface`` (implementation missing).

    The body is a bare ``pass``: no model is loaded and nothing is generated,
    so the function currently returns ``None`` rather than the iterator of
    strings its annotation promises.

    Args:
        message: The user's message text.
        model_choice: Selected model label; the UI offers the keys of
            ``model_options``.
        chat_history: Annotated as a list of ``(str, str)`` tuples —
            presumably earlier (user, assistant) turns; TODO confirm.
        system_prompt: Text entered in the "System Prompt" box.
        max_new_tokens: Value of the "Max new tokens" slider. The default of
            1024 equals ``DEFAULT_MAX_NEW_TOKENS``.
        temperature: No UI control is wired to this parameter, so the default
            applies. Presumably a sampling setting — unverifiable from here.
        top_p: Same as ``temperature``: default only, meaning unverified.
        top_k: Same as ``temperature``: default only, meaning unverified.
        repetition_penalty: Same as ``temperature``: default only, meaning
            unverified.

    Returns:
        Annotated as ``Iterator[str]``; the placeholder body returns ``None``.
    """
    # TODO: the actual generation logic was elided from this file and still
    # has to be restored here.
    pass

def _respond(
    message: str,
    model_choice: str,
    history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
) -> Iterator[tuple[str, list[tuple[str, str]]]]:
    """Stream ``generate``'s output and keep the per-session chat history.

    ``gr.Interface`` only accepts a state input when it is paired with a state
    output, so the handler has to return the updated history next to the
    response text. This adapter does that without changing ``generate``.

    Args:
        message: Text from the message box.
        model_choice: Selected entry of the "Choose Model" dropdown.
        history: Session state holding earlier ``(message, reply)`` pairs;
            ``None`` or empty on the first turn.
        system_prompt: Text from the "System Prompt" box.
        max_new_tokens: Value of the "Max new tokens" slider.

    Yields:
        ``(response_text, history)`` pairs. Each value yielded by ``generate``
        is forwarded as-is to the response box. A final pair is emitted once
        the turn has been appended to the history.
    """
    turns = list(history or [])
    reply = ""
    # A handler that produced no output at all (None) is treated as an empty
    # stream instead of raising a TypeError in the UI.
    stream = generate(message, model_choice, turns, system_prompt, max_new_tokens) or ()
    for reply in stream:
        yield reply, turns
    # NOTE(review): assumes the last yielded string is the complete reply
    # (Gradio replaces the output on every yield) — confirm against `generate`.
    turns.append((message, reply))
    yield reply, turns


chat_interface = gr.Interface(
    fn=_respond,
    inputs=[
        gr.Textbox(lines=2, placeholder="Type your message here..."),
        gr.Dropdown(label="Choose Model", choices=list(model_options)),
        # Per-session chat history. This slot used to reference an undefined
        # name, which raised NameError as soon as the module was imported.
        gr.State(value=[]),
        gr.Textbox(label="System Prompt", lines=6, placeholder="Enter system prompt if any..."),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        # No controls are wired for temperature / top_p / top_k /
        # repetition_penalty, so `generate` falls back to its own defaults.
    ],
    # Exactly one state output is required to pair with the state input.
    outputs=[gr.Textbox(label="Response"), gr.State()],
    theme="default",
    description=DESCRIPTION,
)

if __name__ == "__main__":
    chat_interface.launch()