# hftestbackend / app.py
import os
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

DESCRIPTION = """\
# Llama backend
This is a demo of text completion with a large language model (LLM).
Enter your text in the box below and click "Complete" to have the model generate a continuation of your input. The generated text is appended to your input. You can stop generation at any time by clicking the "Stop" button.
"""

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

model_id = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the weights in 8-bit via bitsandbytes; passing a BitsAndBytesConfig is the
# current transformers API (the bare load_in_8bit kwarg is deprecated). With
# device_map="auto" the model is placed on the available GPU automatically, so
# no separate device variable is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
model.eval()
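
# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of
# each call to the decorated function.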
@spaces.GPU
def generate(
message: str,
max_new_tokens: int = 1024,
temperature: float = 0.6,
top_p: float = 0.1,
top_k: int = 50,
repetition_penalty: float = 1.2,
) -> Iterator[str]:
    input_ids = tokenizer.encode(message, return_tensors="pt")
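    # Keep only the most recent tokens if the input exceeds the configured limit.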
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)
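    # TextIteratorStreamer yields decoded text as tokens are generated; skip_prompt=True
    # keeps the echoed input out of the stream, and the timeout guards against a stalled worker.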
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
max_new_tokens=max_new_tokens,
do_sample=True,
top_p=top_p,
top_k=top_k,
temperature=temperature,
num_beams=1,
repetition_penalty=repetition_penalty,
)
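    # model.generate blocks until generation finishes, so run it in a background
    # thread and consume the streamer from this generator function.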
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()
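    # Seed the running output with the original input so the completion is appended to it.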
partial_message = message
for text in streamer:
partial_message += text
yield partial_message
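
# Build the Gradio UI: an input textbox, Complete/Stop buttons, and sliders for
# the sampling parameters that are passed straight into generate().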
with gr.Blocks(css="style.css", fill_height=True) as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
with gr.Row():
with gr.Column(scale=4):
text_box = gr.Textbox(
label="Enter your text",
placeholder="Type your message here...",
lines=5
)
with gr.Column(scale=1):
complete_button = gr.Button("Complete")
stop_button = gr.Button("Stop")
max_new_tokens = gr.Slider(
label="Max new tokens",
minimum=1,
maximum=MAX_MAX_NEW_TOKENS,
step=1,
value=DEFAULT_MAX_NEW_TOKENS,
)
temperature = gr.Slider(
label="Temperature",
minimum=0.1,
maximum=1.0,
step=0.1,
value=0.1,
)
top_p = gr.Slider(
label="Top-p (nucleus sampling)",
minimum=0.05,
maximum=1.0,
step=0.05,
value=0.9,
)
top_k = gr.Slider(
label="Top-k",
minimum=1,
        maximum=100,
step=1,
value=50,
)
repetition_penalty = gr.Slider(
label="Repetition penalty",
minimum=1.0,
maximum=2.0,
step=0.05,
value=1.2,
)
# Set up the generation event
generation_event = complete_button.click(
generate,
inputs=[text_box, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
outputs=[text_box],
)
    # The Stop button runs no function of its own; it only cancels the in-flight generation event.
    stop_button.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[generation_event],
    )
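
# The queue is required for streaming (generator) outputs and for event cancellation to work.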
if __name__ == "__main__":
demo.queue(max_size=20).launch()