hftestbackend

Runtime error

App Files Files Community

hftestbackend / app.py

Sergidev

Update app.py

7f7ba92 verified 9 months ago

raw

history blame

3.82 kB

	import os
	from threading import Thread
	from typing import Iterator
	import gradio as gr
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	DESCRIPTION = """\
	# Qwen2 0.5B Instruct Text Completion

	This is a demo of [`Qwen/Qwen2-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct), fine-tuned for instruction following.

	Enter your text in the box below and click "Complete" to have the AI generate a completion for your input. The generated text will be appended to your input. You can stop the generation at any time by clicking the "Stop" button.
	"""

	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	model_id = "Qwen/Qwen2-0.5B-Instruct"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	device_map="auto",
	torch_dtype=torch.bfloat16,
	)
	model.eval()

	@spaces.GPU(duration=90)
	def generate(
	message: str,
	max_new_tokens: int = 1024,
	temperature: float = 0.6,
	top_p: float = 0.9,
	top_k: int = 50,
	repetition_penalty: float = 1.2,
	) -> Iterator[str]:
	input_ids = tokenizer.encode(message, return_tensors="pt")
	if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	input_ids = input_ids.to(model.device)

	streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
	generate_kwargs = dict(
	input_ids=input_ids,
	streamer=streamer,
	max_new_tokens=max_new_tokens,
	do_sample=True,
	top_p=top_p,
	top_k=top_k,
	temperature=temperature,
	num_beams=1,
	repetition_penalty=repetition_penalty,
	)
	t = Thread(target=model.generate, kwargs=generate_kwargs)
	t.start()

	partial_message = message
	for text in streamer:
	partial_message += text
	yield partial_message

	with gr.Blocks(css="style.css", fill_height=True) as demo:
	gr.Markdown(DESCRIPTION)
	gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")

	with gr.Row():
	with gr.Column(scale=4):
	text_box = gr.Textbox(
	label="Enter your text",
	placeholder="Type your message here...",
	lines=5
	)
	with gr.Column(scale=1):
	complete_button = gr.Button("Complete")
	stop_button = gr.Button("Stop")

	max_new_tokens = gr.Slider(
	label="Max new tokens",
	minimum=1,
	maximum=MAX_MAX_NEW_TOKENS,
	step=1,
	value=DEFAULT_MAX_NEW_TOKENS,
	)
	temperature = gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=4.0,
	step=0.1,
	value=0.6,
	)
	top_p = gr.Slider(
	label="Top-p (nucleus sampling)",
	minimum=0.05,
	maximum=1.0,
	step=0.05,
	value=0.9,
	)
	top_k = gr.Slider(
	label="Top-k",
	minimum=1,
	maximum=1000,
	step=1,
	value=50,
	)
	repetition_penalty = gr.Slider(
	label="Repetition penalty",
	minimum=1.0,
	maximum=2.0,
	step=0.05,
	value=1.2,
	)

	complete_button.click(
	generate,
	inputs=[text_box, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
	outputs=[text_box],
	)
	stop_button.click(
	None,
	None,
	None,
	cancels=[complete_button.click]
	)

	if __name__ == "__main__":
	demo.queue(max_size=20).launch()