# https://www.gradio.app/guides/using-hugging-face-integrations
import gradio as gr
import logging
import html
import time
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
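# Note: logging and time are imported but not used below; html only appears in the
# commented-out html.escape() call inside chat().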

# Model
model_name = "augmxnt/shisa-7b-v1"

# UI Settings
title = "Shisa 7B"
description = "Test out Shisa 7B in either English or Japanese."
placeholder = "Type Here / ここに入力してください"
examples = [
    "What's the best ramen in Tokyo?",
    "あなたは熱狂的なポケモンファンです。",  # "You are an enthusiastic Pokémon fan."
    "東京でおすすめのラーメン屋ってどこ?",  # "Where's a good ramen shop in Tokyo?"
]

# LLM Settings
system_prompt = 'あなたは役に立つアシスタントです。'  # "You are a helpful assistant."
chat_history = [{"role": "system", "content": system_prompt}]
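# chat_history is module-level state shared across the whole Space: chat() below only
# ever appends user turns to it, and the history argument Gradio passes in is ignored.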
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_8bit=True,
    # load_in_4bit=True
)
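# Note: load_in_8bit relies on the bitsandbytes package being installed. Recent
# transformers releases deprecate this flag in favor of
# quantization_config=BitsAndBytesConfig(load_in_8bit=True) (version-dependent assumption).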
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
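# A single module-level streamer is shared by every request; timeout=10.0 means iterating
# over it raises if no new token arrives for 10 seconds. Concurrent chats would interleave
# on this one streamer, which is acceptable for a low-traffic demo.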

def chat(message, history):
    chat_history.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
    # For multi-GPU, find the device of the first parameter of the model
    first_param_device = next(model.parameters()).device
    input_ids = input_ids.to(first_param_device)

    generate_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
    )
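    # Sampling settings: nucleus sampling (top_p=0.95) at temperature 0.7 with a mild
    # repetition penalty, capped at 200 new tokens to match the note shown in the UI.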

    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
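    # generate() runs in a background thread and pushes tokens into the streamer; the loop
    # below drains it and yields the growing reply, so gr.ChatInterface streams the answer.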
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token  # html.escape(new_token)
        yield partial_message

chat_interface = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
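# Note: undo_btn and clear_btn are gr.ChatInterface keyword arguments from Gradio 4.x;
# later Gradio releases dropped them (assumption about the pinned Gradio version).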

# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219 - we use this `with` construction b/c Gradio barfs on autoreload otherwise
with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown("You can try asking these questions in either Japanese or English. We limit output to 200 tokens.")

demo.queue().launch()
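# To develop locally with hot-reload (the autoreload mentioned above), run `gradio app.py`
# instead of `python app.py` (assumes the file is named app.py, as is standard for Spaces).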