leonardlin committed
Commit • f00ac1d
Parent(s): 0e02ca5
swap models, examples, check for multigpu, example
app.py
CHANGED
@@ -9,19 +9,17 @@ from threading import Thread
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 # Model
-model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
-model_name = "/
+model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
 # UI Settings
 title = "Shisa 7B"
 description = "Test out Shisa 7B in either English or Japanese."
 placeholder = "Type Here / ここに入力してください"
 examples = [
-    "
-    "
-    "
-    "こんにちは、いかがお過ごしですか？",
+    "What's the best ramen in Tokyo?",
+    "東京でおすすめのラーメン屋さんを教えていただけますか。",
+    "東京でおすすめのラーメン屋ってどう？",
 ]
 
 # LLM Settings
@@ -39,7 +37,11 @@ streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_
 
 def chat(message, history):
     chat_history.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
+    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
+    # for multi-gpu, find the device of the first parameter of the model
+    first_param_device = next(model.parameters()).device
+    input_ids = input_ids.to(first_param_device)
+
     generate_kwargs = dict(
         inputs=input_ids,
         streamer=streamer,
@@ -57,13 +59,6 @@ def chat(message, history):
         partial_message += new_token # html.escape(new_token)
         yield partial_message
 
-    '''
-    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#streaming-chatbots
-    for i in range(len(message)):
-        time.sleep(0.3)
-        yield message[: i+1]
-    '''
-
 
 chat_interface = gr.ChatInterface(
     chat,
@@ -81,69 +76,6 @@ chat_interface = gr.ChatInterface(
 # https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219 - we use this with construction b/c Gradio barfs on autoreload otherwise
 with gr.Blocks() as demo:
     chat_interface.render()
-    gr.Markdown("You can try
-
+    gr.Markdown("You can try asking this question in English, formal Japanese, and informal Japanese. You might need to ask it to reply informally with something like 「もっと友達みたいに話しようよ。あまり堅苦しくなくて」 to get informal replies. We limit output to 200 tokens.")
 
 demo.queue().launch()
-
-'''
-# Works for Text input...
-demo = gr.Interface.from_pipeline(pipe)
-'''
-
-'''
-def chat(message, history):
-    print("foo")
-    for i in range(len(message)):
-        time.sleep(0.3)
-        yield "You typed: " + message[: i+1]
-    # print('history:', history)
-    # print('message:', message)
-    # for new_next in streamer:
-    #     yield new_text
-
-
-'''
-
-
-'''
-# Docs: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/conversational.py
-conversation = Conversation()
-conversation.add_message({"role": "system", "content": system})
-device = torch.device('cuda')
-pipe = pipeline(
-    'conversational',
-    model=model,
-    tokenizer=tokenizer,
-    streamer=streamer,
-
-)
-
-def chat(input, history):
-    conversation.add_message({"role": "user", "content": input})
-    # we do this shuffle so local shadow response doesn't get created
-    response_conversation = pipe(conversation)
-    print("foo:", response_conversation.messages[-1]["content"])
-
-    conversation.add_message(response_conversation.messages[-1])
-    print("boo:", response_conversation.messages[-1]["content"])
-    response = conversation.messages[-1]["content"]
-    response = "ping"
-    return response
-
-demo = gr.ChatInterface(
-    chat,
-    chatbot=gr.Chatbot(height=400),
-    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
-    title=title,
-    description=description,
-    theme="soft",
-    examples=examples,
-    cache_examples=False,
-    undo_btn="Delete Previous",
-    clear_btn="Clear",
-).launch()
-
-# For async
-# ).queue().launch()
-'''
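The multi-GPU handling added in this commit follows a common pattern: when a model is loaded with device_map="auto" it may be sharded across several GPUs, so the prompt tensor has to be moved to the device that holds the model's first parameters (usually the embedding layer) before generate() runs. Below is a minimal sketch of that pattern, assuming a device_map="auto" load and illustrative generation settings; the Space's actual model-loading code is not part of this diff.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # illustrative checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Assumption: the model is loaded with device_map="auto", which can place
# different layers on different GPUs; the diff only shows the input handling.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)

chat_history = [{"role": "user", "content": "What's the best ramen in Tokyo?"}]
input_ids = tokenizer.apply_chat_template(
    chat_history, add_generation_prompt=True, return_tensors="pt"
)

# For multi-GPU: send the inputs to the device of the first model parameter,
# which is where the embeddings (and therefore the forward pass) start.
first_param_device = next(model.parameters()).device
input_ids = input_ids.to(first_param_device)

# Stream tokens from a background thread so a UI callback can yield partial text.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(inputs=input_ids, streamer=streamer, max_new_tokens=200)
Thread(target=model.generate, kwargs=generate_kwargs).start()

partial_message = ""
for new_token in streamer:
    partial_message += new_token
print(partial_message)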