import os
from typing import Optional, Tuple, Any
from copy import deepcopy
from functools import partial
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dataclasses import dataclass
# chatml_template = """{% for message in messages %}
# {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
# {% endfor %}"""
# pipe.tokenizer.chat_template = chatml_template # TheBloke says this is the right template for this model
prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''
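# Prompt prefix that ends right before the user's text; the forward pass over this fixed
# prefix is computed once and cached (see set_past_key_values below).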
system_only_prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
'''
system_prompt = '''You are given a partial input text for another AI chat interface.
Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.
Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
Answers should be only the completions themselves. If you have nothing as a completion, return "<NOTHING>".
Examples:
(1)
User: Help me write a sentiment analysis pipeline
Assistant: using huggingface;using NLTK;using python
(2)
User: My name is
Assistant: <NOTHING> (nothing much to contribute at this point. return nothing)
(3)
User: Help me find a present for my
Assistant: girlfriend;mother;father;friend
'''
# setup
torch.set_grad_enabled(False)  # inference only
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
# Load the GPTQ-quantized model on GPU; the GPU-bound work runs inside the @spaces.GPU call below.
pipe = pipeline("text-generation", model=model_name, device='cuda')
generate_kwargs = {'max_new_tokens': 20}
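# Strategy: run the long, fixed system prompt through the model once, cache the resulting
# past_key_values as plain Python lists, and pass them to every generate() call so that only
# the user's text has to be processed per request.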
# '''
# You will now get a blank message from the user and then after your answer, the user will give you the text to complete:
# Example:
# >> User:
# >> Assistant: <Waiting for text>
# >> User: Help me write a sentiment analysis pipeline
# >> Assistant: using huggingface;using NLTK;using python
# '''
start_messages = [
{'role': 'system', 'content': system_prompt},
# {'role': 'user', 'content': ' '},
# {'role': 'assistant', 'content': '<Waiting for text>'}
]
# functions
# @dataclass
# class PastKV:
# past_key_values: Any = None
# past_key_values = PastKV()
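# The two helpers below convert past_key_values between torch tensors and nested Python lists.
# Lists can be kept on the CPU side between requests, and rebuilt as tensors on whatever
# device/dtype the model is currently using. This assumes the legacy tuple-of-(key, value)
# layout of past_key_values.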
def past_kv_to_device(past_kv, device, dtype):
return tuple((torch.tensor(k).to(device).to(dtype), torch.tensor(v).to(device).to(dtype)) for k, v in past_kv)
def detach_past_kv(past_kv):
return tuple((k.cpu().detach().numpy().tolist(), v.cpu().detach().numpy().tolist()) for k, v in past_kv)
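# Precompute the KV cache for the system-only prompt prefix. Decorated with @spaces.GPU so the
# single forward pass runs on GPU (ZeroGPU); the cache is then detached into plain lists.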
@spaces.GPU
def set_past_key_values():
model, tokenizer = pipe.model, pipe.tokenizer
tokenized = tokenizer.encode(
system_only_prompt_format.format(system_message=system_prompt),
return_tensors='pt'
)
# tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')
# Check that this is indeed a prefix of the entire message
# test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
# tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
# assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()
return detach_past_kv(model(tokenized.to(model.device)).past_key_values)
# @spaces.GPU
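# Build the full ChatML prompt for the user's text and generate a short completion, reusing the
# precomputed system-prompt cache when it is provided.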
def generate(text, past_key_values):
# messages = [
# *start_messages,
# {'role': 'user', 'content': text}
# ]
    cur_generate_kwargs = deepcopy(generate_kwargs)
    if past_key_values:
        # Rebuild the cached keys/values as tensors on the model's current device and dtype.
        past_key_values = past_kv_to_device(past_key_values, pipe.model.device, pipe.model.dtype)
        cur_generate_kwargs.update({'past_key_values': past_key_values})
    prompt = prompt_format.format(system_message=system_prompt, prompt=text)
    response = pipe(prompt, **cur_generate_kwargs)[0]['generated_text']
    print(response)
    # With a plain string prompt the pipeline returns prompt + completion as one string,
    # so return only the newly generated suffix.
    return response[len(prompt):].strip()
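# Example (illustrative only; actual suggestions depend on the model and sampling):
#   generate("Help me find a present for my", past_key_values=None)
#   might return something like "girlfriend;mother;father;friend".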
if __name__ == "__main__":
    with torch.no_grad():
        # Build the system-prompt KV cache once (on GPU), then move the model to CPU for serving.
        past_key_values = set_past_key_values()
        pipe.model = pipe.model.cpu()
        demo = gr.Interface(partial(generate, past_key_values=past_key_values),
                            inputs="textbox", outputs="textbox")
        demo.launch()