import os

import gradio as gr
import spaces
from transformers import pipeline

model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']

# Pass the token so the model can be downloaded from gated/private repos.
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)
generate_kwargs = {'max_new_tokens': 20}
system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.
Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! They don't need to complete the text in full.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
Answers should be only the completions themselves.
Examples:
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"
(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point; return nothing)
(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"
'''
def get_past_key_values(system_prompt):
    """Precompute the key/value cache of the system prompt so it can be reused."""
    model, tokenizer = pipe.model, pipe.tokenizer
    messages = [{'role': 'system', 'content': system_prompt}]
    tokenized = tokenizer.apply_chat_template(messages, return_tensors='pt')

    # Assert that the system prompt tokenizes to a strict prefix of a longer
    # conversation (TODO: make sure this is robust across chat templates).
    messages.append({'role': 'user', 'content': 'TEST'})
    tokenized_test = tokenizer.apply_chat_template(messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()

    # apply_chat_template returns a tensor of input ids (not a dict), so pass
    # it as input_ids instead of unpacking it with **.
    return model(input_ids=tokenized.to(model.device)).past_key_values
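# A minimal sketch (hypothetical; the app below does not wire this in) of how
# the precomputed system-prompt cache could be consumed. It assumes a
# transformers version whose generate() accepts a prompt's past_key_values;
# the cache is only valid because the prefix assert above holds. Note that
# generate() extends the cache in place, so a real implementation would copy
# it per request.
def generate_with_cache(text, past_key_values):
    model, tokenizer = pipe.model, pipe.tokenizer
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': text},
    ]
    input_ids = tokenizer.apply_chat_template(messages, return_tensors='pt').to(model.device)
    output_ids = model.generate(input_ids, past_key_values=past_key_values,
                                max_new_tokens=20)
    # Decode only the newly generated tokens, skipping the cached prompt.
    return tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True)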
@spaces.GPU  # ZeroGPU Spaces allocate a GPU for the duration of this call
def generate(text):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': text},
    ]
    # For chat-style input the pipeline returns the whole conversation;
    # the assistant's reply is the last message.
    response = pipe(messages, **generate_kwargs)[0]['generated_text']
    return response[-1]['content']
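# Hypothetical helper (not part of the original app): split the
# "answer1;answer2;answer3" scheme that system_prompt requests into a clean
# list of suggestions, dropping empty entries.
def parse_suggestions(response):
    return [s.strip() for s in response.split(';') if s.strip()]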
if __name__ == "__main__":
    # Precomputed for reuse; generate() does not consume it yet.
    past_key_values = get_past_key_values(system_prompt)
    demo = gr.Interface(generate, inputs="textbox", outputs="textbox")
    demo.launch()