import os
from typing import Optional, Tuple, Any
from copy import deepcopy
from functools import partial

import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dataclasses import dataclass

# chatml_template = """{% for message in messages %}
# {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
# {% endfor %}"""
# pipe.tokenizer.chat_template = chatml_template

# TheBloke says this is the right (ChatML) template for this model.
prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

system_prompt = '''You are given a partial input text for another AI chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete the text to save the user time.

Don't suggest anything if there are no good suggestions. Make sure the suggestions are valid completions of the text! Suggest only up to 5 words ahead.
The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers). Answers should be only the completions themselves.
If you have nothing as a completion, return "".

Examples:
(1)
User: Help me write a sentiment analysis pipeline
Assistant: using huggingface;using NLTK;using python
(2)
User: My name is
Assistant: (nothing much to contribute at this point. return nothing)
(3)
User: Help me find a present for my
Assistant: girlfriend;mother;father;friend
'''

# setup
torch.set_grad_enabled(False)
device = "cpu"
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']
# Authenticate the model download with the token from the Space's secrets.
pipe = pipeline("text-generation", model=model_name, device=device, token=token)
# Return only the completion (not the echoed prompt) and keep it short.
generate_kwargs = {'max_new_tokens': 20, 'return_full_text': False}

# '''
# You will now get a blank message from the user and then after your answer, the user will give you the text to complete:
# Example:
# >> User:
# >> Assistant:
# >> User: Help me write a sentiment analysis pipeline
# >> Assistant: using huggingface;using NLTK;using python
# '''

start_messages = [
    {'role': 'system', 'content': system_prompt},
    # {'role': 'user', 'content': ' '},
    # {'role': 'assistant', 'content': ''}
]


# functions
# @dataclass
# class PastKV:
#     past_key_values: Any = None

# past_key_values = PastKV()


def past_kv_to_device(past_kv, device, dtype):
    # Rebuild the per-layer (key, value) tensors on the target device/dtype
    # (assumes the legacy tuple-of-(key, value) cache format).
    return tuple(
        (torch.tensor(k).to(device).to(dtype), torch.tensor(v).to(device).to(dtype))
        for k, v in past_kv
    )


def detach_past_kv(past_kv):
    # Convert the cache to nested Python lists so it can be serialized and
    # returned from the @spaces.GPU call.
    return tuple(
        (k.cpu().detach().numpy().tolist(), v.cpu().detach().numpy().tolist())
        for k, v in past_kv
    )


@spaces.GPU
def set_past_key_values():
    # Run the constant system prompt once and cache its key/value states so that
    # later generations can reuse this prefix instead of re-encoding it every time.
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')

    # Check that the system prompt is indeed a prefix of a full chat-formatted message.
    test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
    tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()

    return detach_past_kv(model(tokenized.to(model.device)).past_key_values)
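

# Hedged sketch (not wired into the app): the system prompt asks the model to answer
# in the form "answer1;answer2;answer3", so a caller could split the raw completion
# into individual suggestions like this. The helper name and the cap of 4 suggestions
# are illustrative assumptions, not part of the original code.
def parse_suggestions(raw_response, max_suggestions=4):
    """Split the ';'-separated completions and drop empty entries."""
    suggestions = [s.strip() for s in raw_response.split(';')]
    return [s for s in suggestions if s][:max_suggestions]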


# @spaces.GPU
def generate(text, past_key_values):
    # messages = [
    #     *start_messages,
    #     {'role': 'user', 'content': text}
    # ]
    cur_generate_kwargs = deepcopy(generate_kwargs)

    if past_key_values:
        past_key_values = past_kv_to_device(past_key_values, pipe.model.device, pipe.model.dtype)
        cur_generate_kwargs.update({'past_key_values': past_key_values})

    # The pipeline is called with a plain prompt string; with return_full_text=False
    # 'generated_text' is just the completion, so it can be returned directly.
    response = pipe(
        prompt_format.format(system_message=system_prompt, prompt=text),
        **cur_generate_kwargs
    )[0]['generated_text']
    print(response)
    return response


if __name__ == "__main__":
    with torch.no_grad():
        # past_key_values = set_past_key_values()
        # print(f'{past_key_values=}')
        demo = gr.Interface(partial(generate, past_key_values=None), inputs="textbox", outputs="textbox")
        demo.launch()
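

# Hedged usage example, mirroring the commented-out lines in __main__ above:
# compute the system-prompt cache once, then reuse it per request and split the
# raw output with the parse_suggestions sketch defined earlier. It assumes a GPU
# is available to the @spaces.GPU call and that the model returns a legacy-format
# cache; the example input is taken from example (3) in the system prompt.
#
#     past_key_values = set_past_key_values()
#     raw = generate("Help me find a present for my", past_key_values)
#     print(parse_suggestions(raw))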