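# Hugging Face Space: suggests short auto-completions for a chat input box.
# The key/value cache for the static system prompt is computed once at startup
# so each request only has to encode the user's partial text.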
import os
from functools import partial

import gradio as gr
import spaces
import torch
from transformers import pipeline
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)
generate_kwargs = {'max_new_tokens': 20}  # suggestions are at most a few words long
system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete the text to save the user time.
Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! They don't need to complete the text fully.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
Answers should be only the completions themselves.
Examples:
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"
(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)
(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"
You will now get a blank message from the user and then after your answer, the user will give you the text to complete.
'''
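
# Static conversation prefix shared by every request: the blank user turn and
# the '<Waiting for text>' assistant turn keep the chat template's structure
# (and hence its tokenization) fixed, as the system prompt instructs.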
start_messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': ' '},
    {'role': 'assistant', 'content': '<Waiting for text>'}
]
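
# Inference only: disable autograd globally.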
torch.set_grad_enabled(False)
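
# Precompute the key/value cache for the fixed conversation prefix once at
# startup. @spaces.GPU requests a ZeroGPU device for the duration of the call.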
@spaces.GPU
def get_past_key_values(system_prompt):
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')
    # Check that this is indeed a prefix of the entire message
    test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
    tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()
    return model(tokenized.to(model.device)).past_key_values
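
# Generate a short completion for the user's partial text, reusing the cached
# prefix instead of re-encoding the system prompt on every request.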
@spaces.GPU
def generate(text, past_key_values):
    messages = [
        *start_messages,
        {'role': 'user', 'content': text}
    ]
    response = pipe(messages,
                    past_key_values=past_key_values,
                    **generate_kwargs)[0]['generated_text']
    return response[-1]['content']
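
# The cache is computed once and bound to the handler via functools.partial.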
if __name__ == "__main__":
    past_key_values = get_past_key_values(system_prompt)
    demo = gr.Interface(partial(generate, past_key_values=past_key_values),
                        inputs="textbox", outputs="textbox")
    demo.launch()