import os
from functools import partial

import torch
import gradio as gr
import spaces
from transformers import pipeline


model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']  # Hugging Face access token, read from the Space's secrets

# Pass the token when loading the model so gated/private checkpoints also work.
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)


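# Completions are short phrases, so cap the number of newly generated tokens.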
generate_kwargs = {'max_new_tokens': 20}

system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.

Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! They do not need to complete the text fully.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
Answers should be only the completions themselves.

Examples: 
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"

(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)

(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"
You will now get a blank message from the user and then after your answer, the user will give you the text to complete.
'''

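# Fixed chat prefix (system prompt plus a placeholder exchange); its KV cache is precomputed once below.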
start_messages = [
    {'role': 'system', 'content': system_prompt}, 
    {'role': 'user', 'content': '  '},
    {'role': 'assistant', 'content': '<Waiting for text>'}
]

# Inference only: no gradients are needed anywhere in this app.
torch.set_grad_enabled(False)

@spaces.GPU
def get_past_key_values(messages):
    """Precompute the attention KV cache for the fixed chat prefix so it can be reused on every request."""
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.apply_chat_template(messages, return_tensors='pt')

    # Sanity check: the templated prefix must be a token-level prefix of a longer conversation,
    # otherwise the cached keys/values would not line up with later requests.
    test_messages = [*messages, {'role': 'user', 'content': 'Hello World!'}]
    tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()

    return model(tokenized.to(model.device)).past_key_values


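# Append the user's partial text to the fixed prefix and ask the model for completion suggestions,
# reusing the precomputed past_key_values so the prefix is not re-encoded on every call.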
@spaces.GPU
def generate(text, past_key_values):
    messages = [
        *start_messages,
        {'role': 'user', 'content': text}
    ]
    response = pipe(messages, 
                    past_key_values=past_key_values, 
                    **generate_kwargs)[0]['generated_text']
    return response[-1]['content']

    
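# Bind the precomputed cache to the handler and expose it through a simple text-in/text-out Gradio UI.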
if __name__ == "__main__":
    past_key_values = get_past_key_values(start_messages)
    demo = gr.Interface(partial(generate, past_key_values=past_key_values), 
                        inputs="textbox", outputs="textbox")
    demo.launch()