Spaces:

dar-tau
/

run_inference

Sleeping

File size: 3,326 Bytes

d0d12ff
c62cf54
0b8d742
d0d12ff
 
5bd57b6
b818b3f
4362d26
d0d12ff
4362d26
dad4689
d0d12ff
38ede89
b80761a
 
4362d26
1937eb3
 
c3144ec
 
d0d12ff
c3144ec
1937eb3
28d873f
1937eb3
 
 
28d873f
1937eb3
 
28d873f
 
 
 
 
 
 
 
 
 
 
 
d0d12ff
 
4362d26
92585dc
 
 
 
 
 
4362d26
 
4fc6cc7
 
 
28d873f
4fc6cc7
74d1efc
7eb4c2f
df678ec
 
0faca03
62f6f76
2b202a1
0faca03
7eb4c2f
28d873f
4fc6cc7
3824c46
92585dc
28d873f
92585dc
 
 
28d873f
ac09cd4
74d1efc
28d873f
d0d12ff
4fc6cc7
b818b3f
92585dc
d0d12ff
 
df678ec
92585dc
df678ec
92585dc
a8f6d87
46203c4
 
 
1b01c22
4fc6cc7
ac09cd4
a2adaed
7eb4c2f
1b01c22

import os
from typing import Optional, Tuple, Any
from functools import partial
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dataclasses import dataclass

# Inference only: switch autograd off for the whole process.
torch.set_grad_enabled(False)
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
# NOTE(review): this raises KeyError at import time when `hf_token` is unset,
# yet `token` is not referenced anywhere else in the visible file - confirm
# whether it was meant to be passed to the pipeline or can be dropped.
token = os.environ['hf_token']
# Text-generation pipeline; generate() and set_past_key_values() below use its
# model and tokenizer.
pipe = pipeline("text-generation", model=model_name, device="cuda")
# Extra keyword arguments forwarded to every pipe(...) call in generate().
generate_kwargs = {'max_new_tokens': 20}


# System prompt: instructs the model to propose short auto-completions for a
# partially typed message, answered as a ';'-separated list, and announces the
# blank user message that start_messages contains.
system_prompt = '''You are given a partial input text for another AI chat interface. 
Propose auto-completion to the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.

Don't suggest anything if there are no good suggestions. 
Make sure the suggestions are valid completions of the text! Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 to 4 answers).
Answers should be only the completions themselves. 
You will now get a blank message from the user and then after your answer, the user will give you the text to complete.

'''


# Few-shot examples of the expected "answer1;answer2" output format.
# NOTE(review): not referenced anywhere in the visible file - confirm whether
# it was meant to be appended to the system prompt.
extra_prompt = '''
Examples: 
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"

(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)

(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"
'''


# Fixed conversation prefix shared by every request: the system prompt, a
# blank user turn and a placeholder assistant reply. set_past_key_values()
# runs the model on exactly this prefix, and generate() prepends it to the
# user's text.
start_messages = [
    {'role': 'system', 'content': system_prompt}, 
    {'role': 'user', 'content': '  '},
    {'role': 'assistant', 'content': '<Waiting for text>'}
]


# functions
# @dataclass
# class PastKV:
#     past_key_values: Any = None

# past_key_values = PastKV()


def past_kv_to_device(past_kv, device, dtype):
    """Rebuild a list-serialized key/value cache as tensors.

    Args:
        past_kv: Iterable of ``(keys, values)`` pairs whose members are
            anything ``torch.tensor`` accepts (nested lists, as produced by
            ``detach_past_kv``).
        device: Target device for every tensor.
        dtype: Target dtype for every tensor.

    Returns:
        A tuple of ``(key_tensor, value_tensor)`` pairs, in the same order
        as ``past_kv``.
    """
    def _restore(data):
        # Build the tensor first, then move it, then cast it.
        return torch.tensor(data).to(device).to(dtype)

    restored = []
    for keys, values in past_kv:
        restored.append((_restore(keys), _restore(values)))
    return tuple(restored)

def detach_past_kv(past_kv):
    """Convert a key/value cache of tensors into nested Python lists.

    Args:
        past_kv: Iterable of ``(key_tensor, value_tensor)`` pairs.

    Returns:
        A tuple of ``(key_list, value_list)`` pairs holding plain Python
        numbers, in the same order as ``past_kv``. ``past_kv_to_device``
        performs the reverse conversion.
    """
    # Tensor.tolist() is called directly instead of going through NumPy:
    # NumPy has no bfloat16, so .numpy() raises for bfloat16 caches, while
    # tolist() yields the same nested lists for every dtype.
    return tuple(
        (k.detach().cpu().tolist(), v.detach().cpu().tolist())
        for k, v in past_kv
    )


@spaces.GPU
def set_past_key_values():
    """Run the model on the fixed ``start_messages`` prefix and return its cache.

    Returns:
        The ``past_key_values`` produced by the forward pass, converted to
        nested Python lists by ``detach_past_kv``.

    Raises:
        AssertionError: If the tokenized prefix is not a token-for-token
            prefix of a longer conversation that starts with the same
            messages.
    """
    tokenizer = pipe.tokenizer
    model = pipe.model
    prefix_ids = tokenizer.apply_chat_template(start_messages, return_tensors='pt')

    # Sanity check: appending a user turn must leave the prefix tokens
    # unchanged, otherwise the cached keys/values would not line up with the
    # prompts built in generate().
    probe_messages = list(start_messages)
    probe_messages.append({'role': 'user', 'content': 'Hello World!'})
    probe_ids = tokenizer.apply_chat_template(probe_messages, return_tensors='pt')
    prefix_len = prefix_ids.shape[1]
    is_prefix = (probe_ids[:, :prefix_len] == prefix_ids).all().cpu().item()
    assert is_prefix

    outputs = model(prefix_ids.to(model.device))
    return detach_past_kv(outputs.past_key_values)
    

@spaces.GPU
def generate(text, past_key_values):
    """Ask the pipeline for completions of ``text``.

    Args:
        text: The partial user input to complete.
        past_key_values: List-serialized cache for the ``start_messages``
            prefix, as returned by ``set_past_key_values``.

    Returns:
        The ``'content'`` field of the last message in the pipeline's
        ``'generated_text'`` output.
    """
    model = pipe.model
    # Turn the serialized cache back into tensors matching the model.
    cached_prefix = past_kv_to_device(past_key_values, model.device, model.dtype)

    conversation = list(start_messages)
    conversation.append({'role': 'user', 'content': text})

    outputs = pipe(conversation, past_key_values=cached_prefix, **generate_kwargs)
    generated = outputs[0]['generated_text']
    return generated[-1]['content']

    
if __name__ == "__main__":
    with torch.no_grad():
        # Compute the cache for the fixed prompt prefix once, up front.
        past_key_values = set_past_key_values()
        # Debug dump of the serialized cache.
        print(f'{past_key_values=}')
        # Bind the cache so the UI callback only takes the textbox value.
        complete_fn = partial(generate, past_key_values=past_key_values)
        demo = gr.Interface(complete_fn, inputs="textbox", outputs="textbox")
        demo.launch()