File size: 3,500 Bytes
d0d12ff
c62cf54
dd58665
0b8d742
d0d12ff
 
5bd57b6
b818b3f
4362d26
d0d12ff
1d42cd5
c01c7c6
 
 
 
 
 
b80761a
6b3281f
 
 
 
4362d26
1937eb3
 
c3144ec
 
d0d12ff
c3144ec
1937eb3
2cc4838
d768ac5
94466d1
 
c01c7c6
 
94466d1
 
c01c7c6
 
94466d1
 
c01c7c6
 
94466d1
 
1d42cd5
 
 
14c86d4
906564b
 
 
 
 
1d42cd5
df678ec
 
0faca03
62f6f76
2b202a1
0faca03
7eb4c2f
14c86d4
4fc6cc7
3824c46
dc596e3
4fb163e
 
 
6b3281f
92585dc
6b3281f
 
 
ac09cd4
74d1efc
28d873f
81b9f08
4fc6cc7
dd58665
94466d1
 
 
dd58665
 
81b9f08
 
 
2cc4838
2099ad7
906564b
46203c4
 
 
1b01c22
906564b
81b9f08
906564b
81b9f08
 
1b01c22
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from typing import Optional, Tuple, Any
from copy import deepcopy
from functools import partial
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dataclasses import dataclass


# Full single-turn prompt template: a system turn, the user's text, and an
# opened assistant turn for the model to continue. Filled in with
# str.format(system_message=..., prompt=...).
prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

# Prefix-only variant of the template above: it stops right after the user
# turn is opened, so the formatted text is a textual prefix of any prompt
# built from prompt_format with the same system message.
system_only_prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
'''

# System message substituted into the prompt templates. It instructs the model
# to answer with up to four ';'-separated completions, or "<NOTHING>" when it
# has no suggestion. The text is sent to the model verbatim (including
# trailing spaces), so edits here change model behavior.
system_prompt = '''You are given a partial input text for another AI chat interface. 
Propose auto-completion to the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.

Don't suggest anything if there are no good suggestions. 
Make sure the suggestions are valid completions of the text! Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 to 4 answers).
Answers should be only the completions themselves. If you have nothing as a completion, return "<NOTHING>". 

Examples: 
(1)
User: Help me write a sentiment analysis pipeline
Assistant: using huggingface;using NLTK;using python

(2)
User: My name is
Assistant: <NOTHING> (nothing much to contribute at this point. return nothing)

(3)
User: Help me find a present for my
Assistant: girlfriend;mother;father;friend
'''

# setup
# Autograd is switched off globally; this script only runs inference.
torch.set_grad_enabled(False)
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
# The pipeline (model + tokenizer) is built at import time on the CUDA device.
pipe = pipeline("text-generation", model=model_name, device='cuda')
# Default generation arguments; generate() deep-copies this dict on each call
# before adding per-request entries.
# NOTE(review): 'temperature' is set but 'do_sample' is not — presumably
# sampling is intended; confirm whether the model's generation config enables it.
generate_kwargs = {
    'max_new_tokens': 20, 
    'temperature': 0.8, 
    'repetition_penalty': 1.1
}

def past_kv_to_device(past_kv, device, dtype):
    """Rebuild a key/value cache as tensors on the requested device and dtype.

    Args:
        past_kv: Iterable of ``(key, value)`` pairs; each element is passed
            to ``torch.tensor`` (e.g. the nested lists from ``detach_past_kv``).
        device: Target device for every tensor.
        dtype: Target dtype for every tensor.

    Returns:
        A tuple of ``(key_tensor, value_tensor)`` tuples, one per input pair.
    """
    def _as_tensor(data):
        # Build the tensor first, then move it, then cast it.
        return torch.tensor(data).to(device).to(dtype)

    rebuilt = []
    for key_data, value_data in past_kv:
        rebuilt.append((_as_tensor(key_data), _as_tensor(value_data)))
    return tuple(rebuilt)

def detach_past_kv(past_kv):
    """Convert a key/value cache of tensors into plain nested Python lists.

    Args:
        past_kv: Iterable of ``(key_tensor, value_tensor)`` pairs.

    Returns:
        A tuple of ``(key_list, value_list)`` tuples holding the same values
        as nested lists, with no tensor or device references left.
    """
    def _as_lists(tensor):
        # Copy to CPU, drop the autograd link, then go through NumPy.
        return tensor.cpu().detach().numpy().tolist()

    plain = []
    for key_tensor, value_tensor in past_kv:
        plain.append((_as_lists(key_tensor), _as_lists(value_tensor)))
    return tuple(plain)


@spaces.GPU
def set_past_key_values():
    """Run the system-only prompt through the model and return its KV cache.

    Returns:
        The ``past_key_values`` produced by a forward pass over the formatted
        ``system_only_prompt_format``, converted by ``detach_past_kv`` into
        nested Python lists.
    """
    model = pipe.model
    tokenizer = pipe.tokenizer

    prefix_text = system_only_prompt_format.format(system_message=system_prompt)
    prefix_ids = tokenizer.encode(prefix_text, return_tensors='pt')

    # NOTE: no check is made here that these token ids are a prefix of the
    # token ids of a full prompt; only the text templates share a prefix.
    outputs = model(prefix_ids.to(model.device))
    return detach_past_kv(outputs.past_key_values)
    

@spaces.GPU
def generate(text, past_key_values):
    """Generate auto-completion suggestions for a partial user input.

    Args:
        text: Partial user input; inserted as the user turn of ``prompt_format``.
        past_key_values: Optional precomputed cache in the nested-list form
            returned by ``detach_past_kv``. Falsy values (``None``, empty) are
            ignored and generation runs without a cache.

    Returns:
        The text the model produced for the assistant turn, i.e. the pipeline
        output with the prompt removed.
    """
    cur_generate_kwargs = deepcopy(generate_kwargs)

    if past_key_values:
        # The cache arrives as nested lists; rebuild tensors that match the model.
        cur_generate_kwargs['past_key_values'] = past_kv_to_device(
            past_key_values, pipe.model.device, pipe.model.dtype
        )

    prompt = prompt_format.format(system_message=system_prompt, prompt=text)
    response = pipe(prompt, **cur_generate_kwargs)[0]['generated_text']
    print(response)

    # The pipeline output normally echoes the prompt followed by the new text.
    # Strip the known prompt rather than splitting on the assistant marker:
    # the marker may also occur inside the user's text or the generated text,
    # which would make a positional split return the wrong segment.
    if response.startswith(prompt):
        return response[len(prompt):]

    # Fallback if the prompt was not echoed verbatim: take what follows the
    # last assistant marker, or the whole response when the marker is absent.
    return response.rpartition('<|im_start|>assistant\n')[2]

    
if __name__ == "__main__":
    # Build and launch the Gradio UI: one textbox in, one textbox out.
    # The precomputed system-prompt cache is not used; generate() is always
    # called with past_key_values=None.
    with torch.no_grad():
        handler = partial(generate, past_key_values=None)
        demo = gr.Interface(handler, inputs="textbox", outputs="textbox")
        demo.launch()