Update app.py
app.py
CHANGED
@@ -9,11 +9,6 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from dataclasses import dataclass
 
 
-# chatml_template = """{% for message in messages %}
-# {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
-# {% endfor %}"""
-# pipe.tokenizer.chat_template = chatml_template  # TheBloke says this is the right template for this model
-
 prompt_format = '''<|im_start|>system
 {system_message}<|im_end|>
 <|im_start|>user
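Note: the commit keeps the hand-rolled ChatML-style prompt_format string and drops the commented-out chat_template experiment. The hunk is cut off after the user turn, so the sketch below fills in the {prompt} field (confirmed by the later .format(..., prompt=text) call) and assumes a standard trailing assistant tag; only the first three lines are actually visible in the diff.

prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''  # the assistant tag is an assumption, not shown in this hunk

print(prompt_format.format(system_message="You are a helpful assistant.",
                           prompt="Suggest three follow-up questions."))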
@@ -51,37 +46,10 @@ Assistant: girlfriend;mother;father;friend
 
 # setup
 torch.set_grad_enabled(False)
-device = "cpu"
 model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
 pipe = pipeline("text-generation", model=model_name, device='cuda')
 generate_kwargs = {'max_new_tokens': 20}
 
-# '''
-# You will now get a blank message from the user and then after your answer, the user will give you the text to complete:
-# Example:
-
-# >> User:
-# >> Assistant: <Waiting for text>
-# >> User: Help me write a sentiment analysis pipeline
-# >> Assistant: using huggingface;using NLTK;using python
-# '''
-
-
-start_messages = [
-    {'role': 'system', 'content': system_prompt},
-    # {'role': 'user', 'content': ' '},
-    # {'role': 'assistant', 'content': '<Waiting for text>'}
-]
-
-
-# functions
-# @dataclass
-# class PastKV:
-#     past_key_values: Any = None
-
-# past_key_values = PastKV()
-
-
 def past_kv_to_device(past_kv, device, dtype):
     return tuple((torch.tensor(k).to(device).to(dtype), torch.tensor(v).to(device).to(dtype)) for k, v in past_kv)
 
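Note: the setup hunk drops the dead CPU/device experiments and the unused start_messages/PastKV scaffolding but keeps past_kv_to_device, which moves a legacy tuple-of-(key, value) cache onto a given device and dtype. A toy round trip with made-up shapes (the real cache comes from the 7B model, so these sizes are illustrative only):

import torch

def past_kv_to_device(past_kv, device, dtype):
    # torch.tensor(...) on an existing tensor copies it before the device/dtype moves
    return tuple((torch.tensor(k).to(device).to(dtype), torch.tensor(v).to(device).to(dtype)) for k, v in past_kv)

toy_cache = tuple(
    (torch.randn(1, 2, 4, 8), torch.randn(1, 2, 4, 8))  # one (key, value) pair per layer
    for _ in range(2)
)
moved = past_kv_to_device(toy_cache, "cpu", torch.float16)
print(moved[0][0].dtype, moved[0][0].shape)  # torch.float16 torch.Size([1, 2, 4, 8])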
@@ -104,20 +72,17 @@ def set_past_key_values():
     return detach_past_kv(model(tokenized.to(model.device)).past_key_values)
 
 
-
+@spaces.GPU
 def generate(text, past_key_values):
-    # messages = [
-    #     *start_messages,
-    #     {'role': 'user', 'content': text}
-    # ]
-
     cur_generate_kwargs = deepcopy(generate_kwargs)
 
     if past_key_values:
         past_key_values = past_kv_to_device(past_key_values, pipe.model.device, pipe.model.dtype)
         cur_generate_kwargs.update({'past_key_values': past_key_values})
 
-    response = pipe(
+    response = pipe(
+        prompt_format.format(system_message=system_prompt, prompt=text), **cur_generate_kwargs
+    )[0]['generated_text']
     print(response)
     return response[-1]['content']
 
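Note: the new @spaces.GPU decorator comes from the `spaces` package used on ZeroGPU Spaces; it attaches a GPU only for the duration of each decorated call. The corresponding import is not visible in this diff, so it is presumably elsewhere in the file. A minimal sketch of the pattern, assuming it runs on a Space with that package installed:

import spaces
import torch

@spaces.GPU  # a GPU is allocated only while this function runs
def gpu_check(_prompt: str) -> str:
    return f"cuda available inside the call: {torch.cuda.is_available()}"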
@@ -126,6 +91,8 @@ if __name__ == "__main__":
     with torch.no_grad():
         past_key_values = set_past_key_values()
     pipe.model = pipe.model.cpu()
-    demo = gr.Interface(
-
+    demo = gr.Interface(
+        partial(generate, past_key_values=past_key_values),
+        inputs="textbox", outputs="textbox"
+    )
     demo.launch()
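Note: the precomputed past_key_values are bound into the handler with functools.partial (the import is not shown in this hunk), so the Gradio interface only exposes the single textbox input. A stripped-down sketch of that wiring with a stand-in cache object:

from functools import partial
import gradio as gr

def generate(text, past_key_values):
    # stand-in handler; the real one formats the prompt and calls the pipeline
    return f"got {len(text)} chars (cache preloaded: {past_key_values is not None})"

cached_state = object()  # stand-in for the precomputed system-prompt KV cache
demo = gr.Interface(partial(generate, past_key_values=cached_state),
                    inputs="textbox", outputs="textbox")
# demo.launch()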