Update app.py

app.py CHANGED
@@ -2,17 +2,17 @@ import os
 import gradio as gr
 import spaces
 import torch
+from typing import Optional
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from dataclasses import dataclass
 
-
+torch.set_grad_enabled(False)
 model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
 token = os.environ['hf_token']
-
 pipe = pipeline("text-generation", model=model_name, device="cuda")
-
-
 generate_kwargs = {'max_new_tokens': 20}
 
+
 system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completion to the text. You have several roles:
 - Fight under-specification.
 - Complete text to save the user time.
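
Note on this hunk, not part of the commit: torch.set_grad_enabled(False) switches autograd off process-wide, which suits an inference-only Space since no computation graph is recorded and memory is saved. A minimal standalone sketch of the effect:

import torch

torch.set_grad_enabled(False)            # process-wide: autograd stops recording
x = torch.ones(2, requires_grad=True)
y = (x * 3).sum()                        # would normally build a graph
print(y.requires_grad)                   # False: no backward pass is possible
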
@@ -37,13 +37,18 @@ Assistant: "girlfriend;mother;father;friend"
 You will now get a blank message from the user and then after your answer, the user will give you the text to complete.
 '''
 
+
 start_messages = [
     {'role': 'system', 'content': system_prompt},
     {'role': 'user', 'content': ' '},
     {'role': 'assistant', 'content': '<Waiting for text>'}
 ]
 
-
+
+# functions
+@dataclass
+class PastKV:
+    past_key_values: Optional[torch.Tensor] = None
 
 
 def past_kv_to_device(past_kv, device):
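
The PastKV dataclass introduced in this hunk gives the pre-computed cache a single typed handle instead of a bare nested tuple. A small usage sketch, where the zeros tensor is a stand-in for a real key/value cache, purely for illustration:

from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class PastKV:
    past_key_values: Optional[torch.Tensor] = None  # None until the prompt is pre-run

cache = PastKV()                           # empty placeholder
cache = PastKV(torch.zeros(1))             # stand-in payload, not a real KV cache
print(cache.past_key_values is not None)   # True once populated
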
@@ -60,7 +65,7 @@ def get_past_key_values(system_prompt):
     tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
     assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()
     past_key_values = model(tokenized.to(model.device)).past_key_values
-    return past_kv_to_device(past_key_values, 'cpu')
+    return PastKV(past_kv_to_device(past_key_values, 'cpu'))
 
 @spaces.GPU
 def generate(text, past_key_values):
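
past_kv_to_device's body is outside this diff. For the legacy tuple-of-tuples layout that model(...).past_key_values returns here, a plausible implementation (an assumption, not the committed code) would be:

def past_kv_to_device(past_kv, device):
    # legacy HF cache layout: a tuple over layers, each a (key, value) tensor pair
    return tuple(tuple(t.to(device) for t in layer) for layer in past_kv)

Returning the cache on 'cpu' keeps the pre-computed system-prompt state out of GPU memory until generate() needs it.
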
@@ -76,6 +81,6 @@ def generate(text, past_key_values):
 
 if __name__ == "__main__":
     past_key_values = get_past_key_values(system_prompt)
-    demo = gr.Interface(partial(generate, past_key_values=past_key_values),
+    demo = gr.Interface(partial(generate, past_key_values=past_key_values.past_key_values),
                         inputs="textbox", outputs="textbox")
     demo.launch()
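
The partial(...) call binds the cache once at startup, so gr.Interface only ever supplies the textbox contents; the new .past_key_values attribute access unwraps the PastKV dataclass before binding. In miniature, with toy stand-ins for the real objects:

from functools import partial

def generate(text, past_key_values):
    return text + " ... (completed using " + repr(past_key_values) + ")"

fn = partial(generate, past_key_values="toy-cache")  # bound once, as in __main__
print(fn("Hello wor"))   # Gradio would call fn with just the user's text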