run_inference / app.py
dar-tau's picture
Update app.py
3824c46 verified
raw
history blame
2.24 kB
import os
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Hub identifier of the chat model loaded by this app.
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"

# Access token taken from the environment; a missing `hf_token` variable
# raises KeyError here, before the model is loaded.
token = os.environ["hf_token"]

# Shared text-generation pipeline, placed on the CUDA device.
pipe = pipeline(task="text-generation", model=model_name, device="cuda")

# Keyword arguments forwarded to each pipeline call made by `generate`.
generate_kwargs = dict(max_new_tokens=20)
# System message sent with every request. It instructs the model to answer
# with up to four short completions separated by ';' (or nothing at all).
system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completion to the text. You have several roles:
- Fight under-specification.
- Complete text to save the user time.
Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! No need for them to complete the text completely.
Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 to 4 answers).
Answers should be only the completions themselves.
Examples:
(1)
User: "Help me write a sentiment analysis pipeline"
Assistant: "using huggingface;using NLTK;using python"
(2)
User: "My name is"
Assistant: "" (nothing much to contribute at this point. return nothing)
(3)
User: "Help me find a present for my"
Assistant: "girlfriend;mother;father;friend"
'''
@spaces.GPU
def get_past_key_values(system_prompt):
    """Run the model once over the system prompt and return its key/value cache.

    The system prompt is rendered through the tokenizer's chat template and
    fed to the model held by the module-level ``pipe``.

    Args:
        system_prompt: Text of the system message to encode.

    Returns:
        The ``past_key_values`` attribute of the model's forward output.

    Raises:
        AssertionError: If the system-only rendering is not a token prefix of
            the same conversation with a user turn appended, in which case the
            cache could not be reused for real requests.
    """
    # Local import: this module does not import torch at top level.
    import torch

    model, tokenizer = pipe.model, pipe.tokenizer
    messages = [{'role': 'system', 'content': system_prompt}]

    # return_dict=True yields a mapping (input_ids, attention_mask, ...) that
    # can be unpacked into the model call below; a bare tensor cannot be
    # expanded with `**`.
    tokenized = tokenizer.apply_chat_template(
        messages, return_tensors='pt', return_dict=True
    )
    prefix_ids = tokenized['input_ids']

    # Check that the system-only rendering really is a prefix of a longer
    # conversation (TODO: make sure this is robust).
    probe_messages = messages + [{'role': 'user', 'content': 'TEST'}]
    probe_ids = tokenizer.apply_chat_template(
        probe_messages, return_tensors='pt', return_dict=True
    )['input_ids']
    # torch.equal also returns False on a shape mismatch (probe shorter than
    # the prefix) instead of raising a broadcasting error. An explicit raise
    # keeps the check alive under `python -O`.
    if not torch.equal(probe_ids[:, :prefix_ids.shape[1]], prefix_ids):
        raise AssertionError(
            'system prompt tokens are not a prefix of the full chat rendering'
        )

    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**tokenized.to(model.device), use_cache=True)
    return outputs.past_key_values
@spaces.GPU
def generate(text):
    """Return the model's reply for a partial user input.

    Builds a two-message conversation (the module-level ``system_prompt``
    followed by ``text`` as the user turn), runs it through ``pipe`` with
    ``generate_kwargs`` and returns the content of the last message in the
    returned conversation.

    Args:
        text: Partial text typed by the user.

    Returns:
        The ``'content'`` field of the final message of the first result.
    """
    conversation = [
        dict(role='system', content=system_prompt),
        dict(role='user', content=text),
    ]
    results = pipe(conversation, **generate_kwargs)
    chat = results[0]['generated_text']
    # The last entry is presumably the newly generated assistant turn.
    final_message = chat[-1]
    return final_message['content']
if __name__ == "__main__":
    # Encode the system prompt once at startup. Nothing visible in this file
    # reads `past_key_values` afterwards.
    past_key_values = get_past_key_values(system_prompt)

    # Single textbox in, single textbox out, backed by `generate`.
    demo = gr.Interface(fn=generate, inputs="textbox", outputs="textbox")
    demo.launch()