import os

import gradio as gr
import spaces
from transformers import pipeline

model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
token = os.environ['hf_token']

# Build the chat pipeline once at startup; under ZeroGPU the model is used on
# the GPU inside the @spaces.GPU-decorated function. Loading a GPTQ checkpoint
# additionally requires the optimum/auto-gptq extras to be installed.
pipe = pipeline("text-generation", model=model_name, device="cuda", token=token)

# Completions should be short, so cap generation tightly.
generate_kwargs = {'max_new_tokens': 20}

system_prompt = '''You are given a partial input text for a chat interface. Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete the text to save the user time.

Don't suggest anything if there are no good suggestions. Make sure the suggestions are valid completions of the text! They don't need to complete the text fully. Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
'''


@spaces.GPU
def generate(text):
    """Ask the model for semicolon-separated completions of the partial text."""
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': text},
    ]
    # The pipeline returns the whole chat transcript; the model's reply is
    # the final message.
    response = pipe(messages, **generate_kwargs)[0]['generated_text']
    return response[-1]['content']


if __name__ == "__main__":
    demo = gr.Interface(generate, inputs="textbox", outputs="textbox")
    demo.launch()
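
# A minimal client-side sketch (hypothetical, not part of the app): once the
# demo is running, the endpoint can be queried with gradio_client. The URL and
# the "/predict" api_name assume Gradio's defaults for a local gr.Interface.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   raw = client.predict("How do I", api_name="/predict")
#   # The system prompt asks for semicolon-separated answers, so split them:
#   suggestions = [s.strip() for s in raw.split(';') if s.strip()]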