from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gradio as gr

torch.set_default_dtype(torch.bfloat16)

# checkpoint = "vsrinivas/falconlite2"  # alternative checkpoint
checkpoint = "tiiuae/falcon-7b-instruct"

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    offload_folder="offload",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# The model is already instantiated and placed on devices above, so the
# pipeline only needs the model and tokenizer objects.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def format_chat_prompt(message, chat_history, instruction):
    """Flatten the system instruction and chat history into a single prompt."""
    prompt = f"System: {instruction}"
    for user_message, bot_message in chat_history:
        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt


def generate_seqs(prompt, max_new_tokens=512, temperature=0.7):
    # max_new_tokens and max_length must not be set together, so only
    # max_new_tokens is passed, kept well inside Falcon's 2048-token context.
    # The pipeline's stop_sequence only supports a single-token sequence, so
    # stop strings are trimmed in respond() instead. return_full_text=False
    # makes the pipeline return only the completion, not the echoed prompt.
    output = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False,
    )
    return output[0]["generated_text"]


def respond(message, chat_history, instruction, temperature=0.7):
    prompt = format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    response = generate_seqs(prompt=prompt, max_new_tokens=512, temperature=temperature)
    # Trim at the first stop sequence so the model's guess at the user's next
    # turn is not shown.
    for stop in ("\nUser:", "<|endoftext|>"):
        response = response.split(stop)[0]
    response = response.strip()
    # Pseudo-streaming: generation is already complete; replay it character by
    # character so the chatbot window fills in gradually.
    acc_text = ""
    for text_token in response:
        acc_text += text_token
        last_turn = list(chat_history.pop(-1))
        last_turn[-1] = acc_text
        chat_history = chat_history + [last_turn]
        yield "", chat_history


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # General purpose chatbot - test & demo app by Srinivas.V..
        ## As this runs on a free hosted platform (limited compute and memory), the app is slow and may not give appropriate answers after a few dialogues. Type in your prompt, click Submit and wait for the response before typing your next prompt.
        """)
    chatbot = gr.Chatbot(height=500)  # sized to fit the notebook
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(
            label="System message",
            lines=2,
            value="A conversation between a user and an LLM-based AI assistant. "
                  "The assistant gives helpful and honest answers.",
        )
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])

gr.close_all()
demo.queue().launch()
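
# ---------------------------------------------------------------------------
# Optional sketch: true token-level streaming. respond() above replays the
# finished string character by character; for genuine incremental output,
# transformers provides TextIteratorStreamer, which yields decoded text while
# model.generate() runs in a background thread. This is an illustrative
# sketch, not part of the original app: the helper name stream_seqs and the
# parameter values are assumptions. To try it, uncomment it, define it above
# the Gradio block, and iterate over it in respond() instead of calling
# generate_seqs().
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def stream_seqs(prompt, max_new_tokens=512, temperature=0.7):
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True,
#                                     skip_special_tokens=True)
#     generation_kwargs = dict(inputs, streamer=streamer,
#                              max_new_tokens=max_new_tokens,
#                              do_sample=True, temperature=temperature,
#                              top_k=10)
#     Thread(target=model.generate, kwargs=generation_kwargs).start()
#     for chunk in streamer:  # each chunk is newly decoded text
#         yield chunk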