import os
import random

import gradio as gr
import wget
from llama_cpp import Llama

# Download the quantized Nous-Hermes-13B GGML model, skipping the download
# if the file is already present locally.
url = 'https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q2_K.bin'
filename = os.path.basename(url)
if not os.path.exists(filename):
    filename = wget.download(url)

# Load the model with a random seed so each run samples differently.
llm = Llama(model_path=filename, seed=random.randint(1, 2**31))

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])  # clears the textbox and the chat history
    # instruction = gr.Textbox(label="Instruction", placeholder="")

    def user(user_message, history):
        # Append the user's message to the history and disable the textbox
        # until the bot has finished streaming its reply.
        return gr.update(value="", interactive=False), history + [[user_message, None]]

    def bot(history):
        # instruction = history[-1][1] or ""
        user_message = history[-1][0]

        # Build an Alpaca-style prompt directly from tokens.
        # tokens1 = llm.tokenize(b"### Instruction: ")
        # tokens2 = llm.tokenize(instruction.encode())
        input_tokens = llm.tokenize(b"### Input: ")
        message_tokens = llm.tokenize(user_message.encode())
        response_tokens = llm.tokenize(b"### Response:")
        tokens = input_tokens + message_tokens + response_tokens

        history[-1][1] = ""
        count = 0
        for token in llm.generate(tokens, top_k=50, top_p=0.73, temp=0.72, repeat_penalty=1.1):
            text = llm.detokenize([token]).decode()
            count += 1
            # Stop after 500 tokens or when the model emits end-of-sequence.
            if count >= 500 or token == llm.token_eos():
                break
            history[-1][1] += text
            yield history  # stream the partial reply to the Chatbot

    response = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    # Re-enable the textbox once the bot has finished responding.
    response.then(lambda: gr.update(interactive=True), None, [msg], queue=False)

demo.queue()  # queuing is required for generator (streaming) callbacks
demo.launch(debug=True)
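
# Note: llm.detokenize([token]).decode() can raise UnicodeDecodeError when a
# multi-byte UTF-8 character is split across two tokens. A minimal sketch of a
# safer streaming loop, assuming the same llm object and sampling parameters
# (stream_text is a hypothetical helper, not part of llama-cpp-python):
#
#     import codecs
#
#     def stream_text(token_iter, max_tokens=500):
#         # Buffer incomplete UTF-8 byte sequences; yield only complete text.
#         decoder = codecs.getincrementaldecoder("utf-8")()
#         for count, token in enumerate(token_iter, start=1):
#             if count >= max_tokens or token == llm.token_eos():
#                 break
#             chunk = decoder.decode(llm.detokenize([token]))
#             if chunk:
#                 yield chunk
#
# Inside bot(), the generation loop would then become:
#
#     for chunk in stream_text(llm.generate(tokens, top_k=50, top_p=0.73,
#                                           temp=0.72, repeat_penalty=1.1)):
#         history[-1][1] += chunk
#         yield history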