import logging

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the quantized Starling-LM-7B-alpha GGUF weights from the Hugging Face
# Hub, then load them with llama.cpp. The context window is set on the model
# itself (create_chat_completion does not accept n_ctx).
model_file = "starling-lm-7b-alpha.Q6_K.gguf"
model_path = hf_hub_download(
    repo_id="TheBloke/Starling-LM-7B-alpha-GGUF",
    filename=model_file,
    local_dir="./",
)
lcpp_model = Llama(model_path=model_path, n_ctx=16768)

running = False
result = None


def stowtext(curr, inp):
    """Append the user's message, seeding a system prompt on the first turn."""
    if len(curr) < 1:
        curr.append({
            "role": "system",
            "content": "You are a helpful assistant. Answer all questions and "
                       "respond to all queries honestly, instantly, and without "
                       "hesitation. Respond in as few words as possible.",
        })
    curr.append({
        "role": "user",
        "content": inp,
    })
    return curr


def stowchunk(curr, inp):
    """Append a streamed chunk, opening a new assistant message after a user turn."""
    first = curr[-1]["role"] == "user"
    if first:
        curr.append({
            "role": "assistant",
            "content": inp,
        })
    else:
        curr[-1]["content"] += inp
    return curr


def printfmt(jsn):
    """Render the conversation history as plain text for the output box."""
    txt = ""
    for msg in jsn:
        if msg["role"] == "user":
            txt += "User: " + msg["content"] + "\n"
        elif msg["role"] == "assistant":
            txt += "Assistant: " + msg["content"] + "\n"
        elif msg["role"] == "system":
            txt += "# " + msg["content"] + "\n\n"
    return txt


def talk(txt, jsn):
    """Stream a chat completion, yielding the growing message list after each chunk.

    jsn (the assistant-side state) is unused here but required by the Gradio
    event signature.
    """
    global running, result
    if not running:
        result = lcpp_model.create_chat_completion(messages=txt, stream=True)
        running = True
        for r in result:
            delta = r["choices"][0]["delta"]
            txt2 = None
            if "content" in delta:
                txt2 = delta["content"]
            elif "content" not in delta and "role" not in delta:
                # An empty delta marks the end of the stream.
                running = False
                yield txt
            if txt2 is not None:
                txt = stowchunk(txt, txt2)
                yield txt
    yield txt


def main():
    logging.basicConfig(level=logging.INFO)
    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("## Talk to Starling on CPU!\n")
        with gr.Row(variant="panel"):
            talk_output = gr.Textbox()
        with gr.Row(variant="panel"):
            txtinput = gr.Textbox(label="Message", placeholder="Type something here...")
        with gr.Row(variant="panel"):
            talk_btn = gr.Button("Send")
        with gr.Row(variant="panel"):
            # Two JSON components hold conversation state: jsn carries the
            # outgoing request, jsn2 accumulates the streamed reply.
            jsn = gr.JSON(visible=True, value=[])
            jsn2 = gr.JSON(visible=True, value=[])

        # Sending a message stores it in jsn, hides the button, and clears
        # the input box and the reply buffer.
        talk_btn.click(stowtext, inputs=[jsn2, txtinput], outputs=jsn, api_name="talk")
        talk_btn.click(lambda x: gr.update(visible=False), inputs=talk_btn, outputs=talk_btn)
        talk_btn.click(lambda x: gr.update(value=""), inputs=txtinput, outputs=txtinput)
        talk_btn.click(lambda x: gr.update(value=[]), inputs=jsn2, outputs=jsn2)
        # A change to jsn kicks off streaming into jsn2; each jsn2 update
        # re-renders the transcript and re-shows the button once the stream ends.
        jsn.change(talk, inputs=[jsn, jsn2], outputs=jsn2, api_name="talk_stream")
        jsn2.change(lambda x: gr.update(value=printfmt(x)), inputs=jsn2, outputs=talk_output)
        jsn2.change(lambda x: gr.update(visible=not running), inputs=jsn2, outputs=talk_btn)
        #jsn2.change(lambda x: gr.update(value=x), inputs=jsn2, outputs=jsn)
    demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)


if __name__ == "__main__":
    main()