"""Gradio chat UI backed by a local llama.cpp model downloaded from Hugging Face Hub."""
import gradio as gr
import time    # kept from original; may be used elsewhere / by future timing code
import ctypes  # kept from original ("to run on C api directly")
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download  # load weights from Hugging Face Hub

# Download the GGML weights (cached by hf_hub_download) and load the model.
# n_ctx=2048 gives a larger context window for multi-turn conversations.
llm = Llama(
    model_path=hf_hub_download(
        repo_id="TheBloke/airoboros-l2-13b-gpt4-m2.0-GGML",
        filename="airoboros-l2-13b-gpt4-m2.0.ggmlv3.q6_K.bin",
    ),
    n_ctx=2048,
)


def generate_text(input_text, history):
    """Run one chat turn against the model.

    Parameters
    ----------
    input_text : str
        The user's new message from the textbox.
    history : list[list[str]] | None
        Prior [user, assistant] pairs held by the gr.Chatbot component.
        May be None on the very first turn.

    Returns
    -------
    tuple[str, list[list[str]]]
        An empty string (clears the textbox) and the updated history.
    """
    history = history or []  # Chatbot can hand us None before any turn exists
    # Flatten prior turns into a single prompt prefix.
    conversation_context = " ".join(f"{user} {bot}" for user, bot in history)
    full_conversation = f"{conversation_context} Q: {input_text} \n A:"
    # echo=False: return only the completion. The original used echo=True,
    # which prepends the entire prompt to the response, so every chat turn
    # re-displayed the whole conversation as the "answer".
    output = llm(full_conversation, max_tokens=1024, stop=["Q:", "\n"], echo=False)
    response = output["choices"][0]["text"]
    history.append([input_text, response])
    return "", history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # The original wired `submit.click(...)` without ever creating a submit
    # button (NameError at import), and its lambda called an undefined
    # `predict`. Define the button and route it to generate_text instead.
    submit = gr.Button("Submit")
    clear = gr.ClearButton([msg, chatbot])

    # Pressing Enter in the textbox runs a chat turn.
    msg.submit(
        fn=generate_text,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        show_progress="full",
        api_name="predict",
    )
    # Clicking the button mirrors the textbox submit.
    submit.click(
        fn=generate_text,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
        show_progress="full",
    )
    # Reset the chat pane; ClearButton already clears msg and chatbot values.
    clear.click(lambda: None, None, chatbot, queue=False)

# Serialize requests (one at a time) and cap the waiting queue at 5.
demo.queue(concurrency_count=1, max_size=5)
demo.launch()