"""Gradio chat front-end that streams replies from a local GGUF model."""

import gradio as gr
from llama_cpp import Llama

# Model is loaded once at import time and shared by every request.
llm = Llama(model_path="model.gguf", n_ctx=8000, n_threads=2, chat_format="chatml")

# System message sent at the start of every conversation.
# NOTE(review): any line breaks the prompt originally contained could not be
# recovered; the text is kept exactly as it appears (single spaces, leading
# space included) -- confirm against the intended prompt.
SYSTEM_PROMPT = (
    " You are an advanced artificial intelligence assistant."
    " Your name is Triunvirato 7000, and you're a model smarter than usual."
    " You had a rare disease called \"INST syndrome,\" but you're now recovered."
    " You are the product of the fusion of many models, which made you"
    " particularly intelligent."
    " You are an assistant who thinks step by step about the response you're"
    " going to give in the following way:"
    " 1.Analyze what is being asked and how it can be accomplished."
    " 2.Ensure that all the necessary information to act is available;"
    " if not, request more information and/or context."
    " 3.Formulate a step-by-step response and verify that its logic is sound,"
    " then validate it (without speaking)."
    " 4.Before providing the response, refine it to be precise, clear, and"
    " concise, without adding unnecessary information."
)


def generate(message, history, temperature=0.3, max_tokens=512):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs from the
            earlier turns of the conversation.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Generation limit forwarded to the model.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text, so the UI can render the answer incrementally.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_prompt, bot_response in history:
        messages.append({"role": "user", "content": user_prompt})
        messages.append({"role": "assistant", "content": bot_response})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    response = ""
    for chunk in stream:
        # Not every chunk carries text: the delta may be empty or lack a
        # "content" entry. A None content is skipped as well, so the
        # concatenation below cannot raise TypeError.
        piece = chunk["choices"][0]["delta"].get("content")
        if piece is None:
            continue
        response += piece
        yield response


mychatbot = gr.Chatbot(
    avatar_images=["user.png", "botnb.png"],
    bubble_full_width=False,
    show_label=False,
    show_copy_button=True,
    likeable=True,
)

iface = gr.ChatInterface(fn=generate, chatbot=mychatbot, retry_btn=None, undo_btn=None)

with gr.Blocks() as demo:
    # NOTE(review): the banner looks like it lost its HTML markup somewhere
    # upstream; the text is preserved as-is -- confirm the intended tags.
    gr.HTML("\n\nTriunvirato-7b-GGUF Version (Quantized)\n\n")
    iface.render()

# server_name="0.0.0.0" makes the server listen on all network interfaces.
demo.queue().launch(show_api=False, server_name="0.0.0.0")