import logging

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

"""
Reference CLI loop (pseudocode -- `Model` stands in for any streaming model
wrapper; the Gradio app below is the actual entry point):

model = Model(model_path='/path/to/model.bin')
while True:
    try:
        prompt = input("You: ")
        if prompt == '':
            continue
        print("AI: ", end='', flush=True)
        for token in model.generate(prompt):
            print(token, end='', flush=True)
        print()
    except KeyboardInterrupt:
        break
"""

# Download the quantized MiniChat-3B GGUF weights from the Hugging Face Hub
# (cached locally after the first run) and load them with llama.cpp.
model_path = "minichat-3b.q8_0.gguf"
mdlpath = hf_hub_download(repo_id="afrideva/MiniChat-3B-GGUF", filename=model_path)

lcpp_model = Llama(model_path=mdlpath)


def m3b_talk(text):
    """Run one chat turn through MiniChat-3B and return the assistant's reply."""
    # MiniChat expects the " [|User|] ... [|Assistant|]" conversation template.
    formatted_query = " [|User|]" + text + " [|Assistant|]"

    # Stop at the next user turn or at a newline. echo=True returns the prompt
    # together with the completion, so the prompt is stripped off below.
    r = lcpp_model(formatted_query, stop=["[|User|]", "\n"], echo=True)

    resp = ""
    for c in r["choices"]:
        resp += c["text"]
    logging.info(resp)

    return resp.replace(formatted_query, "")


def main():
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("## Talk to MiniChat-3B\n\nTalk to MiniChat-3B.")
        with gr.Row(variant="panel"):
            with gr.Column(variant="panel"):
                m3b_talk_input = gr.Textbox(label="Message", placeholder="Type something here...")
            with gr.Column(variant="panel"):
                m3b_talk_output = gr.Textbox(label="Response")
                m3b_talk_btn = gr.Button("Send")

        # Event handlers must be registered while the Blocks context is open.
        m3b_talk_btn.click(m3b_talk, inputs=m3b_talk_input, outputs=m3b_talk_output, api_name="talk_m3b")

    demo.queue().launch()


if __name__ == "__main__":
    main()
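
# Usage sketch for the exposed API endpoint (an assumption, not part of this
# app: it presumes the server is reachable on Gradio's default local address
# and that `gradio_client` is installed via `pip install gradio_client`):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   # api_name matches the name registered on the click handler above.
#   reply = client.predict("Hello!", api_name="/talk_m3b")
#   print(reply)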