import logging

import gradio as gr
from huggingface_hub import hf_hub_download
from pyllamacpp.model import Model

# A minimal terminal chat loop, kept here for reference:
"""
model = Model(model_path='/path/to/model.bin')
while True:
    try:
        prompt = input("You: ")
        if prompt == '':
            continue
        print("AI: ", end='')
        for token in model.generate(prompt):
            print(token, end='', flush=True)
        print()
    except KeyboardInterrupt:
        break
"""

# Download the quantized MiniChat-3B weights from the Hugging Face Hub
# and load them with pyllamacpp.
model_path = "minichat-3b.q8_0.gguf"
mdlpath = hf_hub_download(repo_id="afrideva/MiniChat-3B-GGUF", filename=model_path)
lcpp_model = Model(model_path=mdlpath)


def m3b_talk(text):
    """Generate a complete response for a single message."""
    resp = ""
    for token in lcpp_model.generate(text):
        resp += token
    return resp


def main():
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("## Talk to MiniChat-3B\n\nA simple chat interface for the MiniChat-3B GGUF model.")
        with gr.Row(variant="panel"):
            with gr.Column(variant="panel"):
                m3b_talk_input = gr.Textbox(label="Message", placeholder="Type something here...")
            with gr.Column(variant="panel"):
                m3b_talk_output = gr.Textbox(label="Response")
                m3b_talk_btn = gr.Button("Send")

        m3b_talk_btn.click(m3b_talk, inputs=m3b_talk_input, outputs=m3b_talk_output, api_name="talk_m3b")

    # Single-worker queue (Gradio 3.x-style API).
    demo.queue(concurrency_count=1).launch()


if __name__ == "__main__":
    main()
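
# Because .click() above registers the handler with api_name="talk_m3b",
# the running app also exposes it as a named API endpoint. The snippet below
# is a minimal client-side sketch, assuming the app is running locally on
# Gradio's default port (7860) and that the gradio_client package is
# installed; it is illustrative, not part of the app itself.
#
#     from gradio_client import Client
#
#     client = Client("http://127.0.0.1:7860/")
#     # The endpoint name mirrors the api_name passed to .click().
#     reply = client.predict("Hello!", api_name="/talk_m3b")
#     print(reply)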