import logging from typing import cast from threading import Lock from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline import torch from conversation import get_default_conv_template import gradio as gr from llama_cpp import Llama import json from huggingface_hub import hf_hub_download model_path = "starling-lm-7b-alpha.Q6_K.gguf" #mdlpath = hf_hub_download(repo_id="afrideva/MiniChat-3B-GGUF", filename=model_path) lcpp_model = Llama(model_path=model_path) global otxt otxt = "" def m3b_talk(text): global otxt resp = "" formattedQuery = "GPT4 User: " + text + "<|end_of_text|>GPT4 Assistant:" r = lcpp_model(formattedQuery, stop=["GPT4 User:", "\n\n\n"], echo=True, stream=True) rfq = False for c in r: otxt += c["choices"][0]["text"] if formattedQuery in otxt and not rfq: otxt.replace(formattedQuery, "") rfq = True else: yield otxt print(resp) return otxt #return resp.replace(formattedQuery, "") def main(): global otxt logging.basicConfig(level=logging.INFO) with gr.Blocks() as demo: with gr.Row(variant="panel"): gr.Markdown("## Talk to MiniChat-3B\n\nTalk to MiniChat-3B.") with gr.Row(variant="panel"): with gr.Column(variant="panel"): m3b_talk_input = gr.Textbox(label="Message", placeholder="Type something here...") with gr.Column(variant="panel"): m3b_talk_output = gr.Textbox() m3b_talk_btn = gr.Button("Send") m3b_talk_btn.click(m3b_talk, inputs=m3b_talk_input, outputs=m3b_talk_output, api_name="talk_m3b") demo.queue().launch() if __name__ == "__main__": main()