import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

repo_id = "bartowski/Qwen2.5-1.5B-Instruct-GGUF"
filename = "Qwen2.5-1.5B-Instruct-Q8_0.gguf"
CONTEXT_SIZE = 1024
N_THREADS = 2  # the free CPU tier provides 2 vCPUs

llm = None
model_loaded = False


def load_model(progress=gr.Progress()):
    """Download the GGUF file and load it into llama.cpp."""
    global llm, model_loaded
    progress(0, desc="Starting model download")
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    progress(0.5, desc="Loading model into memory")
    llm = Llama(
        model_path=model_path,
        n_threads=N_THREADS,
        n_batch=8,
        verbose=False,
        n_ctx=CONTEXT_SIZE,
    )
    progress(1, desc="Model loading complete")
    model_loaded = True
    return "Model loading is complete."


def get_llama_response(prompt, temperature):
    """Return a stream of completion chunks, or a one-item fallback list
    shaped like the stream so the caller can iterate it the same way."""
    global llm, model_loaded
    if not model_loaded:
        return [{"choices": [{"text": "The model is still loading. Please wait..."}]}]
    try:
        return llm(
            prompt,
            max_tokens=1024,
            temperature=temperature,
            top_p=0.95,
            repeat_penalty=1.1,
            stream=True,
        )
    except Exception as e:
        return [{"choices": [{"text": f"An error occurred: {str(e)}"}]}]


def greet(prompt, temperature):
    """Generator that accumulates streamed chunks and yields the growing text."""
    global model_loaded
    if not model_loaded:
        # This function is a generator, so a plain `return "..."` would emit
        # nothing to Gradio; the message has to be yielded instead.
        yield "The model is still loading. Please wait..."
        return
    full_response = ""
    for output in get_llama_response(prompt, temperature):
        if len(output["choices"]) > 0:
            text_chunk = output["choices"][0]["text"]
            full_response += text_chunk
            yield full_response


with gr.Blocks() as demo:
    gr.Markdown("# LLM Chatbot (Streaming)")
    gr.HighlightedText(
        value=[
            ("This is a ", None),
            ("test application", "neutral"),
            (" for an ", None),
            ("LLM", "positive"),
            (".\n", None),
            ("Its content is ", None),
            ("experimental", "neutral"),
            (", so ", None),
            ("do not use it for important decisions", "negative"),
            (".", None),
        ],
        label="Notice",
        show_label=False,
    )
    with gr.Row():
        input_text = gr.Textbox(label="Enter a prompt")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
    output_text = gr.Textbox(label="Generated response")
    submit_button = gr.Button("Submit")
    gr.Textbox(value=filename, label="Model", interactive=False)
    loading_status = gr.Textbox(label="Loading Status")
    submit_button.click(fn=greet, inputs=[input_text, temperature], outputs=output_text)
    input_text.submit(fn=greet, inputs=[input_text, temperature], outputs=output_text)
    # Load the model once the page opens so the first prompt does not block on it.
    demo.load(fn=load_model, outputs=loading_status)

demo.queue()
demo.launch()
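# Note: llm(prompt, ...) above performs raw text completion. Instruct-tuned
# models such as Qwen2.5-Instruct usually respond better when their chat
# template is applied. A minimal sketch of an alternative, assuming
# llama-cpp-python's create_chat_completion (which applies the chat template
# embedded in the GGUF metadata) and the same streaming loop as greet():
#
#     stream = llm.create_chat_completion(
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=1024,
#         temperature=temperature,
#         stream=True,
#     )
#     for chunk in stream:
#         delta = chunk["choices"][0].get("delta", {})
#         full_response += delta.get("content", "")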