Spaces:

sawac
/

llama_chat_test

Runtime error

File size: 3,214 Bytes

086aec6
8ab3f99
 
 
e4456e8
 
086aec6
1362f1f
6f38b94
086aec6
e4456e8
acd4aa6
086aec6
e4456e8
 
086aec6
84e2d22
 
 
e4456e8
84e2d22
3a0ea7c
e4456e8
 
 
 
 
 
 
84e2d22
e4456e8
84e2d22
3a0ea7c
086aec6
3a0ea7c
e4456e8
 
 
 
3a0ea7c
e4456e8
 
8ab3f99
3a0ea7c
 
e4456e8
 
 
 
8ab3f99
3a0ea7c
8ab3f99
 
 
 
3a0ea7c
 
8ab3f99
e4456e8
680f800
42bba8a
5374db7
 
 
 
f7c8cc1
5374db7
 
 
f7c8cc1
8caeadf
5374db7
42bba8a
0eb6577
 
e4456e8
84e2d22
 
e4456e8
3a0ea7c
 
e4456e8
3a0ea7c
 
e4456e8
3a0ea7c
36d223b
84e2d22
086aec6
3a0ea7c
8ab3f99
84e2d22

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os
import threading
import time

# Hugging Face Hub repository and GGUF file that load_model() downloads.
repo_id = "bartowski/Qwen2.5-1.5B-Instruct-GGUF"
filename = "Qwen2.5-1.5B-Instruct-Q8_0.gguf"

# Passed to Llama() as n_ctx and n_threads respectively.
CONTEXT_SIZE = 2048
N_THREADS = 2 # The free-tier CPU has 2 cores, so use 2 threads

# Shared model handle; stays None until load_model() assigns the Llama instance.
llm = None
# Set to True by load_model() once `llm` is ready; checked before every generation.
model_loaded = False

def load_model(progress=gr.Progress()):
    """Download the GGUF model and load it into the module-level ``llm``.

    This function is registered as the ``demo.load`` handler, so it is invoked
    on every page load, not just once per process. After the first successful
    load it returns immediately, so later visits neither rebuild the model nor
    rebind ``llm`` while a generation may still be streaming from it.

    Args:
        progress: Gradio progress tracker used to report the loading stages.

    Returns:
        A status message for the "Loading Status" textbox.
    """
    global llm, model_loaded

    # Already loaded by an earlier page visit: nothing to do.
    # NOTE(review): there is no lock here; this assumes load events are not
    # run concurrently by the Gradio queue -- confirm for the deployed version.
    if model_loaded:
        return "モデルの読み込みが完了しました。"

    progress(0, desc="モデルのダウンロードを開始")
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    progress(0.5, desc="モデルをメモリに読み込み中")

    llm = Llama(
        model_path=model_path,
        n_threads=N_THREADS,
        n_batch=32,
        verbose=False,
        n_ctx=CONTEXT_SIZE,
    )
    progress(1, desc="モデルの読み込み完了")
    # Flip the flag only after `llm` is assigned so readers never see a
    # "loaded" state with a missing model.
    model_loaded = True
    return "モデルの読み込みが完了しました。"
    

def get_llama_response(prompt, temperature):
    """Stream completion chunks for *prompt* from the loaded model.

    Each yielded item is a dict shaped like ``{"choices": [{"text": ...}]}``.
    If the model is not loaded yet, a single chunk carrying a "still loading"
    message is yielded. If generation fails, whether when the call is made or
    part-way through the stream, a final chunk carrying the error message is
    yielded instead of the exception propagating to the caller.

    Args:
        prompt: Prompt text passed to the model.
        temperature: Sampling temperature passed to the model.

    Yields:
        Completion chunk dicts in the order the model produces them.
    """
    if not model_loaded:
        yield {"choices": [{"text": "モデルを読み込んでいます。しばらくお待ちください..."}]}
        return

    try:
        # With stream=True the call returns a lazy iterator, so the iteration
        # itself has to happen inside the try block for errors to be caught.
        yield from llm(
            prompt,
            max_tokens=1024,
            temperature=temperature,
            top_p=0.95,
            repeat_penalty=1.1,
            stream=True,
        )
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the stream.
        yield {"choices": [{"text": f"エラーが発生しました: {e}"}]}


def greet(prompt, temperature):
    """Stream the model's answer to *prompt* as progressively longer strings.

    This is a generator: every yielded value is the full response accumulated
    so far, so the output textbox is overwritten with the growing text. While
    the model is still loading, a single "please wait" message is yielded.
    (A plain ``return <message>`` inside a generator is never delivered to the
    consumer, which is why the message is yielded.)

    Args:
        prompt: Prompt text entered by the user.
        temperature: Sampling temperature from the slider.

    Yields:
        The accumulated response text after each received chunk.
    """
    if not model_loaded:
        yield "モデルを読み込んでいます。しばらくお待ちください..."
        return

    full_response = ""
    for output in get_llama_response(prompt, temperature):
        choices = output["choices"]
        # Skip chunks that carry no choices.
        if not choices:
            continue
        full_response += choices[0]["text"]
        yield full_response


# Build the single-page chat UI and wire its events.
with gr.Blocks() as demo:
    gr.Markdown("# LLMチャットボット(Streaming)")

    # (text, label) pairs shown in the disclaimer banner.
    disclaimer_segments = [
        ("", None),
        ("軽量なLLM", ""),
        ("の", None),
        ("テストアプリケーション", "negative"),
        ("です。\n", None),
        ("内容は実験的", "neutral"),
        ("なものであり", None),
        ("正確性や適切性", None),
        ("は保証されません。", None),
        ("重要な意思決定", None),
        ("には使用しないでください。", None),
    ]
    gr.HighlightedText(value=disclaimer_segments, label="注意:", show_label=False)

    # Read-only display of the model file in use.
    gr.Textbox(value=filename, label="モデル：", interactive=False)

    loading_status = gr.Textbox(label="Loading Status")

    with gr.Row():
        input_text = gr.Textbox(label="プロンプトを入力してください")
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.7,
            step=0.1,
            label="Temperature",
        )

    output_text = gr.Textbox(label="生成されたレスポンス")
    submit_button = gr.Button("送信")

    # The button click and the textbox submit event both stream greet()
    # into the output box with the same inputs.
    generation_inputs = [input_text, temperature]
    for register_event in (submit_button.click, input_text.submit):
        register_event(fn=greet, inputs=generation_inputs, outputs=output_text)

    # Start loading the model when the page loads and report its status.
    demo.load(fn=load_model, outputs=loading_status)


demo.queue()
demo.launch()