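# Minimal Gradio front end for a local GGUF chat model served with llama-cpp-python.
# The quantised model file is downloaded from the Hugging Face Hub and loaded in a
# background thread so the UI can start immediately and report loading progress.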
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os
import threading

repo_id = "mmnga/ELYZA-japanese-Llama-2-7b-instruct-gguf"
filename = "ELYZA-japanese-Llama-2-7b-instruct-q4_K_M.gguf"

# Keep the context window and thread count modest so inference fits on a small CPU instance.
CONTEXT_SIZE = 2048
N_THREADS = min(os.cpu_count() or 4, 4)  # os.cpu_count() can return None

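# State shared between the background loader thread and the Gradio callbacks.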
llm = None
model_loaded = False
loading_progress = 0

def load_model():
    """Download the GGUF file and initialise llama.cpp; runs in a background thread."""
    global llm, model_loaded, loading_progress
    loading_progress = 0
    # hf_hub_download reuses the local cache if the file has already been fetched.
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    loading_progress = 50
    llm = Llama(
        model_path=model_path,
        n_threads=N_THREADS,
        n_batch=32,
        verbose=False,
        n_ctx=CONTEXT_SIZE,
    )
    loading_progress = 100
    model_loaded = True

def get_loading_status():
    """Polled by the UI to report download/initialisation progress (0-100)."""
    return loading_progress

def get_llama_response(prompt):
    """Return an iterable of completion chunks (streaming once the model is ready)."""
    global llm, model_loaded
    if not model_loaded:
        return [{"choices": [{"text": "The model is still loading. Please wait..."}]}]
    try:
        return llm(prompt, max_tokens=1024, temperature=0.7, top_p=0.95, repeat_penalty=1.1, stream=True)
    except Exception as e:
        return [{"choices": [{"text": f"An error occurred: {str(e)}"}]}]

def greet(prompt, intensity):
    """Stream the model's answer to the textbox, then append the intensity marks."""
    global model_loaded
    if not model_loaded:
        # greet is a generator, so messages must be yielded rather than returned.
        yield "The model is still loading. Please wait..."
        return

    full_response = ""
    for output in get_llama_response(prompt):
        if len(output['choices']) > 0:
            text_chunk = output['choices'][0]['text']
            full_response += text_chunk
            yield full_response

    # A generator's return value is discarded by Gradio, so yield the final string.
    yield full_response + "!" * int(intensity)

# Load the model asynchronously so the interface starts without blocking.
threading.Thread(target=load_model, daemon=True).start()

with gr.Blocks() as demo:
    gr.Markdown("# Llama.cpp-python-sample (Streaming)")
    gr.Markdown(f"MODEL: {filename} from {repo_id}")
    
    with gr.Row():
        input_text = gr.Textbox(label="Enter your prompt")
        intensity = gr.Slider(minimum=0, maximum=10, step=1, label="Intensity")
    
    output_text = gr.Textbox(label="Generated Response")
    submit_button = gr.Button("Submit")
    
    # gr.progressbar is not a Gradio component; a read-only slider stands in as a
    # simple progress indicator, refreshed once per second via demo.load(every=1).
    loading_bar = gr.Slider(minimum=0, maximum=100, value=0, label="Model Loading Progress (%)", interactive=False)
    
    submit_button.click(fn=greet, inputs=[input_text, intensity], outputs=output_text)
    demo.load(fn=get_loading_status, outputs=loading_bar, every=1)

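# Enable queuing, which Gradio uses for streaming (generator) outputs and the every=1 polling above.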
demo.queue()
if __name__ == "__main__":
    demo.launch()