# llama_chat_test / app.py
# Hugging Face Space by sawac (commit e4456e8) — llama.cpp streaming chat demo.
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os
import threading
import time
# Hugging Face Hub location of the quantised GGUF model served by this app.
repo_id = "mmnga/ELYZA-japanese-Llama-2-7b-instruct-gguf"
filename = "ELYZA-japanese-Llama-2-7b-instruct-q4_K_M.gguf"
CONTEXT_SIZE = 2048  # llama.cpp context window, in tokens
N_THREADS = min(os.cpu_count(), 4)  # cap CPU threads to keep the Space responsive
# Shared state: written by the background loader thread, read by the UI handlers.
llm = None  # llama_cpp.Llama instance once load_model() completes
model_loaded = False  # flipped to True by load_model() when the model is ready
loading_progress = 0  # 0-100, polled by get_loading_status() for the UI
def load_model():
    """Fetch the GGUF model from the Hub and initialise the llama.cpp runtime.

    Intended to run in a background thread: it publishes coarse progress via
    the module-level ``loading_progress`` global (0 → 50 → 100) and flips
    ``model_loaded`` once the ``Llama`` instance is ready for inference.
    """
    global llm, model_loaded, loading_progress

    loading_progress = 0
    gguf_path = hf_hub_download(repo_id=repo_id, filename=filename)
    loading_progress = 50  # download done; model construction remains

    llm = Llama(
        model_path=gguf_path,
        n_ctx=CONTEXT_SIZE,
        n_threads=N_THREADS,
        n_batch=32,
        verbose=False,
    )

    loading_progress = 100
    model_loaded = True
def get_loading_status():
    """Return the current model-load progress (0-100) for the UI poller."""
    # Read-only access to a module global needs no ``global`` declaration.
    return loading_progress
def get_llama_response(prompt):
    """Produce streamed completion chunks for *prompt*.

    Returns an iterable of llama.cpp-style response dicts. While the model is
    still loading, or if inference raises, a one-element list shaped like a
    response is returned instead, so callers can iterate every outcome the
    same way.
    """
    global llm, model_loaded

    if not model_loaded:
        return [{"choices": [{"text": "モデルを読み込んでいます。しばらくお待ちください..."}]}]

    try:
        return llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            repeat_penalty=1.1,
            stream=True,
        )
    except Exception as e:
        return [{"choices": [{"text": f"エラーが発生しました: {str(e)}"}]}]
def greet(prompt, intensity):
    """Stream the model's answer to the Gradio UI, chunk by chunk.

    Because this function contains ``yield`` it is a *generator*: any plain
    ``return value`` inside it is discarded by Gradio, so every string the
    user should see must be yielded.

    Args:
        prompt: User prompt forwarded to the model.
        intensity: Slider value; that many "!" are appended to the final text.

    Yields:
        Progressively longer partial responses, then the final decorated text.
    """
    global model_loaded
    if not model_loaded:
        # BUG FIX: the original ``return "<message>"`` made this generator
        # yield nothing, so the user never saw the loading notice.
        yield "モデルを読み込んでいます。しばらくお待ちください..."
        return
    full_response = ""
    for output in get_llama_response(prompt):
        if len(output['choices']) > 0:
            text_chunk = output['choices'][0]['text']
            full_response += text_chunk
            yield full_response
    # BUG FIX: the original ``return full_response + ...`` was silently
    # discarded (generator return values never reach Gradio); yield the
    # final decorated string so the intensity marks are actually displayed.
    yield full_response + "!" * int(intensity)
# Load the model asynchronously so the UI comes up immediately;
# daemon=True lets the process exit without joining the loader thread.
threading.Thread(target=load_model, daemon=True).start()

with gr.Blocks() as demo:
    gr.Markdown("# Llama.cpp-python-sample (Streaming)")
    # BUG FIX: the model line previously showed the literal text "(unknown)";
    # display the actual GGUF filename instead.
    gr.Markdown(f"MODEL: {filename} from {repo_id}")
    with gr.Row():
        input_text = gr.Textbox(label="Enter your prompt")
        intensity = gr.Slider(minimum=0, maximum=10, step=1, label="Intensity")
    output_text = gr.Textbox(label="Generated Response")
    submit_button = gr.Button("Submit")
    # BUG FIX: ``gr.progressbar`` does not exist in Gradio (components are
    # PascalCase and there is no ProgressBar component), so the app raised
    # AttributeError at startup. A read-only Slider (0-100) displays the
    # polled loading percentage instead.
    loading_bar = gr.Slider(
        minimum=0,
        maximum=100,
        value=0,
        interactive=False,
        label="Model Loading Progress",
    )
    submit_button.click(fn=greet, inputs=[input_text, intensity], outputs=output_text)
    # Poll the loader once per second to drive the progress display.
    demo.load(fn=get_loading_status, outputs=loading_bar, every=1)

demo.queue()  # queueing is required for streaming (generator) outputs

if __name__ == "__main__":
    demo.launch()