# Spaces: Running  (hosting-page status text captured with the source; not part of the program)
import json
from typing import Iterator, Optional

import gradio as gr
from huggingface_hub import snapshot_download
from llama_cpp import Llama
# Hub repository and the single quantized weights file to fetch from it.
repo_name = "PY007/TinyLlama-1.1B-Chat-v0.2-GGUF"
model_name = "ggml-model-q4_0.gguf"

# Fetch only the file matching model_name into the current directory, so the
# relative model_path used below resolves against the working directory.
snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

# Loaded once at import time; generate() reuses this single instance.
model = Llama(
    model_path=model_name,
    n_ctx=1024,  # context window size, in tokens
    # NOTE(review): n_parts looks like a legacy llama.cpp option — confirm the
    # installed llama_cpp version still accepts (or silently ignores) it.
    n_parts=1,
)

# Chat prompt wrapper: the user's text replaces {} and the model continues
# writing after the trailing assistant tag.
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
def generate(
    input: Optional[str] = None,
    temperature: float = 0.1,
    top_p: float = 0.75,
    top_k: int = 40,
    max_tokens: int = 512,
) -> Iterator[str]:
    """Stream a completion for *input*, yielding the text accumulated so far.

    The user text is wrapped in the module-level ``template`` and sent to the
    module-level ``model`` in streaming mode. After every streamed chunk the
    full text produced up to that point is yielded (not just the new piece),
    so a consumer can simply display the latest value.

    Args:
        input: User prompt. ``None`` is treated as an empty prompt.
        temperature: Sampling temperature forwarded to the model.
        top_p: Nucleus-sampling threshold forwarded to the model.
        top_k: Top-k sampling cutoff; coerced to ``int`` before use.
        max_tokens: Upper bound on generated tokens; coerced to ``int``.

    Yields:
        The cumulative generated text after each streamed chunk.

    Returns:
        The final text, as the generator's return value (only visible to
        callers that inspect ``StopIteration.value`` or use ``yield from``).
    """
    # Without this, an unset prompt would be formatted as the literal "None".
    user_text = "" if input is None else input
    prompt = template.format(user_text)

    stream = model.create_completion(
        prompt,
        temperature=temperature,
        # UI sliders may hand these over as floats; both are integer counts.
        top_k=int(top_k),
        top_p=top_p,
        max_tokens=int(max_tokens),
        # Stop at the end-of-turn marker so the model does not start a new turn.
        stop=["<|im_end|>"],
        echo=False,
        stream=True,
    )

    output = ""
    for chunk in stream:
        output += chunk["choices"][0]["text"]
        yield output
    return output
# Web UI: a prompt box plus sampling controls, wired to the streaming
# generate() callback. The order of `inputs` must match generate()'s
# parameters: input, temperature, top_p, top_k, max_tokens.
g = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(lines=2, label="Prompt", value="What is Huggingface?"),
        gr.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=1, value=1, label="Top p"),
        gr.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
        gr.Slider(minimum=1, maximum=1024, step=1, value=256, label="Max tokens"),
    ],
    outputs=[
        gr.Textbox(lines=10, label="Output"),
    ],
    title="TinyLlama 1.1B Chat GGUF",
    # NOTE(review): the quantized-model link below names a different repo than
    # the one downloaded via repo_name — confirm which one is intended.
    # The text is kept flush-left so the rendered description is unchanged.
    description="""
original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
""",
)

# Queueing is enabled so the generator callback can stream partial output.
# NOTE(review): `concurrency_count` is a Gradio 3.x argument; Gradio 4 replaced
# it with `default_concurrency_limit` — confirm against the pinned version.
g.queue(concurrency_count=1)
g.launch()