File size: 8,945 Bytes
bca5262 9b7bb96 bca5262 8eb8720 bca5262 e5bc143 71cd21b bca5262 e5bc143 bca5262 3fab3f5 bca5262 3fab3f5 bca5262 9b7bb96 bca5262 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
from modules.load_configure import *
import time
if echo == "True":
from modules.echo import *
import os
import gradio as gr
import copy
import llama_cpp
from llama_cpp import Llama
import random
from huggingface_hub import hf_hub_download
from modules.download_model import download_model
from modules.inference import load_model, generate_text
from modules.model_list import list_models
from modules.render_markdown import render_md
from modules.load_presets import load_presets_names, load_presets_value
from modules.arg_parser import *
#from blip.blip_engine import blip_run
# Working directory at start-up; the model dropdown further down lists the
# contents of "<dir>/models".
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because later code in this file reads it.
dir = os.getcwd()

# `footer` is not defined in this file (presumably supplied by one of the
# star imports above — verify). It is compared as the string "True", and the
# result controls whether the footer links are visible.
footer_vis = footer == "True"

history = []

# Chat display handed to gr.ChatInterface; created up front, before the
# gr.Blocks context is opened.
chatbot = gr.Chatbot(
    show_label=False,
    layout=chat_style,
    show_copy_button=True,
    height=500,
    min_width=180,
)
# Builds the whole interface: chat panel, hideable "Options" panel (sampling
# sliders, prompt preset, model picker), hideable "Tabs" area (model download,
# notebook, model settings), footer, and the event wiring between them.
#
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a listing whose indentation had been lost. The component set and their
# arguments are unchanged, but the exact parent of a few rows (footer row,
# options row, download button / log area) should be confirmed against the
# intended layout.
with gr.Blocks(theme=theme, title=f"TensorLM v{tlm_version} Demo", css="style.css") as webui:
    gr.Markdown("""
[**Run it local**](https://github.com/ehristoforu/TensorLM-webui)
""")
    #refresh_model = gr.Button(value="Load model", interactive=True, scale=1)

    with gr.Row():
        # Created with render=False so the panel can be placed later with
        # sliders.render() inside the "Options" row.
        with gr.Row(render=False, variant="panel") as sliders:
            with gr.Tab("Parameters"):
                max_tokens = gr.Slider(label="Max new tokens", minimum=256, maximum=4056, value=512, step=8, interactive=True)
                temperature = gr.Slider(label="Temperature", minimum=0.01, maximum=2.00, value=0.15, step=0.01, interactive=True)
                top_p = gr.Slider(label="Top P", minimum=0.01, maximum=2.00, value=0.10, step=0.01, interactive=True)
                top_k = gr.Slider(label="Top K", minimum=10.00, maximum=100.00, value=40.00, step=0.01, interactive=True)
                repeat_penalty = gr.Slider(label="Repeat penalty", minimum=0.01, maximum=2.00, value=1.10, step=0.01, interactive=True)
            with gr.Tab("Instructions"):
                # Fetch the preset names once; the second entry is the default.
                _preset_names = load_presets_names()
                preset = gr.Radio(label="Prompt preset", choices=_preset_names, value=_preset_names[1], interactive=True)
                system_prompt = gr.Textbox(label="Custom system prompt", max_lines=4, lines=3, interactive=True)
            with gr.Tab("Model"):
                model = gr.Dropdown(
                    label="Model (only based on Llama in GGML format (.bin))",
                    choices=os.listdir(f"{dir}/models"),
                    value="llama-2-7b-chat.ggmlv3.q2_K.bin",
                    interactive=False,
                    allow_custom_value=False,
                    scale=50,
                )

        # Also deferred (render=False); placed in the "Settings" tab below
        # through settings.render().
        with gr.Row(render=False) as settings:
            reload_model = gr.Button("Apply settings to model", interactive=True)
            n_ctx = gr.Slider(label="Number of CTX", minimum=1024, maximum=4056, value=2048, step=8, interactive=True)
            n_gpu_layers = gr.Slider(label="Number of GPU layers", minimum=0, maximum=36, value=0, step=1, interactive=False)
            n_threads = gr.Slider(label="Number of Threads", minimum=2, maximum=36, value=8, step=1, interactive=True)
            verbose = gr.Checkbox(label="Verbose", value=True, interactive=True)
            f16_kv = gr.Checkbox(label="F16 KV", value=True, interactive=True)
            logits_all = gr.Checkbox(label="Logits all", value=False, interactive=True)
            vocab_only = gr.Checkbox(label="Vocab only", value=False, interactive=True)
            use_mmap = gr.Checkbox(label="Use mmap", value=True, interactive=True)
            use_mlock = gr.Checkbox(label="Use mlock", value=False, interactive=True)
            n_batch = gr.Slider(label="Number of batch", minimum=128, maximum=2048, value=512, step=8, interactive=True)
            last_n_tokens_size = gr.Slider(label="Last number of tokens size", minimum=8, maximum=512, value=64, step=8, interactive=True)
            low_vram = gr.Checkbox(label="Low VRAM", value=lowvram_arg, interactive=False)
            rope_freq_base = gr.Slider(label="Rope freq base", minimum=1000.0, maximum=30000.0, value=10000.0, step=0.1, interactive=True)
            rope_freq_scale = gr.Slider(label="Rope freq scale", minimum=0.1, maximum=3.0, value=1.0, step=0.1)

        # Model choice plus loader settings, in the order handed to load_model
        # and appended to the chat inputs. Kept in one place so the three call
        # sites cannot drift apart.
        _model_inputs = [
            model, n_ctx, n_gpu_layers, n_threads, verbose, f16_kv, logits_all,
            vocab_only, use_mmap, use_mlock, n_batch, last_n_tokens_size,
            low_vram, rope_freq_base, rope_freq_scale,
        ]

        with gr.Column(scale=2):
            with gr.Row():
                gr.ChatInterface(
                    generate_text,
                    chatbot=chatbot,
                    retry_btn="🔄️",
                    submit_btn="📨",
                    undo_btn="↩️",
                    clear_btn="🗑️",
                    # Fix: this list previously contained top_k twice and never
                    # top_p, so the "Top P" slider had no effect on generation.
                    # NOTE(review): order assumed to mirror generate_text's
                    # parameters in modules.inference — confirm.
                    additional_inputs=[
                        system_prompt, preset, temperature, max_tokens,
                        top_p, top_k, repeat_penalty,
                    ] + _model_inputs,
                )
            with gr.Row():
                options_change = gr.Checkbox(label="Options", value=False, interactive=True)
                tabs_change = gr.Checkbox(label="Tabs", value=False, interactive=True)
            with gr.Row():
                # Hidden until the "Tabs" checkbox is ticked.
                with gr.Row(visible=False) as tabs:
                    with gr.Tab("ModelGet"):
                        gr.Markdown("## Download model from 🤗 HuggingFace.co (DON'T WORK IN DEMO)")
                        with gr.Row():
                            repo_id = gr.Textbox(label="REPO_ID", value="ehristoforu/LLMs", lines=1, max_lines=1, interactive=False)
                            filename = gr.Dropdown(
                                label="FILENAME",
                                interactive=False,
                                choices=[
                                    "llama-2-7b-chat.ggmlv3.q2_K.bin",
                                    "llama-2-13b-chat.ggmlv3.q2_K.bin",
                                    "codellama-7b-instruct.ggmlv3.Q2_K.bin",
                                    "codellama-13b-instruct.ggmlv3.Q2_K.bin",
                                    "saiga-13b.ggmlv3.Q4_1.bin",
                                    "saiga-30b.ggmlv3.Q3_K.bin",
                                ],
                                value="",
                                allow_custom_value=False,
                            )
                        download_btn = gr.Button(value="Download", interactive=False)
                        logs = gr.Markdown()
                    with gr.Tab("Notebook"):
                        with gr.Row():
                            with gr.Column(scale=1):
                                render_markdown = gr.Button(value="Render markdown", interactive=True)
                                notebook = gr.Textbox(
                                    show_label=False,
                                    value="This is a great day...",
                                    placeholder="Your notebook",
                                    max_lines=40,
                                    lines=35,
                                    interactive=True,
                                    show_copy_button=True,
                                )
                        with gr.Row():
                            with gr.Column(scale=1):
                                markdown = gr.Markdown()
                    with gr.Tab("Settings"):
                        with gr.Row():
                            with gr.Column():
                                #with gr.Row():
                                #    gr.Markdown("### Style")
                                #    chat_style = gr.Dropdown(label="Style of chat", choices=["bubble", "panel"], value="bubble", interactive=True, allow_custom_value=False)
                                settings.render()
            with gr.Row():
                # Footer links; shown only when footer_vis is true.
                gr.Markdown(f"""
<center><a href="https://github.com/ehristoforu/TensorLM-webui">v{tlm_version}demo</a> | <a href="/?view=api">API</a> | <a href="https://gradio.app">gradio 4.1.0</a> | <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> | <a href="https://python.org">python</a> | <a href="https://huggingface.co/TheBloke?search_models=GGML">Suggested models</a></center>
""", visible=footer_vis)

        # Hidden until the "Options" checkbox is ticked.
        with gr.Row(visible=False) as options:
            with gr.Column(scale=1):
                sliders.render()

    # --- Event wiring -------------------------------------------------------

    # Notebook preview: refresh on button press and on every text change.
    render_markdown.click(
        fn=render_md,
        inputs=notebook,
        outputs=markdown,
        queue=False,
        api_name=False,
    )
    notebook.change(
        fn=render_md,
        inputs=notebook,
        outputs=markdown,
        queue=False,
        api_name=False,
    )

    # Each checkbox shows or hides its panel by passing its own value as the
    # panel's `visible` flag.
    options_change.change(
        fn=lambda x: gr.update(visible=x),
        inputs=options_change,
        outputs=options,
        queue=False,
        api_name=False,
    )
    tabs_change.change(
        fn=lambda x: gr.update(visible=x),
        inputs=tabs_change,
        outputs=tabs,
        queue=False,
        api_name=False,
    )

    download_btn.click(download_model, inputs=[repo_id, filename], outputs=logs, api_name=False, queue=False)

    # load_model is called with the current settings both when the model
    # dropdown changes and when "Apply settings to model" is pressed; its
    # result is written back to the dropdown.
    model.change(load_model, inputs=_model_inputs, outputs=model, api_name=False, queue=False)
    reload_model.click(load_model, inputs=_model_inputs, outputs=model, api_name=False, queue=False)
# Start the Gradio server for the `webui` interface.
# The option values (inbrowser_arg, debug_arg, quiet_arg, show_api,
# share_server_protocol) are not defined in this file; presumably they come
# from the star imports at the top — verify against modules.arg_parser /
# modules.load_configure.
# A stray "|" that followed the closing parenthesis (a copy/extraction
# artifact that made the statement a syntax error) has been removed.
webui.launch(
    inbrowser=inbrowser_arg,
    debug=debug_arg,
    quiet=quiet_arg,
    favicon_path="assets/favicon.png",
    show_api=show_api,
    share_server_protocol=share_server_protocol,
)