# Hugging Face Space: Running on Zero (ZeroGPU)
import spaces
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
from ui import css, PLACEHOLDER
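
# Module-level cache: the loaded model persists across requests, so a new
# Llama instance is only built when the user selects a different model.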
llm = None
llm_model = None

# hf_hub_download(repo_id="bartowski/dolphin-2.9.1-yi-1.5-34b-GGUF", filename="dolphin-2.9.1-yi-1.5-34b-Q6_K.gguf", local_dir="./models")
# hf_hub_download(repo_id="crusoeai/dolphin-2.9.1-llama-3-70b-GGUF", filename="dolphin-2.9.1-llama-3-70b.Q3_K_M.gguf", local_dir="./models")
hf_hub_download(repo_id="bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF", filename="cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf", local_dir="./models")
# hf_hub_download(repo_id="mradermacher/Dolphin3.0-Mistral-24B-GGUF", filename="Dolphin3.0-Mistral-24B.Q8_0.gguf", local_dir="./models")
# hf_hub_download(repo_id="kroonen/dolphin-2.9.2-Phi-3-Medium-GGUF", filename="dolphin-2.9.2-Phi-3-Medium-Q6_K.gguf", local_dir="./models")
hf_hub_download(repo_id="cognitivecomputations/dolphin-2.9.2-qwen2-72b-gguf", filename="qwen2-Q3_K_M.gguf", local_dir="./models")
@spaces.GPU  # required on ZeroGPU Spaces: allocates a GPU for each call
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    global llm
    global llm_model

    # Load (or reload) the model only when none is loaded yet or the
    # selection changed since the last call.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=81,
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model
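
    # Wrap the raw llama.cpp handle in a provider the agent framework can drive;
    # the system prompt below enforces Dolphin-R1's <think>{reasoning}</think>{answer}
    # output format.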
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt="You are Dolphin, an AI assistant that helps humanity, trained to specialize in reasoning and first-principles analysis. When responding, always format your replies using <think>{reasoning}</think>{answer}. Use at least 6 reasoning steps and perform a root cause analysis before answering. However, if the answer is very easy and requires little thought, you may leave the <think></think> block empty. Your responses should be detailed, structured with rich Markdown formatting, and engaging with emojis. Be extensive in your explanations, just as the greatest scientific minds would be. Always reason through the problem first, unless it's trivial, in which case you may answer directly.",
        predefined_messages_formatter_type=MessagesFormatterType.CHATML,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True
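
    # Replay Gradio's (user, assistant) history tuples into the agent's own
    # chat-history object so the model sees the full conversation.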
    messages = BasicChatHistory()
    for user_msg, assistant_msg in history:
        messages.add_message({
            'role': Roles.user,
            'content': user_msg,
        })
        messages.add_message({
            'role': Roles.assistant,
            'content': assistant_msg,
        })

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Accumulate the streamed tokens and yield the growing answer so the
    # Gradio UI updates incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
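
# The ChatInterface wires respond() to the UI; the widgets listed in
# additional_inputs are passed to respond() positionally, after message
# and history.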
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown(
            [
                'cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf',
                'qwen2-Q3_K_M.gguf',
            ],
            value="cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf",
            label="Model",
        ),
        gr.Slider(minimum=1, maximum=8192, value=8192, step=1, label="Max tokens"),
        gr.Slider(minimum=0.05, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
        gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
    ).set(
        body_background_fill_dark="#0f172a",
        block_background_fill_dark="#0f172a",
        block_border_width="1px",
        block_title_background_fill_dark="#070d1b",
        input_background_fill_dark="#0c1425",
        button_secondary_background_fill_dark="#070d1b",
        border_color_accent_dark="#21293b",
        border_color_primary_dark="#21293b",
        background_fill_secondary_dark="#0f172a",
        color_accent_soft_dark="transparent",
    ),
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="Cognitive Computation: Chat Dolphin 🐬",
    chatbot=gr.Chatbot(
        scale=1,
        placeholder=PLACEHOLDER,
        show_copy_button=True,
    ),
)
if __name__ == "__main__":
    demo.launch()
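
# Local-run sketch (assumptions: this file is saved as app.py, a ui.py providing
# css and PLACEHOLDER sits next to it, llama-cpp-python is built with GPU
# support, and there is enough VRAM for the selected GGUF — n_gpu_layers=81
# offloads every layer):
#
#   pip install spaces gradio huggingface_hub llama-cpp-python llama-cpp-agent
#   python app.py
#
# Outside ZeroGPU the spaces.GPU decorator should act as a no-op. Note that
# this snapshot targets Gradio 4.x: retry_btn / undo_btn / clear_btn were
# removed from gr.ChatInterface in Gradio 5, so pin gradio<5 if reproducing.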