# Hugging Face Space: Running on Zero (ZeroGPU)
import spaces
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
from ui import css, PLACEHOLDER
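
# Module-level cache: the loaded model persists across requests, so a new
# Llama instance is only built when the user selects a different model.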
llm = None
llm_model = None

# hf_hub_download(repo_id="bartowski/dolphin-2.9.1-yi-1.5-34b-GGUF", filename="dolphin-2.9.1-yi-1.5-34b-Q6_K.gguf", local_dir="./models")
# hf_hub_download(repo_id="crusoeai/dolphin-2.9.1-llama-3-70b-GGUF", filename="dolphin-2.9.1-llama-3-70b.Q3_K_M.gguf", local_dir="./models")
hf_hub_download(repo_id="bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF", filename="cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf", local_dir="./models")
# hf_hub_download(repo_id="mradermacher/Dolphin3.0-Mistral-24B-GGUF", filename="Dolphin3.0-Mistral-24B.Q8_0.gguf", local_dir="./models")
# hf_hub_download(repo_id="kroonen/dolphin-2.9.2-Phi-3-Medium-GGUF", filename="dolphin-2.9.2-Phi-3-Medium-Q6_K.gguf", local_dir="./models")
hf_hub_download(repo_id="cognitivecomputations/dolphin-2.9.2-qwen2-72b-gguf", filename="qwen2-Q3_K_M.gguf", local_dir="./models")
@spaces.GPU  # required on ZeroGPU Spaces: allocates a GPU for each call
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    global llm
    global llm_model

    # Load (or reload) the model only when none is loaded yet or the
    # selection changed since the last call.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=81,
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model
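
    # Wrap the raw llama.cpp handle in a provider the agent framework can drive;
    # the system prompt below enforces Dolphin-R1's <think>{reasoning}</think>{answer}
    # output format.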
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt="You are Dolphin, an AI assistant that helps humanity, trained to specialize in reasoning and first-principles analysis. When responding, always format your replies using <think>{reasoning}</think>{answer}. Use at least 6 reasoning steps and perform a root cause analysis before answering. However, if the answer is very easy and requires little thought, you may leave the <think></think> block empty. Your responses should be detailed, structured with rich Markdown formatting, and engaging with emojis. Be extensive in your explanations, just as the greatest scientific minds would be. Always reason through the problem first, unless it's trivial, in which case you may answer directly.",
        predefined_messages_formatter_type=MessagesFormatterType.CHATML,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True
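
    # Replay Gradio's (user, assistant) history tuples into the agent's own
    # chat-history object so the model sees the full conversation.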
    messages = BasicChatHistory()
    for user_msg, assistant_msg in history:
        messages.add_message({
            'role': Roles.user,
            'content': user_msg,
        })
        messages.add_message({
            'role': Roles.assistant,
            'content': assistant_msg,
        })

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Accumulate the streamed tokens and yield the growing answer so the
    # Gradio UI updates incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
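
# The ChatInterface wires respond() to the UI; the widgets listed in
# additional_inputs are passed to respond() positionally, after message
# and history.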
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown(
            [
                'cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf',
                'qwen2-Q3_K_M.gguf',
            ],
            value="cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q8_0.gguf",
            label="Model",
        ),
        gr.Slider(minimum=1, maximum=8192, value=8192, step=1, label="Max tokens"),
        gr.Slider(minimum=0.05, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
        gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
    ).set(
        body_background_fill_dark="#0f172a",
        block_background_fill_dark="#0f172a",
        block_border_width="1px",
        block_title_background_fill_dark="#070d1b",
        input_background_fill_dark="#0c1425",
        button_secondary_background_fill_dark="#070d1b",
        border_color_accent_dark="#21293b",
        border_color_primary_dark="#21293b",
        background_fill_secondary_dark="#0f172a",
        color_accent_soft_dark="transparent",
    ),
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="Cognitive Computation: Chat Dolphin 🐬",
    chatbot=gr.Chatbot(
        scale=1,
        placeholder=PLACEHOLDER,
        show_copy_button=True,
    ),
)
if __name__ == "__main__":
    demo.launch()
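
# Local-run sketch (assumptions: this file is saved as app.py, a ui.py providing
# css and PLACEHOLDER sits next to it, llama-cpp-python is built with GPU
# support, and there is enough VRAM for the selected GGUF — n_gpu_layers=81
# offloads every layer):
#
#   pip install spaces gradio huggingface_hub llama-cpp-python llama-cpp-agent
#   python app.py
#
# Outside ZeroGPU the spaces.GPU decorator should act as a no-op. Note that
# this snapshot targets Gradio 4.x: retry_btn / undo_btn / clear_btn were
# removed from gr.ChatInterface in Gradio 5, so pin gradio<5 if reproducing.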