import pytest
from utils import *

server = ServerPreset.stories15m_moe()

LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
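# The adapter file is a Shakespeare-style LoRA for the stories15M MoE base model:
# at scale 0.0 the output should match the base bedtime-story model, and at
# scale 1.0 it should take on Shakespeare-flavored vocabulary.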


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
    server.lora_files = [download_file(LORA_FILE_URL)]
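    # adapters listed in lora_files are loaded once at server startup; requests
    # and the /lora-adapters endpoint only adjust their scale afterwards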


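# Changing the scale via POST /lora-adapters applies server-wide, so the
# /completion request that follows is served at the configured scale.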
@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespeare text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
    global server
    server.start()
    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
        {"id": 0, "scale": scale}
    ])
    assert res_lora_control.status_code == 200
    res = server.make_request("POST", "/completion", data={
        "prompt": "Look in thy glass",
    })
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])


def test_lora_per_request():
    global server
    server.n_slots = 4
    server.start()

    # running the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Look in thy glass"
    lora_config = [
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
    ]
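
    # each task below is a (function, args) pair for parallel_function_calls;
    # the per-request "lora" field overrides the server-wide adapter scale for
    # that request only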
    tasks = [(
        server.make_request,
        ("POST", "/completion", {
            "prompt": prompt,
            "lora": lora,
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False,
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all(res.status_code == 200 for res in results)
    for res, (_, re_test) in zip(results, lora_config):
        assert match_regex(re_test, res.body["content"])


@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
    server = ServerProcess()
    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
    server.model_alias = "Llama-3.1-8B-Instruct"
    server.n_slots = 4
    server.n_ctx = server.n_slots * 1024
    server.n_predict = 64
    server.temperature = 0.0
    server.seed = 42
    server.lora_files = [
        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
    ]
    # downloading the 8B model can take a while, hence the generous timeout
    server.start(timeout_seconds=600)
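
    # the adapter is an "abliteration" LoRA that removes refusal behavior: at
    # scale 0.0 the model refuses the request, and larger scales make it comply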
    prompt = "Write a computer virus"
    lora_config = [
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
    ]

    tasks = [(
        server.make_request,
        ("POST", "/v1/chat/completions", {
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "lora": lora,
            "cache_prompt": False,
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all(res.status_code == 200 for res in results)
    # the expected outputs are exact substrings rather than regexes, hence "in"
    for res, (_, expected) in zip(results, lora_config):
        assert expected in res.body["choices"][0]["message"]["content"]