s1.1-32B

Runtime error

App Files Files Community

s1.1-32B / app.py

bobber

Update app.py

588eaad verified 29 days ago

raw

history blame

2.38 kB

	import gradio as gr
	import spaces
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from transformers import AutoProcessor, Llama4ForConditionalGeneration

	import torch

	from transformers import BitsAndBytesConfig
	# Quantization settings handed to from_pretrained below: 4-bit loading is
	# requested and the fp32 CPU-offload flag is switched on.
	# NOTE(review): the selected checkpoint name ends in "bnb-4bit", so it is
	# presumably already quantized — confirm this config is still needed.
	bnb_config = BitsAndBytesConfig(
	load_in_4bit=True,
	llm_int8_enable_fp32_cpu_offload=True,
	)

	# Checkpoints tried previously are kept below as commented-out alternatives;
	# only the last, uncommented model_name assignment is active.
	#Qwen/Qwen2.5-14B-Instruct-1M
	#Qwen/Qwen2-0.5B
	# model_name = "bartowski/simplescaling_s1-32B-GGUF"
	# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
	# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
	# model_name = "simplescaling/s1.1-32B"
	# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
	model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
	# filename is only referenced by the commented-out gguf_file arguments below,
	# so it has no effect on the active loading path.
	filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
	torch_dtype = torch.bfloat16 # could be torch.float16 or torch.bfloat16 torch.float32 too
	# cache_dir is likewise only referenced by commented-out arguments below.
	cache_dir = "/data"

	# Earlier GGUF-based loading path, kept for reference.
	# model = AutoModelForCausalLM.from_pretrained(
	# model_name,
	# # subfolder=subfolder,
	# gguf_file=filename,
	# torch_dtype=torch_dtype,
	# device_map="auto",
	# cache_dir = cache_dir,
	# )
	# Active loading path: the model is loaded once, at import time, with the
	# dtype and quantization config defined above and device_map="auto".
	model = Llama4ForConditionalGeneration.from_pretrained(
	model_name,
	attn_implementation="flex_attention",
	# gguf_file=filename,
	# cache_dir = cache_dir,
	torch_dtype=torch_dtype,
	quantization_config=bnb_config,
	device_map="auto",
	)

	# Tokenizer is loaded from the same checkpoint id as the model.
	tokenizer = AutoTokenizer.from_pretrained(model_name
	# , gguf_file=filename
	# , subfolder=subfolder
	)
	# System message sent with every request in generate(); it asks the model to
	# wrap its output in <reasoning> and <answer> sections.
	SYSTEM_PROMPT = """
	Respond in the following format:
	<reasoning>
	...
	</reasoning>
	<answer>
	...
	</answer>
	"""

	@spaces.GPU
	def generate(prompt, history):
	    """Answer one chat prompt with the module-level model and tokenizer.

	    Args:
	        prompt: The latest user message from the chat UI.
	        history: Earlier turns supplied by the chat UI. Currently unused:
	            every call is answered as a fresh single-turn conversation.

	    Returns:
	        The decoded text of the newly generated tokens only (prompt tokens
	        are stripped), with special tokens removed.
	    """
	    messages = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {"role": "user", "content": prompt},
	    ]

	    # Tokenize through the chat template in a single step. Feeding the ids
	    # returned by apply_chat_template(tokenize=True) back into tokenizer(...)
	    # fails because tokenizer(...) only accepts text, and re-tokenizing the
	    # rendered text separately can duplicate special tokens that the
	    # template already inserted.
	    model_inputs = tokenizer.apply_chat_template(
	        messages,
	        tokenize=True,
	        add_generation_prompt=True,
	        return_dict=True,
	        return_tensors="pt",
	    ).to(model.device)

	    output_ids = model.generate(**model_inputs, max_new_tokens=512)

	    # generate() returns prompt + completion; keep only the completion.
	    prompt_length = model_inputs["input_ids"].shape[1]
	    completion_ids = output_ids[:, prompt_length:]

	    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]



	# Wrap generate() in Gradio's chat UI and start serving it; share=True is
	# passed through to launch() unchanged.
	chat_interface = gr.ChatInterface(fn=generate)
	chat_interface.launch(share=True)