import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch

# Optional 4-bit quantization config (unused in the current loading path).
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     llm_int8_enable_fp32_cpu_offload=True,
# )

# Previously tried checkpoints:
# Qwen/Qwen2.5-14B-Instruct-1M
# Qwen/Qwen2-0.5B
# model_name = "bartowski/simplescaling_s1-32B-GGUF"
# subfolder = "Qwen-0.5B-GRPO/checkpoint-1868"
# filename = "simplescaling_s1-32B-Q4_K_S.gguf"
# model_name = "simplescaling/s1.1-32B"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF"
# model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"  # only needed for the GGUF loading path below

torch_dtype = torch.bfloat16  # could also be torch.float16 or torch.float32
cache_dir = "/data"

# Alternative: load a GGUF checkpoint through AutoModelForCausalLM.
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # subfolder=subfolder,
#     gguf_file=filename,
#     torch_dtype=torch_dtype,
#     device_map="auto",
#     cache_dir=cache_dir,
# )

model = Llama4ForConditionalGeneration.from_pretrained(
    model_name,
    # default is eager attention
    # attn_implementation="flex_attention",
    # gguf_file=filename,
    cache_dir=cache_dir,
    torch_dtype=torch_dtype,
    # quantization_config=bnb_config,
    device_map="auto",
)

# processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
processor = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
# , gguf_file=filename
# , subfolder=subfolder

SYSTEM_PROMPT = "You are a friendly Chatbot."
# """
# Respond in the following format:
# <reasoning>
# ...
# </reasoning>
# <answer>
# ...
# </answer>
# """


@spaces.GPU
def generate(prompt, history):
    messages = [
        # {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]

    # Alternative generation path using a plain tokenizer:
    # text = tokenizer.apply_chat_template(
    #     messages,
    #     # tokenize=False,
    #     tokenize=True,
    #     add_generation_prompt=True,
    # )
    # model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    # generated_ids = [
    #     output_ids[len(input_ids):]
    #     for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    # ]
    # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # return response

    # Tokenize the chat prompt and generate a completion.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        # tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    outputs = model.generate(
        **inputs.to(model.device),
        max_new_tokens=100,
    )
    # Decode only the newly generated tokens (everything past the prompt).
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )[0]
    return response


chat_interface = gr.ChatInterface(
    fn=generate,
)

chat_interface.launch(share=True)