training script: https://gist.github.com/notlober/9bf4c3ab6ddeb12ec669ca495653708a

inference code:

from transformers import AutoModelForCausalLM, AutoTokenizer

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once(prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False # must stay False for diverse beam search; if you see a warning about unused sampling flags, ignore it
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def test_gen(prompt):
    answer_str = generate_output_once(prompt)
    print(f"Answer: {answer_str}")

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
#####

test_gen("...") # put your prompt here

benchmarks: on the 15 samples of the AIME validation set I have tested so far, it has consistently beaten Qwen2.5-14B-Instruct-1M (it was ahead from the first sample). There are 75 samples left, so I am sharing the eval script below in case someone wants to run the full benchmark:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once_grpo(model, prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False # must stay False for diverse beam search; if you see a warning about unused sampling flags, ignore it
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def generate_output_once(model, prompt):
    message = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def check_model_contain_output(model_output, ground_t_output):
    # Substring check: the answer counts as correct if the ground-truth string appears anywhere in the model output.
    return ground_t_output in model_output

def extract_answer(text):
    # Return the text between <answer> and </answer>, or None if the tags are missing.
    try:
        return text.split("<answer>")[1].split("</answer>")[0]
    except IndexError:
        return None

def do_eval(debug):
    total_iters = len(eval_dataset)
    wins_reasoning = 0
    wins_qwen = 0
    for l in range(total_iters):
        row = eval_dataset[l]
        problem = row["problem"]
        ground_truth = row["answer"]
        response = generate_output_once_grpo(model, problem)
        response_qwen = generate_output_once(model_qwen, problem)
        reward = check_model_contain_output(response, ground_truth)
        reward_qwen = check_model_contain_output(response_qwen, ground_truth)
        if reward: wins_reasoning += 1
        if reward_qwen: wins_qwen += 1
        print(f"reasoning model: %{wins_reasoning / total_iters}")
        print(f"qwen model: %{wins_qwen / total_iters}")
        if debug:
            print("qwen:", response_qwen)
            print("reasoning fine tuned:", response)

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
model_qwen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct-1M",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
eval_dataset = load_dataset("AI-MO/aimo-validation-aime", split="train")
#####

do_eval(debug=False)

technique: GRPO applied to Qwen2.5-14B-Instruct-1M with the Numina CoT dataset (the full training script is in the gist at the top; a rough sketch is below)
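
For orientation, here is a minimal sketch of what GRPO fine-tuning with trl's GRPOTrainer could look like for this setup. It is not the author's script (that is the gist linked at the top): the dataset id AI-MO/NuminaMath-CoT, the toy format reward, and the batch/generation sizes are illustrative assumptions; only max_steps=64 comes from the hardware note below.

from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Build prompts with the same R1-style template used at inference time
# (do_instruct is the prompt function defined in the inference code above).
dataset = load_dataset("AI-MO/NuminaMath-CoT", split="train")
dataset = dataset.map(
    lambda row: {"prompt": [{"role": "user", "content": do_instruct(row["problem"])}]}
)

def reward_answer_format(completions, **kwargs):
    # Toy reward (an assumption, not the reward used for this model): +1 if the
    # completion closes an <answer> block, else 0. With conversational prompts,
    # each completion is a list containing a single assistant message.
    texts = [c[0]["content"] for c in completions]
    return [1.0 if "<answer>" in t and "</answer>" in t else 0.0 for t in texts]

training_args = GRPOConfig(
    output_dir="qwen-14b-1m-grpo",
    num_generations=8,              # size of the GRPO group sampled per prompt (assumed)
    max_completion_length=2048,
    per_device_train_batch_size=8,  # keeps the generation batch divisible by num_generations
    max_steps=64,                   # matches the "64 steps" noted under hardware below
    bf16=True,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-14B-Instruct-1M",
    reward_funcs=reward_answer_format,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

Saved as, say, train_grpo.py, this would typically be launched across the 8 GPUs with accelerate launch --num_processes 8 train_grpo.py.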

hardware: 8x AMD MI300X, trained for roughly 64 steps

current issues: 1. infinite generation when the model hits a hard problem (a possible mitigation is sketched below); 2. growing sequence length during training
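
One possible mitigation for issue 1 (not part of the released code): with a recent transformers version that supports the stop_strings argument of generate(), decoding can stop as soon as the closing </answer> tag appears instead of running until max_new_tokens. A drop-in replacement for the generate call in generate_output_once above:

generated_ids = model.generate(
    model_inputs,
    max_new_tokens=max_new_toks,
    repetition_penalty=1.2,
    num_beam_groups=N_BEAMS,
    num_beams=N_BEAMS,
    diversity_penalty=0.5,
    early_stopping=True,
    do_sample=False,
    stop_strings=["</answer>"],  # stop once the answer tag is closed
    tokenizer=tokenizer,         # generate() needs the tokenizer to apply stop_strings
)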

author: baki

contact: https://x.com/bakiv11771441
