---
library_name: transformers
tags: []
---
training script: https://gist.github.com/notlober/9bf4c3ab6ddeb12ec669ca495653708a
inference code:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once(prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False  # must stay False for beam search; if transformers warns about unused sampling flags, the warning can be ignored
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def test_gen(prompt):
    answer_str = generate_output_once(prompt)
    print(f"Answer: {answer_str}")

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
#####

test_gen("...") # put your prompt here
```
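
Since the prompt template asks the model to wrap its final answer in `<answer>` tags, you may want to pull just the answer out of the generated text. A minimal helper for that (not part of the original snippet; same idea as `extract_answer` in the benchmark script below) could look like:

```python
def extract_answer(text):
    # return the content between <answer> and </answer>, or None if the tags are missing
    try:
        return text.split("<answer>")[1].split("</answer>")[0].strip()
    except IndexError:
        return None

# example: print(extract_answer(generate_output_once("your prompt here")))
```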

benchmarks: it is definitely better than Qwen 2.5 14B 1M in my testing, but I have only run 15 samples of the AIME validation set so far (it was ahead of Qwen from the first sample onward). There are 75 more samples, so I am sharing the eval script in case someone wants to run the full benchmark:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once_grpo(model, prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False  # must stay False for beam search; if transformers warns about unused sampling flags, the warning can be ignored
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def generate_output_once(model, prompt):
    message = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def check_model_contain_output(model_output, ground_t_output):
    # loose win criterion: the ground-truth answer string appears anywhere in the model output
    return ground_t_output in model_output

def extract_answer(text):
    # helper to pull the content between <answer> and </answer>; currently unused by do_eval
    try:
        return text.split("<answer>")[1].split("</answer>")[0]
    except IndexError:
        return None

def do_eval(debug):
    total_iters = len(eval_dataset)
    wins_reasoning = 0
    wins_qwen = 0
    for l in range(len(eval_dataset)):
        row = eval_dataset[l]
        problem = row["problem"]
        ground_truth = row["answer"]
        response = generate_output_once_grpo(model, problem)
        response_qwen = generate_output_once(model_qwen, problem)
        reward = check_model_contain_output(response, ground_truth)
        reward_qwen = check_model_contain_output(response_qwen, ground_truth)
        if reward: wins_reasoning += 1
        if reward_qwen: wins_qwen += 1
        print(f"reasoning model: %{wins_reasoning / total_iters}")
        print(f"qwen model: %{wins_qwen / total_iters}")
        if debug:
            print("qwen:", response_qwen)
            print("reasoning fine tuned:", response)

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
model_qwen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct-1M",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
eval_dataset = load_dataset("AI-MO/aimo-validation-aime", split="train")
#####

do_eval(debug=False)
```

technique: GRPO applied to Qwen2.5-14B-Instruct-1M with the Numina CoT dataset (full training script linked above; a rough sketch of the setup is shown below)
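
For orientation only, here is a minimal, hedged sketch of what a GRPO run along these lines could look like with TRL's `GRPOTrainer`. This is not the author's training script (see the gist linked above); the dataset id, column names, reward function, and config values are assumptions.

```python
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# assumed dataset id and column names; adjust to whatever the gist actually uses
dataset = load_dataset("AI-MO/NuminaMath-CoT", split="train")

def to_grpo_row(row):
    # in practice the problem would be wrapped with the same prompt template as do_instruct above
    return {"prompt": row["problem"], "ground_truth": row["solution"]}

dataset = dataset.map(to_grpo_row, remove_columns=dataset.column_names)

def containment_reward(completions, ground_truth, **kwargs):
    # toy reward mirroring the eval's containment check; a real setup would compare
    # only the final answer, not the full solution text
    return [1.0 if gt in c else 0.0 for c, gt in zip(completions, ground_truth)]

training_args = GRPOConfig(
    output_dir="gemma-grpo",
    max_steps=64,                 # the card mentions roughly 64 steps
    max_completion_length=2048,
    num_generations=8,            # completions sampled per prompt for the group baseline
    bf16=True,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-14B-Instruct-1M",
    reward_funcs=containment_reward,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```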

hardware: 8x AMD MI300X, for roughly 64 steps

current issues:
1. infinite generation when the model hits a hard problem (a possible mitigation is sketched below)
2. growing sequence length during training
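
For the infinite-generation issue, one option (a sketch using the standard `transformers` stopping-criteria API, not something shipped with this model) is to stop decoding as soon as the closing `</answer>` tag appears, on top of the existing `max_new_tokens` cap:

```python
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnAnswerClose(StoppingCriteria):
    """Stop generation once every sequence/beam has produced '</answer>' after the prompt."""
    def __init__(self, tokenizer, prompt_len):
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len

    def __call__(self, input_ids, scores, **kwargs):
        texts = self.tokenizer.batch_decode(input_ids[:, self.prompt_len:], skip_special_tokens=True)
        return all("</answer>" in t for t in texts)

# usage: add to the generate() call in the inference snippet above, e.g.
# model.generate(
#     model_inputs,
#     max_new_tokens=max_new_toks,
#     ...,
#     stopping_criteria=StoppingCriteriaList(
#         [StopOnAnswerClose(tokenizer, model_inputs.shape[1])]
#     ),
# )
```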

author: baki

contact: https://x.com/bakiv11771441