```python
from typing import List

import regex as re
import torch
from transformers import AutoTokenizer

from openrlhf.models.model import get_llm_for_sequence_regression


def strip_sequence(text, pad_token, eos_token):
    # Strip any leading/trailing runs of pad/eos tokens from the templated text.
    pad_token_escaped = re.escape(pad_token)
    eos_token_escaped = re.escape(eos_token)
    pattern = f"^({eos_token_escaped}|{pad_token_escaped})+"
    text = re.sub(pattern, "", text)
    pattern = f"({eos_token_escaped}|{pad_token_escaped})+$"
    text = re.sub(pattern, "", text)
    return text


class RewardModelProxy:
    def __init__(
        self,
        reward_pretrain: str,
        max_len: int,
        batch_size: int,
        normalize_reward: bool = False,
        flash_attn: bool = True,
        bf16: bool = True,
        load_in_4bit: bool = False,
        value_head_prefix: str = "score",
        disable_fast_tokenizer: bool = False,
    ):
        self.reward_model = get_llm_for_sequence_regression(
            reward_pretrain,
            "reward",
            normalize_reward=normalize_reward,
            use_flash_attention_2=flash_attn,
            bf16=bf16,
            load_in_4bit=load_in_4bit,
            value_head_prefix=value_head_prefix,
            device_map="cuda:5",  # pinned to GPU 5; adjust for your hardware
        )
        self.reward_model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(
            reward_pretrain, trust_remote_code=True, use_fast=not disable_fast_tokenizer
        )
        self.max_length = max_len
        self.batch_size = batch_size

    def get_reward(self, conversations: List[List[dict]]):
        if self.batch_size is None:
            batch_size = len(conversations)
        else:
            batch_size = self.batch_size

        # Render each conversation into a single string with the model's chat template.
        queries = []
        for conversation in conversations:
            query = self.tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
            queries.append(query)

        # Remove stray pad/eos tokens, then terminate each query with a single eos token.
        for i in range(len(queries)):
            queries[i] = (
                strip_sequence(queries[i], self.tokenizer.pad_token, self.tokenizer.eos_token)
                + self.tokenizer.eos_token
            )

        # Score in batches without tracking gradients.
        scores = []
        with torch.no_grad():
            for i in range(0, len(queries), batch_size):
                inputs = self.tokenize_fn(
                    queries[i : min(len(queries), i + batch_size)],
                    device=self.reward_model.device,
                )
                r = self.reward_model(inputs["input_ids"], inputs["attention_mask"])
                scores.extend(r.tolist())
        return scores

    def tokenize_fn(self, texts, device):
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            add_special_tokens=False,
            max_length=self.max_length,
            padding=True,
            truncation=True,
        )
        return {k: v.to(device) for k, v in batch.items()}

    def __call__(self, conversations: List[List[dict]]):
        return self.get_reward(conversations)


RM = RewardModelProxy(
    "CodeDPO/Qwen2.5-Coder-7B_with_margin_scalebt",
    max_len=2048,
    batch_size=8,
)

conversations = [
    [
        {"role": "system", "content": "Hello, how can I help you today?"},
        {"role": "user", "content": "I want to book a flight."},
    ],
]

scores = RM(conversations)
print(scores)
```
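The scalar rewards are most useful for comparing candidate completions of the same prompt. The snippet below is a minimal sketch of that pattern, reusing the `RM` proxy constructed above; the prompt and the two candidate replies are purely illustrative.

```python
# Illustrative only: rank two hypothetical assistant replies to the same prompt
# using the RM proxy defined above.
prompt = [{"role": "user", "content": "Write a Python function that reverses a string."}]

candidates = [
    prompt + [{"role": "assistant", "content": "def reverse(s):\n    return s[::-1]"}],
    prompt + [{"role": "assistant", "content": "Sorry, I can't help with that."}],
]

candidate_scores = RM(candidates)  # one scalar reward per conversation
best = max(range(len(candidate_scores)), key=lambda i: candidate_scores[i])
print(f"scores={candidate_scores}, best candidate index={best}")
```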