추론 코드 접기/펼치기
```bash
# Requires a GPU with at least 24 GB of VRAM. If you only have 12 GB of VRAM, you will need to run in FP8 mode.
python vllm_inference.py -gpu_id 0 -split_idx 0 -split_num 2 -dname "nvidia/HelpSteer" -untrans_col 'helpfulness' 'correctness' 'coherence' 'complexity' 'verbosity' > 0.out
python vllm_inference.py -gpu_id 1 -split_idx 1 -split_num 2 -dname "nvidia/HelpSteer" -untrans_col 'helpfulness' 'correctness' 'coherence' 'complexity' 'verbosity' > 1.out
```
```python
import os
import argparse
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
def truncation_func(sample, column_name):
    """Truncate ``sample[column_name]`` to at most 4096 tokens.

    Uses the module-level ``tokenizer``: the text is encoded with
    truncation (no special tokens), then decoded back to a string so all
    rows fit within the model's context budget.
    """
    encoded = tokenizer(
        str(sample[column_name]),
        truncation=True,
        max_length=4096,
        add_special_tokens=False,
    )
    sample[column_name] = tokenizer.decode(encoded.input_ids)
    return sample
def create_conversation(sample, column_name):
    """Rewrite ``sample[column_name]`` as a chat-templated translation prompt.

    Builds a system+user message pair (Korean system prompt instructing
    EN->KO translation) and renders it with the module-level tokenizer's
    chat template, leaving the generation prompt appended.
    """
    system_prompt = "당신은 번역기 입니다. 영어 문장을 한국어로 번역하세요."
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sample[column_name]},
    ]
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    sample[column_name] = rendered
    return sample
def load_dataset_preprocess(dataset_name:str, untranslate_column:List, split_num, split_idx, subset=None, num_proc=128) -> Dataset:
    """Load one percentage shard of a dataset's train split and prepare it
    for translation.

    Every column NOT listed in ``untranslate_column`` is truncated to 4096
    tokens and wrapped in the translation chat template.

    Args:
        dataset_name: Hugging Face dataset id.
        untranslate_column: column names to exclude from translation.
        split_num: total number of shards the train split is divided into.
        split_idx: zero-based index of the shard to load.
        subset: optional dataset config name.
        num_proc: worker processes for ``dataset.map``.

    Returns:
        (preprocessed_dataset, original_dataset) — the original is kept so
        untranslated columns can be merged back at save time.
    """
    # BUG FIX: the previous `step = 100 // split_num` dropped the dataset
    # tail whenever 100 is not divisible by split_num (e.g. split_num=3
    # covered only 0%-99%). Computing each boundary independently makes the
    # split_num shards exactly tile [0%, 100%].
    start = 100 * split_idx // split_num
    end = 100 * (split_idx + 1) // split_num
    split_spec = f'train[{start}%:{end}%]'
    if subset:
        dataset = load_dataset(dataset_name, subset, split=split_spec)
    else:
        dataset = load_dataset(dataset_name, split=split_spec)
    print(dataset)
    original_dataset = dataset  # To leave columns untranslated
    dataset = dataset.remove_columns(untranslate_column)
    for feature in dataset.features:
        # `feature` is bound per-iteration; map() runs eagerly here, so the
        # lambdas capture the intended column each time.
        dataset = dataset.map(lambda x: truncation_func(x, feature), num_proc=num_proc)
        dataset = dataset.map(lambda x: create_conversation(x, feature), batched=False, num_proc=num_proc)
    print("filtered_dataset:", dataset)
    return dataset, original_dataset
def save_dataset(result_dict:Dict, dataset_name, untranslate_column:List, split_idx, subset:str, original_dataset=None):
    """Merge untranslated columns back into the results and write one JSONL shard.

    Args:
        result_dict: column name -> list of translated strings. Mutated in
            place: untranslated columns are added to it before saving.
        dataset_name: dataset id; only the part after the last '/' is used
            as the output file stem.
        untranslate_column: columns copied verbatim from ``original_dataset``.
        split_idx: shard index, embedded in the output file name.
        subset: optional dataset config name, embedded in the file name.
        original_dataset: mapping that provides the untranslated columns.
            BUG FIX: the original implementation silently read a module-level
            global; the dependency is now an explicit parameter, defaulting
            to that global so the existing call site keeps working.
    """
    if original_dataset is None:
        original_dataset = globals().get("original_dataset")
    for column in untranslate_column:
        result_dict[column] = original_dataset[column]
    df = pd.DataFrame(result_dict)
    output_file_name = dataset_name.split('/')[-1]
    os.makedirs('gen', exist_ok=True)
    if subset:
        save_path = f"gen/{output_file_name}_{subset}_{split_idx}.jsonl"
    else:
        save_path = f"gen/{output_file_name}_{split_idx}.jsonl"
    # force_ascii=False keeps Korean output human-readable in the JSONL.
    df.to_json(save_path, lines=True, orient='records', force_ascii=False)
if __name__=="__main__":
    # Translation model; the tokenizer is a module-level global read by
    # truncation_func / create_conversation above.
    model_name = "Translation-EnKo/exaone3-instrucTrans-v2-enko-7.8b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    parser = argparse.ArgumentParser(description='load dataset name & split size')
    parser.add_argument('-dname', type=str, default="Magpie-Align/Magpie-Pro-MT-300K-v0.1")  # HF dataset id
    parser.add_argument('-untrans_col', nargs='+', default=[])  # columns to leave untranslated
    parser.add_argument('-split_num', type=int, default=4)  # total number of shards
    parser.add_argument('-split_idx', type=int, default=0)  # which shard this process handles
    parser.add_argument('-gpu_id', type=int, default=0)
    parser.add_argument('-subset', type=str, default=None)  # optional dataset config name
    parser.add_argument('-num_proc', type=int, default=128)  # workers for dataset.map
    args = parser.parse_args()
    # Pin this process to a single GPU. NOTE(review): this is set after
    # `import vllm` but before the LLM is constructed — this relies on vllm
    # not initializing CUDA at import time; confirm for the vllm version used.
    os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu_id)
    dataset, original_dataset = load_dataset_preprocess(args.dname,
                                                        args.untrans_col,
                                                        args.split_num,
                                                        args.split_idx,
                                                        args.subset,
                                                        args.num_proc
                                                        )
    # define model
    # temperature=0 -> greedy decoding for deterministic translations.
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=8192,
    )
    llm = LLM(
        model=model_name,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.95,
    )
    # inference model
    result_dict = {}
    # Translate one column at a time; each column is a list of
    # chat-templated prompts prepared by load_dataset_preprocess.
    for feature in tqdm(dataset.features):
        print(f"'{feature}' column in progress..")
        outputs = llm.generate(dataset[feature], sampling_params)
        result_dict[feature]=[output.outputs[0].text for output in outputs]
        # Save after every column so partial progress survives an interruption
        # (the same shard file is overwritten with the accumulated columns).
        save_dataset(result_dict, args.dname, args.untrans_col, args.split_idx, args.subset)
        print(f"saved to json. column: {feature}")
```