Update README.md
Browse files
README.md
CHANGED
@@ -57,9 +57,125 @@ print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))
|
|
57 |
```
|
58 |
|
59 |
### **inference with vLLM**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
```
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
```
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# Result
|
65 |
```
|
@@ -71,7 +187,7 @@ code: [TBD]
|
|
71 |
"llama3-InstrucTrans":"μ΄λ² λ
Όλμ μ νμ΄ μ§λ 7μΌ μ νλΈμ 곡κ°ν μ΅μ μμ΄ν¨λ νλ‘ κ΄κ³ λ₯Ό μ€μ¬μΌλ‘ λΆκ±°μ‘λ€. μ΄ κ΄κ³ λ μ
κΈ°, μ‘°κ°μ, μΉ΄λ©λΌ, λ¬Όκ° λ±μ λλ₯΄κΈ° μμνλ μ₯λ©΄κ³Ό ν¨κ» κ·Έ μ리μ μμ΄ν¨λ νλ‘κ° λ±μ₯νλ μ₯λ©΄μ 보μ¬μ€λ€. μ΄λ μλ‘μ΄ μμ΄ν¨λ νλ‘μ μΈκ³΅μ§λ₯ κΈ°λ₯, κ³ κΈ λμ€νλ μ΄, μ±λ₯, λκ»λ₯Ό κ°μ‘°νλ κ²μΌλ‘ 보μΈλ€. μ νμ μ΄λ²μ 곡κ°ν μμ΄ν¨λ νλ‘μ μ΅μ 'M4' μΉ©μ΄ νμ¬λμΌλ©°, μ ν μμ¬μ κ°μ₯ μμ κΈ°κΈ°λΌκ³ μΈκΈνλ€. μ΄ κ΄κ³ λ μΆμνμλ§μ ν¬λ¦¬μμ΄ν°λ₯Ό μμ§νλ λ¬Όκ±΄μ΄ νμλλ μ₯λ©΄μ΄ κ·Έλλ‘ κ·Έλ €μ Έ λ
Όλμ΄ λκ³ μλ€. λΉνκ°λ€μ μ΄ μ΄λ―Έμ§κ° κΈ°μ μ΄ μΈκ° ν¬λ¦¬μμ΄ν°λ₯Ό μ§λ°λλ€λ μλ―Έλ‘ ν΄μλ μ μλ€κ³ μ£Όμ₯νλ€. λν AIλ‘ μΈν΄ ν¬λ¦¬μμ΄ν°λ€μ΄ λ°λ¦¬κ³ μλ€λ μν©μ μ°μμν¨λ€λ μ°λ €μ λͺ©μ리λ λμ¨λ€."
|
72 |
```
|
73 |
|
74 |
-
<br>
|
75 |
|
76 |
# **Evaluation Result**
|
77 |
μμ΄->νκ΅μ΄ λ²μ μ±λ₯μ νκ°νκΈ°μν λ°μ΄ν°μ
μ μ μ νμ¬ νκ°λ₯Ό μ§ννμμ΅λλ€.
|
@@ -107,7 +223,6 @@ code: [TBD]
|
|
107 |
| **Translation-EnKo/exaeon3-translation-general-enko-7.8b (private)** | 17.8275 | 8.56 | 2.72 | 6.31 | 8.8544 |
|
108 |
| **Translation-EnKo/exaone3-instrucTrans-v2-enko-7.8b** | 19.6075 | 13.46 | 7.28 | 11.4425 | **12.9475**|
|
109 |
|
110 |
-
|
111 |
### νμ΅ λ°μ΄ν°μ
λ³ μ±λ₯ λΆμ
|
112 |
| λͺ¨λΈ μ΄λ¦ | AIHub | Flores | IWSLT | News | νκ· |
|
113 |
|--------------------------------------------------------------|---------|--------|-------|--------|-------------|
|
@@ -125,7 +240,6 @@ code: [TBD]
|
|
125 |
| EXAONE-3.0-7.8B-Instruct-general12m (private) | 17.8275 | 8.56 | 2.72 | 6.31 | **8.8544** |
|
126 |
| EXAONE-3.0-7.8B-Instruct-general12m-trc1400k-trc313eval45 | 19.6075 | 13.46 | 7.28 | 11.4425| **12.9475** |
|
127 |
|
128 |
-
|
129 |
### **Citation**
|
130 |
|
131 |
```bibtex
|
|
|
57 |
```
|
58 |
|
59 |
### **inference with vLLM**
|
60 |
+
<details>
|
61 |
+
<summary>μΆλ‘ μ½λ μ κΈ°/νΌμΉκΈ°</summary>
|
62 |
+
<div markdown="1">
|
63 |
+
|
64 |
+
```bash
|
65 |
+
# Requires at least a 24 GB Vram GPU. If you have 12GB VRAM, you will need to run in FP8 mode.
|
66 |
+
|
67 |
+
python vllm_inference.py -gpu_id 0 -split_idx 0 -split_num 2 -dname "nvidia/HelpSteer" -untrans_col 'helpfulness' 'correctness' 'coherence' 'complexity' 'verbosity' > 0.out
|
68 |
+
python vllm_inference.py -gpu_id 1 -split_idx 1 -split_num 2 -dname "nvidia/HelpSteer" -untrans_col 'helpfulness' 'correctness' 'coherence' 'complexity' 'verbosity' > 1.out
|
69 |
```
|
70 |
+
|
71 |
+
```python
|
72 |
+
import os
|
73 |
+
import argparse
|
74 |
+
import pandas as pd
|
75 |
+
|
76 |
+
from tqdm import tqdm
|
77 |
+
from typing import List, Dict
|
78 |
+
from datasets import load_dataset, Dataset
|
79 |
+
from transformers import AutoTokenizer
|
80 |
+
from vllm import LLM, SamplingParams
|
81 |
+
|
82 |
+
# truncate sentences with more than 4096 tokens. # for same dataset size
|
83 |
+
def truncation_func(sample, column_name):
    """Clamp the text stored in ``sample[column_name]`` to at most 4096 tokens.

    Re-encodes the column with the module-level ``tokenizer`` (truncating,
    no special tokens) and writes the decoded text back in place so every
    row fits the prompt budget.  Returns the mutated ``sample`` so the
    function can be used directly with ``Dataset.map``.
    """
    encoded = tokenizer(
        str(sample[column_name]),
        truncation=True,
        max_length=4096,
        add_special_tokens=False,
    ).input_ids
    sample[column_name] = tokenizer.decode(encoded)
    return sample
|
88 |
+
|
89 |
+
# convert to chat_template
|
90 |
+
# convert to chat_template
def create_conversation(sample, column_name):
    # Wrap the raw text in *column_name* as a system+user conversation and
    # render it with the model's chat template (with the generation prompt
    # appended), storing the resulting prompt string back into the same
    # column.  Returns the mutated sample, suitable for Dataset.map.
    # NOTE(review): relies on a module-level `tokenizer` being defined before
    # this runs (it is created in the __main__ block).
    # NOTE(review): the system prompt below is mojibake in this capture — the
    # original is Korean ("You are a translator. Translate the English
    # sentence into Korean.").  Left byte-for-byte as found; the `f` prefix is
    # redundant (no placeholders) but unchanged here.
    SYSTEM_PROMPT=f"λΉμ μ λ²μκΈ° μλλ€. μμ΄ λ¬Έμ₯μ νκ΅μ΄λ‘ λ²μνμΈμ."
    messages=[
        {"role":"system", "content": SYSTEM_PROMPT},
        {"role":"user", "content":sample[column_name]}
    ]
    # tokenize=False returns the templated prompt as a plain string, which is
    # what vLLM's generate() consumes downstream.
    text=tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    sample[column_name]=text
    return sample
|
103 |
+
|
104 |
+
def load_dataset_preprocess(dataset_name:str, untranslate_column:List, split_num, split_idx, subset=None, num_proc=128) -> Dataset:
    """Load one percentage shard of an HF dataset and turn every remaining
    column into a chat-templated, token-truncated translation prompt.

    Args:
        dataset_name: Hugging Face dataset identifier.
        untranslate_column: columns to pass through untouched (removed here,
            merged back later by ``save_dataset``).
        split_num: total number of shards the train split is divided into.
        split_idx: zero-based index of the shard this process handles.
        subset: optional dataset config/subset name.
        num_proc: parallelism for ``Dataset.map``.

    Returns:
        (processed_dataset, original_dataset) — the second element keeps the
        untranslated columns for later re-merging.
    """
    # Compute the shard boundaries directly as percentages.  The previous
    # ``step = 100 // split_num`` dropped the trailing ``100 % split_num``
    # percent of the data whenever split_num did not divide 100 evenly
    # (e.g. split_num=3 covered only train[0%:99%]); this formulation always
    # covers the full 0-100% range.
    start = (100 * split_idx) // split_num
    end = (100 * (split_idx + 1)) // split_num
    if subset:
        dataset = load_dataset(dataset_name, subset, split=f'train[{start}%:{end}%]')
    else:
        dataset = load_dataset(dataset_name, split=f'train[{start}%:{end}%]')
    print(dataset)
    original_dataset = dataset  # To leave columns untranslated
    dataset = dataset.remove_columns(untranslate_column)

    for feature in dataset.features:
        # Truncate first so every prompt fits the 4096-token budget, then wrap
        # it in the chat template.  ``f=feature`` binds the loop variable as a
        # default to avoid the late-binding-closure pitfall.
        dataset = dataset.map(lambda x, f=feature: truncation_func(x, f), num_proc=num_proc)
        dataset = dataset.map(lambda x, f=feature: create_conversation(x, f), batched=False, num_proc=num_proc)

    print("filtered_dataset:", dataset)
    return dataset, original_dataset
|
120 |
+
|
121 |
+
def save_dataset(result_dict:Dict, dataset_name, untranslate_column:List, split_idx, subset:str, original=None):
    """Merge untranslated columns back into the translated results and write
    them to ``gen/<dataset>[_<subset>]_<split_idx>.jsonl`` (one JSON object
    per line, non-ASCII preserved).

    Args:
        result_dict: column name -> list of translated strings.
        dataset_name: HF dataset id; only the part after the last '/' is used
            in the file name.
        untranslate_column: columns to copy through verbatim from *original*.
        split_idx: shard index, embedded in the output file name.
        subset: optional subset name, embedded in the output file name.
        original: source of the untranslated columns.  Defaults to the
            module-level ``original_dataset`` for backward compatibility —
            the old body read that global directly, which made the function
            untestable and order-dependent; pass it explicitly instead.
    """
    if original is None:
        original = original_dataset  # legacy global fallback
    for column in untranslate_column:
        result_dict[column] = original[column]

    df = pd.DataFrame(result_dict)
    output_file_name = dataset_name.split('/')[-1]
    os.makedirs('gen', exist_ok=True)
    if subset:
        save_path = f"gen/{output_file_name}_{subset}_{split_idx}.jsonl"
    else:
        save_path = f"gen/{output_file_name}_{split_idx}.jsonl"
    df.to_json(save_path, lines=True, orient='records', force_ascii=False)
|
133 |
+
|
134 |
+
if __name__=="__main__":
    # Entry point: translate every column of one shard of an HF dataset with
    # vLLM and write the results to gen/*.jsonl (see save_dataset).
    model_name = "Translation-EnKo/exaone3-instrucTrans-v2-enko-7.8b"
    # The tokenizer is a module-level global deliberately: truncation_func and
    # create_conversation read it during load_dataset_preprocess below.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    parser = argparse.ArgumentParser(description='load dataset name & split size')
    parser.add_argument('-dname', type=str, default="Magpie-Align/Magpie-Pro-MT-300K-v0.1")  # HF dataset id
    parser.add_argument('-untrans_col', nargs='+', default=[])  # columns copied through untranslated
    parser.add_argument('-split_num', type=int, default=4)      # total number of shards
    parser.add_argument('-split_idx', type=int, default=0)      # this process's shard
    parser.add_argument('-gpu_id', type=int, default=0)
    parser.add_argument('-subset', type=str, default=None)      # optional dataset config name
    parser.add_argument('-num_proc', type=int, default=128)     # map() parallelism

    args = parser.parse_args()
    # NOTE(review): CUDA_VISIBLE_DEVICES is set after vllm has already been
    # imported; presumably vLLM only reads it when LLM() is constructed below
    # — confirm, otherwise multi-GPU sharding via -gpu_id may not isolate GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu_id)
    # original_dataset is also consumed as a global by save_dataset.
    dataset, original_dataset = load_dataset_preprocess(args.dname,
                                                        args.untrans_col,
                                                        args.split_num,
                                                        args.split_idx,
                                                        args.subset,
                                                        args.num_proc
                                                        )
    # define model
    sampling_params = SamplingParams(
        temperature=0,    # greedy decoding: deterministic translations
        max_tokens=8192,
    )
    llm = LLM(
        model=model_name,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.95,
    )
    # inference model: translate column by column, saving after each so a
    # crash mid-run keeps all columns completed so far.
    result_dict = {}
    for feature in tqdm(dataset.features):
        print(f"'{feature}' column in progress..")
        outputs = llm.generate(dataset[feature], sampling_params)
        result_dict[feature]=[output.outputs[0].text for output in outputs]
        save_dataset(result_dict, args.dname, args.untrans_col, args.split_idx, args.subset)
        print(f"saved to json. column: {feature}")
|
174 |
```
|
175 |
+
</div>
|
176 |
+
</details>
|
177 |
+
|
178 |
+
<br>
|
179 |
|
180 |
# Result
|
181 |
```
|
|
|
187 |
"llama3-InstrucTrans":"μ΄λ² λ
Όλμ μ νμ΄ μ§λ 7μΌ μ νλΈμ 곡κ°ν μ΅μ μμ΄ν¨λ νλ‘ κ΄κ³ λ₯Ό μ€μ¬μΌλ‘ λΆκ±°μ‘λ€. μ΄ κ΄κ³ λ μ
κΈ°, μ‘°κ°μ, μΉ΄λ©λΌ, λ¬Όκ° λ±μ λλ₯΄κΈ° μμνλ μ₯λ©΄κ³Ό ν¨κ» κ·Έ μ리μ μμ΄ν¨λ νλ‘κ° λ±μ₯νλ μ₯λ©΄μ 보μ¬μ€λ€. μ΄λ μλ‘μ΄ μμ΄ν¨λ νλ‘μ μΈκ³΅μ§λ₯ κΈ°λ₯, κ³ κΈ λμ€νλ μ΄, μ±λ₯, λκ»λ₯Ό κ°μ‘°νλ κ²μΌλ‘ 보μΈλ€. μ νμ μ΄λ²μ 곡κ°ν μμ΄ν¨λ νλ‘μ μ΅μ 'M4' μΉ©μ΄ νμ¬λμΌλ©°, μ ν μμ¬μ κ°μ₯ μμ κΈ°κΈ°λΌκ³ μΈκΈνλ€. μ΄ κ΄κ³ λ μΆμνμλ§μ ν¬λ¦¬μμ΄ν°λ₯Ό μμ§νλ λ¬Όκ±΄μ΄ νμλλ μ₯λ©΄μ΄ κ·Έλλ‘ κ·Έλ €μ Έ λ
Όλμ΄ λκ³ μλ€. λΉνκ°λ€μ μ΄ μ΄λ―Έμ§κ° κΈ°μ μ΄ μΈκ° ν¬λ¦¬μμ΄ν°λ₯Ό μ§λ°λλ€λ μλ―Έλ‘ ν΄μλ μ μλ€κ³ μ£Όμ₯νλ€. λν AIλ‘ μΈν΄ ν¬λ¦¬μμ΄ν°λ€μ΄ λ°λ¦¬κ³ μλ€λ μν©μ μ°μμν¨λ€λ μ°λ €μ λͺ©μ리λ λμ¨λ€."
|
188 |
```
|
189 |
|
190 |
+
<br>
|
191 |
|
192 |
# **Evaluation Result**
|
193 |
μμ΄->νκ΅μ΄ λ²μ μ±λ₯μ νκ°νκΈ°μν λ°μ΄ν°μ
μ μ μ νμ¬ νκ°λ₯Ό μ§ννμμ΅λλ€.
|
|
|
223 |
| **Translation-EnKo/exaeon3-translation-general-enko-7.8b (private)** | 17.8275 | 8.56 | 2.72 | 6.31 | 8.8544 |
|
224 |
| **Translation-EnKo/exaone3-instrucTrans-v2-enko-7.8b** | 19.6075 | 13.46 | 7.28 | 11.4425 | **12.9475**|
|
225 |
|
|
|
226 |
### νμ΅ λ°μ΄ν°μ
λ³ μ±λ₯ λΆμ
|
227 |
| λͺ¨λΈ μ΄λ¦ | AIHub | Flores | IWSLT | News | νκ· |
|
228 |
|--------------------------------------------------------------|---------|--------|-------|--------|-------------|
|
|
|
240 |
| EXAONE-3.0-7.8B-Instruct-general12m (private) | 17.8275 | 8.56 | 2.72 | 6.31 | **8.8544** |
|
241 |
| EXAONE-3.0-7.8B-Instruct-general12m-trc1400k-trc313eval45 | 19.6075 | 13.46 | 7.28 | 11.4425| **12.9475** |
|
242 |
|
|
|
243 |
### **Citation**
|
244 |
|
245 |
```bibtex
|