"""The Inferencer class simplifies the process of model inferencing.""" |
|
|
|
import os |
|
import torch |
|
import wandb |
|
import deepspeed |
|
import sys |
|
import numpy as np |
|
import datetime |
|
import json |
|
|
|
from transformers import AutoConfig |
|
import torch.distributed as dist |
|
|
|
from lmflow.args import DatasetArguments |
|
from lmflow.datasets.dataset import Dataset |
|
from lmflow.pipeline.base_pipeline import BasePipeline |
|
from lmflow.models.hf_decoder_model import HFDecoderModel |
|
from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction |
|
os.environ["TOKENIZERS_PARALLELISM"] = "false" |
|
|
|


def rstrip_partial_utf8(string):
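    """Strip the Unicode replacement character (U+FFFD) left by partially decoded UTF-8 output."""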
    return string.replace("\ufffd", "")


class Inferencer(BasePipeline):
""" |
|
Initializes the `Inferencer` class with given arguments. |
|
|
|
Parameters |
|
------------ |
|
model_args : ModelArguments object. |
|
Contains the arguments required to load the model. |
|
|
|
data_args : DatasetArguments object. |
|
Contains the arguments required to load the dataset. |
|
|
|
inferencer_args : InferencerArguments object. |
|
Contains the arguments required to perform inference. |
|
|
|
|
|
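
    Example
    ------------
    A minimal usage sketch. It assumes the three argument objects have already
    been built (for example by LMFlow's argument parser); the model and dataset
    construction shown here is illustrative rather than the only supported way::

        model = HFDecoderModel(model_args)
        dataset = Dataset(data_args)
        inferencer = Inferencer(model_args, data_args, inferencer_args)
        result = inferencer.inference(model=model, dataset=dataset)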
""" |
|
def __init__(self, model_args, data_args, inferencer_args): |
|
        self.data_args = data_args
        self.inferencer_args = inferencer_args
        self.model_args = model_args

        set_random_seed(self.inferencer_args.random_seed)

        self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.world_size = int(os.getenv("WORLD_SIZE", "1"))
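
        # On GPU, bind this process to its local device and let DeepSpeed initialize
        # the distributed backend; on CPU, fall back to a local gloo process group.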
        if inferencer_args.device == "gpu":
            torch.cuda.set_device(self.local_rank)
            deepspeed.init_distributed()
        else:
            os.environ["MASTER_ADDR"] = "localhost"
            os.environ["MASTER_PORT"] = "15000"
            dist.init_process_group(
                "gloo", rank=self.local_rank, world_size=self.world_size
            )

        self.config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
        try:
            self.model_hidden_size = self.config.hidden_size
        except AttributeError:
            print("Could not read hidden_size from the model config, falling back to the default size 1024")
            self.model_hidden_size = 1024


    def create_dataloader(self, dataset: Dataset):
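        """Wrap a text_only dataset into a batch-size-1 dataloader and return it with the dataset size."""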
        data_dict = dataset.to_dict()
        inputs = [instance["text"] for instance in data_dict["instances"]]
        dataset_size = len(inputs)
        dataset_buf = []
        for idx in range(dataset_size):
            dataset_buf.append({
                "input": inputs[idx],
                "input_idx": idx,
            })

        dataloader = batchlize(
            dataset_buf,
            batch_size=1,
            random_shuffle=False,
        )
        return dataloader, dataset_size


    def inference(
        self,
        model,
        dataset: Dataset,
        max_new_tokens: int=100,
        temperature: float=0.0,
        prompt_structure: str='{input}',
    ):
""" |
|
Perform inference for a model |
|
|
|
Parameters |
|
------------ |
|
model : TunableModel object. |
|
TunableModel to perform inference |
|
|
|
dataset : Dataset object. |
|
|
|
|
|
Returns: |
|
|
|
output_dataset: Dataset object. |
|
""" |
        if dataset.get_type() != "text_only":
            raise NotImplementedError(
                'input dataset should have type "text_only"'
            )

        dataloader, data_size = self.create_dataloader(dataset)

        output_dict = {
            "type": "text_only",
            "instances": [],
        }
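
        # Generate one instance at a time: format the prompt, move the encoded tokens
        # to the target device, generate, then keep only the newly generated text.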
        for batch_index, batch in enumerate(dataloader):
            current_batch = batch[0]

            prompt = prompt_structure.format(input=current_batch['input'])

            if self.inferencer_args.device == "gpu":
                inputs = model.encode(prompt, return_tensors="pt").to(device=self.local_rank)
            elif self.inferencer_args.device == "cpu":
                inputs = model.encode(prompt, return_tensors="pt").to(device='cpu')
            else:
                raise NotImplementedError(
                    f"device \"{self.inferencer_args.device}\" is not supported"
                )

            outputs = model.inference(
                inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                repetition_penalty=1.0,
            )
            text_out = model.decode(outputs[0], skip_special_tokens=True)

            # The decoded output still contains the prompt, so strip the prompt prefix
            # and keep only the newly generated text.
            prompt_length = len(model.decode(inputs[0], skip_special_tokens=True))
            text_out = text_out[prompt_length:]
            output_dict["instances"].append({"text": text_out})

        output_dataset = Dataset(DatasetArguments(dataset_path=None))
        output_dataset = output_dataset.from_dict(output_dict)

        return output_dataset


    def stream_inference(self, context, model, max_new_tokens, token_per_step, temperature, end_string, input_dataset):
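        """
        Perform inference while yielding partial responses as they are generated.

        For ChatGLM-style models the backend's stream_chat interface is used
        directly; otherwise text is generated in chunks of `token_per_step` tokens
        until `max_new_tokens` is reached or `end_string` appears in the output.
        Yields (response_so_far, flag_break) pairs, where flag_break indicates that
        `end_string` was found and streaming should stop.
        """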
        response = ""
        history = []
        if "ChatGLMModel" in self.config.architectures:
            for response, history in model.get_backend_model().stream_chat(model.get_tokenizer(), context, history=history):
                response = rstrip_partial_utf8(response)
                yield response, False
        else:
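            # Non-streaming backends: generate token_per_step tokens at a time, append
            # each chunk to the running response, and extend the input dataset so the
            # next chunk is conditioned on everything generated so far.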
            for _ in range(0, max_new_tokens // token_per_step):
                output_dataset = self.inference(
                    model=model,
                    dataset=input_dataset,
                    max_new_tokens=token_per_step,
                    temperature=temperature,
                )

                new_append_text = output_dataset.to_dict()["instances"][0]["text"]
                new_append_text = rstrip_partial_utf8(new_append_text)
                response += new_append_text

                input_dict = input_dataset.to_dict()
                input_dict["instances"][0]["text"] += new_append_text

                input_dataset = input_dataset.from_dict(input_dict)

                # If end_string already appears in the accumulated response, truncate at it
                # and stop streaming; otherwise append end_string temporarily so the same
                # slicing below returns the untruncated response with flag_break=False.
                flag_break = False
                try:
                    index = response.index(end_string)
                    flag_break = True
                except ValueError:
                    response += end_string
                    index = response.index(end_string)

                response = response[:index]

                yield response, flag_break