File size: 1,565 Bytes

37b0e2d
 
 
377986e
37b0e2d
 
 
b4751ce
37b0e2d
0519c83
76a36d6
c5d461f
b4751ce
76a36d6
 
 
37b0e2d
76a36d6
 
 
 
37b0e2d
76a36d6
 
37b0e2d

from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig


class EndpointHandler:
    """Callable text-generation handler around a causal LM and a tokenizer.

    An instance is built once from a model location and is then called with
    a request payload dict; each call tokenizes the input, runs
    ``generate`` on the model and returns the decoded text.
    """

    def __init__(self, path: str = ""):
        """Load the tokenizer and the model.

        Args:
            path: Location handed to ``AutoConfig.from_pretrained`` and
                ``AutoModelForCausalLM.from_pretrained``. The tokenizer is
                NOT read from ``path``; it is always loaded from the
                ``EleutherAI/gpt-neox-20b`` repository.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/gpt-neox-20b", padding_side="left"
        )

        config = AutoConfig.from_pretrained(path, trust_remote_code=True)
        # NOTE: the attention implementation is left at the config default;
        # a 'triton' attn_impl override was previously tried here and
        # disabled.
        config.init_device = "cuda:0"  # For fast initialization directly on GPU!
        config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096

        # The config MUST be passed by keyword: positional arguments after
        # the path are treated as model-constructor args by
        # ``from_pretrained``, so the overrides above would not be applied
        # as the model config.
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            config=config,
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        # Hard-coded to CUDA to match ``config.init_device`` above; the
        # tokenized inputs are moved to this device before generation.
        self.device = "cuda"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate text for one request payload.

        Args:
            data: Request payload. ``data["inputs"]`` is passed to the
                tokenizer; if the key is absent the payload itself is used
                as the input. The optional ``data["parameters"]`` mapping
                is forwarded as keyword arguments to ``model.generate``.
                Both keys are popped, so ``data`` is mutated.

        Returns:
            A one-element list ``[{"generated_text": <text>}]``. Only the
            first sequence returned by ``generate`` is decoded.
        """
        # process input
        inputs = data.pop("inputs", data)
        # A missing or null "parameters" entry means "use generate defaults".
        parameters = data.pop("parameters", None) or {}

        # preprocess: tokenize and move the tensors to the model's device
        encoded = self.tokenizer(inputs, return_tensors="pt").to(self.device)

        # generate, forwarding any caller-supplied generation kwargs
        outputs = self.model.generate(**encoded, **parameters)

        # postprocess the prediction
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return [{"generated_text": prediction}]