from typing import Any, Dict, List

import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the model and tokenizer with Unsloth. dtype=None lets Unsloth
        # auto-detect the best dtype for the hardware, and load_in_4bit=True
        # loads 4-bit quantized weights to reduce GPU memory use.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True,
        )
        # Switch Unsloth into its optimized inference mode.
        FastLanguageModel.for_inference(self.model)

        # Attach the Llama-3 chat template, remapping ShareGPT-style keys
        # ("from"/"value", with "human"/"gpt" roles) onto it.
        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template="llama-3",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        )
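
    # __call__ accepts the standard Inference Endpoints payload shape, e.g.
    #   {"inputs": "Hi there", "parameters": {"max_new_tokens": 64}}
    # or a chat-style history:
    #   {"inputs": [{"role": "user", "content": "Hi there"}]}
    # Both forms are normalized into ShareGPT-style messages below.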
    def __call__(self, data: Dict[str, Any]) -> List[str]:
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

        # Generation settings, with conservative defaults.
        max_tokens = parameters.get("max_new_tokens", 512)
        temperature = parameters.get("temperature", 0.2)
        top_p = parameters.get("top_p", 0.5)
        system_message = parameters.get("system_message", "")

        # Build the ShareGPT-style message list. The mapping above defines no
        # system role, so a non-empty system message is sent as a leading
        # "human" turn; an empty one is skipped rather than added as a blank turn.
        messages = []
        if system_message:
            messages.append({"from": "human", "value": system_message})
        if isinstance(inputs, str):
            messages.append({"from": "human", "value": inputs})
        elif isinstance(inputs, list):
            for msg in inputs:
                role = "human" if msg["role"] == "user" else "gpt"
                messages.append({"from": role, "value": msg["content"]})

        # Render the chat template and tokenize in one step;
        # add_generation_prompt appends the assistant header so the model
        # answers as the assistant.
        tokenized_input = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(self.model.device)
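
        # For reference, the rendered Llama-3 template wraps each turn roughly
        # as "<|start_header_id|>{role}<|end_header_id|>\n\n{text}<|eot_id|>".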

        # HF generate defaults to greedy decoding, which silently ignores
        # temperature/top_p, so sampling is enabled explicitly.
        with torch.no_grad():
            output = self.model.generate(
                input_ids=tokenized_input,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                use_cache=True,
            )

        # Decode only the newly generated tokens (everything after the
        # prompt) so multi-line answers are returned intact.
        prompt_length = tokenized_input.shape[-1]
        response = self.tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()

        return [response]
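

# A minimal local smoke test (not part of the Endpoints contract); the model
# path and prompt here are illustrative assumptions, not values from the
# original handler.
if __name__ == "__main__":
    handler = EndpointHandler(path="unsloth/llama-3-8b-Instruct-bnb-4bit")
    print(handler({
        "inputs": "What is 2 + 2?",
        "parameters": {"max_new_tokens": 32},
    }))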