aidando73 committed on
Commit
f028aae
·
1 Parent(s): 3ee8139
Files changed (2) hide show
  1. handler.py +30 -0
  2. test-handler.py +12 -0
handler.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ from unsloth import FastLanguageModel
3
+
4
class EndpointHandler():
    """Inference-endpoint handler for the fine-tuned Llama-3.3-70B code-agent model.

    Loads the 4-bit quantized model once at construction time so that each
    request only pays for generation, not for model loading.
    """

    def __init__(self, path=""):
        # Preload everything needed at inference. `path` is accepted for
        # interface compatibility with the HF endpoint contract, but the
        # weights are fetched by repo id, not from `path`.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "aidando73/llama-3.3-70b-instruct-code-agent-fine-tune-v1",
            max_seq_length = 2048,
            dtype = "float16",
            load_in_4bit = True,
        )
        # Switch unsloth's optimized inference mode on (disables training paths).
        FastLanguageModel.for_inference(model)
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate a completion for the prompt in ``data``.

        Args:
            data: request payload with
                inputs (str): the prompt text (required; a ``KeyError`` is
                    raised if missing, matching the original behavior).
                parameters (dict, optional): generation overrides forwarded to
                    ``model.generate`` (e.g. ``max_new_tokens``) — the
                    standard HF inference-endpoint payload convention.

        Return:
            A single-element list: ``[{"output": <generated text>}]``,
            which will be serialized and returned.
        """
        # Backward-compatible generalization: default max_new_tokens=128 as
        # before, but allow callers to override via an optional "parameters" dict.
        gen_kwargs = {"max_new_tokens": 128, **data.get("parameters", {})}
        # Silence the missing-pad-token warning unless the caller overrides it.
        gen_kwargs.setdefault("pad_token_id", self.tokenizer.eos_token_id)

        # Fix: place inputs on the same device as the model instead of
        # hard-coding "cuda" (breaks on CPU-only or multi-device setups).
        input_ids = self.tokenizer.encode(data["inputs"], return_tensors = "pt").to(self.model.device)
        output = self.model.generate(input_ids, **gen_kwargs)
        return [{"output": self.tokenizer.decode(output[0], skip_special_tokens = True)}]
test-handler.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from handler import EndpointHandler


def main() -> None:
    """Smoke-test the endpoint handler with a trivial prompt."""
    # Init handler; "." is passed for the HF handler contract (weights are
    # actually fetched by repo id inside the handler).
    my_handler = EndpointHandler(path=".")

    # Prepare a sample payload. NOTE: renamed from `input` so the builtin
    # `input()` is not shadowed.
    payload = {"inputs": "Hello World"}

    # Test the handler end-to-end and show the result.
    output = my_handler(payload)
    print("output", output)


# Guard the entry point so importing this module has no side effects
# (the original ran the model load at import time).
if __name__ == "__main__":
    main()