from typing import Dict, List, Any

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
import time


class EndpointHandler:
    def __init__(self, path="5iveDesignStudio/autotrain-TenderGPT-Festive-v2-0"):
        # Load the adapter config to find the base model, load the base model
        # with 4-bit NF4 quantization, then attach the PEFT adapter.
        config = PeftConfig.from_pretrained(path)
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            quantization_config=bnb_config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = PeftModel.from_pretrained(self.model, path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # def __call__(self, data: Any) -> Dict[str, Any]:
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Args:
            data (:obj:`dict`): The request payload, e.g.
                {"inputs": "some question", "parameters": {...}} containing:
                - "inputs": the question to answer
                - "parameters": optional generation parameters (currently unused)
        Return:
            A :obj:`list` with a single element like
            {"generated_text": "some answer", "time": "..."} containing:
            - "generated_text": the answer to the question
            - "time": how long generation took
        """
        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        prompt = f"""Below is an instruction that describes a task. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
>>TITLE<<: Tender Response.
>>CONTEXT<<: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe in a conversational tone. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
>>QUESTION<<: {inputs}
>>ANSWER<<:
""".strip()

        # preprocess: tokenize the prompt and move it to the model's device
        batch = self.tokenizer(
            prompt, padding=True, truncation=True, return_tensors="pt"
        ).to(self.device)

        # pass inputs with all kwargs in data
        # if parameters is not None:
        #     outputs = self.model.generate(**inputs, **parameters)
        # else:
        #     outputs = self.model.generate(**inputs)
        # postprocess the prediction
        # prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # configure generation defaults
        generation_config = self.model.generation_config
        generation_config.top_p = 0.75
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 140
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id

        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
            )
        end = time.time()

        generated_text = self.tokenizer.decode(output_tokens[0])

        # keep only the text between ">>ANSWER<<:" and ">>END<<"
        answer = generated_text.split('>>END<<')[0].split('>>ANSWER<<:')[1].strip()

        if "CONTEXT:" in answer:
            if "RESPONSE:" in answer:
                answerclean = answer.partition("RESPONSE:")[2]
            else:
                answerclean = "I'm sorry, but I'm not able to help with your tender topic."
        else:
            answerclean = answer

        if "<|endoftext|>" in answerclean:
            answerclean = answerclean.replace('<|endoftext|>', '')
        elif '.' in answerclean:
            # trim any trailing partial sentence after the last full stop
            last_full_stop = answerclean.rindex('.')
            answerclean = answerclean[0:last_full_stop + 1]

        prediction = {'generated_text': answerclean, 'time': f"{(end - start):.2f} s"}
        # prediction = {'generated_text': answerclean, 'time': f"{(end - start):.2f} s", 'complete_output': generated_text}

        result = []
        result.append(prediction)
        return result
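

# Example usage: a minimal local smoke test, not part of the Inference Endpoints
# contract. It assumes a CUDA-capable GPU, access to the model repository, and
# uses an illustrative question (the text below is only a placeholder). On a
# real endpoint the runtime instantiates EndpointHandler once and calls it with
# the parsed JSON body of each request.
if __name__ == "__main__":
    handler = EndpointHandler()
    response = handler({"inputs": "What experience do you have delivering projects of this kind?"})
    print(response[0]["generated_text"])
    print("time:", response[0]["time"])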