#!python
# -*- coding: utf-8 -*-
# @author: Kun

from transformers import TextStreamer

from models.baichuan_hf import max_token, temperature, top_p
from common import torch_gc
from global_config import lang_opt


def get_api_response(model, tokenizer, content: str, max_tokens=None):
    # NOTE: max_tokens is unused here; the generation length is governed by
    # the max_token value imported from models.baichuan_hf. The parameter is
    # kept so the signature matches the other backends' get_api_response.

    # Select a system prompt for the configured language. (The value is
    # currently only selected, not injected into the prompt below.)
    if "en" == lang_opt:
        system_role_content = 'You are a helpful and creative assistant for writing novels.'
    elif "zh1" == lang_opt:
        # English system prompt that instructs the model to reply in Chinese
        # ("重要,你需要使用中文与我进行交流" = "Important: you must communicate
        # with me in Chinese").
        system_role_content = 'You are a helpful and creative assistant for writing novels. ' \
            'You must always answer in Chinese.重要,你需要使用中文与我进行交流。'
    elif "zh2" == lang_opt:
        # Chinese system prompt: "You are a helpful, creative assistant for
        # writing novels."
        system_role_content = '你是写小说的好帮手,有创意的助手。'
    else:
        raise Exception(f"not supported language: {lang_opt}")

    print("===> Question:")
    print(content)
    print("<===" + "=" * 100)

    # Stream tokens to stdout as they are generated, hiding the echoed
    # prompt and any special tokens.
    streamer = TextStreamer(tokenizer,
                            skip_prompt=True,
                            skip_special_tokens=True)

    # inputs = tokenizer(content, return_tensors='pt')
    inputs = tokenizer(":{}\n:".format(content), return_tensors='pt')
    # inputs = inputs.to('cuda')
    # Keep the input tensors on the same device as the model: generate()
    # warns (and may misbehave or run slowly) if input_ids live on cuda
    # while the model is on cpu.
    inputs = inputs.to('cpu')

    generate_ids = model.generate(**inputs,
                                  max_new_tokens=max_token,
                                  top_p=top_p,
                                  temperature=temperature,
                                  repetition_penalty=1.1,
                                  streamer=streamer,
                                  )

    response = tokenizer.decode(
        generate_ids.cpu()[0], skip_special_tokens=True)

    torch_gc()  # release cached accelerator memory after generation

    print("===> Generated Text: ")
    print(response)
    print("<===" + "=" * 100)

    return response
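
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): drives the helper
# above end to end. The checkpoint name "baichuan-inc/Baichuan-7B" and the
# AutoTokenizer/AutoModelForCausalLM loading pattern are assumptions for
# illustration; in this repo the model and tokenizer are expected to be
# constructed by models.baichuan_hf.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # trust_remote_code=True is needed because Baichuan checkpoints ship
    # their own modeling code.
    tokenizer = AutoTokenizer.from_pretrained(
        "baichuan-inc/Baichuan-7B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "baichuan-inc/Baichuan-7B", trust_remote_code=True)

    # Tokens stream to stdout while generating; the full text is returned.
    reply = get_api_response(
        model, tokenizer, "Write the opening paragraph of a mystery novel.")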