Spaces:
Paused
Paused
File size: 1,593 Bytes
dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 6ccd417 dae67e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from transformers import T5TokenizerFast, T5ForConditionalGeneration, GenerationConfig
class T5:
    """Inference wrapper around a fine-tuned T5 seq2seq checkpoint.

    Loads the model, its fast tokenizer and a generation config from
    ``model_dir``, registers a handful of extra placeholder tokens, and
    exposes :meth:`generate` to turn one input string into generated text.
    """

    def __init__(
        self,
        model_dir: str = './models/pko_t5_COMU_patience10',
        max_input_length: int = 64,
        max_target_length: int = 64,
    ):
        """Load model, tokenizer and generation config from ``model_dir``.

        Args:
            model_dir: Directory holding the checkpoint, the tokenizer files
                and a ``gen_config.json`` generation config.
            max_input_length: Token limit applied (with truncation) to the
                prompt in :meth:`generate`.
            max_target_length: Stored on the instance and written to
                ``model.config.max_length`` and
                ``tokenizer.model_max_length``.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(model_dir)
        self.tokenizer = T5TokenizerFast.from_pretrained(model_dir)
        self.gen_config = GenerationConfig.from_pretrained(model_dir, 'gen_config.json')
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        # Prompt template; "<INPUT>" is substituted with the user text.
        self.INPUT_FORMAT = 'qa question: <INPUT>'

        # Extra placeholder tokens: 화자 = speaker, 청자 = listener,
        # 남자 = male, 여자 = female.
        # NOTE(review): these literals were previously stored mis-decoded
        # (mojibake); confirm they match the tokens used at training time.
        # NOTE(review): the last token has no surrounding '#', unlike the
        # others. It is kept as-is because changing it would register a
        # different token -- confirm whether that is intentional.
        self.tokenizer.add_tokens([
            "#화자#",
            "#청자#",
            "#(남자)청자#",
            "#(남자)화자#",
            "#(여자)청자#",
            "(여자)화자",
        ])
        # Keep the embedding matrix in sync with the (possibly grown) vocab.
        self.model.resize_token_embeddings(len(self.tokenizer))

        self.model.config.max_length = max_target_length
        self.tokenizer.model_max_length = max_target_length

    def generate(self, inputs: str) -> str:
        """Generate text for a single input string.

        The input is inserted into ``INPUT_FORMAT``, tokenized (truncated to
        ``max_input_length`` tokens) and passed to ``model.generate`` with the
        loaded generation config.

        Args:
            inputs: Raw user text.

        Returns:
            The decoded text. If generation yields several sequences they are
            joined with ``", "``.
        """
        prompt = self.INPUT_FORMAT.replace("<INPUT>", inputs)
        encoded = self.tokenizer(
            prompt,
            max_length=self.max_input_length,
            truncation=True,
            return_tensors="pt",
        )
        output_tensor = self.model.generate(**encoded, generation_config=self.gen_config)
        decoded = self.tokenizer.batch_decode(
            output_tensor,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        # Join the decoded strings directly. Going through str(list) and
        # stripping brackets/quotes would delete those characters from the
        # generated text itself and leak repr escapes into the result.
        return ", ".join(decoded)