import os

def read_txt(file_path):
    """Return the contents of a text file, or an empty string if it cannot be read."""
    try:
        with open(file_path, "r") as file:
            return file.read()
    except OSError:
        return ""
with open("train.txt", "w") as f:
f.write('')
data = ""
for filename in os.listdir("./"):
file_path = os.path.join("./", filename)
if file_path.endswith(".txt") and (file_path != "train.txt"):
data += read_txt(file_path)
data = ' '.join(data.split('\n'))
with open("train.txt", "a") as f:
f.write(data)
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

def load_dataset(file_path, tokenizer, block_size=128):
    # TextDataset chunks the raw text file into fixed-size blocks of token ids.
    # It is deprecated in recent transformers releases; a replacement sketch
    # using the `datasets` library follows below.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset
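
# A minimal sketch of the non-deprecated alternative, using the Hugging Face
# `datasets` library (an assumption: `datasets` must be installed separately).
# The hypothetical load_dataset_hf mirrors load_dataset and is not called below.
def load_dataset_hf(file_path, tokenizer, block_size=128):
    from datasets import load_dataset as hf_load_dataset
    raw = hf_load_dataset("text", data_files=file_path, split="train")
    tokenized = raw.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=["text"],
    )
    def group(batch):
        # Concatenate all token ids, then split them into block_size chunks;
        # the data collator (mlm=False) derives the labels at batch time.
        ids = sum(batch["input_ids"], [])
        total = (len(ids) // block_size) * block_size
        return {"input_ids": [ids[i:i + block_size] for i in range(0, total, block_size)]}
    return tokenized.map(group, batched=True, remove_columns=tokenized.column_names)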

def load_data_collator(tokenizer, mlm=False):
    # mlm=False configures the collator for causal language modeling
    # (next-token prediction), which is what GPT-2 is trained with.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

def train(train_file_path,
          model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save the tokenizer and the base model up front so output_dir always
    # holds a loadable checkpoint alongside the trained weights.
    tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # checkpoint every save_steps optimizer steps
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model()

train_file_path = "train.txt"
model_name = "gpt2"
output_dir = "model"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
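
# If a run is interrupted, training can resume from the newest checkpoint in
# output_dir: rebuild the Trainer exactly as in train() and call (a sketch)
#     trainer.train(resume_from_checkpoint=True)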

# Reload the fine-tuned model and tokenizer from the output directory.
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(sequence, return_tensors="pt")
    final_outputs = model.generate(
        ids,
        do_sample=True,   # sample from the distribution instead of greedy decoding
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,  # GPT-2 has no pad token; reuse EOS
        top_k=50,         # restrict sampling to the 50 most likely tokens
        top_p=0.95,       # nucleus sampling: keep tokens covering 95% of the mass
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))
model_path = "/model/"
sequence = "Hello!"
max_len = 50
generate_text(model_path, sequence, max_len) |
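
# A deterministic counterpart to generate_text, sketched for comparison; the
# name generate_text_greedy is hypothetical and not part of the original script.
def generate_text_greedy(model_path, sequence, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(sequence, return_tensors="pt")
    # Greedy decoding: always pick the highest-probability next token, so the
    # same prompt yields the same completion on every call.
    outputs = model.generate(
        ids,
        do_sample=False,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))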