|
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Note: "gpt-3.5-turbo" is an OpenAI API model and cannot be downloaded or
# fine-tuned with transformers, so a local open causal LM is assumed here
# ("gpt2" is a placeholder choice, not a requirement).
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2's tokenizer has no padding token; reuse the end-of-sequence token so
# the data collator can pad line-by-line batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
# Path to the training text; assumed to be the macOS form of the original
# "machintoshhd/users/izerkoen/downloads/content.txt".
filename = "/Users/izerkoen/Downloads/content.txt"

# LineByLineTextDataset reads and tokenizes the file itself, so the path is
# passed directly rather than reading the text into memory first.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=filename,
    block_size=128,
)
|
# Collator for causal-LM training; mlm=False disables masked-LM token masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
)
|
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()
|
# Prompt (Turkish), roughly: "What kind of trek assistant are you?"
input_text = "sen nasil bir trek asistanisin?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# pad_token_id is set explicitly to avoid the open-ended generation warning.
output = model.generate(input_ids, max_length=50, do_sample=True, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
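
# Optional sketch: persist the fine-tuned weights and tokenizer so they can be
# reloaded later with from_pretrained. The "./results/final" directory name is
# an assumption, not part of the original script.
trainer.save_model("./results/final")
tokenizer.save_pretrained("./results/final")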
|