import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import pandas as pd
print("CUDA Available:", torch.cuda.is_available()) |
|
print("Current Device:", torch.cuda.current_device()) |
|
print("Device Name:", torch.cuda.get_device_name(0)) |
|
|
|
|
|


def load_custom_dataset(file_path):
    """Load a CSV of training examples; it must contain a 'text' column."""
    # Validate with pandas first so a malformed file fails fast with a
    # clear error before any heavy lifting starts.
    df = pd.read_csv(file_path)
    if 'text' not in df.columns:
        raise ValueError("CSV must have a 'text' column")
    dataset = load_dataset('csv', data_files=file_path, split='train')
    return dataset
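
# Illustrative (hypothetical) shape of instructions.csv: one 'text' column,
# one training example per row. A toy row might look like:
#
#     text
#     "### Playbook:\n- hosts: all\n  tasks: []\n### Review: Give every task a name."
#
# The real file is whatever review data was collected; the prompt format
# above is only an assumption for illustration.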

model_name = "codellama/CodeLlama-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama tokenizers ship without a pad token; the data collator below needs
# one for batch padding, so reuse the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Caveat: fully fine-tuning weights loaded in float16 together with fp16
# mixed precision can raise "Attempting to unscale FP16 gradients";
# loading in float32 (or a parameter-efficient method such as LoRA)
# avoids this.


def tokenize_function(examples):
    # Truncate to 1024 tokens so every example fits the training context.
    return tokenizer(examples['text'], truncation=True, max_length=1024)


dataset = load_custom_dataset('instructions.csv')
# remove_columns drops the raw 'text' field so only tokenized inputs
# reach the data collator.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
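
# Optional sanity check: confirm the first example tokenized within the
# max_length budget before committing to a full training run.
print("first example length:", len(tokenized_dataset[0]["input_ids"]))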

training_args = TrainingArguments(
    output_dir="./ansible-review-model",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    fp16=True,
    logging_dir='./logs',
)

# mlm=False selects causal (next-token) language modeling: the collator
# copies input_ids into labels and masks padding positions with -100 so
# they are ignored by the loss.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
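
# Optional illustration (assumes the dataset has at least two rows):
# collate two examples to see the padded batch the Trainer will feed the
# model; labels mirror input_ids with -100 at pad positions.
demo_batch = data_collator([tokenized_dataset[i] for i in range(2)])
print("batch shapes:", demo_batch["input_ids"].shape, demo_batch["labels"].shape)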

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("./ansible-review-model")
tokenizer.save_pretrained("./ansible-review-model")

print("Training Complete!")