import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
import pandas as pd

# Check GPU availability; guard the CUDA-only calls so the script also runs
# on a CPU-only machine instead of crashing here
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Current Device:", torch.cuda.current_device())
    print("Device Name:", torch.cuda.get_device_name(0))

# Load and prepare dataset
def load_custom_dataset(file_path):
    # Read only the CSV header for validation; load_dataset below does the real load
    df = pd.read_csv(file_path, nrows=0)

    # Ensure 'text' column exists
    if 'text' not in df.columns:
        raise ValueError("CSV must have a 'text' column")

    # Convert to a Hugging Face dataset
    dataset = load_dataset('csv', data_files=file_path, split='train')
    return dataset
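
# Illustration only: 'instructions.csv' is expected to have a single 'text'
# column. A hypothetical file that satisfies load_custom_dataset could be
# created like this (the prompt/response format shown here is an assumption,
# not something the original script prescribes):
#
#   pd.DataFrame({
#       "text": [
#           "### Playbook:\n- hosts: all\n  tasks: []\n### Review:\nLooks fine.",
#       ]
#   }).to_csv("instructions.csv", index=False)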

# Model and Tokenizer Setup
model_name = "codellama/CodeLlama-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama-family tokenizers ship without a pad token; the data collator needs one
# to pad batches, so fall back to the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 for memory efficiency
    device_map="auto"  # Automatic device mapping
)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=1024)

# Prepare dataset
dataset = load_custom_dataset('instructions.csv')
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],  # keep only token ids / attention masks for the collator
)
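
# Quick sanity check (illustrative): decode the start of the first tokenized
# example to confirm the text survives tokenization as expected
print("Tokenized examples:", len(tokenized_dataset))
print("First example preview:", tokenizer.decode(tokenized_dataset[0]['input_ids'][:50]))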

# Training Arguments
training_args = TrainingArguments(
    output_dir="./ansible-review-model",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    # Note: fully fine-tuning weights that were loaded in float16 together with
    # fp16=True mixed precision can fail with a gradient-unscaling error in some
    # Transformers versions; bfloat16 weights plus bf16=True is a common
    # alternative on GPUs that support it
    fp16=True,  # Use mixed precision
    logging_dir='./logs',
)

# Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False  # For causal language modeling
)
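
# Illustrative check (assumes the dataset has at least two rows): with mlm=False
# the collator pads a small batch and copies input_ids into labels for causal LM
sample_batch = data_collator([tokenized_dataset[i] for i in range(2)])
print("Collated batch shapes:", {k: tuple(v.shape) for k, v in sample_batch.items()})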

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start Training
trainer.train()

# Save Model and Tokenizer
trainer.save_model("./ansible-review-model")
tokenizer.save_pretrained("./ansible-review-model")

print("Training Complete!")