""" | |
Hugging Face model interface for code generation fine-tuning. | |
""" | |
import streamlit as st | |
import pandas as pd | |
import torch | |
from transformers import ( | |
AutoTokenizer, | |
AutoModelForSeq2SeqLM, | |
Trainer, | |
TrainingArguments, | |
DataCollatorForSeq2Seq, | |
) | |
from datasets import Dataset | |
import numpy as np | |
import time | |
import os | |
from pathlib import Path | |
import uuid | |
import json | |
def load_model_and_tokenizer(model_name):
    """
    Load a pre-trained model and tokenizer from Hugging Face.

    Args:
        model_name: Name of the model on the Hugging Face Hub
            (e.g., 'Salesforce/codet5-base').

    Returns:
        Tuple of (tokenizer, model).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model
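
# Example usage (a sketch; 'Salesforce/codet5-small' is just one seq2seq
# checkpoint compatible with AutoModelForSeq2SeqLM, used here for illustration):
#
#   tokenizer, model = load_model_and_tokenizer("Salesforce/codet5-small")
#   model.to("cuda" if torch.cuda.is_available() else "cpu")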
def preprocess_code_dataset(dataset_df, tokenizer, max_input_length=256,
                            max_target_length=256, task_prefix=""):
    """
    Preprocess the code dataset for fine-tuning.

    Args:
        dataset_df: Pandas DataFrame with 'input' and 'target' columns.
        tokenizer: Hugging Face tokenizer.
        max_input_length: Maximum length for input sequences.
        max_target_length: Maximum length for target sequences.
        task_prefix: Prefix to add to inputs (e.g., "translate code to comment: ").

    Returns:
        Tuple of (train_dataset, eval_dataset), tokenized and ready for training.
    """
    def preprocess_function(examples):
        inputs = [task_prefix + text for text in examples["input"]]
        targets = examples["target"]
        # Leave padding to the data collator so batches are padded dynamically
        # and padded label positions can be masked out of the loss.
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
        # Tokenize targets; the `text_target` argument replaces the deprecated
        # `as_target_tokenizer()` context manager.
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Convert the DataFrame to a Hugging Face Dataset.
    hf_dataset = Dataset.from_pandas(dataset_df)

    # Split the dataset into train and validation sets.
    splits = hf_dataset.train_test_split(test_size=0.1)
    train_dataset = splits["train"]
    eval_dataset = splits["test"]

    # Apply preprocessing.
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["input", "target"],
    )
    eval_dataset = eval_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["input", "target"],
    )
    return train_dataset, eval_dataset
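
# Example (a minimal sketch with a toy dataset; real fine-tuning data needs
# enough rows for the 90/10 split to produce a non-empty validation set):
#
#   df = pd.DataFrame({
#       "input": ["def add(a, b): return a + b", "def neg(x): return -x"] * 10,
#       "target": ["Add two numbers.", "Negate a value."] * 10,
#   })
#   train_ds, eval_ds = preprocess_code_dataset(
#       df, tokenizer, task_prefix="translate code to comment: "
#   )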
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir, training_args):
    """
    Set up the Trainer for fine-tuning.

    Args:
        model: Hugging Face model.
        tokenizer: Hugging Face tokenizer.
        train_dataset: Preprocessed training dataset.
        eval_dataset: Preprocessed evaluation dataset.
        output_dir: Directory to save the model and checkpoints.
        training_args: Dictionary of training arguments.

    Returns:
        Hugging Face Trainer.
    """
    # Define training arguments.
    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=training_args.get("batch_size", 8),
        per_device_eval_batch_size=training_args.get("batch_size", 8),
        learning_rate=training_args.get("learning_rate", 5e-5),
        num_train_epochs=training_args.get("epochs", 3),
        weight_decay=training_args.get("weight_decay", 0.01),
        evaluation_strategy="epoch",  # renamed to `eval_strategy` in newer transformers releases
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        gradient_accumulation_steps=training_args.get("gradient_accumulation", 1),
        warmup_steps=training_args.get("warmup_steps", 100),
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
    )

    # Data collator: pads inputs with the pad token and labels with -100 so
    # that padded label positions are ignored by the loss.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8,
    )

    # Initialize the Trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    return trainer
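
# Example (a sketch; the keys shown are the ones this helper reads, all of
# which fall back to defaults if omitted):
#
#   trainer = setup_trainer(
#       model, tokenizer, train_ds, eval_ds,
#       output_dir="outputs/codet5-finetuned",
#       training_args={"batch_size": 4, "epochs": 1, "learning_rate": 3e-5},
#   )
#   trainer.train()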
def generate_code_comment(model, tokenizer, code, max_length=100,
                          task_prefix="translate code to comment: "):
    """
    Generate a comment for a given code snippet.

    Args:
        model: Fine-tuned model.
        tokenizer: Tokenizer.
        code: Input code snippet.
        max_length: Maximum length of the generated comment.
        task_prefix: Prefix to add to the input.

    Returns:
        Generated comment as a string.
    """
    inputs = tokenizer(task_prefix + code, return_tensors="pt", truncation=True)
    # Move inputs to the same device as the model.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Generate without tracking gradients; unpack the dict so the attention
    # mask is passed along with the input ids.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )
    comment = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return comment
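
# Example (illustrative input only; assumes `model` and `tokenizer` were
# loaded with load_model_and_tokenizer above):
#
#   comment = generate_code_comment(model, tokenizer,
#                                   "def square(x):\n    return x * x")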
def generate_code_from_comment(model, tokenizer, comment, max_length=200,
                               task_prefix="translate comment to code: "):
    """
    Generate code from a given comment/description.

    Args:
        model: Fine-tuned model.
        tokenizer: Tokenizer.
        comment: Input comment or description.
        max_length: Maximum length of the generated code.
        task_prefix: Prefix to add to the input.

    Returns:
        Generated code as a string.
    """
    inputs = tokenizer(task_prefix + comment, return_tensors="pt", truncation=True)
    # Move inputs to the same device as the model.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Generate without tracking gradients; unpack the dict so the attention
    # mask is passed along with the input ids.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )
    code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return code
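
# Example (illustrative; the prompt wording is an assumption, matching the
# default task prefix this function prepends):
#
#   code = generate_code_from_comment(model, tokenizer,
#                                     "Return the square of a number.")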
def save_training_config(output_dir, config):
    """
    Save the training configuration to a JSON file.

    Args:
        output_dir: Directory to save the configuration in.
        config: Dictionary with the training configuration.
    """
    os.makedirs(output_dir, exist_ok=True)
    config_path = os.path.join(output_dir, "training_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
def load_training_config(model_dir):
    """
    Load the training configuration from a JSON file.

    Args:
        model_dir: Directory containing the saved model.

    Returns:
        Dictionary with the training configuration (empty if none is found).
    """
    config_path = os.path.join(model_dir, "training_config.json")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            return json.load(f)
    return {}
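
# A minimal end-to-end sketch tying the helpers together. The checkpoint name
# and the toy dataset are assumptions for illustration only; a real run needs
# far more data (and ideally a GPU), plus network access to download the model.
if __name__ == "__main__":
    # Toy dataset: pairs repeated so the 90/10 split yields a non-empty eval set.
    pairs = [
        ("def add(a, b):\n    return a + b", "Add two numbers."),
        ("def is_even(n):\n    return n % 2 == 0", "Check whether a number is even."),
    ] * 10
    df = pd.DataFrame(pairs, columns=["input", "target"])

    # 'Salesforce/codet5-small' is one compatible checkpoint, not the only choice.
    tokenizer, model = load_model_and_tokenizer("Salesforce/codet5-small")

    prefix = "translate code to comment: "
    train_ds, eval_ds = preprocess_code_dataset(df, tokenizer, task_prefix=prefix)

    config = {"batch_size": 4, "learning_rate": 5e-5, "epochs": 1}
    output_dir = "outputs/demo-run"
    trainer = setup_trainer(model, tokenizer, train_ds, eval_ds, output_dir, config)
    trainer.train()

    save_training_config(output_dir, config)
    print(generate_code_comment(model, tokenizer, "def mul(a, b):\n    return a * b"))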