import csv
import os

import torch
import yaml
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Allow MPS to use all available memory (macOS only; harmless elsewhere)
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'


def load_data_and_config(data_path, config_path="config.yaml"):
    """Loads training data from CSV and run configuration from YAML."""
    data = []
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')  # Ensure delimiter matches your CSV file
        for row in reader:
            data.append({'text': row['description']})  # CSV column is named 'description'

    # Assumed config file: a YAML document defining at least 'api_name' and 'base_url',
    # which are read in the __main__ block below. Adjust the path to your setup.
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f)

    return data, config


def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generates an API query using a fine-tuned model."""
    input_ids = tokenizer.encode(
        prompt + f" Write an API query to {api_name} to get {desired_output}",
        return_tensors="pt",
    )
    # do_sample=True is required for temperature to have an effect
    output = model.generate(input_ids, max_length=256, do_sample=True, temperature=0.7)
    query = tokenizer.decode(output[0], skip_special_tokens=True)
    return f"{base_url}/{query}"


def train_model(model, tokenizer, data):
    """Trains the model using the Hugging Face Trainer API."""
    # Tokenize the data; for causal LM fine-tuning the labels are the input ids themselves
    inputs = [
        tokenizer(d['text'], max_length=512, truncation=True, padding='max_length', return_tensors="pt")
        for d in data
    ]
    dataset = Dataset.from_dict({
        'input_ids': [x['input_ids'].squeeze() for x in inputs],  # remove extra batch dimension
        'attention_mask': [x['attention_mask'].squeeze() for x in inputs],
        'labels': [x['input_ids'].squeeze() for x in inputs],
    })

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    # The Trainer handles the training loop internally
    trainer.train()

    # Optionally clear the accelerator cache if using GPU or MPS
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

    # Save the final checkpoint (logs are written to logging_dir during training)
    trainer.save_model()


if __name__ == "__main__":
    # Load data and configuration
    data, config = load_data_and_config("train2.csv")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("google/codegemma-7b-it")
    model = AutoModelForCausalLM.from_pretrained("google/codegemma-7b-it")

    # Train the model on your dataset
    train_model(model, tokenizer, data)

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")

    # Example usage
    prompt = "I need to retrieve the latest block on chain using a python script"
    api_query = generate_api_query(
        model, tokenizer, prompt, "latest block on chain", config["api_name"], config["base_url"]
    )
    print(f"Generated code: {api_query}")