import gc

import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from plugins.scansite import ScansitePlugin

torch.cuda.empty_cache()


class PreferenceDataset(Dataset):
    """Wraps (url, title, score) triples and tokenizes the title on access."""

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        url, title, score = self.data[idx]
        encoded = self.tokenizer(
            title,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Squeeze the batch dimension added by return_tensors="pt".
        return (
            {key: val.squeeze(0) for key, val in encoded.items()},
            torch.tensor(score, dtype=torch.float),
        )


def collate_fn(batch):
    """Stack tokenized features and target scores into batch tensors."""
    input_ids = torch.stack([item[0]['input_ids'] for item in batch])
    attention_mask = torch.stack([item[0]['attention_mask'] for item in batch])
    scores = torch.stack([item[1] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask}, scores


def finetune(model_name='nomic-ai/nomic-embed-text-v1',
             output_model_name="embeddings-ft",
             num_epochs=2,
             learning_rate=2e-5,
             weight_decay=0.01,
             batch_size=8,
             num_warmup_steps=0):
    print(f"Fine-tuning parameters:\n"
          f"num_epochs: {num_epochs}\n"
          f"learning rate (lr): {learning_rate}\n"
          f"weight_decay: {weight_decay}\n"
          f"batch_size: {batch_size}\n"
          f"model_name: {model_name}\n"
          f"num_warmup_steps: {num_warmup_steps}")
    # Note: num_warmup_steps is only reported here; the LinearLR scheduler below
    # does not implement a warmup phase.

    # Build the training set from the Scansite reference data: validated items
    # keep a rescaled rating as their target, rejected items get a target of 0.0.
    scansite_plugin = ScansitePlugin("scansite", None)
    reference_data_valid, reference_data_rejected = scansite_plugin.get_reference_data()
    valid_data_with_scores = [(url, title, (score - 1) / 8 + 0.5)
                              for url, title, score in reference_data_valid]
    rejected_data_with_scores = [(url, title, 0.0) for url, title in reference_data_rejected]
    all_data = valid_data_with_scores + rejected_data_with_scores

    model = SentenceTransformer(model_name, trust_remote_code=True)
    tokenizer = model.tokenizer
    dataset = PreferenceDataset(all_data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    total_steps = len(dataloader) * num_epochs
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer, start_factor=1.0, end_factor=0.1, total_iters=total_steps
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_data, scores = batch
            input_data = {k: v.to(device) for k, v in input_data.items()}
            scores = scores.to(device)

            optimizer.zero_grad()
            embeddings = model(input_data)['sentence_embedding']

            # Collapse each L2-normalized embedding to a single scalar by summing
            # its components (up to a constant factor, this equals its cosine
            # similarity with the all-ones vector) ...
            embeddings_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            cosine_similarities = torch.sum(embeddings_norm, dim=1)

            # ... and regress that scalar onto the target score.
            loss = loss_function(cosine_similarities, scores)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            # Free per-batch tensors aggressively to limit GPU memory pressure.
            del embeddings, cosine_similarities
            torch.cuda.empty_cache()
            gc.collect()

    model.save(output_model_name)
    print("Fine-tuning finished and model saved.")


if __name__ == "__main__":
    finetune()
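
# Usage sketch (an assumption, not part of the original script): once finetune()
# has run, the directory written by model.save() can be reloaded with
# SentenceTransformer and used to embed new titles, for example:
#
#   from sentence_transformers import SentenceTransformer
#   ft_model = SentenceTransformer("embeddings-ft", trust_remote_code=True)
#   vectors = ft_model.encode(["Some article title"], normalize_embeddings=True)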