from pathlib import Path
import os
import time

import modal
from modal import App, Image, Volume

# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

VOL_MOUNT_PATH = Path("/vol")

cuda_version = "12.4.0"  # should be no greater than the host CUDA version
flavor = "devel"  # includes the full CUDA toolkit
_os = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{_os}"

# image = Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
image = Image.debian_slim(python_version="3.10")
image = image.pip_install(
    "accelerate",
    "transformers",
    "torch",
    "datasets",
    "tensorboard",
    "trl",
    "xformers",
    "bitsandbytes",
    "peft",
    "protobuf==3.20.*",
    "onnxruntime",
    "onnx",
    "setfit",
    "nltk",
    "firebase_admin",
    "openai",
    "evaluate",
    "sentencepiece",
    "pandas",
    "scikit-learn",
    "huggingface_hub",
)

# Note: prior to April 2024, "app" was called "stub".
app = App(name="finetune-run", image=image)

output_vol = Volume.from_name("finetune-volume", create_if_missing=True)


@app.function(gpu="any")
def gpu_function():
    import subprocess

    import torch

    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())


GPU_CONFIG = modal.gpu.A100(count=1, size="80GB")


@app.function(
    gpu=GPU_CONFIG,
    timeout=7200,
    volumes={VOL_MOUNT_PATH: output_vol},
    secrets=[modal.Secret.from_dotenv()],
)
def run_finetune(data):
    import subprocess

    import numpy as np
    import pandas as pd
    import torch
    from datasets import Dataset
    from evaluate import load as load_metric
    from sklearn.model_selection import train_test_split
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        EarlyStoppingCallback,
        EvalPrediction,
        Trainer,
        TrainingArguments,
    )

    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())

    # Persist the raw JSON payload inside the container, then load it as a DataFrame.
    with open("./features_ms_deberta_v3.json", "w") as f:
        f.write(data)
    df = pd.read_json("./features_ms_deberta_v3.json", lines=False)

    # Duplicate the dataset 50x to enlarge the (small) training corpus.
    df = pd.concat([df] * 50, ignore_index=True)

    HF_ORGANIZATION = "rafaelsandroni"
    token = os.getenv("HF_TOKEN")

    # Task for zero-shot classification (the model is fine-tuned NLI-style).
    task = "zero-shot-classification"

    # Candidate pre-trained models; the last assignment is the one used.
    model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    # model_name = "MoritzLaurer/bge-m3-zeroshot-v2.0"
    # model_name = "cross-encoder/nli-deberta-v3-base"
    model_name = "tasksource/deberta-small-long-nli"

    # Directory where output/results will be saved
    output_dir = "./"

    # Clear the CUDA cache to free up GPU memory
    torch.cuda.empty_cache()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Most common init
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
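    # Illustration (not part of the pipeline; the example strings are hypothetical):
    # NLI-style models take a (premise, hypothesis) pair encoded as one sequence, e.g.
    #
    #   enc = tokenizer("A man is eating.", "Someone is eating.", truncation=True)
    #
    # yields input_ids of the form [CLS] premise [SEP] hypothesis [SEP].
    # create_input_sequence below applies the same pair encoding batch-wise.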
    def create_input_sequence(sample):
        # Premise, hypothesis, and label columns of the batch
        text = sample["premise"]
        hypothesis = sample["hypothesis"]
        label = sample["class"]

        # Encode the (premise, hypothesis) pair as a single sequence
        encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding="max_length")

        # Attach the label and a decoded copy of the input for inspection
        encoded_sequence["labels"] = label
        encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
        return encoded_sequence

    # Map 'PASS' -> 0 and everything else -> 2, matching the usual NLI label ids
    # (0 = entailment, 2 = contradiction).
    df["class"] = (df["target"] == "PASS").astype(int).apply(lambda x: 0 if x == 1 else 2)
    print(df.head())
    print(df.shape)

    # train_test_split shuffles by default, so no additional shuffling is needed.
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

    # Create Dataset objects from the train and test DataFrames
    train = Dataset.from_pandas(train_data)
    test = Dataset.from_pandas(test_data)

    # Encode the data, attach labels, and generate the decoded input sentences
    train_dataset = train.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )
    test_dataset = test.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )

    def compute_metrics(p: EvalPrediction):
        # Extract the logits from the EvalPrediction object
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        # Predicted class per example
        preds = np.argmax(preds, axis=1)
        # Ratio of predictions equal to 2 (the contradiction label)
        ratio = np.mean(preds == 2)

        result = {}
        metric_f1 = load_metric("f1")
        metric_precision = load_metric("precision")
        metric_recall = load_metric("recall")
        metric_acc = load_metric("accuracy")

        result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
        result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average="macro")["precision"]
        result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average="macro")["recall"]
        result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"]
        result["ratio"] = ratio
        return result
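    # Note: with per_device_train_batch_size=16 and gradient_accumulation_steps=2
    # on a single GPU, the effective training batch size below is 16 * 2 = 32.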
    training_args = TrainingArguments(
        output_dir=output_dir,             # Output directory
        logging_dir=output_dir + "/logs",  # Directory for TensorBoard logs
        num_train_epochs=1,                # Total number of training epochs
        per_device_train_batch_size=16,    # Batch size per device during training
        per_device_eval_batch_size=16,     # Batch size for evaluation
        warmup_steps=4,                    # Warmup steps for the LR scheduler (overrides warmup_ratio when > 0)
        warmup_ratio=0.06,                 # Proportion of training steps used for warmup
        weight_decay=0.01,                 # Strength of weight decay
        gradient_accumulation_steps=2,     # Number of steps over which gradients are accumulated
        learning_rate=2e-05,               # Magnitude of updates to the model weights
        label_smoothing_factor=0.1,        # Regularization against overconfident predictions
        evaluation_strategy="steps",       # Evaluate every eval_steps
        logging_strategy="steps",          # Log every logging_steps
        logging_steps=10,
        eval_steps=10,
        save_steps=10,                     # Align saving with evaluation so the best checkpoint can be restored
        logging_first_step=True,
        do_eval=True,
        hub_model_id="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
        load_best_model_at_end=True,
        metric_for_best_model="f1",        # Required by EarlyStoppingCallback below
    )

    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

    trainer = Trainer(
        model=model,                      # The instantiated model to be trained
        args=training_args,               # Training arguments, defined above
        train_dataset=train_dataset,      # Training dataset
        eval_dataset=test_dataset,        # Evaluation dataset
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    trainer.train()
    trainer.evaluate()

    # Push the fine-tuned model to the Hub with a timestamped commit message.
    t = time.strftime("%Y%m%d%H%M%S")
    v = 2
    commit = f"dev-v{v}-{t}"
    trainer.push_to_hub(commit, token=token)


@app.local_entrypoint()
def run():
    import time

    t0 = time.time()
    # df = pd.read_json('./features_ms_deberta_v2.json', lines=False)
    with open("./features_ms_deberta_v3.json") as f:
        data = f.read()
    run_finetune.remote(data)
    print("Full time spent:", time.time() - t0)
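# Usage sketch (assumes this file is saved as e.g. finetune.py and the Modal CLI
# is installed and authenticated):
#
#   modal run finetune.py
#
# The local entrypoint reads features_ms_deberta_v3.json from the current
# directory and sends its contents to the remote GPU function as a string.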