from pathlib import Path
import os
import time

import modal
from modal import App, Image, Volume

# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

VOL_MOUNT_PATH = Path("/vol")

cuda_version = "12.4.0"  # should be no greater than the host CUDA version
flavor = "devel"  # includes the full CUDA toolkit
_os = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{_os}"

# image = Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
image = Image.debian_slim(python_version="3.10")
image = image.pip_install(
    "accelerate",
    "transformers",
    "torch",
    "datasets",
    "tensorboard",
    "trl",
    "xformers",
    "bitsandbytes",
    "peft",
    "protobuf==3.20.*",
    "onnxruntime",
    "onnx",
    "setfit",
    "nltk",
    "firebase_admin",
    "openai",
    "evaluate",
    "sentencepiece",
    "pandas",
    "scikit-learn",
    "huggingface_hub",
)

# Note: prior to April 2024, "app" was called "stub".
app = App(name="finetune-run", image=image)

output_vol = Volume.from_name("finetune-volume", create_if_missing=True)


@app.function(gpu="any")
def gpu_function():
    import subprocess

    import torch

    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())


GPU_CONFIG = modal.gpu.A100(count=1, size="80GB")


@app.function(
    gpu=GPU_CONFIG,
    timeout=7200,
    volumes={VOL_MOUNT_PATH: output_vol},
    secrets=[modal.Secret.from_dotenv()],
)
def run_finetune(data):
    import subprocess

    import numpy as np
    import pandas as pd
    import torch
    from datasets import Dataset
    from evaluate import load as load_metric
    from sklearn.model_selection import train_test_split
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        EarlyStoppingCallback,
        EvalPrediction,
        Trainer,
        TrainingArguments,
    )

    subprocess.run(["nvidia-smi"])
    print("Torch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA device count:", torch.cuda.device_count())

    # Persist the raw JSON payload inside the container, then load it as a DataFrame.
    with open("./features_ms_deberta_v3.json", "w") as f:
        f.write(data)
    df = pd.read_json("./features_ms_deberta_v3.json", lines=False)

    # Duplicate the dataset 50x to enlarge the (small) training corpus.
    df = pd.concat([df] * 50, ignore_index=True)

    HF_ORGANIZATION = "rafaelsandroni"
    token = os.getenv("HF_TOKEN")

    # Task for zero-shot classification (the model is fine-tuned NLI-style).
    task = "zero-shot-classification"

    # Candidate pre-trained models; the last assignment is the one used.
    model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    # model_name = "MoritzLaurer/bge-m3-zeroshot-v2.0"
    # model_name = "cross-encoder/nli-deberta-v3-base"
    model_name = "tasksource/deberta-small-long-nli"

    # Directory where output/results will be saved
    output_dir = "./"

    # Clear the CUDA cache to free up GPU memory
    torch.cuda.empty_cache()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Most common init
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
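    # Illustration (not part of the pipeline; the example strings are hypothetical):
    # NLI-style models take a (premise, hypothesis) pair encoded as one sequence, e.g.
    #
    #   enc = tokenizer("A man is eating.", "Someone is eating.", truncation=True)
    #
    # yields input_ids of the form [CLS] premise [SEP] hypothesis [SEP].
    # create_input_sequence below applies the same pair encoding batch-wise.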
    def create_input_sequence(sample):
        # Premise, hypothesis, and label columns of the batch
        text = sample["premise"]
        hypothesis = sample["hypothesis"]
        label = sample["class"]

        # Encode the (premise, hypothesis) pair as a single sequence
        encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding="max_length")

        # Attach the label and a decoded copy of the input for inspection
        encoded_sequence["labels"] = label
        encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
        return encoded_sequence

    # Map 'PASS' -> 0 and everything else -> 2, matching the usual NLI label ids
    # (0 = entailment, 2 = contradiction).
    df["class"] = (df["target"] == "PASS").astype(int).apply(lambda x: 0 if x == 1 else 2)
    print(df.head())
    print(df.shape)

    # train_test_split shuffles by default, so no additional shuffling is needed.
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

    # Create Dataset objects from the train and test DataFrames
    train = Dataset.from_pandas(train_data)
    test = Dataset.from_pandas(test_data)

    # Encode the data, attach labels, and generate the decoded input sentences
    train_dataset = train.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )
    test_dataset = test.map(
        create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"]
    )

    def compute_metrics(p: EvalPrediction):
        # Extract the logits from the EvalPrediction object
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        # Predicted class per example
        preds = np.argmax(preds, axis=1)
        # Ratio of predictions equal to 2 (the contradiction label)
        ratio = np.mean(preds == 2)

        result = {}
        metric_f1 = load_metric("f1")
        metric_precision = load_metric("precision")
        metric_recall = load_metric("recall")
        metric_acc = load_metric("accuracy")

        result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
        result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average="macro")["precision"]
        result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average="macro")["recall"]
        result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"]
        result["ratio"] = ratio
        return result
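    # Note: with per_device_train_batch_size=16 and gradient_accumulation_steps=2
    # on a single GPU, the effective training batch size below is 16 * 2 = 32.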
    training_args = TrainingArguments(
        output_dir=output_dir,             # Output directory
        logging_dir=output_dir + "/logs",  # Directory for TensorBoard logs
        num_train_epochs=1,                # Total number of training epochs
        per_device_train_batch_size=16,    # Batch size per device during training
        per_device_eval_batch_size=16,     # Batch size for evaluation
        warmup_steps=4,                    # Warmup steps for the LR scheduler (overrides warmup_ratio when > 0)
        warmup_ratio=0.06,                 # Proportion of training steps used for warmup
        weight_decay=0.01,                 # Strength of weight decay
        gradient_accumulation_steps=2,     # Number of steps over which gradients are accumulated
        learning_rate=2e-05,               # Magnitude of updates to the model weights
        label_smoothing_factor=0.1,        # Regularization against overconfident predictions
        evaluation_strategy="steps",       # Evaluate every eval_steps
        logging_strategy="steps",          # Log every logging_steps
        logging_steps=10,
        eval_steps=10,
        save_steps=10,                     # Align saving with evaluation so the best checkpoint can be restored
        logging_first_step=True,
        do_eval=True,
        hub_model_id="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
        load_best_model_at_end=True,
        metric_for_best_model="f1",        # Required by EarlyStoppingCallback below
    )

    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

    trainer = Trainer(
        model=model,                      # The instantiated model to be trained
        args=training_args,               # Training arguments, defined above
        train_dataset=train_dataset,      # Training dataset
        eval_dataset=test_dataset,        # Evaluation dataset
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    trainer.train()
    trainer.evaluate()

    # Push the fine-tuned model to the Hub with a timestamped commit message.
    t = time.strftime("%Y%m%d%H%M%S")
    v = 2
    commit = f"dev-v{v}-{t}"
    trainer.push_to_hub(commit, token=token)


@app.local_entrypoint()
def run():
    import time

    t0 = time.time()
    # df = pd.read_json('./features_ms_deberta_v2.json', lines=False)
    with open("./features_ms_deberta_v3.json") as f:
        data = f.read()
    run_finetune.remote(data)
    print("Full time spent:", time.time() - t0)
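# Usage sketch (assumes this file is saved as e.g. finetune.py and the Modal CLI
# is installed and authenticated):
#
#   modal run finetune.py
#
# The local entrypoint reads features_ms_deberta_v3.json from the current
# directory and sends its contents to the remote GPU function as a string.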