import json
import pandas as pd
import gradio as gr
from transformers import PreTrainedTokenizerFast, BertForMaskedLM
from datasets import load_dataset
import infer

# Training split of the BOLD embeddings/ecolayers dataset, fetched once when
# the module is loaded and held as a pandas DataFrame.
embeddings_train = load_dataset(
    "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
    split="train",
).to_pandas()

# Example form values, read once at import time. The keys used by this file
# are "dna_sequence", "latitude" and "longitude".
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale default.
with open("default_inputs.json", "r", encoding="utf-8") as default_inputs_file:
    DEFAULT_INPUTS = json.load(default_inputs_file)

def set_default_inputs():
    """Return the example DNA sequence, latitude and longitude as a 3-tuple.

    The values are looked up in ``DEFAULT_INPUTS`` and returned in that order.
    """
    field_names = ("dna_sequence", "latitude", "longitude")
    return tuple(DEFAULT_INPUTS[field] for field in field_names)

# Hub id or local path of the BERT checkpoint used to embed DNA sequences.
# TODO: set this. The previous code read an undefined name (`model`), so the
# intended checkpoint cannot be recovered from this file.
MODEL_CHECKPOINT = None

# Length of the k-mers the sequence is split into before tokenization.
_KMER_SIZE = 4

# checkpoint -> (tokenizer, model); lets the weights load once per process
# instead of being downloaded again on every request.
_BERT_CACHE = {}


def _format_dna(sequence: str) -> str:
    """Clean a raw DNA string and return it as space-separated 4-mers."""
    # Every symbol that is not A, C, G or T becomes N (case-sensitive).
    cleaned = "".join(base if base in "ACGT" else "N" for base in sequence)
    # Trailing Ns carry no information and are dropped.
    cleaned = cleaned.rstrip("N")
    return " ".join(
        cleaned[start:start + _KMER_SIZE]
        for start in range(0, len(cleaned), _KMER_SIZE)
    )


def _parse_coordinate(value, label, limit):
    """Convert one latitude/longitude field to a float within [-limit, limit]."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        raise ValueError(f"{label} must be a number, got {value!r}.") from None
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not -limit <= number <= limit:
        raise ValueError(
            f"{label} must be between {-limit} and {limit}, got {number}."
        )
    return number


def _load_bert(checkpoint):
    """Return the cached (tokenizer, model) pair for a checkpoint, loading it once."""
    if checkpoint not in _BERT_CACHE:
        tokenizer = PreTrainedTokenizerFast.from_pretrained(checkpoint, force_download=True)
        tokenizer.add_special_tokens({"pad_token": "<UNK>"})
        bert_model = BertForMaskedLM.from_pretrained(checkpoint, force_download=True)
        bert_model.eval()
        _BERT_CACHE[checkpoint] = (tokenizer, bert_model)
    return _BERT_CACHE[checkpoint]


def _embed_sequence(kmers, checkpoint):
    """Embed a space-separated k-mer string as a flat list of floats."""
    import torch  # local import: this file does not import torch at the top

    tokenizer, bert_model = _load_bert(checkpoint)
    encoded = tokenizer(kmers, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**encoded, output_hidden_states=True)
    # NOTE(review): mean-pooling the last hidden layer over tokens is an
    # assumption; confirm it matches how the training embeddings were built.
    return outputs.hidden_states[-1].mean(dim=1).squeeze(0).tolist()


def _load_ecolayers():
    """Load every split of the ecolayers dataset into a single DataFrame."""
    splits = load_dataset("LofiAmazon/Global-Ecolayers")
    # Without `split=`, load_dataset returns a mapping of split name to
    # Dataset; pd.merge needs a DataFrame, so the splits are converted and
    # concatenated.
    return pd.concat(
        [split.to_pandas() for split in splits.values()],
        ignore_index=True,
    )


def preprocess(inp_dna=None, inp_lat=None, inp_lng=None, model=None):
    """Prepare app input for the genus prediction model.

    Args:
        inp_dna: Raw DNA sequence as text. Whitespace is removed, symbols
            other than A/C/G/T become N, trailing Ns are dropped and the
            result is split into 4-mers.
        inp_lat: Latitude in degrees (number or numeric string), -90..90.
        inp_lng: Longitude in degrees (number or numeric string), -180..180.
        model: Checkpoint id or path for the BERT model; defaults to
            ``MODEL_CHECKPOINT``.

    Returns:
        A one-row DataFrame with a ``coord`` column holding ``(lat, lng)``
        and an ``embeddings`` column, left-joined with the ecolayers table
        on ``coord``.

    Raises:
        ValueError: If the sequence is missing or empty after cleaning, or a
            coordinate is not a number in range.
        RuntimeError: If no checkpoint is configured.
    """
    if inp_dna is None:
        raise ValueError("A DNA sequence is required.")
    # Pasted sequences often carry spaces or line breaks; remove them so they
    # are not turned into N bases.
    kmers = _format_dna("".join(str(inp_dna).split()))
    if not kmers:
        raise ValueError("The DNA sequence is empty after cleaning.")

    coords = (
        _parse_coordinate(inp_lat, "Latitude", 90),
        _parse_coordinate(inp_lng, "Longitude", 180),
    )

    checkpoint = model if model is not None else MODEL_CHECKPOINT
    if not checkpoint:
        raise RuntimeError(
            "No BERT checkpoint configured: set MODEL_CHECKPOINT or pass model=."
        )

    embed = _embed_sequence(kmers, checkpoint)

    # One row per sample: the coordinate pair and its embedding vector.
    sample = pd.DataFrame({"coord": [coords], "embeddings": [embed]})
    # NOTE(review): assumes the ecolayers table has a `coord` column whose
    # values compare equal to this (lat, lng) tuple - confirm the schema.
    return pd.merge(sample, _load_ecolayers(), on="coord", how="left")


def predict_genus(inp_dna=None, inp_lat=None, inp_lng=None):
    """Run preprocessing and inference for one submitted sample.

    Args:
        inp_dna: Raw DNA sequence text from the form.
        inp_lat: Latitude from the form.
        inp_lng: Longitude from the form.

    Returns:
        A list with a single dict holding the submitted ``sequence`` and the
        ``predictions`` returned by ``infer.infer()``.

    Raises:
        ValueError, RuntimeError: Propagated from ``preprocess``.
    """
    data = preprocess(inp_dna, inp_lat, inp_lng)

    # NOTE(review): the result of infer_dna is not used and infer() is called
    # without arguments, exactly as before; confirm the intended `infer` API.
    infer.infer_dna(data)
    genuses = infer.infer()

    # NOTE(review): the UI binds this output to a four-column gr.Dataframe;
    # confirm this list-of-dict shape is what that component should receive.
    return [{
        "sequence": inp_dna,
        "predictions": genuses,
    }]

def tsne():
    """Placeholder for the embedding-space plots; not implemented yet.

    Presumably meant to produce t-SNE plots for the visualizer tab (TODO
    confirm). The previous body returned an undefined name and failed with
    NameError; the missing implementation is now reported explicitly.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("tsne() has not been implemented yet.")


# Build the two-tab Gradio interface. Components are created inside nested
# `with` contexts, so their order here is their order on the page.
with gr.Blocks() as demo:
    # Header section
    gr.Markdown("# DNA Identifier Tool")
    gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")

    # Tab 1: form for a DNA sequence plus sampling coordinates.
    with gr.Tab("Genus Prediction"):
        gr.Markdown("Enter a DNA sequence and the coordinates at which its sample was taken to get a genus prediction. Click 'I'm feeling lucky' to see a prediction for a random sequence.")

        # Collect inputs for app (DNA and location)
        with gr.Row():
            inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (min 200 and max 660 characters)")
        with gr.Row():
            inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
            inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")

        with gr.Row():
            btn_run = gr.Button("Run")

            btn_defaults = gr.Button("I'm feeling lucky")
            # Fill the three textboxes with the values returned by
            # set_default_inputs (sequence, latitude, longitude).
            btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
        
        with gr.Row():
            # Placeholder text only; no top-5 plot or table component is
            # created here yet.
            gr.Markdown('Make plot or table for Top 5 species')

        with gr.Column():
            # Table that displays whatever predict_genus returns.
            genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
            # "Run" passes the three textbox values to predict_genus and
            # sends its return value to genus_out.
            btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
    
    # Tab 2: descriptive text only so far; no plot components are created
    # and no event handlers are attached in this tab.
    with gr.Tab('DNA Embedding Space Similarity Visualizer'):
        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
        with gr.Column():
            gr.Markdown("Plot of your DNA sequence among other known species clusters.")

        with gr.Column():
            gr.Markdown("Plot of the five most common species at your sample coordinate.")


# Launch the app. This runs at module level, so it executes whenever the
# file is run or imported.
demo.launch()