File size: 3,876 Bytes
cfbe98d
e86736e
cfbe98d
2c039db
e86736e
fa1b7c0
3f8dd98
cfbe98d
 
6d06448
cfbe98d
 
 
 
3f8dd98
e86736e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a83006f
e86736e
fa1b7c0
a83006f
 
 
fa1b7c0
a83006f
 
 
 
fa1b7c0
a83006f
 
 
fa1b7c0
 
 
 
3f8dd98
 
cfbe98d
 
a83006f
 
 
e86736e
3f8dd98
4e8c8b5
2c039db
 
 
 
 
 
 
 
 
4e8c8b5
 
 
 
 
 
 
 
 
a83006f
964fd26
e86736e
 
4e8c8b5
 
e86736e
2c039db
 
 
 
fa1b7c0
2c039db
 
fa1b7c0
 
e86736e
3f8dd98
cfbe98d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import pandas as pd
import gradio as gr
# from transformers import PreTrainedTokenizerFast, BertForMaskedLM
from datasets import load_dataset
import infer

# Example inputs returned by set_default_inputs(). JSON text is UTF-8 by
# specification, so decode it explicitly rather than relying on the
# platform's default locale encoding.
with open("default_inputs.json", "r", encoding="utf-8") as default_inputs_file:
    DEFAULT_INPUTS = json.load(default_inputs_file)

def set_default_inputs():
    """Return the stored example inputs as a (DNA sequence, latitude, longitude) tuple."""
    field_names = ("dna_sequence", "latitude", "longitude")
    return tuple(DEFAULT_INPUTS[field] for field in field_names)

def _kmerize_dna(raw_sequence, k=4):
    """Normalise a raw DNA string and split it into space-separated k-mers.

    Every character other than A, C, G or T becomes ``N``, trailing ``N``
    runs are dropped, and the remainder is cut into non-overlapping chunks
    of ``k`` characters (the final chunk may be shorter).
    """
    # Surrounding whitespace from a pasted textbox value would otherwise be
    # turned into leading Ns and shift the k-mer frame.
    stripped = raw_sequence.strip()
    masked = "".join(base if base in "ACGT" else "N" for base in stripped)
    trimmed = masked.rstrip("N")
    return " ".join(trimmed[i:i + k] for i in range(0, len(trimmed), k))


def preprocess(inp_dna=None, inp_lat=None, inp_lng=None):
    """Prepare the app inputs for the genus prediction model.

    Args:
        inp_dna: DNA sequence text as entered in the app.
        inp_lat: Latitude of the sample; anything ``float()`` accepts.
        inp_lng: Longitude of the sample; anything ``float()`` accepts.

    Returns:
        A DataFrame with one row holding the coordinate pair (``coord``) and
        the sequence embedding (``embeddings``), left-joined with the
        eco-layer table on ``coord``.

    Raises:
        ValueError: If the DNA sequence is missing or has no A/C/G/T bases,
            or if a coordinate cannot be parsed as a number.
    """
    # Validate everything cheap before any model or dataset download starts.
    if not isinstance(inp_dna, str):
        raise ValueError("A DNA sequence is required.")
    kmers = _kmerize_dna(inp_dna)
    if not kmers:
        raise ValueError("The DNA sequence contains no A, C, G or T bases.")

    # The textboxes deliver strings.
    # NOTE(review): numeric coordinates are assumed to be what the eco-layer
    # table is keyed on - confirm against the dataset.
    try:
        coords = (float(inp_lat), float(inp_lng))
    except (TypeError, ValueError) as err:
        raise ValueError("Latitude and longitude must be numeric.") from err

    # load model to calculate new embeddings
    # NOTE(review): `PreTrainedTokenizerFast`, `BertForMaskedLM` and `model`
    # are resolved from module scope. The transformers import at the top of
    # this file is commented out and `model` is not defined in the visible
    # part of the file, so both must be provided before this path can run.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model, force_download=True)
    tokenizer.add_special_tokens({"pad_token": "<UNK>"})
    bert_model = BertForMaskedLM.from_pretrained(model, force_download=True)

    # The previous `bert_model.predic(...)` call is replaced with a forward
    # pass.
    # NOTE(review): mean-pooling the last hidden state is assumed to be the
    # intended embedding - confirm against the training pipeline.
    encoded = tokenizer(kmers, return_tensors="pt")
    hidden_states = bert_model(**encoded, output_hidden_states=True).hidden_states
    embed = hidden_states[-1].mean(dim=1).squeeze(0).detach().numpy()

    # Grab rasters from the tifs. Without `split`, load_dataset returns a
    # DatasetDict, which pd.merge cannot consume.
    # NOTE(review): the split name "train" and the presence of a `coord`
    # column in the dataset are assumptions - confirm against the dataset.
    eco_layers = load_dataset("LofiAmazon/Global-Ecolayers", split="train").to_pandas()

    # One row, two columns: wrap each value in a list so pandas does not
    # spread the tuple/array across rows.
    sample = pd.DataFrame({"coord": [coords], "embeddings": [embed]})
    return pd.merge(sample, eco_layers, on="coord", how="left")


def predict_genus(inp_dna=None, inp_lat=None, inp_lng=None):
    """Predict the genus for one DNA sample taken at the given coordinates.

    The three parameters match the inputs the "Run" button passes in.

    Args:
        inp_dna: DNA sequence text as entered in the app.
        inp_lat: Latitude of the sample.
        inp_lng: Longitude of the sample.

    Returns:
        A single-element list containing a dict with the submitted sequence
        under ``"sequence"`` and the model output under ``"predictions"``.

    Raises:
        ValueError: Propagated from ``preprocess`` for invalid inputs.
    """
    data = preprocess(inp_dna, inp_lat, inp_lng)

    # NOTE(review): the original discarded the result of infer_dna() and
    # called infer.infer() without arguments. Both calls are kept as they
    # were because the `infer` module is outside this file - confirm which
    # of them yields the predictions to display.
    infer.infer_dna(data)
    genuses = infer.infer()

    return [{
        "sequence": inp_dna,
        # "predictions": pd.concat([dna_genuses, envdna_genuses], axis=0)
        'predictions': genuses,
    }]

def tsne():
    """Return ``plots``; presumably a placeholder for t-SNE embedding plots.

    NOTE(review): ``plots`` is not assigned in this function or anywhere in
    the visible part of the file, so calling this raises ``NameError`` unless
    a module-level ``plots`` is defined outside this view.
    """

    return plots


# Build the two-tab interface. Components are created inside the layout
# context managers, so the nesting below mirrors the on-screen layout.
with gr.Blocks() as demo:
    # Header section
    gr.Markdown("# DNA Identifier Tool")
    gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")

    with gr.Tab("Genus Prediction"):
        gr.Markdown("Enter a DNA sequence and the coordinates at which its sample was taken to get a genus prediction. Click 'I'm feeling lucky' to see a prediction for a random sequence.")

        # Collect inputs for app (DNA and location).
        # The layout class is `gr.Row`; the lowercase `gr.row` used before
        # does not exist and failed while the interface was being built.
        with gr.Row():
            with gr.Column():
                inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (min 200 and max 660 characters)")

            with gr.Column():
                with gr.Row():
                    inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
                with gr.Row():
                    inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")

        with gr.Row():
            btn_run = gr.Button("Run")

            # Fills the three input boxes with the stored example values.
            btn_defaults = gr.Button("I'm feeling lucky")
            btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])

        with gr.Row():
            gr.Markdown('Make plot or table for Top 5 species')

        with gr.Row():
            genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
            # The three textbox values are passed positionally to predict_genus.
            btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)

    with gr.Tab('DNA Embedding Space Similarity Visualizer'):
        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")

        # Two side-by-side columns; each currently holds only a description.
        with gr.Row():
            with gr.Column():
                gr.Markdown("Plot of your DNA sequence among other known species clusters.")

            with gr.Column():
                gr.Markdown("Plot of the five most common species at your sample coordinate.")


demo.launch()