Spaces:
Sleeping
Sleeping
File size: 3,876 Bytes
cfbe98d e86736e cfbe98d 2c039db e86736e fa1b7c0 3f8dd98 cfbe98d 6d06448 cfbe98d 3f8dd98 e86736e a83006f e86736e fa1b7c0 a83006f fa1b7c0 a83006f fa1b7c0 a83006f fa1b7c0 3f8dd98 cfbe98d a83006f e86736e 3f8dd98 4e8c8b5 2c039db 4e8c8b5 a83006f 964fd26 e86736e 4e8c8b5 e86736e 2c039db fa1b7c0 2c039db fa1b7c0 e86736e 3f8dd98 cfbe98d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import json
import pandas as pd
import gradio as gr
# from transformers import PreTrainedTokenizerFast, BertForMaskedLM
from datasets import load_dataset
import infer
# Example form values, loaded once at import time. set_default_inputs() reads
# the "dna_sequence", "latitude" and "longitude" keys from this mapping.
# The encoding is pinned to UTF-8 so decoding does not depend on the host locale.
with open("default_inputs.json", "r", encoding="utf-8") as default_inputs_file:
    DEFAULT_INPUTS = json.load(default_inputs_file)
def set_default_inputs():
    """Return the default (DNA sequence, latitude, longitude) triple.

    The values are looked up in the module-level DEFAULT_INPUTS mapping and
    returned as a tuple in that fixed order.
    """
    field_names = ("dna_sequence", "latitude", "longitude")
    return tuple(DEFAULT_INPUTS[field] for field in field_names)
# Width of the non-overlapping k-mers the DNA string is split into.
_KMER_SIZE = 4


def _clean_dna_sequence(raw_sequence):
    """Turn a raw nucleotide string into space-separated 4-mers."""
    # Replace all symbols which are not A, C, G, T with N
    sequence = "".join(base if base in "ACGT" else "N" for base in raw_sequence)
    # Truncate trailing Ns
    sequence = sequence.rstrip("N")
    # Insert spaces between all k-mers
    return " ".join(
        sequence[start:start + _KMER_SIZE]
        for start in range(0, len(sequence), _KMER_SIZE)
    )


def _parse_coordinate(value, name, limit):
    """Convert a textbox value to a float in [-limit, limit] or raise ValueError."""
    try:
        number = float(value)
    except (TypeError, ValueError) as err:
        raise ValueError(f"{name} must be a number, got {value!r}") from err
    # The chained comparison is also False for NaN, so NaN is rejected here.
    if not -limit <= number <= limit:
        raise ValueError(f"{name} must be between {-limit} and {limit}, got {value!r}")
    return number


def preprocess(inp_dna="", inp_lat="", inp_lng=""):
    """Prepare the app inputs for the genus prediction model.

    Args:
        inp_dna: Raw DNA sequence text. Surrounding whitespace is dropped,
            every character other than A, C, G, T becomes N, trailing Ns are
            removed and the result is split into space-separated 4-mers.
        inp_lat: Latitude as text or number; must parse to a float in [-90, 90].
        inp_lng: Longitude as text or number; must parse to a float in [-180, 180].

    Returns:
        The one-row frame with columns 'coord' and 'embeddings', left-joined
        with the "LofiAmazon/Global-Ecolayers" table on 'coord'.

    Raises:
        ValueError: if the DNA sequence is empty or contains no A/C/G/T, or
            if a coordinate is missing, non-numeric or out of range. All
            validation happens before any model or dataset is loaded.
    """
    if not inp_dna or not inp_dna.strip():
        raise ValueError("A DNA sequence is required")
    # format lat and lon into coords
    coords = (
        _parse_coordinate(inp_lat, "Latitude", 90.0),
        _parse_coordinate(inp_lng, "Longitude", 180.0),
    )

    # preprocess DNA seq (a plain str, so no pandas .str accessor here)
    kmers = _clean_dna_sequence(inp_dna.strip())
    if not kmers:
        raise ValueError("The DNA sequence must contain at least one A, C, G or T")

    # load model to calculate new embeddings
    # NOTE(review): PreTrainedTokenizerFast and BertForMaskedLM are only
    # imported in a commented-out line at the top of the file, and `model`
    # (the checkpoint name) is not defined anywhere visible. Both must be
    # provided at module level before this function can run.
    # force_download=True was dropped: it re-downloaded the weights on every call.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model)
    tokenizer.add_special_tokens({"pad_token": "<UNK>"})
    bert_model = BertForMaskedLM.from_pretrained(model)
    encoded = tokenizer(kmers, return_tensors="pt")
    outputs = bert_model(**encoded, output_hidden_states=True)
    # NOTE(review): the original called a non-existent `bert_model.predic`.
    # Mean-pooling the last hidden layer over tokens is an assumption about
    # the embedding the downstream model expects - confirm against `infer`.
    embed = outputs.hidden_states[-1].mean(dim=1).squeeze(0).detach().numpy()

    # Grab rasters from the tifs
    # NOTE(review): assumes the dataset exposes a "train" split with a
    # 'coord' column comparable to the (lat, lon) tuple - confirm.
    eco_layers = load_dataset("LofiAmazon/Global-Ecolayers", split="train").to_pandas()
    # One row: the coordinate pair and the embedding each occupy a single cell.
    temp = pd.DataFrame({"coord": [coords], "embeddings": [embed]})
    return pd.merge(temp, eco_layers, on="coord", how="left")


def predict_genus(inp_dna="", inp_lat="", inp_lng=""):
    """Run genus prediction for one DNA sample and its sampling location.

    Takes the three values the "Run" button passes in (DNA sequence,
    latitude, longitude), preprocesses them and calls the `infer` module.

    Returns:
        A single-element list holding a dict with the submitted 'sequence'
        and the 'predictions' returned by `infer.infer()`.

    Raises:
        ValueError: propagated from preprocess() for invalid inputs.
    """
    data = preprocess(inp_dna, inp_lat, inp_lng)
    # NOTE(review): the result of infer_dna() was assigned but never used in
    # the original, and infer() is called without arguments; the call order
    # is kept as-is - confirm the intended `infer` API.
    infer.infer_dna(data)
    genuses = infer.infer()
    # NOTE(review): the original read an undefined `dna_df['nucraw']`; the raw
    # submitted sequence is reported instead. This list-of-dict shape may not
    # match the four-column Dataframe output it is wired to - confirm.
    return [{
        "sequence": inp_dna,
        # "predictions": pd.concat([dna_genuses, envdna_genuses], axis=0)
        "predictions": genuses,
    }]
def tsne():
    """Placeholder for the embedding-space plots; not implemented yet.

    The original body returned an undefined name (`plots`), which failed
    with a NameError. The stub now fails explicitly instead.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("tsne() is a stub: no plots are produced yet")
# Build the two-tab Gradio UI and start the app.
with gr.Blocks() as demo:
    # Header section
    gr.Markdown("# DNA Identifier Tool")
    gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")

    with gr.Tab("Genus Prediction"):
        gr.Markdown("Enter a DNA sequence and the coordinates at which its sample was taken to get a genus prediction. Click 'I'm feeling lucky' to see a prediction for a random sequence.")

        # Collect inputs for app (DNA and location)
        # Fixed: the layout class is gr.Row; `gr.row` does not exist.
        with gr.Row():
            with gr.Column():
                inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (min 200 and max 660 characters)")
            with gr.Column():
                with gr.Row():
                    inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
                with gr.Row():
                    inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")

        with gr.Row():
            btn_run = gr.Button("Run")
            btn_defaults = gr.Button("I'm feeling lucky")
            # Fill the three input boxes with the stored example values.
            btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])

        with gr.Row():
            gr.Markdown('Make plot or table for Top 5 species')

        with gr.Row():
            genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
            # The handler receives the three textbox values positionally.
            btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)

    with gr.Tab('DNA Embedding Space Similarity Visualizer'):
        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
        with gr.Row():
            with gr.Column():
                gr.Markdown("Plot of your DNA sequence among other known species clusters.")
            with gr.Column():
                gr.Markdown("Plot of the five most common species at your sample coordinate.")

demo.launch()
|