"""Gradio front end for the DNA Identifier Tool.

Collects a DNA sequence plus the latitude/longitude at which the sample was
taken, turns them into a model-ready feature row, and hands that row to the
project-local ``infer`` module for genus prediction.
"""

import functools
import json
import os
import re

import gradio as gr
import pandas as pd
from datasets import load_dataset
from transformers import BertForMaskedLM, PreTrainedTokenizerFast

import infer

# Length of the k-mers the cleaned sequence is split into before tokenizing.
KMER_SIZE = 4

# Hugging Face repo id (or local path) of the BERT checkpoint used to embed
# sequences.
# TODO(review): the original code referenced an undefined global `model`; the
# checkpoint id cannot be recovered from this file, so it is read from the
# environment until the correct value is hard-coded here.
MODEL_NAME = os.environ.get("DNA_MODEL_NAME", "")

# Loaded at import time, as in the original script (not used in this file).
embeddings_train = load_dataset(
    "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon", split="train"
).to_pandas()

with open("default_inputs.json", "r", encoding="utf-8") as default_inputs_file:
    DEFAULT_INPUTS = json.load(default_inputs_file)


def set_default_inputs():
    """Return the example (DNA sequence, latitude, longitude) from default_inputs.json."""
    return (
        DEFAULT_INPUTS["dna_sequence"],
        DEFAULT_INPUTS["latitude"],
        DEFAULT_INPUTS["longitude"],
    )


def _clean_dna_sequence(seq: str) -> str:
    """Normalize a raw DNA string and return it as space-separated k-mers.

    Every character other than A, C, G, T becomes ``N``, trailing ``N`` runs
    are dropped, and the result is cut into ``KMER_SIZE``-character chunks
    joined by single spaces. Returns ``""`` when nothing usable remains.
    """
    # Surrounding whitespace and lower-case bases are normalized first so a
    # pasted "acgt\n" is not turned entirely into Ns.
    normalized = re.sub(r"[^ACGT]", "N", (seq or "").strip().upper())
    normalized = normalized.rstrip("N")
    return " ".join(
        normalized[i:i + KMER_SIZE] for i in range(0, len(normalized), KMER_SIZE)
    )


@functools.lru_cache(maxsize=1)
def _load_embedding_model():
    """Load the tokenizer and BERT model once per process and return both.

    Raises:
        RuntimeError: if ``MODEL_NAME`` has not been configured.
    """
    if not MODEL_NAME:
        raise RuntimeError(
            "No embedding model configured: set the DNA_MODEL_NAME environment "
            "variable to the checkpoint to load."
        )
    # force_download is kept from the original, but because the result is
    # cached it now triggers one download per process instead of one per
    # request.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME, force_download=True)
    # NOTE(review): the empty pad token is reproduced from the original; it
    # looks like a placeholder value was lost -- confirm the intended token.
    tokenizer.add_special_tokens({"pad_token": ""})
    bert_model = BertForMaskedLM.from_pretrained(MODEL_NAME, force_download=True)
    return tokenizer, bert_model


@functools.lru_cache(maxsize=1)
def _load_ecolayers() -> pd.DataFrame:
    """Load the Global-Ecolayers dataset once and return it as a DataFrame."""
    # Without `split`, load_dataset returns a DatasetDict, which pd.merge
    # cannot consume.
    # NOTE(review): assumes the data lives in a "train" split, like the
    # embeddings dataset loaded at module level -- confirm.
    return load_dataset("LofiAmazon/Global-Ecolayers", split="train").to_pandas()


def _embed_sequence(kmer_text: str):
    """Return one embedding vector for a space-separated k-mer string."""
    tokenizer, bert_model = _load_embedding_model()
    encoded = tokenizer(kmer_text, return_tensors="pt")
    outputs = bert_model(**encoded, output_hidden_states=True)
    # The original called a non-existent `bert_model.predic`.
    # NOTE(review): mean-pooling the last hidden layer is an assumption; it
    # must match how the training embeddings were produced -- confirm.
    return outputs.hidden_states[-1].mean(dim=1).squeeze(0).detach().numpy()


def preprocess(inp_dna: str, inp_lat, inp_lng) -> pd.DataFrame:
    """Prepare the app inputs for the genus prediction model.

    Args:
        inp_dna: Raw DNA sequence as typed by the user.
        inp_lat: Latitude of the sample (number or numeric string).
        inp_lng: Longitude of the sample (number or numeric string).

    Returns:
        The one-row sample frame (``coord`` tuple and ``embeddings`` vector)
        left-joined with the ecolayers table on ``coord``.

    Raises:
        gr.Error: if the sequence is empty after cleaning or a coordinate is
            not numeric.
        RuntimeError: if no embedding model is configured.
    """
    kmer_text = _clean_dna_sequence(inp_dna)
    if not kmer_text:
        raise gr.Error("Please enter a DNA sequence.")

    # NOTE(review): converting to float assumes the ecolayers `coord` values
    # are numeric; the original passed the raw textbox values -- confirm.
    try:
        coords = (float(inp_lat), float(inp_lng))
    except (TypeError, ValueError) as err:
        raise gr.Error("Latitude and longitude must be numbers.") from err

    embed = _embed_sequence(kmer_text)

    # One row with two object-valued cells; wrapping each value in a list
    # stops pandas from spreading the tuple/vector across rows or columns.
    sample = pd.DataFrame({"coord": [coords], "embeddings": [embed]})

    # NOTE(review): assumes the ecolayers table has a hashable `coord` column
    # holding (lat, lon) pairs -- verify against the dataset schema.
    return pd.merge(sample, _load_ecolayers(), on="coord", how="left")


def predict_genus(inp_dna, inp_lat, inp_lng):
    """Run genus prediction for one sample; click handler for the Run button.

    Args:
        inp_dna: Raw DNA sequence from the "DNA" textbox.
        inp_lat: Value of the "Latitude" textbox.
        inp_lng: Value of the "Longitude" textbox.

    Returns:
        A one-element list holding a dict with the submitted ``sequence`` and
        the ``predictions`` returned by ``infer.infer()``.
    """
    data = preprocess(inp_dna, inp_lat, inp_lng)

    # NOTE(review): `infer` is a project module not visible here. The original
    # called infer_dna(data) without using its result and infer() without
    # arguments; both calls are preserved as written -- confirm the intended
    # API.
    infer.infer_dna(data)
    genuses = infer.infer()

    # TODO(review): the bound output is a gr.Dataframe with four headers
    # (DNA-only and DNA & Env genus/probability); this list-of-dict shape is
    # kept from the original and probably needs to become tabular.
    return [{"sequence": inp_dna, "predictions": genuses}]


def tsne():
    """Placeholder for the embedding-space plots; not implemented yet."""
    # The original returned an undefined name `plots` (NameError on call).
    raise NotImplementedError("The embedding-space visualisation is not implemented yet.")


with gr.Blocks() as demo:
    # Header section
    gr.Markdown("# DNA Identifier Tool")
    gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")

    with gr.Tab("Genus Prediction"):
        gr.Markdown(
            "Enter a DNA sequence and the coordinates at which its sample was "
            "taken to get a genus prediction. Click 'I'm feeling lucky' to see "
            "a prediction for a random sequence."
        )

        # Collect inputs for app (DNA and location)
        with gr.Row():
            inp_dna = gr.Textbox(
                label="DNA",
                placeholder="e.g. AACAATGTA... (min 200 and max 660 characters)",
            )
        with gr.Row():
            inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
            inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")

        with gr.Row():
            btn_run = gr.Button("Run")
            btn_defaults = gr.Button("I'm feeling lucky")
            btn_defaults.click(
                fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng]
            )

        with gr.Row():
            gr.Markdown('Make plot or table for Top 5 species')

        with gr.Column():
            genus_out = gr.Dataframe(
                headers=[
                    "DNA Only Pred Genus",
                    "DNA Only Prob",
                    "DNA & Env Pred Genus",
                    "DNA & Env Prob",
                ]
            )
            btn_run.click(
                fn=predict_genus,
                inputs=[inp_dna, inp_lat, inp_lng],
                outputs=genus_out,
            )

    with gr.Tab('DNA Embedding Space Similarity Visualizer'):
        gr.Markdown(
            "If the highest genus probability is very low for your DNA "
            "sequence, we can still examine the DNA embedding of the sequence "
            "in relation to known samples for clues."
        )

        with gr.Column():
            gr.Markdown("Plot of your DNA sequence among other known species clusters.")

        with gr.Column():
            gr.Markdown("Plot of the five most common species at your sample coordinate.")

demo.launch()