Spaces:

LofiAmazon
/

LofiAmazonSpace

Sleeping

App Files Files Community

vshulev committed on Jun 2, 2024

Commit

9fe11bc

1 Parent(s): 973829c

Implement tSNE

Browse files

Files changed (2) hide show

app.py +61 -25
config.py +1 -0

app.py CHANGED Viewed

@@ -58,6 +58,7 @@ classification_model.eval()
 # Load datasets
 ecolayers_ds = load_dataset(DATASETS["ecolayers"])
 def set_default_inputs():
@@ -133,7 +134,6 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
             top_k.values.detach().numpy(),
             index=[ID_TO_GENUS_MAP[i] for i in top_k.indices.detach().numpy()]
         )
-        # top_k = pd.Series(top_k.values.detach().numpy(), index=top_k.indices.detach().numpy())
     fig, ax = plt.subplots()
     ax.bar(top_k.index.astype(str), top_k.values)
@@ -148,6 +148,34 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
     return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
 with gr.Blocks() as demo:
     # Header section
     gr.Markdown("# DNA Identifier Tool")
@@ -169,16 +197,24 @@ with gr.Blocks() as demo:
                 inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
     with gr.Row():
-        btn_run = gr.Button("Predict")
-        btn_run.click(
-            fn=preprocess,
-            inputs=[inp_dna, inp_lat, inp_lng],
-        )
         btn_defaults = gr.Button("I'm feeling lucky")
         btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
     with gr.Tab("Genus Prediction"):
         gr.Interface(
             fn=predict_genus,
             inputs=[
@@ -188,26 +224,26 @@ with gr.Blocks() as demo:
                 inp_lng,
             ],
             outputs=["image"],
         )
-        # with gr.Row():
-        #     gr.Markdown("Make plot or table for Top 5 species")
-        # with gr.Row():
-        #     genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
-        #     # btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
-    with gr.Tab('DNA Embedding Space Visualizer'):
-        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
-        with gr.Row() as row:
-            with gr.Column():
-                gr.Markdown("Plot of your DNA sequence among other known species clusters.")
-                # plot = gr.Plot("")
-                # btn_run.click(fn=tsne_DNA, inputs=[inp_dna, genus_out])
-            with gr.Column():
-                gr.Markdown("Plot of the five most common species at your sample coordinate.")
 demo.launch()

 # Load datasets
 ecolayers_ds = load_dataset(DATASETS["ecolayers"])
+amazon_ds = load_dataset(DATASETS["amazon"])
 def set_default_inputs():
             top_k.values.detach().numpy(),
             index=[ID_TO_GENUS_MAP[i] for i in top_k.indices.detach().numpy()]
         )
     fig, ax = plt.subplots()
     ax.bar(top_k.index.astype(str), top_k.values)
     return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+def cluster_dna(top_k: float):
+    """Plot a 2D t-SNE projection of the embeddings of the most common genera.
+
+    Rows of the ``amazon_ds`` train split without a genus are dropped, the
+    ``top_k`` most frequent genera are kept, and their ``embeddings`` are
+    projected to two dimensions with t-SNE. Points are coloured by genus.
+
+    Args:
+        top_k: Number of most frequent genera to include. Annotated as a
+            float and converted to ``int`` before use.
+
+    Returns:
+        An RGB ``PIL.Image`` containing the rendered scatter plot.
+    """
+    top_k = int(top_k)
+
+    # Keep only rows with a known genus, then restrict to the top_k genera.
+    df = amazon_ds["train"].to_pandas()
+    df = df[df["genus"].notna()]
+    top_genera = df["genus"].value_counts().head(top_k).index
+    df = df[df["genus"].isin(top_genera)]
+
+    X = np.stack(df["embeddings"].tolist())
+    y = df["genus"].tolist()
+
+    # random_state is pinned to 0.
+    tsne = TSNE(
+        n_components=2, perplexity=30, learning_rate=200,
+        n_iter=1000, random_state=0,
+    )
+    X_tsne = tsne.fit_transform(X)
+
+    # Integer-encode the genus names so they can drive the scatter colours.
+    y_encoded = LabelEncoder().fit_transform(y)
+
+    fig, ax = plt.subplots()
+    try:
+        ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_encoded, cmap="viridis", alpha=0.7)
+        ax.set_title(f"DNA Embedding Space (of {top_k} most common genera)")
+        # Reduce unnecessary whitespace
+        ax.set_xlim(X_tsne[:, 0].min() - 0.1, X_tsne[:, 0].max() + 0.1)
+        # The canvas must be drawn before its pixel buffer can be read.
+        fig.canvas.draw()
+        return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    finally:
+        # pyplot keeps every figure it creates until it is explicitly closed;
+        # without this, each request would leave another figure in memory.
+        plt.close(fig)
 with gr.Blocks() as demo:
     # Header section
     gr.Markdown("# DNA Identifier Tool")
                 inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
     with gr.Row():
         btn_defaults = gr.Button("I'm feeling lucky")
         btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
     with gr.Tab("Genus Prediction"):
+        gr.Markdown("""
+        # Genus prediction
+        A demo of predicting the genus of a DNA sequence using multiple
+        approaches (method dropdown):
+        - **fine_tuned_model**: using our
+          `LofiAmazon/BarcodeBERT-Finetuned-Amazon` which predicts the genus
+          based on the DNA sequence and environmental data.
+        - **cosine**: computes a cosine similarity between the DNA sequence
+          embedding generated by our model and the embeddings of known samples
+          that we precomputed and stored in a Pinecone index. This method
+          DOES NOT examine ecological layer data.
+        """)
         gr.Interface(
             fn=predict_genus,
             inputs=[
                 inp_lng,
             ],
             outputs=["image"],
+            allow_flagging="never",
         )
+    with gr.Tab("DNA Embedding Space Visualizer"):
+        gr.Markdown("""
+        # DNA Embedding Space Visualizer
+        We show a 2D t-SNE plot of the DNA embeddings of the most common
+        genera in our dataset (five by default; use the slider below to
+        change how many). This shows that the DNA Transformer model is
+        learning to cluster similar DNA sequences together.
+        """)
+        gr.Interface(
+            fn=cluster_dna,
+            inputs=[
+                gr.Slider(minimum=1, maximum=10, step=1, value=5,
+                          label="Number of top genera to visualize")
+            ],
+            outputs=["image"],
+            allow_flagging="never",
+        )
 demo.launch()

config.py CHANGED Viewed

@@ -25,4 +25,5 @@ MODELS = {
 DATASETS = {
     "ecolayers": "LofiAmazon/Global-Ecolayers",
 }

 DATASETS = {
     "ecolayers": "LofiAmazon/Global-Ecolayers",
+    "amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
 }