Spaces:
Sleeping
Sleeping
Implement tSNE
Browse files
app.py
CHANGED
@@ -58,6 +58,7 @@ classification_model.eval()
|
|
58 |
|
59 |
# Load datasets
|
60 |
ecolayers_ds = load_dataset(DATASETS["ecolayers"])
|
|
|
61 |
|
62 |
|
63 |
def set_default_inputs():
|
@@ -133,7 +134,6 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
|
|
133 |
top_k.values.detach().numpy(),
|
134 |
index=[ID_TO_GENUS_MAP[i] for i in top_k.indices.detach().numpy()]
|
135 |
)
|
136 |
-
# top_k = pd.Series(top_k.values.detach().numpy(), index=top_k.indices.detach().numpy())
|
137 |
|
138 |
fig, ax = plt.subplots()
|
139 |
ax.bar(top_k.index.astype(str), top_k.values)
|
@@ -148,6 +148,34 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
|
|
148 |
return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
|
149 |
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
with gr.Blocks() as demo:
|
152 |
# Header section
|
153 |
gr.Markdown("# DNA Identifier Tool")
|
@@ -169,16 +197,24 @@ with gr.Blocks() as demo:
|
|
169 |
inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
|
170 |
|
171 |
with gr.Row():
|
172 |
-
btn_run = gr.Button("Predict")
|
173 |
-
btn_run.click(
|
174 |
-
fn=preprocess,
|
175 |
-
inputs=[inp_dna, inp_lat, inp_lng],
|
176 |
-
)
|
177 |
-
|
178 |
btn_defaults = gr.Button("I'm feeling lucky")
|
179 |
btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
|
180 |
|
181 |
with gr.Tab("Genus Prediction"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
gr.Interface(
|
183 |
fn=predict_genus,
|
184 |
inputs=[
|
@@ -188,26 +224,26 @@ with gr.Blocks() as demo:
|
|
188 |
inp_lng,
|
189 |
],
|
190 |
outputs=["image"],
|
|
|
191 |
)
|
192 |
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
with gr.Row() as row:
|
205 |
-
with gr.Column():
|
206 |
-
gr.Markdown("Plot of your DNA sequence among other known species clusters.")
|
207 |
-
# plot = gr.Plot("")
|
208 |
-
# btn_run.click(fn=tsne_DNA, inputs=[inp_dna, genus_out])
|
209 |
-
|
210 |
-
with gr.Column():
|
211 |
-
gr.Markdown("Plot of the five most common species at your sample coordinate.")
|
212 |
|
213 |
demo.launch()
|
|
|
58 |
|
59 |
# Load datasets
|
60 |
ecolayers_ds = load_dataset(DATASETS["ecolayers"])
|
61 |
+
amazon_ds = load_dataset(DATASETS["amazon"])
|
62 |
|
63 |
|
64 |
def set_default_inputs():
|
|
|
134 |
top_k.values.detach().numpy(),
|
135 |
index=[ID_TO_GENUS_MAP[i] for i in top_k.indices.detach().numpy()]
|
136 |
)
|
|
|
137 |
|
138 |
fig, ax = plt.subplots()
|
139 |
ax.bar(top_k.index.astype(str), top_k.values)
|
|
|
148 |
return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
|
149 |
|
150 |
|
151 |
+
def cluster_dna(top_k: float):
    """Render a 2D t-SNE scatter plot of DNA embeddings for the most common genera.

    Rows of the ``train`` split of ``amazon_ds`` with a missing ``genus`` are
    dropped, the ``top_k`` most frequent genera are kept, and the ``embeddings``
    column of the remaining rows is projected to two dimensions with t-SNE.
    Points are coloured by their label-encoded genus.

    Args:
        top_k: Number of most frequent genera to include. Declared as a float
            and truncated with ``int()`` before use.

    Returns:
        A ``PIL.Image`` in RGB mode built from the rendered figure canvas.
    """
    top_k = int(top_k)

    # Restrict the dataset to rows with a known genus among the top_k most frequent.
    df = amazon_ds["train"].to_pandas()
    df = df[df["genus"].notna()]
    top_genuses = df["genus"].value_counts().head(top_k).index
    df = df[df["genus"].isin(top_genuses)]

    X = np.stack(df["embeddings"].tolist())
    y = df["genus"].tolist()

    # Fixed random_state keeps the layout reproducible between calls.
    tsne = TSNE(
        n_components=2, perplexity=30, learning_rate=200,
        n_iter=1000, random_state=0,
    )
    X_tsne = tsne.fit_transform(X)

    # Integer-encode genus names so they can drive the colour map.
    y_encoded = LabelEncoder().fit_transform(y)

    fig, ax = plt.subplots()
    try:
        ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_encoded, cmap="viridis", alpha=0.7)
        ax.set_title(f"DNA Embedding Space (of {top_k} most common genera)")
        # Reduce unnecessary whitespace
        ax.set_xlim(X_tsne[:, 0].min() - 0.1, X_tsne[:, 0].max() + 0.1)
        fig.canvas.draw()

        return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
    finally:
        # pyplot keeps every figure registered until it is closed; without this
        # each request would leave another figure alive in the running app.
        plt.close(fig)
|
178 |
+
|
179 |
with gr.Blocks() as demo:
|
180 |
# Header section
|
181 |
gr.Markdown("# DNA Identifier Tool")
|
|
|
197 |
inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
|
198 |
|
199 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
btn_defaults = gr.Button("I'm feeling lucky")
|
201 |
btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
|
202 |
|
203 |
with gr.Tab("Genus Prediction"):
|
204 |
+
gr.Markdown("""
|
205 |
+
# Genus prediction
|
206 |
+
|
207 |
+
A demo of predicting the genus of a DNA sequence using multiple
|
208 |
+
approaches (method dropdown):
|
209 |
+
|
210 |
+
- **fine_tuned_model**: using our
|
211 |
+
`LofiAmazon/BarcodeBERT-Finetuned-Amazon` which predicts the genus
|
212 |
+
based on the DNA sequence and environmental data.
|
213 |
+
- **cosine**: computes a cosine similarity between the DNA sequence
|
214 |
+
embedding generated by our model and the embeddings of known samples
|
215 |
+
that we precomputed and stored in a Pinecone index. This method
|
216 |
+
DOES NOT examine ecological layer data.
|
217 |
+
""")
|
218 |
gr.Interface(
|
219 |
fn=predict_genus,
|
220 |
inputs=[
|
|
|
224 |
inp_lng,
|
225 |
],
|
226 |
outputs=["image"],
|
227 |
+
allow_flagging="never",
|
228 |
)
|
229 |
|
230 |
+
with gr.Tab("DNA Embedding Space Visualizer"):
|
231 |
+
gr.Markdown("""
|
232 |
+
# DNA Embedding Space Visualizer
|
233 |
+
|
234 |
+
We show a 2D t-SNE plot of the DNA embeddings of the five most common
|
235 |
+
genera in our dataset. This shows that the DNA Transformer model is
|
236 |
+
learning to cluster similar DNA sequences together.
|
237 |
+
""")
|
238 |
|
239 |
+
gr.Interface(
|
240 |
+
fn=cluster_dna,
|
241 |
+
inputs=[
|
242 |
+
gr.Slider(minimum=1, maximum=10, step=1, value=5,
|
243 |
+
label="Number of top genera to visualize")
|
244 |
+
],
|
245 |
+
outputs=["image"],
|
246 |
+
allow_flagging="never",
|
247 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
demo.launch()
|
config.py
CHANGED
@@ -25,4 +25,5 @@ MODELS = {
|
|
25 |
|
26 |
DATASETS = {
|
27 |
"ecolayers": "LofiAmazon/Global-Ecolayers",
|
|
|
28 |
}
|
|
|
25 |
|
26 |
DATASETS = {
|
27 |
"ecolayers": "LofiAmazon/Global-Ecolayers",
|
28 |
+
"amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
|
29 |
}
|