Eachan Johnson committed on
Commit
028bbd0
·
1 Parent(s): c35d034

Major refactor

Browse files
Files changed (3) hide show
  1. .gitignore +3 -2
  2. app.py +480 -364
  3. example-data/examples.json +122 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
- /*.csv
2
- /cache/models--*
 
 
1
+ /cache/duvida/models--*
2
+ /cache/downloads/
3
+ *.log
app.py CHANGED
@@ -1,11 +1,15 @@
1
  """Gradio demo for schemist."""
2
 
3
  from typing import Iterable, List, Optional, Union
 
4
  from functools import partial
5
  from io import TextIOWrapper
 
6
  import json
7
  import os
8
- # os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"
 
 
9
 
10
  from carabiner import cast, print_err
11
  from carabiner.pd import read_table
@@ -22,18 +26,20 @@ from schemist.converting import (
22
  )
23
  from schemist.tables import converter
24
  import torch
 
25
 
 
26
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
 
28
  CACHE = "./cache"
29
- MAX_ROWS = 4000
30
- BATCH_SIZE=32
31
  HEADER_FILE = os.path.join("sources", "header.md")
32
  with open("repos.json", "r") as f:
33
  MODEL_REPOS = json.load(f)
34
 
35
  MODELBOXES = {
36
- key: AutoModelBox.from_pretrained(val, cache_dir=CACHE)
37
  for key, val in MODEL_REPOS.items()
38
  }
39
  [mb.to(DEVICE) for mb in MODELBOXES.values()]
@@ -45,24 +51,46 @@ EXTRA_METRICS = {
45
  "Information sensitivity (approx.)": lambda modelbox, candidates: modelbox.information_sensitivity(candidates=candidates, batch_size=BATCH_SIZE, optimality_approximation=True, approximator="squared_jacobian", cache=CACHE).map(lambda x: {"information sensitivity": torch.log10(x["information sensitivity"])}),
46
  }
47
 
 
 
 
48
  def get_dropdown_options(df, _type = str):
49
  if _type == str:
50
  cols = list(df.select_dtypes(exclude=[np.number]))
51
  else:
52
  cols = list(df.select_dtypes([np.number]))
53
- return gr.Dropdown(choices=cols, interactive=True, value=cols[0], visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
 
56
- def load_input_data(file: Union[TextIOWrapper, str]) -> pd.DataFrame:
57
  file = file if isinstance(file, str) else file.name
58
  print_err(f"Loading {file}")
59
- df = read_table(file)
60
  print_err(df.head())
61
- return gr.Dataframe(value=df, visible=True), get_dropdown_options(df, str)
 
 
 
62
 
63
 
64
  def _clean_split_input(strings: str) -> List[str]:
65
- return [s2.strip() for s in strings.split("\n") for s2 in s.split(",")]
 
 
 
 
66
 
67
 
68
  def _convert_input(
@@ -82,7 +110,7 @@ def _convert_input(
82
  def convert_one(
83
  strings: str,
84
  input_representation: str = 'smiles',
85
- output_representation: Union[Iterable[str], str] = 'smiles'
86
  ):
87
  output_representation = cast(output_representation, to=list)
88
  for rep in output_representation:
@@ -168,7 +196,9 @@ def predict_one(
168
  strings: str,
169
  input_representation: str = 'smiles',
170
  predict: Union[Iterable[str], str] = 'smiles',
171
- extra_metrics: Optional[Union[Iterable[str], str]] = None
 
 
172
  ):
173
  prediction_df = convert_one(
174
  strings=strings,
@@ -180,15 +210,26 @@ def predict_one(
180
  predict=predict,
181
  extra_metrics=extra_metrics,
182
  )
183
- return gr.DataFrame(
184
- prediction_df[
185
- ['id', 'pubchem_name', 'pubchem_id']
186
- + prediction_cols
187
- + ['smiles', 'inchikey', "mwt", "clogp"]
188
- ],
189
- visible=True
190
- )
191
-
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  def convert_file(
194
  df: pd.DataFrame,
@@ -230,10 +271,12 @@ def predict_file(
230
  input_representation: str = 'smiles',
231
  predict: str = 'smiles',
232
  predict2: Optional[str] = None,
233
- extra_metrics: Optional[Union[Iterable[str], str]] = None
 
 
234
  ):
235
  predict = cast(predict, to=list)
236
- if predict2 is not None:
237
  predict += cast(predict2, to=list)
238
  if extra_metrics is None:
239
  extra_metrics = []
@@ -266,25 +309,49 @@ def predict_file(
266
  col for col in prediction_df
267
  if col not in main_cols
268
  ]
269
- return prediction_df[
270
  ['id', 'inchikey']
271
  + [column]
272
  + prediction_cols + other_cols
273
  + ['smiles', "mwt", "clogp"]
274
  ]
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  def draw_one(
277
- strings: Union[Iterable[str], str],
278
- input_representation: str = 'smiles'
 
279
  ):
280
- message = f"Drawing {len(cast(strings, to=list))} molecules..."
 
 
 
 
281
  gr.Info(message, duration=10)
282
- _ids = _convert_input(
283
- strings,
284
- input_representation,
285
- ["inchikey", "id", "pubchem_name"],
286
- )
287
- mols = cast(_x2mol(_clean_split_input(strings), input_representation), to=list)
288
  if isinstance(mols, Mol):
289
  mols = [mols]
290
  return Draw.MolsToGridImage(
@@ -294,6 +361,7 @@ def draw_one(
294
  legends=["\n".join(items) for items in zip(*_ids.values())],
295
  )
296
 
 
297
  def log10_if_all_positive(df, col):
298
  if np.all(df[col] > 0.):
299
  df[col] = np.log10(df[col])
@@ -355,386 +423,434 @@ def download_table(
355
  df: pd.DataFrame
356
  ) -> str:
357
  df_hash = nm.hash(pd.util.hash_pandas_object(df).values)
358
- filename = f"predicted-{df_hash}.csv"
 
 
359
  df.to_csv(filename, index=False)
360
  return gr.DownloadButton(value=filename, visible=True)
361
 
362
 
363
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
- with open(HEADER_FILE, 'r') as f:
366
- header_md = f.read()
367
- gr.Markdown(header_md)
368
 
369
- with gr.Tab(label="Paste one per line"):
370
- input_format_single = gr.Dropdown(
 
 
 
 
 
 
371
  label="Input string format",
372
  choices=list(_FROM_FUNCTIONS),
373
  value="smiles",
374
  interactive=True,
375
- )
376
- input_line = gr.Textbox(
377
- label="Input",
378
- placeholder="Paste your molecule here, one per line",
379
- lines=2,
380
- interactive=True,
381
- submit_btn=True,
382
- )
383
- output_species_single = gr.CheckboxGroup(
384
  label="Species for prediction",
385
  choices=list(MODEL_REPOS),
386
  value=list(MODEL_REPOS)[:1],
387
  interactive=True,
388
- )
389
- extra_metric = gr.CheckboxGroup(
390
  label="Extra metrics (Doubscore & Information Sensitivity can increase calculation time to a couple of minutes!)",
391
  choices=list(EXTRA_METRICS),
392
  value=list(EXTRA_METRICS)[:2],
393
  interactive=True,
394
- )
395
- examples = gr.Examples(
396
- examples=[
397
- [
398
- '\n'.join([
399
- "C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)N4CCNCC4)F)C(=O)O",
400
- "CN1C(=NC(=O)C(=O)N1)SCC2=C(N3[C@@H]([C@@H](C3=O)NC(=O)/C(=N\OC)/C4=CSC(=N4)N)SC2)C(=O)O",
401
- "CC(C)(C(=O)O)O/N=C(/C1=CSC(=N1)N)\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)C[N+]4(CCCC4)CCNC(=O)C5=C(C(=C(C=C5)O)O)Cl)C(=O)[O-]",
402
- "CC(=O)NC[C@H]1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F",
403
- "C1CC2=CC(=NC=C2OC1)CNC3CCN(CC3)C[C@@H]4CN5C(=O)C=CC6=C5N4C(=O)C=N6",
404
- ]),
405
- "Yersinia pestis",
406
- list(EXTRA_METRICS)[:2],
407
- ], # cipro, ceftriaxone, cefiderocol, linezolid, gepotidacin
408
- [
409
- '\n'.join([
410
- "C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O",
411
- "CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=CC=C3)N)C(=O)O)C",
412
- "CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=C(C=C3)O)N)C(=O)O)C",
413
- "C[C@@H]1[C@@H]2[C@H](C(=O)N2C(=C1S[C@H]3C[C@H](NC3)C(=O)N(C)C)C(=O)O)[C@@H](C)O",
414
- "C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O",
415
- "CC1=C2C=CC=C(C2=C(C3=C1C[C@H]4[C@@H](C(=O)C(=C([C@]4(C3=O)O)O)C(=O)N)N(C)C)O)O",
416
- ]),
417
- "Staphylococcus aureus",
418
- list(EXTRA_METRICS)[:2],
419
- ], # doxorubicin, ampicillin, amoxicillin, meropenem, tetracycline, anhydrotetracycline
420
- [
421
- '\n'.join([
422
- "C1=C(SC(=N1)SC2=NN=C(S2)N)[N+](=O)[O-]",
423
- "C1CN(CCC12C3=CC=CC=C3NC(=O)O2)CCC4=CC=C(C=C4)C(F)(F)F",
424
- "COC1=CC(=CC(=C1OC)OC)CC2=CN=C(N=C2N)N",
425
- "CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N",
426
- "C1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CCN)O)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)N)O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CN)O)O)O)N",
427
- "C1=CN=CC=C1C(=O)NN",
428
- ]),
429
- ["Escherichia coli", "Acinetobacter baumannii"],
430
- list(EXTRA_METRICS)[:2],
431
- ], # Halicin, Abaucin, Trimethoprim, Sulfamethoxazole, Amikacin, Isoniazid
432
- [
433
- '\n'.join([
434
- "CC[C@H](C)[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N2CCC[C@@H]2C(=O)N3CCC[C@H]3C(=O)N[C@H](C(=O)N[C@H](C(=O)N1)CC4=CNC5=CC=CC=C54)[C@@H](C)O)CO)C)CCN)CCN)CC6=CNC7=CC=CC=C76)CCN)CCN)CCCN)CCN",
435
- "C[C@H]1[C@H]([C@@](C[C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC3=C4C=C5C=C3OC6=C(C=C(C=C6)[C@H]([C@H](C(=O)N[C@H](C(=O)N[C@H]5C(=O)N[C@@H]7C8=CC(=C(C=C8)O)C9=C(C=C(C=C9O)O)[C@H](NC(=O)[C@H]([C@@H](C1=CC(=C(O4)C=C1)Cl)O)NC7=O)C(=O)O)CC(=O)N)NC(=O)[C@@H](CC(C)C)NC)O)Cl)CO)O)O)(C)N)O",
436
- "CN1[C@H](C(=O)NCC2=C(C=CC=C2SC3=C(CN[C@H](C(=O)N[C@H](C1=O)CCCCN)CCCN)C=CC=N3)C4=CC=C(C=C4)C(=O)O)CC5=CNC6=CC=CC=C65",
437
- "C[C@@]1(CO[C@@H]([C@@H]([C@H]1NC)O)O[C@H]2[C@@H](C[C@@H]([C@H]([C@@H]2O)O[C@@H]3[C@@H](CC=C(O3)CNCCO)N)N)NC(=O)[C@H](CCN)O)O",
438
- "CC(C1CCC(C(O1)OC2C(CC(C(C2O)OC3C(C(C(CO3)(C)O)NC)O)N)N)N)NC",
439
- "C[C@H]1/C=C/C=C(\C(=O)NC2=C(C(=C3C(=C2O)C(=C(C4=C3C(=O)[C@](O4)(O/C=C/[C@@H]([C@H]([C@H]([C@@H]([C@@H]([C@@H]([C@H]1O)C)O)C)OC(=O)C)C)OC)C)C)O)O)/C=N/N5CCN(CC5)C)/C",
440
- ]),
441
- "Acinetobacter baumannii",
442
- list(EXTRA_METRICS)[:2],
443
- ], # murepavadin, vancomycin, zosurabalpin, plazomicin, Gentamicin, rifampicin
444
- [
445
- '\n'.join([
446
- "CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)CC4)N=C3",
447
- "CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)[C@@H](C4)N)N=C3",
448
- "CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)[C@H](CC4)[NH3+])N=C3.[Cl-]",
449
- "C1=C(C(=O)NC(=O)N1)F",
450
- "CCCCCCNC(=O)N1C=C(C(=O)NC1=O)F",
451
- "C[C@@H]1OC[C@@H]2[C@@H](O1)[C@@H]([C@H]([C@@H](O2)O[C@H]3[C@H]4COC(=O)[C@@H]4[C@@H](C5=CC6=C(C=C35)OCO6)C7=CC(=C(C(=C7)OC)O)OC)O)O",
452
- ]),
453
- "Escherichia coli",
454
- list(EXTRA_METRICS)[:2],
455
- ], # Debio1452, Debio-1452-NH3, Fabimycin, 5-FU, Carmofur, Etoposide
456
- [
457
- '\n'.join([
458
- "COC1=CC(=CC(=C1OC)OC)CC2=CN=C(N=C2N)N",
459
- "CC(C)C1=CC=C(C=C1)CN2C=CC3=C2C=CC4=C3C(=NC(=N4)NC5CC5)N",
460
- "C1=CC(=CC=C1CCC2=CNC3=C2C(=O)NC(=N3)N)C(=O)N[C@@H](CCC(=O)O)C(=O)O",
461
- "CC1=C(C2=C(C=C1)N=C(NC2=O)N)SC3=CC=NC=C3",
462
- "CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)O",
463
- "CC1=NC2=C(C=C(C=C2)CN(C)C3=CC=C(S3)C(=O)N[C@@H](CCC(=O)O)C(=O)O)C(=O)N1",
464
- ]),
465
- "Klebsiella pneumoniae",
466
- list(EXTRA_METRICS)[:2],
467
- ], # Trimethoprim, SCH79797, Pemetrexed, Nolatrexed, Methotrexate, Raltitrexed
468
- [
469
- '\n'.join([
470
- "C[C@H]([C@@H](C(=O)NO)NC(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)CN3CCOCC3)O",
471
- "CC(C)C1=CC=C(C=C1)CN2C=CC3=C2C=CC4=C3C(=NC(=N4)NC5CC5)N",
472
- "C1=CC=C(C=C1)CNC2=NC(=NC3=CC=CC=C32)NCC4=CC=CC=C4",
473
- "CC(C)(C)C1=CC=C(C=C1)C(=O)NC(=S)NC2=CC=C(C=C2)NC(=O)CCCCN(C)C",
474
- "CCC1=C(C(=NC(=N1)N)N)C2=CC=C(C=C2)Cl",
475
- "C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=CN=C3C(=N2)C(=NC(=N3)N)N",
476
- ]),
477
- "Klebsiella pneumoniae",
478
- list(EXTRA_METRICS)[:2],
479
- ], # CHIR-090, SCH79797, DBeQ, Tenovin-6, Pyrimethamine, Aminopterin
480
-
481
- ],
482
- example_labels=[
483
- "_Y. pestis_ (plague) vs Ciprofloxacin, Ceftriaxone, Cefiderocol, Linezolid, Gepotidacin",
484
- "_S. aureus_ vs Doxorubicin, Ampicillin, Amoxicillin, Meropenem, Tetracycline, Anhydrotetracycline",
485
- "_E. coli_ and _A. baumannii_ vs Halicin, Abaucin, Trimethoprim, Sulfamethoxazole, Amikacin, Isoniazid",
486
- "_A. baumannii_ vs Murepavadin, Vancomycin, Zosurabalpin, Plazomicin, Gentamicin, Rifampicin",
487
- "_E. coli_ vs Debio-1452, Debio-1452-NH3, Fabimycin, 5-FU, Carmofur, Etoposide",
488
- "_K. pneumoniae_ vs Trimethoprim, Pemetrexed, Nolatrexed, Methotrexate, Raltitrexed",
489
- "_K. pneumoniae_ vs CHIR-090, SCH79797, DBeQ, Tenovin-6, Pyrimethamine, Aminopterin"
490
- ],
491
- inputs=[input_line, output_species_single, extra_metric],
492
- cache_mode="eager",
493
- )
494
- download_single = gr.DownloadButton(
495
- label="Download predictions",
496
- visible=False,
497
- )
498
- # with gr.Row():
499
- output_line = gr.DataFrame(
500
- label="Predictions",
501
- interactive=False,
502
- visible=False,
503
- )
504
- drawing = gr.Image(label="Chemical structures")
505
-
506
- gr.on(
507
- [
508
- input_line.submit,
509
- ],
510
- fn=predict_one,
511
- inputs=[
512
- input_line,
513
- input_format_single,
514
- output_species_single,
515
- extra_metric,
516
- ],
517
- outputs={
518
- output_line,
519
- }
520
- ).then(
521
- draw_one,
522
- inputs=[
523
- input_line,
524
- input_format_single,
525
- ],
526
- outputs=drawing,
527
- ).then(
528
- download_table,
529
- inputs=output_line,
530
- outputs=download_single
531
- )
532
 
533
- with gr.Tab(f"Predict on structures from a file (max. {MAX_ROWS} rows, ≤ 2 species)"):
534
- input_file = gr.File(
535
  label="Upload a table of chemical compounds here",
536
  file_types=[".xlsx", ".csv", ".tsv", ".txt"],
537
- )
538
- with gr.Row():
539
- input_column = gr.Dropdown(
540
- label="Input column name",
541
- choices=[],
542
- allow_custom_value=True,
543
- visible=False,
544
- )
545
- input_format = gr.Dropdown(
546
- label="Input string format",
547
- choices=list(_FROM_FUNCTIONS),
548
- value="smiles",
549
- interactive=True,
550
- visible=True,
551
- )
552
- output_species = [
553
  gr.Dropdown(
554
  label="Species 1 for prediction",
555
  choices=list(MODEL_REPOS),
556
  value=list(MODEL_REPOS)[0],
557
  interactive=True,
 
558
  ),
559
  gr.Dropdown(
560
  label="Species 2 for prediction",
561
  choices=list(MODEL_REPOS),
562
  value=None,
563
  interactive=True,
 
564
  ),
565
- ]
566
- extra_metric_file = gr.CheckboxGroup(
567
  label="Extra metrics (Information Sensitivity can increase calculation time)",
568
  choices=list(EXTRA_METRICS),
569
  value=list(EXTRA_METRICS)[:2],
570
  interactive=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  )
572
-
573
- go_button2 = gr.Button(
574
- value="Predict!",
575
- )
576
 
577
- download = gr.DownloadButton(
578
- label="Download predictions",
579
- visible=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  )
581
- input_data = gr.Dataframe(
582
- label="Input data",
583
- max_height=500,
 
584
  visible=False,
585
- interactive=False,
586
- )
587
- with gr.Row():
588
- observed_col = gr.Dropdown(
589
- label="Observed column (y-axis) for left plot",
590
- choices=[],
591
- value=None,
592
- interactive=True,
593
- visible=False,
594
- )
595
- color_col = gr.Dropdown(
596
- label="Color for left plot",
597
- choices=[],
598
- value=None,
599
- interactive=True,
600
- visible=False,
601
- )
602
- with gr.Row():
603
- any_x_col = gr.Dropdown(
604
- label="x-axis for right plot",
605
- choices=[],
606
- value=None,
607
- interactive=True,
608
- visible=False,
609
- )
610
- any_y_col = gr.Dropdown(
611
- label="y-axis for right plot",
612
- choices=[],
613
- value=None,
614
- interactive=True,
615
- visible=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
  )
617
- any_color_col = gr.Dropdown(
618
- label="Color for right plot",
619
- choices=[],
620
- value=None,
621
- interactive=True,
622
- visible=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  )
624
- plot_button = gr.Button(
625
- value="Plot!",
626
- visible=False,
627
- )
628
- file_examples = gr.Examples(
629
- examples=[
630
- [
631
- "example-data/stokes2020-eco.csv",
632
- "SMILES",
633
- "Escherichia coli",
634
- "Mean_Growth",
635
- "Escherichia coli: Doubtscore",
636
- list(EXTRA_METRICS)[:3],
637
  ],
638
- [
639
- "example-data/liu23-abau.csv",
640
- "SMILES",
641
- "Acinetobacter baumannii",
642
- "Mean",
643
- "Acinetobacter baumannii: Doubtscore",
644
- list(EXTRA_METRICS)[:3],
645
  ],
646
- [
647
- "example-data/wong24-sau-tox-5000.csv",
648
- "SMILES",
649
- "Staphylococcus aureus",
650
- "Mean",
651
- "Staphylococcus aureus: Doubtscore",
652
- list(EXTRA_METRICS)[:3],
653
  ],
654
- ],
655
- example_labels=[
656
- "E. coli training data from Stokes J. et al., Cell, 2020",
657
- "A. baumannii training data from Liu, 2023",
658
- "S. aureus and toxicity training data from Wong, 2024",
659
- ],
660
- inputs=[input_file, input_column, output_species[0], observed_col, color_col, extra_metric_file],
661
- cache_mode="eager",
662
- )
663
- with gr.Row():
664
- pred_vs_observed = gr.ScatterPlot(
665
- label="Prediction vs observed",
666
- x_title="Predicted MIC (µM)",
667
- y_title="Observed",
668
- visible=False,
669
- height=600,
670
  )
671
- plot_any_vs_any = gr.ScatterPlot(
672
- label="Any vs any",
673
- visible=False,
674
- height=600,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  )
676
-
677
- load_data_action = {
678
- "fn": load_input_data,
679
- "inputs": [input_file],
680
- "outputs": [input_data, input_column]
681
- }
682
-
683
- file_examples.load_input_event.then(
684
- **load_data_action,
685
- )
686
- input_file.upload(
687
- **load_data_action,
688
- )
689
- go2_click_event = go_button2.click(
690
- predict_file,
691
- inputs=[
692
- input_data,
693
- input_column,
694
- input_format,
695
- *output_species,
696
- extra_metric_file,
697
- ],
698
- outputs={
699
- input_data,
700
- }
701
- ).then(
702
- download_table,
703
- inputs=input_data,
704
- outputs=download
705
- ).then(
706
- lambda: gr.Button(visible=True),
707
- outputs=[plot_button]
708
- )
709
-
710
- for dropdown in [observed_col, color_col, any_color_col, any_x_col, any_y_col]:
711
- go2_click_event.then(
712
- partial(get_dropdown_options, _type="number"),
713
- inputs=[input_data],
714
- outputs=[dropdown],
715
  )
716
 
717
- plot_button.click(
718
- plot_pred_vs_observed,
719
- inputs=[
720
- input_data,
721
- output_species[0],
722
- observed_col,
723
- color_col,
724
- ],
725
- outputs=[pred_vs_observed],
726
- ).then(
727
- plot_x_vs_y,
728
- inputs=[
729
- input_data,
730
- any_x_col,
731
- any_y_col,
732
- any_color_col,
733
- ],
734
- outputs=[plot_any_vs_any],
735
- )
736
-
737
- if __name__ == "__main__":
738
- demo.queue()
739
- demo.launch(share=True)
740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Gradio demo for schemist."""
2
 
3
  from typing import Iterable, List, Optional, Union
4
+ import csv
5
  from functools import partial
6
  from io import TextIOWrapper
7
+ import itertools
8
  import json
9
  import os
10
+ import sys
11
+
12
+ csv.field_size_limit(sys.maxsize)
13
 
14
  from carabiner import cast, print_err
15
  from carabiner.pd import read_table
 
26
  )
27
  from schemist.tables import converter
28
  import torch
29
+ from duvida.stateless.config import config
30
 
31
# Theme identifier handed to ``gr.Blocks(theme=...)``.
THEME = 'd8ahazard/material_design_rd'
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CACHE = "./cache"
# Upper bound on the number of rows read from an uploaded table.
MAX_ROWS = 500
BATCH_SIZE = 32
HEADER_FILE = os.path.join("sources", "header.md")
with open("repos.json", "r") as f:
    MODEL_REPOS = json.load(f)

# One model box per entry of repos.json, cached under ``<CACHE>/duvida``.
MODELBOXES = {
    key: AutoModelBox.from_pretrained(val, cache_dir=os.path.join(CACHE, "duvida"))
    for key, val in MODEL_REPOS.items()
}
# A plain loop rather than a throwaway list comprehension: the call is made
# only for its side effect and its return value is discarded, as before.
# NOTE(review): this assumes ``.to`` moves the model box in place - confirm.
for mb in MODELBOXES.values():
    mb.to(DEVICE)
 
51
  "Information sensitivity (approx.)": lambda modelbox, candidates: modelbox.information_sensitivity(candidates=candidates, batch_size=BATCH_SIZE, optimality_approximation=True, approximator="squared_jacobian", cache=CACHE).map(lambda x: {"information sensitivity": torch.log10(x["information sensitivity"])}),
52
  }
53
 
54
+ with open(os.path.join("example-data", "examples.json"), "r") as f:
55
+ EXAMPLES = json.load(f)
56
+
57
def get_dropdown_options(df, _type = str):
    """Build a visible dropdown whose choices are columns of ``df``.

    Args:
        df: Table whose columns are offered as choices.
        _type: ``str`` selects the non-numeric columns; any other value
            selects the numeric columns.

    Returns:
        A ``gr.Dropdown`` listing the selected columns, pre-set to the first
        column label that is not ``None`` (or ``""`` when there is none).
    """
    if _type == str:
        cols = list(df.select_dtypes(exclude=[np.number]))
    else:
        cols = list(df.select_dtypes([np.number]))
    non_none = [col for col in cols if col is not None]
    # Guard on the filtered list: ``cols`` may be non-empty while every label
    # in it is None, in which case indexing ``non_none[0]`` would raise.
    default_value = non_none[0] if non_none else ""
    print_err(f"Dropdown default value is {default_value}")
    return gr.Dropdown(
        choices=cols,
        interactive=True,
        value=default_value,
        visible=True,
        allow_custom_value=True,
    )
75
 
76
 
77
def load_input_data(file: Union[TextIOWrapper, str], return_pd: bool = False) -> tuple:
    """Read an input table and build the Gradio updates that display it.

    Args:
        file: Path to the table, or an object whose ``.name`` is that path.
        return_pd: When true, also hand back the raw table read from disk.

    Returns:
        A 2-tuple. The second item is always the dropdown produced by
        ``get_dropdown_options(df, str)``. The first item is a
        ``gr.Dataframe`` showing the table or, when ``return_pd`` is true,
        the pair ``(df, gr.Dataframe)``.
    """
    file = file if isinstance(file, str) else file.name
    print_err(f"Loading {file}")
    # Reading is capped at MAX_ROWS rows.
    df = read_table(file, nrows=MAX_ROWS)
    print_err(df.head())
    table = gr.Dataframe(value=df, visible=True)
    column_dropdown = get_dropdown_options(df, str)
    if return_pd:
        return (df, table), column_dropdown
    return table, column_dropdown
86
 
87
 
88
  def _clean_split_input(strings: str) -> List[str]:
89
+ return [
90
+ s2.split(":")[-1].strip()
91
+ for s in strings.split("\n")
92
+ for s2 in s.split(",")
93
+ ]
94
 
95
 
96
  def _convert_input(
 
110
  def convert_one(
111
  strings: str,
112
  input_representation: str = 'smiles',
113
+ output_representation: Union[Iterable[str], str] = 'smiles',
114
  ):
115
  output_representation = cast(output_representation, to=list)
116
  for rep in output_representation:
 
196
  strings: str,
197
  input_representation: str = 'smiles',
198
  predict: Union[Iterable[str], str] = 'smiles',
199
+ extra_metrics: Optional[Union[Iterable[str], str]] = None,
200
+ return_pd: bool = False
201
+ # progress = gr.Progress(track_tqdm=True)
202
  ):
203
  prediction_df = convert_one(
204
  strings=strings,
 
210
  predict=predict,
211
  extra_metrics=extra_metrics,
212
  )
213
+ df = prediction_df[
214
+ ['id', 'pubchem_name', 'pubchem_id']
215
+ + prediction_cols
216
+ + ['smiles', 'inchikey', "mwt", "clogp"]
217
+ ]
218
+ if return_pd:
219
+ return (
220
+ df,
221
+ gr.DataFrame(
222
+ df,
223
+ pinned_columns=3,
224
+ visible=True,
225
+ )
226
+ )
227
+ else:
228
+ return gr.DataFrame(
229
+ df,
230
+ pinned_columns=3,
231
+ visible=True,
232
+ )
233
 
234
  def convert_file(
235
  df: pd.DataFrame,
 
271
  input_representation: str = 'smiles',
272
  predict: str = 'smiles',
273
  predict2: Optional[str] = None,
274
+ extra_metrics: Optional[Union[Iterable[str], str]] = None,
275
+ return_pd: bool = False
276
+ # progress = gr.Progress(track_tqdm=True)
277
  ):
278
  predict = cast(predict, to=list)
279
+ if predict2 is not None and predict2 in MODELBOXES:
280
  predict += cast(predict2, to=list)
281
  if extra_metrics is None:
282
  extra_metrics = []
 
309
  col for col in prediction_df
310
  if col not in main_cols
311
  ]
312
+ prediction_df = prediction_df[
313
  ['id', 'inchikey']
314
  + [column]
315
  + prediction_cols + other_cols
316
  + ['smiles', "mwt", "clogp"]
317
  ]
318
 
319
+ if return_pd:
320
+ return (
321
+ prediction_df,
322
+ gr.Dataframe(
323
+ label="Predictions",
324
+ value=prediction_df,
325
+ pinned_columns=3,
326
+ visible=True,
327
+ wrap=True,
328
+ column_widths=[75] * prediction_df.shape[1],
329
+ ),
330
+ )
331
+ else:
332
+ return gr.Dataframe(
333
+ label="Predictions",
334
+ value=prediction_df,
335
+ pinned_columns=3,
336
+ visible=True,
337
+ wrap=True,
338
+ column_widths=[125] * prediction_df.shape[1],
339
+ )
340
+
341
+
342
  def draw_one(
343
+ df,
344
+ smiles_col: str = "smiles",
345
+ legends: Optional[Union[str, Iterable[str]]] = None
346
  ):
347
+ if legends is None:
348
+ legends = ["inchikey", "id", "pubchem_name"]
349
+ else:
350
+ legends = []
351
+ message = f"Drawing {df.shape[0]} molecules..."
352
  gr.Info(message, duration=10)
353
+ _ids = {col: df[col].tolist() for col in legends}
354
+ mols = cast(_x2mol(df[smiles_col], "smiles"), to=list)
 
 
 
 
355
  if isinstance(mols, Mol):
356
  mols = [mols]
357
  return Draw.MolsToGridImage(
 
361
  legends=["\n".join(items) for items in zip(*_ids.values())],
362
  )
363
 
364
+
365
  def log10_if_all_positive(df, col):
366
  if np.all(df[col] > 0.):
367
  df[col] = np.log10(df[col])
 
423
  df: pd.DataFrame
424
  ) -> str:
425
  df_hash = nm.hash(pd.util.hash_pandas_object(df).values)
426
+ filename = os.path.join(CACHE, "downloads", f"predicted-{df_hash}.csv")
427
+ if not os.path.exists(os.path.dirname(filename)):
428
+ os.makedirs(os.path.dirname(filename))
429
  df.to_csv(filename, index=False)
430
  return gr.DownloadButton(value=filename, visible=True)
431
 
432
 
433
def _predict_then_draw_then_download(
    strings: str,
    input_representation: str = 'smiles',
    predict: Union[Iterable[str], str] = 'smiles',
    extra_metrics: Optional[Union[Iterable[str], str]] = None,
    smiles_col: str = "smiles",
    legends: Optional[Union[str, Iterable[str]]] = None
):
    """Predict for pasted structures, draw them, and prepare the download.

    Args:
        strings: Pasted input text, forwarded to ``predict_one``.
        input_representation: Format of the pasted strings.
        predict: What to predict, forwarded to ``predict_one``.
        extra_metrics: Optional extra metrics, forwarded to ``predict_one``.
        smiles_col: Column of the prediction table passed to ``draw_one``.
        legends: Legend columns passed to ``draw_one``.

    Returns:
        A 3-tuple ``(gradio table, drawing, download button update)``.
    """
    df, gr_df = predict_one(
        strings=strings,
        input_representation=input_representation,
        predict=predict,
        extra_metrics=extra_metrics,
        return_pd=True,
    )
    # Forward the drawing options instead of silently ignoring them; with the
    # default arguments this is the same call as before.
    img = draw_one(
        df,
        smiles_col=smiles_col,
        legends=legends,
    )
    return gr_df, img, download_table(df)
453
+
454
+
455
def _load_then_predict_then_download_then_reveal_plot(
    file: str,
    column: str = 'smiles',
    input_representation: str = 'smiles',
    predict: str = 'smiles',
    predict2: Optional[str] = "",
    extra_metrics: Optional[Union[Iterable[str], str]] = None
):
    """Load a table from ``file``, run predictions on it, and prepare the download.

    An empty-string ``predict2`` is passed to ``predict_file`` as ``None``.

    Returns:
        A 2-tuple ``(gradio table of predictions, download button update)``.
    """
    # Only the raw table is needed from the loader's return value.
    (loaded, _loaded_table), _column_dropdown = load_input_data(
        file,
        return_pd=True,
    )
    second_species = None if predict2 == "" else predict2
    predicted, predicted_table = predict_file(
        loaded,
        column=column,
        input_representation=input_representation,
        predict=predict,
        predict2=second_species,
        extra_metrics=extra_metrics,
        return_pd=True,
    )
    print_err(predicted.head())
    return predicted_table, download_table(predicted)
482
+
483
 
484
def _initial_setup():
    """Create every Gradio component used by the demo.

    The components are only constructed here; the caller lays them out.

    Returns:
        An 11-tuple, in this order: ``line_inputs`` (dict), ``output_line``,
        ``download_single``, ``drawing``, ``file_inputs`` (dict, whose
        ``"species"`` entry is a list of two dropdowns), ``input_dataframe``,
        ``download``, ``plot_button``, ``left_plot_inputs`` (dict),
        ``right_plot_inputs`` (dict) and ``plots`` (dict).
    """
    print_err(f"Duvida config is {config}")
    print_err(f"Default torch device is {DEVICE}")

    def _plot_dropdown(label):
        # All plot selectors start empty and differ only in their label.
        return gr.Dropdown(
            label=label,
            choices=[],
            value=None,
            interactive=True,
            visible=True,
            allow_custom_value=True,
        )

    line_inputs = {
        "format": gr.Dropdown(
            label="Input string format",
            choices=list(_FROM_FUNCTIONS),
            value="smiles",
            interactive=True,
        ),
        "species": gr.CheckboxGroup(
            label="Species for prediction",
            choices=list(MODEL_REPOS),
            value=list(MODEL_REPOS)[:1],
            interactive=True,
        ),
        "extras": gr.CheckboxGroup(
            # Label typo fixed: the metric is spelled "Doubtscore".
            label="Extra metrics (Doubtscore & Information Sensitivity can increase calculation time to a couple of minutes!)",
            choices=list(EXTRA_METRICS),
            value=list(EXTRA_METRICS)[:2],
            interactive=True,
        ),
        "strings": gr.Textbox(
            label="Input",
            placeholder="Paste your molecule here, one per line.",
            lines=2,
            interactive=True,
            submit_btn=True,
        ),
    }
    output_line = gr.DataFrame(
        label="Predictions (scroll left and right)",
        interactive=False,
        max_chars=75,
        pinned_columns=3,
        visible=True,
    )
    download_single = gr.DownloadButton(
        label="Download predictions",
        visible=False,
    )
    drawing = gr.Image(label="Chemical structures")

    file_inputs = {
        "file": gr.File(
            label="Upload a table of chemical compounds here",
            file_types=[".xlsx", ".csv", ".tsv", ".txt"],
        ),
        "column": gr.Dropdown(
            label="Input column name",
            choices=[],
            allow_custom_value=True,
            visible=True,
            interactive=True,
        ),
        "format": gr.Dropdown(
            label="Input string format",
            choices=list(_FROM_FUNCTIONS),
            value="smiles",
            interactive=True,
            visible=True,
        ),
        "species": [
            gr.Dropdown(
                label="Species 1 for prediction",
                choices=list(MODEL_REPOS),
                value=list(MODEL_REPOS)[0],
                interactive=True,
                allow_custom_value=True,
            ),
            gr.Dropdown(
                label="Species 2 for prediction",
                choices=list(MODEL_REPOS),
                value=None,
                interactive=True,
                allow_custom_value=True,
            ),
        ],
        "extras": gr.CheckboxGroup(
            label="Extra metrics (Information Sensitivity can increase calculation time)",
            choices=list(EXTRA_METRICS),
            value=list(EXTRA_METRICS)[:2],
            interactive=True,
        ),
    }

    input_dataframe = gr.Dataframe(
        label="Input data",
        max_height=500,
        visible=True,
        interactive=False,
        show_fullscreen_button=True,
        show_search="filter",
        max_chars=45,
    )
    download = gr.DownloadButton(
        label="Download predictions",
        visible=False,
    )
    plot_button = gr.Button(
        value="Plot!",
        visible=False,
    )

    left_plot_inputs = {
        "observed": _plot_dropdown("Observed column (y-axis) for left plot"),
        "color": _plot_dropdown("Color for left plot"),
    }

    right_plot_inputs = {
        "x": _plot_dropdown("x-axis for right plot"),
        "y": _plot_dropdown("y-axis for right plot"),
        "color": _plot_dropdown("Color for right plot"),
    }
    plots = {
        "left": gr.ScatterPlot(
            height=500,
            visible=False,
        ),
        "right": gr.ScatterPlot(
            height=500,
            visible=False,
        ),
    }

    return (
        line_inputs,
        output_line,
        download_single,
        drawing,
        file_inputs,
        input_dataframe,
        download,
        plot_button,
        left_plot_inputs,
        right_plot_inputs,
        plots,
    )
662
+
663
if __name__ == "__main__":
    # Components are constructed first and placed into the layout below with
    # ``.render()``, so the examples can reference them before they appear.
    (
        line_inputs,
        output_line,
        download_single,
        drawing,
        file_inputs,
        input_dataframe,
        download,
        plot_button,
        left_plot_inputs,
        right_plot_inputs,
        plots,
    ) = _initial_setup()

    with gr.Blocks(theme=THEME) as demo:
        with open(HEADER_FILE, 'r') as f:
            header_md = f.read()
        gr.Markdown(header_md)

        with gr.Tab(label="Paste one per line"):
            # The examples and the textbox submit share one handler and the
            # same input/output wiring.
            line_action = {
                "fn": _predict_then_draw_then_download,
                "inputs": [
                    line_inputs[key]
                    for key in ("strings", "format", "species", "extras")
                ],
                "outputs": [output_line, drawing, download_single],
            }
            examples = gr.Examples(
                examples=[
                    [
                        "\n".join(eg["strings"]),
                        "smiles",
                        eg["species"],
                        list(EXTRA_METRICS)[:2],
                    ]
                    for eg in EXAMPLES["line input examples"]
                ],
                example_labels=[
                    eg["label"] for eg in EXAMPLES["line input examples"]
                ],
                cache_examples=True,
                cache_mode="lazy",
                **line_action,
            )

            for component in line_inputs.values():
                component.render()
            output_line.render()
            download_single.render()
            drawing.render()
            line_inputs["strings"].submit(**line_action)

        with gr.Tab(f"Predict on structures from a file (max. {MAX_ROWS} rows, ≤ 2 species)"):
            # Everything the predictors need besides the table itself.
            prediction_inputs = [
                file_inputs["column"],
                file_inputs["format"],
                *file_inputs["species"],
                file_inputs["extras"],
            ]
            file_examples = gr.Examples(
                examples=[
                    [
                        eg["file"],
                        eg["column"],
                        "smiles",
                        eg["species"],
                        "",
                        list(EXTRA_METRICS)[:2],
                    ]
                    for eg in EXAMPLES["file examples"]
                ],
                example_labels=[
                    eg["label"] for eg in EXAMPLES["file examples"]
                ],
                fn=_load_then_predict_then_download_then_reveal_plot,
                inputs=[file_inputs["file"], *prediction_inputs],
                outputs=[input_dataframe, download],
                # NOTE: caching these examples appears to cause a CSV load error.
                cache_examples=True,
                cache_mode="lazy",
            )

            file_inputs["file"].render()
            with gr.Row():
                file_inputs["column"].render()
                file_inputs["format"].render()
            with gr.Row():
                for species_dropdown in file_inputs["species"]:
                    species_dropdown.render()
            file_inputs["extras"].render()

            go_button2 = gr.Button(value="Predict!")

            input_dataframe.render()
            download.render()
            with gr.Row():
                for component in left_plot_inputs.values():
                    component.render()
            with gr.Row():
                for component in right_plot_inputs.values():
                    component.render()
            plot_button.render()

            with gr.Row():
                for component in plots.values():
                    component.render()

            file_inputs["file"].upload(
                fn=load_input_data,
                inputs=file_inputs["file"],
                outputs=[input_dataframe, file_inputs["column"]],
            )
            go2_click_event = go_button2.click(
                predict_file,
                inputs=[input_dataframe, *prediction_inputs],
                outputs=[input_dataframe],
            )

            # Whenever the table changes: refresh the download, then reveal
            # the plot button.
            df_change = input_dataframe.change(
                download_table,
                inputs=input_dataframe,
                outputs=download,
            ).then(
                lambda: gr.Button(visible=True),
                outputs=[plot_button],
                js=True,
            )

            # Each plot selector is repopulated with the numeric columns of
            # the current table.
            for dropdown in itertools.chain(
                left_plot_inputs.values(),
                right_plot_inputs.values(),
            ):
                df_change.then(
                    partial(get_dropdown_options, _type="number"),
                    inputs=[input_dataframe],
                    outputs=[dropdown],
                )

            plot_button.click(
                plot_pred_vs_observed,
                inputs=[
                    input_dataframe,
                    file_inputs["species"][0],
                    left_plot_inputs["observed"],
                    left_plot_inputs["color"],
                ],
                outputs=[plots["left"]],
            ).then(
                plot_x_vs_y,
                inputs=[
                    input_dataframe,
                    right_plot_inputs["x"],
                    right_plot_inputs["y"],
                    right_plot_inputs["color"],
                ],
                outputs=[plots["right"]],
            )
    demo.queue()
    demo.launch(share=True)
example-data/examples.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "line input examples": [
3
+ {
4
+ "label": "Y. pestis (plague) vs Ciprofloxacin, Ceftriaxone, Cefiderocol, Linezolid, Gepotidacin",
5
+ "strings": [
6
+ "Ciprofloxacin: C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)N4CCNCC4)F)C(=O)O",
7
+ "Ceftriaxone: CN1C(=NC(=O)C(=O)N1)SCC2=C(N3[C@@H]([C@@H](C3=O)NC(=O)/C(=N\\OC)/C4=CSC(=N4)N)SC2)C(=O)O",
8
+ "Cefiderocol: CC(C)(C(=O)O)O/N=C(/C1=CSC(=N1)N)\\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)C[N+]4(CCCC4)CCNC(=O)C5=C(C(=C(C=C5)O)O)Cl)C(=O)[O-]",
9
+ "Linezolid: CC(=O)NC[C@H]1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F",
10
+ "Gepotidacin: C1CC2=CC(=NC=C2OC1)CNC3CCN(CC3)C[C@@H]4CN5C(=O)C=CC6=C5N4C(=O)C=N6"
11
+ ],
12
+ "species": [
13
+ "Yersinia pestis"
14
+ ]
15
+ },
16
+ {
17
+ "label": "S. aureus vs Doxorubicin, Ampicillin, Amoxicillin, Meropenem, Tetracycline, Anhydrotetracycline",
18
+ "strings": [
19
+ "Doxorubicin: C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O",
20
+ "Ampicillin: CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=CC=C3)N)C(=O)O)C",
21
+ "Amoxicillin: CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=C(C=C3)O)N)C(=O)O)C",
22
+ "Meropenem: C[C@@H]1[C@@H]2[C@H](C(=O)N2C(=C1S[C@H]3C[C@H](NC3)C(=O)N(C)C)C(=O)O)[C@@H](C)O",
23
+ "Tetracycline: C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O",
24
+ "Anhydrotetracycline: CC1=C2C=CC=C(C2=C(C3=C1C[C@H]4[C@@H](C(=O)C(=C([C@]4(C3=O)O)O)C(=O)N)N(C)C)O)O"
25
+ ],
26
+ "species": [
27
+ "Staphylococcus aureus"
28
+ ]
29
+ },
30
+ {
31
+ "label": "E. coli and A. baumannii vs Halicin, Abaucin, Trimethoprim, Sulfamethoxazole, Amikacin, Isoniazid",
32
+ "strings": [
33
+ "Halicin: C1=C(SC(=N1)SC2=NN=C(S2)N)[N+](=O)[O-]",
34
+ "Abaucin: C1CN(CCC12C3=CC=CC=C3NC(=O)O2)CCC4=CC=C(C=C4)C(F)(F)F",
35
+ "Trimethoprim: COC1=CC(=CC(=C1OC)OC)CC2=CN=C(N=C2N)N",
36
+ "Sulfamethoxazole: CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N",
37
+ "Amikacin: C1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CCN)O)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)N)O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CN)O)O)O)N",
38
+ "Isoniazid: C1=CN=CC=C1C(=O)NN"
39
+ ],
40
+ "species": [
41
+ "Escherichia coli",
42
+ "Acinetobacter baumannii"
43
+ ]
44
+ },
45
+ {
46
+ "label": "A. baumannii vs Murepavadin, Vancomycin, Zosurabalpin, Plazomicin, Gentamicin, Rifampicin",
47
+ "strings": [
48
+ "Murepavadin: CC[C@H](C)[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N2CCC[C@@H]2C(=O)N3CCC[C@H]3C(=O)N[C@H](C(=O)N[C@H](C(=O)N1)CC4=CNC5=CC=CC=C54)[C@@H](C)O)CO)C)CCN)CCN)CC6=CNC7=CC=CC=C76)CCN)CCN)CCCN)CCN",
49
+ "Vancomycin: C[C@H]1[C@H]([C@@](C[C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC3=C4C=C5C=C3OC6=C(C=C(C=C6)[C@H]([C@H](C(=O)N[C@H](C(=O)N[C@H]5C(=O)N[C@@H]7C8=CC(=C(C=C8)O)C9=C(C=C(C=C9O)O)[C@H](NC(=O)[C@H]([C@@H](C1=CC(=C(O4)C=C1)Cl)O)NC7=O)C(=O)O)CC(=O)N)NC(=O)[C@@H](CC(C)C)NC)O)Cl)CO)O)O)(C)N)O",
50
+ "Zosurabalpin: CN1[C@H](C(=O)NCC2=C(C=CC=C2SC3=C(CN[C@H](C(=O)N[C@H](C1=O)CCCCN)CCCN)C=CC=N3)C4=CC=C(C=C4)C(=O)O)CC5=CNC6=CC=CC=C65",
51
+ "Plazomicin: C[C@@]1(CO[C@@H]([C@@H]([C@H]1NC)O)O[C@H]2[C@@H](C[C@@H]([C@H]([C@@H]2O)O[C@@H]3[C@@H](CC=C(O3)CNCCO)N)N)NC(=O)[C@H](CCN)O)O",
52
+ "Gentamicin: CC(C1CCC(C(O1)OC2C(CC(C(C2O)OC3C(C(C(CO3)(C)O)NC)O)N)N)N)NC",
53
+ "Rifampicin: C[C@H]1/C=C/C=C(\\C(=O)NC2=C(C(=C3C(=C2O)C(=C(C4=C3C(=O)[C@](O4)(O/C=C/[C@@H]([C@H]([C@H]([C@@H]([C@@H]([C@@H]([C@H]1O)C)O)C)OC(=O)C)C)OC)C)C)O)O)/C=N/N5CCN(CC5)C)/C"
54
+ ],
55
+ "species": [
56
+ "Acinetobacter baumannii"
57
+ ]
58
+ },
59
+ {
60
+ "label": "E. coli vs Debio-1452, Debio-1452-NH3, Fabimycin, 5-FU, Carmofur, Etoposide",
61
+ "strings": [
62
+ "Debio-1452: CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)CC4)N=C3",
63
+ "Debio-1452-NH3: CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)[C@@H](C4)N)N=C3",
64
+ "Fabimycin: CC1=C(OC2=CC=CC=C12)CN(C)C(=O)/C=C/C3=CC4=C(NC(=O)[C@H](CC4)[NH3+])N=C3.[Cl-]",
65
+ "5-FU: C1=C(C(=O)NC(=O)N1)F",
66
+ "Carmofur: CCCCCCNC(=O)N1C=C(C(=O)NC1=O)F",
67
+ "Etoposide: C[C@@H]1OC[C@@H]2[C@@H](O1)[C@@H]([C@H]([C@@H](O2)O[C@H]3[C@H]4COC(=O)[C@@H]4[C@@H](C5=CC6=C(C=C35)OCO6)C7=CC(=C(C(=C7)OC)O)OC)O)O"
68
+ ],
69
+ "species": [
70
+ "Escherichia coli"
71
+ ]
72
+ },
73
+ {
74
+ "label": "K. pneumoniae vs Trimethoprim, SCH-79797, Pemetrexed, Nolatrexed, Methotrexate, Raltitrexed",
75
+ "strings": [
76
+ "Trimethoprim: COC1=CC(=CC(=C1OC)OC)CC2=CN=C(N=C2N)N",
77
+ "SCH-79797: CC(C)C1=CC=C(C=C1)CN2C=CC3=C2C=CC4=C3C(=NC(=N4)NC5CC5)N",
78
+ "Pemetrexed: C1=CC(=CC=C1CCC2=CNC3=C2C(=O)NC(=N3)N)C(=O)N[C@@H](CCC(=O)O)C(=O)O",
79
+ "Nolatrexed: CC1=C(C2=C(C=C1)N=C(NC2=O)N)SC3=CC=NC=C3",
80
+ "Methotrexate: CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)O",
81
+ "Raltitrexed: CC1=NC2=C(C=C(C=C2)CN(C)C3=CC=C(S3)C(=O)N[C@@H](CCC(=O)O)C(=O)O)C(=O)N1"
82
+ ],
83
+ "species": [
84
+ "Klebsiella pneumoniae"
85
+ ]
86
+ },
87
+ {
88
+ "label": "K. pneumoniae vs CHIR-090, SCH-79797, DBeQ, Tenovin-6, Pyrimethamine, Aminopterin",
89
+ "strings": [
90
+ "CHIR-090: C[C@H]([C@@H](C(=O)NO)NC(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)CN3CCOCC3)O",
91
+ "SCH-79797: CC(C)C1=CC=C(C=C1)CN2C=CC3=C2C=CC4=C3C(=NC(=N4)NC5CC5)N",
92
+ "DBeQ: C1=CC=C(C=C1)CNC2=NC(=NC3=CC=CC=C32)NCC4=CC=CC=C4",
93
+ "Tenovin-6: CC(C)(C)C1=CC=C(C=C1)C(=O)NC(=S)NC2=CC=C(C=C2)NC(=O)CCCCN(C)C",
94
+ "Pyrimethamine: CCC1=C(C(=NC(=N1)N)N)C2=CC=C(C=C2)Cl",
95
+ "Aminopterin: C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=CN=C3C(=N2)C(=NC(=N3)N)N"
96
+ ],
97
+ "species": [
98
+ "Klebsiella pneumoniae"
99
+ ]
100
+ }
101
+ ],
102
+ "file examples": [
103
+ {
104
+ "label": "E. coli training data from Stokes J. et al., Cell (2020)",
105
+ "file": "example-data/stokes2020-eco.csv",
106
+ "column": "SMILES",
107
+ "species": "Escherichia coli"
108
+ },
109
+ {
110
+ "label": "A. baumannii training data from Liu (2023)",
111
+ "file": "example-data/liu23-abau.csv",
112
+ "column": "SMILES",
113
+ "species": "Acinetobacter baumannii"
114
+ },
115
+ {
116
+ "label": "S. aureus training data from Wong (2024)",
117
+ "file": "example-data/wong24-sau-tox-5000.csv",
118
+ "column": "SMILES",
119
+ "species": "Staphylococcus aureus"
120
+ }
121
+ ]
122
+ }