File size: 39,927 Bytes
2956799
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
# -*- coding: utf-8 -*-
"""
Author: Philipp Seidl
        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
        Johannes Kepler University Linz
Contact: [email protected]

Training
"""

from .utils import str2bool, lgamma, multinom_gk, top_k_accuracy
from .data import load_templates, load_dataset_from_csv, load_USPTO
from .model import ModelConfig, MHN, StaticQK, SeglerBaseline, Retrosim
from .molutils import convert_smiles_to_fp, FP_featurizer, smarts2appl, getTemplateFingerprint, disable_rdkit_logging
from collections import defaultdict
import argparse
import os
import numpy as np
import pandas as pd
import datetime
import sys
from time import time
import matplotlib.pyplot as plt
import torch
import multiprocessing
import warnings
from joblib import Memory

# Directory used by joblib for its on-disk cache (relative to the working directory).
cachedir = 'data/cache/'
# Disk-backed memoizer; applied in this module through the @memory.cache decorator.
# NOTE(review): `bytes_limit` appears to be deprecated in newer joblib releases in
# favour of Memory.reduce_size() -- confirm against the installed joblib version.
memory = Memory(cachedir, verbose=0, bytes_limit=80e9)

def parse_args(argv=None):
    """Parse and post-process the command-line options for training MHNreact.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace holding all options. After parsing, the following
        adjustments are applied:

        * ``njobs == -1`` is replaced by ``multiprocessing.cpu_count()``.
        * ``device == 'best'`` is resolved through ``get_best_gpu()``; if that
          call raises, ``'cpu'`` is used instead.
        * ``model_type == 'segler'`` with ``pretrain_epochs >= 1`` becomes
          ``'fortunato'``.
        * ``'staticQK'``/``'retrosim'`` with ``epochs > 1`` is reduced to one
          epoch and ``template_fp_type`` is aligned with ``fp_type``.
        * A warning is issued when saving a model that uses ``'MxFP'``.
    """
    parser = argparse.ArgumentParser(description="Train MHNreact.",
                                     epilog="--", prog="Train")
    # NOTE(review): presumably accepts the '-f <kernel-file>' argument injected by
    # Jupyter so the parser does not fail inside notebooks -- the value is unused here.
    parser.add_argument('-f', type=str)
    parser.add_argument('--model_type', type=str, default='mhn',
                        help="Model-type: choose from 'segler', 'fortunato', 'mhn' or 'staticQK', default:'mhn'")
    parser.add_argument("--exp_name", type=str, default='', help="experiment name, (added as postfix to the file-names)")
    parser.add_argument("-d", "--dataset_type", type=str, default='sm',
                        help="Input Dataset 'sm' for Scheider-USPTO-50k 'lg' for USPTO large or 'golden' or use keyword '--csv_path to specify an input file', default: 'sm'")
    parser.add_argument("--csv_path", default=None, type=str, help="path to preprocessed trainings file + split columns, default: None")
    parser.add_argument("--split_col", default='split', type=str, help="split column of csv, default: 'split'")
    # Help text corrected: the actual default is 'prod_smiles' (was documented as 'pro_smiles').
    parser.add_argument("--input_col", default='prod_smiles', type=str, help="input column of csv, default: 'prod_smiles'")
    parser.add_argument("--reactants_col", default='reactants_can', type=str, help="reactant colum of csv, default: 'reactants_can'")

    # Help text corrected: the actual default is 'morganc' (the text also claimed 'morgan').
    parser.add_argument("--fp_type", type=str, default='morganc',
                        help="Fingerprint type for the input only!: default: 'morganc', other options: 'rdk', 'ECFP', 'ECFC', 'MxFP', 'Morgan2CBF' or a combination of fingerprints with '+'' for max-pooling and '&' for concatination e.g. maccs+morganc+topologicaltorsion+erg+atompair+pattern+rdkc+layered+mhfp, default: 'morganc'")
    parser.add_argument("--template_fp_type", type=str, default='rdk',
                        help="Fingerprint type for the template fingerprint, default: 'rdk'")
    parser.add_argument("--device", type=str, default='best',
                        help="Device to run the model on, preferably 'cuda:0', default: 'best' (takes the gpu with most RAM)")
    parser.add_argument("--fp_size", type=int, default=4096,
                        help="fingerprint-size used for templates as well as for inputs, default: 4096")
    parser.add_argument("--fp_radius", type=int, default=2, help="fingerprint-radius (if applicable to the fingerprint-type), default: 2")
    parser.add_argument("--epochs", type=int, default=10, help='number of epochs, default: 10')

    parser.add_argument("--pretrain_epochs", type=int, default=0,
                        help="applicability-matrix pretraining epochs if applicable (e.g. fortunato model_type), default: 0")
    parser.add_argument("--save_model", type=str2bool, default=False, help="save the model, default: False")

    parser.add_argument("--dropout", type=float, default=0.2, help="dropout rate for encoders, default: 0.2")
    parser.add_argument("--lr", type=float, default=5e-4, help="learning-rate, dfeault: 5e-4")
    # Help text corrected: the actual default is 0.05 (was documented as 0.125).
    parser.add_argument("--hopf_beta", type=float, default=0.05, help="hopfield beta parameter, default: 0.05")
    parser.add_argument("--hopf_asso_dim", type=int, default=512, help="association dimension, default: 512")
    parser.add_argument("--hopf_num_heads", type=int, default=1, help="hopfield number of heads, default: 1")
    parser.add_argument("--hopf_association_activation", type=str, default='None',
                        help="hopfield association activation function recommended:'Tanh' or 'None', other: 'ReLU', 'SeLU', 'GeLU', or 'None' for more, see torch.nn, default: 'None'")

    parser.add_argument("--norm_input", default=True, type=str2bool,
                        help="input-normalization, default: True")
    parser.add_argument("--norm_asso", default=True, type=str2bool,
                        help="association-normalization, default: True")

    # additional experimental hyperparams
    parser.add_argument("--hopf_n_layers", default=1, type=int, help="Number of hopfield-layers, default: 1")
    parser.add_argument("--mol_encoder_layers", default=1, type=int, help="Number of molecule-encoder layers, default: 1")
    parser.add_argument("--temp_encoder_layers", default=1, type=int, help="Number of template-encoder layers, default: 1")
    parser.add_argument("--encoder_af", default='ReLU', type=str,
                        help="Encoder-NN intermediate activation function (before association_activation function), default: 'ReLU'")
    # Help text corrected: the actual default is 'mean' (the text also claimed 'max').
    parser.add_argument("--hopf_pooling_operation_head", default='mean', type=str, help="Pooling operation over heads (max, min, mean, ...), default: 'mean'")

    parser.add_argument("--splitting_scheme", default=None, type=str, help="Splitting_scheme for non-csv-input, default: None, other options: 'class-freq', 'random'")

    parser.add_argument("--concat_rand_template_thresh", default=-1, type=int, help="Concatinates a random vector to the tempalte-fingerprint at all templates with num_training samples > this threshold; -1 (default) means deactivated")
    parser.add_argument("--repl_quotient", default=10, type=float, help="Only if --concat_rand_template_thresh >= 0 - Quotient of how much should be replaced by random in template-embedding, (default: 10)")
    parser.add_argument("--verbose", default=False, type=str2bool, help="If verbose, will print out more stuff, default: False")
    parser.add_argument("--batch_size", default=128, type=int, help="Training batch-size, default: 128")
    parser.add_argument("--eval_every_n_epochs", default=1, type=int, help="Evaluate every _ epochs (Evaluation is costly for USPTO-Lg), default: 1")
    parser.add_argument("--save_preds", default=False, type=str2bool, help="Save predictions for test split at the end of training, default: False")
    parser.add_argument("--wandb", default=False, type=str2bool, help="Save to wandb; login required, default: False")
    parser.add_argument("--seed", default=None, type=int, help="Seed your run to make it reproducible, defualt: None")

    parser.add_argument("--template_fp_type2", default=None, type=str, help="experimental template_fp_type for layer 2, default: None")
    parser.add_argument("--layer2weight",default=0.2, type=float, help="hopf-layer2 weight of p, default: 0.2")

    parser.add_argument("--reactant_pooling", default='max', type=str, help="reactant pooling operation over template-fingerprint, default: 'max', options: 'min','mean','lgamma'")


    parser.add_argument("--ssretroeval", default=False, type=str2bool, help="single-step retro-synthesis eval, default: False")
    parser.add_argument("--addval2train", default=False, type=str2bool, help="adds the validation set to the training set, default: False")
    parser.add_argument("--njobs",default=-1, type=int, help="Number of jobs, default: -1 -> uses all available")

    parser.add_argument("--eval_only_loss", default=False, type=str2bool, help="if only loss should be evaluated (if top-k acc may be time consuming), default: False")
    parser.add_argument("--only_templates_in_batch", default=False, type=str2bool, help="while training only forwards templates that are in the batch, default: False")

    parser.add_argument("--plot_res", default=False, type=str2bool, help="Plotting results for USPTO-sm/lg, default: False")
    args = parser.parse_args(argv)

    if args.njobs == -1:
        args.njobs = int(multiprocessing.cpu_count())

    if args.device == 'best':
        from .utils import get_best_gpu
        try:
            args.device = get_best_gpu()
        except Exception:
            # Best-effort GPU selection: any failure falls back to the CPU, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('couldnt get the best gpu, using cpu instead')
            args.device = 'cpu'

    # some save checks on model type
    if args.model_type == 'segler' and args.pretrain_epochs >= 1:
        print('changing model type to fortunato because of pretraining_epochs>0')
        args.model_type = 'fortunato'
    if args.model_type in ('staticQK', 'retrosim') and args.epochs > 1:
        print('changing epochs to 1 (StaticQK is not lernable ;)')
        args.epochs = 1
        # NOTE(review): the fingerprint-type alignment only runs when epochs was > 1;
        # with epochs == 1 a mismatching template_fp_type is left untouched -- confirm
        # whether that is intended.
        if args.template_fp_type != args.fp_type:
            print('fp_type must be the same as template_fp_type --> setting template_fp_type to fp_type')
            args.template_fp_type = args.fp_type
    if args.save_model and args.fp_type == 'MxFP':
        warnings.warn('Currently MxFP is not recommended for saving the model paprameter (fragment dict for others would need to be saved or compued again, currently not implemented)')

    return args

@memory.cache(ignore=['njobs'])
def featurize_smiles(X, fp_type='morgan', fp_size=4096, fp_radius=2, njobs=1, verbose=False):
    """Turn the SMILES of every split in ``X`` into fingerprint features.

    Results are cached on disk through ``memory.cache``; ``njobs`` is excluded
    from the cache key.

    Args:
        X: Mapping from split name to the SMILES of that split. The fitted
            fingerprint types read the 'train', 'valid' and 'test' entries;
            the folded types process every key of ``X``.
        fp_type: Fingerprint type. 'MxFP' combines all fitted types listed
            below; a single fitted type uses only that one; anything else is
            forwarded to ``convert_smiles_to_fp`` as a folded fingerprint.
        fp_size: Total feature budget (fitted types) or folded size.
        fp_radius: Radius forwarded to ``convert_smiles_to_fp``.
        njobs: Number of jobs forwarded to ``convert_smiles_to_fp``.
        verbose: Forwarded to ``convert_smiles_to_fp``; also enables the
            progress message of the folded branch.

    Returns:
        dict mapping split name to its feature matrix. For fitted types the
        dict additionally holds the per-type matrices under
        '<split>_<fp_type>' keys.
    """
    fitted_types = ['MACCS', 'Morgan2CBF', 'Morgan4CBF', 'Morgan6CBF',
                    'ErG', 'AtomPair', 'TopologicalTorsion', 'RDK']

    if fp_type != 'MxFP' and fp_type not in fitted_types:
        # Folded fingerprints, e.g. 'rdk', 'morgan', 'ecfp4', 'pattern', 'morganc', 'rdkc'.
        if verbose:
            print('computing', fp_type, 'folded')
        return {
            split: convert_smiles_to_fp(smiles, fp_size=fp_size, which=fp_type,
                                        radius=fp_radius, njobs=njobs, verbose=verbose)
            for split, smiles in X.items()
        }

    print('computing', fp_type)
    selected = list(fitted_types) if fp_type == 'MxFP' else [fp_type]
    per_type_budget = fp_size // len(selected)

    features = {}
    budget_left = int(fp_size)
    for name in selected:
        print(name, end=' ')
        # Every type gets an equal share; the last one takes whatever is left.
        is_last = name == selected[-1]
        featurizer = FP_featurizer(fp_types=name,
                                   max_features=budget_left if is_last else per_type_budget)
        # The return value of fit() is used directly as the train feature matrix.
        features[f'train_{name}'] = featurizer.fit(X['train'])
        features[f'valid_{name}'] = featurizer.transform(X['valid'])
        features[f'test_{name}'] = featurizer.transform(X['test'])

        budget_left -= features[f'train_{name}'].shape[1]

    # Concatenate the per-type matrices column-wise for each split.
    for split in ('train', 'valid', 'test'):
        features[split] = np.hstack([features[f'{split}_{name}'] for name in selected])

    return features


def compute_template_fp(fp_len=2048, reactant_pooling='max', do_log=True):
    """Pre-compute the combined template fingerprint, one row per template.

    Reads three module-level globals:

    * ``template_list``: mapping from template index to a template whose
      string form is ``'<product>>><reactant>.<reactant>...'``.
    * ``temp_part_to_fp``: mapping from a template part (product or single
      reactant string) to its row index in ``templates_fp['fp']``.
    * ``templates_fp``: mapping whose ``'fp'`` entry holds the per-part
      fingerprints.

    For every pooling except ``'concat'`` the row is
    ``log(1 + product_fp) - 0.5 * pooled_reactant_fp``. For ``'concat'`` the
    row holds ``log(1 + product_fp)`` followed by ``log(1 + reactant_fp)`` of
    each (sorted) reactant in consecutive ``fp_len``-wide slots.

    Args:
        fp_len: Length of a single part fingerprint.
        reactant_pooling: One of 'only_product', 'max', 'mean', 'sum',
            'lgamma' or 'concat'.
        do_log: Unused; kept for interface compatibility.

    Returns:
        numpy array of shape ``(max(template_list) + 1, fp_len)``, or
        ``(max(template_list) + 1, fp_len * 6)`` for 'concat'. Rows of
        templates that cannot be split into product and reactants stay zero.

    Raises:
        ValueError: if ``reactant_pooling`` is not one of the supported values
            (previously this surfaced as an opaque broadcasting error).
        KeyError: if a reactant part is missing from ``temp_part_to_fp``.
    """
    supported = ('only_product', 'max', 'mean', 'sum', 'lgamma', 'concat')
    if reactant_pooling not in supported:
        raise ValueError(
            f'unknown reactant_pooling {reactant_pooling!r}, expected one of {supported}')

    is_concat = reactant_pooling == 'concat'
    # combine them to one fingerprint
    comb_template_fp = np.zeros((max(template_list.keys()) + 1,
                                 fp_len * 6 if is_concat else fp_len))
    part_fps = templates_fp['fp']

    for i in template_list:
        tpl = template_list[i]
        try:
            pr, rea = str(tpl).split('>>')
        except ValueError:
            # Not exactly one '>>': there is no product/reactant pair to look up,
            # so report it and leave the row at zero instead of reusing stale values.
            print('err', tpl, end='\r')
            continue

        try:
            prod_fp = part_fps[temp_part_to_fp[pr]]
        except (KeyError, IndexError):
            # Unknown product part: fall back to an all-zero product fingerprint.
            print('err', pr, end='\r')
            prod_fp = np.zeros(fp_len)

        reactants = str(rea).split('.')
        rea_fp = part_fps[[temp_part_to_fp[r] for r in reactants]]

        if reactant_pooling == 'only_product':
            rea_fp = np.zeros(fp_len)
        elif reactant_pooling == 'max':
            rea_fp = np.log(1 + rea_fp.max(0))
        elif reactant_pooling == 'mean':
            rea_fp = np.log(1 + rea_fp.mean(0))
        elif reactant_pooling == 'sum':
            # Fixed: this branch previously called .mean(0), duplicating 'mean'.
            rea_fp = np.log(1 + rea_fp.sum(0))
        elif reactant_pooling == 'lgamma':
            rea_fp = multinom_gk(rea_fp, axis=0)
        else:  # 'concat'
            # Sorted so the slot assignment does not depend on reactant order.
            for slot, r in enumerate(sorted(reactants), start=1):
                single_fp = part_fps[temp_part_to_fp[r]]
                comb_template_fp[i, fp_len * slot:fp_len * (slot + 1)] = np.log(1 + single_fp)

        comb_template_fp[i, :prod_fp.shape[0]] = np.log(1 + prod_fp)
        if not is_concat:
            # Subtract half of the pooled reactant fingerprint from the product part.
            comb_template_fp[i, :rea_fp.shape[0]] -= rea_fp * 0.5

    return comb_template_fp


def set_up_model(args, template_list=None):
    """Create the model configuration and the model selected by ``args``.

    Args:
        args: Parsed options; the attributes read here are the ones passed to
            ``ModelConfig`` below plus ``model_type`` and, for 'mhn',
            ``layer2weight``.
        template_list: Mapping keyed by template index; its largest key plus
            one becomes ``num_templates``. Despite the ``None`` default a
            mapping is required, because ``.keys()`` is called on it.

    Returns:
        Tuple ``(clf, hpn_config)`` of the instantiated model and its config.

    Raises:
        NotImplementedError: if ``args.model_type`` is not one of 'segler',
            'mhn', 'fortunato', 'staticQK' or 'retrosim'.
    """
    config_kwargs = dict(
        num_templates=int(max(template_list.keys())) + 1,
        dropout=args.dropout,
        fingerprint_type=args.fp_type,
        template_fp_type=args.template_fp_type,
        fp_size=args.fp_size,
        fp_radius=args.fp_radius,
        device=args.device,
        lr=args.lr,
        hopf_beta=args.hopf_beta,
        hopf_input_size=args.fp_size,
        hopf_output_size=None,
        hopf_num_heads=args.hopf_num_heads,
        hopf_asso_dim=args.hopf_asso_dim,
        hopf_association_activation=args.hopf_association_activation,
        norm_input=args.norm_input,
        norm_asso=args.norm_asso,
        hopf_n_layers=args.hopf_n_layers,
        mol_encoder_layers=args.mol_encoder_layers,
        temp_encoder_layers=args.temp_encoder_layers,
        encoder_af=args.encoder_af,
        hopf_pooling_operation_head=args.hopf_pooling_operation_head,
        batch_size=args.batch_size,
    )
    hpn_config = ModelConfig(**config_kwargs)
    print(hpn_config.__dict__)

    # Model type -> factory taking the config. 'segler' is the baseline and
    # 'fortunato' (pretraining with applicability-matrix) uses the same class.
    builders = {
        'segler': SeglerBaseline,
        'mhn': lambda cfg: MHN(cfg, layer2weight=args.layer2weight),
        'fortunato': SeglerBaseline,
        'staticQK': StaticQK,
        'retrosim': Retrosim,
    }
    if args.model_type not in builders:
        raise NotImplementedError

    clf = builders[args.model_type](hpn_config)
    return clf, hpn_config

def set_up_template_encoder(args, clf, label_to_n_train_samples=None, template_list=None):
    """Equip ``clf`` with its template representation according to ``args``.

    Depending on the model this either resets ``clf.templates`` (instances of
    ``SeglerBaseline``), lets the model embed its templates itself
    (``staticQK``), fits the model on the training data (``retrosim``) or, for
    every other model, loads or computes a template-embedding matrix and
    stores it on ``clf``. Afterwards ``args.template_fp_type2`` may override
    ``clf.templates`` once more.

    NOTE: besides its arguments, some branches read the module-level names
    ``X_fp``, ``y``, ``comb_template_fp`` and ``tfidf_template_fp``; they have
    to exist before such a branch is taken.

    Args:
        args: parsed options. Attributes read here: ``model_type``,
            ``template_fp_type``, ``template_fp_type2``, ``fp_size``,
            ``fp_radius``, ``njobs``, ``only_templates_in_batch``,
            ``concat_rand_template_thresh`` and ``repl_quotient``.
        clf: the model to configure; it is modified in place.
        label_to_n_train_samples: mapping label -> number of training samples.
            Only indexed when the random-vector replacement is active
            (``concat_rand_template_thresh != -1`` and ``repl_quotient > 0``);
            it must not be ``None`` in that case.
        template_list: mapping label -> template; its keys are iterated and
            its values are used as the list of templates.

    Returns:
        The same ``clf`` object that was passed in.
    """

    if isinstance(clf, SeglerBaseline):
        clf.templates = []
    elif args.model_type=='staticQK':
        clf.template_list = list(template_list.values())
        clf.update_template_embedding(which=args.template_fp_type, fp_size=args.fp_size, radius=args.fp_radius, njobs=args.njobs)
    elif args.model_type=='retrosim':
        # relies on the module-level X_fp / y
        clf.fit_with_train(X_fp['train'], y['train'])
    else:
        import hashlib
        PATH = './data/cache/'
        # makedirs also creates a missing './data' parent and tolerates an
        # already existing directory (no exists()/mkdir() race)
        os.makedirs(PATH, exist_ok=True)
        # the cache file name encodes fp settings, number of templates and a hash of the templates
        fn_templ_emb = f'{PATH}templ_emb_{args.fp_size}_{args.template_fp_type}{args.fp_radius}_{len(template_list)}_{int(hashlib.sha512((str(template_list)).encode()).hexdigest(), 16)}.npy'
        if os.path.exists(fn_templ_emb): # load the template embedding
            print(f'loading tfp from file {fn_templ_emb}')
            templ_emb = np.load(fn_templ_emb)
            # !!! beware of different fingerprint types
            clf.template_list = list(template_list.values())

            if args.only_templates_in_batch:
                # keep the matrix as a numpy array only; no tensor is created here
                clf.templates_np = templ_emb
                clf.templates = None
            else:
                clf.templates = torch.from_numpy(templ_emb).float().to(clf.config.device)
        else:
            if args.template_fp_type=='MxFP':
                clf.template_list = list(template_list.values())
                clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
                clf.set_templates_recursively()
            elif args.template_fp_type=='Tfidf':
                clf.template_list = list(template_list.values())
                clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
                clf.set_templates_recursively()
            elif args.template_fp_type=='random':
                clf.template_list = list(template_list.values())
                clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
                clf.set_templates_recursively()
            else:
                clf.set_templates(list(template_list.values()), which=args.template_fp_type, fp_size=args.fp_size, 
                                  radius=args.fp_radius, learnable=False, njobs=args.njobs, only_templates_in_batch=args.only_templates_in_batch)
                # only this branch writes the cache; the float16 cast applies
                # to the tensor variant only, templates_np is stored as it is
                np.save(fn_templ_emb, clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy().astype(np.float16))

        # overwrite part of the fingerprint with a random vector for templates
        # whose number of training samples reaches the threshold
        if (args.concat_rand_template_thresh != -1) and (args.repl_quotient>0):
            REPLACE_FACTOR = int(args.repl_quotient) # default was 8

            pre_comp_templates = clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy()
            n_replaced = pre_comp_templates.shape[1]//REPLACE_FACTOR

            # mask of labels with at least `concat_rand_template_thresh` training samples
            l_mask = np.array([label_to_n_train_samples[k]>=args.concat_rand_template_thresh for k in template_list])
            print(f'Num of templates with added rand-vect of size {n_replaced} due to >=thresh ({args.concat_rand_template_thresh}):',l_mask.sum())

            # the columns with the lowest variance are the ones that get replaced
            v = pre_comp_templates.var(0)
            idx_lowest_var_half = v.argsort()[:n_replaced]

            # the new zero-initialised vectors; np.float64 replaces the
            # `np.float` alias, which was removed in NumPy 1.24
            pre = np.zeros((pre_comp_templates.shape[0], n_replaced), dtype=np.float64)
            print(pre.shape, l_mask.shape, l_mask.sum()) #(616, 1700) (11790,) 519
            print(pre_comp_templates.shape, len(template_list)) #(616, 17000) 616
            # only the rows selected by l_mask receive a random vector
            pre[l_mask] = np.random.rand(l_mask.sum(), pre.shape[1])

            pre_comp_templates[:,idx_lowest_var_half] = pre

            if pre_comp_templates.shape[0]<100000:
                print('adding template_matrix to params')
                param = torch.nn.Parameter(torch.from_numpy(pre_comp_templates).float(), requires_grad=False)
                clf.register_parameter(name='templates+noise', param=param)
                clf.templates = param.to(clf.config.device)
                clf.set_templates_recursively()
            else: # otherwise might cause memory issues
                print('more than 100k templates')
                if args.only_templates_in_batch:
                    clf.templates = None
                    clf.templates_np = pre_comp_templates
                else:
                    # NOTE: unlike the other branches this tensor is not moved to clf.config.device
                    clf.templates = torch.from_numpy(pre_comp_templates).float()
                    clf.set_templates_recursively()

    
    # this sets the templates for the first layer!!
    if args.template_fp_type2=='MxFP':
        print('first_layer template_fingerprint is set to MxFP')
        clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
    elif args.template_fp_type2=='Tfidf':
        print('first_layer template_fingerprint is set to Tfidf')
        clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
    elif args.template_fp_type2=='random':
        print('first_layer template_fingerprint is set to random')
        clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
    elif args.template_fp_type2=='stfp':
        print('first_layer template_fingerprint is set to stfp ! only works with 4096 fp_size')
        tfp = getTemplateFingerprint(list(template_list.values()))
        clf.templates = torch.from_numpy(tfp).float().to(clf.config.device)
        
    return clf


if __name__ == '__main__':
    # Script entry point: read the command-line options first.
    args = parse_args()

    # The whole-second part of the current timestamp identifies this run.
    run_id, _, _ = str(time()).partition('.')
    fn_postfix = f'{args.exp_name}_{run_id}'

    # `wandb` stays None unless logging to Weights & Biases was requested;
    # the later calls receive it as an optional logger.
    wandb = None
    if args.wandb:
        import wandb
        wandb.init(project='mhn-react', entity='phseidl',
                   name='_'.join([args.dataset_type, args.model_type, fn_postfix]),
                   config=vars(args))

    # RDKit logging is only kept in verbose mode.
    if not args.verbose:
        disable_rdkit_logging()

    # Optional seeding for reproducibility.
    if args.seed is not None:
        from .utils import seed_everything
        seed_everything(args.seed)
        print('seeded with',args.seed)

    # Load the data: either the USPTO data selected by `dataset_type`, or a
    # user-supplied csv. X and y are indexed by split name below.
    # NOTE: `test_reactants_can` is only defined in the csv branch.
    if args.csv_path is None:
        X, y = load_USPTO(which=args.dataset_type)
        template_list = load_templates(which=args.dataset_type)
    else:
        X, y, template_list, test_reactants_can = load_dataset_from_csv(**vars(args))

    # Optionally merge the validation split into the training split
    # (the 'valid' entries themselves are left untouched).
    if args.addval2train:
        print('adding val to train')
        X['train'] = [*X['train'],*X['valid']]
        y['train'] = np.concatenate([y['train'],y['valid']])

    splits = ['train', 'valid', 'test']

    # TODO: split this up into a separate class
    # 'class-freq': pool all splits, order the samples by label and cut the
    # ordered samples into train/valid/test of the original split sizes.
    if args.splitting_scheme == 'class-freq':
        X_all = np.concatenate([X[split] for split in splits], axis=0)
        y_all = np.concatenate([y[split] for split in splits])

        # Sorting by label index stands in for sorting by class frequency;
        # this assumes the class index is ordered by frequency (which is mildly violated).
        res = y_all.argsort()

        # use same split proportions
        cum_split_lens = np.cumsum([len(y[split]) for split in splits]) #cumulative split length

        X['train'] = X_all[res[0:cum_split_lens[0]]]
        y['train'] = y_all[res[0:cum_split_lens[0]]]

        X['valid'] = X_all[res[cum_split_lens[0]:cum_split_lens[1]]]
        y['valid'] = y_all[res[cum_split_lens[0]:cum_split_lens[1]]]

        X['test'] = X_all[res[cum_split_lens[1]:]]
        y['test'] = y_all[res[cum_split_lens[1]:]]
        for split in splits:
            print(split, y[split].shape[0], 'samples (', y[split].max(),'max label)')

    # Drop training samples whose label occurs exactly once in train and
    # does not occur in the test split.
    if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
        print('remove_once_in_train')
        from collections import Counter
        cc = Counter()
        cc.update(y['train'])
        # labels that occur exactly once in the training split
        classes_set_only_once_in_train = set(np.array(list(cc.keys()))[ (np.array(list(cc.values())))==1])
        # labels seen in train or valid but never in test
        not_in_test = set(y['train']).union(y['valid']) - (set(y['test']))
        classes_set_only_once_in_train = (classes_set_only_once_in_train.intersection(not_in_test))
        # `remove_those_mask` is reused later to filter the applicability matrix
        remove_those_mask = np.array([yii in classes_set_only_once_in_train for yii in y['train']])
        X['train'] = np.array(X['train'])[~remove_those_mask]
        y['train'] = np.array(y['train'])[~remove_those_mask]
        # NOTE: the first printed value is a fraction (mean of the mask), not a percentage
        print(remove_those_mask.mean(),'%', remove_those_mask.sum(), 'samples removed')

    # Pool everything and re-split randomly 80/10/10 with a fixed random_state.
    if args.splitting_scheme == 'random':
        print('random-splitting-scheme:8-1-1')
        if args.ssretroeval:
            print('ssretroeval not available')
            raise NotImplementedError
        import numpy as np
        from sklearn.model_selection import train_test_split
        
        def _unpack(lod):
            """Concatenate the values of a dict of sequences into one flat list."""
            r = []
            for k,v in lod.items():
                [r.append(i) for i in v]
            return r

        X_all = _unpack(X)
        y_all = np.array( _unpack(y) )

        # first cut off 20 %, then halve that part into test and valid
        X['train'], X['test'], y['train'], y['test'] = train_test_split(X_all, y_all, test_size=0.2, random_state=70135)
        X['test'], X['valid'], y['test'], y['valid'] = train_test_split(X['test'], y['test'], test_size=0.5, random_state=70135)

        # test samples whose label occurs in neither train nor valid; only their count is reported
        zero_shot = set(y['test']).difference( set(y['train']).union(set(y['valid'])) )
        zero_shot_mask = np.array([yi in zero_shot for yi in y['test']])
        print(sum(zero_shot_mask))
        #y['test'][zero_shot_mask] = list(zero_shot)[0] #not right but quick

    
    if args.model_type=='staticQK' or args.model_type=='retrosim':
        print('staticQK model: caution: use pattern, or rdk -fingerprint-embedding')

    # Module-level aliases of the fingerprint settings; they are not used in the
    # visible part of this script (possibly read elsewhere -- TODO confirm).
    fp_size = args.fp_size
    radius = args.fp_radius #quite important ;)
    fp_embedding = args.fp_type

    # Fingerprints of the inputs; X_fp is indexed by split name below.
    X_fp = featurize_smiles(X, fp_type=args.fp_type, fp_size=args.fp_size, fp_radius=args.fp_radius, njobs=args.njobs)

    # 'MxFP': build fingerprints for every distinct template part.
    if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
        # collect the distinct parts: split every template at '>>', then at '.'
        temp_part_to_fp = {}
        for i in template_list:
            tpl = template_list[i]
            for part in str(tpl).split('>>'):
                for p in str(part).split('.'):
                    temp_part_to_fp[p]=None
        # map each part to its running index (dict insertion order)
        for i, k in enumerate(temp_part_to_fp):
            temp_part_to_fp[k] = i

        fp_types = ['Morgan2CBF','Morgan4CBF', 'Morgan6CBF','AtomPair','TopologicalTorsion', 'Pattern', 'RDK']
        # MACCS and ErG don't work --> errors with explicit / implicit valence
        templates_fp = {}
        remaining = args.fp_size
        for fp_type in fp_types:
            #print(fp_type, end='\t')
            # every type gets an equal share of fp_size; the last type uses up
            # whatever is still remaining
            te_feat = FP_featurizer(fp_types=fp_type,
                                    max_features=(args.fp_size//len(fp_types)) if (fp_type != fp_types[-1]) else remaining,
                                    log_scale=False
                                    )
            templates_fp[fp_type] = te_feat.fit(list(temp_part_to_fp.keys())[:], is_smarts=True)
            #print(np.unique(templates_fp[fp_type]), end='\r')
            remaining -= templates_fp[fp_type].shape[1]
        # all per-type fingerprints side by side
        templates_fp['fp'] = np.hstack([ templates_fp[f'{fp_type}'] for fp_type in fp_types])

    
    # NOTE(review): compute_template_fp receives neither the templates nor the
    # fingerprints built above, so it presumably reads them as module-level
    # globals -- confirm against its definition.
    if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
        comb_template_fp = compute_template_fp(fp_len= args.fp_size, reactant_pooling=args.reactant_pooling)


    
    # 'Tfidf': character n-gram (1 to 12) tf-idf vectors of the template strings,
    # limited to fp_size features.
    if args.template_fp_type=='Tfidf' or (args.template_fp_type2 == 'Tfidf'):
        print('using tfidf template-fingerprint')
        from sklearn.feature_extraction.text import TfidfVectorizer
        corpus = (list(template_list.values()))
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,12), max_features=args.fp_size)
        tfidf_template_fp = vectorizer.fit_transform(corpus).toarray()
        tfidf_template_fp.shape  # no-op expression (has no effect in a script)

    
    # The featurizer may return a width different from the requested fp_size;
    # args.fp_size is updated so that later consumers see the actual width.
    acutal_fp_size = X_fp['train'].shape[1]
    if acutal_fp_size != args.fp_size:
        args.fp_size = int(X_fp['train'].shape[1])
        print('Warning: fp-size has changed to', acutal_fp_size)

    
    # Count the training samples of every template label and build the inverse
    # lookup (number of training samples -> labels with exactly that count).
    n_templates = max(template_list.keys())+1
    label_to_n_train_samples = {}
    n_train_samples_to_label = defaultdict(list)
    for label in range(n_templates):
        count = (y['train']==label).sum()
        label_to_n_train_samples[label] = count
        n_train_samples_to_label[count].append(label)

    def _test_mask(n_train):
        """Boolean mask over y['test']: True where the label has exactly n_train training samples."""
        selected = np.zeros(y['test'].shape, dtype=bool)
        for label in n_train_samples_to_label[n_train]:
            selected |= (y['test'] == label)
        return selected

    # One test mask per number of training examples 0..10 ...
    up_to = 11
    n_samples = []
    masks = []
    mask_dict = {}
    for nte in range(up_to):
        mask = _test_mask(nte)
        masks.append(mask)
        mask_dict[str(nte)] = mask
        n_samples.append(mask.sum())

    # ... plus one for everything not covered so far (>10).
    not_covered = np.array(masks).max(0)==0
    n_samples.append(not_covered.sum())
    mask_dict['>10'] = not_covered

    # Extend the collection with the masks for 0..49 training examples; what is
    # still uncovered afterwards forms the '>49' group.
    for nte in range(50):
        masks.append(_test_mask(nte))
    not_covered = np.array(masks).max(0)==0
    n_samples.append(not_covered.sum())
    mask_dict['>49'] = not_covered

    print(n_samples)

    # Build the model and its config, then attach the template representation.
    clf, hpn_config = set_up_model(args, template_list=template_list)
    clf = set_up_template_encoder(args, clf, label_to_n_train_samples=label_to_n_train_samples, template_list=template_list)
  
    if args.verbose:
        print(clf.config.__dict__)
        print(clf)

    # One AdamW optimizer, shared by pretraining and training.
    wda = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=1e-2)

    if args.wandb:
        wandb.watch(clf)

    
    # Pretraining on the applicability matrix, if applicable.
    # NOTE(review): the condition is `pretrain_epochs>1`, so a single pretraining
    # epoch only runs for model_type 'fortunato' -- confirm this is intended.
    if args.model_type == 'fortunato' or args.pretrain_epochs>1:
        print('pretraining on applicability-matrix -- loading the matrix')
        _, y_appl = load_USPTO(args.dataset_type, is_appl_matrix=True)
        # keep the applicability targets aligned with the filtered training split
        if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
            y_appl['train'] = y_appl['train'][~remove_those_mask]

        # Random spot check that the applicability is true for y. The assert is
        # commented out, so nothing is verified; the 500 draws still advance
        # NumPy's global random state.
        splt = 'train'
        for i in range(500):
            i = np.random.randint(len(y[splt]))
            #assert ( y_appl[splt][i].indices == y[splt][i] ).sum()==1

        print('pre-training (BCE-loss)')
        for epoch in range(args.pretrain_epochs):
            # one epoch of training per iteration, followed by a validation pass
            clf.train_from_np(X_fp['train'], X_fp['train'], y_appl['train'], use_dataloader=True, is_smiles=False,
                          epochs=1, wandb=wandb, verbose=args.verbose, bs=args.batch_size, 
                          permute_batches=True, shuffle=True, optimizer=wda, 
                          only_templates_in_batch=args.only_templates_in_batch)
            y_pred = clf.evaluate(X_fp['valid'], X_fp['valid'], y_appl['valid'], 
                                  split='pretrain_valid', is_smiles=False, only_loss=True, 
                                  bs=args.batch_size,wandb=wandb)
            # element-wise agreement between the predictions thresholded at 0.5
            # and the dense validation targets
            appl_acc = ((y_appl['valid'].toarray()) == (y_pred>0.5)).mean()
            # NOTE: the value printed as 'train_acc' is computed on the validation split
            print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_pretrain_valid"][-1]:1.3f}, train_acc: {appl_acc:1.5f}')
    
    # Both stay None when the loop below does not run a single round.
    fn_hist = None
    y_preds = None

    # Main loop: every round trains for `eval_every_n_epochs` epochs, evaluates
    # on valid and test and records the top-k accuracies per nte group.
    for epoch in range(round(args.epochs / args.eval_every_n_epochs)):
        # instances of StaticQK are not trained
        if not isinstance(clf, StaticQK):
            now = time()
            clf.train_from_np(X_fp['train'], X_fp['train'], y['train'], use_dataloader=True, is_smiles=False,
                              epochs=args.eval_every_n_epochs, wandb=wandb, verbose=args.verbose, bs=args.batch_size,
                              permute_batches=True, shuffle=True, optimizer=wda, only_templates_in_batch=args.only_templates_in_batch)
            if args.verbose: print(f'training took {(time()-now)/60:3.1f} min for {args.eval_every_n_epochs} epochs')

        # 'test' has to stay last: the nte evaluation below uses the y_preds
        # left over from the final pass of this loop.
        for split in ['valid', 'test']:
            print(split, 'evaluating', end='\r')
            now = time()
            y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False, split=split, bs=args.batch_size, only_loss=args.eval_only_loss, wandb=wandb)

            if args.verbose: print(f'eval {split} took',(time()-now)/60,'min')

        if not isinstance(clf, StaticQK):
            # Best-effort progress line: a missing history entry must not stop
            # training. `Exception` (instead of a bare except) keeps
            # KeyboardInterrupt and SystemExit working.
            try:
                print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_valid"][-1]:1.3f}, val_t1acc: {clf.hist["t1_acc_valid"][-1]:1.3f}, val_t100acc: {clf.hist["t100_acc_valid"][-1]:1.3f}')
            except Exception:
                pass

        # Top-k accuracy on the test split, separately for each group of test
        # samples defined by the number of training examples (nte) of their label.
        now = time()
        ks = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
        for nte in mask_dict: # Number of training examples
            split = f'nte_{nte}'
            mask = mask_dict[nte]

            topkacc = top_k_accuracy(np.array(y['test'])[mask], y_preds[mask, :], k=ks, ret_arocc=False)

            new_hist = {}
            for k, tkacc in zip(ks, topkacc):
                new_hist[f't{k}_acc_{split}'] = tkacc
            new_hist[f'steps_{split}'] = (clf.steps)

            for k in new_hist:
                clf.hist[k].append(new_hist[k])

        if args.verbose: print('eval nte-test took',(time()-now)/60,'min')

        # the history is saved after every round
        fn_hist = clf.save_hist(prefix=f'USTPO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)

    # Optionally store the most recent predictions as .npy.
    if args.save_preds:
        PATH = './data/preds/'
        # makedirs also creates a missing './data' parent and tolerates an
        # already existing directory
        os.makedirs(PATH, exist_ok=True)
        pred_fn = f'{PATH}USPTO_{args.dataset_type}_test_{args.model_type}_{fn_postfix}.npy'
        print('saving predictions to',pred_fn)
        np.save(pred_fn,y_preds)
        # from here on args.save_preds holds the file name instead of the flag
        args.save_preds = pred_fn

    
    # Optionally store the model together with the arguments and the config.
    if args.save_model:
        model_save_path = clf.save_model(prefix=f'USPTO_{args.dataset_type}_{args.model_type}_valloss{clf.hist.get("loss_valid",[-1])[-1]:1.3f}_',name_as_conf=False, postfix=fn_postfix)

        # Serialize the run arguments and the model config as JSON; the
        # `with` blocks make sure the files are flushed and closed.
        import json
        with open(f"data/model/{fn_postfix}_args.json", 'w') as f_args:
            json.dump(args.__dict__, f_args)
        with open(f"data/model/{fn_postfix}_config.json", 'w') as f_config:
            json.dump(hpn_config.__dict__, f_config)

        print('model saved to', model_save_path)

    # lowest validation loss of the run (-1 if no validation loss was recorded)
    print(min(clf.hist.get('loss_valid',[-1])))
    
    # Optional plots: loss curves, top-k accuracy on valid and accuracy per
    # number of training examples.
    if args.plot_res:
        # NOTE(review): absolute import, whereas the other local imports of this
        # script are relative (`.utils`, `.data`, `.retroeval`) -- confirm which
        # one matches the way the script is launched.
        from plotutils import plot_topk, plot_nte

        plt.figure()
        clf.plot_loss()
        plt.draw()

        plt.figure()
        plot_topk(clf.hist, sets=['valid'])
        if args.dataset_type=='sm':
            # hard-coded reference values (k -> value), drawn as a dashed black line
            baseline_val_res = {1:0.4061, 10:0.6827, 50: 0.7883, 100:0.8400}
            plt.plot(list(baseline_val_res.keys()), list(baseline_val_res.values()), 'k.--')
        plt.draw()
        plt.figure()

        # Position of the best checkpoint counted from the end of the history
        # (1 == last entry): first by validation loss, then, if available,
        # overridden by the top-10 validation accuracy.
        best_cpt = np.array(clf.hist['loss_valid'])[::-1].argmin()+1
        print(best_cpt)
        try:
            best_cpt = np.array(clf.hist['t10_acc_valid'])[::-1].argmax()+1
            print(best_cpt)
        except Exception:
            # best effort: keep the loss-based checkpoint (a bare except would
            # also have swallowed KeyboardInterrupt / SystemExit)
            print('err with t10_acc_valid')
        plot_nte(clf.hist, dataset=args.dataset_type.capitalize(), last_cpt=best_cpt, include_bar=True, model_legend=args.exp_name,
                 n_samples=n_samples, z=1.96)
        # the figure is only written when the figure root directory exists
        if os.path.exists('data/figs/'):
            os.makedirs(f'data/figs/{args.exp_name}/', exist_ok=True)
            plt.savefig(f'data/figs/{args.exp_name}/training_examples_vs_top100_acc_{args.dataset_type}_{hash(str(args))}.pdf')
        plt.draw()
        fn_hist = clf.save_hist(prefix=f'USTPO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)

    
    # Single-step retrosynthesis evaluation: apply the templates to the test
    # products and compare the resulting reactants with the reference.
    # NOTE: `test_reactants_can` is only defined when the data was loaded from a csv.
    if args.ssretroeval:
        print('testing on the real test set ;)')
        from .data import load_templates
        from .retroeval import run_templates, topkaccuracy
        from .utils import sort_by_template_and_flatten

        templates = list(template_list.values())
        # the part of every template in front of the first '>'
        template_product_smarts = [str(s).split('>')[0] for s in templates]

        # Applicability of every template to every test product.
        print('execute all templates')
        test_product_smarts = [xi[0] for xi in X['test']]
        # cached on disk; the listed arguments do not take part in the cache key
        smarts2appl = memory.cache(smarts2appl, ignore=['njobs','nsplits', 'use_tqdm'])
        # appl is used as a pair of index sequences: (row = product, column = template)
        appl = smarts2appl(test_product_smarts, template_product_smarts, njobs=args.njobs)
        n_pairs = len(test_product_smarts) * len(template_product_smarts)
        n_appl = len(appl[0])
        print(n_pairs, n_appl, n_appl/n_pairs)

        # Forward pass on the test split with dummy all-zero labels.
        split = 'test'
        print('len(X_fp[test]):',len(X_fp[split]))
        # builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24
        y[split] = np.zeros(len(X[split]), dtype=int)
        clf.eval()
        # predictions left over from the training loop are reused when present
        if y_preds is None:
            y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False,
                                   split='ttest', bs=args.batch_size, only_loss=True, wandb=None)

        # With very many templates only the `kth` best scored applicable
        # templates of every product are executed.
        if y_preds.shape[1]>100000:
            kth = 200
            print(f'only evaluating top {kth} applicable predicted templates')
            # dense boolean applicability matrix
            appl_mtrx = np.zeros_like(y_preds, dtype=bool)
            appl_mtrx[appl[0], appl[1]] = 1

            appl_and_topkth = ([], [])
            for row in range(len(y_preds)):
                # unordered indices of the kth highest scores after masking by applicability
                argpreds = (np.argpartition(-(y_preds[row]*appl_mtrx[row]), kth, axis=0)[:kth])
                # drop non-applicable entries (when fewer than kth templates are applicable)
                mask = appl_mtrx[row][argpreds]
                argpreds = argpreds[mask]

                appl_and_topkth[0].extend([row] * len(argpreds))
                appl_and_topkth[1].extend(list(argpreds))

            appl = appl_and_topkth

        print('running the templates')
        prod_idx_reactants, prod_temp_reactants =  run_templates(test_product_smarts, templates, appl, njobs=args.njobs)
        # identical outcomes are aggregated by summing their scores
        flat_results = sort_by_template_and_flatten(y_preds, prod_idx_reactants, agglo_fun=sum)
        accs = topkaccuracy(test_reactants_can, flat_results, [*list(range(1,101)), 100000])

        # accs[k-1] belongs to top-k for k = 1..100; index 100 (reported as
        # 't101acc') is the entry for k = 100000
        mtrcs2 = {f't{k}acc_ttest':accs[k-1] for k in [1,2,3,5,10,20,50,100,101]}
        if wandb:
            wandb.log(mtrcs2)
        print('Single-step retrosynthesis-evaluation, results on ttest:')
        # header row: the metric names without the '_ttest' suffix
        for k in mtrcs2:
            print(k[:-6],end='\t')
        print()
        # value row in percent
        for v in mtrcs2.values():
            print(f'{v*100:2.2f}',end='\t')

    
    # Save a one-row summary of this experiment (arguments, key metrics and
    # file names) as a TSV named after the run id.
    EXP_DIR = 'data/experiments/'

    df = pd.DataFrame([args.__dict__])
    df['min_loss_valid'] = min(clf.hist.get('loss_valid', [-1]))
    # staticQK and retrosim get a training loss of 0
    df['min_loss_train'] = 0 if ((args.model_type=='staticQK') or (args.model_type=='retrosim')) else min(clf.hist.get('loss',[-1]))
    # Best effort: the accuracy columns are skipped when they cannot be
    # computed. `Exception` (instead of a bare except) keeps KeyboardInterrupt
    # and SystemExit working.
    try:
        df['max_t1_acc_valid'] = max(clf.hist.get('t1_acc_valid', [0]))
        df['max_t100_acc_valid'] = max(clf.hist.get('t100_acc_valid', [0]))
    except Exception:
        pass
    # wrapped in a list so that the whole object lands in the single row
    df['hist'] = [clf.hist]
    df['n_samples'] = [n_samples]

    df['fn_hist'] = fn_hist if fn_hist else None
    df['fn_model'] = '' if not args.save_model else model_save_path
    df['date'] = str(datetime.datetime.fromtimestamp(time()))
    df['cmd'] = ' '.join(sys.argv)

    # makedirs also creates a missing 'data' parent and tolerates an
    # already existing directory
    os.makedirs(EXP_DIR, exist_ok=True)

    df.to_csv(f'{EXP_DIR}{run_id}.tsv', sep='\t')