jinysun committed on
Commit
ece094e
·
1 Parent(s): 329951e

Delete RF.py

Browse files
Files changed (1) hide show
  1. RF.py +0 -190
RF.py DELETED
@@ -1,190 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Mon Sep 4 10:38:59 2023
4
-
5
- @author: BM109X32G-10GPU-02
6
- """
7
-
8
-
9
- from sklearn.metrics import confusion_matrix
10
- import matplotlib.pyplot as plt
11
- import numpy as np
12
-
13
- from sklearn.datasets import make_blobs
14
- import json
15
- import numpy as np
16
- import math
17
- from tqdm import tqdm
18
- from scipy import sparse
19
- from sklearn.metrics import median_absolute_error,r2_score, mean_absolute_error,mean_squared_error
20
- import pickle
21
-
22
-
23
- import pandas as pd
24
- import matplotlib.pyplot as plt
25
- from rdkit import Chem
26
-
27
- from sklearn.ensemble import RandomForestRegressor
28
- from sklearn.model_selection import train_test_split
29
- from sklearn.preprocessing import MinMaxScaler
30
- from sklearn.neural_network import MLPClassifier
31
- from sklearn.svm import SVC
32
- from tensorflow.keras.models import Model, load_model
33
- from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
34
- from tensorflow.keras import metrics, optimizers
35
- from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
36
-
37
def split_smiles(smiles, kekuleSmiles=True):
    """Split a SMILES string into a list of tokens.

    The string is first re-written by RDKit (``Chem.MolToSmiles`` with the
    given ``kekuleSmiles`` flag).  If RDKit cannot parse it, or the
    conversion fails for any reason, the input is tokenized unchanged.

    Tokenization rule: an uppercase character immediately followed by a
    lowercase character other than ``"c"`` forms one two-character token
    (e.g. ``"Cl"``, ``"Br"``); every other character is its own token.

    Args:
        smiles: SMILES string to tokenize.
        kekuleSmiles: Forwarded to ``Chem.MolToSmiles``.

    Returns:
        List of string tokens; an empty list for an empty string.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparseable string by returning None.
        if mol is not None:
            smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
    except Exception:
        # Deliberate best-effort: fall back to tokenizing the raw input.
        # (Narrower than a bare ``except`` so Ctrl-C / SystemExit propagate.)
        pass

    splitted_smiles = []
    pos = 0
    length = len(smiles)
    while pos < length:
        current = smiles[pos]
        following = smiles[pos + 1] if pos + 1 < length else ""
        if current.isupper() and following.islower() and following != "c":
            # Two-character token; consume both characters at once.
            splitted_smiles.append(current + following)
            pos += 2
        else:
            splitted_smiles.append(current)
            pos += 1
    return splitted_smiles
66
-
67
def get_maxlen(all_smiles, kekuleSmiles=True):
    """Return the largest token count over all SMILES in ``all_smiles``.

    Each entry is tokenized with ``split_smiles``; entries for which that
    returns ``None`` are ignored.  Returns 0 when nothing contributes.
    Progress is reported through ``tqdm``.
    """
    token_lists = (
        split_smiles(entry, kekuleSmiles=kekuleSmiles)
        for entry in tqdm(all_smiles)
    )
    return max(
        (len(tokens) for tokens in token_lists if tokens is not None),
        default=0,
    )
75
def get_dict(all_smiles, save_path, kekuleSmiles=True):
    """Build the token vocabulary for a collection of SMILES and save it.

    The vocabulary starts with the single-space token ``' '`` and then
    lists every distinct token produced by ``split_smiles`` in order of
    first appearance.  The order matters: ``one_hot_coding`` uses a token's
    position in this list as its column index.

    Args:
        all_smiles: Iterable of SMILES strings.
        save_path: Path of the JSON file the vocabulary is written to.
        kekuleSmiles: Forwarded to ``split_smiles``.

    Returns:
        The vocabulary as a list of strings.
    """
    words = [' ']
    # Companion set gives O(1) membership tests while ``words`` keeps order.
    seen = set(words)
    for smi in tqdm(all_smiles):
        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
        if spt is None:
            continue
        for w in spt:
            if w not in seen:
                seen.add(w)
                words.append(w)
    # Written as UTF-8 to match how ``main`` reads the dictionary back.
    with open(save_path, 'w', encoding='utf-8') as js:
        json.dump(words, js)
    return words
89
-
90
def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
    """One-hot encode a SMILES string as a sparse matrix.

    Row ``j`` of the result corresponds to the ``j``-th token of ``smi``
    and has a 1 in the column given by that token's first position in
    ``words``.  Tokens not present in ``words`` leave their row all-zero,
    and tokens at position ``max_len`` or beyond are dropped.

    Args:
        smi: SMILES string to encode.
        words: Vocabulary (sequence of tokens) defining the columns.
        kekuleSmiles: Forwarded to ``split_smiles``.
        max_len: Number of rows of the output matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(max_len, len(words))``, or
        ``None`` if ``split_smiles`` returns ``None``.
    """
    spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
    if spt is None:
        return None

    # Map each token to its first index, mirroring ``list.index`` semantics
    # without a linear scan per token.
    column_of = {}
    for idx, word in enumerate(words):
        column_of.setdefault(word, idx)

    coord_j = []
    coord_k = []
    for j, w in enumerate(spt):
        if j >= max_len:
            break
        k = column_of.get(w)
        if k is None:
            # Unknown token: skipped on purpose, its row stays empty.
            continue
        coord_j.append(j)
        coord_k.append(k)

    data = np.repeat(1, len(coord_j))
    output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
    return output
108
def split_dataset(dataset, ratio):
    """Split a dataset into two consecutive parts without shuffling.

    Args:
        dataset: Any sliceable sequence.
        ratio: Fraction of the items that go into the first part; the cut
            index is ``int(ratio * len(dataset))``.

    Returns:
        Tuple ``(dataset[:n], dataset[n:])`` in the original order.
    """
    n = int(ratio * len(dataset))
    return dataset[:n], dataset[n:]
114
def plot_confusion_matrix(cm, savename, title='Confusion Matrix', classes=None):
    """Draw a confusion matrix, save it as a PNG and show it.

    Args:
        cm: 2-D confusion matrix (array-like); rows are drawn on the y axis
            ("Actual label"), columns on the x axis ("Predict label").
        savename: Path the figure is saved to (PNG format).
        title: Figure title.
        classes: Tick labels, one per class.  When omitted, a module-level
            ``classes`` is used if one exists, otherwise the labels
            ``"0" .. "n-1"`` are generated from the matrix size.
    """
    matrix = np.asarray(cm)
    n_rows, n_cols = matrix.shape

    if classes is None:
        # Earlier versions read a module-level ``classes``; honour it if set.
        classes = globals().get("classes")
    if classes is None:
        classes = [str(i) for i in range(n_rows)]

    plt.figure(figsize=(12, 8), dpi=100)
    # Kept from the original: this changes NumPy's global print precision.
    np.set_printoptions(precision=2)

    # Annotate every cell whose value is large enough to be worth printing.
    for y_val in range(n_rows):
        for x_val in range(n_cols):
            c = matrix[y_val][x_val]
            if c > 0.001:
                plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')

    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(classes)))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predict label')

    # Offset the minor ticks by half a cell so the grid runs between cells.
    tick_marks = np.array(range(len(classes))) + 0.5
    axes = plt.gca()
    axes.set_xticks(tick_marks, minor=True)
    axes.set_yticks(tick_marks, minor=True)
    axes.xaxis.set_ticks_position('none')
    axes.yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)

    # Save before showing the figure.
    plt.savefig(savename, format='png')
    plt.show()
147
def main(sm):
    """Run the pickled model on a single SMILES string.

    The vocabulary is read from ``dict.json`` and the model from
    ``predict.dat``, both relative to the current working directory.  The
    SMILES is canonicalized with RDKit, one-hot encoded to 600 rows,
    flattened to a single feature vector and passed to ``model.predict``.

    Args:
        sm: SMILES string of the molecule.

    Returns:
        Whatever the loaded model's ``predict`` returns for the one sample.

    Raises:
        ValueError: If ``sm`` cannot be parsed or encoded.
    """
    with open("dict.json", "r", encoding="utf-8") as f:
        words = json.load(f)

    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # Previously this surfaced later as an obscure axis error on an
        # empty feature array.
        raise ValueError(f"could not parse SMILES: {sm!r}")

    encoded = one_hot_coding(Chem.MolToSmiles(mol), words, max_len=600)
    if encoded is None:
        raise ValueError(f"could not encode SMILES: {sm!r}")

    # Shape (1, 600, len(words)): a batch holding the single sample.
    features = np.asarray([encoded.toarray()])

    from tensorflow.keras import backend as K

    # NOTE(security): unpickling executes arbitrary code; ``predict.dat``
    # must come from a trusted source.
    with open(r"predict.dat", "rb") as model_file:
        regressor = pickle.load(model_file)

    # Flatten each sample to one row, as the model expects 2-D input.
    flat_features = K.cast_to_floatx(features).reshape((features.shape[0], -1))
    Y_predict = regressor.predict(flat_features)
    return Y_predict
188
-
189
if __name__ == "__main__":
    # Raw string: the SMILES contains backslashes (cis/trans bond markers),
    # and "\S" is an invalid escape sequence in a normal string literal.
    x = main(r"CCCCCCC1=CC=C(C2(C3=CC=C(CCCCCC)C=C3)C3=CC4=C(C=C3C3=C2C=C(/C=C2\SC(=S)N(CC)C2=O)S3)C(C2=CC=C(CCCCCC)C=C2)(C2=CC=C(CCCCCC)C=C2)C2=C4SC(/C=C3\SC(=S)N(CC)C3=O)=C2)C=C1")