Delete RF.py
RF.py
DELETED
@@ -1,190 +0,0 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 4 10:38:59 2023

@author: BM109X32G-10GPU-02
"""

import json
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import sparse
from rdkit import Chem

from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (confusion_matrix, median_absolute_error,
                             r2_score, mean_absolute_error, mean_squared_error)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras import metrics, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

def split_smiles(smiles, kekuleSmiles=True):
    """Tokenize a SMILES string, keeping two-letter element symbols (e.g. Cl, Br) together."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
    except Exception:
        pass  # fall back to the raw input string if RDKit cannot parse it
    if len(smiles) == 1:
        return [smiles]
    splitted_smiles = []
    for j, k in enumerate(smiles):
        if j == 0:
            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
                splitted_smiles.append(k + smiles[j + 1])
            else:
                splitted_smiles.append(k)
        elif j != 0 and j < len(smiles) - 1:
            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
                splitted_smiles.append(k + smiles[j + 1])
            elif k.islower() and smiles[j - 1].isupper() and k != "c":
                pass  # second letter of a two-letter symbol, already consumed above
            else:
                splitted_smiles.append(k)
        elif j == len(smiles) - 1:
            if k.islower() and smiles[j - 1].isupper() and k != "c":
                pass
            else:
                splitted_smiles.append(k)
    return splitted_smiles

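# Sketch of the expected tokenization (illustrative values, not from the original file):
#   split_smiles("CCl")  -> ['C', 'Cl']    # two-letter chlorine kept as one token
#   split_smiles("C=O")  -> ['C', '=', 'O']
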
def get_maxlen(all_smiles, kekuleSmiles=True):
    maxlen = 0
    for smi in tqdm(all_smiles):
        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
        if spt is None:
            continue
        maxlen = max(maxlen, len(spt))
    return maxlen

def get_dict(all_smiles, save_path, kekuleSmiles=True):
    """Build the token vocabulary (index 0 is the padding token ' ') and save it as JSON."""
    words = [' ']
    for smi in tqdm(all_smiles):
        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
        if spt is None:
            continue
        for w in spt:
            if w not in words:
                words.append(w)
    with open(save_path, 'w') as js:
        json.dump(words, js)
    return words

def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
    """Encode one SMILES as a sparse (max_len, len(words)) one-hot matrix."""
    coord_j = []
    coord_k = []
    spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
    if spt is None:
        return None
    for j, w in enumerate(spt):
        if j >= max_len:
            break
        try:
            k = words.index(w)
        except ValueError:
            continue  # token not in the vocabulary: leave that row all-zero
        coord_j.append(j)
        coord_k.append(k)
    data = np.repeat(1, len(coord_j))
    return sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))

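# Usage sketch (toy vocabulary; the real one is loaded from dict.json in main()):
#   words = [' ', 'C', 'O', '=']
#   x = one_hot_coding("C=O", words, max_len=600)  # scipy.sparse.csr_matrix
#   x.todense().shape                              # (600, 4)
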
def split_dataset(dataset, ratio):
    """Split a dataset in two at the given ratio (optional shuffle left commented out)."""
    # np.random.seed(111)  # fix the seed for shuffle.
    # np.random.shuffle(dataset)
    n = int(ratio * len(dataset))
    return dataset[:n], dataset[n:]

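# Usage sketch: an 80/20 train/test split.
#   train, test = split_dataset(list(range(10)), 0.8)  # -> first 8 items, last 2
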
def plot_confusion_matrix(cm, savename, classes, title='Confusion Matrix'):
    # classes: axis labels, one per row/column of cm
    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)

    # annotate each cell with its value
    ind_array = np.arange(len(classes))
    x, y = np.meshgrid(ind_array, ind_array)
    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm[y_val][x_val]
        if c > 0.001:
            plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(classes)))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # offset the ticks so the grid lines fall between cells
    tick_marks = np.array(range(len(classes))) + 0.5
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)

    # save and show the confusion matrix
    plt.savefig(savename, format='png')
    plt.show()

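# Usage sketch (hypothetical labels; classes must match the dimensions of cm):
#   cm = confusion_matrix(y_true, y_pred)
#   plot_confusion_matrix(cm, "cm.png", classes=["low", "mid", "high"])
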
def main(sm):
    with open("dict.json", "r", encoding="utf-8") as f:
        words = json.load(f)

    inchis = [sm]
    rts = [0]

    # canonicalize the input SMILES; skip anything RDKit cannot parse
    smiles, targets = [], []
    for i, inc in enumerate(tqdm(inchis)):
        mol = Chem.MolFromSmiles(inc)
        if mol is None:
            continue
        smiles.append(Chem.MolToSmiles(mol))
        targets.append(rts[i])

    # one-hot encode each SMILES into a (600, len(words)) matrix
    features = []
    for i, smi in enumerate(tqdm(smiles)):
        xi = one_hot_coding(smi, words, max_len=600)
        if xi is not None:
            features.append(xi.todense())
    features = np.asarray(features)
    targets = np.asarray(targets)
    X_test = features
    Y_test = targets

    # this estimator is never fitted; predictions come from the pickled model below
    model = RandomForestRegressor(n_estimators=100, criterion='friedman_mse')

    # load the pre-trained random forest and predict on the flattened features
    rf_model = pickle.load(open(r"predict.dat", "rb"))
    Y_predict = rf_model.predict(
        K.cast_to_floatx(X_test).reshape(
            (np.size(X_test, 0), np.size(X_test, 1) * np.size(X_test, 2))))

    return Y_predict


if __name__ == "__main__":
    x = main("CCCCCCC1=CC=C(C2(C3=CC=C(CCCCCC)C=C3)C3=CC4=C(C=C3C3=C2C=C(/C=C2\SC(=S)N(CC)C2=O)S3)C(C2=CC=C(CCCCCC)C=C2)(C2=CC=C(CCCCCC)C=C2)C2=C4SC(/C=C3\SC(=S)N(CC)C3=O)=C2)C=C1")
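For context, a minimal sketch of how this script was used before deletion (hypothetical caller; dict.json, the token vocabulary, and predict.dat, the pickled regressor, must sit in the working directory):

    from RF import main
    y = main("CCO")  # returns a length-1 numpy array of predicted values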