Justin-12138 committed on
Commit
15afd18
·
1 Parent(s): 1aa5ac2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +20 -474
  2. src.py +407 -0
app.py CHANGED
@@ -1,475 +1,21 @@
1
  import gradio as gr
2
- import matplotlib.pyplot as plt
3
- import numpy as np
4
- import pandas as pd
5
- import csv
6
- import seaborn as sns
7
- from scipy.stats import f_oneway
8
- from sklearn.ensemble import RandomForestClassifier
9
- from sklearn.model_selection import cross_val_score
10
- from sklearn.naive_bayes import GaussianNB
11
- from sklearn.neighbors import KNeighborsClassifier
12
- from sklearn.svm import SVC
13
- from sklearn.tree import DecisionTreeClassifier
14
- from sklearn.linear_model import LassoLarsCV
15
- from sklearn.preprocessing import LabelEncoder
16
- from sklearn.model_selection import train_test_split
17
- from sklearn.preprocessing import StandardScaler
18
- from sklearn.metrics import confusion_matrix
19
-
20
-
21
- class MyModel:
22
- def __init__(self, model):
23
- self.clf = model
24
- self.scaler = None
25
- self.label_encoder = None
26
-
27
- def train(self, X, Y):
28
- # 对标签进行编码
29
- self.label_encoder = LabelEncoder()
30
- Y = self.label_encoder.fit_transform(Y)
31
-
32
- # 对特征进行标准化
33
- self.scaler = StandardScaler()
34
- X = self.scaler.fit_transform(X)
35
-
36
- # 划分训练集和测试集
37
- X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
38
-
39
- # 训练模型
40
- self.clf.fit(X_train, Y_train)
41
-
42
- def predict_samples(self, samples):
43
- # 对样本进行相同的预处理步骤
44
- samples = self.scaler.transform(samples)
45
-
46
- # 使用模型进行预测
47
- predictions = self.clf.predict(samples)
48
-
49
- # 将预测的标签解码回原始值
50
- predictions = self.label_encoder.inverse_transform(predictions)
51
-
52
- return predictions
53
-
54
-
55
- def add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list):
56
- max_score_index = np.argmax(np.array(temp_scores))
57
- current_score.append(temp_scores[max_score_index])
58
- selected_indices.add(max_score_index)
59
- selected_indices_list.append(max_score_index)
60
-
61
-
62
- def fs(data, method, num_fea_int, clf, testsample):
63
- num_fea_int = int(num_fea_int)
64
- if method == 'MRMR_FCD':
65
- data = pd.read_csv(data.name)
66
- X = data.iloc[:, :-1].values
67
- y = data['Label'].values
68
- num_features = len(X[0])
69
- f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_features)]
70
- # 添加起始特征的分数到current_score
71
- current_score = [max(f_test_scores)]
72
- # 索引从最高分数的特征开始
73
- start_feature_index = f_test_scores.index(max(f_test_scores))
74
- selected_indices = set()
75
- selected_indices_list = []
76
- selected_indices.add(start_feature_index)
77
- selected_indices_list.append(start_feature_index)
78
- pearson_score_matrix = np.zeros((num_features, num_features))
79
- for _ in range(num_fea_int - 1):
80
- temp_scores = []
81
- for i in range(num_features):
82
- if i in selected_indices:
83
- temp_scores.append(-float('inf'))
84
- else:
85
- f_test_score = f_test_scores[i]
86
- diff = 0
87
- for j in selected_indices:
88
- # pearson score
89
- if j > i:
90
- if pearson_score_matrix[i][j] == 0:
91
- pearson_score_matrix[i][j] = np.corrcoef(X[:, i], X[:, j])[0, 1]
92
- diff += pearson_score_matrix[i][j]
93
- else:
94
- if pearson_score_matrix[j][i] == 0:
95
- pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
96
- diff += pearson_score_matrix[j][i]
97
- temp_scores.append(f_test_score - diff / len(selected_indices))
98
- add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
99
- combined = list(zip(selected_indices_list, current_score))
100
- # 使用sorted()函数对合并后的列表进行排序,key参数指定按照分数排序,reverse=True表示降序排序
101
- sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)
102
- # 将索引和特征分数写入csv文件
103
- with open('index-score.csv', 'w', newline='') as file:
104
- writer = csv.writer(file)
105
- writer.writerow(["Index", "Score"]) # 写入列名
106
- writer.writerows(sorted_combined)
107
-
108
- inde = []
109
- scores = []
110
- for indy in sorted_combined:
111
- inde.append(str(indy[0] + 1))
112
- scores.append(indy[1])
113
- # 创建第一个图索引-分数图
114
- plt.figure(1, figsize=(24, 10))
115
- plt.title("mRMR-FCD()")
116
- plt.plot(inde, scores)
117
- plt.xlabel("Feature Index")
118
- plt.ylabel("Feature Score")
119
- plt.savefig('Index_Score.png')
120
-
121
- ff = [] # 将字符串索引转化成整型
122
- for fire in inde:
123
- ff.append(int(fire) - 1)
124
- # 选择分类器
125
- if clf == 'RF':
126
- clf = RandomForestClassifier(n_jobs=-1)
127
- elif clf == 'KNN':
128
- clf = KNeighborsClassifier()
129
- elif clf == 'DT':
130
- clf = DecisionTreeClassifier()
131
- elif clf == 'SVM':
132
- clf = SVC(C=1.0, kernel='rbf')
133
- elif clf == 'Naive Bayes':
134
- clf = GaussianNB()
135
-
136
- acc = []
137
- # 对于index列表中的每个特征索引
138
- for i in range(len(ff)):
139
- # 使用前i个特征进行交叉验证
140
- selected_features = X[:, [int(j) - 1 for j in ff[:i + 1]]]
141
- scores = cross_val_score(clf, selected_features, y, cv=5)
142
- # 计算平均准确率并添加到acc列表中
143
- acc.append(scores.mean())
144
- max_acc = max(acc)
145
- max_index = acc.index(max_acc) + 1
146
-
147
- # 创建第二个图IFS准确率率图
148
- plt.figure(2, figsize=(24, 10))
149
- plt.title("IFS_" + str(method) + "_Accuracy")
150
- plt.plot(max_index, max_acc, 'ro')
151
- plt.plot(acc)
152
- plt.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, 20),
153
- ha='center')
154
- # 设置x轴和y轴的标签
155
- plt.xlabel("Top n features")
156
- plt.ylabel('Accuracy')
157
- plt.savefig('acc.png')
158
-
159
- # 从test.csv加载测试样本和标签
160
- testsample = pd.read_csv(testsample.name)
161
- test_samples = testsample.iloc[:, :-1].values
162
- test_labels = testsample.iloc[:, -1].values
163
-
164
- # 加载模型
165
- models = SVC(C=1.0, kernel='rbf')
166
- my_model = MyModel(models)
167
- my_model.train(X, y)
168
-
169
- # 预测测试样本的标签
170
- predictions = my_model.predict_samples(test_samples)
171
- # 计算混淆矩阵
172
- cm = confusion_matrix(test_labels, predictions)
173
-
174
- # 使用seaborn绘制混淆矩阵热力图
175
- plt.figure(figsize=(24, 10))
176
- sns.heatmap(cm, annot=True, fmt='d')
177
- plt.xlabel('predict labels')
178
- plt.ylabel('True labels')
179
- plt.grid(True)
180
- plt.savefig('confusion_matrix.png')
181
-
182
- return 'Index_Score.png', 'acc.png', "confusion_matrix.png", "index-score.csv"
183
-
184
- elif method == 'MRMR_FCQ':
185
- data = pd.read_csv(data.name)
186
- X = data.iloc[:, :-1].values
187
- y = data['Label'].values
188
- num_fea_inttures = len(X[0])
189
- f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_fea_inttures)]
190
-
191
- # 添加起始特征的分数到current_score
192
- current_score = [max(f_test_scores)]
193
-
194
- # 索引从0开始
195
- # start_feature_index = random.randint(0, num_features - 1)
196
- # 索引从最高分数的特征开始
197
- start_feature_index = f_test_scores.index(max(f_test_scores))
198
-
199
- selected_indices = set()
200
- selected_indices_list = []
201
- selected_indices.add(start_feature_index)
202
- selected_indices_list.append(start_feature_index)
203
- pearson_score_matrix = np.zeros((num_fea_inttures, num_fea_inttures))
204
- for _ in range(num_fea_int - 1):
205
- temp_scores = []
206
- for i in range(num_fea_inttures):
207
- if i in selected_indices:
208
- temp_scores.append(-float('inf'))
209
- else:
210
- f_test_score = f_test_scores[i]
211
- q = 0
212
- for j in selected_indices:
213
- # pearson score
214
- if j > i:
215
- if pearson_score_matrix[i][j] == 0:
216
- pearson_score_matrix[i][j] = np.corrcoef(X[:, i], X[:, j])[0, 1]
217
- q += pearson_score_matrix[i][j]
218
- else:
219
- if pearson_score_matrix[j][i] == 0:
220
- pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
221
- q += pearson_score_matrix[j][i]
222
- temp_scores.append(f_test_score / (q / len(selected_indices)))
223
- add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
224
- combined = list(zip(selected_indices_list, current_score))
225
-
226
- # 使用sorted()函数对合并后的列表进行排序,key参数指定按照分数排序,reverse=True表示降序排序
227
- sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)
228
- # 将索引和特征分数写入csv文件
229
- with open('index-score.csv', 'w', newline='') as file:
230
- writer = csv.writer(file)
231
- writer.writerow(["Index", "Score"]) # 写入列名
232
- writer.writerows(sorted_combined)
233
-
234
- inde = []
235
- scores = []
236
- for indy in sorted_combined:
237
- inde.append(str(indy[0] + 1))
238
- scores.append(indy[1])
239
- # 创建第一个图索引-分数图
240
- plt.figure(1, figsize=(24, 10))
241
- plt.title("mRMR-FCD()")
242
- plt.plot(inde, scores)
243
- plt.xlabel("Feature Index")
244
- plt.ylabel("Feature Score")
245
- plt.savefig('Index_Score.png')
246
-
247
- ff = [] # 将字符串索引转化成整型
248
- for fire in inde:
249
- ff.append(int(fire) - 1)
250
- # 选择分类器
251
- if clf == 'RF':
252
- clf = RandomForestClassifier(n_jobs=-1)
253
- elif clf == 'KNN':
254
- clf = KNeighborsClassifier()
255
- elif clf == 'DT':
256
- clf = DecisionTreeClassifier()
257
- elif clf == 'SVM':
258
- clf = SVC(C=1.0, kernel='rbf')
259
- elif clf == 'Naive Bayes':
260
- clf = GaussianNB()
261
-
262
- acc = []
263
- # 对于index列表中的每个特征索引
264
- for i in range(len(ff)):
265
- # 使用前i个特征进行交叉验证
266
- selected_features = X[:, [int(j) - 1 for j in ff[:i + 1]]]
267
- scores = cross_val_score(clf, selected_features, y, cv=5)
268
- # 计算平均准确率并添加到acc列表中
269
- acc.append(scores.mean())
270
- max_acc = max(acc)
271
- max_index = acc.index(max_acc) + 1
272
-
273
- # 创建第二个图IFS准确率率图
274
- plt.figure(2, figsize=(24, 10))
275
- plt.title("IFS_" + str(method) + "_Accuracy")
276
- plt.plot(max_index, max_acc, 'ro')
277
- plt.plot(acc)
278
- plt.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, 20),
279
- ha='center')
280
- # 设置x轴和y轴的标签
281
- plt.xlabel("Top n features")
282
- plt.ylabel('Accuracy')
283
- plt.savefig('acc.png')
284
-
285
- # 从test.csv加载测试样本和标签
286
- testsample = pd.read_csv(testsample.name)
287
- test_samples = testsample.iloc[:, :-1].values
288
- test_labels = testsample.iloc[:, -1].values
289
-
290
- # 加载模型
291
- models = SVC(C=1.0, kernel='rbf')
292
- my_model = MyModel(models)
293
- my_model.train(X, y)
294
-
295
- # 预测测试样本的标签
296
- predictions = my_model.predict_samples(test_samples)
297
- # 计算混淆矩阵
298
- cm = confusion_matrix(test_labels, predictions)
299
-
300
- # 使用seaborn绘制混淆矩阵热力图
301
- plt.figure(figsize=(24, 10))
302
- sns.heatmap(cm, annot=True, fmt='d')
303
- plt.xlabel('predict labels')
304
- plt.ylabel('True labels')
305
- plt.grid(True)
306
- plt.savefig('confusion_matrix.png')
307
-
308
- return 'Index_Score.png', 'acc.png', "confusion_matrix.png", "index-score.csv"
309
-
310
-
311
- # 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
312
- # 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
313
- # 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
314
- elif method == 'Lasso':
315
- data = pd.read_csv(data.name)
316
- X = data.iloc[:, :-1]
317
- y = data.iloc[:, -1:].values.flatten()
318
-
319
- cl = LassoLarsCV(cv=20, max_iter=80000).fit(X, y)
320
-
321
- importance = np.abs(cl.coef_)
322
- feature_names = list(X)
323
- a = len(feature_names)
324
-
325
- idx_features = (-importance).argsort()[:a]
326
- name_features = np.array(feature_names)[idx_features]
327
- result = pd.DataFrame({'index': idx_features, 'Score': importance[idx_features]})
328
- result_rank = result.sort_values(by='Score', ascending=False, ignore_index=True)
329
- result_rank.to_csv("index-score.csv")
330
- inde = result_rank['index'].tolist()
331
- score = result_rank['Score'].tolist()
332
-
333
- index = []
334
- for i in inde:
335
- index.append(str(i))
336
- plt.figure(1, figsize=(24, 12))
337
- plt.title(str(method))
338
- plt.plot(index[:num_fea_int], score[:num_fea_int])
339
-
340
- # 设置x轴和y轴的标签
341
- plt.xlabel('Feature Index')
342
- plt.ylabel('Feature Score')
343
- plt.savefig('Index_Score.png')
344
- if clf == 'RF':
345
- clf = RandomForestClassifier(n_jobs=-1)
346
- elif clf == 'KNN':
347
- clf = KNeighborsClassifier()
348
- elif clf == 'DT':
349
- clf = DecisionTreeClassifier()
350
- elif clf == 'SVM':
351
- clf = SVC()
352
- elif clf == 'Naive Bayes':
353
- clf = GaussianNB()
354
-
355
- inde = inde[:num_fea_int]
356
- index = index[:num_fea_int]
357
- acc = []
358
- # 对于index列表中的每个特征索引
359
-
360
- X = data.iloc[:, :-1].values
361
- for i in range(len(index)):
362
- # 使用前i个特征进行交叉验证
363
- selected_features = X[:, [int(j) - 1 for j in inde[:i + 1]]]
364
- scores = cross_val_score(clf, selected_features, y, cv=5)
365
- # 计算平均准确率并添加到acc列表中
366
- acc.append(scores.mean())
367
- max_acc = max(acc)
368
- max_index = acc.index(max_acc) + 1
369
-
370
- # ax2 = fig.add_subplot(212)
371
- # ax2.set_title("IFS_" + str(method) + "_Accuracy")
372
- plt.figure(2, figsize=(24, 10))
373
- plt.plot(max_index, max_acc, 'ro')
374
- plt.plot(acc)
375
- plt.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
376
- ha='center')
377
- # 设置x轴和y轴的标签
378
- # ax2.set_xlabel()
379
- # ax2.set_ylabel('Accuracy')
380
- plt.xlabel('Top n features')
381
- plt.ylabel('Accuracy')
382
- plt.grid(True)
383
- plt.savefig('acc.png')
384
-
385
- testsample = pd.read_csv(testsample.name)
386
- test_samples = testsample.iloc[:, :-1].values
387
- test_labels = testsample.iloc[:, -1].values
388
- models = SVC(C=1.0, kernel='rbf')
389
- my_model = MyModel(models)
390
- my_model.train(X, y)
391
-
392
- # 预测测试样本的标签并计算准确率
393
- predictions = my_model.predict_samples(test_samples)
394
- # 计算混淆矩阵
395
- cm = confusion_matrix(test_labels, predictions)
396
-
397
- # 使用seaborn绘制混淆矩阵热力图
398
- plt.figure(figsize=(24, 10))
399
- sns.heatmap(cm, annot=True, fmt='d')
400
- plt.xlabel('predict labels')
401
- plt.ylabel('True labels')
402
- plt.grid(True)
403
- plt.savefig('confusion_matrix.png')
404
-
405
- return 'Index_Score.png', 'acc.png', "confusion_matrix.png",'index-score.csv'
406
-
407
- elif method == 'Ensemble':
408
- pass
409
- elif method == 'CI':
410
- pass
411
-
412
-
413
- title = "FSALs: Robust Feature selection framework"
414
- description = r"""<center><img src='https://raw.githubusercontent.com/Justin-12138/bio_if/d1fdf085f8e679dcceecc2c05014b1d4a237e033/assets/favicon.svg' alt='FSALs logo'></center>
415
- <b>Official Gradio demo</b> for <a href='https://huggingface.co/spaces/Justin-12138/FSALA' target='_blank'><b>Application of Causal Inference in Alzheimer's Disease(CCFC2023)</b></a>.<br>
416
- 🔥 Fsals is a Robust feature selection framework based on causal inference. <br>
417
- 🤗 Try using fsals in different data sets.!<br>
418
- """
419
- article = r"""
420
- If FSALs is helpful, please help to ⭐ the <a href='https://github.com/Justin-12138/bio_if' target='_blank'>Github Repo</a>. Thanks!
421
- [![GitHub Stars](https://img.shields.io/github/stars/Justin-12138/bio_if?style=social)](https://github.com/Justin-12138/bio_if)
422
-
423
- ---
424
-
425
- 📝 **Citation**
426
-
427
- If our work is useful for your research, please consider citing:
428
- ```bibtex
429
- @article{zlhl2023,
430
- author = {Xiaolong Zhou, Zhao Liu, Yuchen Huang, Kun Lin},
431
- title = {A Novel Ensemble Feature Selection Method for Biomarkers of Alzheimer's disease},
432
- booktitle = {GUET Publisher},
433
- year = {2023}
434
- }
435
- ```
436
-
437
- 📋 **License**
438
-
439
- This project is licensed under <a rel="license" href="https://github.com/Justin-12138/bio_if/blob/main/LICENSE">GPL License 2.0</a>.
440
- Redistribution and use for non-commercial purposes should follow this license.
441
-
442
- 📧 **Contact**
443
-
444
- If you have any questions, please feel free to reach me out at <b>[email protected]</b>.
445
-
446
- <div>
447
- 🤗 Find Me:
448
- <a href="https://github.com/Justin-12138"><img style="margin-top:0.5em; margin-bottom:2em" src="https://img.shields.io/github/followers/Justin-12138?style=social" alt="Github Follow"></a>
449
- </div>
450
- """
451
-
452
- iface = gr.Interface(
453
- fn=fs,
454
- title=title,
455
- description=description,
456
-
457
- inputs=["file",
458
- gr.inputs.Radio(['MRMR_FCD', 'MRMR_FCQ', 'CFS', 'Lasso', 'Ensemble', 'CI']),
459
- gr.inputs.Number(),
460
- gr.inputs.Radio(['RF', 'SVM', 'KNN', 'DT', 'Naive Bayes']),
461
- "file"
462
- ],
463
- outputs=["image", "image", "image", "file"],
464
- article=article,
465
- examples=[
466
- ["example_data.csv", 'MRMR_FCQ', 20, 'RF', "test.csv"],
467
- ["example_data.csv", 'MRMR_FCD', 10, 'SVM', "test.csv"],
468
- ["example_data.csv", 'MRMR_FCD', 30, 'KNN', "test.csv"],
469
- ["example_data.csv", 'Lasso', 50, 'DT', "test.csv"],
470
- ["example_data.csv", 'Lasso', 40, 'Naive Bayes', "test.csv"],
471
- ],
472
- allow_flagging="never"
473
- )
474
-
475
- iface.launch()
 
1
  import gradio as gr
2
+ from src import des, fs
3
+
4
if __name__ == '__main__':
    # Interface texts and components are looked up by key from src.des().
    interface_options = {
        key: des(key)
        for key in ("title", "description", "article", "inputs", "outputs")
    }

    # Each row: training CSV, selection method, number of features,
    # classifier used for cross-validation, test CSV.
    demo_examples = [
        ["example_data.csv", 'MRMR_FCQ', 20, 'RF', "test.csv"],
        ["example_data.csv", 'MRMR_FCD', 10, 'SVM', "test.csv"],
        ["example_data.csv", 'MRMR_FCD', 30, 'KNN', "test.csv"],
        ["example_data.csv", 'Lasso', 30, 'DT', "test.csv"],
        ["example_data.csv", 'Lasso', 20, 'Naive Bayes', "test.csv"],
    ]

    iface = gr.Interface(
        fn=fs,
        examples=demo_examples,
        allow_flagging="never",
        **interface_options,
    )
    iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import gradio as gr
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ from scipy.stats import f_oneway
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.linear_model import LassoLarsCV
10
+ from sklearn.model_selection import cross_val_score
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.naive_bayes import GaussianNB
13
+ from sklearn.neighbors import KNeighborsClassifier
14
+ from sklearn.preprocessing import LabelEncoder
15
+ from sklearn.preprocessing import StandardScaler
16
+ from sklearn.svm import SVC
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.metrics import confusion_matrix
19
+
20
+
21
class MyModel:
    """Pair a classifier with the label encoder and feature scaler fitted alongside it."""

    def __init__(self, model):
        """Store *model*; the scaler and label encoder stay ``None`` until ``train`` runs."""
        self.clf = model
        self.scaler = None
        self.label_encoder = None

    def train(self, X, Y):
        """Fit the label encoder, the scaler and the classifier on ``(X, Y)``."""
        # Encode the labels.
        self.label_encoder = LabelEncoder()
        encoded_labels = self.label_encoder.fit_transform(Y)

        # Standardise the features.
        self.scaler = StandardScaler()
        scaled_features = self.scaler.fit_transform(X)

        # Random 70/30 split; the classifier is fitted on the 70 % part only.
        # NOTE(review): the held-out 30 % is never evaluated in this class.
        X_train, _, Y_train, _ = train_test_split(
            scaled_features, encoded_labels, test_size=0.3)

        # Train the model.
        self.clf.fit(X_train, Y_train)

    def predict_samples(self, samples):
        """Scale *samples*, predict them and return the labels decoded to their original values."""
        # Apply the same preprocessing that was fitted during training.
        scaled_samples = self.scaler.transform(samples)
        encoded_predictions = self.clf.predict(scaled_samples)
        # Map the encoded predictions back to the original label values.
        return self.label_encoder.inverse_transform(encoded_predictions)
53
+
54
+
55
+ # choose classifier
56
def setclf(clf_name):
    """Return a new, unfitted classifier for the short name chosen in the UI.

    Args:
        clf_name: one of ``'RF'``, ``'KNN'``, ``'DT'``, ``'SVM'`` or
            ``'Naive Bayes'``.

    Returns:
        A freshly constructed scikit-learn classifier.

    Raises:
        ValueError: if *clf_name* is not one of the supported names.
    """
    # Factories are lambdas so that only the requested estimator is built.
    factories = {
        'RF': lambda: RandomForestClassifier(n_jobs=-1),
        'KNN': lambda: KNeighborsClassifier(n_jobs=-1),
        'DT': lambda: DecisionTreeClassifier(),
        'SVM': lambda: SVC(C=1.0, kernel='rbf'),
        'Naive Bayes': lambda: GaussianNB(),
    }
    try:
        factory = factories[clf_name]
    except KeyError:
        # Fail here instead of handing None to the cross-validation code.
        raise ValueError(
            f"unknown classifier {clf_name!r}; expected one of {sorted(factories)}"
        ) from None
    return factory()
67
+
68
+
69
+ # cal score
70
def add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list):
    """Record the best-scoring candidate of one selection round.

    The position of the maximum of *temp_scores* (first one on ties) is added
    to both *selected_indices* (set) and *selected_indices_list* (ordered),
    and its score is appended to *current_score*. All three containers are
    modified in place; nothing is returned.
    """
    winner = np.asarray(temp_scores).argmax()
    selected_indices_list.append(winner)
    selected_indices.add(winner)
    current_score.append(temp_scores[winner])
75
+
76
+
77
+ # load data
78
def load_data(data, out_name):
    """Read a CSV whose last column is the label and split it into features and labels.

    Args:
        data: either an object exposing the file path as ``.name`` or a
            plain path (``str`` / ``os.PathLike``).
        out_name: when truthy the features are returned as a DataFrame
            (column names preserved); otherwise as a NumPy array.

    Returns:
        ``(X, y)`` where ``X`` holds every column but the last and ``y`` is a
        NumPy array of the last column.
    """
    import os

    # Path objects also have a ``.name`` (the bare file name), so test for
    # path-likes explicitly instead of probing the attribute.
    path = data if isinstance(data, (str, os.PathLike)) else data.name
    frame = pd.read_csv(path)

    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1].values
    if out_name:
        return features, labels
    return features.values, labels
88
+
89
+
90
def MRMR_FCD(data, testsample, num_fea_int):
    """Greedy mRMR feature ranking with the F-test / correlation *difference* score.

    A candidate's score in each round is its relevance minus its mean
    redundancy with the features picked so far:

    * relevance  - one-way ANOVA F statistic of the feature across the
      label groups;
    * redundancy - mean absolute Pearson correlation with the selected
      features.

    Args:
        data: training CSV (upload object or path), last column = label.
        testsample: test CSV in the same layout; loaded and returned as is.
        num_fea_int: number of features to select. At least one feature is
            always returned and never more than the data contains.

    Returns:
        ``(combined, X, y, test_samples, test_labels)`` where ``combined`` is
        a list of ``(feature_index, score)`` tuples in selection order with
        0-based indices.

    Raises:
        ValueError: if there are no feature columns or fewer than two classes.
    """
    X, y = load_data(data, False)
    # Load the test samples and labels.
    test_samples, test_labels = load_data(testsample, False)

    num_features = X.shape[1]
    if num_features == 0:
        raise ValueError("training data contains no feature columns")
    class_masks = [y == label for label in np.unique(y)]
    if len(class_masks) < 2:
        raise ValueError("at least two classes are required for the F-test")

    # f_oneway takes one sample per group, so each feature is split by class.
    relevance = np.array(
        [f_oneway(*(X[mask, i] for mask in class_masks))[0]
         for i in range(num_features)],
        dtype=float,
    )
    # An undefined statistic (NaN) must never win a round.
    relevance[np.isnan(relevance)] = -np.inf

    # Start from the most relevant feature.
    first = int(np.argmax(relevance))
    selected_indices_list = [first]
    current_score = [float(relevance[first])]
    available = np.ones(num_features, dtype=bool)
    available[first] = False
    # Running sum of |corr(candidate, selected feature)| per candidate.
    redundancy_sum = np.zeros(num_features)

    # Cap the rounds so an exhausted candidate pool is never searched.
    for _ in range(min(num_fea_int, num_features) - 1):
        newest = selected_indices_list[-1]
        candidates = np.flatnonzero(available)
        for i in candidates:
            corr = np.corrcoef(X[:, i], X[:, newest])[0, 1]
            # An undefined correlation (NaN) contributes no redundancy.
            if not np.isnan(corr):
                redundancy_sum[i] += abs(corr)
        mean_redundancy = redundancy_sum[candidates] / len(selected_indices_list)
        scores = relevance[candidates] - mean_redundancy
        best = int(np.argmax(scores))
        chosen = int(candidates[best])
        selected_indices_list.append(chosen)
        current_score.append(float(scores[best]))
        available[chosen] = False

    combined = list(zip(selected_indices_list, current_score))
    return combined, X, y, test_samples, test_labels
129
+
130
+
131
def MRMR_FCQ(data, testsample, num_fea_int):
    """Greedy mRMR feature ranking with the F-test / correlation *quotient* score.

    A candidate's score in each round is its relevance divided by its mean
    redundancy with the features picked so far:

    * relevance  - one-way ANOVA F statistic of the feature across the
      label groups;
    * redundancy - mean absolute Pearson correlation with the selected
      features, floored at a small positive value so the quotient is
      always defined.

    Args:
        data: training CSV (upload object or path), last column = label.
        testsample: test CSV in the same layout; loaded and returned as is.
        num_fea_int: number of features to select. At least one feature is
            always returned and never more than the data contains.

    Returns:
        ``(combined, X, y, test_samples, test_labels)`` where ``combined`` is
        a list of ``(feature_index, score)`` tuples in selection order with
        0-based indices.

    Raises:
        ValueError: if there are no feature columns or fewer than two classes.
    """
    # Lower bound for the denominator; avoids division by zero.
    min_redundancy = 1e-5

    X, y = load_data(data, False)
    # Load the test samples and labels.
    test_samples, test_labels = load_data(testsample, False)

    num_features = X.shape[1]
    if num_features == 0:
        raise ValueError("training data contains no feature columns")
    class_masks = [y == label for label in np.unique(y)]
    if len(class_masks) < 2:
        raise ValueError("at least two classes are required for the F-test")

    # f_oneway takes one sample per group, so each feature is split by class.
    relevance = np.array(
        [f_oneway(*(X[mask, i] for mask in class_masks))[0]
         for i in range(num_features)],
        dtype=float,
    )
    # An undefined statistic (NaN) must never win a round.
    relevance[np.isnan(relevance)] = -np.inf

    # Start from the most relevant feature.
    first = int(np.argmax(relevance))
    selected_indices_list = [first]
    current_score = [float(relevance[first])]
    available = np.ones(num_features, dtype=bool)
    available[first] = False
    # Running sum of |corr(candidate, selected feature)| per candidate.
    redundancy_sum = np.zeros(num_features)

    # Cap the rounds so an exhausted candidate pool is never searched.
    for _ in range(min(num_fea_int, num_features) - 1):
        newest = selected_indices_list[-1]
        candidates = np.flatnonzero(available)
        for i in candidates:
            corr = np.corrcoef(X[:, i], X[:, newest])[0, 1]
            # An undefined correlation (NaN) contributes no redundancy.
            if not np.isnan(corr):
                redundancy_sum[i] += abs(corr)
        mean_redundancy = redundancy_sum[candidates] / len(selected_indices_list)
        scores = relevance[candidates] / np.maximum(mean_redundancy, min_redundancy)
        best = int(np.argmax(scores))
        chosen = int(candidates[best])
        selected_indices_list.append(chosen)
        current_score.append(float(scores[best]))
        available[chosen] = False

    combined = list(zip(selected_indices_list, current_score))
    return combined, X, y, test_samples, test_labels
176
+
177
+
178
def index_score_csv(sorted_combined, filename):
    """Write the ``(index, score)`` rows to *filename* as CSV under an ``Index,Score`` header."""
    header = ["Index", "Score"]
    with open(filename, 'w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)  # column names
        for row in sorted_combined:
            out.writerow(row)
183
+
184
+
185
def isplot(num, width, height, title_gr, x, y, xlabbel, ylabel, filename):
    """Plot feature scores against feature indices and save the figure.

    Args:
        num: matplotlib figure number to draw on.
        width, height: figure size in inches.
        title_gr: plot title.
        x, y: data for the line plot.
        xlabbel, ylabel: axis labels.
        filename: path the image is written to.
    """
    fig = plt.figure(num=num, figsize=(width, height))
    # A numbered figure from an earlier call is reused by pyplot; clear it
    # so curves from previous requests do not pile up.
    fig.clf()
    try:
        plt.title(title_gr, fontsize=30)
        plt.plot(x, y)
        plt.xlabel(xlabel=xlabbel, fontsize=30)
        plt.ylabel(ylabel=ylabel, fontsize=30)
        plt.savefig(filename)
    finally:
        # Release the figure; the saved file is the only output.
        plt.close(fig)
192
+
193
+
194
def ifsplot(num, width, height, title_gr, max_index, max_acc, acc, xlabbel, ylabel, filename):
    """Plot the incremental-feature-selection accuracy curve and mark its best point.

    Args:
        num: matplotlib figure number to draw on.
        width, height: figure size in inches.
        title_gr: method name inserted into the title.
        max_index: 1-based number of features giving the best accuracy.
        max_acc: the best accuracy value.
        acc: accuracies for the top 1, 2, ... features.
        xlabbel, ylabel: axis labels.
        filename: path the image is written to.
    """
    fig = plt.figure(num=num, figsize=(width, height))
    # A numbered figure from an earlier call is reused by pyplot; clear it
    # so curves from previous requests do not pile up.
    fig.clf()
    try:
        plt.title("IFS_" + title_gr + "_Accuracy", fontsize=40)
        plt.plot(max_index, max_acc, 'ro')
        # acc[k] belongs to the top k+1 features; plot it on the same
        # 1-based x axis as the marker so the two line up.
        plt.plot(range(1, len(acc) + 1), acc)
        plt.annotate(f'({max_index}, {max_acc})', (max_index, max_acc),
                     textcoords="offset points", xytext=(-5, 20),
                     ha='center', fontsize=40)
        # Axis labels.
        plt.xlabel(xlabel=xlabbel, fontsize=40)
        plt.ylabel(ylabel=ylabel, fontsize=40)
        plt.savefig(filename)
    finally:
        # Release the figure; the saved file is the only output.
        plt.close(fig)
205
+
206
+
207
def cmplot(num, width, height, cm, xlabbel, ylabel, filename):
    """Draw a confusion matrix as an annotated heatmap and save it.

    Args:
        num: matplotlib figure number to draw on.
        width, height: figure size in inches.
        cm: confusion matrix of integer counts.
        xlabbel, ylabel: axis labels.
        filename: path the image is written to.
    """
    fig = plt.figure(num=num, figsize=(width, height))
    # A numbered figure from an earlier call is reused by pyplot; clear it
    # so heatmaps from previous requests do not pile up.
    fig.clf()
    try:
        sns.heatmap(cm, annot=True, fmt='d')
        plt.xlabel(xlabel=xlabbel, fontsize=40)
        # The y label must be set with ylabel(); plot() with only keyword
        # arguments draws nothing and sets no label.
        plt.ylabel(ylabel=ylabel, fontsize=40)
        plt.grid(True)
        plt.savefig(filename)
    finally:
        # Release the figure; the saved file is the only output.
        plt.close(fig)
216
+
217
+
218
def des(choicce):
    """Return one piece of the Gradio interface configuration.

    Args:
        choicce: ``"title"``, ``"description"``, ``"article"``, ``"inputs"``
            or ``"outputs"``.

    Returns:
        The title / description / article text, or a new list of Gradio
        input or output components.

    Raises:
        ValueError: if *choicce* is not one of the keys above.
    """
    title = "FSALs: Robust Feature selection framework"
    description = r"""<center><img src='https://raw.githubusercontent.com/Justin-12138/bio_if/d1fdf085f8e679dcceecc2c05014b1d4a237e033/assets/favicon.svg' alt='FSALs logo'></center>
<b>Official Gradio demo</b> for <a href='https://huggingface.co/spaces/Justin-12138/FSALA' target='_blank'><b>Application of Causal Inference in Alzheimer's Disease(CCFC2023)</b></a>.<br>
🔥 Fsals is a Robust feature selection framework based on causal inference. <br>
🤗 Try using fsals in different data sets.!<br>
"""
    # NOTE(review): the contact address in the text below reads
    # "[email protected]", which looks like a placeholder - confirm the
    # intended address.
    article = r"""
If FSALs is helpful, please help to ⭐ the <a href='https://github.com/Justin-12138/bio_if' target='_blank'>Github Repo</a>. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/Justin-12138/bio_if?style=social)](https://github.com/Justin-12138/bio_if)

---

📝 **Citation**

If our work is useful for your research, please consider citing:
```bibtex
@article{zlhl2023,
author = {Xiaolong Zhou, Zhao Liu, Yuchen Huang, Kun Lin},
title = {A Novel Ensemble Feature Selection Method for Biomarkers of Alzheimer's disease},
booktitle = {GUET Publisher},
year = {2023}
}
```
📋 **License**

This project is licensed under <a rel="license" href="https://github.com/Justin-12138/bio_if/blob/main/LICENSE">GPL License 2.0</a>.
Redistribution and use for non-commercial purposes should follow this license.

📧 **Contact**

If you have any questions, please feel free to reach me out at <b>[email protected]</b>.

<div>
🤗 Find Me:
<a href="https://github.com/Justin-12138"><img style="margin-top:0.5em; margin-bottom:2em" src="https://img.shields.io/github/followers/Justin-12138?style=social" alt="Github Follow"></a>
</div>
"""
    if choicce == "title":
        return title
    if choicce == "description":
        return description
    if choicce == "article":
        return article
    if choicce == 'inputs':
        # Top-level components, matching the outputs below; the
        # ``gr.inputs`` namespace is deprecated.
        return [
            gr.File(label="Training data"),
            gr.Radio(choices=['MRMR_FCD', 'MRMR_FCQ', 'CFS', 'Lasso', 'Ensemble', 'CI'],
                     label="method"),
            gr.Number(label="Num_feature(int)"),
            gr.Radio(choices=['RF', 'SVM', 'KNN', 'DT', 'Naive Bayes'],
                     label="classifier for CV"),
            gr.File(label="Testing data"),
        ]
    if choicce == 'outputs':
        return [
            gr.Image(label="Index_score"),
            gr.Image(label="IFS_Acc"),
            gr.Image(label="Confusion_matrix"),
            gr.File(label='Index_score.csv'),
        ]
    # A mistyped key previously returned None without any error.
    raise ValueError(f"unknown interface element {choicce!r}")
276
+
277
+
278
def cv(X, y, index_0, clf, n_fold):
    """Cross-validate *clf* on growing prefixes of a ranked feature list.

    Args:
        X: 2-D feature array indexed as ``X[:, columns]``.
        y: labels.
        index_0: 0-based column indices of X, best feature first.
        clf: estimator passed to ``cross_val_score``.
        n_fold: value passed as ``cv`` to ``cross_val_score``.

    Returns:
        ``(acc, max_acc, max_index)``: ``acc[k]`` is the mean score using the
        first ``k + 1`` features, ``max_acc`` is the best mean rounded to four
        decimals and ``max_index`` is the 1-based number of features that
        first reaches it.

    Raises:
        ValueError: if *index_0* is empty.
    """
    if len(index_0) == 0:
        raise ValueError("index_0 must contain at least one feature index")

    # index_0 is already 0-based, so it is used as is. Subtracting 1 again
    # would map feature 0 to column -1 and shift every other feature.
    columns = [int(j) for j in index_0]

    acc = []
    for top_n in range(1, len(columns) + 1):
        # Cross-validate using the first top_n ranked features.
        selected_features = X[:, columns[:top_n]]
        scores = cross_val_score(clf, selected_features, y, cv=n_fold)
        acc.append(scores.mean())

    best = max(acc)
    max_acc = round(best, 4)
    max_index = acc.index(best) + 1
    return acc, max_acc, max_index
289
+
290
+
291
def getindex_1(sorted_combined):
    """Split ranked ``(index, score)`` pairs into parallel lists.

    Returns:
        ``(index_1, index_0, scores)``: the 1-based indices as strings, the
        0-based indices as ints (derived back from the strings) and the
        scores, all in the order of *sorted_combined*.
    """
    pairs = list(sorted_combined)
    index_1 = [str(pair[0] + 1) for pair in pairs]
    scores = [pair[1] for pair in pairs]
    index_0 = [int(label) - 1 for label in index_1]
    return index_1, index_0, scores
301
+
302
+
303
def load_model(X, y, test_samples, test_labels):
    """Train an RBF-kernel SVC through ``MyModel`` and return its confusion matrix.

    The model is trained on ``(X, y)``, used to predict *test_samples*, and
    the predictions are compared against *test_labels*.
    """
    wrapper = MyModel(SVC(C=1.0, kernel='rbf'))
    wrapper.train(X, y)
    # Predict the test samples, then tabulate true vs. predicted labels.
    predicted = wrapper.predict_samples(test_samples)
    return confusion_matrix(test_labels, predicted)
312
+
313
+
314
def lasso(data, testsample, num_fea_int):
    """Rank features by the absolute coefficient of a cross-validated LassoLars fit.

    The ranking is also written to ``index-score.csv``.

    Returns:
        ``(X, y, inde, score, test_samples, test_labels, num_fea_int)`` where
        ``X`` is the training DataFrame, ``inde`` / ``score`` are the feature
        positions and their absolute coefficients in descending score order,
        and ``num_fea_int`` is passed through unchanged.
    """
    X, y = load_data(data, True)
    test_samples, test_labels = load_data(testsample, False)

    model = LassoLarsCV(cv=20, max_iter=80000)
    model.fit(X, y)
    importance = np.abs(model.coef_)

    # Positions ordered by decreasing importance, one per feature column.
    n_columns = len(list(X))
    order = (-importance).argsort()[:n_columns]

    ranking = pd.DataFrame({'index': order, 'Score': importance[order]})
    ranking = ranking.sort_values(by='Score', ascending=False, ignore_index=True)
    ranking.to_csv("index-score.csv")

    inde = ranking['index'].tolist()
    score = ranking['Score'].tolist()
    return X, y, inde, score, test_samples, test_labels, num_fea_int
329
+
330
+
331
def fs(data, method, num_fea_int, clf, testsample):
    """Run one feature-selection method end to end and produce the demo outputs.

    Steps: rank the features, plot score against index, cross-validate the
    chosen classifier on growing prefixes of the ranking, plot that accuracy
    curve, then train the SVC wrapper and plot its confusion matrix on the
    test file.

    Args:
        data: training CSV upload, last column = label.
        method: ``'MRMR_FCD'``, ``'MRMR_FCQ'`` or ``'Lasso'``.
        num_fea_int: number of top features to keep (converted with ``int``).
        clf: classifier short name understood by ``setclf``.
        testsample: test CSV upload in the same layout as *data*.

    Returns:
        Paths of the index/score plot, the accuracy plot, the confusion
        matrix plot and the index/score CSV, in that order.

    Raises:
        NotImplementedError: for 'CFS', 'Ensemble' and 'CI', which the UI
            offers but which have no implementation yet.
        ValueError: for any other unknown method.
    """
    num_fea_int = int(num_fea_int)

    # Reject unsupported choices explicitly; the old code returned None.
    if method in ('CFS', 'Ensemble', 'CI'):
        raise NotImplementedError(
            f"feature selection method {method!r} is not implemented yet")
    if method not in ('MRMR_FCD', 'MRMR_FCQ', 'Lasso'):
        raise ValueError(f"unknown feature selection method {method!r}")

    if method == 'Lasso':
        X, y, inde, score, test_samples, test_labels, num_fea_int = lasso(
            data, testsample, num_fea_int)
        index = [str(i) for i in inde]

        fig = plt.figure(1, figsize=(24, 12))
        # Clear whatever an earlier request drew on this numbered figure.
        fig.clf()
        try:
            plt.title(str(method))
            plt.plot(index[:num_fea_int], score[:num_fea_int])
            # Axis labels.
            plt.xlabel('Feature Index', fontsize=40)
            plt.ylabel('Feature Score', fontsize=40)
            plt.savefig('Index_Score.png')
        finally:
            plt.close(fig)

        index_0 = inde[:num_fea_int]
        X = X.values
        n_fold = 5
        score_png, score_csv = 'Index_Score.png', 'index-score.csv'
    else:
        # The two mRMR variants differ only in the selector and fold count.
        if method == 'MRMR_FCD':
            selector, n_fold = MRMR_FCD, 10
        else:
            selector, n_fold = MRMR_FCQ, 5
        combined, X, y, test_samples, test_labels = selector(
            data=data, testsample=testsample, num_fea_int=num_fea_int)
        # Order the selected features by score, highest first.
        sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)
        index_score_csv(sorted_combined=sorted_combined, filename='ab.csv')
        index_1, index_0, scores = getindex_1(sorted_combined=sorted_combined)
        # Index/score plot.
        isplot(1, 24, 10,
               title_gr=str(method), x=index_1, y=scores,
               xlabbel="index", ylabel="scores", filename="index-score.png")
        score_png, score_csv = 'index-score.png', 'ab.csv'

    # Pick the classifier and cross-validate on growing feature prefixes.
    clf = setclf(clf)
    acc, max_acc, max_index = cv(X=X, y=y, index_0=index_0, clf=clf, n_fold=n_fold)
    ifsplot(2, 24, 10,
            title_gr=str(method), max_index=max_index, max_acc=max_acc,
            acc=acc, xlabbel="top n features", ylabel="acc", filename="acc.png")

    # Confusion matrix of the SVC wrapper on the test file.
    cm = load_model(X=X, y=y, test_samples=test_samples, test_labels=test_labels)
    cmplot(3, 24, 10, cm=cm,
           xlabbel="predicted labels", ylabel="true labels",
           filename='confusion_matrix.png')

    return score_png, 'acc.png', "confusion_matrix.png", score_csv