Justin-12138 committed on
Commit 1f7e2a8 · 1 Parent(s): 3da65a3

Upload app.py

Files changed (1)
    app.py  +43 -135
app.py CHANGED
@@ -16,6 +16,7 @@ from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
+ from sklearn.linear_model import LassoLarsCV


# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
@@ -28,28 +29,22 @@ def add_max_score_to_list(temp_scores, current_score, selected_indices, selected


def fs(data, method, num_fea_int, clf):
- data = pd.read_csv(data.name)
- X = data.iloc[:, :-1].values
- y = data['Label'].values
num_fea_int = int(num_fea_int)
if method == 'MRMR_FCD':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1].values
+ y = data['Label'].values
num_features = len(X[0])
-
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_features)]
# append the starting feature's score to current_score
current_score = [max(f_test_scores)]
-
- # indices start at 0
- # start_feature_index = random.randint(0, num_features - 1)
# start from the index of the highest-scoring feature
start_feature_index = f_test_scores.index(max(f_test_scores))
selected_indices = set()
selected_indices_list = []
selected_indices.add(start_feature_index)
selected_indices_list.append(start_feature_index)
-
pearson_score_matrix = np.zeros((num_features, num_features))
-
for _ in range(num_fea_int - 1):
temp_scores = []
for i in range(num_features):
@@ -68,7 +63,6 @@ def fs(data, method, num_fea_int, clf):
if pearson_score_matrix[j][i] == 0:
pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
diff += pearson_score_matrix[j][i]
- # diff += np.corrcoef(X[:,i], X[:,j])[0, 1]
temp_scores.append(f_test_score - diff / len(selected_indices))
add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
combined = list(zip(selected_indices_list, current_score))
@@ -129,6 +123,9 @@ def fs(data, method, num_fea_int, clf):
return 'output.png'

elif method == 'MRMR_FCQ':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1].values
+ y = data['Label'].values
num_fea_inttures = len(X[0])
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_fea_inttures)]

@@ -222,131 +219,43 @@ def fs(data, method, num_fea_int, clf):
plt.grid(True)
plt.savefig('output.png')
return 'output.png'
-
# Add your code here. For now we write everything as functions and wrap them into classes later; the main goal is to get a working prototype first.
# The final result is currently an image: one plot of the selected feature indices with their scores, and one plot of incremental-feature-selection accuracy.
# A lot of the code above can still be optimized (plotting, classifier selection, and so on), but don't worry about that yet. Finish the remaining elif branches below first, and then we can discuss refactoring.
- elif method == 'CFS':
- def loadDataSet(fileName):
- df = pd.read_csv(fileName)
- return df
-
- def writesortedlist(filename, thelist):
- with open(filename, "w") as fw:
- for item in thelist:
- fw.write(item[0] + "\t" + str(item[1]) + "\n")
-
- def writethelist(filename, thelist):
- with open(filename, "w") as fw:
- for item in thelist:
- fw.write(item + "\n")
-
- def getdatadf(datafile):
- datadf = loadDataSet(datafile)
- labellist = datadf["Label"].tolist()
- del datadf["Label"]
- return datadf, labellist
-
- def CFSmethod(datafile):
-
- datadf, labellist = getdatadf(datafile)
- print(datadf)
- selectdf = datadf.copy()
- allflist = datadf.columns.tolist()
- namelist = list(datadf.index)
- print(namelist)
- namelist = [int(var) for var in namelist]
- selectdf["class"] = namelist
-
- bestfset,sortlist = calBFset(selectdf, allflist)
- writethelist("bestfeature.txt", bestfset)  # save the best feature subset
-
- return dict(sortlist)
-
- def calmulmerit(selectdf, sublist):
- retvalue = 0
- label = "class"
- k = len(sublist)
- namelist = list(selectdf["class"])
- classset = set(namelist)
- caldf = selectdf[sublist]
- allvalue = 0.0
- for feature in sublist:
- caldf = selectdf[sublist]
- middlevalue = 0.0
- for ind in classset:
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
- coeff = pointbiserialr(caldf[feature], caldf[label])
- middlevalue = abs(coeff.correlation) + middlevalue
- allvalue = middlevalue / float(len(classset)) + allvalue
- allvalue = allvalue / float(k)
-
- corr = selectdf[sublist].corr()
- corr.values[np.tril_indices_from(corr.values)] = np.nan
- corr = abs(corr)
- rff = corr.unstack().mean()
- retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
- print(retvalue)
- return retvalue
-
- def calBFset(selectdf, allflist):
- allfdict = getallfscoredict(selectdf, allflist)
- sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
- writesortedlist("sorteddict.txt", sortedflist)  # save feature scores in descending order
- feaS = []
- feaS.append(sortedflist[0][0])
- maxvalue = sortedflist[0][1]
- for i in range(1, len(sortedflist)):
- print(str(i) + "/" + str(len(sortedflist)))
- itemf = sortedflist[i][0]
- feaS.append(itemf)
- newvalue = calmulmerit(selectdf, feaS)
- if newvalue > maxvalue:
- maxvalue = newvalue
- else:
- feaS.pop()
- print(feaS)
- return feaS,sortedflist
-
- def getallfscoredict(selectdf, allflist):
- retdict = {}
- k = 1
- for f in allflist:
- print(k)
- k = k + 1
- score = calonemerit(selectdf, f)
- if math.isnan(score):
- continue
- retdict[f] = score
- return retdict
-
- def calonemerit(selectdf, subname):
- retvalue = 0
- label = "class"
- namelist = list(selectdf["class"])
- classset = set(namelist)
- caldf = selectdf[subname].to_frame()
- allvalue = 0.0
- for ind in classset:
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
- coeff = pointbiserialr(caldf[subname], caldf[label])
- allvalue = abs(coeff.correlation) + allvalue
- allvalue = allvalue / float(len(classset))
- return allvalue
-
- # get the feature scores
- sortdict=CFSmethod(data.name)
- # plotting
+ elif method == 'Lasso':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1]
+ y = data.iloc[:, -1:].values.flatten()
+
+ cl = LassoLarsCV(cv=20, max_iter=80000).fit(X, y)
+
+ importance = np.abs(cl.coef_)
+ feature_names = list(X)
+ print(feature_names)
+ a = len(feature_names)
+
+ idx_features = (-importance).argsort()[:a]
+ print(idx_features)
+ name_features = np.array(feature_names)[idx_features]
+ for i in range(a):
+ print((name_features)[i], importance[idx_features][i])
+ result = pd.DataFrame({'index': idx_features, 'Score': importance[idx_features]})
+ result_rank = result.sort_values(by='Score', ascending=False, ignore_index=True)
+ inde = result_rank['index'].tolist()
+ score = result_rank['Score'].tolist()
+
+ index = []
+ for i in inde:
+ index.append(str(i))
fig = plt.figure(figsize=(24, 12))
ax1 = fig.add_subplot(211)
ax1.set_title(str(method))
- indexlist=list(range(1,len(sortdict.keys()+1)))
- ax1.plot(indexlist, sortdict.values())  # feature-score plot
+ ax1.plot(index[:num_fea_int], score[:num_fea_int])
+
# set the x-axis and y-axis labels
ax1.set_xlabel('Feature Index')
ax1.set_ylabel('Feature Score')

- # classifier
if clf == 'RF':
clf = RandomForestClassifier(n_jobs=-1)
elif clf == 'KNN':
@@ -357,20 +266,24 @@ def fs(data, method, num_fea_int, clf):
clf = SVC()
elif clf == 'Naive Bayes':
clf = GaussianNB()
- # plot the cross-validation results
+ inde = inde[:num_fea_int]
+ index = index[:num_fea_int]
acc = []
# for each feature index in the index list
- for i in range(len(indexlist)):
+
+ X = data.iloc[:, :-1].values
+ print(X)
+ for i in range(len(index)):
# cross-validate using the first i features
- selected_features = X[:,0:i]
+ selected_features = X[:, [int(j) - 1 for j in inde[:i + 1]]]
scores = cross_val_score(clf, selected_features, y, cv=5)
# compute the mean accuracy and append it to acc
acc.append(scores.mean())
max_acc = max(acc)
- max_index = acc.index(max_acc)  # probably no need to add 1
+ max_index = acc.index(max_acc) + 1

ax2 = fig.add_subplot(212)
- ax2.set_title("IFS_mRMR_FCD_Accuracy")
+ ax2.set_title("IFS_" + str(method) + "_Accuracy")
ax2.plot(max_index, max_acc, 'ro')
ax2.plot(acc)
ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
@@ -382,9 +295,6 @@ def fs(data, method, num_fea_int, clf):
plt.savefig('output.png')
return 'output.png'

- pass
- elif method == 'Lasso':
- pass
elif method == 'Ensemble':
pass
elif method == 'CI':
@@ -430,8 +340,6 @@ If you have any questions, please feel free to reach me out at <b>justinliu707@g
</div>
"""

-
-
iface = gr.Interface(
fn=fs,
title=title,
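
Below is a minimal standalone sketch (not part of app.py) of the idea behind the new 'Lasso' branch: rank features by the absolute LassoLarsCV coefficient, then run incremental feature selection with cross-validated accuracy. The synthetic data, column names, and hyperparameters here are illustrative assumptions, not values taken from this commit.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import cross_val_score

# Synthetic stand-in for the uploaded CSV (feature columns plus a 'Label' column).
X_arr, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(X_arr.shape[1])])

# Rank features by absolute Lasso coefficient (the importance score the new branch uses).
lasso = LassoLarsCV(cv=5).fit(X, y)
ranked = np.argsort(-np.abs(lasso.coef_))  # column positions, most important first

# Incremental feature selection: 5-fold accuracy of the top-k ranked features.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
acc = []
for k in range(1, 11):
    selected = X.iloc[:, ranked[:k]].values
    acc.append(cross_val_score(clf, selected, y, cv=5).mean())

best_k = int(np.argmax(acc)) + 1
print(f"best number of features: {best_k}, accuracy: {max(acc):.3f}")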
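
For reference, here is a small sketch, assuming it mirrors the unchanged MRMR_FCD branch above, of one greedy mRMR-FCD step: each candidate's score is its F-test relevance minus its mean Pearson correlation with the features already selected. The random data is purely illustrative.

import numpy as np
from scipy.stats import f_oneway

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
y = rng.integers(0, 2, size=100)

# Relevance: F statistic between each feature column and the label vector,
# computed the same way as in the MRMR_FCD branch.
f_scores = [f_oneway(X[:, i], y)[0] for i in range(X.shape[1])]
selected = [int(np.argmax(f_scores))]  # start from the highest-scoring feature

# One greedy step: relevance minus mean correlation with the already-selected set.
candidate_scores = {}
for i in range(X.shape[1]):
    if i in selected:
        continue
    redundancy = np.mean([np.corrcoef(X[:, i], X[:, j])[0, 1] for j in selected])
    candidate_scores[i] = f_scores[i] - redundancy

next_feature = max(candidate_scores, key=candidate_scores.get)
print("selected:", selected, "next:", next_feature)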