Justin-12138 committed on
Commit
2cd96ed
·
1 Parent(s): 9529ea8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -2
app.py CHANGED
@@ -222,6 +222,166 @@ def fs(data, method, num_fea_int, clf):
222
  # 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
223
  # 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
224
  # 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  elif method == 'Lasso':
226
  data = pd.read_csv(data.name)
227
  X = data.iloc[:, :-1]
@@ -357,8 +517,8 @@ iface = gr.Interface(
357
  ["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
358
  ["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
359
  ["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
360
- ["example_data.csv", 'Lasso', 50, 'DT'],
361
- ["example_data.csv", 'Lasso', 40, 'Naive Bayes'],
362
  ],
363
  allow_flagging="never"
364
  )
 
222
  # 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
223
  # 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
224
  # 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
225
+ elif method == 'CFS':
226
+ X = data.iloc[:, :-1].values
227
+ y = data['Label'].values
228
+
229
def loadDataSet(fileName):
    """Read the CSV file at ``fileName`` and return it as a pandas DataFrame."""
    return pd.read_csv(fileName)
232
+
233
def writesortedlist(filename, thelist):
    """Write ranked ``(feature, score)`` entries to ``filename``.

    Each entry of ``thelist`` becomes one line of the form
    ``<item[0]>\\t<item[1]>``. Any existing file is overwritten.

    Args:
        filename: Path of the text file to create.
        thelist: Iterable of indexable items; element 0 is the feature
            name and element 1 is its score.
    """
    # Formatting through an f-string (instead of ``item[0] + "\t"``) keeps
    # this working when a feature name is not a ``str``.
    # An explicit encoding avoids locale-dependent failures on non-ASCII names.
    with open(filename, "w", encoding="utf-8") as fw:
        fw.writelines(f"{item[0]}\t{item[1]}\n" for item in thelist)
237
+
238
def writethelist(filename, thelist):
    """Write every element of ``thelist`` to ``filename``, one per line.

    Any existing file is overwritten.

    Args:
        filename: Path of the text file to create.
        thelist: Iterable of items (typically feature names) to write.
    """
    # Formatting through an f-string (instead of ``item + "\n"``) keeps this
    # working when an item is not a ``str``.
    # An explicit encoding avoids locale-dependent failures on non-ASCII names.
    with open(filename, "w", encoding="utf-8") as fw:
        fw.writelines(f"{item}\n" for item in thelist)
242
+
243
def getdatadf(datafile):
    """Load ``datafile`` and split it into features and labels.

    Returns:
        A ``(frame, labels)`` tuple: the loaded DataFrame with its
        ``"Label"`` column removed, and that column's values as a list.
    """
    frame = loadDataSet(datafile)
    # pop() removes the column from the frame and hands back its values.
    labels = frame.pop("Label").tolist()
    return frame, labels
248
+
249
def CFSmethod(datafile):
    """Rank the features of ``datafile`` by their CFS single-feature merit.

    Args:
        datafile: Path to a CSV file containing feature columns plus a
            ``"Label"`` column.

    Returns:
        dict mapping feature name to score, in the descending-score order
        produced by ``calBFset``.
    """
    datadf, labellist = getdatadf(datafile)
    allflist = datadf.columns.tolist()

    selectdf = datadf.copy()
    # Use the real class labels. The previous code stored the DataFrame's
    # row index in "class" and never used ``labellist``; with the default
    # RangeIndex from read_csv that made every sample its own class.
    selectdf["class"] = labellist

    # Only the ranking is returned; the selected best subset is not used here.
    _bestfset, sortlist = calBFset(selectdf, allflist)
    # writethelist("bestfeature.txt", _bestfset)  # optionally save the best subset

    return dict(sortlist)
264
+
265
def calmulmerit(selectdf, sublist):
    """Return the CFS merit of the feature subset ``sublist``.

    The merit is ``k * r_cf / sqrt(k + k * (k - 1) * r_ff)`` where ``k`` is
    the subset size, ``r_cf`` is the mean (over features and over classes)
    absolute point-biserial correlation between a feature and a one-vs-rest
    class indicator, and ``r_ff`` is the mean absolute pairwise correlation
    between the features of the subset.

    Args:
        selectdf: DataFrame holding the feature columns and a ``"class"``
            column with the class of each row. It is not modified.
        sublist: List of feature column names forming the subset.

    Returns:
        The merit value. It is NaN when ``r_ff`` cannot be computed (every
        pairwise correlation is NaN).
    """
    label = "class"
    k = len(sublist)
    class_values = selectdf[label]
    classset = set(class_values)

    # Build each one-vs-rest indicator once, as a standalone array, instead
    # of re-assigning a column on a slice of ``selectdf`` for every feature.
    indicators = [np.where(class_values == ind, 1, 0) for ind in classset]

    allvalue = 0.0
    for feature in sublist:
        middlevalue = 0.0
        for indicator in indicators:
            coeff = pointbiserialr(selectdf[feature], indicator)
            middlevalue = abs(coeff.correlation) + middlevalue
        allvalue = middlevalue / float(len(classset)) + allvalue
    allvalue = allvalue / float(k)

    if k > 1:
        # Mean |corr| over the strict upper triangle, ignoring NaN entries.
        # Reading the triangle avoids writing into ``corr.values`` in place,
        # which fails when pandas hands out a read-only array.
        corr = selectdf[sublist].corr().abs().to_numpy()
        upper = corr[np.triu_indices_from(corr, k=1)]
        upper = upper[~np.isnan(upper)]
        rff = upper.mean() if upper.size else float("nan")
    else:
        # A single feature has no feature-feature pairs; the merit is r_cf.
        rff = 0.0

    retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
    print(retvalue)
    return retvalue
290
+
291
def calBFset(selectdf, allflist):
    """Greedy forward selection of a feature subset by CFS merit.

    Features are ranked by their single-feature score. Starting from the
    best one, each following feature is tentatively added and kept only if
    it raises the subset merit returned by ``calmulmerit``.

    Args:
        selectdf: DataFrame with the feature columns and a ``"class"`` column.
        allflist: Names of the candidate feature columns.

    Returns:
        ``(feaS, sortedflist)``: the selected feature names, and the list of
        ``(feature, score)`` pairs sorted by descending score. Both are
        empty when no feature received a score.
    """
    allfdict = getallfscoredict(selectdf, allflist)
    sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
    # writesortedlist("sorteddict.txt", sortedflist)  # optionally save the ranking

    if not sortedflist:
        # Every feature was skipped (NaN score): nothing to rank or select.
        return [], sortedflist

    total = len(sortedflist)
    best_name, maxvalue = sortedflist[0]
    feaS = [best_name]
    for i, (itemf, _score) in enumerate(sortedflist[1:], start=1):
        print(f"{i}/{total}")
        candidate = feaS + [itemf]
        newvalue = calmulmerit(selectdf, candidate)
        # Keep the feature only when it strictly improves the merit; a NaN
        # merit compares False and therefore rejects the candidate.
        if newvalue > maxvalue:
            maxvalue = newvalue
            feaS = candidate
    print(feaS)
    return feaS, sortedflist
309
+
310
def getallfscoredict(selectdf, allflist):
    """Score each feature in ``allflist`` individually.

    Prints a 1-based progress counter per feature. Features whose score is
    NaN are left out of the result.

    Returns:
        dict mapping feature name to the value returned by ``calonemerit``.
    """
    scores = {}
    for position, feature in enumerate(allflist, start=1):
        print(position)
        merit = calonemerit(selectdf, feature)
        if not math.isnan(merit):
            scores[feature] = merit
    return scores
321
+
322
def calonemerit(selectdf, subname):
    """Return the single-feature merit of column ``subname``.

    The merit is the absolute point-biserial correlation between the
    feature and a one-vs-rest indicator of each distinct value in the
    ``"class"`` column, averaged over those values.
    """
    feature_values = selectdf[subname]
    class_values = selectdf["class"]
    distinct_classes = set(class_values)

    total = sum(
        abs(pointbiserialr(feature_values, np.where(class_values == cls, 1, 0)).correlation)
        for cls in distinct_classes
    )
    return total / float(len(distinct_classes))
335
+
336
+ # 获取特征分数
337
+ sortdict = CFSmethod(data.name)
338
+ # 画图
339
+ fig = plt.figure(figsize=(24, 12))
340
+ ax1 = fig.add_subplot(211)
341
+ ax1.set_title(str(method))
342
+ indexlist = list(range(1, len(sortdict.keys() + 1)))
343
+ ax1.plot(indexlist, sortdict.values()) # 特征分数图
344
+ # 设置x轴和y轴的标签
345
+ ax1.set_xlabel('Feature Index')
346
+ ax1.set_ylabel('Feature Score')
347
+
348
+ # 分类器
349
+ if clf == 'RF':
350
+ clf = RandomForestClassifier(n_jobs=-1)
351
+ elif clf == 'KNN':
352
+ clf = KNeighborsClassifier()
353
+ elif clf == 'DT':
354
+ clf = DecisionTreeClassifier()
355
+ elif clf == 'SVM':
356
+ clf = SVC()
357
+ elif clf == 'Naive Bayes':
358
+ clf = GaussianNB()
359
+ # 画交叉验证图
360
+ acc = []
361
+ # 对于index列表中的每个特征索引
362
+ for i in range(len(indexlist)):
363
+ # 使用前i个特征进行交叉验证
364
+ selected_features = X[:, 0:i]
365
+ scores = cross_val_score(clf, selected_features, y, cv=5)
366
+ # 计算平均准确率并添加到acc列表中
367
+ acc.append(scores.mean())
368
+ max_acc = max(acc)
369
+ max_index = acc.index(max_acc) # 应该不用加1吧
370
+
371
+ ax2 = fig.add_subplot(212)
372
+ ax2.set_title("IFS_mRMR_FCD_Accuracy")
373
+ ax2.plot(max_index, max_acc, 'ro')
374
+ ax2.plot(acc)
375
+ ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
376
+ ha='center')
377
+ # 设置x轴和y轴的标签
378
+ ax2.set_xlabel('Top n features')
379
+ ax2.set_ylabel('Accuracy')
380
+ plt.grid(True)
381
+ plt.savefig('output.png')
382
+ return 'output.png'
383
+
384
+ pass
385
  elif method == 'Lasso':
386
  data = pd.read_csv(data.name)
387
  X = data.iloc[:, :-1]
 
517
  ["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
518
  ["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
519
  ["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
520
+ ["example_data.csv", 'CFS', 50, 'DT'],
521
+ ["example_data.csv", 'CFS', 40, 'Naive Bayes'],
522
  ],
523
  allow_flagging="never"
524
  )