Justin-12138 committed on
Commit
77bd380
·
1 Parent(s): 2cd96ed

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -162
app.py CHANGED
@@ -222,166 +222,6 @@ def fs(data, method, num_fea_int, clf):
222
  # 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
223
  # 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
224
  # 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
225
- elif method == 'CFS':
226
- X = data.iloc[:, :-1].values
227
- y = data['Label'].values
228
-
229
- def loadDataSet(fileName):
230
- df = pd.read_csv(fileName)
231
- return df
232
-
233
- def writesortedlist(filename, thelist):
234
- with open(filename, "w") as fw:
235
- for item in thelist:
236
- fw.write(item[0] + "\t" + str(item[1]) + "\n")
237
-
238
- def writethelist(filename, thelist):
239
- with open(filename, "w") as fw:
240
- for item in thelist:
241
- fw.write(item + "\n")
242
-
243
- def getdatadf(datafile):
244
- datadf = loadDataSet(datafile)
245
- labellist = datadf["Label"].tolist()
246
- del datadf["Label"]
247
- return datadf, labellist
248
-
249
- def CFSmethod(datafile):
250
-
251
- datadf, labellist = getdatadf(datafile)
252
- print(datadf)
253
- selectdf = datadf.copy()
254
- allflist = datadf.columns.tolist()
255
- namelist = list(datadf.index)
256
- print(namelist)
257
- namelist = [int(var) for var in namelist]
258
- selectdf["class"] = namelist
259
-
260
- bestfset, sortlist = calBFset(selectdf, allflist)
261
- # writethelist("bestfeature.txt", bestfset) # 保存最佳特征子集
262
-
263
- return dict(sortlist)
264
-
265
- def calmulmerit(selectdf, sublist):
266
- retvalue = 0
267
- label = "class"
268
- k = len(sublist)
269
- namelist = list(selectdf["class"])
270
- classset = set(namelist)
271
- caldf = selectdf[sublist]
272
- allvalue = 0.0
273
- for feature in sublist:
274
- caldf = selectdf[sublist]
275
- middlevalue = 0.0
276
- for ind in classset:
277
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
278
- coeff = pointbiserialr(caldf[feature], caldf[label])
279
- middlevalue = abs(coeff.correlation) + middlevalue
280
- allvalue = middlevalue / float(len(classset)) + allvalue
281
- allvalue = allvalue / float(k)
282
-
283
- corr = selectdf[sublist].corr()
284
- corr.values[np.tril_indices_from(corr.values)] = np.nan
285
- corr = abs(corr)
286
- rff = corr.unstack().mean()
287
- retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
288
- print(retvalue)
289
- return retvalue
290
-
291
- def calBFset(selectdf, allflist):
292
- allfdict = getallfscoredict(selectdf, allflist)
293
- sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
294
- # writesortedlist("sorteddict.txt", sortedflist) # 保存特征得分的降序
295
- feaS = []
296
- feaS.append(sortedflist[0][0])
297
- maxvalue = sortedflist[0][1]
298
- for i in range(1, len(sortedflist)):
299
- print(str(i) + "/" + str(len(sortedflist)))
300
- itemf = sortedflist[i][0]
301
- feaS.append(itemf)
302
- newvalue = calmulmerit(selectdf, feaS)
303
- if newvalue > maxvalue:
304
- maxvalue = newvalue
305
- else:
306
- feaS.pop()
307
- print(feaS)
308
- return feaS, sortedflist
309
-
310
- def getallfscoredict(selectdf, allflist):
311
- retdict = {}
312
- k = 1
313
- for f in allflist:
314
- print(k)
315
- k = k + 1
316
- score = calonemerit(selectdf, f)
317
- if math.isnan(score):
318
- continue
319
- retdict[f] = score
320
- return retdict
321
-
322
- def calonemerit(selectdf, subname):
323
- retvalue = 0
324
- label = "class"
325
- namelist = list(selectdf["class"])
326
- classset = set(namelist)
327
- caldf = selectdf[subname].to_frame()
328
- allvalue = 0.0
329
- for ind in classset:
330
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
331
- coeff = pointbiserialr(caldf[subname], caldf[label])
332
- allvalue = abs(coeff.correlation) + allvalue
333
- allvalue = allvalue / float(len(classset))
334
- return allvalue
335
-
336
- # 获取特征分数
337
- sortdict = CFSmethod(data.name)
338
- # 画图
339
- fig = plt.figure(figsize=(24, 12))
340
- ax1 = fig.add_subplot(211)
341
- ax1.set_title(str(method))
342
- indexlist = list(range(1, len(sortdict.keys() + 1)))
343
- ax1.plot(indexlist, sortdict.values()) # 特征分数图
344
- # 设置x轴和y轴的标签
345
- ax1.set_xlabel('Feature Index')
346
- ax1.set_ylabel('Feature Score')
347
-
348
- # 分类器
349
- if clf == 'RF':
350
- clf = RandomForestClassifier(n_jobs=-1)
351
- elif clf == 'KNN':
352
- clf = KNeighborsClassifier()
353
- elif clf == 'DT':
354
- clf = DecisionTreeClassifier()
355
- elif clf == 'SVM':
356
- clf = SVC()
357
- elif clf == 'Naive Bayes':
358
- clf = GaussianNB()
359
- # 画交叉验证图
360
- acc = []
361
- # 对于index列表中的每个特征索引
362
- for i in range(len(indexlist)):
363
- # 使用前i个特征进行交叉验证
364
- selected_features = X[:, 0:i]
365
- scores = cross_val_score(clf, selected_features, y, cv=5)
366
- # 计算平均准确率并添加到acc列表中
367
- acc.append(scores.mean())
368
- max_acc = max(acc)
369
- max_index = acc.index(max_acc) # 应该不用加1吧
370
-
371
- ax2 = fig.add_subplot(212)
372
- ax2.set_title("IFS_mRMR_FCD_Accuracy")
373
- ax2.plot(max_index, max_acc, 'ro')
374
- ax2.plot(acc)
375
- ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
376
- ha='center')
377
- # 设置x轴和y轴的标签
378
- ax2.set_xlabel('Top n features')
379
- ax2.set_ylabel('Accuracy')
380
- plt.grid(True)
381
- plt.savefig('output.png')
382
- return 'output.png'
383
-
384
- pass
385
  elif method == 'Lasso':
386
  data = pd.read_csv(data.name)
387
  X = data.iloc[:, :-1]
@@ -517,8 +357,8 @@ iface = gr.Interface(
517
  ["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
518
  ["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
519
  ["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
520
- ["example_data.csv", 'CFS', 50, 'DT'],
521
- ["example_data.csv", 'CFS', 40, 'Naive Bayes'],
522
  ],
523
  allow_flagging="never"
524
  )
 
222
  # 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
223
  # 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
224
  # 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  elif method == 'Lasso':
226
  data = pd.read_csv(data.name)
227
  X = data.iloc[:, :-1]
 
357
  ["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
358
  ["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
359
  ["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
360
+ ["example_data.csv", 'Lasso', 50, 'DT'],
361
+ ["example_data.csv", 'Lasso', 40, 'Naive Bayes'],
362
  ],
363
  allow_flagging="never"
364
  )