Spaces:
Runtime error
Runtime error
Commit
·
2cd96ed
1
Parent(s):
9529ea8
Upload app.py
Browse files
app.py
CHANGED
@@ -222,6 +222,166 @@ def fs(data, method, num_fea_int, clf):
|
|
222 |
# 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
|
223 |
# 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
|
224 |
# 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
elif method == 'Lasso':
|
226 |
data = pd.read_csv(data.name)
|
227 |
X = data.iloc[:, :-1]
|
@@ -357,8 +517,8 @@ iface = gr.Interface(
|
|
357 |
["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
|
358 |
["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
|
359 |
["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
|
360 |
-
["example_data.csv", '
|
361 |
-
["example_data.csv", '
|
362 |
],
|
363 |
allow_flagging="never"
|
364 |
)
|
|
|
222 |
# 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
|
223 |
# 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
|
224 |
# 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
|
225 |
+
elif method == 'CFS':
|
226 |
+
# `data` is the uploaded file handle here: this branch later passes `data.name`
# to CFSmethod, and the Lasso branch parses the CSV itself before slicing.
# The file therefore has to be read before `.iloc` / column access can work.
cfs_frame = pd.read_csv(data.name)
# Every column except the last one is treated as a feature.
X = cfs_frame.iloc[:, :-1].values
y = cfs_frame['Label'].values
|
228 |
+
|
229 |
+
def loadDataSet(fileName):
    """Parse the CSV file at ``fileName`` and return it as a pandas DataFrame."""
    return pd.read_csv(fileName)
|
232 |
+
|
233 |
+
def writesortedlist(filename, thelist):
    """Write ranked (feature, score) pairs to ``filename``.

    Each element of ``thelist`` is indexed as ``item[0]`` (feature name) and
    ``item[1]`` (score) and written as one ``name<TAB>score`` line.

    Both fields are formatted with ``str()`` semantics, so feature names that
    are not strings (e.g. integer column labels) no longer raise ``TypeError``.
    The file is written as UTF-8 so non-ASCII names do not depend on the
    platform default encoding.
    """
    with open(filename, "w", encoding="utf-8") as fw:
        fw.writelines(f"{item[0]}\t{item[1]}\n" for item in thelist)
|
237 |
+
|
238 |
+
def writethelist(filename, thelist):
    """Write every element of ``thelist`` to ``filename``, one per line.

    Elements are formatted with ``str()`` semantics, so non-string items
    (e.g. integer column labels) no longer raise ``TypeError``. The file is
    written as UTF-8 so non-ASCII names do not depend on the platform default
    encoding.
    """
    with open(filename, "w", encoding="utf-8") as fw:
        fw.writelines(f"{item}\n" for item in thelist)
|
242 |
+
|
243 |
+
def getdatadf(datafile):
    """Load ``datafile`` and split it into a feature frame and a label list.

    Returns a tuple ``(features, labels)``: ``features`` is the loaded
    DataFrame with its "Label" column removed, ``labels`` is that column's
    values as a plain Python list.
    """
    frame = loadDataSet(datafile)
    # pop() removes the column from the frame and hands it back in one step.
    labels = frame.pop("Label").tolist()
    return frame, labels
|
248 |
+
|
249 |
+
def CFSmethod(datafile):
    """Score every feature of ``datafile`` for CFS and return the scores.

    The file is loaded through ``getdatadf``, the labels are attached to a
    working copy of the feature frame under the column name "class" (the name
    the merit helpers read), and ``calBFset`` is run on it.

    Returns a dict mapping feature name -> single-feature merit, built from
    the list ``calBFset`` returns sorted by descending score, so iteration
    order is best feature first.

    Note: a feature column that is itself named "class" would be overwritten
    by the label column in the working copy.
    """
    datadf, labellist = getdatadf(datafile)
    allflist = datadf.columns.tolist()

    selectdf = datadf.copy()
    # Use the real labels. The previous code filled this column from the
    # frame's row index, which turned every sample into its own class and left
    # `labellist` unused.
    selectdf["class"] = labellist

    # The best subset is computed but only the ranked scores are returned.
    _bestfset, sortlist = calBFset(selectdf, allflist)
    return dict(sortlist)
|
264 |
+
|
265 |
+
def calmulmerit(selectdf, sublist):
    """Return the CFS merit of the feature subset ``sublist``.

    merit = k * rcf / sqrt(k + k * (k - 1) * rff), where
      * k   is the number of features in ``sublist``,
      * rcf is the mean, over the features, of the per-feature average of
            |point-biserial correlation| against each one-vs-rest class
            indicator built from ``selectdf["class"]``,
      * rff is the mean absolute correlation over distinct feature pairs
            (NaN pairs are ignored).

    Parameters:
        selectdf: DataFrame holding the feature columns and a "class" column.
        sublist:  list of feature column names to evaluate.

    Raises:
        ValueError: if ``sublist`` is empty.

    ``selectdf`` is not modified. The computed merit is also printed.
    """
    label = "class"
    k = len(sublist)
    if k == 0:
        raise ValueError("sublist must contain at least one feature")

    classes = set(selectdf[label])
    # One-vs-rest indicators depend only on the labels, so build them once
    # instead of once per feature.
    indicators = [np.where(selectdf[label] == cls, 1, 0) for cls in classes]

    rcf_total = 0.0
    for feature in sublist:
        per_class = 0.0
        for indicator in indicators:
            # Tuple unpacking avoids depending on the legacy `.correlation`
            # attribute name of the result object.
            r, _pvalue = pointbiserialr(selectdf[feature], indicator)
            per_class += abs(r)
        rcf_total += per_class / float(len(classes))
    rcf = rcf_total / float(k)

    # Read the strict upper triangle instead of writing NaN into `corr.values`,
    # which is a read-only array when pandas Copy-on-Write is enabled.
    abs_corr = selectdf[sublist].corr().abs().to_numpy()
    pairwise = abs_corr[np.triu_indices_from(abs_corr, k=1)]
    pairwise = pairwise[~np.isnan(pairwise)]
    # A single feature has no pairs: the redundancy term is 0, so the merit
    # reduces to rcf (previously this case produced NaN).
    rff = pairwise.mean() if pairwise.size else 0.0

    retvalue = (k * rcf) / sqrt(k + k * (k - 1) * rff)
    print(retvalue)
    return retvalue
|
290 |
+
|
291 |
+
def calBFset(selectdf, allflist):
    """Greedy forward search for the feature subset with the best CFS merit.

    Features are ranked by their single-feature merit (``getallfscoredict``),
    best first. Starting from the top feature, each following feature is
    tentatively added and kept only if ``calmulmerit`` of the enlarged subset
    is strictly greater than the best merit seen so far.

    Parameters:
        selectdf: DataFrame with the feature columns and a "class" column.
        allflist: list of candidate feature column names.

    Returns:
        (best_subset, ranked): ``best_subset`` is the list of selected feature
        names; ``ranked`` is the list of ``(feature, score)`` tuples sorted by
        descending score. Both are empty when no feature received a score
        (previously this case raised ``IndexError``).

    Progress ("i/total") and the final subset are printed.
    """
    allfdict = getallfscoredict(selectdf, allflist)
    sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
    if not sortedflist:
        # Nothing to rank: every feature was skipped or none was supplied.
        return [], sortedflist

    best_name, maxvalue = sortedflist[0]
    feaS = [best_name]
    total = len(sortedflist)
    for i, (itemf, _score) in enumerate(sortedflist[1:], start=1):
        print(f"{i}/{total}")
        newvalue = calmulmerit(selectdf, feaS + [itemf])
        # A NaN merit compares False here, so such a candidate is rejected.
        if newvalue > maxvalue:
            feaS.append(itemf)
            maxvalue = newvalue
    print(feaS)
    return feaS, sortedflist
|
309 |
+
|
310 |
+
def getallfscoredict(selectdf, allflist):
    """Map every feature in ``allflist`` to its single-feature merit.

    ``calonemerit`` is evaluated for each feature in order; features whose
    merit is NaN are left out of the result. A 1-based counter is printed
    before each feature is scored.
    """
    scores = {}
    for position, feature in enumerate(allflist, start=1):
        print(position)
        merit = calonemerit(selectdf, feature)
        if not math.isnan(merit):
            scores[feature] = merit
    return scores
|
321 |
+
|
322 |
+
def calonemerit(selectdf, subname):
    """Return the single-feature merit of column ``subname``.

    For every distinct value in ``selectdf["class"]`` a one-vs-rest 0/1
    indicator is built and its point-biserial correlation with the feature is
    computed; the merit is the mean of the absolute correlations.

    Parameters:
        selectdf: DataFrame holding the feature column and a "class" column.
        subname:  name of the feature column to score.

    Returns:
        The mean absolute correlation as a float. NaN is returned when the
        frame has no class values at all (previously a ZeroDivisionError);
        a NaN correlation from scipy propagates into the result.

    ``selectdf`` is not modified.
    """
    label = "class"
    classes = set(selectdf[label])
    if not classes:
        return float("nan")

    feature = selectdf[subname]
    total = 0.0
    for cls in classes:
        indicator = np.where(selectdf[label] == cls, 1, 0)
        # Tuple unpacking avoids depending on the legacy `.correlation`
        # attribute name of the result object.
        r, _pvalue = pointbiserialr(feature, indicator)
        total += abs(r)
    return total / float(len(classes))
|
335 |
+
|
336 |
+
# 获取特征分数
|
337 |
+
sortdict = CFSmethod(data.name)
|
338 |
+
# 画图
|
339 |
+
fig = plt.figure(figsize=(24, 12))
|
340 |
+
ax1 = fig.add_subplot(211)
|
341 |
+
ax1.set_title(str(method))
|
342 |
+
# 1-based positions of the ranked features. The "+ 1" belongs outside len():
# adding an int to a dict keys view raises TypeError.
indexlist = list(range(1, len(sortdict) + 1))
|
343 |
+
ax1.plot(indexlist, sortdict.values()) # 特征分数图
|
344 |
+
# 设置x轴和y轴的标签
|
345 |
+
ax1.set_xlabel('Feature Index')
|
346 |
+
ax1.set_ylabel('Feature Score')
|
347 |
+
|
348 |
+
# 分类器
|
349 |
+
if clf == 'RF':
|
350 |
+
clf = RandomForestClassifier(n_jobs=-1)
|
351 |
+
elif clf == 'KNN':
|
352 |
+
clf = KNeighborsClassifier()
|
353 |
+
elif clf == 'DT':
|
354 |
+
clf = DecisionTreeClassifier()
|
355 |
+
elif clf == 'SVM':
|
356 |
+
clf = SVC()
|
357 |
+
elif clf == 'Naive Bayes':
|
358 |
+
clf = GaussianNB()
|
359 |
+
# 画交叉验证图
|
360 |
+
acc = []
# Incremental feature selection: evaluate the top-1, top-2, ... top-n features
# in CFS rank order. `sortdict` iterates best-scored feature first, so the
# columns are selected by name in that order rather than in file order.
ranked_X = pd.read_csv(data.name)[list(sortdict)].values
# Start at one feature: a zero-column matrix cannot be cross-validated.
for n_top in range(1, ranked_X.shape[1] + 1):
    scores = cross_val_score(clf, ranked_X[:, :n_top], y, cv=5)
    # Store the mean accuracy over the folds for this prefix size.
    acc.append(scores.mean())
max_acc = max(acc)
# acc[j] holds the accuracy for j + 1 features, hence the + 1.
max_index = acc.index(max_acc) + 1

ax2 = fig.add_subplot(212)
# Title follows the selected method instead of the label copied from mRMR_FCD.
ax2.set_title(f"IFS_{method}_Accuracy")
ax2.plot(max_index, max_acc, 'ro')
# Plot against the number of features so the curve lines up with the marker.
ax2.plot(range(1, len(acc) + 1), acc)
ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
             ha='center')
|
377 |
+
# 设置x轴和y轴的标签
|
378 |
+
ax2.set_xlabel('Top n features')
|
379 |
+
ax2.set_ylabel('Accuracy')
|
380 |
+
plt.grid(True)
|
381 |
+
plt.savefig('output.png')
|
382 |
+
return 'output.png'
|
383 |
+
|
384 |
+
pass
|
385 |
elif method == 'Lasso':
|
386 |
data = pd.read_csv(data.name)
|
387 |
X = data.iloc[:, :-1]
|
|
|
517 |
["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
|
518 |
["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
|
519 |
["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
|
520 |
+
["example_data.csv", 'CFS', 50, 'DT'],
|
521 |
+
["example_data.csv", 'CFS', 40, 'Naive Bayes'],
|
522 |
],
|
523 |
allow_flagging="never"
|
524 |
)
|