Commit 77bd380 · 1 Parent(s): 2cd96ed
Upload app.py
app.py CHANGED
@@ -222,166 +222,6 @@ def fs(data, method, num_fea_int, clf):
     # Add your code here. We'll write everything as functions first and wrap them into classes later; the main goal for now is to get a working prototype.
     # For now the final result returned is an image: a plot of the feature indices with their corresponding scores, plus a plot of the incremental feature selection accuracy.
     # A lot of my code above can still be optimized (the plotting, the classifier selection, etc.), but don't worry about that yet; finish the elif branches below first, and then we can discuss cleaning up the code.
-    elif method == 'CFS':
-        X = data.iloc[:, :-1].values
-        y = data['Label'].values
-
-        def loadDataSet(fileName):
-            df = pd.read_csv(fileName)
-            return df
-
-        def writesortedlist(filename, thelist):
-            with open(filename, "w") as fw:
-                for item in thelist:
-                    fw.write(item[0] + "\t" + str(item[1]) + "\n")
-
-        def writethelist(filename, thelist):
-            with open(filename, "w") as fw:
-                for item in thelist:
-                    fw.write(item + "\n")
-
-        def getdatadf(datafile):
-            datadf = loadDataSet(datafile)
-            labellist = datadf["Label"].tolist()
-            del datadf["Label"]
-            return datadf, labellist
-
-        def CFSmethod(datafile):
-
-            datadf, labellist = getdatadf(datafile)
-            print(datadf)
-            selectdf = datadf.copy()
-            allflist = datadf.columns.tolist()
-            namelist = list(datadf.index)
-            print(namelist)
-            namelist = [int(var) for var in namelist]
-            selectdf["class"] = namelist
-
-            bestfset, sortlist = calBFset(selectdf, allflist)
-            # writethelist("bestfeature.txt", bestfset)  # save the best feature subset
-
-            return dict(sortlist)
-
-        def calmulmerit(selectdf, sublist):
-            retvalue = 0
-            label = "class"
-            k = len(sublist)
-            namelist = list(selectdf["class"])
-            classset = set(namelist)
-            caldf = selectdf[sublist]
-            allvalue = 0.0
-            for feature in sublist:
-                caldf = selectdf[sublist]
-                middlevalue = 0.0
-                for ind in classset:
-                    caldf[label] = np.where(selectdf[label] == ind, 1, 0)
-                    coeff = pointbiserialr(caldf[feature], caldf[label])
-                    middlevalue = abs(coeff.correlation) + middlevalue
-                allvalue = middlevalue / float(len(classset)) + allvalue
-            allvalue = allvalue / float(k)
-
-            corr = selectdf[sublist].corr()
-            corr.values[np.tril_indices_from(corr.values)] = np.nan
-            corr = abs(corr)
-            rff = corr.unstack().mean()
-            retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
-            print(retvalue)
-            return retvalue
-
-        def calBFset(selectdf, allflist):
-            allfdict = getallfscoredict(selectdf, allflist)
-            sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
-            # writesortedlist("sorteddict.txt", sortedflist)  # save the feature scores in descending order
-            feaS = []
-            feaS.append(sortedflist[0][0])
-            maxvalue = sortedflist[0][1]
-            for i in range(1, len(sortedflist)):
-                print(str(i) + "/" + str(len(sortedflist)))
-                itemf = sortedflist[i][0]
-                feaS.append(itemf)
-                newvalue = calmulmerit(selectdf, feaS)
-                if newvalue > maxvalue:
-                    maxvalue = newvalue
-                else:
-                    feaS.pop()
-            print(feaS)
-            return feaS, sortedflist
-
-        def getallfscoredict(selectdf, allflist):
-            retdict = {}
-            k = 1
-            for f in allflist:
-                print(k)
-                k = k + 1
-                score = calonemerit(selectdf, f)
-                if math.isnan(score):
-                    continue
-                retdict[f] = score
-            return retdict
-
-        def calonemerit(selectdf, subname):
-            retvalue = 0
-            label = "class"
-            namelist = list(selectdf["class"])
-            classset = set(namelist)
-            caldf = selectdf[subname].to_frame()
-            allvalue = 0.0
-            for ind in classset:
-                caldf[label] = np.where(selectdf[label] == ind, 1, 0)
-                coeff = pointbiserialr(caldf[subname], caldf[label])
-                allvalue = abs(coeff.correlation) + allvalue
-            allvalue = allvalue / float(len(classset))
-            return allvalue
-
-        # get the feature scores
-        sortdict = CFSmethod(data.name)
-        # plotting
-        fig = plt.figure(figsize=(24, 12))
-        ax1 = fig.add_subplot(211)
-        ax1.set_title(str(method))
-        indexlist = list(range(1, len(sortdict.keys() + 1)))
-        ax1.plot(indexlist, sortdict.values())  # feature score plot
-        # set the x-axis and y-axis labels
-        ax1.set_xlabel('Feature Index')
-        ax1.set_ylabel('Feature Score')
-
-        # classifier
-        if clf == 'RF':
-            clf = RandomForestClassifier(n_jobs=-1)
-        elif clf == 'KNN':
-            clf = KNeighborsClassifier()
-        elif clf == 'DT':
-            clf = DecisionTreeClassifier()
-        elif clf == 'SVM':
-            clf = SVC()
-        elif clf == 'Naive Bayes':
-            clf = GaussianNB()
-        # cross-validation plot
-        acc = []
-        # for each feature index in the index list
-        for i in range(len(indexlist)):
-            # cross-validate using the top i features
-            selected_features = X[:, 0:i]
-            scores = cross_val_score(clf, selected_features, y, cv=5)
-            # compute the mean accuracy and append it to the acc list
-            acc.append(scores.mean())
-        max_acc = max(acc)
-        max_index = acc.index(max_acc)  # probably no need to add 1 here
-
-        ax2 = fig.add_subplot(212)
-        ax2.set_title("IFS_mRMR_FCD_Accuracy")
-        ax2.plot(max_index, max_acc, 'ro')
-        ax2.plot(acc)
-        ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
-                     ha='center')
-        # set the x-axis and y-axis labels
-        ax2.set_xlabel('Top n features')
-        ax2.set_ylabel('Accuracy')
-        plt.grid(True)
-        plt.savefig('output.png')
-        return 'output.png'
-
-        pass
     elif method == 'Lasso':
         data = pd.read_csv(data.name)
         X = data.iloc[:, :-1]
@@ -517,8 +357,8 @@ iface = gr.Interface(
         ["example_data.csv", 'MRMR_FCQ', 20, 'RF'],
         ["example_data.csv", 'MRMR_FCD', 10, 'SVM'],
         ["example_data.csv", 'MRMR_FCD', 30, 'KNN'],
-        ["example_data.csv", '
-        ["example_data.csv", '
+        ["example_data.csv", 'Lasso', 50, 'DT'],
+        ["example_data.csv", 'Lasso', 40, 'Naive Bayes'],
     ],
     allow_flagging="never"
)
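
Each example row supplies one value per input of fs, in the order of the interface's input components (data file, feature selection method, number of features, classifier). A rough sketch of how the new rows plug in; the component types here are assumed for illustration only, since the real ones are defined earlier in app.py and do not appear in this diff:

import gradio as gr
import pandas as pd


def fs(data, method, num_fea_int, clf):
    # Stub standing in for the real fs() patched in the first hunk; the real one
    # plots the feature scores and IFS accuracy and returns the figure's file path.
    return "output.png"


# Tiny stand-in for example_data.csv so this sketch can be launched locally.
pd.DataFrame({"f1": [0, 1, 2, 3], "f2": [1, 0, 1, 0], "Label": [0, 1, 0, 1]}).to_csv(
    "example_data.csv", index=False
)

iface = gr.Interface(
    fn=fs,
    inputs=[  # assumed component types, for illustration only
        gr.File(label="data"),
        gr.Dropdown(['MRMR_FCQ', 'MRMR_FCD', 'Lasso'], label="method"),
        gr.Number(label="num_fea_int"),
        gr.Dropdown(['RF', 'SVM', 'KNN', 'DT', 'Naive Bayes'], label="clf"),
    ],
    outputs=gr.Image(),
    examples=[
        ["example_data.csv", 'Lasso', 50, 'DT'],  # one value per input, in order
        ["example_data.csv", 'Lasso', 40, 'Naive Bayes'],
    ],
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()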