Spaces:
Runtime error
Runtime error
Commit
·
1f7e2a8
1
Parent(s):
3da65a3
Upload app.py
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ from sklearn.naive_bayes import GaussianNB
|
|
16 |
from sklearn.neighbors import KNeighborsClassifier
|
17 |
from sklearn.svm import SVC
|
18 |
from sklearn.tree import DecisionTreeClassifier
|
|
|
19 |
|
20 |
|
21 |
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
|
@@ -28,28 +29,22 @@ def add_max_score_to_list(temp_scores, current_score, selected_indices, selected
|
|
28 |
|
29 |
|
30 |
def fs(data, method, num_fea_int, clf):
|
31 |
-
data = pd.read_csv(data.name)
|
32 |
-
X = data.iloc[:, :-1].values
|
33 |
-
y = data['Label'].values
|
34 |
num_fea_int = int(num_fea_int)
|
35 |
if method == 'MRMR_FCD':
|
|
|
|
|
|
|
36 |
num_features = len(X[0])
|
37 |
-
|
38 |
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_features)]
|
39 |
# 添加起始特征的分数到current_score
|
40 |
current_score = [max(f_test_scores)]
|
41 |
-
|
42 |
-
# 索引从0开始
|
43 |
-
# start_feature_index = random.randint(0, num_features - 1)
|
44 |
# 索引从最高分数的特征开始
|
45 |
start_feature_index = f_test_scores.index(max(f_test_scores))
|
46 |
selected_indices = set()
|
47 |
selected_indices_list = []
|
48 |
selected_indices.add(start_feature_index)
|
49 |
selected_indices_list.append(start_feature_index)
|
50 |
-
|
51 |
pearson_score_matrix = np.zeros((num_features, num_features))
|
52 |
-
|
53 |
for _ in range(num_fea_int - 1):
|
54 |
temp_scores = []
|
55 |
for i in range(num_features):
|
@@ -68,7 +63,6 @@ def fs(data, method, num_fea_int, clf):
|
|
68 |
if pearson_score_matrix[j][i] == 0:
|
69 |
pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
|
70 |
diff += pearson_score_matrix[j][i]
|
71 |
-
# diff += np.corrcoef(X[:,i], X[:,j])[0, 1]
|
72 |
temp_scores.append(f_test_score - diff / len(selected_indices))
|
73 |
add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
|
74 |
combined = list(zip(selected_indices_list, current_score))
|
@@ -129,6 +123,9 @@ def fs(data, method, num_fea_int, clf):
|
|
129 |
return 'output.png'
|
130 |
|
131 |
elif method == 'MRMR_FCQ':
|
|
|
|
|
|
|
132 |
num_fea_inttures = len(X[0])
|
133 |
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_fea_inttures)]
|
134 |
|
@@ -222,131 +219,43 @@ def fs(data, method, num_fea_int, clf):
|
|
222 |
plt.grid(True)
|
223 |
plt.savefig('output.png')
|
224 |
return 'output.png'
|
225 |
-
|
226 |
# 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
|
227 |
# 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
|
228 |
# 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
|
229 |
-
elif method == '
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
selectdf = datadf.copy()
|
255 |
-
allflist = datadf.columns.tolist()
|
256 |
-
namelist = list(datadf.index)
|
257 |
-
print(namelist)
|
258 |
-
namelist = [int(var) for var in namelist]
|
259 |
-
selectdf["class"] = namelist
|
260 |
-
|
261 |
-
bestfset,sortlist = calBFset(selectdf, allflist)
|
262 |
-
writethelist("bestfeature.txt", bestfset)#保存最佳特征子集
|
263 |
-
|
264 |
-
return dict(sortlist)
|
265 |
-
|
266 |
-
def calmulmerit(selectdf, sublist):
|
267 |
-
retvalue = 0
|
268 |
-
label = "class"
|
269 |
-
k = len(sublist)
|
270 |
-
namelist = list(selectdf["class"])
|
271 |
-
classset = set(namelist)
|
272 |
-
caldf = selectdf[sublist]
|
273 |
-
allvalue = 0.0
|
274 |
-
for feature in sublist:
|
275 |
-
caldf = selectdf[sublist]
|
276 |
-
middlevalue = 0.0
|
277 |
-
for ind in classset:
|
278 |
-
caldf[label] = np.where(selectdf[label] == ind, 1, 0)
|
279 |
-
coeff = pointbiserialr(caldf[feature], caldf[label])
|
280 |
-
middlevalue = abs(coeff.correlation) + middlevalue
|
281 |
-
allvalue = middlevalue / float(len(classset)) + allvalue
|
282 |
-
allvalue = allvalue / float(k)
|
283 |
-
|
284 |
-
corr = selectdf[sublist].corr()
|
285 |
-
corr.values[np.tril_indices_from(corr.values)] = np.nan
|
286 |
-
corr = abs(corr)
|
287 |
-
rff = corr.unstack().mean()
|
288 |
-
retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
|
289 |
-
print(retvalue)
|
290 |
-
return retvalue
|
291 |
-
|
292 |
-
def calBFset(selectdf, allflist):
|
293 |
-
allfdict = getallfscoredict(selectdf, allflist)
|
294 |
-
sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
|
295 |
-
writesortedlist("sorteddict.txt", sortedflist)#保存特征得分的降序
|
296 |
-
feaS = []
|
297 |
-
feaS.append(sortedflist[0][0])
|
298 |
-
maxvalue = sortedflist[0][1]
|
299 |
-
for i in range(1, len(sortedflist)):
|
300 |
-
print(str(i) + "/" + str(len(sortedflist)))
|
301 |
-
itemf = sortedflist[i][0]
|
302 |
-
feaS.append(itemf)
|
303 |
-
newvalue = calmulmerit(selectdf, feaS)
|
304 |
-
if newvalue > maxvalue:
|
305 |
-
maxvalue = newvalue
|
306 |
-
else:
|
307 |
-
feaS.pop()
|
308 |
-
print(feaS)
|
309 |
-
return feaS,sortedflist
|
310 |
-
|
311 |
-
def getallfscoredict(selectdf, allflist):
|
312 |
-
retdict = {}
|
313 |
-
k = 1
|
314 |
-
for f in allflist:
|
315 |
-
print(k)
|
316 |
-
k = k + 1
|
317 |
-
score = calonemerit(selectdf, f)
|
318 |
-
if math.isnan(score):
|
319 |
-
continue
|
320 |
-
retdict[f] = score
|
321 |
-
return retdict
|
322 |
-
|
323 |
-
def calonemerit(selectdf, subname):
|
324 |
-
retvalue = 0
|
325 |
-
label = "class"
|
326 |
-
namelist = list(selectdf["class"])
|
327 |
-
classset = set(namelist)
|
328 |
-
caldf = selectdf[subname].to_frame()
|
329 |
-
allvalue = 0.0
|
330 |
-
for ind in classset:
|
331 |
-
caldf[label] = np.where(selectdf[label] == ind, 1, 0)
|
332 |
-
coeff = pointbiserialr(caldf[subname], caldf[label])
|
333 |
-
allvalue = abs(coeff.correlation) + allvalue
|
334 |
-
allvalue = allvalue / float(len(classset))
|
335 |
-
return allvalue
|
336 |
-
|
337 |
-
#获取特征分数
|
338 |
-
sortdict=CFSmethod(data.name)
|
339 |
-
# 画图
|
340 |
fig = plt.figure(figsize=(24, 12))
|
341 |
ax1 = fig.add_subplot(211)
|
342 |
ax1.set_title(str(method))
|
343 |
-
|
344 |
-
|
345 |
# 设置x轴和y轴的标签
|
346 |
ax1.set_xlabel('Feature Index')
|
347 |
ax1.set_ylabel('Feature Score')
|
348 |
|
349 |
-
#分类器
|
350 |
if clf == 'RF':
|
351 |
clf = RandomForestClassifier(n_jobs=-1)
|
352 |
elif clf == 'KNN':
|
@@ -357,20 +266,24 @@ def fs(data, method, num_fea_int, clf):
|
|
357 |
clf = SVC()
|
358 |
elif clf == 'Naive Bayes':
|
359 |
clf = GaussianNB()
|
360 |
-
|
|
|
361 |
acc = []
|
362 |
# 对于index列表中的每个特征索引
|
363 |
-
|
|
|
|
|
|
|
364 |
# 使用前i个特征进行交叉验证
|
365 |
-
selected_features = X[:,
|
366 |
scores = cross_val_score(clf, selected_features, y, cv=5)
|
367 |
# 计算平均准确率并添加到acc列表中
|
368 |
acc.append(scores.mean())
|
369 |
max_acc = max(acc)
|
370 |
-
max_index = acc.index(max_acc)
|
371 |
|
372 |
ax2 = fig.add_subplot(212)
|
373 |
-
ax2.set_title("
|
374 |
ax2.plot(max_index, max_acc, 'ro')
|
375 |
ax2.plot(acc)
|
376 |
ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
|
@@ -382,9 +295,6 @@ def fs(data, method, num_fea_int, clf):
|
|
382 |
plt.savefig('output.png')
|
383 |
return 'output.png'
|
384 |
|
385 |
-
pass
|
386 |
-
elif method == 'Lasso':
|
387 |
-
pass
|
388 |
elif method == 'Ensemble':
|
389 |
pass
|
390 |
elif method == 'CI':
|
@@ -430,8 +340,6 @@ If you have any questions, please feel free to reach me out at <b>justinliu707@g
|
|
430 |
</div>
|
431 |
"""
|
432 |
|
433 |
-
|
434 |
-
|
435 |
iface = gr.Interface(
|
436 |
fn=fs,
|
437 |
title=title,
|
|
|
16 |
from sklearn.neighbors import KNeighborsClassifier
|
17 |
from sklearn.svm import SVC
|
18 |
from sklearn.tree import DecisionTreeClassifier
|
19 |
+
from sklearn.linear_model import LassoLarsCV
|
20 |
|
21 |
|
22 |
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
|
|
|
29 |
|
30 |
|
31 |
def fs(data, method, num_fea_int, clf):
|
|
|
|
|
|
|
32 |
num_fea_int = int(num_fea_int)
|
33 |
if method == 'MRMR_FCD':
|
34 |
+
data = pd.read_csv(data.name)
|
35 |
+
X = data.iloc[:, :-1].values
|
36 |
+
y = data['Label'].values
|
37 |
num_features = len(X[0])
|
|
|
38 |
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_features)]
|
39 |
# 添加起始特征的分数到current_score
|
40 |
current_score = [max(f_test_scores)]
|
|
|
|
|
|
|
41 |
# 索引从最高分数的特征开始
|
42 |
start_feature_index = f_test_scores.index(max(f_test_scores))
|
43 |
selected_indices = set()
|
44 |
selected_indices_list = []
|
45 |
selected_indices.add(start_feature_index)
|
46 |
selected_indices_list.append(start_feature_index)
|
|
|
47 |
pearson_score_matrix = np.zeros((num_features, num_features))
|
|
|
48 |
for _ in range(num_fea_int - 1):
|
49 |
temp_scores = []
|
50 |
for i in range(num_features):
|
|
|
63 |
if pearson_score_matrix[j][i] == 0:
|
64 |
pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
|
65 |
diff += pearson_score_matrix[j][i]
|
|
|
66 |
temp_scores.append(f_test_score - diff / len(selected_indices))
|
67 |
add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
|
68 |
combined = list(zip(selected_indices_list, current_score))
|
|
|
123 |
return 'output.png'
|
124 |
|
125 |
elif method == 'MRMR_FCQ':
|
126 |
+
data = pd.read_csv(data.name)
|
127 |
+
X = data.iloc[:, :-1].values
|
128 |
+
y = data['Label'].values
|
129 |
num_fea_inttures = len(X[0])
|
130 |
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_fea_inttures)]
|
131 |
|
|
|
219 |
plt.grid(True)
|
220 |
plt.savefig('output.png')
|
221 |
return 'output.png'
|
|
|
222 |
# 添加你们的代码在这里,我们先全部写成函数,然后再封装成类,主要是先把样子做出来
|
223 |
# 然后目前最终结果是返回一个图片,包含了含有特征的索引及其对应的分数的图,还有一张是增量式特征选择的准确率图
|
224 |
# 我上面的代码很多地方还可以优化,比如画图,选择分类器这些,但是你们都先不用管,把下面的几个elif写完先,然后我们再讨论优化代码的事情。
|
225 |
+
elif method == 'Lasso':
|
226 |
+
data = pd.read_csv(data.name)
|
227 |
+
X = data.iloc[:, :-1]
|
228 |
+
y = data.iloc[:, -1:].values.flatten()
|
229 |
+
|
230 |
+
cl = LassoLarsCV(cv=20, max_iter=80000).fit(X, y)
|
231 |
+
|
232 |
+
importance = np.abs(cl.coef_)
|
233 |
+
feature_names = list(X)
|
234 |
+
print(feature_names)
|
235 |
+
a = len(feature_names)
|
236 |
+
|
237 |
+
idx_features = (-importance).argsort()[:a]
|
238 |
+
print(idx_features)
|
239 |
+
name_features = np.array(feature_names)[idx_features]
|
240 |
+
for i in range(a):
|
241 |
+
print((name_features)[i], importance[idx_features][i])
|
242 |
+
result = pd.DataFrame({'index': idx_features, 'Score': importance[idx_features]})
|
243 |
+
result_rank = result.sort_values(by='Score', ascending=False, ignore_index=True)
|
244 |
+
inde = result_rank['index'].tolist()
|
245 |
+
score = result_rank['Score'].tolist()
|
246 |
+
|
247 |
+
index = []
|
248 |
+
for i in inde:
|
249 |
+
index.append(str(i))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
fig = plt.figure(figsize=(24, 12))
|
251 |
ax1 = fig.add_subplot(211)
|
252 |
ax1.set_title(str(method))
|
253 |
+
ax1.plot(index[:num_fea_int], score[:num_fea_int])
|
254 |
+
|
255 |
# 设置x轴和y轴的标签
|
256 |
ax1.set_xlabel('Feature Index')
|
257 |
ax1.set_ylabel('Feature Score')
|
258 |
|
|
|
259 |
if clf == 'RF':
|
260 |
clf = RandomForestClassifier(n_jobs=-1)
|
261 |
elif clf == 'KNN':
|
|
|
266 |
clf = SVC()
|
267 |
elif clf == 'Naive Bayes':
|
268 |
clf = GaussianNB()
|
269 |
+
inde = inde[:num_fea_int]
|
270 |
+
index = index[:num_fea_int]
|
271 |
acc = []
|
272 |
# 对于index列表中的每个特征索引
|
273 |
+
|
274 |
+
X = data.iloc[:, :-1].values
|
275 |
+
print(X)
|
276 |
+
for i in range(len(index)):
|
277 |
# 使用前i个特征进行交叉验证
|
278 |
+
selected_features = X[:, [int(j) - 1 for j in inde[:i + 1]]]
|
279 |
scores = cross_val_score(clf, selected_features, y, cv=5)
|
280 |
# 计算平均准确率并添加到acc列表中
|
281 |
acc.append(scores.mean())
|
282 |
max_acc = max(acc)
|
283 |
+
max_index = acc.index(max_acc) + 1
|
284 |
|
285 |
ax2 = fig.add_subplot(212)
|
286 |
+
ax2.set_title("IFS_" + str(method) + "_Accuracy")
|
287 |
ax2.plot(max_index, max_acc, 'ro')
|
288 |
ax2.plot(acc)
|
289 |
ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
|
|
|
295 |
plt.savefig('output.png')
|
296 |
return 'output.png'
|
297 |
|
|
|
|
|
|
|
298 |
elif method == 'Ensemble':
|
299 |
pass
|
300 |
elif method == 'CI':
|
|
|
340 |
</div>
|
341 |
"""
|
342 |
|
|
|
|
|
343 |
iface = gr.Interface(
|
344 |
fn=fs,
|
345 |
title=title,
|