Justin-12138 committed on
Commit 1f7e2a8 · 1 Parent(s): 3da65a3

Upload app.py

Files changed (1)
    app.py  +43 -135
app.py CHANGED
@@ -16,6 +16,7 @@ from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
+ from sklearn.linear_model import LassoLarsCV


# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
@@ -28,28 +29,22 @@ def add_max_score_to_list(temp_scores, current_score, selected_indices, selected


def fs(data, method, num_fea_int, clf):
- data = pd.read_csv(data.name)
- X = data.iloc[:, :-1].values
- y = data['Label'].values
num_fea_int = int(num_fea_int)
if method == 'MRMR_FCD':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1].values
+ y = data['Label'].values
num_features = len(X[0])
-
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_features)]
# append the starting feature's score to current_score
current_score = [max(f_test_scores)]
-
- # indices start at 0
- # start_feature_index = random.randint(0, num_features - 1)
# start from the index of the highest-scoring feature
start_feature_index = f_test_scores.index(max(f_test_scores))
selected_indices = set()
selected_indices_list = []
selected_indices.add(start_feature_index)
selected_indices_list.append(start_feature_index)
-
pearson_score_matrix = np.zeros((num_features, num_features))
-
for _ in range(num_fea_int - 1):
temp_scores = []
for i in range(num_features):
@@ -68,7 +63,6 @@ def fs(data, method, num_fea_int, clf):
if pearson_score_matrix[j][i] == 0:
pearson_score_matrix[j][i] = np.corrcoef(X[:, i], X[:, j])[0, 1]
diff += pearson_score_matrix[j][i]
- # diff += np.corrcoef(X[:,i], X[:,j])[0, 1]
temp_scores.append(f_test_score - diff / len(selected_indices))
add_max_score_to_list(temp_scores, current_score, selected_indices, selected_indices_list)
combined = list(zip(selected_indices_list, current_score))
@@ -129,6 +123,9 @@ def fs(data, method, num_fea_int, clf):
return 'output.png'

elif method == 'MRMR_FCQ':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1].values
+ y = data['Label'].values
num_fea_inttures = len(X[0])
f_test_scores = [f_oneway(X[:, i], y)[0] for i in range(num_fea_inttures)]

@@ -222,131 +219,43 @@ def fs(data, method, num_fea_int, clf):
plt.grid(True)
plt.savefig('output.png')
return 'output.png'
-
# Add your code here. For now we write everything as functions and wrap them into classes later; the main goal is to get a working prototype first.
# The final result is currently an image: one plot of the selected feature indices with their scores, and one plot of incremental-feature-selection accuracy.
# A lot of the code above can still be optimized (plotting, classifier selection, and so on), but don't worry about that yet. Finish the remaining elif branches below first, and then we can discuss refactoring.
- elif method == 'CFS':
- def loadDataSet(fileName):
- df = pd.read_csv(fileName)
- return df
-
- def writesortedlist(filename, thelist):
- with open(filename, "w") as fw:
- for item in thelist:
- fw.write(item[0] + "\t" + str(item[1]) + "\n")
-
- def writethelist(filename, thelist):
- with open(filename, "w") as fw:
- for item in thelist:
- fw.write(item + "\n")
-
- def getdatadf(datafile):
- datadf = loadDataSet(datafile)
- labellist = datadf["Label"].tolist()
- del datadf["Label"]
- return datadf, labellist
-
- def CFSmethod(datafile):
-
- datadf, labellist = getdatadf(datafile)
- print(datadf)
- selectdf = datadf.copy()
- allflist = datadf.columns.tolist()
- namelist = list(datadf.index)
- print(namelist)
- namelist = [int(var) for var in namelist]
- selectdf["class"] = namelist
-
- bestfset,sortlist = calBFset(selectdf, allflist)
- writethelist("bestfeature.txt", bestfset)  # save the best feature subset
-
- return dict(sortlist)
-
- def calmulmerit(selectdf, sublist):
- retvalue = 0
- label = "class"
- k = len(sublist)
- namelist = list(selectdf["class"])
- classset = set(namelist)
- caldf = selectdf[sublist]
- allvalue = 0.0
- for feature in sublist:
- caldf = selectdf[sublist]
- middlevalue = 0.0
- for ind in classset:
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
- coeff = pointbiserialr(caldf[feature], caldf[label])
- middlevalue = abs(coeff.correlation) + middlevalue
- allvalue = middlevalue / float(len(classset)) + allvalue
- allvalue = allvalue / float(k)
-
- corr = selectdf[sublist].corr()
- corr.values[np.tril_indices_from(corr.values)] = np.nan
- corr = abs(corr)
- rff = corr.unstack().mean()
- retvalue = (k * allvalue) / sqrt(k + k * (k - 1) * rff)
- print(retvalue)
- return retvalue
-
- def calBFset(selectdf, allflist):
- allfdict = getallfscoredict(selectdf, allflist)
- sortedflist = sorted(allfdict.items(), key=lambda item: item[1], reverse=True)
- writesortedlist("sorteddict.txt", sortedflist)  # save feature scores in descending order
- feaS = []
- feaS.append(sortedflist[0][0])
- maxvalue = sortedflist[0][1]
- for i in range(1, len(sortedflist)):
- print(str(i) + "/" + str(len(sortedflist)))
- itemf = sortedflist[i][0]
- feaS.append(itemf)
- newvalue = calmulmerit(selectdf, feaS)
- if newvalue > maxvalue:
- maxvalue = newvalue
- else:
- feaS.pop()
- print(feaS)
- return feaS,sortedflist
-
- def getallfscoredict(selectdf, allflist):
- retdict = {}
- k = 1
- for f in allflist:
- print(k)
- k = k + 1
- score = calonemerit(selectdf, f)
- if math.isnan(score):
- continue
- retdict[f] = score
- return retdict
-
- def calonemerit(selectdf, subname):
- retvalue = 0
- label = "class"
- namelist = list(selectdf["class"])
- classset = set(namelist)
- caldf = selectdf[subname].to_frame()
- allvalue = 0.0
- for ind in classset:
- caldf[label] = np.where(selectdf[label] == ind, 1, 0)
- coeff = pointbiserialr(caldf[subname], caldf[label])
- allvalue = abs(coeff.correlation) + allvalue
- allvalue = allvalue / float(len(classset))
- return allvalue
-
- # get the feature scores
- sortdict=CFSmethod(data.name)
- # plotting
+ elif method == 'Lasso':
+ data = pd.read_csv(data.name)
+ X = data.iloc[:, :-1]
+ y = data.iloc[:, -1:].values.flatten()
+
+ cl = LassoLarsCV(cv=20, max_iter=80000).fit(X, y)
+
+ importance = np.abs(cl.coef_)
+ feature_names = list(X)
+ print(feature_names)
+ a = len(feature_names)
+
+ idx_features = (-importance).argsort()[:a]
+ print(idx_features)
+ name_features = np.array(feature_names)[idx_features]
+ for i in range(a):
+ print((name_features)[i], importance[idx_features][i])
+ result = pd.DataFrame({'index': idx_features, 'Score': importance[idx_features]})
+ result_rank = result.sort_values(by='Score', ascending=False, ignore_index=True)
+ inde = result_rank['index'].tolist()
+ score = result_rank['Score'].tolist()
+
+ index = []
+ for i in inde:
+ index.append(str(i))
fig = plt.figure(figsize=(24, 12))
ax1 = fig.add_subplot(211)
ax1.set_title(str(method))
- indexlist=list(range(1,len(sortdict.keys()+1)))
- ax1.plot(indexlist, sortdict.values())  # feature-score plot
+ ax1.plot(index[:num_fea_int], score[:num_fea_int])
+
# set the x-axis and y-axis labels
ax1.set_xlabel('Feature Index')
ax1.set_ylabel('Feature Score')

- # classifier
if clf == 'RF':
clf = RandomForestClassifier(n_jobs=-1)
elif clf == 'KNN':
@@ -357,20 +266,24 @@ def fs(data, method, num_fea_int, clf):
clf = SVC()
elif clf == 'Naive Bayes':
clf = GaussianNB()
- # plot the cross-validation results
+ inde = inde[:num_fea_int]
+ index = index[:num_fea_int]
acc = []
# for each feature index in the index list
- for i in range(len(indexlist)):
+
+ X = data.iloc[:, :-1].values
+ print(X)
+ for i in range(len(index)):
# cross-validate using the first i features
- selected_features = X[:,0:i]
+ selected_features = X[:, [int(j) - 1 for j in inde[:i + 1]]]
scores = cross_val_score(clf, selected_features, y, cv=5)
# compute the mean accuracy and append it to acc
acc.append(scores.mean())
max_acc = max(acc)
- max_index = acc.index(max_acc)  # probably no need to add 1
+ max_index = acc.index(max_acc) + 1

ax2 = fig.add_subplot(212)
- ax2.set_title("IFS_mRMR_FCD_Accuracy")
+ ax2.set_title("IFS_" + str(method) + "_Accuracy")
ax2.plot(max_index, max_acc, 'ro')
ax2.plot(acc)
ax2.annotate(f'({max_index}, {max_acc})', (max_index, max_acc), textcoords="offset points", xytext=(-5, -5),
@@ -382,9 +295,6 @@ def fs(data, method, num_fea_int, clf):
plt.savefig('output.png')
return 'output.png'

- pass
- elif method == 'Lasso':
- pass
elif method == 'Ensemble':
pass
elif method == 'CI':
@@ -430,8 +340,6 @@ If you have any questions, please feel free to reach me out at <b>justinliu707@g
</div>
"""

-
-
iface = gr.Interface(
fn=fs,
title=title,
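
Below is a minimal standalone sketch (not part of app.py) of the idea behind the new 'Lasso' branch: rank features by the absolute LassoLarsCV coefficient, then run incremental feature selection with cross-validated accuracy. The synthetic data, column names, and hyperparameters here are illustrative assumptions, not values taken from this commit.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import cross_val_score

# Synthetic stand-in for the uploaded CSV (feature columns plus a 'Label' column).
X_arr, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(X_arr.shape[1])])

# Rank features by absolute Lasso coefficient (the importance score the new branch uses).
lasso = LassoLarsCV(cv=5).fit(X, y)
ranked = np.argsort(-np.abs(lasso.coef_))  # column positions, most important first

# Incremental feature selection: 5-fold accuracy of the top-k ranked features.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
acc = []
for k in range(1, 11):
    selected = X.iloc[:, ranked[:k]].values
    acc.append(cross_val_score(clf, selected, y, cv=5).mean())

best_k = int(np.argmax(acc)) + 1
print(f"best number of features: {best_k}, accuracy: {max(acc):.3f}")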
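
For reference, here is a small sketch, assuming it mirrors the unchanged MRMR_FCD branch above, of one greedy mRMR-FCD step: each candidate's score is its F-test relevance minus its mean Pearson correlation with the features already selected. The random data is purely illustrative.

import numpy as np
from scipy.stats import f_oneway

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
y = rng.integers(0, 2, size=100)

# Relevance: F statistic between each feature column and the label vector,
# computed the same way as in the MRMR_FCD branch.
f_scores = [f_oneway(X[:, i], y)[0] for i in range(X.shape[1])]
selected = [int(np.argmax(f_scores))]  # start from the highest-scoring feature

# One greedy step: relevance minus mean correlation with the already-selected set.
candidate_scores = {}
for i in range(X.shape[1]):
    if i in selected:
        continue
    redundancy = np.mean([np.corrcoef(X[:, i], X[:, j])[0, 1] for j in selected])
    candidate_scores[i] = f_scores[i] - redundancy

next_feature = max(candidate_scores, key=candidate_scores.get)
print("selected:", selected, "next:", next_feature)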