"""FastAPI backend for pre-processing well data and training regressors.

The service exposes endpoints that

* clean user-supplied and built-in data sets (outliers, missing values,
  feature scaling),
* train one of five regression algorithms on the ``BSR`` and ``SBR`` label
  columns and report error metrics, and
* predict ``BSR`` / ``SBR`` / ``D`` for new samples.

The first ``_N_FEATURES`` columns of every data set are treated as features;
any remaining columns are treated as labels.
"""
import ast
import io
import json
import secrets
from typing import List

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.preprocessing as preproc
from fastapi import BackgroundTasks, FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware  # cross-origin support
from pydantic import BaseModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

matplotlib.use('AGG')

app = FastAPI()

# Cross-domain whitelist.
origins = [
    "http://127.0.0.1:5500",
    "http://localhost:8081",
    "http://mlca.coycs.com",
    "https://mlca.coycs.com",
    "http://celadon-lebkuchen-cc4bb0.netlify.app",
    "https://celadon-lebkuchen-cc4bb0.netlify.app"
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

# Number of leading columns that hold features; the rest are labels.
_N_FEATURES = 12


# ---------------------------------------------------------------------------
# Utility functions
# ---------------------------------------------------------------------------

def _str2num(x):
    """Parse a string cell into a Python literal; pass other values through.

    SECURITY NOTE: the original implementation called ``eval`` on values
    taken straight from the HTTP request body, which allows arbitrary code
    execution.  ``ast.literal_eval`` accepts the same numeric literals
    (including scientific notation such as ``"1e-3"``) and ``"None"``, but
    rejects expressions and calls.  Inputs such as ``"1/2"`` that ``eval``
    used to compute now raise ``ValueError``.
    """
    if isinstance(x, str):
        return ast.literal_eval(x.strip())
    return x


def json2df(json):
    """Build a DataFrame from JSON-style records, converting strings to numbers.

    Args:
        json: Data accepted by ``pd.DataFrame`` (the endpoints pass the list
            received in the request body).  The parameter name shadows the
            ``json`` module inside this function only.

    Returns:
        A DataFrame in which every string cell has been parsed with
        ``_str2num``; blank / whitespace-only cells become ``None``.
    """
    df = pd.DataFrame(json)
    # Blank cells are rewritten to the literal text "None" so that the parser
    # below turns them into a real None instead of failing on empty input.
    df = df.replace(to_replace=r"^\s*$", value="None", regex=True)
    # Element-wise conversion (also turns scientific notation into numbers).
    # ``apply`` + ``Series.map`` is used instead of the deprecated ``applymap``.
    return df.apply(lambda column: column.map(_str2num))


def process_miss(df, method):
    """Fill missing values.

    Rows and columns that are entirely empty are dropped first.

    Args:
        df: Input DataFrame.
        method: 1 mean, 2 median, 3 mode, 4 linear interpolation,
            5 forward fill, 6 backward fill.  Any other value only performs
            the row/column dropping.

    Returns:
        A new DataFrame with the missing values handled.
    """
    df = df.dropna(how='all')
    df = df.dropna(axis=1, how='all')
    if df.empty:
        # Nothing left to fill; also avoids ``mode().iloc[0]`` failing below.
        return df
    if method == 1:  # mean
        return df.fillna(df.mean())
    if method == 2:  # median
        return df.fillna(df.median())
    if method == 3:  # mode
        return df.fillna(df.mode().iloc[0])
    if method == 4:  # linear interpolation
        return df.fillna(df.interpolate(
            method='linear', limit_direction='forward', axis=0))
    if method == 5:  # previous value
        return df.ffill()
    if method == 6:  # next value
        return df.bfill()
    return df


def _outlier_bounds(series, detect):
    """Return ``(lower, upper)`` outlier limits for one column.

    ``detect == 1`` uses the IQR rule (quartiles -/+ 1.5 * IQR);
    ``detect == 2`` uses the Z-score rule (mean -/+ 3 * std).
    """
    if detect == 1:
        upper_quartile = series.quantile(0.75)
        lower_quartile = series.quantile(0.25)
        iqr = upper_quartile - lower_quartile
        return lower_quartile - 1.5 * iqr, upper_quartile + 1.5 * iqr
    mean = series.mean()
    std = series.std()
    return mean - 3 * std, mean + 3 * std


def _replacement_value(series, method, bound):
    """Return the value that replaces an outlier for replacement methods 2-5."""
    if method == 2:  # mean
        return series.mean()
    if method == 3:  # median
        return series.median()
    if method == 4:  # mode
        return series.mode().iloc[0]
    return bound  # method == 5: clip to the violated boundary


def _clean_outliers(df, detect, method):
    """Handle outliers in the feature columns of a single frame.

    Columns are processed one after another, so limits for later columns are
    computed on the rows/values left by earlier columns.  The index of the
    input is preserved, which lets callers tell surviving rows apart.

    Args:
        df: Frame whose first ``_N_FEATURES`` columns are features.
        detect: 1 for IQR detection, 2 for Z-score detection.
        method: 1 delete rows, 2 mean, 3 median, 4 mode, 5 boundary value.

    Returns:
        A new DataFrame; ``df`` is returned unchanged for unknown
        ``detect`` / ``method`` codes.
    """
    if detect not in (1, 2) or method not in (1, 2, 3, 4, 5):
        return df
    df = df.copy()
    for column in df.columns[:_N_FEATURES]:
        lower, upper = _outlier_bounds(df[column], detect)
        if method == 1:
            # Drop whole rows so that labels stay aligned with the features.
            outliers = (df[column] > upper) | (df[column] < lower)
            df = df.loc[~outliers]
            continue
        # High outliers are replaced first; the statistic used for the low
        # outliers is then computed on the already-updated column.
        high = df[column] > upper
        if high.any():
            df.loc[high, column] = _replacement_value(
                df[column], method, upper)
        low = df[column] < lower
        if low.any():
            df.loc[low, column] = _replacement_value(
                df[column], method, lower)
    return df


def process_abnormal(df_inside, df_user, detect, method):
    """Handle outliers on the built-in and user data taken together.

    Both frames are stacked so that outlier limits are computed over the
    combined data, then split again.

    Args:
        df_inside: Built-in data (features followed by label columns).
        df_user: User data.
        detect: 1 for IQR detection, 2 for Z-score detection.
        method: 1 delete rows, 2 mean, 3 median, 4 mode, 5 boundary value.

    Returns:
        ``{"df_inside": ..., "df_user": ...}``; ``df_user`` only contains the
        first ``_N_FEATURES`` columns.
    """
    n_inside = df_inside.shape[0]
    df = pd.concat([df_inside, df_user], axis=0, ignore_index=True)
    df = _clean_outliers(df, detect, method)
    # After ``ignore_index`` the built-in rows carry labels 0..n_inside-1.
    # Rows may have been deleted, so split on the index, not on position.
    from_inside = df.index < n_inside
    return {
        "df_inside": df.loc[from_inside],
        "df_user": df.loc[~from_inside].iloc[:, :_N_FEATURES],
    }


# Scaling method code -> callable returning the scaled feature array.
_SCALERS = {
    1: preproc.minmax_scale,                                              # Min-max
    2: lambda features: preproc.StandardScaler().fit_transform(features),  # Z-score
    3: lambda features: preproc.maxabs_scale(features, axis=0),            # MaxAbs
    4: lambda features: preproc.RobustScaler().fit_transform(features),    # Robust
    5: lambda features: preproc.normalize(features, axis=0),               # Normalize
}


def _scale_features(df, method):
    """Scale the feature columns of a single frame; label columns are kept.

    Returns ``df`` unchanged when ``method`` is unknown or ``df`` is empty.
    """
    scaler = _SCALERS.get(method)
    if scaler is None or df.empty:
        return df
    features = df.iloc[:, :_N_FEATURES]
    scaled = pd.DataFrame(
        scaler(features), columns=features.columns, index=df.index)
    return pd.concat([scaled, df.iloc[:, _N_FEATURES:]], axis=1)


def process_standard(df_inside, df_user, method):
    """Scale the features of the built-in and user data taken together.

    Args:
        df_inside: Built-in data (features followed by label columns).
        df_user: User data.
        method: 1 Min-max, 2 Z-score, 3 MaxAbs, 4 RobustScaler, 5 normalize.

    Returns:
        ``{"df_inside": ..., "df_user": ...}``; ``df_user`` only contains the
        first ``_N_FEATURES`` columns.
    """
    n_inside = df_inside.shape[0]
    df = pd.concat([df_inside, df_user], axis=0, ignore_index=True)
    df = _scale_features(df, method)
    return {
        "df_inside": df.iloc[:n_inside, :],
        "df_user": df.iloc[n_inside:, :_N_FEATURES],
    }


def _build_model(algorithm, paras):
    """Create the unfitted regressor for ``algorithm``.

    Args:
        algorithm: 1 linear regression, 2 random forest, 3 MLP,
            4 XGBoost, 5 LightGBM.
        paras: Dict of hyper-parameters; the keys read depend on the
            algorithm (see the constructor calls below).

    Returns:
        The estimator, or ``None`` for an unknown algorithm code.
    """
    if algorithm == 1:  # ordinary least squares
        return LinearRegression(fit_intercept=paras["fit_intercept"])
    if algorithm == 2:  # random forest
        return RandomForestRegressor(
            n_estimators=paras["n_estimators"],
            criterion=paras["criterion"],
            max_depth=paras["max_depth"],
            random_state=0)
    if algorithm == 3:  # multi-layer perceptron
        return MLPRegressor(
            hidden_layer_sizes=(paras["hidden_layer_sizes_1"],
                                paras["hidden_layer_sizes_2"]),
            activation=paras["activation"],
            solver='lbfgs',
            random_state=paras["random_state"])
    if algorithm == 4:  # XGBoost
        return XGBRegressor(
            max_depth=paras["max_depth"],
            learning_rate=paras["learning_rate"],
            n_estimators=paras["n_estimators"])
    if algorithm == 5:  # LightGBM
        return lgb.LGBMRegressor(
            objective='regression',
            max_depth=paras["max_depth"],
            learning_rate=paras["learning_rate"],
            random_state=paras["random_state"],
            n_estimators=paras["n_estimators"])
    return None


def train_model(x, y, test_size, algorithm, paras):
    """Train a regressor on a train/test split and report its errors.

    Args:
        x: Feature frame.
        y: Target values.
        test_size: Passed to ``train_test_split`` (``random_state=0``).
        algorithm: Algorithm code, see ``_build_model``.
        paras: Hyper-parameter dict, see ``_build_model``.

    Returns:
        A dict with ``y_test``, ``y_pred``, ``error`` (``MAE``, ``RMSE``,
        ``R2``, rounded to 3 decimals) and ``results`` (coefficients rounded
        to 4 decimals and intercept rounded to 3, linear regression only),
        or the error string ``"模型训练出错"`` for an unknown algorithm.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=0)
    model = _build_model(algorithm, paras)
    if model is None:
        return "模型训练出错"

    model.fit(x_train, y_train)
    results = {}
    if algorithm == 1:
        results["coef"] = [float('{:.4f}'.format(i))
                           for i in model.coef_.tolist()]  # slopes
        results["intercept"] = round(float(model.intercept_), 3)

    y_pred = model.predict(x_test)
    # ``float`` keeps the metrics JSON-serialisable whatever numpy scalar
    # type the metric functions hand back.
    mae = round(float(mean_absolute_error(y_test, y_pred)), 3)
    rmse = round(float(np.sqrt(mean_squared_error(y_test, y_pred))), 3)
    r2 = round(float(r2_score(y_test, y_pred)), 3)
    return {
        "y_test": np.array(y_test).tolist(),
        "y_pred": y_pred.tolist(),
        "error": {"MAE": mae, "RMSE": rmse, "R2": r2},
        "results": results,
    }


def predict_connectivity(x, x1, y, test_size, algorithm, paras):
    """Fit a regressor on the training part of ``x``/``y`` and predict ``x1``.

    The same split as in ``train_model`` is used, so the fitted model matches
    the one whose errors were reported there.

    Returns:
        The predictions as a list, or the error string ``"预测连通性出错"``
        for an unknown algorithm.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=0)
    model = _build_model(algorithm, paras)
    if model is None:
        return "预测连通性出错"
    model.fit(x_train, y_train)
    return model.predict(x1).tolist()


def _raise_if_error(result):
    """Turn an error string returned by the model helpers into an HTTP 400."""
    if isinstance(result, str):
        raise HTTPException(status_code=400, detail=result)
    return result


# ---------------------------------------------------------------------------
# Login
# ---------------------------------------------------------------------------

class Login(BaseModel):  # request body schema
    username: str
    password: str


@app.post("/login")
async def login(login: Login):
    """Return ``True`` when the submitted credentials match, else ``False``."""
    # SECURITY NOTE(review): the credentials are hard-coded in the source;
    # move them to configuration / a hashed store.  ``compare_digest`` avoids
    # leaking how many leading characters matched through timing.
    username_ok = secrets.compare_digest(
        login.username.encode("utf-8"), b"admin")
    password_ok = secrets.compare_digest(
        login.password.encode("utf-8"), b"123456")
    return username_ok and password_ok


# ---------------------------------------------------------------------------
# Process user data
# ---------------------------------------------------------------------------

class Process_user(BaseModel):  # request body schema
    mode: int
    data: List
    miss: List
    abnormal: List
    standard: List


@app.post("/process/user")
async def process_user(user: Process_user):
    """Clean user data together with the built-in data of the chosen mode.

    Returns a dict with the processed ``inside`` and ``user`` tables, each as
    a JSON string of records.
    """
    # NOTE(review): this handler does blocking file I/O and CPU work inside
    # ``async def``; consider a plain ``def`` so FastAPI runs it in a thread.
    mode = user.mode  # selected connectivity mode
    df_inside = pd.read_csv(
        "./mode_{}.csv".format(mode)).dropna(axis=0)  # built-in data for mode
    df_user = json2df(user.data)
    abnormal = user.abnormal[0]
    miss = user.miss[0]
    standard = user.standard[0]

    # Outlier handling
    if abnormal["state"]:
        abnormaled = process_abnormal(
            df_inside, df_user, abnormal["detect"], abnormal["method"])
        df_inside = abnormaled["df_inside"]
        df_user = abnormaled["df_user"]
    # Missing-value handling
    if miss["state"]:
        df_user = process_miss(df_user, miss["method"])
    # Feature scaling
    if standard["state"]:
        standarded = process_standard(df_inside, df_user, standard["method"])
        df_inside = standarded["df_inside"]
        df_user = standarded["df_user"]

    # Values are stringified before serialisation (the original note says
    # this is done to keep scientific notation in the output).
    return {"inside": df_inside.astype('str').to_json(orient='records'),
            "user": df_user.astype('str').to_json(orient='records')}


# ---------------------------------------------------------------------------
# Process built-in data
# ---------------------------------------------------------------------------

class Process_inside(BaseModel):  # request body schema
    data: List
    abnormal: List
    standard: List


@app.post("/process/inside")
async def process_inside(inside: Process_inside):
    """Clean a built-in data table and return it as a JSON string of records."""
    df = json2df(inside.data)
    abnormal = inside.abnormal[0]
    standard = inside.standard[0]

    # Outlier handling.  The previous code called ``process_abnormal`` with
    # the wrong number of arguments and always failed; the single-frame
    # helper is used instead.
    # NOTE(review): outliers are detected on the feature columns only, like
    # in ``process_abnormal`` -- confirm labels should not be checked here.
    if abnormal["state"]:
        df = _clean_outliers(df, abnormal["detect"], abnormal["method"])
    # Scaling: features only, label columns are left untouched.
    if standard["state"]:
        df = _scale_features(df, standard["method"])

    # Values are stringified before serialisation (see ``process_user``).
    return df.astype('str').to_json(orient='records')


# ---------------------------------------------------------------------------
# Train model
# ---------------------------------------------------------------------------

class Train(BaseModel):  # request body schema
    data: List
    test_size: float
    algorithm: int
    paras: List


@app.post("/train")
async def train(train: Train):
    """Train models for ``BSR`` and ``SBR`` and derive ``D`` as their sum.

    Raises:
        HTTPException: 400 when the algorithm code is unknown.
    """
    df = json2df(train.data)
    test_size = train.test_size
    algorithm = train.algorithm
    paras = train.paras[0]

    x = df.iloc[:, :_N_FEATURES]
    y1 = df.loc[:, "BSR"]
    y2 = df.loc[:, "SBR"]
    y3 = df.loc[:, "D"]

    bsr = _raise_if_error(train_model(x, y1, test_size, algorithm, paras))
    sbr = _raise_if_error(train_model(x, y2, test_size, algorithm, paras))

    # Same split parameters as in ``train_model`` so the ``D`` test targets
    # line up with the BSR/SBR predictions.
    _, _, _, y_test = train_test_split(
        x, y3, test_size=test_size, random_state=0)
    d = {"y_test": np.array(y_test).tolist(),
         "y_pred": np.sum([bsr["y_pred"], sbr["y_pred"]], axis=0).tolist()}

    return {"bsr": bsr, "sbr": sbr, "d": d}


# ---------------------------------------------------------------------------
# Predict connectivity
# ---------------------------------------------------------------------------

class Predict(BaseModel):  # request body schema
    data_train: List
    data_predict: List
    test_size: float
    algorithm: int
    paras: List


@app.post("/predict")
async def predict(predict: Predict):
    """Predict ``BSR``, ``SBR`` and ``D`` (their sum) for new samples.

    Returns the submitted prediction rows with the three predicted columns
    appended, as a JSON string of records.

    Raises:
        HTTPException: 400 when the algorithm code is unknown.
    """
    df_train = json2df(predict.data_train)
    df_predict = json2df(predict.data_predict)
    test_size = predict.test_size
    algorithm = predict.algorithm
    paras = predict.paras[0]

    x = df_train.iloc[:, :_N_FEATURES]
    y1 = df_train.loc[:, "BSR"]
    y2 = df_train.loc[:, "SBR"]

    bsr = _raise_if_error(predict_connectivity(
        x, df_predict, y1, test_size, algorithm, paras))
    sbr = _raise_if_error(predict_connectivity(
        x, df_predict, y2, test_size, algorithm, paras))
    d = np.sum([bsr, sbr], axis=0).tolist()

    # Stack the three prediction lists, transpose them into rows and append
    # them to the input rows so the front end receives one table.
    data = [bsr, sbr, d]
    df_result = pd.concat(
        [pd.DataFrame(predict.data_predict),
         pd.DataFrame(data=np.array(data).T.tolist(),
                      columns=["BSR", "SBR", "D"])],
        axis=1)
    return df_result.to_json(orient='records')


# Disabled image test endpoint, kept for reference:
#
# def create_img():
#     plt.rcParams['figure.figsize'] = [7.50, 3.50]
#     plt.rcParams['figure.autolayout'] = True
#     plt.plot([1, 2])
#     img_buf = io.BytesIO()
#     plt.savefig(img_buf, format='png')
#     plt.close()
#     return img_buf
#
# @app.get('/png')
# async def get_img(background_tasks: BackgroundTasks):
#     img_buf = create_img()
#     # get the entire buffer content
#     # because of the async, this will await the loading of all content
#     bufContents: bytes = img_buf.getvalue()
#     background_tasks.add_task(img_buf.close)
#     headers = {'Content-Disposition': 'inline; filename="out.png"'}
#     return Response(bufContents, headers=headers, media_type='image/png')