基本思想:将序列分解成季节、趋势和残差,然后对残差进行EMD分解,再将这些成分作为特征输入XGBoost模型,并对各个成分进行预测,最后相加得到预测值。
emd代码:经验模态分解(EMD)实现
GLM:[文献复现]中长期神经网络概率预测模型
import warnings # 忽略警告
warnings.filterwarnings('ignore')
import datetime as dt
from EMD import emd
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
class GLM:
def __init__(self, df):
self.df = df
def getXY(self):
self.df['dayofyear'] = self.df.index.dayofyear
self.x = self.df.dayofyear.values.reshape(len(self.df), 1)
self.y = self.df.y.values.reshape(len(self.df), 1)
def getSX(self):
self.getXY()
sat = self.df['sat'].values.reshape(len(self.df), 1)
sun = self.df['sun'].values.reshape(len(self.df), 1)
w = 2 * np.pi / 365
x = self.x
x = np.hstack([x, np.sin(w * x), np.sin(2 * w * x), np.cos(w * x), np.cos(2 * w * x), sat, sun])
self.sx = x
def Trend(self, season):
'''trend = \beta_0+\beta_1*t'''
model = LinearRegression()
model.fit(self.x, self.y - season)
return model
def seasonality(self):
'''season = \sum_{k=1}^{2}[\beta_{1+k}sin(kwt)+\beta_{2+k}sin(kwt)]+
\beta_6D_{sat}(t)+\beta_7D_{sun}(t)+\beta_8D_{Hol}(t)'''
model = LinearRegression()
model.fit(self.sx, self.y)
return model
xgboost_params = {
'colsample_bynode': 0.8,
'learning_rate': 0.01,
'max_depth': 4,
'objective': 'reg:squarederror',
'subsample': 0.8,
'reg_alpha': 0.005,
'n_estimators': 200
}
class XGB:
def fit(self, X_train, y_train, n=None):
pipeline = [('SimpleImputer', SimpleImputer(missing_values=np.nan, strategy='mean'))
# ('scaler', StandardScaler()),
# ('pca', PCA(n_components=n))
]
multi_regressorr = MultiOutputRegressor(XGBRegressor(params=xgboost_params), 20)
pipeline.append(('regressor', multi_regressorr))
estimator = Pipeline(pipeline)
self.xgb = estimator.fit(X_train, y_train)
def predict(self, X_test):
y_pred = self.xgb.predict(X_test)
return y_pred
def error(self, y_test, y_pred):
error = mean_squared_error(y_test.values.flatten(), y_pred.flatten())
return error
class SaveAndLoadModel:
def __init__(self, path):
self.path = path
if not os.path.exists(self.path):
os.makedirs(self.path)
def save_model(self, model):
filename = 'model.pkl'
save_name = os.path.join(self.path, filename)
print('saving: -> exporting to %s' % save_name)
with open(save_name, 'wb') as handle:
pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
def load_model(self):
filename = 'model.pkl'
save_name = os.path.join(self.path, filename)
print('loading: -> exporting to %s' % save_name)
with open(save_name, 'rb') as handle:
model = pickle.load(handle)
return model
def main(df):
columns = ['y', 'season', 'trend', 'emd0', 'emd1', 'emd2', 'emd3', 'emd4']
df_train = df[(df.index > dt.datetime(2015, 1, 1)) & (df.index <= dt.datetime(2017, 1, 1))]
df_test = df[(df.index > dt.datetime(2017, 1, 1)) & (df.index <= dt.datetime(2018, 8, 1))]
df_y_train = df_train[columns].shift(-24)[:-24]
df_y_test = df_test[columns].shift(-24)[:-24]
X_train, X_test, = df_train.values[:-24], df_test.values[:-24]
y_train, y_test = df_y_train.values, df_y_test.values
xgb = XGB()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
error1 = xgb.error(df_y_test.y, y_pred[:, 0])
y_pred2 = y_pred[:, 1:].sum(axis=1)
error2 = xgb.error(df_y_test.y, y_pred2)
plt.figure()
plt.plot(y_pred[:, 0])
plt.plot(y_pred2)
plt.plot(df_y_test.y.values)
plt.legend(['y', 'emd', 'true'])
print(error1, error2)
# 807037.8726142674 2487790.09234746
# 直接预测比emd的误差小
if __name__ == '__main__':
df = pd.read_csv('./AEP_hourly.csv', index_col=[0])
df.columns = ['y']
df.index = pd.to_datetime(df.index)
df['weekday'] = df.index.weekday
df['hour'] = df.index.hour
df['sat'] = (df['weekday'] == 5) * 1
df['sun'] = (df['weekday'] == 6) * 1
glm = GLM(df)
glm.getSX()
glm.getXY()
season_model = glm.seasonality()
season = season_model.predict(glm.sx)
trend_model = glm.Trend(season)
trend = trend_model.predict(glm.x)
residual = glm.y - season - trend
df['season'] = season
df['trend'] = trend
df['residual'] = residual
x = (glm.y - season - trend).flatten()
t = np.arange(0, len(x) * .001, .001)
imfs = emd(x, nIMF=5)
for i in range(len(imfs)):
df[f'emd{i}'] = imfs[i]
for col in ['season', 'trend', 'emd0', 'emd1', 'emd2', 'emd3', 'emd4']: # 'season',
for i in range(24):
df[f'{col}_lag{i}'] = df[col].shift(i)
df.dropna(inplace=True)
main(df)
结果如下所示: