
Econometric Analysis with the statsmodels Library

 大邓的Python 2021-02-23

OLS Estimation

    import statsmodels.api as sm
    import numpy as np
    import matplotlib.pyplot as plt
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    %matplotlib inline

    np.random.seed(9876789)

    # Generate synthetic data
    nsample = 100
    x = np.linspace(0, 10, nsample)
    X = np.column_stack((x, x**2))
    beta = np.array([1, 0.1, 10])
    e = np.random.normal(size=nsample)

    # The design matrix for OLS needs an explicit constant term
    X = sm.add_constant(X)
    y = np.dot(X, beta) + e

    # Fit the data by OLS
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())

                                OLS Regression Results
    ==============================================================================
    Dep. Variable:                      y   R-squared:                       1.000
    Model:                            OLS   Adj. R-squared:                  1.000
    Method:                 Least Squares   F-statistic:                 4.020e+06
    Date:                Thu, 19 Dec 2019   Prob (F-statistic):          2.83e-239
    Time:                        17:33:16   Log-Likelihood:                -146.51
    No. Observations:                 100   AIC:                             299.0
    Df Residuals:                      97   BIC:                             306.8
    Df Model:                           2
    Covariance Type:            nonrobust
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    const          1.3423      0.313      4.292      0.000       0.722       1.963
    x1            -0.0402      0.145     -0.278      0.781      -0.327       0.247
    x2            10.0103      0.014    715.745      0.000       9.982      10.038
    ==============================================================================
    Omnibus:                        2.042   Durbin-Watson:                   2.274
    Prob(Omnibus):                  0.360   Jarque-Bera (JB):                1.875
    Skew:                           0.234   Prob(JB):                        0.392
    Kurtosis:                       2.519   Cond. No.                         144.
    ==============================================================================

    Warnings:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

    # Attributes and methods available on the fitted OLS results
    print(dir(results))

    ['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_get_robustcov_results', '_is_nested', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']

    # Look at just the coefficients and R-squared
    print('Parameters: ', results.params)
    print('R2: ', results.rsquared)

    Parameters:  [ 1.1875799   0.06751355 10.00087227]
    R2:  0.9999904584846492
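Many of the entries in the dir() listing above are directly usable. For example, heteroskedasticity-robust standard errors are exposed as the HC0_se through HC3_se attributes; a minimal sketch, continuing with the fitted results object:

    # Compare nonrobust and heteroskedasticity-robust standard errors
    print('nonrobust SE:', results.bse)
    print('HC0 SE:      ', results.HC0_se)
    print('HC3 SE:      ', results.HC3_se)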

OLS with a Nonlinear Curve That Is Linear in the Parameters

    nsample = 50
    sig = 0.5
    x = np.linspace(0, 20, nsample)
    X = np.column_stack((x, np.sin(x), (x - 5)**2, np.ones(nsample)))
    beta = [0.5, 0.5, -0.02, 5.]

    y_true = np.dot(X, beta)
    y = y_true + sig * np.random.normal(size=nsample)

    res = sm.OLS(y, X).fit()
    print(res.summary())

                                OLS Regression Results
    ==============================================================================
    Dep. Variable:                      y   R-squared:                       0.946
    Model:                            OLS   Adj. R-squared:                  0.942
    Method:                 Least Squares   F-statistic:                     267.0
    Date:                Thu, 19 Dec 2019   Prob (F-statistic):           4.28e-29
    Time:                        17:39:35   Log-Likelihood:                -29.585
    No. Observations:                  50   AIC:                             67.17
    Df Residuals:                      46   BIC:                             74.82
    Df Model:                           3
    Covariance Type:            nonrobust
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    x1             0.5031      0.024     20.996      0.000       0.455       0.551
    x2             0.5805      0.094      6.162      0.000       0.391       0.770
    x3            -0.0208      0.002     -9.908      0.000      -0.025      -0.017
    const          5.0230      0.155     32.328      0.000       4.710       5.336
    ==============================================================================
    Omnibus:                        3.931   Durbin-Watson:                   2.004
    Prob(Omnibus):                  0.140   Jarque-Bera (JB):                2.119
    Skew:                          -0.235   Prob(JB):                        0.347
    Kurtosis:                       2.108   Cond. No.                         221.
    ==============================================================================

    Warnings:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

    print('Parameters: ', res.params)
    print('Standard errors: ', res.bse)
    print('Predicted values: ', res.predict())

    Parameters:  [ 0.50314123  0.58047691 -0.02084597  5.02303139]
    Standard errors:  [0.02396328 0.09420248 0.00210399 0.15537883]
    Predicted values:  [ 4.50188223  5.01926399  5.49184506  5.88798995  6.18748025  6.38483648
      6.49021831  6.52775537  6.53158285  6.54023313  6.59030507  6.7104509
      6.91666871  7.20967408  7.57478245  7.98432176  8.40217892  8.78973307
      9.11220088  9.34435135  9.47465119  9.50715972  9.46086192  9.36654926
      9.26176075  9.18461538  9.16754929  9.23198657  9.3848194   9.61727563
      9.90636005 10.21863244 10.51570167 10.7605333  10.92353419 10.98741461
     10.95002886 10.82472782 10.63816544 10.42591956 10.22664657 10.0757303
      9.99946853 10.01075234 10.10694816 10.27033045 10.47099394 10.67176702
     10.83431887 10.92545718]

To compare the true values with the OLS predictions, use wls_prediction_std to obtain prediction intervals.

    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    # Prediction standard errors, plus lower and upper confidence bounds
    prstd, iv_l, iv_u = wls_prediction_std(res)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x, y, 'o', label="data")
    ax.plot(x, y_true, 'b-', label="True")
    ax.plot(x, res.predict(), 'r--.', label="OLS")
    ax.plot(x, iv_u, 'r--')
    ax.plot(x, iv_l, 'r--')
    ax.legend(loc='best');
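Note that wls_prediction_std lives in the sandbox module; recent statsmodels versions expose the same intervals through the public results.get_prediction() API. A sketch, assuming a reasonably recent statsmodels:

    # Equivalent prediction intervals via the public API
    pred = res.get_prediction()
    frame = pred.summary_frame(alpha=0.05)
    print(frame[['obs_ci_lower', 'obs_ci_upper']].head())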

Handling Dummy Variables

dummy = sm.categorical(groups, drop=True)

    import numpy as np
    import statsmodels.api as sm

    groups = np.array(['a', 'b', 'a', 'a', 'c'])
    sm.categorical(groups)

    array([['a', '1.0', '0.0', '0.0'],
           ['b', '0.0', '1.0', '0.0'],
           ['a', '1.0', '0.0', '0.0'],
           ['a', '1.0', '0.0', '0.0'],
           ['c', '0.0', '0.0', '1.0']], dtype='<U32')

    sm.categorical(groups, drop=True)

    array([[1., 0., 0.],
           [0., 1., 0.],
           [1., 0., 0.],
           [1., 0., 0.],
           [0., 0., 1.]])

    x = np.array([1, 2, 3, 4, 5])
    dummy = sm.categorical(groups, drop=True)
    np.column_stack((x, dummy))

    array([[1., 1., 0., 0.],
           [2., 0., 1., 0.],
           [3., 1., 0., 0.],
           [4., 1., 0., 0.],
           [5., 0., 0., 1.]])
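The stacked matrix can go straight into sm.OLS. Below is a minimal sketch continuing this example; the response vector y is made up purely for illustration. Since the full dummy set already spans an intercept, no constant is added, and each dummy coefficient acts as a group-specific intercept:

    # Hypothetical response values, for illustration only
    y = np.array([1.0, 2.5, 1.2, 0.9, 3.1])

    X = np.column_stack((x, dummy))
    res_dummy = sm.OLS(y, X).fit()
    print(res_dummy.params)  # slope for x, then one intercept per group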

Multicollinearity

The Longley dataset is well known for strong collinearity: its explanatory variables are highly correlated with one another.

Dependent variable: TOTEMP

Explanatory variables:

  • GNPDEFL

  • GNP

  • UNEMP

  • ARMED

  • POP

  • YEAR

Collinearity undermines the stability of the OLS parameter estimates.

    import pandas as pd

    df = pd.read_csv('data/longley.csv')
    df.head()

    import statsmodels.api as sm
    import pandas as pd

    df = pd.read_csv('data/longley.csv')
    X = df[df.columns[1:]]
    X = sm.add_constant(X)
    y = df[df.columns[0]]

    ols = sm.OLS(y, X)
    ols_results = ols.fit()
    print(ols_results.summary())

                                OLS Regression Results
    ==============================================================================
    Dep. Variable:                 TOTEMP   R-squared:                       0.995
    Model:                            OLS   Adj. R-squared:                  0.992
    Method:                 Least Squares   F-statistic:                     330.3
    Date:                Thu, 19 Dec 2019   Prob (F-statistic):           4.98e-10
    Time:                        19:52:43   Log-Likelihood:                -109.62
    No. Observations:                  16   AIC:                             233.2
    Df Residuals:                       9   BIC:                             238.6
    Df Model:                           6
    Covariance Type:            nonrobust
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    const      -3.482e+06    8.9e+05     -3.911      0.004    -5.5e+06   -1.47e+06
    GNPDEFL       15.0619     84.915      0.177      0.863    -177.029     207.153
    GNP           -0.0358      0.033     -1.070      0.313      -0.112       0.040
    UNEMP         -2.0202      0.488     -4.136      0.003      -3.125      -0.915
    ARMED         -1.0332      0.214     -4.822      0.001      -1.518      -0.549
    POP           -0.0511      0.226     -0.226      0.826      -0.563       0.460
    YEAR        1829.1515    455.478      4.016      0.003     798.788    2859.515
    ==============================================================================
    Omnibus:                        0.749   Durbin-Watson:                   2.559
    Prob(Omnibus):                  0.688   Jarque-Bera (JB):                0.684
    Skew:                           0.420   Prob(JB):                        0.710
    Kurtosis:                       2.434   Cond. No.                     4.86e+09
    ==============================================================================

    Warnings:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    [2] The condition number is large, 4.86e+09. This might indicate that there are
    strong multicollinearity or other numerical problems.

The Warnings at the end of the summary tell us that the model's condition number is very large, which may signal severe multicollinearity or other numerical problems.
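Besides the condition number, variance inflation factors (VIFs) are another common collinearity diagnostic. A minimal sketch using statsmodels' variance_inflation_factor helper (not part of the original walkthrough), applied to the design matrix X built above:

    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF for each regressor (column 0 is the constant, so start at 1)
    for i, name in enumerate(X.columns[1:], start=1):
        print(name, variance_inflation_factor(X.values, i))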

Condition number

The condition number gauges the severity of multicollinearity; it equals the ratio of the largest to the smallest singular value of the design matrix.

When the value exceeds 20, you can be fairly sure that multicollinearity is present (see Greene, Section 4.9).

    import numpy as np
    np.linalg.cond(ols.exog)

    4859257015.454868
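np.linalg.cond with its default 2-norm is exactly this singular-value ratio, which the following sketch verifies:

    # Condition number as a singular-value ratio
    s = np.linalg.svd(ols.exog, compute_uv=False)
    print(s[0] / s[-1])  # same value as np.linalg.cond(ols.exog)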

Dropping Observations

Greene also points out that dropping even a single observation can have a dramatic effect on the OLS estimates.

    # Keep only the first 14 observations
    ols_results2 = sm.OLS(y[:14], X[:14]).fit()
    print("Percentage change %4.2f%%\n" * 7 %
          tuple((ols_results2.params - ols_results.params) / ols_results.params * 100))

    Percentage change 4.55%
    Percentage change -105.20%
    Percentage change -3.43%
    Percentage change 2.92%
    Percentage change 3.32%
    Percentage change 97.06%
    Percentage change 4.64%

We can also look at a formal statistic for this, the DFBETAS: a standardized measure of how much each coefficient changes when a given observation is left out.

    influence = ols_results.get_influence()
    influence.summary_frame()

As a rule of thumb, an observation can be considered influential when the absolute value of its DFBETAS exceeds 2/sqrt(N):

    2. / (len(X)**.5)

    0.5

    # Keep only the columns whose names contain 'dfb'
    influence.summary_frame().filter(regex="dfb")
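Combining the two steps, here is a quick sketch that flags the observations whose DFBETAS exceed the 2/sqrt(N) threshold for at least one coefficient:

    dfbetas = influence.summary_frame().filter(regex="dfb")
    threshold = 2. / len(X)**.5

    # True for observations that move at least one coefficient past the threshold
    print((dfbetas.abs() > threshold).any(axis=1))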

statsmodels is under active development, and econometric analysis in Python should only keep getting better. Looking forward to it!

