import numpy as np
import pandas as pd
names=("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,lable").split(',')
data=pd.read_csv("Desktop/sunshengyun/data/german/german.data",sep='\s+',names=names)
data.head()
|
Balance |
Duration |
History |
Purpose |
Credit amount |
Savings |
Employment |
instPercent |
sexMarried |
Guarantors |
… |
Assets |
Age |
concCredit |
Apartment |
Credits |
Occupation |
Dependents |
hasPhone |
Foreign |
lable |
0 |
A11 |
6 |
A34 |
A43 |
1169 |
A65 |
A75 |
4 |
A93 |
A101 |
… |
A121 |
67 |
A143 |
A152 |
2 |
A173 |
1 |
A192 |
A201 |
1 |
1 |
A12 |
48 |
A32 |
A43 |
5951 |
A61 |
A73 |
2 |
A92 |
A101 |
… |
A121 |
22 |
A143 |
A152 |
1 |
A173 |
1 |
A191 |
A201 |
2 |
2 |
A14 |
12 |
A34 |
A46 |
2096 |
A61 |
A74 |
2 |
A93 |
A101 |
… |
A121 |
49 |
A143 |
A152 |
1 |
A172 |
2 |
A191 |
A201 |
1 |
3 |
A11 |
42 |
A32 |
A42 |
7882 |
A61 |
A74 |
2 |
A93 |
A103 |
… |
A122 |
45 |
A143 |
A153 |
1 |
A173 |
2 |
A191 |
A201 |
1 |
4 |
A11 |
24 |
A33 |
A40 |
4870 |
A61 |
A73 |
3 |
A93 |
A101 |
… |
A124 |
53 |
A143 |
A153 |
2 |
A173 |
2 |
A191 |
A201 |
2 |
5 rows × 21 columns
data.Balance.unique()
array([‘A11’, ‘A12’, ‘A14’, ‘A13’], dtype=object)
data.count()
Balance 1000
Duration 1000
History 1000
Purpose 1000
Credit amount 1000
Savings 1000
Employment 1000
instPercent 1000
sexMarried 1000
Guarantors 1000
Residence duration 1000
Assets 1000
Age 1000
concCredit 1000
Apartment 1000
Credits 1000
Occupation 1000
Dependents 1000
hasPhone 1000
Foreign 1000
lable 1000
dtype: int64
#部分变量描述性统计分析
data.describe()
|
Duration |
Credit amount |
instPercent |
Residence duration |
Age |
Credits |
Dependents |
lable |
count |
1000.000000 |
1000.000000 |
1000.000000 |
1000.000000 |
1000.000000 |
1000.000000 |
1000.000000 |
1000.000000 |
mean |
20.903000 |
3271.258000 |
2.973000 |
2.845000 |
35.546000 |
1.407000 |
1.155000 |
1.300000 |
std |
12.058814 |
2822.736876 |
1.118715 |
1.103718 |
11.375469 |
0.577654 |
0.362086 |
0.458487 |
min |
4.000000 |
250.000000 |
1.000000 |
1.000000 |
19.000000 |
1.000000 |
1.000000 |
1.000000 |
25% |
12.000000 |
1365.500000 |
2.000000 |
2.000000 |
27.000000 |
1.000000 |
1.000000 |
1.000000 |
50% |
18.000000 |
2319.500000 |
3.000000 |
3.000000 |
33.000000 |
1.000000 |
1.000000 |
1.000000 |
75% |
24.000000 |
3972.250000 |
4.000000 |
4.000000 |
42.000000 |
2.000000 |
1.000000 |
2.000000 |
max |
72.000000 |
18424.000000 |
4.000000 |
4.000000 |
75.000000 |
4.000000 |
2.000000 |
2.000000 |
data.Duration.unique()
array([ 6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8,
54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40], dtype=int64)
data.History.unique()
array([‘A34’, ‘A32’, ‘A33’, ‘A30’, ‘A31’], dtype=object)
data.groupby('Balance').size().order(ascending=False)
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…)
if __name__ == ‘__main__’:
Balance
A14 394
A11 274
A12 269
A13 63
dtype: int64
data.groupby('Purpose').size().order(ascending=False)
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…)
if __name__ == ‘__main__’:
Purpose
A43 280
A40 234
A42 181
A41 103
A49 97
A46 50
A45 22
A44 12
A410 12
A48 9
dtype: int64
data.groupby('Apartment').size().order(ascending=False)
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…)
if __name__ == ‘__main__’:
Apartment
A152 713
A151 179
A153 108
dtype: int64
import matplotlib.pyplot as plt
%matplotlib inline
data.plot(x='lable', y='Age', kind='scatter',
alpha=0.02, s=50);
![png](output_13_0.png)
data.hist('Age', bins=15);
![png](output_14_0.png)
target=data.lable
features_data=data.drop('lable',axis=1)
numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')] # 提取数值类型为整数或浮点数的变量
numeric_features
[‘Duration’,
‘Credit amount’,
‘instPercent’,
‘Residence duration’,
‘Age’,
‘Credits’,
‘Dependents’]
numeric_data = features_data[numeric_features]
numeric_data.head()
|
Duration |
Credit amount |
instPercent |
Residence duration |
Age |
Credits |
Dependents |
0 |
6 |
1169 |
4 |
4 |
67 |
2 |
1 |
1 |
48 |
5951 |
2 |
2 |
22 |
1 |
1 |
2 |
12 |
2096 |
2 |
3 |
49 |
1 |
2 |
3 |
42 |
7882 |
2 |
4 |
45 |
1 |
2 |
4 |
24 |
4870 |
3 |
4 |
53 |
2 |
2 |
categorical_data = features_data.drop(numeric_features, axis=1)
categorical_data.head()
|
Balance |
History |
Purpose |
Savings |
Employment |
sexMarried |
Guarantors |
Assets |
concCredit |
Apartment |
Occupation |
hasPhone |
Foreign |
0 |
A11 |
A34 |
A43 |
A65 |
A75 |
A93 |
A101 |
A121 |
A143 |
A152 |
A173 |
A192 |
A201 |
1 |
A12 |
A32 |
A43 |
A61 |
A73 |
A92 |
A101 |
A121 |
A143 |
A152 |
A173 |
A191 |
A201 |
2 |
A14 |
A34 |
A46 |
A61 |
A74 |
A93 |
A101 |
A121 |
A143 |
A152 |
A172 |
A191 |
A201 |
3 |
A11 |
A32 |
A42 |
A61 |
A74 |
A93 |
A103 |
A122 |
A143 |
A153 |
A173 |
A191 |
A201 |
4 |
A11 |
A33 |
A40 |
A61 |
A73 |
A93 |
A101 |
A124 |
A143 |
A153 |
A173 |
A191 |
A201 |
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0]) # pd.factorize即可将分类变量转换为数值表示
# apply运算将转换函数应用到每一个变量维度
categorical_data_encoded.head(5)
|
Balance |
History |
Purpose |
Savings |
Employment |
sexMarried |
Guarantors |
Assets |
concCredit |
Apartment |
Occupation |
hasPhone |
Foreign |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
2 |
2 |
0 |
1 |
1 |
2 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
3 |
0 |
1 |
2 |
1 |
2 |
0 |
1 |
1 |
0 |
1 |
0 |
1 |
0 |
4 |
0 |
2 |
3 |
1 |
1 |
0 |
0 |
2 |
0 |
1 |
0 |
1 |
0 |
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)#进行数据的合并
features.head()
# 此处也可以选用one-hot编码来表示分类变量,相应的程序如下:
# features = pd.get_dummies(features_data)
# features.head()
|
Duration |
Credit amount |
instPercent |
Residence duration |
Age |
Credits |
Dependents |
Balance |
History |
Purpose |
Savings |
Employment |
sexMarried |
Guarantors |
Assets |
concCredit |
Apartment |
Occupation |
hasPhone |
Foreign |
0 |
6 |
1169 |
4 |
4 |
67 |
2 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
48 |
5951 |
2 |
2 |
22 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
2 |
12 |
2096 |
2 |
3 |
49 |
1 |
2 |
2 |
0 |
1 |
1 |
2 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
3 |
42 |
7882 |
2 |
4 |
45 |
1 |
2 |
0 |
1 |
2 |
1 |
2 |
0 |
1 |
1 |
0 |
1 |
0 |
1 |
0 |
4 |
24 |
4870 |
3 |
4 |
53 |
2 |
2 |
0 |
2 |
3 |
1 |
1 |
0 |
0 |
2 |
0 |
1 |
0 |
1 |
0 |
X = features.values.astype(np.float32) # 转换数据类型
y = (target.values == 1).astype(np.int32) # 1:good,2:bad
from sklearn.cross_validation import train_test_split # sklearn库中train_test_split函数可实现该划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0) # 参数test_size设置训练集占比
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
clf = DecisionTreeClassifier(max_depth=8) # 参数max_depth设置树最大深度
# 交叉验证,评价分类器性能,此处选择的评分标准是ROC曲线下的AUC值,对应AUC更大的分类器效果更好
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
np.mean(scores), np.std(scores)))
ROC AUC Decision Tree: 0.6866 +/-0.0105
#利用learning curve,以样本数为横坐标,训练和交叉验证集上的评分为纵坐标,对不同深度的决策树进行对比(判断是否存在过拟合或欠拟合)
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
scoring=None):
plt.title("Learning curves for %s" % type(estimator).__name__)
plt.ylim(*ylim); plt.grid()
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, validation_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
scoring=scoring)
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
# 可以注意到训练数据和交叉验证数据的得分有很大的差距,意味着可能过度拟合训练数据了
Best validation score: 0.6310
clf = DecisionTreeClassifier(max_depth=10)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
Best validation score: 0.6565
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
Best validation score: 0.6762
clf = DecisionTreeClassifier(max_depth=5)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
Best validation score: 0.7219
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
Best validation score: 0.7226
#利用validation_curve计算不同深度训练集和测试集交叉验证得分
from sklearn.learning_curve import validation_curve
def plot_validation_curve(estimator, X, y, param_name, param_range,
ylim=(0, 1.1), cv=3, n_jobs=1, scoring=None):
estimator_name = type(estimator).__name__
plt.title("Validation curves for %s on %s"
% (param_name, estimator_name))
plt.ylim(*ylim); plt.grid()
plt.xlim(min(param_range), max(param_range))
plt.xlabel(param_name)
plt.ylabel("Score")
train_scores, test_scores = validation_curve(
estimator, X, y, param_name, param_range,
cv=cv, n_jobs=n_jobs, scoring=scoring)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
label="Training score")
plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
print("Best test score: {:.4f}".format(test_scores_mean[-1]))
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
clf = DecisionTreeClassifier(max_depth=8)
param_name = 'max_depth'
param_range = [1, 2, 3, 4, 5, 6,7,8,9,10,11,12,13,14,15]
plot_validation_curve(clf, X_train, y_train,
param_name, param_range, scoring='roc_auc')
Best test score: 0.6409
# 先利用随机森里来提升分类效果
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=27, max_features=15, # 参数n_estimators设置森林中树的个数
max_depth=10)
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc',
n_jobs=1)
print("ROC Random Forest: {:.4f} +/-{:.4f}".format(
np.mean(scores), np.std(scores)))
ROC Random Forest: 0.7817 +/-0.0208
clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=10, max_features=15, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=27, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
from sklearn.metrics import roc_auc_score
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))
ROC AUC: 0.7394
from sklearn import grid_search
# 使用gridsearch进行并行调参
parameters = {'n_estimators':[5,11,15,21,25,31], 'max_features':[5, 10,15,20],'max_depth':[3,6,9,12],'criterion':['gini','entropy']}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, cv=3)
clf.fit(X_train, y_train)
GridSearchCV(cv=3, error_score='raise',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False),
fit_params={}, iid=True, n_jobs=1,
param_grid={'n_estimators': [5, 11, 15, 21, 25, 31], 'max_features': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 9, 12]},
pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))
ROC AUC: 0.7551
clf.best_params_
{'criterion': 'entropy',
'max_depth': 6,
'max_features': 15,
'n_estimators': 21}
clf.best_score_
0.78374999999999995
|