
Predicting Credit Risk with a Decision Tree Using Python and scikit-learn

 宋娃娃姓宋 2018-04-24
```python
import numpy as np
import pandas as pd

# Column names for the German credit dataset ("lable" is kept as spelled,
# since the rest of the code refers to that name)
names = ("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,lable").split(',')
data = pd.read_csv("Desktop/sunshengyun/data/german/german.data",
                   sep=r'\s+', names=names)
data.head()
```
```
  Balance Duration History Purpose Credit amount Savings Employment instPercent sexMarried Guarantors ... Assets Age concCredit Apartment Credits Occupation Dependents hasPhone Foreign lable
0     A11        6     A34     A43          1169     A65        A75           4        A93       A101 ...   A121  67       A143      A152       2       A173          1     A192    A201     1
1     A12       48     A32     A43          5951     A61        A73           2        A92       A101 ...   A121  22       A143      A152       1       A173          1     A191    A201     2
2     A14       12     A34     A46          2096     A61        A74           2        A93       A101 ...   A121  49       A143      A152       1       A172          2     A191    A201     1
3     A11       42     A32     A42          7882     A61        A74           2        A93       A103 ...   A122  45       A143      A153       1       A173          2     A191    A201     1
4     A11       24     A33     A40          4870     A61        A73           3        A93       A101 ...   A124  53       A143      A153       2       A173          2     A191    A201     2

5 rows × 21 columns
```

(The truncated display hides the Residence duration column behind the `...`.)

```python
data.Balance.unique()
```
```
array(['A11', 'A12', 'A14', 'A13'], dtype=object)
```
```python
data.count()
```
```
Balance               1000
Duration              1000
History               1000
Purpose               1000
Credit amount         1000
Savings               1000
Employment            1000
instPercent           1000
sexMarried            1000
Guarantors            1000
Residence duration    1000
Assets                1000
Age                   1000
concCredit            1000
Apartment             1000
Credits               1000
Occupation            1000
Dependents            1000
hasPhone              1000
Foreign               1000
lable                 1000
dtype: int64
```
```python
# Descriptive statistics for the numeric variables
data.describe()
```
```
          Duration  Credit amount  instPercent  Residence duration          Age      Credits   Dependents        lable
count  1000.000000    1000.000000  1000.000000         1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     20.903000    3271.258000     2.973000            2.845000    35.546000     1.407000     1.155000     1.300000
std      12.058814    2822.736876     1.118715            1.103718    11.375469     0.577654     0.362086     0.458487
min       4.000000     250.000000     1.000000            1.000000    19.000000     1.000000     1.000000     1.000000
25%      12.000000    1365.500000     2.000000            2.000000    27.000000     1.000000     1.000000     1.000000
50%      18.000000    2319.500000     3.000000            3.000000    33.000000     1.000000     1.000000     1.000000
75%      24.000000    3972.250000     4.000000            4.000000    42.000000     2.000000     1.000000     2.000000
max      72.000000   18424.000000     4.000000            4.000000    75.000000     4.000000     2.000000     2.000000
```
```python
data.Duration.unique()
```
```
array([ 6, 48, 12, 42, 24, 36, 30, 15,  9, 10,  7, 60, 18, 45, 11, 27,  8,
       54, 20, 14, 33, 21, 16,  4, 47, 13, 22, 39, 28,  5, 26, 72, 40], dtype=int64)
```
```python
data.History.unique()
```
```
array(['A34', 'A32', 'A33', 'A30', 'A31'], dtype=object)
```
```python
# .order() is deprecated; sort_values() is the current API
data.groupby('Balance').size().sort_values(ascending=False)
```
```
Balance
A14    394
A11    274
A12    269
A13     63
dtype: int64
```
```python
data.groupby('Purpose').size().sort_values(ascending=False)
```
```
Purpose
A43     280
A40     234
A42     181
A41     103
A49      97
A46      50
A45      22
A44      12
A410     12
A48       9
dtype: int64
```
```python
data.groupby('Apartment').size().sort_values(ascending=False)
```
```
Apartment
A152    713
A151    179
A153    108
dtype: int64
```
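As an aside, `value_counts()` returns the same sorted frequency table in one call; a minimal equivalent sketch:

```python
# Same frequencies as groupby(...).size().sort_values(ascending=False)
data['Balance'].value_counts()
```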
```python
import matplotlib.pyplot as plt
%matplotlib inline

# Scatter plot of Age against the class label; the low alpha makes
# point density visible where markers overlap
data.plot(x='lable', y='Age', kind='scatter',
          alpha=0.02, s=50);
```

![png](output_13_0.png)
```python
# Histogram of Age
data.hist('Age', bins=15);
```

![png](output_14_0.png)
```python
target = data.lable
features_data = data.drop('lable', axis=1)
# Keep the columns whose dtype kind is integer ('i') or float ('f')
numeric_features = [c for c in features_data
                    if features_data[c].dtype.kind in ('i', 'f')]
numeric_features
```
```
['Duration', 'Credit amount', 'instPercent', 'Residence duration', 'Age',
 'Credits', 'Dependents']
```
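The same filtering can be done with `select_dtypes`; a minimal equivalent sketch:

```python
# Equivalent way to pull out the integer/float columns
features_data.select_dtypes(include=[np.number]).columns.tolist()
```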
```python
numeric_data = features_data[numeric_features]
numeric_data.head()
```
```
   Duration  Credit amount  instPercent  Residence duration  Age  Credits  Dependents
0         6           1169            4                   4   67        2           1
1        48           5951            2                   2   22        1           1
2        12           2096            2                   3   49        1           2
3        42           7882            2                   4   45        1           2
4        24           4870            3                   4   53        2           2
```
```python
categorical_data = features_data.drop(numeric_features, axis=1)
categorical_data.head()
```
```
  Balance History Purpose Savings Employment sexMarried Guarantors Assets concCredit Apartment Occupation hasPhone Foreign
0     A11     A34     A43     A65        A75        A93       A101   A121       A143      A152       A173     A192    A201
1     A12     A32     A43     A61        A73        A92       A101   A121       A143      A152       A173     A191    A201
2     A14     A34     A46     A61        A74        A93       A101   A121       A143      A152       A172     A191    A201
3     A11     A32     A42     A61        A74        A93       A103   A122       A143      A153       A173     A191    A201
4     A11     A33     A40     A61        A73        A93       A101   A124       A143      A153       A173     A191    A201
```
```python
# pd.factorize encodes a categorical variable as integer codes;
# apply runs the encoding over every column
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0])
categorical_data_encoded.head(5)
```
```
   Balance  History  Purpose  Savings  Employment  sexMarried  Guarantors  Assets  concCredit  Apartment  Occupation  hasPhone  Foreign
0        0        0        0        0           0           0           0       0           0          0           0         0        0
1        1        1        0        1           1           1           0       0           0          0           0         1        0
2        2        0        1        1           2           0           0       0           0          0           1         1        0
3        0        1        2        1           2           0           1       1           0          1           0         1        0
4        0        2        3        1           1           0           0       2           0          1           0         1        0
```
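To make the encoding concrete, here is a small sketch of what `pd.factorize` returns on a toy series: integer codes plus the unique values in order of first appearance (which is why every column above starts at 0 in row 0):

```python
codes, uniques = pd.factorize(pd.Series(['A11', 'A12', 'A11', 'A14']))
print(codes)    # [0 1 0 2]
print(uniques)  # Index(['A11', 'A12', 'A14'], dtype='object')
```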
```python
# Merge the numeric features with the encoded categorical features
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)
features.head()
# One-hot encoding is another option for the categorical variables:
# features = pd.get_dummies(features_data)
# features.head()
```
```
   Duration  Credit amount  instPercent  Residence duration  Age  Credits  Dependents  Balance  History  Purpose  Savings  Employment  sexMarried  Guarantors  Assets  concCredit  Apartment  Occupation  hasPhone  Foreign
0         6           1169            4                   4   67        2           1        0        0        0        0           0           0           0       0           0          0           0         0        0
1        48           5951            2                   2   22        1           1        1        1        0        1           1           1           0       0           0          0           0         1        0
2        12           2096            2                   3   49        1           2        2        0        1        1           2           0           0       0           0          0           1         1        0
3        42           7882            2                   4   45        1           2        0        1        2        1           2           0           1       1           0          1           0         1        0
4        24           4870            3                   4   53        2           2        0        2        3        1           1           0           0       2           0          1           0         1        0
```
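The one-hot alternative in the commented-out lines would give each category level its own 0/1 column instead of a single integer code, so tree splits never treat the codes as if they were ordered. A minimal sketch of the shape difference:

```python
# One-hot encoding expands every categorical level into a separate column
features_onehot = pd.get_dummies(features_data)
print(features.shape, features_onehot.shape)  # one-hot has many more columns
```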
```python
X = features.values.astype(np.float32)     # convert the feature matrix to float32
y = (target.values == 1).astype(np.int32)  # 1: good credit, 2: bad credit
```
```python
# train_test_split (in sklearn.model_selection; older releases had it in
# the now-removed sklearn.cross_validation) performs the train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)  # test_size sets the share held out for testing
```
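The split above is unstratified. Since only about 30% of the examples are bad credits (the mean of `lable` is 1.3 in the `describe()` output), a stratified split that preserves the class ratio in both halves can be worth considering; a minimal sketch:

```python
# Stratified variant: keeps the ~70/30 class ratio in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
```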
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(max_depth=8)  # max_depth caps the depth of the tree

# Cross-validate to assess the classifier. The scoring metric is the area
# under the ROC curve (AUC); a classifier with a larger AUC is better.
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))
```
```
ROC AUC Decision Tree: 0.6866 +/-0.0105
```
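For reference, `cross_val_score` with a classifier and `cv=3` is doing roughly the following (a minimal sketch using `StratifiedKFold`, the splitter sklearn uses by default for classification):

```python
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Roughly what cross_val_score(cv=3, scoring='roc_auc') does internally
fold_scores = []
for tr, va in StratifiedKFold(n_splits=3).split(X_train, y_train):
    model = DecisionTreeClassifier(max_depth=8).fit(X_train[tr], y_train[tr])
    proba = model.predict_proba(X_train[va])[:, 1]
    fold_scores.append(roc_auc_score(y_train[va], proba))
print(np.mean(fold_scores), np.std(fold_scores))
```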
```python
# Learning curves: x-axis is the number of training samples, y-axis is the
# score on the training set and on the cross-validation set. Comparing trees
# of different depths shows whether a model over- or under-fits.
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim); plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    # Report the validation score at the largest training size
    print("Final validation score: {:.4f}".format(validation_scores_mean[-1]))
```
```python
clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
# Note the large gap between the training and cross-validation scores,
# which suggests the tree is overfitting the training data
```
```
Final validation score: 0.6310
```

*(figure: learning curve, `max_depth=None`)*
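To see how far the unconstrained tree grows, one can fit it and read off the learned depth (a small sketch; the exact value varies with the random state):

```python
# Depth actually reached when max_depth=None
deep_tree = DecisionTreeClassifier(max_depth=None).fit(X_train, y_train)
print(deep_tree.tree_.max_depth)
```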

```python
clf = DecisionTreeClassifier(max_depth=10)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
```
```
Final validation score: 0.6565
```

*(figure: learning curve, `max_depth=10`)*

```python
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
```
```
Final validation score: 0.6762
```

*(figure: learning curve, `max_depth=8`)*

```python
clf = DecisionTreeClassifier(max_depth=5)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
```
```
Final validation score: 0.7219
```

*(figure: learning curve, `max_depth=5`)*

```python
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
```
```
Final validation score: 0.7226
```

*(figure: learning curve, `max_depth=4`)*

The trend across these runs is that shallower trees generalize better on this data: the final validation AUC climbs from about 0.63 with no depth limit to about 0.72 at depths 4–5, and the gap between training and cross-validation scores narrows. The validation curve below scans the depth range systematically.

```python
# validation_curve computes training and cross-validation scores across a
# range of values for one hyperparameter (here, the tree depth)
from sklearn.model_selection import validation_curve


def plot_validation_curve(estimator, X, y, param_name, param_range,
                          ylim=(0, 1.1), cv=3, n_jobs=1, scoring=None):
    estimator_name = type(estimator).__name__
    plt.title("Validation curves for %s on %s"
              % (param_name, estimator_name))
    plt.ylim(*ylim); plt.grid()
    plt.xlim(min(param_range), max(param_range))
    plt.xlabel(param_name)
    plt.ylabel("Score")

    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs, scoring=scoring)

    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")
    # Report the cross-validation score at the largest parameter value
    print("Final test score: {:.4f}".format(test_scores_mean[-1]))
```
```python
clf = DecisionTreeClassifier(max_depth=8)
param_name = 'max_depth'
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

plot_validation_curve(clf, X_train, y_train,
                      param_name, param_range, scoring='roc_auc')
```
```
Final test score: 0.6409
```

*(figure: validation curve for `max_depth`)*
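The helper only prints the score at the largest depth; to pick the depth with the best mean cross-validation AUC, `validation_curve` can be called directly. A minimal sketch:

```python
# Depth with the highest mean cross-validation AUC over the scanned range
train_s, test_s = validation_curve(
    DecisionTreeClassifier(), X_train, y_train,
    param_name='max_depth', param_range=param_range,
    cv=3, scoring='roc_auc')
print(param_range[np.argmax(np.mean(test_s, axis=1))])
```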

```python
# Next, use a random forest to improve classification performance
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=27,  # n_estimators sets the number of trees in the forest
                             max_features=15,
                             max_depth=10)

scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc',
                         n_jobs=1)
print("ROC Random Forest: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))
```
```
ROC Random Forest: 0.7817 +/-0.0208
```
```python
clf.fit(X_train, y_train)
```
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=15, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=27, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
```
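The fitted forest also reports which features drove its splits; a short sketch listing the top ones (values vary between runs, since `random_state` is not fixed):

```python
# Rank features by the forest's impurity-based importances
ranked = sorted(zip(features.columns, clf.feature_importances_),
                key=lambda t: t[1], reverse=True)
for name, score in ranked[:5]:
    print("%-20s %.3f" % (name, score))
```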

```python
from sklearn.metrics import roc_auc_score
y_pred_proba = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))
```
```
ROC AUC: 0.7394
```
```python
# Use grid search to tune hyperparameters in parallel (GridSearchCV lives in
# sklearn.model_selection; older releases had it in sklearn.grid_search)
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [5, 11, 15, 21, 25, 31],
              'max_features': [5, 10, 15, 20],
              'max_depth': [3, 6, 9, 12],
              'criterion': ['gini', 'entropy']}
clf = GridSearchCV(RandomForestClassifier(), parameters, cv=3)
clf.fit(X_train, y_train)
```
```
GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 11, 15, 21, 25, 31], 'max_features': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 9, 12]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
```
```python
# GridSearchCV refits the best model on the full training set (refit=True),
# so clf now predicts with the best parameter combination
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba))
```
```
ROC AUC: 0.7551
```
```python
clf.best_params_
```
```
{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 15,
 'n_estimators': 21}
```
```python
clf.best_score_  # mean cross-validated score of the best parameter combination
```
```
0.78374999999999995
```
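Note that `scoring` was left at its default, so `best_score_` above is mean cross-validated *accuracy*, while the rest of the post evaluates AUC. A minimal sketch of tuning for AUC directly (re-running the search; results will differ from the numbers above):

```python
# Re-run the search, selecting the grid point by cross-validated AUC
clf_auc = GridSearchCV(RandomForestClassifier(), parameters, cv=3,
                       scoring='roc_auc')
clf_auc.fit(X_train, y_train)
print(clf_auc.best_params_, clf_auc.best_score_)
```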
