# In [2]: customer-churn classifier (logistic regression) - train, predict, score
# 1. Load the data
import pandas as pd
df = pd.read_excel('股票客户流失.xlsx')

# 2. Separate the feature columns from the target column
X = df.drop(columns='是否流失')
y = df['是否流失']

# 3. Hold out 20% of the rows as a test set; random_state pins the split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4. Build and fit the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Model use (1): predicted class labels
y_pred = model.predict(X_test)
print(y_pred[0:100])  # first 100 predicted labels

# Overall accuracy on the test set
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred); the original call had the two
# swapped, which is harmless for accuracy but wrong for asymmetric metrics.
score = accuracy_score(y_test, y_pred)
print(score)

# 6. Model use (2): predicted class probabilities
y_pred_proba = model.predict_proba(X_test)
print(y_pred_proba[0:5])  # class probabilities of the first 5 customers

# Recorded output of In [2]:
# [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]
# 0.7977288857345636
# [[0.82041491 0.17958509]
#  [0.84029613 0.15970387]
#  [0.79819342 0.20180658]
#  [0.62989192 0.37010808]
#  [0.61636611 0.38363389]]

# In [3]: confusion matrix of the test-set predictions
from sklearn.metrics import confusion_matrix
m = confusion_matrix(y_test, y_pred)  # arguments are (actual labels, predicted labels)
print(m)
# Recorded output of In [3]:
# [[968  93]
#  [192 156]]

# In [4]: same matrix with labelled axes - rows are actual classes, columns are predicted classes
a = pd.DataFrame(m, index=['0(实际不流失)', '1(实际流失)'], columns=['0(预测不流失)', '1(预测流失)'])
a
# Out[4]: (no output captured in this export)
# In [5]: per-class precision / recall / F1 on the test set
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))  # arguments are (actual labels, predicted labels)
# Recorded output of In [5]:
#               precision    recall  f1-score   support
#            0       0.83      0.91      0.87      1061
#            1       0.63      0.45      0.52       348
#     accuracy                           0.80      1409
#    macro avg       0.73      0.68      0.70      1409
# weighted avg       0.78      0.80      0.79      1409

# In [6]: second column of the probability matrix (the class-1 probability)
y_pred_proba[:, 1]
# Out[6]: array([0.17958509, 0.15970387, 0.20180658, ..., 0.04220544, 0.09782449, 0.63586739])

# In [7]: 1. Compute what the ROC curve needs: false positive rate (fpr, "false alarm rate"),
# true positive rate (tpr, "hit rate") and the score thresholds (thres)
from sklearn.metrics import roc_curve
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])

# In [8]: optional - inspect what roc_curve() returns (left disabled)
# print(roc_curve(y_test, y_pred_proba[:,1]))
# type(roc_curve(y_test, y_pred_proba[:,1]))
# len(roc_curve(y_test, y_pred_proba[:,1]))

# In [9]: 2. Tabulate threshold, false alarm rate and hit rate side by side.
# Built in one constructor call rather than by adding columns to an empty frame.
a = pd.DataFrame({'阈值': thres, '假警报率': fpr, '命中率': tpr})
a.head()
# Out[9]: (no output captured in this export)
# NOTE(review): the recorded prompts in this stretch run 13, 11, 11, 12 - the cells were
# executed out of order; confirm the notebook survives Restart Kernel -> Run All.

# In [13]: 3. Draw the ROC curve
import matplotlib.pyplot as plt
# A CJK-capable font must be selected before any Chinese text is drawn.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.plot(fpr, tpr)  # line chart of hit rate against false alarm rate
plt.gca().set_title('ROC曲线')
plt.gca().set_xlabel('FPR')
plt.gca().set_ylabel('TPR')
plt.show()

# In [11]: 4. Area under the ROC curve
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred_proba[:, 1])
score
# Out[11]: 0.8103854528908967

# In [11]: largest class-1 probability among the test rows
y_pred_proba[:, 1].max()
# Out[11]: 0.9303686064600186

# In [12]: both class probabilities per test row, highest class-1 probability first
a = pd.DataFrame(
    y_pred_proba,
    columns=['分类为0概率', '分类为1概率'],
).sort_values(by='分类为1概率', ascending=False)
a.head(15)
# Out[12]: (no output captured in this export)
# In [14]: ROC points for the class-1 probabilities: false positive rate,
# true positive rate and the thresholds they were evaluated at
from sklearn.metrics import roc_curve
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])

# In [15]: one row per threshold; later cells add a column to this frame
a = pd.DataFrame({
    '阈值': thres,
    '假警报率': fpr,
    '命中率': tpr,
})
a.head()
# Out[15]: (no output captured in this export)
# In [16]: TPR, FPR and their gap plotted against the threshold.
# The first ROC point is dropped from every series - presumably because roc_curve's
# first threshold is a sentinel above every score; confirm against the roc_curve docs.
plt.plot(
    thres[1:], tpr[1:],
    thres[1:], fpr[1:],
    thres[1:], (tpr - fpr)[1:],
)
plt.gca().set_xlabel('threshold')
plt.legend(['tpr', 'fpr', 'tpr-fpr'])
plt.gca().invert_xaxis()  # x axis runs from high threshold to low
plt.show()

# In [17]: KS value = largest gap between TPR and FPR
(tpr - fpr).max()
# Out[17]: 0.4744656418256471

# In [18]: store the gap per threshold so the row holding the KS value can be looked up
a['TPR-FPR'] = a['命中率'].sub(a['假警报率'])
a.head()
# Out[18]: (no output captured in this export)
# In [19]: another way to get the KS value - the largest entry of the TPR-FPR column.
# Series.max() is vectorized and skips NaN, unlike the builtin max() over a Series.
a['TPR-FPR'].max()
# Out[19]: 0.4744656418256471

# In [20]: row(s) where the gap equals the KS value, i.e. the matching threshold,
# false alarm rate and hit rate (a boolean mask keeps every tied row)
a[a['TPR-FPR'] == a['TPR-FPR'].max()]
# Out[20]: (no output captured in this export)
|
|