Modeling, Prediction, and Evaluation with scikit-learn: Titanic Survival Prediction

 LibraryPKU 2018-06-21


# coding: utf-8

# In[142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# In[143]:
# Load the training data
titanic = pd.read_csv('train.csv')
titanic.head(5)
# print(titanic.describe())

# In[144]:
# Fill missing ages with the median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
print(titanic.describe())

# In[145]:
print(titanic['Sex'].unique())
# Replace all the occurrences of male with the number 0.
# Convert the string values into numbers so the model can work with them.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# In[146]:
# Port of embarkation
print(titanic['Embarked'].unique())
# Fill missing embarkation values with 'S' and encode the ports as integers
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
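The same encoding can be written more compactly with pandas' Series.map. A minimal sketch, meant to run instead of (not after) the .loc assignments above, since map would turn already-encoded values into NaN:

# Alternative: encode with Series.map instead of the .loc assignments above
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
titanic['Embarked'] = titanic['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})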

# In[147]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
# (note: sklearn.cross_validation was removed in scikit-learn 0.20;
# newer versions use sklearn.model_selection instead)
from sklearn.cross_validation import KFold
# The columns we'll use to predict the target
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
# The predictions from each fold
predictions = []
# Loop over the cross-validation train/test splits
for train, test in kf:
    # The predictors we're using to train the algorithm.
    # Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic['Survived'].iloc[train]
    # Train the algorithm using the predictors and target:
    # fit on the training X and y so the model can make judgments
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# In[148]:
import numpy as np
# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)
# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
# Evaluate the model: the fraction of predictions that match the labels
accuracy = len(predictions[predictions == titanic['Survived']]) / len(predictions)
print(accuracy)
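The same accuracy can also be computed in one idiomatic line, taking the mean of the boolean comparison:

# Equivalent one-liner: the mean of a boolean array is the fraction of True values
accuracy = (predictions == titanic['Survived']).mean()
print(accuracy)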

# In[149]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds
# (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
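Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20. A sketch of the same evaluation with the current sklearn.model_selection API:

# Same evaluation with the modern API (scikit-learn >= 0.18)
from sklearn.model_selection import cross_val_score
alg = LogisticRegression(random_state=1)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())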

# ### Random Forest

# In[150]:
# Apply the same preprocessing to the test set
titanic_test = pd.read_csv('test.csv')
# Use the training-set median so train and test are filled consistently
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2

# In[151]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Initialize our algorithm with the default parameters:
# random_state=1 makes the random choices reproducible across runs;
# without it, two runs would use different random values
# n_estimators is the number of decision trees to build
# min_samples_split: a node holding fewer samples than this (here, 2) is not split further
# min_samples_leaf controls the minimum number of samples at a leaf node
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross validation folds
# (much simpler than what we did before)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# In[152]:
# Build 100 trees and loosen the split/leaf limits to reduce overfitting
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
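Rather than hand-tuning n_estimators, min_samples_split, and min_samples_leaf as above, the search can be automated. A minimal sketch with GridSearchCV (from sklearn.model_selection in current scikit-learn); the grid values are illustrative assumptions, not values from the original notebook:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Try every combination of the candidate parameter values with 3-fold CV
param_grid = {
    'n_estimators': [10, 50, 100],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
}
search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
search.fit(titanic[predictors], titanic['Survived'])
print(search.best_params_, search.best_score_)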

# ## On feature extraction (crucial)
# - Extract as many features as possible
# - Compare how different features perform
# - Feature extraction is a very important part of data mining
# - The features used so far were already in the data; in real data mining
#   there are often no ready-made features, and we must construct our own
#

# In[153]:
# Generating a familysize column:
# a self-made feature, the size of the family (siblings/spouses + parents/children)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# The .apply method generates a new series: the length of the name
# (reportedly, wealthy families abroad tend to use long names)
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))

# In[154]:
import re

# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title.
    # Titles always consist of capital and lowercase letters and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ''

# Get all the titles and print how often each one occurs;
# titles with similar counts can share the same code.
titles = titanic['Name'].apply(get_title)
print(pd.value_counts(titles))
# Different social classes use different titles.
# Map each title to an integer. Some titles are very rare
# and are compressed into the same code as similar titles.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Major': 7, 'Col': 7, 'Mlle': 8,
                 'Mme': 8, 'Don': 9, 'Lady': 10, 'Countess': 10, 'Jonkheer': 10, 'Sir': 9, 'Capt': 7, 'Ms': 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v
# Verify that we converted everything
print(pd.value_counts(titles))
# Add in the title column
titanic['Title'] = titles

# In[155]:
# Feature selection / feature importance analysis:
# measure how much each feature affects the final result.
# For example, to gauge the importance of the Age column:
# first measure the error rate as-is (error1), then replace the Age values
# with noise while leaving the other columns unchanged and measure the
# error rate again (error2).
# The gap between the two error rates reflects the feature's importance.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pylab as plt
# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
# Get the raw p-values for each feature, and transform them into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores. See how 'Pclass', 'Sex', 'Title', and 'Fare' are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
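The comments at the top of this cell actually describe permutation importance (noising one column and measuring the drop in score), while SelectKBest above uses a univariate F-test instead. A sketch of the noise-based approach with permutation_importance, available in scikit-learn 0.22 and later:

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance  # scikit-learn >= 0.22

rf = RandomForestClassifier(random_state=1, n_estimators=50)
rf.fit(titanic[predictors], titanic['Survived'])
# Shuffle each feature column in turn and measure how much the score drops
result = permutation_importance(rf, titanic[predictors], titanic['Survived'],
                                n_repeats=10, random_state=1)
for name, importance in zip(predictors, result.importances_mean):
    print(name, importance)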

# Based on the feature importance analysis above, pick the four most
# important features and rerun the random forest.
predictors = ['Pclass', 'Sex', 'Fare', 'Title']
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
# Cross-validate
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# The score does not improve here; the point of this exercise is to practice
# feature selection in random forests, which matters in real data mining.
print(scores.mean())

# ### Ensembling multiple algorithms (to reduce overfitting)

# In[156]:
# A common trick in competitions: ensemble several algorithms and average
# their predictions to reduce overfitting.
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
# GradientBoostingClassifier is another tree-ensemble algorithm: it combines
# many weak classifiers into a strong one.
# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression
# and everything with the gradient boosting.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']],
    [LogisticRegression(random_state=1), ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']]
]
# Initialize the cross validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The astype(float) is necessary to convert the dataframe to all floats.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme: just average the predictions of the
    # two algorithms to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over 0.5 becomes a 1 prediction, and 0.5 or below becomes a 0 prediction
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)
# Put all the predictions together into one array
predictions = np.concatenate(predictions, axis=0)
# The fraction of predictions that match the labels
accuracy = len(predictions[predictions == titanic['Survived']]) / len(predictions)
print(accuracy)
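scikit-learn also ships this averaging scheme as a built-in estimator: VotingClassifier with voting='soft' averages the predict_proba outputs of its members. A minimal sketch, not part of the original notebook, assuming both models use the same feature list:

from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']
# voting='soft' averages the class probabilities, like the manual loop above
voter = VotingClassifier(estimators=[
    ('gbc', GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)),
    ('lr', LogisticRegression(random_state=1)),
], voting='soft')
scores = cross_val_score(voter, titanic[features].astype(float), titanic['Survived'], cv=3)
print(scores.mean())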

# In[157]:
# Repeat the title extraction on the test set
# (note: the names must come from titanic_test, not the training set)
titles = titanic_test['Name'].apply(get_title)
print(pd.value_counts(titles))
# Map each title to an integer; rare titles share a code with similar ones.
# The test set also contains the rare title 'Dona', which does not occur
# in the training set, so it is added to the mapping here.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Major': 7, 'Col': 7, 'Mlle': 8,
                 'Mme': 8, 'Don': 9, 'Dona': 9, 'Lady': 10, 'Countess': 10, 'Jonkheer': 10, 'Sir': 9, 'Capt': 7, 'Ms': 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v
# Add in the title column
titanic_test['Title'] = titles
print(pd.value_counts(titanic_test['Title']))
# Now, we add the family size column.
titanic_test['FamilySize'] = titanic_test['SibSp'] + titanic_test['Parch']

# In[158]:
predictors = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), predictors]
]
full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data
    alg.fit(titanic[predictors], titanic['Survived'])
    # Predict class probabilities on the test set
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)
# The gradient boosting classifier generates better predictions, so we weight it higher
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
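To turn these averaged probabilities into a Kaggle submission, threshold them at 0.5 and pair them with the test set's PassengerId column. A sketch; the output filename is an arbitrary choice:

# Threshold the averaged probabilities into 0/1 predictions
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
submission = pd.DataFrame({
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions.astype(int),
})
submission.to_csv('kaggle_submission.csv', index=False)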

