Titanic
Published: 2019-06-13


A classic introductory exercise in data analysis is predicting which Titanic passengers survived. The dataset can be downloaded from Kaggle.

  

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

titanic = pd.read_csv('D:/train.csv')
# print(titanic.head())
# print(titanic.describe())

# describe() shows the Age column has missing values, so fill them with the median
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
# print(titanic.describe())

# print(titanic['Sex'].unique())
# Convert the string values to numbers
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Port of embarkation
# print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

# Predict with linear regression
from sklearn.linear_model import LinearRegression
# Note: sklearn.cross_validation requires scikit-learn < 0.20; a sketch with the
# current model_selection API follows this script
from sklearn.cross_validation import KFold

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

# Collected out-of-fold predictions
predictions = []
# Train / test indices for each cross-validation fold
for train, test in kf:
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic['Survived'].iloc[train]
    # Fit on the training X, y of this fold, then predict on the held-out fold
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
# Evaluate the model: fraction of predictions that match the true labels
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
# print(accuracy)

# Predict with logistic regression
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
# print(scores.mean())

# Predict with a random forest
from sklearn.ensemble import RandomForestClassifier

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=8, min_samples_leaf=8)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())

# On feature extraction (crucial):
# - extract as many features as possible
# - compare how different features perform
# - feature extraction is a very important part of data mining
# - the features used so far already exist in the data; in real data mining there
#   are often no ready-made features and we have to construct them ourselves

# Combine columns: build a new feature, family size = siblings/spouses + parents/children
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# Length of the passenger's name
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))

import re

def get_title(name):
    # Use a regular expression to pull out the title (Mr, Mrs, ...)
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

titles = titanic['Name'].apply(get_title)
# print(pd.value_counts(titles))

# Different social ranks carry different titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7,
                 "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10,
                 "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    # Replace each title with a number the model can work with
    titles[titles == k] = v
print(pd.value_counts(titles))
titanic['Title'] = titles

# Feature selection / feature importance analysis:
# measure how much each feature contributes to the final result.
# For example, to gauge the importance of the Age column, first measure an error rate error1
# with the column untouched, then replace the column with noise (leaving every other column
# unchanged) and measure error2; the gap between the two error rates reflects how important
# that feature is.
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pylab as plt

# Candidate features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']
# Select features with a univariate F-test
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)

plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
# plt.show()

# Based on the importance analysis above, keep the four strongest features and rerun the random forest
predictors = ['Pclass', 'Sex', 'Fare', 'Title']
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=8)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())

# Several algorithms can also be trained together and their predictions averaged to reduce overfitting
from sklearn.ensemble import GradientBoostingClassifier

algorithms = [
    [LogisticRegression(random_state=1), ['Pclass', 'Sex', 'Fare', 'Title']],
    [RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=8),
     ['Pclass', 'Sex', 'Fare', 'Title']],
]

# Initialize the cross-validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Make predictions with each algorithm on this fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Predict on the test fold; astype(float) converts the dataframe columns to a numeric dtype
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Simple ensembling scheme: average the two algorithms' predicted probabilities
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over 0.5 becomes a 1 prediction, anything at or below 0.5 becomes a 0
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

# Put all the out-of-fold predictions together into one array
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)

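The sklearn.cross_validation module imported above was removed in scikit-learn 0.20, so the script as written only runs on old releases. Below is a minimal sketch of the same 3-fold baseline with the current sklearn.model_selection API; the preprocessing mirrors the script above, and the max_iter value and variable names are only illustrative, not part of the original post.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

titanic = pd.read_csv('D:/train.csv')
# Same preprocessing idea as the script above
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
titanic['Embarked'] = titanic['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X, y = titanic[predictors], titanic['Survived']

# KFold is no longer built from the number of rows; it takes n_splits
# and is handed to cross_val_score through the cv argument
kf = KFold(n_splits=3, shuffle=False)

for alg in (LogisticRegression(max_iter=1000, random_state=1),
            RandomForestClassifier(random_state=1, n_estimators=100,
                                   min_samples_split=8, min_samples_leaf=8)):
    scores = cross_val_score(alg, X, y, cv=kf)
    print(type(alg).__name__, scores.mean())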
 

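The feature-importance idea sketched in the comments (score the model, corrupt a single column while leaving the others untouched, and read the drop in accuracy as that column's importance) is essentially what scikit-learn ships as permutation_importance (available since 0.22). A rough sketch on the engineered features, assuming the titanic dataframe produced by the script above; the train/validation split and hyperparameters here are only for illustration.

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']
X_train, X_valid, y_train, y_valid = train_test_split(
    titanic[predictors].astype(float), titanic['Survived'], random_state=1)

model = RandomForestClassifier(random_state=1, n_estimators=100,
                               min_samples_split=8, min_samples_leaf=8)
model.fit(X_train, y_train)

# Shuffle each column a few times and measure how much the validation score drops
result = permutation_importance(model, X_valid, y_valid, n_repeats=10, random_state=1)
for name, score in sorted(zip(predictors, result.importances_mean), key=lambda t: -t[1]):
    print(f'{name:12s} {score:.3f}')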
Reposted from: https://www.cnblogs.com/xushu/p/8644056.html
