In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics import accuracy_score
In [0]:
# Load the Titanic passenger dataset bundled with seaborn
# (downloads from the seaborn-data repository on first use, then cached).
titanic =  sns.load_dataset('titanic')
In [6]:
# Preview the first five rows to inspect columns and value formats.
titanic.head()
Out[6]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [7]:
# Count missing values per column: age (177) and deck (688) are heavily
# missing; embarked/embark_town have 2 missing rows each.
titanic.isnull().sum()
Out[7]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
In [0]:
# Drop the two columns with too many missing values to be useful
# (age: 177 NaNs, deck: 688 NaNs — see the null counts above).
# Reassignment instead of inplace=True: same result, but idempotent-friendly
# and chainable (pandas discourages inplace).
titanic = titanic.drop(columns=['age', 'deck'])
In [0]:
# Drop the remaining rows with any NaN (the 2 rows missing
# embarked/embark_town), leaving a fully complete frame.
titanic = titanic.dropna()
In [10]:
# Verify the cleaning worked: every column should now report 0 missing.
titanic.isnull().sum()
Out[10]:
survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64
In [0]:
# Keep only the candidate predictor columns (categorical / small-integer
# features suitable for a chi-squared test); .copy() avoids
# SettingWithCopyWarning on the encodings below.
data = titanic[['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']].copy()
In [28]:
# Preview the selected feature columns before encoding.
data.head()
Out[28]:
pclass sex sibsp parch embarked who alone
0 3 male 1 0 S man False
1 1 female 1 0 C woman False
2 3 female 0 0 S woman True
3 1 female 1 0 S woman False
4 3 male 0 0 S man True
In [29]:
# Confirm the feature subset carries no missing values.
data.isnull().sum()
Out[29]:
pclass      0
sex         0
sibsp       0
parch       0
embarked    0
who         0
alone       0
dtype: int64
In [0]:
# Encode sex as an integer: male -> 0, female -> 1 (chi2 needs numeric,
# non-negative features). Mapping dict inlined — it is not reused elsewhere.
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
In [31]:
# Check that the sex column is now numeric (0/1).
data.head()
Out[31]:
pclass sex sibsp parch embarked who alone
0 3 0 1 0 S man False
1 1 1 1 0 C woman False
2 3 1 0 0 S woman True
3 1 1 1 0 S woman False
4 3 0 0 0 S man True
In [0]:
# Encode the embarkation port as an integer code: S -> 0, C -> 1, Q -> 2.
data['embarked'] = data['embarked'].map({'S': 0, 'C': 1, 'Q': 2})
In [0]:
# Encode the who category as an integer: man -> 0, woman -> 1, child -> 2.
data['who'] = data['who'].map({'man': 0, 'woman': 1, 'child': 2})
In [0]:
# Convert the boolean alone flag to 0/1 integers; astype(int) on a
# null-free bool column is equivalent to mapping {True: 1, False: 0}.
data['alone'] = data['alone'].astype(int)
In [35]:
# All features are now non-negative integers — ready for chi2.
data.head()
Out[35]:
pclass sex sibsp parch embarked who alone
0 3 0 1 0 0 0 0
1 1 1 1 0 1 1 0
2 3 1 0 0 0 1 1
3 1 1 1 0 0 1 0
4 3 0 0 0 0 0 1

Run the chi-squared (χ²) feature-selection test

In [0]:
# Feature matrix x (encoded predictors) and target y (survived, 0/1).
x = data.copy()
y = titanic['survived']
In [37]:
# Sanity-check row counts align: 889 samples, 7 features.
x.shape, y.shape
Out[37]:
((889, 7), (889,))
In [0]:
# 80/20 train/test split; fixed random_state for reproducibility.
# NOTE(review): no stratify=y here — class balance in the split is not
# guaranteed; consider stratifying for a classification target.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [0]:
# Chi-squared test of each feature against the target. chi2 requires
# non-negative feature values — satisfied by the integer encodings above.
f_score = chi2(x_train, y_train)
In [41]:
f_score
# First array: chi-squared statistics; second array: p-values.
Out[41]:
(array([ 22.65169202, 152.91534343,   0.52934285,  10.35663782,
         16.13255653, 161.42431175,  13.4382363 ]),
 array([1.94189138e-06, 3.99737147e-35, 4.66883271e-01, 1.29009955e-03,
        5.90599986e-05, 5.52664700e-37, 2.46547298e-04]))
In [0]:
# Wrap the p-values (f_score[1]) in a Series indexed by feature name and
# sort ascending so the most significant features come first.
# Built as a single chain instead of sort_values(inplace=True) — same
# result, no in-place mutation, re-run-safe.
p_values = pd.Series(f_score[1], index=x_train.columns).sort_values(ascending=True)
In [43]:
p_values
# who and sex have by far the smallest p-values, so they are the most
# important features.
Out[43]:
who         5.526647e-37
sex         3.997371e-35
pclass      1.941891e-06
embarked    5.906000e-05
alone       2.465473e-04
parch       1.290100e-03
sibsp       4.668833e-01
dtype: float64
In [45]:
# Bar chart of p-values per feature (ascending left to right);
# plot(kind='bar') is the keyword form of the .plot.bar() accessor.
p_values.plot(kind='bar')
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24cde17c88>
In [0]:
# Model using the two features with the lowest p-values.
x_train_2 = x_train[['who', 'sex']]
x_test_2 = x_test[['who', 'sex']]
In [0]:
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest on the training split and report test accuracy.

    Parameters are the usual train/test feature frames and target series.
    Prints the accuracy (kept for backward compatibility) and also returns it,
    so feature subsets can be compared programmatically instead of by reading
    cell output.
    """
    # Fixed random_state for reproducible trees; n_jobs=-1 uses all cores.
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy: ', acc)
    return acc
In [48]:
%%time
# Baseline with only the two strongest features (who, sex).
run_randomForest(x_train_2, x_test_2, y_train, y_test)
Accuracy:  0.7191011235955056
CPU times: user 229 ms, sys: 40.9 ms, total: 270 ms
Wall time: 352 ms
In [51]:
# p値が低い3変数でモデリング
%%time
x_train_3 = x_train[['who', 'sex', 'pclass']]
x_test_3 = x_test[['who', 'sex', 'pclass']]

run_randomForest(x_train_3, x_test_3, y_train, y_test)
Accuracy:  0.7415730337078652
CPU times: user 250 ms, sys: 35 ms, total: 285 ms
Wall time: 357 ms
In [0]:
# Model with the four lowest-p-value features.
x_train_4 = x_train[['who', 'sex', 'pclass', 'embarked']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'embarked']]
In [53]:
%%time
# Evaluate the (who, sex, pclass, embarked) subset.
run_randomForest(x_train_4, x_test_4, y_train, y_test)
Accuracy:  0.7584269662921348
CPU times: user 246 ms, sys: 39.3 ms, total: 285 ms
Wall time: 354 ms
In [0]:
# Second 4-feature model: swap embarked for alone.
# NOTE(review): this rebinds x_train_4/x_test_4 to a DIFFERENT feature set —
# the earlier cell's displayed result is now stale; distinct names
# (e.g. x_train_4b) would avoid hidden-state confusion on re-runs.
x_train_4 = x_train[['who', 'sex', 'pclass', 'alone']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'alone']]
In [55]:
%%time
# Evaluate the (who, sex, pclass, alone) subset.
run_randomForest(x_train_4, x_test_4, y_train, y_test)
Accuracy:  0.7528089887640449
CPU times: user 235 ms, sys: 37.6 ms, total: 272 ms
Wall time: 345 ms
In [0]:
# Model with the five lowest-p-value features (all but sibsp and parch).
x_train_5 = x_train[['who', 'sex', 'pclass', 'embarked', 'alone']]
x_test_5 = x_test[['who', 'sex', 'pclass', 'embarked', 'alone']]
In [59]:
%%time
# Evaluate the 5-feature subset.
run_randomForest(x_train_5, x_test_5, y_train, y_test)
Accuracy:  0.7528089887640449
CPU times: user 236 ms, sys: 44.5 ms, total: 280 ms
Wall time: 348 ms
In [60]:
%%time
# Baseline with all 7 features — scores lower than the 4-feature subset,
# suggesting the chi2-based selection removed noise.
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy:  0.7359550561797753
CPU times: user 255 ms, sys: 31.9 ms, total: 287 ms
Wall time: 351 ms
In [0]: