In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics import accuracy_score
In [0]:
# Load the Titanic passenger dataset bundled with seaborn
# (downloads from the seaborn-data repository on first use, then cached).
titanic =  sns.load_dataset('titanic')
In [6]:
# Preview the first five rows to inspect columns and value formats.
titanic.head()
Out[6]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [7]:
# Count missing values per column: age (177) and deck (688) are heavily
# missing; embarked/embark_town have 2 missing rows each.
titanic.isnull().sum()
Out[7]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
In [0]:
# Drop the two columns with too many missing values to be useful
# (age: 177 NaNs, deck: 688 NaNs — see the null counts above).
# Reassignment instead of inplace=True: same result, but idempotent-friendly
# and chainable (pandas discourages inplace).
titanic = titanic.drop(columns=['age', 'deck'])
In [0]:
# Drop the remaining rows with any NaN (the 2 rows missing
# embarked/embark_town), leaving a fully complete frame.
titanic = titanic.dropna()
In [10]:
# Verify the cleaning worked: every column should now report 0 missing.
titanic.isnull().sum()
Out[10]:
survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64
In [0]:
# Keep only the candidate predictor columns (categorical / small-integer
# features suitable for a chi-squared test); .copy() avoids
# SettingWithCopyWarning on the encodings below.
data = titanic[['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']].copy()
In [28]:
# Preview the selected feature columns before encoding.
data.head()
Out[28]:
pclass sex sibsp parch embarked who alone
0 3 male 1 0 S man False
1 1 female 1 0 C woman False
2 3 female 0 0 S woman True
3 1 female 1 0 S woman False
4 3 male 0 0 S man True
In [29]:
# Confirm the feature subset carries no missing values.
data.isnull().sum()
Out[29]:
pclass      0
sex         0
sibsp       0
parch       0
embarked    0
who         0
alone       0
dtype: int64
In [0]:
# Encode sex as an integer: male -> 0, female -> 1 (chi2 needs numeric,
# non-negative features). Mapping dict inlined — it is not reused elsewhere.
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
In [31]:
# Check that the sex column is now numeric (0/1).
data.head()
Out[31]:
pclass sex sibsp parch embarked who alone
0 3 0 1 0 S man False
1 1 1 1 0 C woman False
2 3 1 0 0 S woman True
3 1 1 1 0 S woman False
4 3 0 0 0 S man True
In [0]:
# Encode the embarkation port as an integer code: S -> 0, C -> 1, Q -> 2.
data['embarked'] = data['embarked'].map({'S': 0, 'C': 1, 'Q': 2})
In [0]:
# Encode the who category as an integer: man -> 0, woman -> 1, child -> 2.
data['who'] = data['who'].map({'man': 0, 'woman': 1, 'child': 2})
In [0]:
# Convert the boolean alone flag to 0/1 integers; astype(int) on a
# null-free bool column is equivalent to mapping {True: 1, False: 0}.
data['alone'] = data['alone'].astype(int)
In [35]:
# All features are now non-negative integers — ready for chi2.
data.head()
Out[35]:
pclass sex sibsp parch embarked who alone
0 3 0 1 0 0 0 0
1 1 1 1 0 1 1 0
2 3 1 0 0 0 1 1
3 1 1 1 0 0 1 0
4 3 0 0 0 0 0 1

Run the chi-squared (χ²) feature-selection test

In [0]:
# Feature matrix x (encoded predictors) and target y (survived, 0/1).
x = data.copy()
y = titanic['survived']
In [37]:
# Sanity-check row counts align: 889 samples, 7 features.
x.shape, y.shape
Out[37]:
((889, 7), (889,))
In [0]:
# 80/20 train/test split; fixed random_state for reproducibility.
# NOTE(review): no stratify=y here — class balance in the split is not
# guaranteed; consider stratifying for a classification target.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [0]:
# Chi-squared test of each feature against the target. chi2 requires
# non-negative feature values — satisfied by the integer encodings above.
f_score = chi2(x_train, y_train)
In [41]:
f_score
# First array: chi-squared statistics; second array: p-values.
Out[41]:
(array([ 22.65169202, 152.91534343,   0.52934285,  10.35663782,
         16.13255653, 161.42431175,  13.4382363 ]),
 array([1.94189138e-06, 3.99737147e-35, 4.66883271e-01, 1.29009955e-03,
        5.90599986e-05, 5.52664700e-37, 2.46547298e-04]))
In [0]:
# Wrap the p-values (f_score[1]) in a Series indexed by feature name and
# sort ascending so the most significant features come first.
# Built as a single chain instead of sort_values(inplace=True) — same
# result, no in-place mutation, re-run-safe.
p_values = pd.Series(f_score[1], index=x_train.columns).sort_values(ascending=True)
In [43]:
p_values
# who and sex have by far the smallest p-values, so they are the most
# important features.
Out[43]:
who         5.526647e-37
sex         3.997371e-35
pclass      1.941891e-06
embarked    5.906000e-05
alone       2.465473e-04
parch       1.290100e-03
sibsp       4.668833e-01
dtype: float64
In [45]:
# Bar chart of p-values per feature (ascending left to right);
# plot(kind='bar') is the keyword form of the .plot.bar() accessor.
p_values.plot(kind='bar')
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24cde17c88>
In [0]:
# Model using the two features with the lowest p-values.
x_train_2 = x_train[['who', 'sex']]
x_test_2 = x_test[['who', 'sex']]
In [0]:
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest on the training split and report test accuracy.

    Parameters are the usual train/test feature frames and target series.
    Prints the accuracy (kept for backward compatibility) and also returns it,
    so feature subsets can be compared programmatically instead of by reading
    cell output.
    """
    # Fixed random_state for reproducible trees; n_jobs=-1 uses all cores.
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy: ', acc)
    return acc
In [48]:
%%time
# Baseline with only the two strongest features (who, sex).
run_randomForest(x_train_2, x_test_2, y_train, y_test)
Accuracy:  0.7191011235955056
CPU times: user 229 ms, sys: 40.9 ms, total: 270 ms
Wall time: 352 ms
In [51]:
# p値が低い3変数でモデリング
%%time
x_train_3 = x_train[['who', 'sex', 'pclass']]
x_test_3 = x_test[['who', 'sex', 'pclass']]

run_randomForest(x_train_3, x_test_3, y_train, y_test)
Accuracy:  0.7415730337078652
CPU times: user 250 ms, sys: 35 ms, total: 285 ms
Wall time: 357 ms
In [0]:
# Model with the four lowest-p-value features.
x_train_4 = x_train[['who', 'sex', 'pclass', 'embarked']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'embarked']]
In [53]:
%%time
# Evaluate the (who, sex, pclass, embarked) subset.
run_randomForest(x_train_4, x_test_4, y_train, y_test)
Accuracy:  0.7584269662921348
CPU times: user 246 ms, sys: 39.3 ms, total: 285 ms
Wall time: 354 ms
In [0]:
# Second 4-feature model: swap embarked for alone.
# NOTE(review): this rebinds x_train_4/x_test_4 to a DIFFERENT feature set —
# the earlier cell's displayed result is now stale; distinct names
# (e.g. x_train_4b) would avoid hidden-state confusion on re-runs.
x_train_4 = x_train[['who', 'sex', 'pclass', 'alone']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'alone']]
In [55]:
%%time
# Evaluate the (who, sex, pclass, alone) subset.
run_randomForest(x_train_4, x_test_4, y_train, y_test)
Accuracy:  0.7528089887640449
CPU times: user 235 ms, sys: 37.6 ms, total: 272 ms
Wall time: 345 ms
In [0]:
# Model with the five lowest-p-value features (all but sibsp and parch).
x_train_5 = x_train[['who', 'sex', 'pclass', 'embarked', 'alone']]
x_test_5 = x_test[['who', 'sex', 'pclass', 'embarked', 'alone']]
In [59]:
%%time
# Evaluate the 5-feature subset.
run_randomForest(x_train_5, x_test_5, y_train, y_test)
Accuracy:  0.7528089887640449
CPU times: user 236 ms, sys: 44.5 ms, total: 280 ms
Wall time: 348 ms
In [60]:
%%time
# Baseline with all 7 features — scores lower than the 4-feature subset,
# suggesting the chi2-based selection removed noise.
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy:  0.7359550561797753
CPU times: user 255 ms, sys: 31.9 ms, total: 287 ms
Wall time: 351 ms
In [0]: