import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics import accuracy_score
# Load the Titanic passenger table that ships with seaborn.
titanic = sns.load_dataset('titanic')
titanic.head()
titanic.isna().sum()

# Remove the 'age' and 'deck' columns, then discard every row that still
# contains a missing value.
titanic = titanic.drop(columns=['age', 'deck']).dropna()
titanic.isna().sum()

# Columns used as candidate predictors for the chi-squared ranking.
feature_columns = ['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']
data = titanic.loc[:, feature_columns].copy()
data.head()
data.isna().sum()

# Encode the categorical columns as small non-negative integers so they can
# be passed to chi2 and the classifier.
sex = {'male': 0, 'female': 1}
data['sex'] = data['sex'].map(sex)
data.head()

ports = {'S': 0, 'C': 1, 'Q': 2}
who = {'man': 0, 'woman': 1, 'child': 2}
alone = {True: 1, False: 0}
data = data.assign(
    embarked=data['embarked'].map(ports),
    who=data['who'].map(who),
    alone=data['alone'].map(alone),
)
data.head()

# Feature matrix and target vector.
x = data.copy()
y = titanic['survived']
x.shape, y.shape

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# chi2 returns two arrays: the chi-squared statistics first, the p-values second.
f_score = chi2(x_train, y_train)
f_score

# Label the p-values with their feature names and order them from smallest
# to largest.
p_values = pd.Series(f_score[1], index=x_train.columns).sort_values()
p_values
# Author's observation: the p-values of 'who' and 'sex' are quite small, so
# these features are treated as highly important.
p_values.plot(kind='bar')

# Model with the two features whose p-values are very low.
lowest_two = ['who', 'sex']
x_train_2 = x_train[lowest_two]
x_test_2 = x_test[lowest_two]
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a random forest on the training split and print its test accuracy.

    Args:
        x_train: Feature matrix used to fit the classifier.
        x_test: Feature matrix the fitted classifier predicts on.
        y_train: Target labels corresponding to ``x_train``.
        y_test: Target labels corresponding to ``x_test``; compared against
            the predictions to compute accuracy.

    Returns:
        None. The accuracy is written to stdout.
    """
    # Fixed seed keeps runs comparable across feature subsets; n_jobs=-1
    # asks scikit-learn to use all available cores.
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
%%time
run_randomForest(x_train_2, x_test_2, y_train, y_test)
# p値が低い3変数でモデリング
%%time
x_train_3 = x_train[['who', 'sex', 'pclass']]
x_test_3 = x_test[['who', 'sex', 'pclass']]
run_randomForest(x_train_3, x_test_3, y_train, y_test)
# p値が低い4変数でモデリング
x_train_4 = x_train[['who', 'sex', 'pclass', 'embarked']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'embarked']]
%%time
run_randomForest(x_train_4, x_test_4, y_train, y_test)
# p値が低い4変数でモデリング2
x_train_4 = x_train[['who', 'sex', 'pclass', 'alone']]
x_test_4 = x_test[['who', 'sex', 'pclass', 'alone']]
%%time
run_randomForest(x_train_4, x_test_4, y_train, y_test)
# p値が低い5変数でモデリング
x_train_5 = x_train[['who', 'sex', 'pclass', 'embarked', 'alone']]
x_test_5 = x_test[['who', 'sex', 'pclass', 'embarked', 'alone']]
%%time
run_randomForest(x_train_5, x_test_5, y_train, y_test)
%%time
run_randomForest(x_train, x_test, y_train, y_test)