import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
# get dataset
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset (sklearn Bunch: .data, .target, .feature_names, ...)
data = load_breast_cancer()
data.keys()
print(data.DESCR)
# Features as a DataFrame so column names are available for selection later
x = pd.DataFrame(data=data.data, columns=data.feature_names)
x.head()
y = data.target
# 80/20 hold-out split; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train.shape, x_test.shape
# Keep only features whose importance is above the mean importance of all features
# (SelectFromModel's default threshold is the mean of feature_importances_)
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
sel.fit(x_train, y_train)
sel.get_support()
# Boolean support mask -> names of the selected columns
features = x_train.columns[sel.get_support()]
features
len(features)
# 10 features were selected
np.mean(sel.estimator_.feature_importances_)
sel.estimator_.feature_importances_
# Reduce train/test matrices to the selected columns
x_train_rfc = sel.transform(x_train)
x_test_rfc = sel.transform(x_test)
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a RandomForestClassifier on the training split and report test accuracy.

    Parameters are the train/test feature matrices and label vectors.
    Prints the accuracy (as before) and additionally returns it so callers
    can collect scores programmatically. Fixed random_state for reproducibility.
    """
    # Original body had lost its indentation in the notebook->text export; restored here.
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy: ', acc)
    return acc
# Timing comparison: selected features vs. all 30 features (IPython cell magic)
%%time
run_randomForest(x_train_rfc, x_test_rfc, y_train, y_test)
%%time
run_randomForest(x_train, x_test, y_train, y_test)
from sklearn.feature_selection import RFE
# Recursive feature elimination: keep the 15 most important features
sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), n_features_to_select=15)
sel.fit(x_train, y_train)
sel.get_support()
# Names of the 15 retained columns
features = x_train.columns[sel.get_support()]
features
len(features)
x_train_rfe = sel.transform(x_train)
x_test_rfe = sel.transform(x_test)
# Timing/accuracy: RFE-selected subset vs. all features
%%time
run_randomForest(x_train_rfe, x_test_rfe, y_train, y_test)
%%time
run_randomForest(x_train, x_test, y_train, y_test)
from sklearn.ensemble import GradientBoostingClassifier
# RFE with a GradientBoosting ranker: keep the 12 most important features
sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=12)
sel.fit(x_train, y_train)
sel.get_support()
# Names of the 12 retained columns
features = x_train.columns[sel.get_support()]
features
len(features)
x_train_rfe = sel.transform(x_train)
x_test_rfe = sel.transform(x_test)
# Timing/accuracy: GB-selected subset vs. all features
%%time
run_randomForest(x_train_rfe, x_test_rfe, y_train, y_test)
%%time
run_randomForest(x_train, x_test, y_train, y_test)
# The 12-feature count above was arbitrary; sweep 1..30 (all columns) with a
# for-loop to find the number of features that actually scores best.
# NOTE: the loop body had lost its indentation in the notebook->text export; restored.
for index in range(1, 31):
    sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=index)
    sel.fit(x_train, y_train)
    x_train_rfe = sel.transform(x_train)
    x_test_rfe = sel.transform(x_test)
    print('Selected Feature: ', index)
    run_randomForest(x_train_rfe, x_test_rfe, y_train, y_test)
    print()
# Result: the score was best with 6 features.
# Refit with the winning count (6 features) and inspect which columns were kept
sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=6)
sel.fit(x_train, y_train)
x_train_rfe = sel.transform(x_train)
x_test_rfe = sel.transform(x_test)
print('Selected Feature: ', 6)
run_randomForest(x_train_rfe, x_test_rfe, y_train, y_test)
print()
# Names of the 6 selected columns
features = x_train.columns[sel.get_support()]
features
# The 15-feature count for the RandomForest-based RFE was also arbitrary;
# sweep 1..30 to find how many features are actually best for it too.
# NOTE: the loop body had lost its indentation in the notebook->text export; restored.
for index in range(1, 31):
    sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), n_features_to_select=index)
    sel.fit(x_train, y_train)
    x_train_rfe = sel.transform(x_train)
    x_test_rfe = sel.transform(x_test)
    print('Selected Feature: ', index)
    run_randomForest(x_train_rfe, x_test_rfe, y_train, y_test)
    print()
# Result: the score peaked at 17 features; overall, GradientBoosting-based feature selection scored better than the RandomForest-based one.