In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
In [3]:
# Load the first 20k rows of the Santander customer-satisfaction training set.
# NOTE(review): hardcoded absolute Colab/Drive path — breaks outside this
# environment; consider a configurable DATA_DIR.
data = pd.read_csv('/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv', nrows=20000)
data.head()
Out[3]:
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 imp_op_var40_ult1 imp_op_var41_comer_ult1 imp_op_var41_comer_ult3 imp_op_var41_efect_ult1 imp_op_var41_efect_ult3 imp_op_var41_ult1 imp_op_var39_efect_ult1 imp_op_var39_efect_ult3 imp_op_var39_ult1 imp_sal_var16_ult1 ind_var1_0 ind_var1 ind_var2_0 ind_var2 ind_var5_0 ind_var5 ind_var6_0 ind_var6 ind_var8_0 ind_var8 ind_var12_0 ind_var12 ind_var13_0 ind_var13_corto_0 ind_var13_corto ind_var13_largo_0 ind_var13_largo ind_var13_medio_0 ind_var13_medio ind_var13 ... saldo_medio_var5_ult1 saldo_medio_var5_ult3 saldo_medio_var8_hace2 saldo_medio_var8_hace3 saldo_medio_var8_ult1 saldo_medio_var8_ult3 saldo_medio_var12_hace2 saldo_medio_var12_hace3 saldo_medio_var12_ult1 saldo_medio_var12_ult3 saldo_medio_var13_corto_hace2 saldo_medio_var13_corto_hace3 saldo_medio_var13_corto_ult1 saldo_medio_var13_corto_ult3 saldo_medio_var13_largo_hace2 saldo_medio_var13_largo_hace3 saldo_medio_var13_largo_ult1 saldo_medio_var13_largo_ult3 saldo_medio_var13_medio_hace2 saldo_medio_var13_medio_hace3 saldo_medio_var13_medio_ult1 saldo_medio_var13_medio_ult3 saldo_medio_var17_hace2 saldo_medio_var17_hace3 saldo_medio_var17_ult1 saldo_medio_var17_ult3 saldo_medio_var29_hace2 saldo_medio_var29_hace3 saldo_medio_var29_ult1 saldo_medio_var29_ult3 saldo_medio_var33_hace2 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39205.170000 0
1 3 2 34 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 300.0 122.22 300.0 240.75 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 49278.030000 0
2 4 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 3.00 2.07 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 67333.770000 0
3 8 2 37 0.0 195.0 195.0 0.0 0.0 0 0 0.0 195.0 195.0 0.0 0.0 195.0 0.0 0.0 195.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 91.56 138.84 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 64007.970000 0
4 10 2 39 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 ... 40501.08 13501.47 0.0 0.0 0.0 0.0 0.0 0.0 85501.89 85501.89 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 117310.979016 0

5 rows × 371 columns

In [4]:
# Separate the predictors from the TARGET label column.
x = data.drop(columns='TARGET')
y = data['TARGET']

x.shape, y.shape
Out[4]:
((20000, 370), (20000,))
In [0]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

Constant Feature Removal

In [15]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
# threshold=0 removes only features whose variance is exactly zero (constant columns).
# Fit on the training fold only, so no test-set statistics leak into selection.
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
Out[15]:
VarianceThreshold(threshold=0)
In [17]:
# feature numbers after removing Constant Features
# get_support() is a boolean mask over the original columns (True = kept),
# so summing it counts the surviving features.
constant_filter.get_support().sum()
Out[17]:
291
In [19]:
# Invert the support mask: True now marks the *constant* (dropped) features.
constant_list = [not keep for keep in constant_filter.get_support()]
constant_list[:20]
Out[19]:
[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]
In [20]:
# check constant features
# Index the column labels with the inverted mask to list the dropped columns by name.
x.columns[constant_list]
Out[20]:
Index(['ind_var2_0', 'ind_var2', 'ind_var13_medio_0', 'ind_var13_medio',
       'ind_var18_0', 'ind_var18', 'ind_var27_0', 'ind_var28_0', 'ind_var28',
       'ind_var27', 'ind_var34_0', 'ind_var34', 'ind_var41', 'ind_var46_0',
       'ind_var46', 'num_var13_medio_0', 'num_var13_medio', 'num_var18_0',
       'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27',
       'num_var34_0', 'num_var34', 'num_var41', 'num_var46_0', 'num_var46',
       'saldo_var13_medio', 'saldo_var18', 'saldo_var28', 'saldo_var27',
       'saldo_var34', 'saldo_var41', 'saldo_var46',
       'delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3',
       'delta_imp_reemb_var33_1y3', 'delta_imp_trasp_var17_out_1y3',
       'delta_imp_trasp_var33_out_1y3', 'delta_num_reemb_var33_1y3',
       'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_out_1y3',
       'imp_amort_var18_hace3', 'imp_amort_var18_ult1',
       'imp_amort_var34_hace3', 'imp_amort_var34_ult1', 'imp_var7_emit_ult1',
       'imp_reemb_var13_hace3', 'imp_reemb_var17_hace3',
       'imp_reemb_var33_hace3', 'imp_reemb_var33_ult1',
       'imp_trasp_var17_in_hace3', 'imp_trasp_var17_out_hace3',
       'imp_trasp_var17_out_ult1', 'imp_trasp_var33_in_hace3',
       'imp_trasp_var33_out_hace3', 'imp_trasp_var33_out_ult1',
       'ind_var7_emit_ult1', 'num_var2_0_ult1', 'num_var2_ult1',
       'num_var7_emit_ult1', 'num_meses_var13_medio_ult3',
       'num_reemb_var13_hace3', 'num_reemb_var17_hace3',
       'num_reemb_var33_hace3', 'num_reemb_var33_ult1',
       'num_trasp_var17_in_hace3', 'num_trasp_var17_out_hace3',
       'num_trasp_var17_out_ult1', 'num_trasp_var33_in_hace3',
       'num_trasp_var33_out_hace3', 'num_trasp_var33_out_ult1',
       'saldo_var2_ult1', 'saldo_medio_var13_medio_hace2',
       'saldo_medio_var13_medio_hace3', 'saldo_medio_var13_medio_ult1',
       'saldo_medio_var13_medio_ult3', 'saldo_medio_var29_hace3'],
      dtype='object')
In [0]:
# Drop the constant columns from both folds with the filter fit on the train fold.
# transform() returns a numpy array, not a DataFrame — column names are lost here.
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)
In [22]:
x_train_filter.shape, x_test_filter.shape, x_train.shape
Out[22]:
((16000, 291), (4000, 291), (16000, 370))

Quasi-constant feature removal

In [0]:
# remove feature under 1% variance
# NOTE(review): threshold=0.01 is an absolute variance cutoff, not a
# percentage — on unscaled features this mainly removes near-constant
# binary indicators; confirm this is the intended interpretation.
quasi_constant_filter = VarianceThreshold(threshold=0.01)
In [24]:
quasi_constant_filter.fit(x_train_filter)
Out[24]:
VarianceThreshold(threshold=0.01)
In [25]:
quasi_constant_filter.get_support().sum()
Out[25]:
245
In [0]:
x_train_quasi_filter = quasi_constant_filter.transform(x_train_filter) # transform() returns a numpy.array
x_test_quasi_filter = quasi_constant_filter.transform(x_test_filter)
In [27]:
x_train_quasi_filter.shape, x_test_quasi_filter.shape
Out[27]:
((16000, 245), (4000, 245))

Remove Duplicate Features

  • Duplicate Features: two or more columns with (almost) identical contents. Keeping multiple copies of the same column adds no information, so only one is retained.
  • Method: sklearn has no dedicated class for this, so transpose the data — swapping rows and columns so each row becomes a feature — and drop the duplicate rows.
In [0]:
# Transpose so each original feature becomes a row; pandas' duplicated()
# can then detect duplicate feature columns as duplicate rows.
x_train_T = x_train_quasi_filter.T
x_test_T = x_test_quasi_filter.T
In [29]:
type(x_train_T)
Out[29]:
numpy.ndarray
In [0]:
# Wrap back into DataFrames so duplicated() is available.
# NOTE(review): reuses the same names for a different type (ndarray -> DataFrame);
# a fresh name would be clearer and safer on re-runs.
x_train_T = pd.DataFrame(x_train_T)
x_test_T = pd.DataFrame(x_test_T)
In [31]:
x_test_T.head()
Out[31]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ... 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999
0 19459.0 38602.00 11052.0 38520.0 9146.0 6629.0 33404.0 1913.0 12736.0 23051.0 5334.0 36411.0 715.0 9598.0 31123.0 31933.0 1766.0 5440.0 8333.0 9725.0 25958.0 20181.0 21217.0 3699.0 13098.0 19722.0 1836.0 31730.0 15631.0 31287.0 11745.0 12939.0 34337.0 39294.0 32649.00 253.0 14534.0 18747.0 13142.0 14168.0 ... 24582.0 3221.0 30575.0 22405.0 17865.0 10484.0 202.0 37060.0 28460.0 1531.0 10683.0 7554.0 28024.0 33381.0 25981.0 16069.0 9587.0 33949.0 23882.0 34987.0 11244.0 20591.0 36710.0 29635.0 13416.0 1722.0 8674.0 10863.0 32229.0 31632.0 17600.0 9441.00 1441.0 13525.0 4411.0 37244.0 15181.0 3616.0 14770.0 4612.0
1 2.0 2.00 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.00 2.0 2.0 2.0 2.0 2.0 ... 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 28.00 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
2 52.0 39.00 23.0 43.0 26.0 59.0 23.0 28.0 25.0 37.0 23.0 25.0 24.0 24.0 24.0 80.0 23.0 25.0 23.0 31.0 34.0 47.0 48.0 25.0 43.0 43.0 23.0 27.0 36.0 31.0 23.0 39.0 28.0 35.0 41.00 32.0 23.0 27.0 27.0 23.0 ... 22.0 23.0 37.0 25.0 22.0 26.0 38.0 26.0 51.0 47.0 25.0 23.0 32.0 59.0 34.0 38.0 70.0 26.0 23.0 26.0 26.0 42.0 26.0 35.0 23.0 23.0 44.0 32.0 23.0 31.0 38.0 31.00 25.0 25.0 28.0 42.0 29.0 87.0 27.0 33.0
3 0.0 570.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 270.0 0.0 390.00 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 1195.32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 197.7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27.0 0.0 496.47 534.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 115.5 0.0 0.0 0.0 0.0 1277.7 595.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 300.0 0.0 0.0 0.0 252.87 0.0 0.0 0.0 2390.4 0.0 0.0 0.0 78.0

5 rows × 4000 columns

In [32]:
x_train_T.shape, x_test_T.shape
Out[32]:
((245, 16000), (245, 4000))
In [33]:
# Number of duplicated features (duplicate rows of the transposed frame).
x_train_T.duplicated().sum()
Out[33]:
18
In [34]:
# Boolean Series: True for every row (feature) that duplicates an earlier one.
duplicated_features = x_train_T.duplicated()
duplicated_features
Out[34]:
0      False
1      False
2      False
3      False
4      False
       ...  
240    False
241    False
242    False
243    False
244    False
Length: 245, dtype: bool
In [0]:
features_to_keep = [not index for index in duplicated_features]
In [37]:
features_to_keep[:10]
Out[37]:
[True, True, True, True, True, True, True, True, True, True]
In [0]:
# Boolean row-selection on the transposed frames (drops duplicate features
# identified on the *train* fold for both folds), then transpose back so
# rows are samples again.
x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T
In [39]:
x_train_unique.shape, x_train.shape
Out[39]:
((16000, 227), (16000, 370))

Build ML model and compare the performance of the selected features

In [0]:
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest on the train fold and report test accuracy.

    Parameters
    ----------
    x_train, x_test : array-like of shape (n_samples, n_features)
        Feature matrices for the train/test folds.
    y_train, y_test : array-like of shape (n_samples,)
        Corresponding target labels.

    Returns
    -------
    float
        Accuracy on the test fold (also printed, as before).
    """
    # random_state=0 pins the bootstrap/feature sampling so runs are
    # reproducible; n_jobs=-1 uses all available cores.
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Previously the score was printed and discarded; return it as well so
    # callers can compare feature sets programmatically (backward-compatible).
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy on test set: ')
    print(acc)
    return acc
In [44]:
%%time
run_randomForest(x_train_unique, x_test_unique, y_train, y_test)
Accuracy on test set: 
0.95875
CPU times: user 5.23 s, sys: 28.5 ms, total: 5.26 s
Wall time: 2.78 s
In [43]:
%%time
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy on test set: 
0.9585
CPU times: user 6.08 s, sys: 54.9 ms, total: 6.14 s
Wall time: 3.22 s
In [0]: