In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [3]:
# Load the Santander customer-satisfaction training data (Kaggle).
# NOTE(review): absolute Google-Drive/Colab path — breaks outside this
# environment; consider a configurable DATA_DIR constant.
# nrows=20000 keeps only the first 20k rows for faster experimentation.
data = pd.read_csv('/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv', nrows=20000)
data.head()
Out[3]:
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 imp_op_var40_ult1 imp_op_var41_comer_ult1 imp_op_var41_comer_ult3 imp_op_var41_efect_ult1 imp_op_var41_efect_ult3 imp_op_var41_ult1 imp_op_var39_efect_ult1 imp_op_var39_efect_ult3 imp_op_var39_ult1 imp_sal_var16_ult1 ind_var1_0 ind_var1 ind_var2_0 ind_var2 ind_var5_0 ind_var5 ind_var6_0 ind_var6 ind_var8_0 ind_var8 ind_var12_0 ind_var12 ind_var13_0 ind_var13_corto_0 ind_var13_corto ind_var13_largo_0 ind_var13_largo ind_var13_medio_0 ind_var13_medio ind_var13 ... saldo_medio_var5_ult1 saldo_medio_var5_ult3 saldo_medio_var8_hace2 saldo_medio_var8_hace3 saldo_medio_var8_ult1 saldo_medio_var8_ult3 saldo_medio_var12_hace2 saldo_medio_var12_hace3 saldo_medio_var12_ult1 saldo_medio_var12_ult3 saldo_medio_var13_corto_hace2 saldo_medio_var13_corto_hace3 saldo_medio_var13_corto_ult1 saldo_medio_var13_corto_ult3 saldo_medio_var13_largo_hace2 saldo_medio_var13_largo_hace3 saldo_medio_var13_largo_ult1 saldo_medio_var13_largo_ult3 saldo_medio_var13_medio_hace2 saldo_medio_var13_medio_hace3 saldo_medio_var13_medio_ult1 saldo_medio_var13_medio_ult3 saldo_medio_var17_hace2 saldo_medio_var17_hace3 saldo_medio_var17_ult1 saldo_medio_var17_ult3 saldo_medio_var29_hace2 saldo_medio_var29_hace3 saldo_medio_var29_ult1 saldo_medio_var29_ult3 saldo_medio_var33_hace2 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39205.170000 0
1 3 2 34 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 300.0 122.22 300.0 240.75 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 49278.030000 0
2 4 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 3.00 2.07 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 67333.770000 0
3 8 2 37 0.0 195.0 195.0 0.0 0.0 0 0 0.0 195.0 195.0 0.0 0.0 195.0 0.0 0.0 195.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 91.56 138.84 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 64007.970000 0
4 10 2 39 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 ... 40501.08 13501.47 0.0 0.0 0.0 0.0 0.0 0.0 85501.89 85501.89 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 117310.979016 0

5 rows × 371 columns

In [4]:
# Split predictors from the TARGET label, then make a stratified 80/20
# train/test split (stratify keeps the class ratio identical in both splits).
x = data.drop(columns='TARGET')
y = data['TARGET']
print(x.shape, y.shape)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
(20000, 370) (20000,)

Constant Feature Removal

In [5]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
# Remove features that are constant on the training split (zero variance).
# Fit on the training data only, then apply to both splits, so no
# information leaks from the test set into the selection.
# (Removed the unused `constant_list` mask that was computed but never read.)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
x_train_filter = constant_filter.transform(x_train)  # returns a numpy array; column names are lost
x_test_filter = constant_filter.transform(x_test)
x_train_filter.shape, x_test_filter.shape, x_train.shape
Out[5]:
((16000, 291), (4000, 291), (16000, 370))

Quasi constant feature removal

In [6]:
# Drop quasi-constant features: anything whose variance on the training
# split is below 1%.
quasi_constant_filter = VarianceThreshold(threshold=0.01)
x_train_quasi_filter = quasi_constant_filter.fit_transform(x_train_filter)  # numpy array out
x_test_quasi_filter = quasi_constant_filter.transform(x_test_filter)
x_train_quasi_filter.shape, x_test_quasi_filter.shape
Out[6]:
((16000, 245), (4000, 245))

Remove Duplicate Features

In [7]:
# Transpose so each feature becomes a row: pandas' duplicated() can then
# flag features whose values are identical across all training rows.
x_train_T = pd.DataFrame(x_train_quasi_filter.T)
x_test_T = pd.DataFrame(x_test_quasi_filter.T)

# True for the first occurrence of each distinct feature, False for repeats.
features_to_keep = ~x_train_T.duplicated()

# Keep the unique features and transpose back to rows = samples.
x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T

x_train_unique.shape, x_train.shape
Out[7]:
((16000, 227), (16000, 370))

Pearson Correlation Coefficients

In [0]:
corrmat = x_train_unique.corr()
In [9]:
# Visualize the correlation matrix; bright off-diagonal cells reveal
# clusters of correlated features.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corrmat, ax=ax)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcb0db4f60>
In [0]:
def get_correlation(data, threshold):
    """Return the set of column names whose absolute Pearson correlation
    with any *earlier* column exceeds ``threshold``.

    Only the later column of each correlated pair is reported, so dropping
    the returned columns keeps one representative per correlated pair.
    """
    corr_matrix = data.corr()
    flagged = set()
    for i, col in enumerate(corr_matrix.columns):
        # Strictly-lower-triangle entries of this row: correlations of
        # `col` against every column that precedes it.
        if (corr_matrix.iloc[i, :i].abs() > threshold).any():
            flagged.add(col)
    return flagged
In [11]:
# Flag every feature correlated above 0.85 with an earlier feature.
corr_features = get_correlation(x_train_unique, 0.85)
corr_features
Out[11]:
{5,
 7,
 9,
 11,
 12,
 14,
 15,
 16,
 17,
 18,
 23,
 24,
 28,
 29,
 30,
 32,
 33,
 35,
 36,
 38,
 42,
 46,
 47,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 60,
 61,
 62,
 65,
 67,
 68,
 69,
 70,
 72,
 76,
 80,
 81,
 82,
 83,
 84,
 86,
 87,
 88,
 91,
 93,
 95,
 98,
 100,
 101,
 103,
 104,
 111,
 115,
 117,
 120,
 121,
 125,
 136,
 138,
 143,
 146,
 149,
 153,
 154,
 157,
 158,
 161,
 162,
 163,
 164,
 169,
 170,
 173,
 180,
 182,
 183,
 184,
 185,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 197,
 198,
 199,
 204,
 205,
 207,
 208,
 215,
 216,
 217,
 219,
 220,
 221,
 223,
 224,
 227,
 228,
 229,
 230,
 231,
 232,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243}
In [12]:
list(corr_features)[:10]
Out[12]:
[5, 7, 9, 11, 12, 14, 15, 16, 17, 18]
In [13]:
len(corr_features)
Out[13]:
124
In [0]:
# Blunt approach: drop every flagged feature from both splits, with no
# attempt to keep the most informative member of each correlated pair.
x_train_uncorr = x_train_unique.drop(columns=list(corr_features))
x_test_uncorr = x_test_unique.drop(columns=list(corr_features))
In [15]:
x_train_uncorr.shape, x_test_uncorr.shape
Out[15]:
((16000, 103), (4000, 103))
In [0]:
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest on the training split and report
    accuracy on the test split.

    Parameters are the four arrays produced by train_test_split.
    Returns the accuracy score (previously the function returned None),
    so feature sets can also be compared programmatically instead of
    only by reading the printed output.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy on test set: ')
    print(acc)
    return acc
In [17]:
%%time
# Reduced feature set (correlation-pruned, 103 columns).
run_randomForest(x_train_uncorr, x_test_uncorr, y_train, y_test)
Accuracy on test set: 
0.95875
CPU times: user 3.78 s, sys: 38 ms, total: 3.81 s
Wall time: 2.07 s
In [18]:
%%time
# Baseline: all 370 original columns.
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy on test set: 
0.9585
CPU times: user 5.77 s, sys: 58.8 ms, total: 5.83 s
Wall time: 3.11 s
In [0]:
 

Feature Grouping and Feature Importance

In [25]:
corrmat
Out[25]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 40 41 ... 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
0 1.000000 -0.025277 -0.001942 0.003594 0.004054 -0.001697 -0.015882 -0.019807 0.000956 -0.000588 -0.012443 0.010319 0.005268 0.017605 0.016960 0.018040 0.017400 0.016745 0.015206 -0.000103 -0.001198 -0.006814 -0.002037 0.010356 0.012021 0.001732 0.001138 -0.004836 -0.006480 -0.005811 -0.003929 -0.002340 0.004714 0.005018 0.005088 0.006998 0.005580 0.007236 0.001648 -0.000359 ... 0.007040 -0.001282 0.013583 0.008546 -0.002296 0.004097 0.006333 0.005441 0.006393 0.001991 0.006937 0.004924 0.008100 -0.000582 0.007130 0.007675 -0.006477 -0.010219 -0.011386 -0.011200 0.006455 0.008361 0.003765 0.005352 0.008042 0.007870 0.007952 0.008021 -0.001596 0.001830 -0.001337 0.002051 -0.008500 0.006554 0.005907 0.008825 -0.009174 0.012031 0.012128 0.006612
1 -0.025277 1.000000 -0.007647 0.001819 0.008981 0.009232 0.001638 0.001746 0.000614 0.000695 0.001517 0.009097 0.009360 -0.002511 -0.001086 0.002426 -0.002401 -0.001019 0.002629 0.000519 0.004590 -0.008975 0.041015 0.008019 0.007439 0.011525 0.009467 0.009771 0.008796 0.008676 0.009662 0.006484 0.009172 0.008687 0.007043 0.006730 0.007178 0.006622 0.050629 0.011849 ... 0.003053 0.004967 0.009019 0.006758 0.004488 0.002604 0.003651 0.003672 0.001322 0.000878 0.002152 0.002210 0.003979 0.002581 0.004811 0.004879 0.005759 0.003183 0.006355 0.006248 0.002629 0.001482 0.002827 0.002770 0.000356 0.000338 0.000411 0.000408 0.000391 0.000453 0.000544 0.000586 0.000337 0.000550 0.000563 0.000922 0.000598 0.000875 0.000942 0.000415
2 -0.001942 -0.007647 1.000000 0.030919 0.106245 0.109140 0.048524 0.055708 0.004040 0.005796 0.042368 0.096719 0.098070 0.082025 0.095485 0.106415 0.081028 0.095009 0.110912 0.016886 0.107680 -0.105502 -0.102487 0.107570 0.101605 0.273152 0.231649 0.299165 0.241707 0.237830 0.296879 0.149274 0.262784 0.253780 0.083030 0.091234 0.095103 0.082214 0.032543 0.136317 ... 0.205098 0.228133 0.154430 0.227132 0.188957 0.107593 0.132855 0.162295 0.029742 0.014903 0.043278 0.045622 0.149586 0.093124 0.178546 0.179565 0.178263 0.094741 0.200415 0.195652 0.125618 0.059293 0.135362 0.132537 0.023435 0.022679 0.025362 0.025406 0.013612 0.023446 0.025522 0.020168 0.011550 0.019325 0.019527 0.041321 0.016172 0.043577 0.044281 -0.000810
3 0.003594 0.001819 0.030919 1.000000 0.029418 0.024905 0.014513 0.013857 -0.000613 -0.000691 0.012451 0.026377 0.021968 0.016331 0.016458 0.024014 0.015979 0.016239 0.025558 -0.000520 0.007478 -0.002101 0.017541 0.003429 0.004843 0.010099 0.015117 0.036569 0.040420 0.041165 0.037154 -0.001398 0.012668 0.013574 0.007590 0.005443 0.007195 0.005737 0.025432 0.046103 ... 0.022688 0.017827 0.038797 0.032181 0.002703 0.001554 0.011326 0.006745 0.003401 -0.000822 0.002314 0.003234 0.001554 -0.001262 0.002540 0.002948 -0.005438 -0.003083 0.025778 0.033042 -0.001532 0.000238 -0.001817 -0.001698 -0.000354 -0.000338 -0.000285 -0.000334 -0.000391 0.008469 0.014032 -0.000583 -0.000337 -0.000548 -0.000561 0.000541 -0.000577 0.000231 0.000235 0.000966
4 0.004054 0.008981 0.106245 0.029418 1.000000 0.888789 0.381632 0.341266 0.012927 0.019674 0.298916 0.938409 0.838953 0.266746 0.326051 0.638412 0.263482 0.324417 0.673593 0.049579 0.227803 -0.208030 0.041167 0.200514 0.220673 0.027387 0.033757 -0.010411 -0.012628 -0.012035 -0.010694 0.030235 0.014793 0.009176 0.232927 0.223937 0.229660 0.226089 0.108756 0.267246 ... 0.235397 0.168010 0.268317 0.270899 0.021159 -0.004700 0.055867 0.029543 0.035820 0.110967 0.073627 0.086904 -0.003401 -0.007050 -0.002079 -0.001151 0.002963 0.040691 -0.000914 -0.000322 0.003267 0.012429 0.000824 0.001272 -0.001629 -0.001669 -0.001677 -0.001730 -0.001930 0.000833 0.002328 0.016743 -0.001662 0.020509 0.021276 -0.001905 -0.000635 -0.002552 -0.002736 0.003656
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
240 0.008825 0.000922 0.041321 0.000541 -0.001905 0.000871 -0.000818 -0.000866 -0.000309 -0.000349 -0.000762 -0.001754 0.001253 0.007400 0.004485 0.004773 0.007237 0.004412 0.004487 -0.000262 0.009008 -0.000421 0.013263 0.001397 0.002086 -0.000543 0.001500 -0.003893 -0.004438 -0.004378 -0.004659 -0.003271 0.001831 0.002405 0.003502 0.002806 0.003308 0.002956 0.012984 0.005350 ... 0.031705 0.038442 0.027067 0.037467 0.012998 0.000226 0.025813 0.015868 0.104010 0.073770 0.082936 0.074334 -0.001886 -0.001294 -0.002222 -0.002257 -0.002904 -0.001604 -0.003205 -0.003151 -0.001195 -0.000573 -0.001417 -0.001387 0.000265 -0.000171 0.001004 0.000746 0.031970 0.004649 0.012705 0.021540 -0.000170 0.032162 0.030087 1.000000 0.329805 0.935317 0.919036 0.011106
241 -0.009174 0.000598 0.016172 -0.000577 -0.000635 0.007096 -0.000515 -0.000545 -0.000195 -0.000220 -0.000480 -0.000494 0.007871 0.002248 0.001176 0.001465 0.002188 0.001147 0.001333 -0.000165 -0.001449 -0.010257 0.004114 0.011918 0.013156 -0.003662 -0.003008 -0.003103 -0.002796 -0.002758 -0.003068 -0.002061 -0.002914 -0.002760 0.014110 0.014929 0.013776 0.015226 0.008179 0.008135 ... 0.010826 0.025881 0.008090 0.016236 0.037897 0.001330 0.056950 0.037100 0.287537 0.204017 0.230452 0.206808 -0.001258 -0.000815 -0.001523 -0.001545 -0.001829 -0.001011 -0.002019 -0.001985 -0.000831 -0.000463 -0.000895 -0.000876 -0.000113 -0.000107 -0.000130 -0.000129 -0.000124 -0.000144 -0.000173 -0.000185 -0.000107 -0.000174 -0.000178 0.329805 1.000000 0.127224 0.140902 0.011807
242 0.012031 0.000875 0.043577 0.000231 -0.002552 -0.001672 -0.000779 -0.000825 -0.000295 -0.000332 -0.000726 -0.002467 -0.001513 0.006688 0.004012 0.003984 0.006539 0.003946 0.003729 -0.000250 0.009156 0.002149 0.013717 -0.001487 -0.001036 -0.001555 0.000193 -0.003587 -0.004228 -0.004171 -0.004263 -0.003116 0.000472 0.000954 -0.000169 -0.000911 -0.000291 -0.000824 0.012369 0.000950 ... 0.028671 0.023918 0.026292 0.031313 0.013596 -0.000514 0.019951 0.016858 0.037924 0.026822 0.029915 0.026758 -0.001828 -0.001232 -0.002103 -0.002116 -0.002766 -0.001528 -0.003054 -0.003002 -0.001146 -0.000556 -0.001348 -0.001318 0.000089 -0.000163 0.000510 0.000360 0.068648 0.010219 0.027515 0.012393 -0.000162 0.018565 0.017358 0.935317 0.127224 1.000000 0.993536 0.008604
243 0.012128 0.000942 0.044281 0.000235 -0.002736 -0.001844 -0.000839 -0.000888 -0.000317 -0.000358 -0.000782 -0.002645 -0.001676 0.006283 0.003599 0.003632 0.006139 0.003534 0.003377 -0.000269 0.011164 0.002306 0.014768 -0.001591 -0.001105 -0.001463 0.000460 -0.003989 -0.004553 -0.004491 -0.004670 -0.003356 0.000768 0.001301 -0.000128 -0.000859 -0.000260 -0.000763 0.013320 0.001099 ... 0.029862 0.026072 0.028454 0.033433 0.017619 -0.000616 0.023478 0.020881 0.042586 0.030148 0.033617 0.030066 -0.001966 -0.001327 -0.002237 -0.002243 -0.002979 -0.001646 -0.003288 -0.003233 -0.001241 -0.000608 -0.001452 -0.001421 0.000121 -0.000175 0.000619 0.000443 0.057673 0.008541 0.023072 0.014523 -0.000174 0.021742 0.020331 0.919036 0.140902 0.993536 1.000000 0.009136
244 0.006612 0.000415 -0.000810 0.000966 0.003656 0.002257 0.004448 0.002427 -0.000739 -0.000811 0.003341 0.002290 0.001570 0.000707 -0.001992 0.001339 0.000614 -0.002038 0.001910 0.000213 -0.001227 -0.016447 -0.056029 0.012346 -0.003767 0.012034 0.006643 0.012240 0.007400 0.006121 0.011291 0.004047 0.008078 0.006178 0.000669 0.000397 0.000346 0.000546 -0.055329 -0.014822 ... -0.010182 -0.008797 -0.006917 -0.010172 0.016391 0.019986 0.007422 0.016724 -0.000105 -0.001055 0.000338 0.000244 0.017276 0.006644 0.018092 0.017579 0.014736 0.002052 0.014980 0.014628 0.014567 0.005688 0.015351 0.014485 0.013197 0.012842 0.013321 0.013418 -0.000203 -0.003446 -0.003399 -0.000773 -0.000402 -0.000525 -0.000589 0.011106 0.011807 0.008604 0.009136 1.000000

227 rows × 227 columns

In [26]:
# Flatten the matrix into a Series indexed by (feature_i, feature_j),
# taking absolute values so negative correlations rank equally.
corrdata = corrmat.abs().stack()
corrdata
Out[26]:
0    0      1.000000
     1      0.025277
     2      0.001942
     3      0.003594
     4      0.004054
              ...   
244  240    0.011106
     241    0.011807
     242    0.008604
     243    0.009136
     244    1.000000
Length: 51529, dtype: float64
In [27]:
# Strongest correlations first.
# NOTE(review): this cell re-binds `corrdata` — it is order-dependent with
# the surrounding cells on a re-run.
corrdata = corrdata.sort_values(ascending=False)
corrdata
Out[27]:
29   58     1.000000e+00
58   29     1.000000e+00
134  158    1.000000e+00
158  134    1.000000e+00
182  182    1.000000e+00
                ...     
229  111    1.934954e-06
231  150    6.044672e-07
150  231    6.044672e-07
231  123    3.966696e-07
123  231    3.966696e-07
Length: 51529, dtype: float64
In [28]:
# Keep strongly correlated pairs (> 0.85) while dropping the trivial
# entries equal to exactly 1 (self-correlations and exact duplicates).
corrdata = corrdata[(corrdata > 0.85) & (corrdata < 1)]
corrdata
Out[28]:
143  135    1.000000
135  143    1.000000
136  128    1.000000
128  136    1.000000
31   62     1.000000
              ...   
67   66     0.851384
61   28     0.851022
28   61     0.851022
72   35     0.850893
35   72     0.850893
Length: 534, dtype: float64
In [29]:
# Turn the (feature_i, feature_j) -> corr Series into a tidy 3-column frame.
corrdata = pd.DataFrame(corrdata).reset_index()
# NOTE(review): inconsistent names — 'features1' (plural) vs 'feature2'
# (singular); left as-is because later cells access these exact attributes.
corrdata.columns = ['features1', 'feature2', 'corr_value']
corrdata
Out[29]:
features1 feature2 corr_value
0 143 135 1.000000
1 135 143 1.000000
2 136 128 1.000000
3 128 136 1.000000
4 31 62 1.000000
... ... ... ...
529 67 66 0.851384
530 61 28 0.851022
531 28 61 0.851022
532 72 35 0.850893
533 35 72 0.850893

534 rows × 3 columns

In [0]:
# Group correlated predictors: for each feature, collect the features that
# are strongly correlated with it, skipping any feature already assigned
# to an earlier group.
grouped_feature_list = []
correlated_groups_list = []
for feature in corrdata.features1.unique(): # unique values of the features1 column
    if feature not in grouped_feature_list:
        correlated_block = corrdata[corrdata.features1 == feature]
        grouped_feature_list = grouped_feature_list + list(correlated_block.feature2.unique()) + [feature]
        correlated_groups_list.append(correlated_block)
In [37]:
len(correlated_groups_list)
# 56 groups in total
Out[37]:
56
In [38]:
# Correlation details for each group.
for group in correlated_groups_list:
    print(group)
   features1  feature2  corr_value
0        143       135         1.0
     features1  feature2  corr_value
2          136       128    1.000000
197        136       169    0.959468
   features1  feature2  corr_value
4         31        62         1.0
   features1  feature2  corr_value
6         20        47         1.0
     features1  feature2  corr_value
8           52        23    1.000000
297         52        24    0.927683
299         52        53    0.927683
448         52        21    0.877297
505         52       183    0.860163
     features1  feature2  corr_value
12          33        69    1.000000
224         33        32    0.947113
228         33        68    0.946571
322         33        26    0.917665
337         33        55    0.914178
422         33       184    0.884383
    features1  feature2  corr_value
14        157       133         1.0
    features1  feature2  corr_value
16        237       149    1.000000
26        237       148    0.999929
    features1  feature2  corr_value
18        154       132         1.0
     features1  feature2  corr_value
20         146       230    0.999997
36         146       229    0.999778
59         146       231    0.997052
68         146       232    0.996772
76         146       113    0.996424
89         146       120    0.993307
245        146       170    0.944314
     features1  feature2  corr_value
22         238       122    0.999945
49         238       239    0.998497
264        238       236    0.938668
    features1  feature2  corr_value
34         82        78    0.999859
     features1  feature2  corr_value
40         108       115    0.999478
97         108       219    0.992870
115        108       125    0.987333
142        108       220    0.982474
280        108       217    0.933815
     features1  feature2  corr_value
46         199       197    0.998753
362        199       196    0.905699
371        199       198    0.904341
     features1  feature2  corr_value
50         181       208    0.997718
345        181       205    0.911453
467        181       207    0.871801
     features1  feature2  corr_value
72          17        14    0.996739
396         17        16    0.890442
408         17        13    0.888669
     features1  feature2  corr_value
86         242       243    0.993536
122        242       126    0.986744
276        242       240    0.935317
     features1  feature2  corr_value
92          28        57    0.993186
124         28        58    0.986371
126         28        29    0.986371
185         28       185    0.964067
381         28        27    0.901032
399         28        30    0.889321
531         28        61    0.851022
     features1  feature2  corr_value
94          51        22    0.992882
385         51       182    0.899063
     features1  feature2  corr_value
100         44        46    0.990593
377         44        98    0.902736
410         44        95    0.888337
     features1  feature2  corr_value
102         77        81    0.989793
461         77        80    0.874240
517         77        84    0.858529
     features1  feature2  corr_value
104        109       223    0.989341
151        109       224    0.980951
356        109       221    0.907987
413        109       111    0.887721
     features1  feature2  corr_value
112          9         8    0.988256
417          9       193    0.886955
444          9       192    0.878045
     features1  feature2  corr_value
116        227       228    0.987304
188        227       225    0.962657
     features1  feature2  corr_value
118        116       117    0.987013
     features1  feature2  corr_value
128         91        49    0.985951
     features1  feature2  corr_value
130         54        25    0.985875
419         54       100    0.886309
     features1  feature2  corr_value
134         76        75    0.984751
353         76        74    0.908497
477         76       191    0.870551
522         76       190    0.857717
     features1  feature2  corr_value
136         38        35    0.984077
261         38        34    0.940390
306         38        36    0.922699
496         38        72    0.864661
     features1  feature2  corr_value
138         18        15    0.983164
465         18        16    0.872133
470         18        13    0.870936
     features1  feature2  corr_value
140        215       107    0.983156
146        215       216    0.981815
     features1  feature2  corr_value
161         56        61    0.976942
187         56        27    0.962726
211         56        30    0.953194
     features1  feature2  corr_value
164        162       163    0.975002
288        162       161    0.930635
369        162       164    0.904702
463        162        41    0.874083
     features1  feature2  corr_value
166        102       103    0.974341
     features1  feature2  corr_value
168         83        79    0.973140
263         83       188    0.938960
273         83        84    0.936080
315         83       194    0.919405
351         83        80    0.910385
518         83       189    0.858484
     features1  feature2  corr_value
174         70        72    0.972088
500         70        35    0.862850
     features1  feature2  corr_value
180         59        60    0.968504
     features1  feature2  corr_value
207        195       189    0.956666
313        195        80    0.920961
330        195       194    0.916442
378        195        84    0.902276
428        195       188    0.882312
509        195        79    0.859806
     features1  feature2  corr_value
216        235       234    0.950232
349        235       106    0.911179
     features1  feature2  corr_value
220         10       104    0.948845
     features1  feature2  corr_value
234        180       179    0.945288
     features1  feature2  corr_value
236        241       151    0.944812
     features1  feature2  corr_value
243         42        41    0.944451
415         42       161    0.887059
503         42       164    0.861507
     features1  feature2  corr_value
248         12         5    0.943622
434         12        11    0.881673
     features1  feature2  corr_value
266          4        11    0.938409
402          4         5    0.888789
     features1  feature2  corr_value
274         93        92    0.935867
     features1  feature2  corr_value
290         89       121    0.928898
     features1  feature2  corr_value
304         88        87       0.924
     features1  feature2  corr_value
318        174       204    0.918533
     features1  feature2  corr_value
333         50        21    0.916137
     features1  feature2  corr_value
354          6         7    0.908158
     features1  feature2  corr_value
372         64        65    0.904095
488         64        87    0.866430
     features1  feature2  corr_value
374        101        86    0.903641
394        101        40    0.892951
     features1  feature2  corr_value
390        131       153     0.89633
     features1  feature2  corr_value
525        173       151    0.854991
     features1  feature2  corr_value
528         66        67    0.851384

Feature Importance based on tree based classifiers

In [0]:
# For each correlated group, keep only the member that a random forest
# ranks as most important for predicting the target.
important_features = []
for group in correlated_groups_list:
    features = list(group.features1.unique()) + list(group.feature2.unique()) # every feature in this group
    rf = RandomForestClassifier(n_estimators=100, random_state=0) # measure importance with a random forest
    rf.fit(x_train_unique[features], y_train)

    importance = pd.concat([pd.Series(features), pd.Series(rf.feature_importances_)], axis=1) # features paired with importances
    importance.columns = ['features', 'importance']
    importance.sort_values(by='importance', ascending=False, inplace=True)
    feat = importance.iloc[0] # the group's most important feature
    important_features.append(feat)
    # NOTE(review): groups can overlap, so the same feature may win twice
    # (e.g. 5, 87, 151 in the output below); the later set() deduplicates.
In [40]:
important_features
Out[40]:
[features      135.00
 importance      0.51
 Name: 1, dtype: float64, features      128.000000
 importance      0.563757
 Name: 1, dtype: float64, features      62.00
 importance     0.51
 Name: 1, dtype: float64, features      47.00
 importance     0.51
 Name: 1, dtype: float64, features      183.000000
 importance      0.285817
 Name: 5, dtype: float64, features      184.00000
 importance      0.34728
 Name: 6, dtype: float64, features      157.000000
 importance      0.523077
 Name: 0, dtype: float64, features      148.000000
 importance      0.624498
 Name: 2, dtype: float64, features      132.000000
 importance      0.565217
 Name: 1, dtype: float64, features      120.000000
 importance      0.749683
 Name: 6, dtype: float64, features      122.000000
 importance      0.343434
 Name: 1, dtype: float64, features      82.000000
 importance     0.518827
 Name: 0, dtype: float64, features      125.000000
 importance      0.940524
 Name: 3, dtype: float64, features      197.000000
 importance      0.289727
 Name: 1, dtype: float64, features      207.000000
 importance      0.312834
 Name: 3, dtype: float64, features      17.000000
 importance     0.286833
 Name: 0, dtype: float64, features      243.000000
 importance      0.431557
 Name: 1, dtype: float64, features      185.000000
 importance      0.391367
 Name: 4, dtype: float64, features      182.000000
 importance      0.432045
 Name: 2, dtype: float64, features      95.000000
 importance     0.487162
 Name: 3, dtype: float64, features      84.000000
 importance     0.299008
 Name: 3, dtype: float64, features      221.00000
 importance      0.28555
 Name: 3, dtype: float64, features      8.000000
 importance    0.345509
 Name: 1, dtype: float64, features      228.000000
 importance      0.434186
 Name: 1, dtype: float64, features      117.000000
 importance      0.517013
 Name: 1, dtype: float64, features      49.000000
 importance     0.500161
 Name: 1, dtype: float64, features      100.000000
 importance      0.386775
 Name: 2, dtype: float64, features      191.000000
 importance      0.345104
 Name: 3, dtype: float64, features      34.000000
 importance     0.283901
 Name: 2, dtype: float64, features      15.000000
 importance     0.400677
 Name: 1, dtype: float64, features      107.000000
 importance      0.349126
 Name: 1, dtype: float64, features      61.000000
 importance     0.323735
 Name: 1, dtype: float64, features      41.000000
 importance     0.386338
 Name: 4, dtype: float64, features      102.000000
 importance      0.508955
 Name: 0, dtype: float64, features      189.000000
 importance      0.229269
 Name: 6, dtype: float64, features      72.000000
 importance     0.490102
 Name: 1, dtype: float64, features      60.00000
 importance     0.50052
 Name: 1, dtype: float64, features      79.000000
 importance     0.213903
 Name: 6, dtype: float64, features      234.000000
 importance      0.469178
 Name: 1, dtype: float64, features      104.000000
 importance      0.640915
 Name: 1, dtype: float64, features      179.000000
 importance      0.634779
 Name: 1, dtype: float64, features      151.00
 importance      0.51
 Name: 1, dtype: float64, features      161.000000
 importance      0.346426
 Name: 2, dtype: float64, features      5.000000
 importance    0.356386
 Name: 1, dtype: float64, features      5.000000
 importance    0.403831
 Name: 2, dtype: float64, features      93.000000
 importance     0.544349
 Name: 0, dtype: float64, features      121.00
 importance      0.51
 Name: 1, dtype: float64, features      87.000000
 importance     0.553622
 Name: 1, dtype: float64, features      174.000000
 importance      0.743723
 Name: 0, dtype: float64, features      50.000000
 importance     0.616659
 Name: 0, dtype: float64, features      7.000000
 importance    0.545702
 Name: 1, dtype: float64, features      87.0000
 importance     0.7462
 Name: 2, dtype: float64, features      86.000000
 importance     0.447693
 Name: 1, dtype: float64, features      153.000000
 importance      0.515152
 Name: 1, dtype: float64, features      151.00
 importance      0.51
 Name: 1, dtype: float64, features      66.000000
 importance     0.630293
 Name: 0, dtype: float64]
In [0]:
important_features = pd.DataFrame(important_features)
In [0]:
important_features.reset_index(inplace=True, drop=True)
In [43]:
important_features
Out[43]:
features importance
0 135.0 0.510000
1 128.0 0.563757
2 62.0 0.510000
3 47.0 0.510000
4 183.0 0.285817
5 184.0 0.347280
6 157.0 0.523077
7 148.0 0.624498
8 132.0 0.565217
9 120.0 0.749683
10 122.0 0.343434
11 82.0 0.518827
12 125.0 0.940524
13 197.0 0.289727
14 207.0 0.312834
15 17.0 0.286833
16 243.0 0.431557
17 185.0 0.391367
18 182.0 0.432045
19 95.0 0.487162
20 84.0 0.299008
21 221.0 0.285550
22 8.0 0.345509
23 228.0 0.434186
24 117.0 0.517013
25 49.0 0.500161
26 100.0 0.386775
27 191.0 0.345104
28 34.0 0.283901
29 15.0 0.400677
30 107.0 0.349126
31 61.0 0.323735
32 41.0 0.386338
33 102.0 0.508955
34 189.0 0.229269
35 72.0 0.490102
36 60.0 0.500520
37 79.0 0.213903
38 234.0 0.469178
39 104.0 0.640915
40 179.0 0.634779
41 151.0 0.510000
42 161.0 0.346426
43 5.0 0.356386
44 5.0 0.403831
45 93.0 0.544349
46 121.0 0.510000
47 87.0 0.553622
48 174.0 0.743723
49 50.0 0.616659
50 7.0 0.545702
51 87.0 0.746200
52 86.0 0.447693
53 153.0 0.515152
54 151.0 0.510000
55 66.0 0.630293
In [0]:
features_to_consider = set(important_features['features']) # 相関を考慮して残った重要な変数のリスト
In [0]:
features_to_discard = set(corr_features) - set(features_to_consider) # 相関により不要な変数のリスト
In [0]:
features_to_discard = list(features_to_discard)
In [47]:
# Remove the redundant correlated features from the training split.
x_train_grouped_uncorr = x_train_unique.drop(columns=features_to_discard)
x_train_grouped_uncorr.shape
Out[47]:
(16000, 140)
In [48]:
# Apply the same column removal to the test split.
x_test_grouped_uncorr = x_test_unique.drop(columns=features_to_discard)
x_test_grouped_uncorr.shape
Out[48]:
(4000, 140)
In [49]:
%%time
# Grouped selection: group winners kept (140 columns).
run_randomForest(x_train_grouped_uncorr, x_test_grouped_uncorr, y_train, y_test)
Accuracy on test set: 
0.95775
CPU times: user 4.06 s, sys: 32.5 ms, total: 4.09 s
Wall time: 2.26 s
In [50]:
%%time
# Baseline: all original columns, for comparison.
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy on test set: 
0.9585
CPU times: user 5.78 s, sys: 44.4 ms, total: 5.82 s
Wall time: 3.1 s
In [51]:
%%time
# Blunt correlation removal (103 columns), for comparison.
run_randomForest(x_train_uncorr, x_test_uncorr, y_train, y_test)
Accuracy on test set: 
0.95875
CPU times: user 3.75 s, sys: 21.5 ms, total: 3.77 s
Wall time: 2.06 s
In [0]: