import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
# Load a 20k-row sample of the Santander customer-satisfaction training data
# and carve out a stratified 80/20 train/test split.
data = pd.read_csv(
    '/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv',
    nrows=20000,
)
data.head()
y = data['TARGET']
x = data.drop('TARGET', axis=1)
print(x.shape, y.shape)
# stratify=y keeps the (imbalanced) TARGET ratio equal in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)
# Zero-variance filter: columns that are constant carry no information.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
# Inverted support mask: True for each column the filter REMOVED.
constant_list = [not kept for kept in constant_filter.get_support()]
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)
x_train_filter.shape, x_test_filter.shape, x_train.shape
# Quasi-constant filter: also drop features whose variance is below 1%.
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(x_train_filter)
# NOTE: transform() returns plain numpy arrays, not DataFrames.
x_train_quasi_filter = quasi_constant_filter.transform(x_train_filter)
x_test_quasi_filter = quasi_constant_filter.transform(x_test_filter)
x_train_quasi_filter.shape, x_test_quasi_filter.shape
# Duplicate-feature removal: transpose so identical COLUMNS become identical
# ROWS (which DataFrame.duplicated can detect), then transpose back.
x_train_T = pd.DataFrame(x_train_quasi_filter.T)
x_test_T = pd.DataFrame(x_test_quasi_filter.T)
duplicated_features = x_train_T.duplicated()
# Keep the first occurrence of each distinct feature row.
features_to_keep = [not dup for dup in duplicated_features]
x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T
x_train_unique.shape, x_train.shape
# Pairwise correlation of the surviving features, visualized as a heatmap.
corrmat = x_train_unique.corr()
plt.figure(figsize=(12,8))
sns.heatmap(corrmat)
def get_correlation(data, threshold):
    """Return the set of column names correlated too strongly with an earlier column.

    Walks the lower triangle of the correlation matrix (j < i, so each
    unordered pair is visited exactly once); whenever |corr(col_i, col_j)|
    exceeds ``threshold`` the *later* column (col_i) is flagged, meaning the
    first column of each correlated pair is the one kept.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix.
    threshold : float
        Absolute-correlation cutoff (e.g. 0.85).

    Returns
    -------
    set
        Column labels to drop.
    """
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                corr_col.add(corrmat.columns[i])
    return corr_col
# Drop one member of every feature pair whose |correlation| exceeds 0.85.
corr_features = get_correlation(x_train_unique, 0.85)
corr_features
list(corr_features)[:10]
len(corr_features)
x_train_uncorr = x_train_unique.drop(labels=corr_features, axis=1)
x_test_uncorr = x_test_unique.drop(labels=corr_features, axis=1)
x_train_uncorr.shape, x_test_uncorr.shape
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest and print + return its test-set accuracy.

    Parameters
    ----------
    x_train, x_test : array-like
        Train / test feature matrices.
    y_train, y_test : array-like
        Train / test labels.

    Returns
    -------
    float
        Accuracy on the test split (also printed). Returning the score is
        backward compatible: existing callers simply ignore it.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy on test set: ')
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    return acc
# IPython cell magic: time the model on the correlation-filtered feature set.
%%time
run_randomForest(x_train_uncorr, x_test_uncorr, y_train, y_test)
# Baseline: same model on the full, unfiltered feature set for comparison.
%%time
run_randomForest(x_train, x_test, y_train, y_test)
corrmat
# Flatten the correlation matrix into (feature_i, feature_j) -> |corr| pairs.
corrdata = corrmat.abs().stack()
corrdata
corrdata = corrdata.sort_values(ascending=False)
corrdata
# Keep only strong correlations, excluding the trivial self-correlations of 1.0.
corrdata = corrdata[(corrdata > 0.85) & (corrdata < 1)]
corrdata
corrdata = pd.DataFrame(corrdata).reset_index()
corrdata.columns = ['features1', 'feature2', 'corr_value']
corrdata
# Group mutually correlated features: each group is one "anchor" feature plus
# every feature correlated with it above the threshold.
# (Original loop body had its indentation stripped; reconstructed here.)
grouped_feature_list = []
correlated_groups_list = []
for feature in corrdata.features1.unique():
    if feature not in grouped_feature_list:
        # All pair rows where this feature is the left-hand member.
        correlated_block = corrdata[corrdata.features1 == feature]
        # Mark the anchor and its partners as handled so none of them
        # starts a new group of its own.
        grouped_feature_list = grouped_feature_list + list(correlated_block.feature2.unique()) + [feature]
        correlated_groups_list.append(correlated_block)
len(correlated_groups_list)
# 56 groups in total for this sample.
# Inspect the correlation pairs inside each group.
for group in correlated_groups_list:
    print(group)
# Within each correlated group, keep only the single most important feature,
# ranked by random-forest feature importance.
# (Original loop body had its indentation stripped; reconstructed here.)
important_features = []
for group in correlated_groups_list:
    # Every feature appearing in this group, on either side of the pair.
    features = list(group.features1.unique()) + list(group.feature2.unique())
    # Measure per-feature importance with a random forest fit on just this group.
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf.fit(x_train_unique[features], y_train)
    importance = pd.concat([pd.Series(features), pd.Series(rf.feature_importances_)], axis=1)
    importance.columns = ['features', 'importance']
    importance.sort_values(by='importance', ascending=False, inplace=True)
    # Top row = the group's most important feature.
    feat = importance.iloc[0]
    important_features.append(feat)
important_features
important_features = pd.DataFrame(important_features)
important_features.reset_index(inplace=True, drop=True)
important_features
# Final keep/discard lists: retain the most important member of each
# correlated group, discard the remaining correlated features.
features_to_consider = set(important_features['features'])
features_to_discard = list(set(corr_features) - features_to_consider)
x_train_grouped_uncorr = x_train_unique.drop(labels=features_to_discard, axis=1)
x_train_grouped_uncorr.shape
x_test_grouped_uncorr = x_test_unique.drop(labels=features_to_discard, axis=1)
x_test_grouped_uncorr.shape
# IPython cell magic: time the importance-aware correlation filtering result.
%%time
run_randomForest(x_train_grouped_uncorr, x_test_grouped_uncorr, y_train, y_test)
# Baseline: the full, unfiltered feature set.
%%time
run_randomForest(x_train, x_test, y_train, y_test)
# Comparison: the naive (keep-first) correlation filtering from earlier.
%%time
run_randomForest(x_train_uncorr, x_test_uncorr, y_train, y_test)