import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
# Load a 20k-row sample of the Santander customer-satisfaction training
# data and split it into stratified train/test sets (TARGET is the label).
data = pd.read_csv(
    '/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv',
    nrows=20000,
)
data.head()
y = data['TARGET']
x = data.drop('TARGET', axis=1)
x.shape, y.shape
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)
# Drop constant and quasi-constant features: anything whose variance on
# the training split is below 0.01 carries (almost) no information.
constant_filter = VarianceThreshold(threshold=0.01)
x_train_filter = constant_filter.fit_transform(x_train)
x_test_filter = constant_filter.transform(x_test)
x_train_filter.shape, x_test_filter.shape
# Remove duplicate features: transpose so each feature becomes a row,
# then flag repeated rows with pandas' duplicated().
# NOTE: VarianceThreshold.transform returned a numpy array, so the
# original column names are already lost; columns here are positional ints.
x_train_T = pd.DataFrame(x_train_filter.T)
x_test_T = pd.DataFrame(x_test_filter.T)
x_train_T.duplicated().sum()
duplicated_features = x_train_T.duplicated()
# Keep the first occurrence of every feature. `~` negates the boolean
# Series directly — idiomatic replacement for the original
# `[not index for index in duplicated_features]` list comprehension.
features_to_keep = ~duplicated_features
x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T
x_train_unique.shape, x_train.shape
# Univariate feature ranking: fit a RandomForest on each feature on its
# own, then score it with ROC AUC on the held-out set.
# FIX: rank with predicted probabilities, not hard class labels.
# roc_auc_score expects a continuous score; with predict() on a
# rare-positive target the AUC collapses toward 0.5 and the resulting
# ranking is unstable.
roc_auc = []
for feature in x_train_unique.columns:
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(x_train_unique[feature].to_frame(), y_train)
    # Probability of the positive class (column 1) as the ranking score.
    y_score = clf.predict_proba(x_test_unique[feature].to_frame())[:, 1]
    roc_auc.append(roc_auc_score(y_test, y_score))
print(roc_auc)
# Collect the per-feature AUC scores into a Series indexed by feature,
# sorted best-first, then keep only features that beat random (AUC > 0.5).
roc_values = pd.Series(roc_auc, index=x_train_unique.columns)
roc_values = roc_values.sort_values(ascending=False)
roc_values
roc_values.plot.bar()
sel = roc_values[roc_values > 0.5]
sel
x_train_roc = x_train_unique[sel.index]
x_test_roc = x_test_unique[sel.index]
def run_randomForest(x_train, x_test, y_train, y_test):
    """Fit a 100-tree random forest and report test-set accuracy.

    Parameters
    ----------
    x_train, x_test : array-like
        Training / test feature matrices.
    y_train, y_test : array-like
        Training / test target vectors.

    Returns
    -------
    float
        Test-set accuracy. Returning it (the original implicitly
        returned None) is backward-compatible and lets callers compare
        feature subsets programmatically instead of reading stdout.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy on test set: ', acc)
    return acc
# NOTE(review): %%time is an IPython cell magic (times the cell body);
# it is only valid inside a Jupyter/IPython notebook cell, not in a
# plain .py script — this file is a flattened notebook export.
%%time
run_randomForest(x_train_roc, x_test_roc, y_train, y_test)
x_train_roc.shape
# Baseline: same model on the full (unfiltered) feature set, for
# comparing both runtime and accuracy against the selected features.
%%time
run_randomForest(x_train, x_test, y_train, y_test)
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell only runs on scikit-learn < 1.2 —
# confirm the pinned version, or port to fetch_california_housing /
# fetch_openml (column names RM/LSTAT are relied on below).
boston = load_boston()
print(boston.DESCR)
# Features as a DataFrame with the dataset's original column names;
# boston.target is the regression target.
x = pd.DataFrame(boston.data, columns=boston.feature_names)
x.head()
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Univariate ranking for regression: fit a one-feature linear model per
# column and record its test MSE (lower error = more informative feature).
mse = []
for col in x_train.columns:
    reg = LinearRegression()
    reg.fit(x_train[[col]], y_train)
    pred = reg.predict(x_test[[col]])
    mse.append(mean_squared_error(y_test, pred))
mse
mse = pd.Series(mse, index=x_train.columns)
mse = mse.sort_values(ascending=False)
mse
# Result: RM and LSTAT show roughly half the error of the other
# features, so these two are the most informative.
mse.plot.bar()
# Build train/test matrices restricted to the two strongest features.
top_features = ['RM', 'LSTAT']
x_train_2 = x_train[top_features]
x_test_2 = x_test[top_features]
%%time
# Linear regression on the two selected features; compare the fit
# quality (R^2, RMSE) against the spread (SD) of the target — an RMSE
# well below the SD means the model explains real variance.
model = LinearRegression()
model.fit(x_train_2, y_train)
y_pred = model.predict(x_test_2)
print('r2_score: ', r2_score(y_test, y_pred))
print('rmse: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('sd of house price: ', np.std(y))
%%time
# Baseline: linear regression on ALL features, to compare runtime and
# fit quality against the two-feature model.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('r2_score: ', r2_score(y_test, y_pred))
print('rmse: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('sd of house price: ', np.std(y))