Use of ROC_AUC in Classification Problem¶

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold

data = pd.read_csv('/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv', nrows=20000)
data.head()

x = data.drop('TARGET', axis=1)
y = data['TARGET']

x.shape, y.shape

((20000, 370), (20000,))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

Remove Constant, Quasi Constant and Duplicate Features¶

# remove Constant and quasi constant features
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(x_train)
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

x_train_filter.shape, x_test_filter.shape

((16000, 245), (4000, 245))

# remove duplicate features
x_train_T = x_train_filter.T
x_test_T = x_test_filter.T

x_train_T = pd.DataFrame(x_train_T)
x_test_T = pd.DataFrame(x_test_T)

x_train_T.duplicated().sum()

18

duplicated_features = x_train_T.duplicated()

features_to_keep = [not index for index in duplicated_features]

x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T

x_train_unique.shape, x_train.shape

((16000, 227), (16000, 370))

Calculate ROC_AUC Score¶

# 独立変数一つづつ使ってRandomForestでモデリング後、roc_auc_scoreを使って変数毎のスコアを求める
roc_auc = []
for feature in x_train_unique.columns:
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(x_train_unique[feature].to_frame(), y_train)
    y_pred = clf.predict(x_test_unique[feature].to_frame())
    roc_auc.append(roc_auc_score(y_test, y_pred))

print(roc_auc)

[0.5020561820568537, 0.5, 0.5, 0.49986968986187125, 0.501373452866903, 0.49569976544175137, 0.5028068643863192, 0.49986968986187125, 0.5, 0.5, 0.4997393797237425, 0.5017643832812891, 0.49569976544175137, 0.49960906958561374, 0.49895751889497003, 0.49700286682303885, 0.49960906958561374, 0.5021553136956755, 0.4968725566849101, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.49986968986187125, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5029371745244479, 0.4959603857180089, 0.5, 0.5048318679438659, 0.4997393797237425, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.49921813917122754, 0.49921813917122754, 0.49824600955181303, 0.5, 0.5, 0.5, 0.4990878290330988, 0.4983763196899418, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5025462441100617, 0.4990878290330988, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.49986968986187125, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4997393797237425, 0.5, 0.5, 0.49986968986187125, 0.4991581805187143, 0.4988272087568413, 0.49674224654678134, 0.4995491109331005, 0.5, 0.5, 0.5022856238338043, 0.5012431427287742, 0.5, 0.5, 0.5, 0.49986968986187125, 0.5, 0.4997393797237425, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5076595179963898]

roc_values = pd.Series(roc_auc)
roc_values.index = x_train_unique.columns
roc_values.sort_values(ascending=False, inplace=True)

roc_values

244    0.507660
107    0.504832
104    0.502937
6      0.502807
155    0.502546
         ...   
18     0.496873
211    0.496742
105    0.495960
12     0.495700
5      0.495700
Length: 227, dtype: float64

roc_values.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7fb41e0a4b00>

sel = roc_values[roc_values > 0.5]
sel

244    0.507660
107    0.504832
104    0.502937
6      0.502807
155    0.502546
215    0.502286
17     0.502155
0      0.502056
11     0.501764
4      0.501373
216    0.501243
dtype: float64

x_train_roc = x_train_unique[sel.index]
x_test_roc = x_test_unique[sel.index]

Build Model and compare the performance¶

def run_randomForest(x_train, x_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100,random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy on test set: ', accuracy_score(y_test, y_pred))

%%time
run_randomForest(x_train_roc, x_test_roc, y_train, y_test)

Accuracy on test set:  0.95275
CPU times: user 4.11 s, sys: 28.8 ms, total: 4.14 s
Wall time: 2.26 s

x_train_roc.shape

(16000, 11)

%%time
run_randomForest(x_train, x_test, y_train, y_test)

Accuracy on test set:  0.9585
CPU times: user 5.86 s, sys: 61.6 ms, total: 5.92 s
Wall time: 3.21 s

Feature Selection using RMSE in Regression¶

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

boston = load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

x = pd.DataFrame(boston.data, columns=boston.feature_names)
x.head()

y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

mse = []
for feature in x_train.columns:
    clf = LinearRegression()
    clf.fit(x_train[feature].to_frame(), y_train)
    y_pred = clf.predict(x_test[feature].to_frame())
    mse.append(mean_squared_error(y_test, y_pred))

mse

[76.38674157646072,
 84.66034377707905,
 77.02905244667242,
 79.36120219345942,
 76.95375968209433,
 46.90735162739531,
 80.3915476111525,
 82.61874125667717,
 82.46499985731933,
 78.30831374720844,
 81.79497121208001,
 77.75285601192718,
 46.336305360025925]

mse = pd.Series(mse, index=x_train.columns)
mse.sort_values(ascending=False, inplace=True)
mse
# 結果：RM, LSTATの誤差が他の変数の半分なので、この２つの変数が重要

ZN         84.660344
DIS        82.618741
RAD        82.465000
PTRATIO    81.794971
AGE        80.391548
CHAS       79.361202
TAX        78.308314
B          77.752856
INDUS      77.029052
NOX        76.953760
CRIM       76.386742
RM         46.907352
LSTAT      46.336305
dtype: float64

mse.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7fb41f17dc18>

# 最も重要な２つの変数(RM, LSTAT)でモデリングし、結果を見る。
x_train_2 = x_train[['RM', 'LSTAT']]
x_test_2 = x_test[['RM', 'LSTAT']]

%%time
model = LinearRegression()
model.fit(x_train_2, y_train)
y_pred = model.predict(x_test_2)
print('r2_score: ', r2_score(y_test, y_pred))
print('rmse: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('sd of house price: ', np.std(y))

r2_score:  0.5409084827186418
rmse:  6.114172522817781
sd of house price:  9.188011545278203
CPU times: user 3.78 ms, sys: 0 ns, total: 3.78 ms
Wall time: 3.39 ms

%%time
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('r2_score: ', r2_score(y_test, y_pred))
print('rmse: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('sd of house price: ', np.std(y))

r2_score:  0.5892223849182512
rmse:  5.783509315085133
sd of house price:  9.188011545278203
CPU times: user 4.7 ms, sys: 0 ns, total: 4.7 ms
Wall time: 6.42 ms

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33

	ID	var3	var15	imp_op_var39_comer_ult1	imp_op_var39_comer_ult3	imp_op_var41_comer_ult1	imp_op_var41_comer_ult3	imp_op_var41_ult1	imp_op_var39_ult1	ind_var5_0	ind_var5	ind_var12_0	ind_var12	ind_var13_0	ind_var13_corto_0	ind_var13_corto	ind_var13	...	saldo_medio_var5_ult1	saldo_medio_var5_ult3	saldo_medio_var12_ult1	saldo_medio_var12_ult3	saldo_medio_var13_corto_hace2	saldo_medio_var13_corto_hace3	saldo_medio_var13_corto_ult1	saldo_medio_var13_corto_ult3	var38
0	1	2	23	0.0	0.0	0.0	0.0	0.0	0.0	1	0	0	0	0	0	0	0	...	0.00	0.00	0.00	0.00	0.0	0.00	0.0	0.00	39205.170000
1	3	2	34	0.0	0.0	0.0	0.0	0.0	0.0	1	0	0	0	1	1	1	1	...	0.00	0.00	0.00	0.00	300.0	122.22	300.0	240.75	49278.030000
2	4	2	23	0.0	0.0	0.0	0.0	0.0	0.0	1	1	0	0	0	0	0	0	...	3.00	2.07	0.00	0.00	0.0	0.00	0.0	0.00	67333.770000
3	8	2	37	195.0	195.0	195.0	195.0	195.0	195.0	1	1	0	0	0	0	0	0	...	91.56	138.84	0.00	0.00	0.0	0.00	0.0	0.00	64007.970000
4	10	2	39	0.0	0.0	0.0	0.0	0.0	0.0	1	0	1	1	0	0	0	0	...	40501.08	13501.47	85501.89	85501.89	0.0	0.00	0.0	0.00	117310.979016