import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

data = pd.read_csv('/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv', nrows=20000)
data.head()

x = data.drop('TARGET', axis=1)
y = data['TARGET']

x.shape, y.shape

((20000, 370), (20000,))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

Remove constant, quasi constant, and duplicate features¶

# Remove constant, quasi constant features
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(x_train)
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

# Remove duplicate features
x_train_T = x_train_filter.T
x_test_T = x_test_filter.T

x_train_T = pd.DataFrame(x_train_T)
x_test_T = pd.DataFrame(x_test_T)

# check duplicated numbers
x_train_T.shape

(245, 16000)

x_train_T.duplicated().sum()

18

duplicated_features = x_train_T.duplicated()
duplicated_features

0      False
1      False
2      False
3      False
4      False
       ...  
240    False
241    False
242    False
243    False
244    False
Length: 245, dtype: bool

features_to_keep = [not index for index in duplicate_features]

x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T

x_train_unique.shape, x_test_unique.shape

((16000, 227), (4000, 227))

Calculate the Mutual Information¶

mi = mutual_info_classif(x_train_unique, y_train)

len(mei)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-7661fea77be0> in <module>()
----> 1 len(mei)

NameError: name 'mei' is not defined

mi

array([2.50224816e-03, 0.00000000e+00, 1.33267122e-02, 4.60248826e-04,
       0.00000000e+00, 9.50909184e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.51882802e-04, 7.16627769e-04, 1.12666281e-03, 0.00000000e+00,
       1.85568359e-03, 3.67261209e-03, 5.27533055e-04, 1.83874314e-03,
       4.29378241e-04, 6.13554711e-03, 8.79371550e-03, 2.64746671e-03,
       1.91564734e-03, 1.69407877e-03, 2.10789448e-03, 2.30692148e-03,
       9.73770861e-04, 0.00000000e+00, 4.32073698e-04, 5.60750319e-04,
       1.30385932e-03, 1.83930518e-03, 0.00000000e+00, 0.00000000e+00,
       4.36147975e-04, 0.00000000e+00, 1.22481972e-02, 3.13854653e-03,
       0.00000000e+00, 4.26768832e-03, 1.83347140e-03, 0.00000000e+00,
       0.00000000e+00, 8.68772281e-03, 0.00000000e+00, 8.50931869e-03,
       8.85677602e-04, 1.16470462e-03, 2.68652367e-04, 0.00000000e+00,
       9.03518720e-05, 0.00000000e+00, 2.33394433e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.05865677e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00145673e-03,
       2.41738871e-04, 3.02736843e-03, 0.00000000e+00, 1.68991012e-03,
       0.00000000e+00, 1.80244168e-03, 1.93986713e-04, 0.00000000e+00,
       1.33146604e-03, 0.00000000e+00, 5.55780316e-04, 0.00000000e+00,
       1.07569478e-03, 0.00000000e+00, 4.14600973e-04, 4.18768870e-03,
       9.14915495e-03, 2.86919217e-04, 0.00000000e+00, 1.47084711e-03,
       7.52272375e-03, 1.30948420e-03, 0.00000000e+00, 3.63381863e-03,
       6.72791330e-03, 4.76684666e-03, 1.01277742e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 7.54658870e-03, 1.34662910e-03,
       1.85614225e-03, 4.65902623e-04, 2.95445396e-04, 2.80334059e-04,
       0.00000000e+00, 8.25583552e-04, 4.78405888e-06, 2.00059797e-04,
       9.67379808e-04, 0.00000000e+00, 1.62802580e-03, 1.33276611e-02,
       5.57016959e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       9.58703275e-03, 3.05512958e-03, 7.34154618e-03, 9.52580525e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00634564e-03, 1.58218469e-03,
       6.30666147e-04, 0.00000000e+00, 1.63929858e-03, 1.99124730e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.43615878e-04,
       0.00000000e+00, 2.17213556e-04, 1.91935410e-03, 0.00000000e+00,
       1.70014920e-03, 0.00000000e+00, 9.65470096e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.10146158e-05, 0.00000000e+00,
       1.03800287e-03, 0.00000000e+00, 0.00000000e+00, 9.77684080e-04,
       0.00000000e+00, 0.00000000e+00, 9.16354410e-05, 2.77714320e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.10513619e-03, 7.44607372e-04, 7.84420147e-04,
       0.00000000e+00, 3.26613011e-03, 2.23530617e-03, 0.00000000e+00,
       9.59459878e-03, 0.00000000e+00, 3.66240534e-05, 2.76007216e-04,
       1.35102823e-03, 3.00172603e-03, 2.57951233e-04, 0.00000000e+00,
       0.00000000e+00, 1.14673881e-03, 7.32890520e-04, 6.68696530e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.51555874e-04,
       1.06597157e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.24500403e-04, 2.20316287e-03, 3.15278687e-03, 0.00000000e+00,
       0.00000000e+00, 2.89828555e-05, 2.20939478e-03, 7.11813391e-03,
       9.44004311e-03, 7.01418998e-03, 9.31830934e-03, 9.87597616e-04,
       7.59478532e-04, 0.00000000e+00, 1.50122102e-03, 1.44988605e-04,
       2.90973962e-03, 1.97432482e-03, 0.00000000e+00, 2.28224307e-03,
       2.50953392e-03, 8.17916694e-04, 1.20501378e-03, 1.93368764e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 8.01145653e-04, 1.91562198e-03, 6.53125326e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.15537728e-03,
       1.13669805e-03, 0.00000000e+00, 0.00000000e+00, 1.30567008e-03,
       1.18102621e-03, 0.00000000e+00, 1.28929952e-03])

mi = pd.Series(mi)
mi.index = x_train_unique.columns

mi.sort_values(ascending=False, inplace=True)

mi.plot.bar(figsize=(16,5))

<matplotlib.axes._subplots.AxesSubplot at 0x7efbe27a9470>

# select features by percentile

# percentile=10 で　上位10パーセントの特徴量を選択
sel = SelectPercentile(mutual_info_classif, percentile=10).fit(x_train_unique, y_train)
x_train_unique.columns[sel.get_support()]

Int64Index([  2,  16,  21,  22,  40,  44,  49,  51,  60,  75,  86,  88,  91,
            101, 105, 119, 125, 127, 182, 209, 210, 211, 212],
           dtype='int64')

len(x_train_unique.columns[sel.get_support()])

23

# help document check
help(sel)

Help on SelectPercentile in module sklearn.feature_selection._univariate_selection object:

class SelectPercentile(_BaseFilter)
 |  Select features according to a percentile of the highest scores.
 |  
 |  Read more in the :ref:`User Guide <univariate_feature_selection>`.
 |  
 |  Parameters
 |  ----------
 |  score_func : callable
 |      Function taking two arrays X and y, and returning a pair of arrays
 |      (scores, pvalues) or a single array with scores.
 |      Default is f_classif (see below "See also"). The default function only
 |      works with classification tasks.
 |  
 |  percentile : int, optional, default=10
 |      Percent of features to keep.
 |  
 |  Attributes
 |  ----------
 |  scores_ : array-like of shape (n_features,)
 |      Scores of features.
 |  
 |  pvalues_ : array-like of shape (n_features,)
 |      p-values of feature scores, None if `score_func` returned only scores.
 |  
 |  Examples
 |  --------
 |  >>> from sklearn.datasets import load_digits
 |  >>> from sklearn.feature_selection import SelectPercentile, chi2
 |  >>> X, y = load_digits(return_X_y=True)
 |  >>> X.shape
 |  (1797, 64)
 |  >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
 |  >>> X_new.shape
 |  (1797, 7)
 |  
 |  Notes
 |  -----
 |  Ties between features with equal scores will be broken in an unspecified
 |  way.
 |  
 |  See also
 |  --------
 |  f_classif: ANOVA F-value between label/feature for classification tasks.
 |  mutual_info_classif: Mutual information for a discrete target.
 |  chi2: Chi-squared stats of non-negative features for classification tasks.
 |  f_regression: F-value between label/feature for regression tasks.
 |  mutual_info_regression: Mutual information for a continuous target.
 |  SelectKBest: Select features based on the k highest scores.
 |  SelectFpr: Select features based on a false positive rate test.
 |  SelectFdr: Select features based on an estimated false discovery rate.
 |  SelectFwe: Select features based on family-wise error rate.
 |  GenericUnivariateSelect: Univariate feature selector with configurable mode.
 |  
 |  Method resolution order:
 |      SelectPercentile
 |      _BaseFilter
 |      sklearn.feature_selection._base.SelectorMixin
 |      sklearn.base.TransformerMixin
 |      sklearn.base.BaseEstimator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, score_func=<function f_classif at 0x7efbe4d478c8>, percentile=10)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmethods__ = frozenset()
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from _BaseFilter:
 |  
 |  fit(self, X, y)
 |      Run score function on (X, y) and get the appropriate features.
 |      
 |      Parameters
 |      ----------
 |      X : array-like of shape (n_samples, n_features)
 |          The training input samples.
 |      
 |      y : array-like of shape (n_samples,)
 |          The target values (class labels in classification, real numbers in
 |          regression).
 |      
 |      Returns
 |      -------
 |      self : object
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.feature_selection._base.SelectorMixin:
 |  
 |  get_support(self, indices=False)
 |      Get a mask, or integer index, of the features selected
 |      
 |      Parameters
 |      ----------
 |      indices : boolean (default False)
 |          If True, the return value will be an array of integers, rather
 |          than a boolean mask.
 |      
 |      Returns
 |      -------
 |      support : array
 |          An index that selects the retained features from a feature vector.
 |          If `indices` is False, this is a boolean array of shape
 |          [# input features], in which an element is True iff its
 |          corresponding feature is selected for retention. If `indices` is
 |          True, this is an integer array of shape [# output features] whose
 |          values are indices into the input feature vector.
 |  
 |  inverse_transform(self, X)
 |      Reverse the transformation operation
 |      
 |      Parameters
 |      ----------
 |      X : array of shape [n_samples, n_selected_features]
 |          The input samples.
 |      
 |      Returns
 |      -------
 |      X_r : array of shape [n_samples, n_original_features]
 |          `X` with columns of zeros inserted where features would have
 |          been removed by :meth:`transform`.
 |  
 |  transform(self, X)
 |      Reduce X to the selected features.
 |      
 |      Parameters
 |      ----------
 |      X : array of shape [n_samples, n_features]
 |          The input samples.
 |      
 |      Returns
 |      -------
 |      X_r : array of shape [n_samples, n_selected_features]
 |          The input samples with only the selected features.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.TransformerMixin:
 |  
 |  fit_transform(self, X, y=None, **fit_params)
 |      Fit to data, then transform it.
 |      
 |      Fits transformer to X and y with optional parameters fit_params
 |      and returns a transformed version of X.
 |      
 |      Parameters
 |      ----------
 |      X : numpy array of shape [n_samples, n_features]
 |          Training set.
 |      
 |      y : numpy array of shape [n_samples]
 |          Target values.
 |      
 |      **fit_params : dict
 |          Additional fit parameters.
 |      
 |      Returns
 |      -------
 |      X_new : numpy array of shape [n_samples, n_features_new]
 |          Transformed array.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.TransformerMixin:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |  
 |  __repr__(self, N_CHAR_MAX=700)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : bool, default=True
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : mapping of string to any
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as pipelines). The latter have parameters of the form
 |      ``<component>__<parameter>`` so that it's possible to update each
 |      component of a nested object.
 |      
 |      Parameters
 |      ----------
 |      **params : dict
 |          Estimator parameters.
 |      
 |      Returns
 |      -------
 |      self : object
 |          Estimator instance.

x_train_mi = sel.transform(x_train_unique)
x_test_mi = sel.transform(x_test_unique)

x_train_mi.shape

(16000, 23)

Build the model and compare the performance¶

def run_randomForest(x_train, x_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))

%%time
run_randomForest(x_train_mi, x_test_mi, y_train, y_test)

Accuracy on test set: 
0.958
CPU times: user 2.32 s, sys: 24.1 ms, total: 2.35 s
Wall time: 1.37 s

%%time
run_randomForest(x_train, x_test, y_train, y_test)

Accuracy on test set: 
0.9585
CPU times: user 6.67 s, sys: 31.4 ms, total: 6.71 s
Wall time: 3.63 s

Mutual Information Gain in Regression¶

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

boston = load_boston()

print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

x = pd.DataFrame(data=boston.data, columns=boston.feature_names)
x.head()

y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

mi = mutual_info_regression(x_train, y_train)
mi = pd.Series(mi)
mi.index = x_train.columns
mi.sort_values(ascending=False, inplace=True)

mi

LSTAT      0.680517
RM         0.558906
INDUS      0.521404
PTRATIO    0.484077
NOX        0.448059
TAX        0.380257
CRIM       0.357595
AGE        0.344053
DIS        0.320180
RAD        0.208857
ZN         0.195227
B          0.159919
CHAS       0.018081
dtype: float64

mi.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7efbe06d0c50>

# get top 9 features
# k=9 <- get 9 features
sel = SelectKBest(mutual_info_regression, k=9).fit(x_train, y_train)
x_train.columns[sel.get_support()]

Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')

model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

r2_score(y_test, y_predict)

0.5892223849182512

np.sqrt(mean_squared_error(y_test, y_predict))

5.783509315085133

np.std(y)

9.188011545278203

x_train_9 = sel.transform(x_train)
x_train_9.shape

(404, 9)

x_test_9 = sel.transform(x_test)

model = LinearRegression()
model.fit(x_train_9, y_train)
y_predict = model.predict(x_test_9)
print('r2_score')
r2_score(y_test, y_predict)

r2_score

0.5317127606961575

print('rmse')
np.sqrt(mean_squared_error(y_test, y_predict))

rmse

6.175103151293748

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33

	ID	var3	var15	imp_op_var39_comer_ult1	imp_op_var39_comer_ult3	imp_op_var41_comer_ult1	imp_op_var41_comer_ult3	imp_op_var41_ult1	imp_op_var39_ult1	ind_var5_0	ind_var5	ind_var12_0	ind_var12	ind_var13_0	ind_var13_corto_0	ind_var13_corto	ind_var13	...	saldo_medio_var5_ult1	saldo_medio_var5_ult3	saldo_medio_var12_ult1	saldo_medio_var12_ult3	saldo_medio_var13_corto_hace2	saldo_medio_var13_corto_hace3	saldo_medio_var13_corto_ult1	saldo_medio_var13_corto_ult3	var38
0	1	2	23	0.0	0.0	0.0	0.0	0.0	0.0	1	0	0	0	0	0	0	0	...	0.00	0.00	0.00	0.00	0.0	0.00	0.0	0.00	39205.170000
1	3	2	34	0.0	0.0	0.0	0.0	0.0	0.0	1	0	0	0	1	1	1	1	...	0.00	0.00	0.00	0.00	300.0	122.22	300.0	240.75	49278.030000
2	4	2	23	0.0	0.0	0.0	0.0	0.0	0.0	1	1	0	0	0	0	0	0	...	3.00	2.07	0.00	0.00	0.0	0.00	0.0	0.00	67333.770000
3	8	2	37	195.0	195.0	195.0	195.0	195.0	195.0	1	1	0	0	0	0	0	0	...	91.56	138.84	0.00	0.00	0.0	0.00	0.0	0.00	64007.970000
4	10	2	39	0.0	0.0	0.0	0.0	0.0	0.0	1	0	1	1	0	0	0	0	...	40501.08	13501.47	85501.89	85501.89	0.0	0.00	0.0	0.00	117310.979016