In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
In [0]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
In [4]:
data = pd.read_csv('/content/drive/My Drive/kaggle/Learning_mutual_information/data/santander-train.csv', nrows=20000)
data.head()
Out[4]:
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 imp_op_var40_ult1 imp_op_var41_comer_ult1 imp_op_var41_comer_ult3 imp_op_var41_efect_ult1 imp_op_var41_efect_ult3 imp_op_var41_ult1 imp_op_var39_efect_ult1 imp_op_var39_efect_ult3 imp_op_var39_ult1 imp_sal_var16_ult1 ind_var1_0 ind_var1 ind_var2_0 ind_var2 ind_var5_0 ind_var5 ind_var6_0 ind_var6 ind_var8_0 ind_var8 ind_var12_0 ind_var12 ind_var13_0 ind_var13_corto_0 ind_var13_corto ind_var13_largo_0 ind_var13_largo ind_var13_medio_0 ind_var13_medio ind_var13 ... saldo_medio_var5_ult1 saldo_medio_var5_ult3 saldo_medio_var8_hace2 saldo_medio_var8_hace3 saldo_medio_var8_ult1 saldo_medio_var8_ult3 saldo_medio_var12_hace2 saldo_medio_var12_hace3 saldo_medio_var12_ult1 saldo_medio_var12_ult3 saldo_medio_var13_corto_hace2 saldo_medio_var13_corto_hace3 saldo_medio_var13_corto_ult1 saldo_medio_var13_corto_ult3 saldo_medio_var13_largo_hace2 saldo_medio_var13_largo_hace3 saldo_medio_var13_largo_ult1 saldo_medio_var13_largo_ult3 saldo_medio_var13_medio_hace2 saldo_medio_var13_medio_hace3 saldo_medio_var13_medio_ult1 saldo_medio_var13_medio_ult3 saldo_medio_var17_hace2 saldo_medio_var17_hace3 saldo_medio_var17_ult1 saldo_medio_var17_ult3 saldo_medio_var29_hace2 saldo_medio_var29_hace3 saldo_medio_var29_ult1 saldo_medio_var29_ult3 saldo_medio_var33_hace2 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39205.170000 0
1 3 2 34 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 ... 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 300.0 122.22 300.0 240.75 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 49278.030000 0
2 4 2 23 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 3.00 2.07 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 67333.770000 0
3 8 2 37 0.0 195.0 195.0 0.0 0.0 0 0 0.0 195.0 195.0 0.0 0.0 195.0 0.0 0.0 195.0 0.0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 91.56 138.84 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 64007.970000 0
4 10 2 39 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 ... 40501.08 13501.47 0.0 0.0 0.0 0.0 0.0 0.0 85501.89 85501.89 0.0 0.00 0.0 0.00 0.0 0.0 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 117310.979016 0

5 rows × 371 columns

In [5]:
x = data.drop('TARGET', axis=1)
y = data['TARGET']

x.shape, y.shape
Out[5]:
((20000, 370), (20000,))
In [0]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

Remove constant, quasi constant, and duplicate features

In [0]:
# Remove constant, quasi constant features
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(x_train)
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)
In [0]:
# Remove duplicate features
x_train_T = x_train_filter.T
x_test_T = x_test_filter.T
In [0]:
x_train_T = pd.DataFrame(x_train_T)
x_test_T = pd.DataFrame(x_test_T)
In [11]:
# check duplicated numbers
x_train_T.shape
Out[11]:
(245, 16000)
In [12]:
x_train_T.duplicated().sum()
Out[12]:
18
In [15]:
duplicated_features = x_train_T.duplicated()
duplicated_features
Out[15]:
0      False
1      False
2      False
3      False
4      False
       ...  
240    False
241    False
242    False
243    False
244    False
Length: 245, dtype: bool
In [0]:
features_to_keep = [not index for index in duplicate_features]
In [0]:
x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T
In [18]:
x_train_unique.shape, x_test_unique.shape
Out[18]:
((16000, 227), (4000, 227))

Calculate the Mutual Information

In [0]:
mi = mutual_info_classif(x_train_unique, y_train)
In [20]:
len(mei)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-7661fea77be0> in <module>()
----> 1 len(mei)

NameError: name 'mei' is not defined
In [21]:
mi
Out[21]:
array([2.50224816e-03, 0.00000000e+00, 1.33267122e-02, 4.60248826e-04,
       0.00000000e+00, 9.50909184e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.51882802e-04, 7.16627769e-04, 1.12666281e-03, 0.00000000e+00,
       1.85568359e-03, 3.67261209e-03, 5.27533055e-04, 1.83874314e-03,
       4.29378241e-04, 6.13554711e-03, 8.79371550e-03, 2.64746671e-03,
       1.91564734e-03, 1.69407877e-03, 2.10789448e-03, 2.30692148e-03,
       9.73770861e-04, 0.00000000e+00, 4.32073698e-04, 5.60750319e-04,
       1.30385932e-03, 1.83930518e-03, 0.00000000e+00, 0.00000000e+00,
       4.36147975e-04, 0.00000000e+00, 1.22481972e-02, 3.13854653e-03,
       0.00000000e+00, 4.26768832e-03, 1.83347140e-03, 0.00000000e+00,
       0.00000000e+00, 8.68772281e-03, 0.00000000e+00, 8.50931869e-03,
       8.85677602e-04, 1.16470462e-03, 2.68652367e-04, 0.00000000e+00,
       9.03518720e-05, 0.00000000e+00, 2.33394433e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.05865677e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00145673e-03,
       2.41738871e-04, 3.02736843e-03, 0.00000000e+00, 1.68991012e-03,
       0.00000000e+00, 1.80244168e-03, 1.93986713e-04, 0.00000000e+00,
       1.33146604e-03, 0.00000000e+00, 5.55780316e-04, 0.00000000e+00,
       1.07569478e-03, 0.00000000e+00, 4.14600973e-04, 4.18768870e-03,
       9.14915495e-03, 2.86919217e-04, 0.00000000e+00, 1.47084711e-03,
       7.52272375e-03, 1.30948420e-03, 0.00000000e+00, 3.63381863e-03,
       6.72791330e-03, 4.76684666e-03, 1.01277742e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 7.54658870e-03, 1.34662910e-03,
       1.85614225e-03, 4.65902623e-04, 2.95445396e-04, 2.80334059e-04,
       0.00000000e+00, 8.25583552e-04, 4.78405888e-06, 2.00059797e-04,
       9.67379808e-04, 0.00000000e+00, 1.62802580e-03, 1.33276611e-02,
       5.57016959e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       9.58703275e-03, 3.05512958e-03, 7.34154618e-03, 9.52580525e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00634564e-03, 1.58218469e-03,
       6.30666147e-04, 0.00000000e+00, 1.63929858e-03, 1.99124730e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.43615878e-04,
       0.00000000e+00, 2.17213556e-04, 1.91935410e-03, 0.00000000e+00,
       1.70014920e-03, 0.00000000e+00, 9.65470096e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.10146158e-05, 0.00000000e+00,
       1.03800287e-03, 0.00000000e+00, 0.00000000e+00, 9.77684080e-04,
       0.00000000e+00, 0.00000000e+00, 9.16354410e-05, 2.77714320e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.10513619e-03, 7.44607372e-04, 7.84420147e-04,
       0.00000000e+00, 3.26613011e-03, 2.23530617e-03, 0.00000000e+00,
       9.59459878e-03, 0.00000000e+00, 3.66240534e-05, 2.76007216e-04,
       1.35102823e-03, 3.00172603e-03, 2.57951233e-04, 0.00000000e+00,
       0.00000000e+00, 1.14673881e-03, 7.32890520e-04, 6.68696530e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.51555874e-04,
       1.06597157e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.24500403e-04, 2.20316287e-03, 3.15278687e-03, 0.00000000e+00,
       0.00000000e+00, 2.89828555e-05, 2.20939478e-03, 7.11813391e-03,
       9.44004311e-03, 7.01418998e-03, 9.31830934e-03, 9.87597616e-04,
       7.59478532e-04, 0.00000000e+00, 1.50122102e-03, 1.44988605e-04,
       2.90973962e-03, 1.97432482e-03, 0.00000000e+00, 2.28224307e-03,
       2.50953392e-03, 8.17916694e-04, 1.20501378e-03, 1.93368764e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 8.01145653e-04, 1.91562198e-03, 6.53125326e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.15537728e-03,
       1.13669805e-03, 0.00000000e+00, 0.00000000e+00, 1.30567008e-03,
       1.18102621e-03, 0.00000000e+00, 1.28929952e-03])
In [0]:
mi = pd.Series(mi)
mi.index = x_train_unique.columns
In [0]:
mi.sort_values(ascending=False, inplace=True)
In [26]:
mi.plot.bar(figsize=(16,5))
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efbe27a9470>
In [27]:
# select features by percentile

# percentile=10 で 上位10パーセントの特徴量を選択
sel = SelectPercentile(mutual_info_classif, percentile=10).fit(x_train_unique, y_train)
x_train_unique.columns[sel.get_support()]
Out[27]:
Int64Index([  2,  16,  21,  22,  40,  44,  49,  51,  60,  75,  86,  88,  91,
            101, 105, 119, 125, 127, 182, 209, 210, 211, 212],
           dtype='int64')
In [28]:
len(x_train_unique.columns[sel.get_support()])
Out[28]:
23
In [29]:
# help document check
help(sel)
Help on SelectPercentile in module sklearn.feature_selection._univariate_selection object:

class SelectPercentile(_BaseFilter)
 |  Select features according to a percentile of the highest scores.
 |  
 |  Read more in the :ref:`User Guide <univariate_feature_selection>`.
 |  
 |  Parameters
 |  ----------
 |  score_func : callable
 |      Function taking two arrays X and y, and returning a pair of arrays
 |      (scores, pvalues) or a single array with scores.
 |      Default is f_classif (see below "See also"). The default function only
 |      works with classification tasks.
 |  
 |  percentile : int, optional, default=10
 |      Percent of features to keep.
 |  
 |  Attributes
 |  ----------
 |  scores_ : array-like of shape (n_features,)
 |      Scores of features.
 |  
 |  pvalues_ : array-like of shape (n_features,)
 |      p-values of feature scores, None if `score_func` returned only scores.
 |  
 |  Examples
 |  --------
 |  >>> from sklearn.datasets import load_digits
 |  >>> from sklearn.feature_selection import SelectPercentile, chi2
 |  >>> X, y = load_digits(return_X_y=True)
 |  >>> X.shape
 |  (1797, 64)
 |  >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
 |  >>> X_new.shape
 |  (1797, 7)
 |  
 |  Notes
 |  -----
 |  Ties between features with equal scores will be broken in an unspecified
 |  way.
 |  
 |  See also
 |  --------
 |  f_classif: ANOVA F-value between label/feature for classification tasks.
 |  mutual_info_classif: Mutual information for a discrete target.
 |  chi2: Chi-squared stats of non-negative features for classification tasks.
 |  f_regression: F-value between label/feature for regression tasks.
 |  mutual_info_regression: Mutual information for a continuous target.
 |  SelectKBest: Select features based on the k highest scores.
 |  SelectFpr: Select features based on a false positive rate test.
 |  SelectFdr: Select features based on an estimated false discovery rate.
 |  SelectFwe: Select features based on family-wise error rate.
 |  GenericUnivariateSelect: Univariate feature selector with configurable mode.
 |  
 |  Method resolution order:
 |      SelectPercentile
 |      _BaseFilter
 |      sklearn.feature_selection._base.SelectorMixin
 |      sklearn.base.TransformerMixin
 |      sklearn.base.BaseEstimator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, score_func=<function f_classif at 0x7efbe4d478c8>, percentile=10)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmethods__ = frozenset()
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from _BaseFilter:
 |  
 |  fit(self, X, y)
 |      Run score function on (X, y) and get the appropriate features.
 |      
 |      Parameters
 |      ----------
 |      X : array-like of shape (n_samples, n_features)
 |          The training input samples.
 |      
 |      y : array-like of shape (n_samples,)
 |          The target values (class labels in classification, real numbers in
 |          regression).
 |      
 |      Returns
 |      -------
 |      self : object
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.feature_selection._base.SelectorMixin:
 |  
 |  get_support(self, indices=False)
 |      Get a mask, or integer index, of the features selected
 |      
 |      Parameters
 |      ----------
 |      indices : boolean (default False)
 |          If True, the return value will be an array of integers, rather
 |          than a boolean mask.
 |      
 |      Returns
 |      -------
 |      support : array
 |          An index that selects the retained features from a feature vector.
 |          If `indices` is False, this is a boolean array of shape
 |          [# input features], in which an element is True iff its
 |          corresponding feature is selected for retention. If `indices` is
 |          True, this is an integer array of shape [# output features] whose
 |          values are indices into the input feature vector.
 |  
 |  inverse_transform(self, X)
 |      Reverse the transformation operation
 |      
 |      Parameters
 |      ----------
 |      X : array of shape [n_samples, n_selected_features]
 |          The input samples.
 |      
 |      Returns
 |      -------
 |      X_r : array of shape [n_samples, n_original_features]
 |          `X` with columns of zeros inserted where features would have
 |          been removed by :meth:`transform`.
 |  
 |  transform(self, X)
 |      Reduce X to the selected features.
 |      
 |      Parameters
 |      ----------
 |      X : array of shape [n_samples, n_features]
 |          The input samples.
 |      
 |      Returns
 |      -------
 |      X_r : array of shape [n_samples, n_selected_features]
 |          The input samples with only the selected features.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.TransformerMixin:
 |  
 |  fit_transform(self, X, y=None, **fit_params)
 |      Fit to data, then transform it.
 |      
 |      Fits transformer to X and y with optional parameters fit_params
 |      and returns a transformed version of X.
 |      
 |      Parameters
 |      ----------
 |      X : numpy array of shape [n_samples, n_features]
 |          Training set.
 |      
 |      y : numpy array of shape [n_samples]
 |          Target values.
 |      
 |      **fit_params : dict
 |          Additional fit parameters.
 |      
 |      Returns
 |      -------
 |      X_new : numpy array of shape [n_samples, n_features_new]
 |          Transformed array.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.TransformerMixin:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |  
 |  __repr__(self, N_CHAR_MAX=700)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : bool, default=True
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : mapping of string to any
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as pipelines). The latter have parameters of the form
 |      ``<component>__<parameter>`` so that it's possible to update each
 |      component of a nested object.
 |      
 |      Parameters
 |      ----------
 |      **params : dict
 |          Estimator parameters.
 |      
 |      Returns
 |      -------
 |      self : object
 |          Estimator instance.

In [0]:
x_train_mi = sel.transform(x_train_unique)
x_test_mi = sel.transform(x_test_unique)
In [31]:
x_train_mi.shape
Out[31]:
(16000, 23)

Build the model and compare the performance

In [0]:
def run_randomForest(x_train, x_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))
In [33]:
%%time
run_randomForest(x_train_mi, x_test_mi, y_train, y_test)
Accuracy on test set: 
0.958
CPU times: user 2.32 s, sys: 24.1 ms, total: 2.35 s
Wall time: 1.37 s
In [34]:
%%time
run_randomForest(x_train, x_test, y_train, y_test)
Accuracy on test set: 
0.9585
CPU times: user 6.67 s, sys: 31.4 ms, total: 6.71 s
Wall time: 3.63 s
In [0]:
 

Mutual Information Gain in Regression

In [0]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
In [0]:
boston = load_boston()
In [37]:
print(boston.DESCR)
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

In [38]:
x = pd.DataFrame(data=boston.data, columns=boston.feature_names)
x.head()
Out[38]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
In [0]:
y = boston.target
In [0]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [0]:
mi = mutual_info_regression(x_train, y_train)
mi = pd.Series(mi)
mi.index = x_train.columns
mi.sort_values(ascending=False, inplace=True)
In [42]:
mi
Out[42]:
LSTAT      0.680517
RM         0.558906
INDUS      0.521404
PTRATIO    0.484077
NOX        0.448059
TAX        0.380257
CRIM       0.357595
AGE        0.344053
DIS        0.320180
RAD        0.208857
ZN         0.195227
B          0.159919
CHAS       0.018081
dtype: float64
In [43]:
mi.plot.bar()
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efbe06d0c50>
In [44]:
# get top 9 features
# k=9 <- get 9 features
sel = SelectKBest(mutual_info_regression, k=9).fit(x_train, y_train)
x_train.columns[sel.get_support()]
Out[44]:
Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
In [0]:
model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
In [46]:
r2_score(y_test, y_predict)
Out[46]:
0.5892223849182512
In [47]:
np.sqrt(mean_squared_error(y_test, y_predict))
Out[47]:
5.783509315085133
In [48]:
np.std(y)
Out[48]:
9.188011545278203
In [0]:
 
In [49]:
x_train_9 = sel.transform(x_train)
x_train_9.shape
Out[49]:
(404, 9)
In [0]:
x_test_9 = sel.transform(x_test)
In [51]:
model = LinearRegression()
model.fit(x_train_9, y_train)
y_predict = model.predict(x_test_9)
print('r2_score')
r2_score(y_test, y_predict)
r2_score
Out[51]:
0.5317127606961575
In [52]:
print('rmse')
np.sqrt(mean_squared_error(y_test, y_predict))
rmse
Out[52]:
6.175103151293748
In [0]: