Regularization is a technique that discourages model complexity by adding a penalty term to the loss function. This helps to mitigate the overfitting problem.
A regression model that uses the L1 regularization technique is called Lasso Regression, and a model that uses L2 regularization is called Ridge Regression.
Bias(偏り) - variance(分散) trade off について
偏り(バイアス)を小さくしようとすると分散が大きくなり、分散を小さくしようとすると偏りが大きくなる。そのため、バイアスと分散がそれぞれ適切なバランスになるところでモデルを構築することが必要。その調整を行うのが正則化パラメーターのλ。
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import SelectFromModel
# Load the Titanic data set and check where values are missing.
titanic = sns.load_dataset('titanic')
titanic.head()
titanic.isnull().sum()
# 'age' and 'deck' carry many NaNs, so discard those columns outright,
# then drop the remaining rows that still contain any missing value.
titanic.drop(labels = ['age', 'deck'], axis = 1, inplace = True)
titanic = titanic.dropna()
titanic.isnull().sum()
# Keep only the candidate predictor columns.
data = titanic[['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']].copy()
data.head()
data.isnull().sum()
# Encode each categorical column as an integer code (same codes, and
# applied in the same column order, as the original cell-by-cell maps).
encodings = {
    'sex': {'male': 0, 'female': 1},
    'embarked': {'S': 0, 'C': 1, 'Q': 2},
    'who': {'man': 0, 'woman': 1, 'child': 2},
    'alone': {True: 1, False: 0},
}
for column, mapping in encodings.items():
    data[column] = data[column].map(mapping)
data.head()
# Feature matrix / target vector, then a 70/30 train-test split.
x = data.copy()
y = titanic['survived']
x.shape, y.shape
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=43)
# Select the features whose linear-regression coefficient magnitude is
# above the mean absolute coefficient (SelectFromModel's default threshold).
selector = SelectFromModel(LinearRegression())
selector.fit(x_train, y_train)
# Boolean mask: True marks a selected feature.
selector.get_support()
# Inspect the fitted coefficients and their magnitudes.
coefs = selector.estimator_.coef_
coefs
abs_coefs = np.abs(coefs)
abs_coefs
mean = np.mean(abs_coefs)
mean
# Names of the columns that survived the selection.
features = x_train.columns[selector.get_support()]
features
# Reduce both splits to the selected columns.
x_train_reg = selector.transform(x_train)
x_test_reg = selector.transform(x_test)
x_test_reg.shape
def run_randomForest(x_train, x_test, y_train, y_test):
    """Train a random forest on the training split and report test accuracy.

    Parameters mirror the output of ``train_test_split``: feature matrices
    ``x_train``/``x_test`` and label vectors ``y_train``/``y_test``.

    Prints the accuracy (as before) and additionally returns it, so callers
    can compare feature-selection strategies programmatically instead of
    reading stdout. Returning a value is backward-compatible — existing
    call sites simply ignore it.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    # fit() mutates and returns the same estimator; no reassignment needed.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy: ', acc)
    return acc
# Time the forest on the reduced (coefficient-selected) feature set.
%%time
run_randomForest(x_train_reg, x_test_reg, y_train, y_test)
# Baseline for comparison: time the forest on the full feature set.
%%time
run_randomForest(x_train, x_test, y_train, y_test)
# Number of rows/columns in the full training matrix, for reference.
x_train.shape
# L1 (lasso) regularized logistic regression as the feature selector.
# C controls the regularization strength: the smaller C is, the stronger
# the L1 penalty, so more coefficients are driven exactly to zero and
# fewer features are selected.
l1_model = LogisticRegression(penalty='l1', C=0.05, solver='liblinear')
l1_sel = SelectFromModel(l1_model)
l1_sel.fit(x_train, y_train)
l1_sel.get_support()
# Features flagged False in the mask above have a zero coefficient here.
l1_sel.estimator_.coef_
# Keep only the columns with non-zero coefficients.
x_train_l1 = l1_sel.transform(x_train)
x_test_l1 = l1_sel.transform(x_test)
# Time the forest on the L1-selected feature set.
%%time
run_randomForest(x_train_l1, x_test_l1, y_train, y_test)
# L2 (ridge) regularization shrinks coefficients toward zero but rarely
# makes them exactly zero, so it tends to eliminate fewer features than L1.
l2_sel = SelectFromModel(LogisticRegression(penalty='l2', C=0.05, solver='liblinear'))
l2_sel.fit(x_train, y_train)
# Selection mask and the (shrunken, mostly non-zero) coefficients.
l2_sel.get_support()
l2_sel.estimator_.coef_
# Reduce both splits to the selected columns.
x_train_l2 = l2_sel.transform(x_train)
x_test_l2 = l2_sel.transform(x_test)
# Time the forest on the L2-selected feature set.
%%time
run_randomForest(x_train_l2, x_test_l2, y_train, y_test)