import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Download the penguins table from the seaborn-data repository and keep only
# the rows without missing values.
link2data = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
df = pd.read_csv(link2data).dropna()
# Preview the first rows of the cleaned table.
df.head()
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | MALE |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | FEMALE |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | FEMALE |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | FEMALE |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | MALE |
# (rows, columns) of the table that remains after dropna().
df.shape
(333, 7)
# First variable set for the analysis: the two bill measurements.
bill_columns = ['bill_length_mm', 'bill_depth_mm']
x = df[bill_columns]
x.head(3)
| | bill_length_mm | bill_depth_mm |
|---|---|---|
| 0 | 39.1 | 18.7 |
| 1 | 39.5 | 17.4 |
| 2 | 40.3 | 18.0 |
# Second variable set for the analysis: flipper length and body mass.
body_columns = ['flipper_length_mm', 'body_mass_g']
y = df[body_columns]
y.head(3)
| | flipper_length_mm | body_mass_g |
|---|---|---|
| 0 | 181.0 | 3750.0 |
| 1 | 186.0 | 3800.0 |
| 2 | 195.0 | 3250.0 |
# Standardize each bill column: subtract its mean, then divide by its
# standard deviation.
x_centered = x - x.mean()
x_mc = x_centered / x.std()
x_mc.head(3)
| | bill_length_mm | bill_depth_mm |
|---|---|---|
| 0 | -0.894695 | 0.779559 |
| 1 | -0.821552 | 0.119404 |
| 2 | -0.675264 | 0.424091 |
# Standardize the flipper/body-mass columns the same way as the bill columns:
# per-column mean removed, then scaled by the per-column standard deviation.
y_mean, y_std = y.mean(), y.std()
y_mc = (y - y_mean) / y_std
y_mc.head(3)
| | flipper_length_mm | body_mass_g |
|---|---|---|
| 0 | -1.424608 | -0.567621 |
| 1 | -1.067867 | -0.505525 |
| 2 | -0.425733 | -1.188572 |
from sklearn.cross_decomposition import CCA
# Fit canonical correlation analysis (default settings) on the two
# standardized variable sets; fit() returns the fitted estimator itself.
ca = CCA().fit(x_mc, y_mc)
# Project both sets onto their canonical variates.
canonical_scores = ca.transform(x_mc, y_mc)
x_c, y_c = canonical_scores
x_c[:3]
array([[-1.18625232, -0.01036701],
[-0.70957262, -0.4560358 ],
[-0.79073194, -0.13080943],
[-1.7186634 , -0.07362316],
[-1.77229457, 0.73624799]])
# First three rows of the canonical variates computed from the y variable set.
y_c[:3]
array([[-1.40879506, 0.68286617],
[-1.05385671, 0.42987851],
[-0.3935502 , -0.83961988]])
# Report the dimensions of both canonical-variate arrays, x first, then y.
for scores in (x_c, y_c):
    print(scores.shape)
(333, 2) (333, 2)
# Gather both pairs of canonical variates next to the categorical penguin
# attributes so the scores can be plotted and grouped by those attributes.
cc_res = pd.DataFrame(
    {
        "Ccx_1": x_c[:, 0],
        "Ccy_1": y_c[:, 0],
        "Ccx_2": x_c[:, 1],
        "Ccy_2": y_c[:, 1],
        "Species": df["species"].tolist(),
        "Island": df["island"].tolist(),
        "Sex": df["sex"].tolist(),
    }
)
cc_res.head()
| | Ccx_1 | Ccy_1 | Ccx_2 | Ccy_2 | Species | Island | Sex |
|---|---|---|---|---|---|---|---|
| 0 | -1.186252 | -1.408795 | -0.010367 | 0.682866 | Adelie | Torgersen | MALE |
| 1 | -0.709573 | -1.053857 | -0.456036 | 0.429879 | Adelie | Torgersen | FEMALE |
| 2 | -0.790732 | -0.393550 | -0.130809 | -0.839620 | Adelie | Torgersen | FEMALE |
| 3 | -1.718663 | -0.542888 | -0.073623 | -0.458571 | Adelie | Torgersen | FEMALE |
| 4 | -1.772295 | -0.763548 | 0.736248 | -0.014204 | Adelie | Torgersen | MALE |
# 2x2 correlation matrix of the first pair of canonical variates; the
# off-diagonal entry is the correlation between the x and y variate.
np.corrcoef(x_c[:,0], y_c[:,0])
array([[1. , 0.78763151],
[0.78763151, 1. ]])
# 2x2 correlation matrix of the second pair of canonical variates; the
# off-diagonal entry is the correlation between the x and y variate.
np.corrcoef(x_c[:,1], y_c[:,1])
array([[1. , 0.08638695],
[0.08638695, 1. ]])
# Scatter the first x canonical variate against the first y canonical variate
# and put their correlation (two decimals) in the title.
sns.set_context("talk", font_scale=1.2)
plt.figure(figsize=(10, 8))
sns.scatterplot(data=cc_res, x="Ccx_1", y="Ccy_1")
first_pair_corr = np.corrcoef(x_c[:, 0], y_c[:, 0])[0, 1]
plt.title("Comp. 1, corr= %.2f" % first_pair_corr)
Text(0.5, 1.0, 'Comp. 1, corr= 0.79')
# Per-species distribution of the first x canonical variate: a box plot with
# the individual observations drawn on top of it.
plt.figure(figsize=(10, 8))
plt.title('Boxplot of Canonical Correlate from Ccx_1 and latent variable(species)')
ccx1_by_species = dict(x="Species", y="Ccx_1", data=cc_res)
sns.boxplot(**ccx1_by_species)
sns.stripplot(**ccx1_by_species)
<AxesSubplot:title={'center':'Boxplot of Canonical Correlate from Ccx_1 and latent variable(species)'}, xlabel='Species', ylabel='Ccx_1'>
# Per-species distribution of the raw bill length: a box plot with the
# individual observations drawn on top of it.
plt.figure(figsize=(8, 6))
plt.title('Boxplot of relation before CCA')
bill_length_by_species = dict(x="species", y="bill_length_mm", data=df)
sns.boxplot(**bill_length_by_species)
sns.stripplot(**bill_length_by_species)
<AxesSubplot:title={'center':'Boxplot of relation before CCA'}, xlabel='species', ylabel='bill_length_mm'>
# Per-species distribution of the raw bill depth: a box plot with the
# individual observations drawn on top of it.
plt.figure(figsize=(8, 6))
plt.title('Boxplot of relation before CCA')
bill_depth_by_species = dict(x="species", y="bill_depth_mm", data=df)
sns.boxplot(**bill_depth_by_species)
sns.stripplot(**bill_depth_by_species)
<AxesSubplot:title={'center':'Boxplot of relation before CCA'}, xlabel='species', ylabel='bill_depth_mm'>
# Per-species distribution of the first y canonical variate: a box plot with
# the individual observations drawn on top of it.
plt.figure(figsize=(10, 8))
plt.title('Boxplot of Canonical Correlate from Ccy_1 and latent variable(species)')
ccy1_by_species = dict(x="Species", y="Ccy_1", data=cc_res)
sns.boxplot(**ccy1_by_species)
sns.stripplot(**ccy1_by_species)
<AxesSubplot:title={'center':'Boxplot of Canonical Correlate from Ccy_1 and latent variable(species)'}, xlabel='Species', ylabel='Ccy_1'>
# Per-species distribution of the raw flipper length: a box plot with the
# individual observations drawn on top of it.
plt.figure(figsize=(8, 6))
plt.title('Boxplot of relation before CCA')
flipper_length_by_species = dict(x="species", y="flipper_length_mm", data=df)
sns.boxplot(**flipper_length_by_species)
sns.stripplot(**flipper_length_by_species)
<AxesSubplot:title={'center':'Boxplot of relation before CCA'}, xlabel='species', ylabel='flipper_length_mm'>
# Per-species distribution of the raw body mass: a box plot with the
# individual observations drawn on top of it.
plt.figure(figsize=(8, 6))
plt.title('Boxplot of relation before CCA')
body_mass_by_species = dict(x="species", y="body_mass_g", data=df)
sns.boxplot(**body_mass_by_species)
sns.stripplot(**body_mass_by_species)
<AxesSubplot:title={'center':'Boxplot of relation before CCA'}, xlabel='species', ylabel='body_mass_g'>
# Same first-pair scatter as before, now coloured by species, with the
# correlation of the pair (two decimals) in the title.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=cc_res, x="Ccx_1", y="Ccy_1", hue="Species")
pair_1_corr = np.corrcoef(x_c[:, 0], y_c[:, 0])[0, 1]
plt.title('First Pair of Canonical Covariate, corr = %.2f' % pair_1_corr)
Text(0.5, 1.0, 'First Pair of Canonical Covariate, corr = 0.79')
# Encode the categorical columns as integer category codes so they can take
# part in a Pearson correlation.
# NOTE(review): the codes follow category order only; correlations with them
# are a rough indication for nominal variables -- confirm this is intended.
species_codes = df["species"].astype('category').cat.codes
island_codes = df["island"].astype('category').cat.codes
sex_codes = df["sex"].astype('category').cat.codes

# X-side canonical variates next to the encoded categories and the
# standardized bill measurements.
ccX_df = pd.DataFrame(
    {
        "CCX_1": x_c[:, 0],
        "CCX_2": x_c[:, 1],
        "Species": species_codes,
        "Island": island_codes,
        "sex": sex_codes,
        "bill_length": x_mc["bill_length_mm"],
        "bill_depth": x_mc["bill_depth_mm"],
    }
)
# Pairwise Pearson correlations between all of the columns above.
corr_X_df = ccX_df.corr(method='pearson')
corr_X_df.head()
| | CCX_1 | CCX_2 | Species | Island | sex | bill_length | bill_depth |
|---|---|---|---|---|---|---|---|
| CCX_1 | 1.000000e+00 | 7.728804e-18 | 0.935057 | -0.561781 | 0.025383 | 0.828437 | -0.734650 |
| CCX_2 | 7.728804e-18 | 1.000000e+00 | -0.078719 | 0.228933 | 0.576790 | 0.560082 | 0.678447 |
| Species | 9.350575e-01 | -7.871884e-02 | 1.000000 | -0.622428 | 0.010964 | 0.730548 | -0.740346 |
| Island | -5.617810e-01 | 2.289327e-01 | -0.622428 | 1.000000 | -0.012435 | -0.337179 | 0.568031 |
| sex | 2.538332e-02 | 5.767897e-01 | 0.010964 | -0.012435 | 1.000000 | 0.344078 | 0.372673 |
# NOTE(review): nothing is drawn on this figure within this cell, so it stays
# empty -- confirm whether the call is still wanted here.
plt.figure(figsize=(10,8))
# Keep the lower triangle (diagonal included) of the X-side correlation
# matrix; entries above the diagonal become NaN.
# The mask is cast with the builtin `bool`: the `np.bool` alias was deprecated
# in NumPy 1.20 and is no longer available in newer NumPy releases.
X_df_lt = corr_X_df.where(np.tril(np.ones(corr_X_df.shape)).astype(bool))
C:\Users\yoshi\AppData\Local\Temp/ipykernel_2152/3334907121.py:2: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations X_df_lt = corr_X_df.where(np.tril(np.ones(corr_X_df.shape)).astype(np.bool))
<Figure size 720x576 with 0 Axes>
# Annotated heatmap of the lower-triangular X-side correlation matrix; cell
# labels are formatted with one significant digit.
plt.figure(figsize=(10, 8))
sns.heatmap(X_df_lt, annot=True, fmt='.1g', cmap="coolwarm")
<AxesSubplot:>
# Y-side canonical variates (both components) next to the integer-encoded
# categorical columns and the standardized flipper/body-mass measurements
ccY_df = pd.DataFrame({"CCY_1": y_c[:, 0],
                       "CCY_2": y_c[:, 1],
                       "Species": df.species.astype('category').cat.codes,
                       "Island": df.island.astype('category').cat.codes,
                       "sex": df.sex.astype('category').cat.codes,
                       "flipper_length": y_mc.flipper_length_mm,
                       "body_mass": y_mc.body_mass_g})
# compute correlation with Pandas corr()
corr_Y_df = ccY_df.corr(method='pearson')
# Get lower triangular correlation matrix (entries above the diagonal become
# NaN). The mask is cast with the builtin `bool`: the `np.bool` alias was
# deprecated in NumPy 1.20 and is no longer available in newer NumPy releases.
Y_df_lt = corr_Y_df.where(np.tril(np.ones(corr_Y_df.shape)).astype(bool))
# make a lower triangular correlation heatmap with Seaborn
plt.figure(figsize=(10,8))
sns.heatmap(Y_df_lt,cmap="coolwarm",annot=True,fmt='.1g')
C:\Users\yoshi\AppData\Local\Temp/ipykernel_2152/3193900153.py:12: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations Y_df_lt = corr_Y_df.where(np.tril(np.ones(corr_Y_df.shape)).astype(np.bool))
<AxesSubplot:>