%matplotlib inline


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tslearn.clustering import TimeSeriesKMeans


# Load the Superstore sample and keep only the three columns used below:
# order date (オーダー日), sub-category (サブカテゴリ) and sales (売上).
# .copy() makes `df` an independent frame, so the column assignments in the
# following cells do not trigger SettingWithCopyWarning.
df = pd.read_csv("サンプル - スーパーストア.csv")
df = df[['オーダー日', 'サブカテゴリ', '売上']].copy()


df.head()


# Convert the order-date column to datetime.
# assign() builds a new DataFrame rather than writing into `df`, so this does
# not emit SettingWithCopyWarning even when `df` is a slice of another frame.
df = df.assign(**{"オーダー日": pd.to_datetime(df["オーダー日"])})

<ipython-input-11-caf72bc31fcf>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["オーダー日"] = pd.to_datetime(df["オーダー日"])


# Keep only the nine sub-categories that are clustered below.
# A boolean mask replaces the query string (which relied on a backslash
# continuation inside the string literal), and .copy() detaches the result so
# that adding columns to it later does not raise SettingWithCopyWarning.
TARGET_SUBCATEGORIES = [
    '画材', 'バインダー', '文房具', '保管箱', '事務機器',
    'テーブル', 'アプライアンス', '電話機', '椅子',
]
df = df[df['サブカテゴリ'].isin(TARGET_SUBCATEGORIES)].copy()


# Derive a "YYYY-MM" month key from the order date.
# assign() returns a new frame, so no SettingWithCopyWarning is raised even if
# `df` is a filtered slice.
df = df.assign(month=df['オーダー日'].dt.strftime("%Y-%m"))
# Total sales per (sub-category, month).
# Only the sales column is aggregated: summing the whole frame would also try
# to sum the datetime order-date column, which raises TypeError on pandas >= 2.0.
# as_index=False keeps the group keys as ordinary columns.
grouped_df = df.groupby(['サブカテゴリ', 'month'], as_index=False)['売上'].sum()
grouped_df.head()


# Reshape from long to wide: one row per month, one column per sub-category.
# The "YYYY-MM" keys are parsed to timestamps so the index is a real time axis.
grouped_df['month'] = pd.to_datetime(grouped_df['month'], format="%Y-%m")
# pivot() arguments are keyword-only since pandas 2.0, so name them explicitly.
pivoted_df = grouped_df.pivot(index='month', columns='サブカテゴリ', values='売上')
pivoted_df.head()


# English labels for the sub-category columns; these names appear in the
# plot legends further down.
column_labels = {
    'アプライアンス': 'appliance',
    'テーブル': 'table',
    'バインダー': 'binder',
    '事務機器': 'equipment',
    '保管箱': 'box',
    '文房具': 'stationery',
    '椅子': 'chair',
    '画材': 'art supplies',
    '電話機': 'telephone',
}
pivoted_df = pivoted_df.rename(columns=column_labels)


# Null check: for each column, True means it has at least one missing month.
pivoted_df.isna().any()

サブカテゴリ
appliance       False
table            True
binder          False
equipment        True
box             False
stationery      False
chair           False
art supplies    False
telephone       False
dtype: bool


# Fill missing months by carrying the previous month's value forward.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since
# pandas 2.1.
# NOTE(review): forward-fill cannot fill a gap in the very first row, and a
# missing month presumably means "no sales" — confirm whether fillna(0) would
# be the more faithful choice.
pivoted_df.ffill(inplace=True)


# Draw the monthly sales of every sub-category as one line each on a shared
# time axis.
fig, ax = plt.subplots(figsize=(12, 7))
for category, sales in pivoted_df.items():
    ax.plot(pivoted_df.index, sales, label=category)
ax.legend()
plt.show()


%time
dba_km = TimeSeriesKMeans(
            n_clusters=3,
            metric="dtw",
            verbose=True,
            max_iter_barycenter=100,
            random_state=8)
y_pred = dba_km.fit_predict(pivoted_df.T)
print(y_pred)

Wall time: 0 ns
794524198897.621 --> 485552050201.164 --> 485552050201.164 --> 
[1 0 0 0 0 0 2 0 2]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.0s finished


# Clustering result: one cluster label per sub-category column
print(y_pred)
# ↓ The series were split into 3 clusters

[1 0 0 0 0 0 2 0 2]


# Color lookup for plotting: translate each cluster label (0/1/2) into a
# matplotlib color code, keeping the result as a one-column DataFrame.
colors = pd.DataFrame(y_pred).replace({0: "r", 1: "b", 2: "g"})
colors


# Visualize the result: one line per sub-category, colored by its cluster and
# labeled "<category>_cluster<label>".
fig, ax = plt.subplots(figsize=(12, 7))
for category, cluster, color in zip(pivoted_df.columns, y_pred, colors[0]):
    ax.plot(
        pivoted_df.index,
        pivoted_df[category],
        label=f"{category}_cluster{cluster}",
        color=color,
    )
ax.legend()
ax.set_title("DTW")
plt.show()


# Same clustering with the Euclidean metric (the TimeSeriesKMeans default,
# spelled out here) for comparison with the DTW result.
km = TimeSeriesKMeans(
    n_clusters=3,
    metric="euclidean",
    verbose=True,
    random_state=8,
)
y_pred = km.fit_predict(pivoted_df.T)

1596245654084.401 --> 941992718314.580 --> 941992718314.580 -->


# Cluster labels from the Euclidean k-means, one per sub-category column
print(y_pred)

[1 0 0 0 0 0 2 0 1]


# Color lookup for plotting: translate each cluster label (0/1/2) into a
# matplotlib color code, keeping the result as a one-column DataFrame.
colors = pd.DataFrame(y_pred).replace({0: "r", 1: "b", 2: "g"})
colors


# Visualize the result: one line per sub-category, colored by its cluster and
# labeled "<category>_cluster<label>".
fig, ax = plt.subplots(figsize=(12, 7))
for category, cluster, color in zip(pivoted_df.columns, y_pred, colors[0]):
    ax.plot(
        pivoted_df.index,
        pivoted_df[category],
        label=f"{category}_cluster{cluster}",
        color=color,
    )
ax.legend()
ax.set_title("Euclid")
plt.show()

	オーダー日	サブカテゴリ	売上
0	2020/11/8	本棚	16974.0
1	2021/10/7	アプライアンス	52224.0
2	2019/8/18	バインダー	3319.2
3	2019/11/25	椅子	16446.0
4	2019/11/25	家具	18600.0

	サブカテゴリ	month	売上
0	アプライアンス	2018-01	329962.8
1	アプライアンス	2018-02	88375.2
2	アプライアンス	2018-03	700734.0
3	アプライアンス	2018-04	270794.0
4	アプライアンス	2018-05	558646.8

サブカテゴリ	アプライアンス	テーブル	バインダー	事務機器	保管箱	文房具	椅子	画材	電話機
month
2018-01-01	329962.8	19963.2	112837.7	169352.6	51144.0	9164.4	221972.6	8653.6	131717.9
2018-02-01	88375.2	96303.2	13906.0	121094.0	120397.2	69989.0	171601.9	12135.6	66920.8
2018-03-01	700734.0	290200.8	4049.6	77816.0	70404.0	59495.6	129646.0	45665.6	67640.0
2018-04-01	270794.0	40186.8	8056.4	256098.0	80455.7	12210.0	596357.0	9704.6	131854.0
2018-05-01	558646.8	462822.6	49795.7	240189.1	171582.4	56284.0	297615.2	71330.4	606604.0

tslearn.clusteringを用いた時系列データのクラスタリング¶

tslearn.clustering：Derivative DTWのライブラリ¶

tslearn.clustering.TimeSeriesKMeansのパラメータ¶

DTWのk-meansで上記推移をクラスタリング¶

Euclidのk-meansで上記推移をクラスタリング¶