%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tslearn.clustering import TimeSeriesKMeans
# Load the Superstore sample export and keep only the three columns used
# below: order date, sub-category and sales amount.
# .copy() makes df an independent frame instead of a slice of the full table,
# so the column assignments further down do not raise SettingWithCopyWarning.
df = pd.read_csv("サンプル - スーパーストア.csv")
df = df[['オーダー日', 'サブカテゴリ', '売上']].copy()
df.head()
オーダー日 | サブカテゴリ | 売上 | |
---|---|---|---|
0 | 2020/11/8 | 本棚 | 16974.0 |
1 | 2021/10/7 | アプライアンス | 52224.0 |
2 | 2019/8/18 | バインダー | 3319.2 |
3 | 2019/11/25 | 椅子 | 16446.0 |
4 | 2019/11/25 | 家具 | 18600.0 |
# Convert the order-date strings to datetime.
# assign() returns a new frame rather than writing into df in place, which
# avoids the SettingWithCopyWarning raised when df is a slice of another frame.
df = df.assign(**{"オーダー日": pd.to_datetime(df["オーダー日"])})
<ipython-input-11-caf72bc31fcf>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["オーダー日"] = pd.to_datetime(df["オーダー日"])
# Keep only the nine sub-categories that are going to be clustered.
target_subcategories = [
    '画材', 'バインダー', '文房具', '保管箱', '事務機器',
    'テーブル', 'アプライアンス', '電話機', '椅子',
]
# .copy() so that adding the month column below writes to an independent
# frame rather than to a filtered view of the previous df.
df = df[df['サブカテゴリ'].isin(target_subcategories)].copy()
# Extract the year-month ("YYYY-MM") of each order
df['month'] = df['オーダー日'].dt.strftime("%Y-%m")
# Total sales per sub-category and month. Only the sales column is summed:
# summing every remaining column would also try to add up the datetime
# order-date column, which pandas >= 2.0 rejects with a TypeError.
# as_index=False keeps the group keys as ordinary columns.
grouped_df = df.groupby(['サブカテゴリ', 'month'], as_index=False)['売上'].sum()
grouped_df.head()
サブカテゴリ | month | 売上 | |
---|---|---|---|
0 | アプライアンス | 2018-01 | 329962.8 |
1 | アプライアンス | 2018-02 | 88375.2 |
2 | アプライアンス | 2018-03 | 700734.0 |
3 | アプライアンス | 2018-04 | 270794.0 |
4 | アプライアンス | 2018-05 | 558646.8 |
# Reshape from long to wide: one row per month, one column per sub-category.
grouped_df['month'] = pd.to_datetime(grouped_df['month'])
# pivot() only accepts keyword arguments since pandas 2.0; the positional
# form pivot('month', ...) is rejected there.
pivoted_df = grouped_df.pivot(index='month', columns='サブカテゴリ', values='売上')
pivoted_df.head()
サブカテゴリ | アプライアンス | テーブル | バインダー | 事務機器 | 保管箱 | 文房具 | 椅子 | 画材 | 電話機 |
---|---|---|---|---|---|---|---|---|---|
month | |||||||||
2018-01-01 | 329962.8 | 19963.2 | 112837.7 | 169352.6 | 51144.0 | 9164.4 | 221972.6 | 8653.6 | 131717.9 |
2018-02-01 | 88375.2 | 96303.2 | 13906.0 | 121094.0 | 120397.2 | 69989.0 | 171601.9 | 12135.6 | 66920.8 |
2018-03-01 | 700734.0 | 290200.8 | 4049.6 | 77816.0 | 70404.0 | 59495.6 | 129646.0 | 45665.6 | 67640.0 |
2018-04-01 | 270794.0 | 40186.8 | 8056.4 | 256098.0 | 80455.7 | 12210.0 | 596357.0 | 9704.6 | 131854.0 |
2018-05-01 | 558646.8 | 462822.6 | 49795.7 | 240189.1 | 171582.4 | 56284.0 | 297615.2 | 71330.4 | 606604.0 |
# Replace the Japanese sub-category column labels with English ones.
english_labels = {
    'アプライアンス': 'appliance',
    'テーブル': 'table',
    'バインダー': 'binder',
    '事務機器': 'equipment',
    '保管箱': 'box',
    '文房具': 'stationery',
    '椅子': 'chair',
    '画材': 'art supplies',
    '電話機': 'telephone',
}
pivoted_df.rename(columns=english_labels, inplace=True)
# Null check: report, per column, whether any month is missing
pivoted_df.isna().any()
サブカテゴリ appliance False table True binder False equipment True box False stationery False chair False art supplies False telephone False dtype: bool
# Fill missing months by carrying the previous month's value forward.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
# NOTE: a gap in the very first row has no earlier value to copy and stays NaN.
pivoted_df.ffill(inplace=True)
# Draw every sub-category's monthly sales as one line on a shared chart.
fig = plt.figure(figsize=(12, 7))
for name, series in pivoted_df.items():
    plt.plot(series.index, series, label=name)
plt.legend()
plt.show()
%time
dba_km = TimeSeriesKMeans(
n_clusters=3,
metric="dtw",
verbose=True,
max_iter_barycenter=100,
random_state=8)
y_pred = dba_km.fit_predict(pivoted_df.T)
print(y_pred)
Wall time: 0 ns 794524198897.621 --> 485552050201.164 --> 485552050201.164 --> [1 0 0 0 0 0 2 0 2] [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 9 out of 9 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 0.0s finished
# Clustering result: one cluster label per column of pivoted_df
print(y_pred)
# ↓ The series were split into 3 clusters
[1 0 0 0 0 0 2 0 2]
# Colour lookup for plotting: cluster 0 -> "r", 1 -> "b", 2 -> "g".
cluster_palette = {0: "r", 1: "b", 2: "g"}
colors = pd.DataFrame(y_pred).replace(cluster_palette)
colors
0 | |
---|---|
0 | b |
1 | r |
2 | r |
3 | r |
4 | r |
5 | r |
6 | g |
7 | r |
8 | g |
# Visualise the result: one line per sub-category, coloured by its cluster
# and labelled "<sub-category>_cluster<label>".
fig = plt.figure(figsize=(12, 7))
for category, cluster, line_color in zip(pivoted_df.columns, y_pred, colors[0]):
    plt.plot(
        pivoted_df.index,
        pivoted_df[category],
        label=category + "_cluster" + str(cluster),
        color=line_color,
    )
plt.legend()
plt.title("DTW")
plt.show()
# Second run for comparison: same cluster count and seed, but no metric
# argument, so TimeSeriesKMeans uses its default distance.
km = TimeSeriesKMeans(
    n_clusters=3,
    verbose=True,
    random_state=8,
)
# One row per sub-category, one column per month
series_by_subcategory = pivoted_df.transpose()
y_pred = km.fit_predict(series_by_subcategory)
1596245654084.401 --> 941992718314.580 --> 941992718314.580 -->
# Cluster labels produced by the km.fit_predict() run above
print(y_pred)
[1 0 0 0 0 0 2 0 1]
# Colour lookup for plotting: cluster 0 -> "r", 1 -> "b", 2 -> "g".
cluster_palette = {0: "r", 1: "b", 2: "g"}
colors = pd.DataFrame(y_pred).replace(cluster_palette)
colors
0 | |
---|---|
0 | b |
1 | r |
2 | r |
3 | r |
4 | r |
5 | r |
6 | g |
7 | r |
8 | b |
# Visualise the result: one line per sub-category, coloured by its cluster
# and labelled "<sub-category>_cluster<label>".
fig = plt.figure(figsize=(12, 7))
for category, cluster, line_color in zip(pivoted_df.columns, y_pred, colors[0]):
    plt.plot(
        pivoted_df.index,
        pivoted_df[category],
        label=category + "_cluster" + str(cluster),
        color=line_color,
    )
plt.legend()
plt.title("Euclid")
plt.show()