Energy Demand Prediction

This notebook aims to predict the aggregate energy demand from a selected list of locations in the City of Helsinki using both the classical Box-Jenkins method and an advanced Deep Learning model.

The data is fetched from the Nuuka open API, courtesy of Avoindata.fi. Visit our Github repository at github.com/quan-possible/energy-demand-prediction.

Authors: Bruce Nguyen and Son Le

Table of Contents

  1. Exploratory Data Analysis
    1. Data Cleaning
    2. Data Visualization
  2. Baseline Model
  3. SARIMAX
    1. Model Identification
    2. Model Estimation
    3. Model Validation
  4. LSTM
    1. Feature Engineering
      1. Holidays Data Generation
      2. Cyclical feature encoding
      3. Feature Vector Normalization
    2. Modeling
  5. Further Readings

Exploratory Data Analysis

Data Cleaning

We start by reading the data and getting familiar with it.

From what we can see, the data contains the electricity consumption from various locations in Helsinki in the time period in question. However, since we are interested only in the aggregate demand by date (without location), the task is to sum up the values from those locations.

In order to do that, we need to get more information about the energy consumption of each location and remove any abnormalities, if need be.

Unfortunately, the view we created reveals several problems with the data. The most obvious one is that the unit is not the same for all data points: in fact, only one place has its electricity measured in $m^{3}$. We can fix this with a simple mask.
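As a rough sketch of that mask, assuming the raw DataFrame is called df and reports each reading's unit in a 'unit' column (both names are assumptions about the Nuuka schema), one option is simply to keep the kWh rows:

```python
# Hypothetical column name 'unit'; adjust to the actual Nuuka schema.
# Keep only the rows reported in kWh, dropping the single m^3 location.
df = df[df["unit"] == "kWh"]
```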

In addition, the number of recorded values is not the same for all locations, meaning that there are missing values for each location. Handling this will take a bit more effort, starting with creating a new DataFrame whose columns are the locations.
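One way to build that wide table, assuming hypothetical column names 'timestamp', 'locationName' and 'value', is a pivot:

```python
# Reshape the long-format data so each location becomes its own column,
# indexed by timestamp (column names are assumptions).
wide = df.pivot_table(index="timestamp", columns="locationName", values="value")
```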

The NaN values are now visible; we only need to decide what to do with them. Our approach is to drop the columns (locations) with more than 100 missing entries, since such columns are no longer significant to our analysis.
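Continuing the sketch above, the dropping step could look like this:

```python
# Drop locations (columns) with more than 100 missing entries.
missing_counts = wide.isna().sum()
wide = wide.loc[:, missing_counts <= 100]
```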

Concerning the rest of the columns, we can simply interpolate the missing values linearly. This method is simple but effective, given that the number of missing points is small.
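With pandas, the interpolation is a one-liner:

```python
# Fill the remaining gaps by linear interpolation along the time index.
wide = wide.interpolate(method="linear")
```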

The only step left is to aggregate all the values into a single 'energy consumption' column. We will also rename the columns for convenience.
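A minimal sketch of the aggregation, using 'consumption' as shorthand for the energy consumption column:

```python
# Sum over all locations into a single consumption series with a clear name.
energy = wide.sum(axis=1).to_frame(name="consumption")
```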

Finally, the data is in our desired shape. We only need to save our work now.

Data Visualization

First, we can take a look at the consumption during each year.

The seasonality within each year is quite predictable: demand peaks in winter and spring, then bottoms out during summer.

Let us zoom in on the plot further and investigate monthly, weekly and daily time periods.

Upon examining the plot, we can see a very visible pattern that repeats itself every 7 days. Therefore, it can be deduced that there is a weekly seasonality within the time series. We can further explore this pattern by looking at the days of the week during the time period.

From the result, we can see that the energy consumption level is at its height during weekdays, then drops significantly during weekends. This very much fits our expectation, since factories and workplaces, which follow such a schedule, are among the largest energy consumers.

In the following section, we will further investigate the time series by decomposing it into three components: trend, seasonality, and the remainder.

If we are looking at an additive decomposition, this can be mathematically formulated as $$ y_t = S_t + T_t + R_t, $$ where $y_t$ is the data, $S_t$ is the seasonal component, $T_t$ is the trend-cycle component, and $R_t$ is the remainder component, all at period $t$. Similarly, a multiplicative decomposition would be written as $$ y_t = S_t \times T_t \times R_t. $$

We first apply the classical decomposition method implemented in the seasonal_decompose function from the statsmodels package. For clarity, we only use one year of data.
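A minimal sketch of that call, assuming the cleaned series lives in energy["consumption"] with a daily DatetimeIndex (the year sliced out here is purely illustrative):

```python
from statsmodels.tsa.seasonal import seasonal_decompose

# Classical additive decomposition with a weekly period on one year of data
# (the year chosen here is illustrative).
one_year = energy["consumption"].loc["2019"]
decomposition = seasonal_decompose(one_year, model="additive", period=7)
decomposition.plot();
```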

Now, we can see the trend and the seasonality clearly. Concerning the residuals, they resemble white noise to some extent, indicating that the model is a reasonable fit. However, they also exhibit some abnormalities with clear patterns, especially in the summer period. This suggests that the residuals erroneously absorb part of the trend component - a "leakage". The problem mostly stems from the decomposition method itself, which is a rather old and simplistic technique.

Fortunately, we can overcome this by replacing it with a more sophisticated decomposition method called STL decomposition. The technique is implemented in the STL class in the same package.
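A sketch of the STL call on the same one-year slice; robust=True is an optional choice that downweights outliers such as holidays:

```python
from statsmodels.tsa.seasonal import STL

# STL decomposition with the same weekly period; robust=True downweights
# outliers such as holidays.
stl_result = STL(one_year, period=7, robust=True).fit()
stl_result.plot();
```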

We can see that the quality of the decomposition is much better. That said, many of the patterns we saw earlier are still very much present. This means we have some detrending and deseasonalizing to do in the next step - modelling.

Baseline Model

In every modeling process, there needs to be a baseline model whose results can be used to assess the more sophisticated models. Such a model should be easy to implement while providing an adequate result. In this case, a simple linear regression model suffices.

Conceptually, the energy consumption is the forecast variable $y$, while the time stamp is the predictor variable $x$ for this specific model. We can mathematically formulate the regression as follows: $$ y_t = \beta_0 + \beta_1 t + \epsilon_t, $$ where $y_t$ is the data and $t$ is the time index.

We start by splitting the data into a train set and a test set. To make the problem more challenging, we intentionally design our test set to overlap with the winter holiday season, which helps us judge more accurately how the models perform when there is a strong presence of consecutive outliers.

In order to view both the in-sample and out-of-sample predictions, we set aside a part of the time series that contains data from both the train set and the test set, called demo. We will plot on the demo set at the end of this section.
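A minimal sketch of the baseline fit, assuming train and test are the splits described above with a 'consumption' column (variable names are assumptions):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Regress consumption on a simple integer time index.
t_train = np.arange(len(train)).reshape(-1, 1)
t_test = np.arange(len(train), len(train) + len(test)).reshape(-1, 1)

baseline = LinearRegression().fit(t_train, train["consumption"].values)
pred = baseline.predict(t_test)

# Out-of-sample root mean squared error in kWh.
rmse = float(np.sqrt(np.mean((test["consumption"].values - pred) ** 2)))
print(f"Baseline RMSE: {rmse:,.0f} kWh")
```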

Now, we know that we need to do much better than a root mean squared error of 101,738 kWh!

SARIMAX

Model Identification

Moving on to predicting the time series using SARIMAX models. Theoretically, they are simply Seasonal ARIMA models combined with exogenous variables, which helps model external effects such as holidays. One shorthand notation for the SARIMA model is $$ \text{SARIMA } (p,d,q) \times (P,D,Q)_S, $$ where $p$ = non-seasonal autoregressive (AR) order, $d$ = non-seasonal differencing, $q$ = non-seasonal moving average (MA) order, $P$ = seasonal AR order, $D$ = seasonal differencing, $Q$ = seasonal MA order, and $S$ = time span of the repeating seasonal pattern. The procedure of identifying, estimating, and validating an ARIMA model for a time series is known as the Box-Jenkins method.

In order to apply the model, we first need to guarantee that the time series is stationary, or can be transformed into a stationary one. Simply put, a stationary time series is one whose properties do not depend on time: its mean, variance, and autocorrelation structure do not change over time.

We can see from the initial plot that the time series is clearly not stationary, with visible seasonality and possibly a trend. To be more precise, however, we can employ a formal stationarity test. Here we use the Augmented Dickey-Fuller test, one of the most commonly used test statistics.
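The test is available in statsmodels; a minimal sketch on the aggregated series:

```python
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test; the null hypothesis is that the series has a
# unit root, i.e. that it is non-stationary.
adf_stat, p_value, *_ = adfuller(energy["consumption"])
print(f"ADF statistic: {adf_stat:.3f}, p-value: {p_value:.3f}")
```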

We can see that the p-value is bigger than $\alpha = 0.05$. Therefore, the time series is unsurprisingly not stationary.

In order to make the data stationary, we need to remove the trend and seasonal components. We can do that by differencing the data to the desired order. Intuitively, this is analogous to taking the derivative of a function.
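With pandas, one non-seasonal difference followed by one weekly seasonal difference can be sketched as:

```python
# One non-seasonal difference followed by one seasonal difference at lag 7,
# corresponding to d = 1 and D = 1 with a weekly season.
differenced = energy["consumption"].diff().diff(7).dropna()
```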

Now, the data looks much more appropriate for SARIMAX modeling. We can check again with the Dickey-Fuller test.

Since the p-value is much smaller than $\alpha = 0.05$, the differenced time series is stationary. It follows that the suitable values for the terms $d$ and $D$ are both 1.

We now proceed to check the correlation structure within the transformed data using Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots. This step is crucial in order to identify the suitable order of the AR and MA terms, both non-seasonal and seasonal.
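The correlograms can be produced directly from statsmodels; a short sketch on the differenced series:

```python
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Correlograms of the differenced series; 40 lags cover several weekly cycles.
plot_acf(differenced, lags=40)
plot_pacf(differenced, lags=40);
```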

From the plots, we find a complex pattern with unexpected increases and decreases in the values of the functions. Therefore, we cannot rule out any particular orders for the non-seasonal AR and MA terms.

With regard to the seasonal terms, the plots show the expected behaviour, with spikes at lags that are multiples of seven, the number of days in a week. Given the tapering autocorrelation function and the sinusoidal shape of the partial autocorrelation function, it can be hypothesized that $P = 1$ and $Q = 2$. Let us check our guesses in the next section, where we apply the model.

Model Estimation

For the exogenous variables, one prime candidate is the dates of public holidays in Finland. We can procure such data using the holidays package.
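A sketch of building a binary holiday indicator with that package, assuming energy has a daily DatetimeIndex:

```python
import pandas as pd
import holidays

# Binary exogenous variable: 1 on Finnish public holidays, 0 otherwise.
fi_holidays = holidays.Finland()
holiday_exog = pd.Series(
    [int(day in fi_holidays) for day in energy.index],
    index=energy.index,
    name="holiday",
)
```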

Now we can finally apply the model. In order to expedite the process, we use grid search, which involves fitting many different parameter combinations and selecting the best one according to a criterion. For ARIMA models, the Akaike Information Criterion (AIC) is a widely used estimator, and we will use it for this task.
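A sketch of the grid search, with $d = D = 1$ and $S = 7$ fixed from the analysis above; the grid bounds, and the names train and holiday_exog, are assumptions:

```python
import itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit a small grid of candidate orders and record each model's AIC.
candidates = []
for p, q, P, Q in itertools.product(range(3), range(3), range(2), range(3)):
    try:
        fit = SARIMAX(
            train["consumption"],
            exog=holiday_exog.loc[train.index],
            order=(p, 1, q),
            seasonal_order=(P, 1, Q, 7),
        ).fit(disp=False)
        candidates.append(((p, 1, q), (P, 1, Q, 7), fit.aic))
    except Exception:
        continue

candidates.sort(key=lambda c: c[2])  # lowest AIC first
```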

We can see that there is not a big variation between the AICs, i.e. most of the models are of similar quality. Let us list the estimated best models.

According to the AIC score, the best model to choose is $$ \text{SARIMA } (1,1,2) \times (1,1,2)_7. $$

Even though this model violates the parsimony principle, the margin is small enough, and we gain some flexibility in return.

Model Validation

To determine the goodness of fit of the model, we can examine its residuals against the standard assumption: the error term $\epsilon_t$ should be white noise drawn from a fixed distribution with a constant mean and variance.

We can check this by looking at various plots showing the distribution of the residuals. In addition, the Ljung-Box test can be used to do this more precisely.
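Both checks are available in statsmodels; a sketch, where best_fit stands for the fitted SARIMAX results object chosen above (the name is an assumption):

```python
from statsmodels.stats.diagnostic import acorr_ljungbox

# Standard residual diagnostic plots plus the Ljung-Box test.
best_fit.plot_diagnostics(figsize=(12, 8))
print(acorr_ljungbox(best_fit.resid, lags=list(range(1, 15))))
```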

From the plots, the residuals appear to be normally distributed around 0 - which is the condition we need - with slightly heavy tails. However, looking at the Ljung-Box statistics, we must reject the null hypothesis that the residuals are independently distributed, since the p-values are smaller than $\alpha = 0.05$ for some lags from 6 onwards.

Nevertheless, let us use this model to predict on the test set and judge it for ourselves.

The result looks very satisfactory! During the first two weeks of the month, the forecast values fit the actual ones well, possibly with the exception of the 6th of December - Finland's Independence Day.

With regard to the winter holiday season, the model unfortunately did not do as well. Contrary to the first two weeks, the only day whose value is predicted more accurately is the 27th, despite the addition of the "holiday" variable. This shows the challenge of forecasting during exceptional time periods. Nevertheless, the model still shows promise in forecasting when the data behaves predictably.

LSTM

Feature Engineering

By preparing the data well in advance, we can simplify the models and improve their performance to a great extent with comparatively little effort. This means coming up with better representations of the data or adding more useful information, i.e. engineering features.

Holidays Data Generation

To get started, the exogenous 'holiday' variable can be reused entirely from the SARIMAX model.

Cyclical feature encoding

On its own, the raw datetime information is not very useful for the learning task. Therefore, we need a way to encode our knowledge of the periodicity of the time series into the data. This can be achieved by modelling time on a circular scale, i.e. by mapping the datetime values through trigonometric functions.
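A sketch of such an encoding for the weekly and yearly cycles, assuming a feature DataFrame named features with a daily DatetimeIndex:

```python
import numpy as np

# Sine/cosine encoding so that, e.g., Sunday and Monday end up close together
# in feature space.
dow = features.index.dayofweek
doy = features.index.dayofyear

features["dow_sin"] = np.sin(2 * np.pi * dow / 7)
features["dow_cos"] = np.cos(2 * np.pi * dow / 7)
features["doy_sin"] = np.sin(2 * np.pi * doy / 365.25)
features["doy_cos"] = np.cos(2 * np.pi * doy / 365.25)
```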

Feature Vector Normalization

Since there are big discrepancies between the scales of the different features, we need to normalize them to a common scale, without distorting the differences in the ranges of values or losing information.
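One possible choice for this is min-max scaling; a sketch with scikit-learn:

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to [0, 1] (MinMaxScaler is one possible choice here).
# Fitting the scaler on the training portion only would avoid test-set leakage.
scaler = MinMaxScaler()
scaled = pd.DataFrame(
    scaler.fit_transform(features),
    index=features.index,
    columns=features.columns,
)
```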

Modeling

Certainly, we need to split the dataset into train, validation, and test sets first. The test set covers the same time period as the one used for the SARIMAX model, so we can compare the performance more easily.

In order for the LSTM to work, we need to create sliding windows from the data, so that each data point's features are a window of past values of a fixed size. In this project, the width of the feature window is 28 days, or 4 weeks, while there is only one label for each window.

A window of data
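A sketch of the windowing step, assuming train_scaled and test_scaled are the scaled train and test splits as 2-D arrays with consumption in column 0:

```python
import numpy as np

# Build sliding windows: the features of each sample are the previous 28 days
# and the label is the next day's consumption.
def make_windows(values, width=28, target_col=0):
    X, y = [], []
    for i in range(width, len(values)):
        X.append(values[i - width:i])
        y.append(values[i, target_col])
    return np.array(X), np.array(y)

X_train, y_train = make_windows(train_scaled)
X_test, y_test = make_windows(test_scaled)
```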

The only remaining auxiliary step is to convert the data into PyTorch tensors.
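For example, continuing the sketch above:

```python
import torch

# Convert the NumPy arrays to float32 tensors for PyTorch.
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(-1)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32).unsqueeze(-1)
```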

We can now finally get to building the neural network! Our model has 2 hidden LSTM layers with 512 neurons each. The input and output layers have 4 and 1 nodes, corresponding to the shapes of the feature and label vectors, respectively. The Adam optimizer is used, and the learning rate is adjusted dynamically by a scheduler.

The architecture is inspired by OmerS. Learning Pytorch LSTM Deep Learning with M5 Data.
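A minimal sketch of such a network in PyTorch; ReduceLROnPlateau is only one possible choice for the scheduler mentioned above:

```python
import torch
from torch import nn

# Two stacked LSTM layers with 512 hidden units each, an input of 4 features
# per time step, and a single output node.
class DemandLSTM(nn.Module):
    def __init__(self, n_features=4, hidden_size=512, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)          # out: (batch, window, hidden_size)
        return self.fc(out[:, -1, :])  # predict from the last time step

model = DemandLSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
criterion = nn.MSELoss()
```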

So far, so good. Now we need to check the error of the model on the test set, with the predictions converted back to the original, unnormalized scale.

This is only a marginal improvement over the SARIMAX model. In fact, considering the far greater complexity of deep neural networks, the LSTM model arguably performs worse than the classical method. This is in agreement with the literature on time series forecasting, where deep learning methods have often failed to deliver on their promises.

Nevertheless, we can further examine how well the model fits the data by visualizing the results, starting with all the available data.

The forecasts appear to fit the observed values well, but worse than those of the SARIMAX, with consistent under-prediction of the peak values. Let us look at the predictions on the test set and compare them with those of the SARIMAX.

Despite a better fit at some time points, the model consistently under-predicts the values by a wide margin. Concerning the holiday period, the forecasts have the same defects as those of the SARIMAX model.

In conclusion, even though the presented models achieved reasonable accuracy, time series forecasting still poses many challenges. That said, this remains a very active research area, with much room for improvement in the future.

Further Readings