From: https://github.com/ksatola
Version: 0.1.0

Model - PM2.5 - Machine Learning Modelling - Regression

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
import sys
sys.path.insert(0, '../src')
In [4]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
In [5]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator
In [6]:
from model import (
    get_pm25_data_for_modelling,
    get_models_for_regression,
    split_df_for_ml_modelling_offset,
    perform_grid_search_cv,
    perform_random_search_cv
)

from measure import (
    get_mean_folds_rmse_for_n_prediction_points,
    score_ml_models,
    prepare_data_for_visualization,
    walk_forward_ml_model_validation
)

from plot import (
   visualize_results
)

from utils import (
    get_datetime_identifier
)

from logger import logger
from datetime import datetime

Machine Learning Regression

XXXXXXXXX

  • Linear Algorithms: Logistic Regression.
  • Nonlinear Algorithms: Classification and Regression Trees (CART), Support Vector Machines (SVM), Gaussian Naive Bayes (NB) and k-Nearest Neighbors (KNN).

Napisac z scikit learn wstep do regresji

wypisac funkcje - zob. ponizej

Hyper based on CV and training data Final check based on test data - TODO

w pracy opisac generalne podejscie do ML, proste, dla najlepszych hyper, i dla nich super learner

XXXX From https://www.degruyter.com/view/journals/sagmb/6/1/article-sagmb.2007.6.1.1309.xml.xml

score models na zbiorze testowym! zrobic final test wybranych modeli ML na zbiorze testowym i policzyc RMSE -> symulacja predykcji jesli linear regression jest najlepsze to sprawdzic OLS assumptions dla zbioru danych


Load hourly data

In [7]:
dfh = get_pm25_data_for_modelling('ml', 'h')
dfh.head()
common.py | 42 | get_pm25_data_for_modelling | 12-Jun-20 22:13:04 | INFO: Dataframe loaded: /Users/ksatola/Documents/git/air-pollution/agh/data/dfpm25_2008-2018_ml_24hours_lags.hdf
common.py | 43 | get_pm25_data_for_modelling | 12-Jun-20 22:13:04 | INFO: Dataframe size: (96378, 19)
Out[7]:
t t-1 t-2 t-3 t-4 t-5 t-6 t-7 t-8 t-9 t-10 month day hour dayofyear weekofyear dayofweek quarter season
0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 81.0 92.0 1 1 11 1 1 1 1 1
1 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 81.0 1 1 12 1 1 1 1 1
2 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 1 1 13 1 1 1 1 1
3 28.0 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 1 1 14 1 1 1 1 1
4 32.5 28.0 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 1 1 15 1 1 1 1 1
In [27]:
df = dfh[:]
df.shape
Out[27]:
(96378, 19)
In [9]:
df.head()
Out[9]:
t t-1 t-2 t-3 t-4 t-5 t-6 t-7 t-8 t-9 t-10 month day hour dayofyear weekofyear dayofweek quarter season
0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 81.0 92.0 1 1 11 1 1 1 1 1
1 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 81.0 1 1 12 1 1 1 1 1
2 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 73.0 1 1 13 1 1 1 1 1
3 28.0 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 60.5 1 1 14 1 1 1 1 1
4 32.5 28.0 28.5 26.0 30.5 41.5 62.5 70.5 69.5 67.0 61.0 1 1 15 1 1 1 1 1

Base Modelling

In [10]:
# Define first past/future cutoff point in time offset (1 year of data)
cut_off_offset = 365*24 # for hourly data
#cut_off_offset = 365 # for daily data
In [11]:
# For this method to work properly, observations in the data frame must be ordered by time (the greater index, the recent data)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)
In [12]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[12]:
((87618, 18), (87618,), (8760, 18), (8760,))
In [13]:
# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []

for model in reg_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)
('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False))
('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))
('DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'))
('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))
('AdaBoostRegressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None))
('BaggingRegressor', BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
('ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=10, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False))
In [14]:
%%time
# Perform initial ranking - choose best performing models for validation
# We assume, models will predict the best t+1, for subsequent ts the error will grow. That is why, for the initial ranking we will use one day prediction only
scores, results, names = score_ml_models(X_train=X_train,
                                         y_train=y_train,
                                         models=models,
                                         n_splits = 5,
                                         metric='neg_root_mean_squared_error',
                                         metric_label="RMSE", 
                                         seed=123)
for score in scores:
    print(score)
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring LinearRegression started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring ElasticNet started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring SVR started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:15 | INFO: Scoring DecisionTreeRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:20 | INFO: Scoring KNeighborsRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:26 | INFO: Scoring AdaBoostRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:47 | INFO: Scoring BaggingRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:32:19 | INFO: Scoring RandomForestRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:32:49 | INFO: Scoring ExtraTreesRegressor started
LinearRegression, RMSE 8.454518896700154, (std. dev. 1.0553480373018875)
ElasticNet, RMSE 8.466827600091218, (std. dev. 1.067885147340825)
SVR, RMSE 11.326321281986267, (std. dev. 3.2873896157758)
DecisionTreeRegressor, RMSE 11.935730744140105, (std. dev. 1.212603561790327)
KNeighborsRegressor, RMSE 10.866634500393763, (std. dev. 1.608640951710884)
AdaBoostRegressor, RMSE 16.822330057803956, (std. dev. 2.3545417407828704)
BaggingRegressor, RMSE 8.826528266269595, (std. dev. 1.1106023504757678)
RandomForestRegressor, RMSE 8.805528004207913, (std. dev. 1.118664501467662)
ExtraTreesRegressor, RMSE 8.818717598661783, (std. dev. 1.1022531965880644)
CPU times: user 19min 39s, sys: 4 s, total: 19min 43s
Wall time: 19min 40s
In [ ]:
LinearRegression, RMSE 8.454518896700154, (std. dev. 1.0553480373018875)
ElasticNet, RMSE 8.466827600091218, (std. dev. 1.067885147340825)
SVR, RMSE 11.326321281986267, (std. dev. 3.2873896157758)
DecisionTreeRegressor, RMSE 11.935730744140105, (std. dev. 1.212603561790327)
KNeighborsRegressor, RMSE 10.866634500393763, (std. dev. 1.608640951710884)
AdaBoostRegressor, RMSE 16.822330057803956, (std. dev. 2.3545417407828704)
BaggingRegressor, RMSE 8.826528266269595, (std. dev. 1.1106023504757678)
RandomForestRegressor, RMSE 8.805528004207913, (std. dev. 1.118664501467662)
ExtraTreesRegressor, RMSE 8.818717598661783, (std. dev. 1.1022531965880644)
CPU times: user 19min 39s, sys: 4 s, total: 19min 43s
Wall time: 19min 40s
In [15]:
# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_h.png')
plt.show();

Hyper-parameter Tuning

Hyper-parameters are parameters that are not directly learnt within estimators. It is possible and recommended to search the hyper-parameter space for the best cross validation score.

While using a grid of parameter settings is currently the most widely used method for parameter optimization (GridSearchCV), other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:

  • A budget can be chosen independent of the number of parameters and possible values.
  • Adding parameters that do not influence the performance does not decrease efficiency.

Specifying how parameters should be sampled is done using a dictionary, very similar to specifying parameters for GridSearchCV. Additionally, a computation budget, being the number of sampled candidates or sampling iterations, is specified using the n_iter parameter. For each parameter, either a distribution over possible values or a list of discrete choices (which will be sampled uniformly) can be specified: loguniform(1, 100) can be used instead of [1, 10, 100].

https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624


Grid Search: hyper-parameters tuning

https://scikit-learn.org/stable/modules/grid_search.html#grid-search

Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.

Some models allow for specialized, efficient parameter search strategies, outlined below. Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, GridSearchCV exhaustively considers all parameter combinations, while RandomizedSearchCV can sample a given number of candidates from a parameter space with a specified distribution.

Note that it is common that a small subset of those parameters can have a large impact on the predictive or computation performance of the model while others can be left to their default values. It is recommended to read the docstring of the estimator class to get a finer understanding of their expected behavior, possibly by reading the enclosed reference to the literature.

While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:

  • A budget can be chosen independent of the number of parameters and possible values.

  • Adding parameters that do not influence the performance does not decrease efficiency.

Cross-validation: evaluating estimator performance

https://scikit-learn.org/stable/modules/cross_validation.html

Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test.

When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.

However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.

A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:

  • A model is trained using of the folds as training data;

  • the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).

The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary validation set), which is a major advantage in problems such as inverse inference where the number of samples is very small.

dac obrazek ze strony, zrobic source: sklearn

Pipelines and composite estimators

https://scikit-learn.org/stable/modules/compose.html

Transformers are usually combined with classifiers, regressors or other estimators to build a composite estimator. The most common tool is a Pipeline.

Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves multiple purposes here:

Convenience and encapsulation You only have to call fit and predict once on your data to fit a whole sequence of estimators.

Joint parameter selection You can grid search over parameters of all estimators in the pipeline at once.

Safety Pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, by ensuring that the same samples are used to train the transformers and predictors.

All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).

Model persistence

https://scikit-learn.org/stable/modules/model_persistence.html

After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle.

pickle (and joblib by extension), has some issues regarding maintainability and security. Because of this,

  • Never unpickle untrusted data as it could lead to malicious code being executed upon loading.

  • While models saved using one version of scikit-learn might load in other versions, this is entirely unsupported and inadvisable. It should also be kept in mind that operations performed on such data could give different and unexpected results.

In order to rebuild a similar model with future versions of scikit-learn, additional metadata should be saved along the pickled model:

  • The training data, e.g. a reference to an immutable snapshot

  • The python source code used to generate the model

  • The versions of scikit-learn and its dependencies

  • The cross validation score obtained on the training data

  • This should make it possible to check that the cross-validation score is in the same range as before.

Since a model internal representation may be different on two different architectures, dumping a model on one architecture and loading it on another architecture is not supported.

more: https://pyvideo.org/pycon-us-2014/pickles-are-for-delis-not-software.html

Based on the best median RMSE metric results, the following algorithms seem to work best with PM2.5 data (cutoff < 8.5 g/m3) and the best non-linear algorithm:

  • LinearRegression
  • ElasticNet
  • RandomForestRegressor

After hyper-parameter tuning, they will be validated using a common procedure.

In [16]:
models[0][0], models[0][1].get_params().keys()
Out[16]:
('LinearRegression',
 dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize']))
In [17]:
%%time
# LinearRegression
param_grid = {
    "fit_intercept": [True, False],
    "normalize": [True, False],
    "n_jobs": [-1]
}
model = models[0][1]
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
In [18]:
%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 22:36:53 | INFO: Best: -8.417609654057864 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}
CPU times: user 5.01 s, sys: 1.09 s, total: 6.1 s
Wall time: 925 ms
In [ ]:
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 22:36:53 | INFO: Best: -8.417609654057864 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}
CPU times: user 5.01 s, sys: 1.09 s, total: 6.1 s
Wall time: 925 ms
In [19]:
models[1][0], models[1][1].get_params().keys()
Out[19]:
('ElasticNet',
 dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start']))
In [20]:
%%time
# ElasticNet
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 500, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]
CPU times: user 17 µs, sys: 3 µs, total: 20 µs
Wall time: 24.8 µs
In [21]:
%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 23:03:26 | INFO: Best: -8.420425074217084 using {'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}
CPU times: user 1h 40min 6s, sys: 13min 37s, total: 1h 53min 44s
Wall time: 23min 35s
In [ ]:
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 23:03:26 | INFO: Best: -8.420425074217084 using {'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}
CPU times: user 1h 40min 6s, sys: 13min 37s, total: 1h 53min 44s
Wall time: 23min 35s
In [22]:
models[7][0], models[7][1].get_params().keys()
Out[22]:
('RandomForestRegressor',
 dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start']))
In [30]:
%%time
# RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset 
# and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples 
# parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
    "criterion": ['mse', 'mae'],
    #"min_samples_split": [2, 8, 16],
    #"min_samples_leaf": [1, 2, 4],
    "n_estimators": [100, 200],
    "n_jobs": [-1]
}
model = models[7][1]
CPU times: user 4 µs, sys: 7 µs, total: 11 µs
Wall time: 16.9 µs
In [28]:
# Limit training data to 2 last years of hourly data (for performance reasons)
X_train_limited = X_train[-365*24*2:]
y_train_limited = y_train[-365*24*2:]

X_train_limited.shape, y_train_limited.shape
Out[28]:
((17520, 18), (17520,))
In [ ]:
%%time
# Random search, to speed-up
perform_random_search_cv(X_train=X_train_limited, 
                        y_train=y_train_limited, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)

For entire dataset, random grid search was running for more than 6 hours - no results. Stoped and limited training data set to 2 years of observations (17520, comparing to 87618). X_train_limited, y_train_limited 23:30 started with full training dataset 6:15 interrupted manually - no visible progress 6:23 started with limited 10:41 interrupted manually - no visible progress (limited and limited parameters grid) -> I will use standard settings 10:42 started with limited historical data and limited grid params 11:15 decision: do not tune RandomForestRegressor hyper-parameters, use it with default settings

Evaluate best models on test dataset

In [32]:
best_models = [
    LinearRegression({'fit_intercept': True, 'n_jobs': -1, 'normalize': True}),
    ElasticNet(**{'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}),
    RandomForestRegressor()]

models = []

for model in best_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)
('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.9,
           max_iter=500, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
In [ ]:
('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.9,
           max_iter=500, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
In [33]:
# number of t-n columns in a dataset
n_lags_in_dataset = 10 # for hourly data
#n_lags_in_dataset = 4 # for daily data

# Set datetime format for index
dt_format = "%Y-%m-%d %H:%M:%S" # for hourly data
#dt_format = "%Y-%m-%d" # for daily data

# Create train and validate sets
train_test_split_position = int(len(df)-cut_off_offset)

# Create as many folds as remains till the end of known data
n_folds = len(df) #train_test_split_position+3

# Predict for X points
n_pred_points = 24 # for hourly data
#n_pred_points = 7 # for daily data
In [34]:
%%time
# Validate result on test
# Creates 365*24*24 models for hourly data, or 365*7 models for hourly data
  
# Model 1: LinearRegression
model_name = models[0][0]
model = models[0][1]

# Model 2: ElasticNet
#model_name = models[1][0]
#model = models[1][1]

# Model 3: RandomForestRegressor
#model_name = models[7][0]
#model = models[7][1]


fold_results = walk_forward_ml_model_validation(data=df,
                                                 model=model,
                                                 target_col='t',
                                                 cut_off_offset=cut_off_offset,
                                                 n_pred_points=n_pred_points,
                                                 n_folds=-1,
                                                 n_lags = n_lags_in_dataset)
print(len(fold_results))
print(fold_results[0])
validation.py | 88 | walk_forward_ml_model_validation | 13-Jun-20 11:17:11 | INFO: ('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False)) model validation started
Started fold 000000/008760 - 2020-06-13_11-17-11
Started fold 000100/008760 - 2020-06-13_11-18-48
Started fold 000200/008760 - 2020-06-13_11-21-33
Started fold 000300/008760 - 2020-06-13_11-23-21
Started fold 000400/008760 - 2020-06-13_11-25-09
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<timed exec> in <module>

~/Documents/git/air-pollution/src/measure/validation.py in walk_forward_ml_model_validation(data, model, target_col, cut_off_offset, n_pred_points, n_folds, n_lags)
    174             # Calculate t-n column values for the last row and replace them in the history table
    175             for i in range(1, n_lags + 1):
--> 176                 history[-1:]['t-' + str(i)] = df_temp['t'].shift(i)
    177 
    178             # print(f'after: history.tail(2) -> {history.tail(2)}')

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3485         else:
   3486             # set column
-> 3487             self._set_item(key, value)
   3488 
   3489     def _setitem_slice(self, key, value):

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   3569         # value exception to occur first
   3570         if len(self):
-> 3571             self._check_setitem_copy()
   3572 
   3573     def insert(self, loc, column, value, allow_duplicates=False):

KeyboardInterrupt: 

Serialize output data

In [ ]:
from joblib import dump, load

model_name = models[0][0]

timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")

path = f'results/pm25_ml_{model_name}_results_h_{timestamp}.joblib'

dump(fold_results, path) 
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])

Calculate and visualize results

In [ ]:
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
In [ ]:
print(res)
In [ ]:
# Show forecasts for n-th point in the future
show_n_points_of_forecasts = [1, 12, 24] # for hourly data
#show_n_points_of_forecasts = [1, 3, 7] # for daily data

# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]

# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]

# File names for plots (format png will be used, do not add .png extension)
base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
#base_file_path = f'images/pm25_obs_vs_pred_365_d_ml_{model_name}' # for daily data
In [ ]:
fold_results[0].index
In [ ]:
# We need to convert to datetime index for plotting

# Replace integer-based index with datetime-based index

# We remove n_lags_in_dataset, b/c when the dataset with t-n lags was created
# rows conatining NaNs were removed
df_datetime = get_pm25_data_for_modelling('ts', 'd')[n_lags_in_dataset:]
#df_datetime.head()
#df_datetime[0:1].index.strftime(dt_format)[0]

# Integer to datetime index using a mapper (ts version of the dataset)
for i in range(0, len(fold_results)):
    if not isinstance(fold_results[i].index, pd.DatetimeIndex):
        # Create a table of datetime indexes corresponding to integer indexes
        datetime_indexes = []
        integer_indexes = fold_results[i].index
        for ix in integer_indexes:
            datetime_indexes.append(df_datetime[ix:ix+1].index.strftime(dt_format)[0])
        print(datetime_indexes)
        
        # Apply datetime_indexes as index for fold data
        fold_results[i].index = pd.to_datetime(datetime_indexes)
In [ ]:
fold_results[0].index
In [ ]:
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
                   start_end_dates=start_end_dates,
                   plot_types=plot_types,
                   base_file_path=base_file_path,
                   fold_results=fold_results, 
                   n_pred_points=n_pred_points, 
                   cut_off_offset=cut_off_offset, 
                   model_name=model_name,
                timestamp=timestamp)
In [ ]:
 

Additional resources

In [ ]:
%%time
#LinearRegression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
# fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]
In [ ]:
%%time
#ElasticNet
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
# Linear regression with combined L1 and L2 priors as regularizer.
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]
In [ ]:
# Excluded
%%time
#DecisionTreeRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
# Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
param_grid = {
    "criterion": ["mse", "friedman_mse", "mae"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 16, 2),
    "min_samples_split": range(2, 16, 2),
    "min_samples_leaf": range(2, 16, 2),
    "max_features": ["auto", "sqrt", "log2"],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10, 100, 1000]
}
In [ ]:
%%time
#BaggingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://www.programcreek.com/python/example/85938/sklearn.ensemble.BaggingRegressor
# A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.
# The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.
param_grid = {
    "base_estimator": [models[0][1]],
    "n_estimators": [100, 200, 500, 1000, 5000, 10000],
    "bootstrap": [True, False],
    "n_jobs": [-1]
}
In [ ]:
%%time
#RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
    "n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
    #"criterion": ["mse", "mae"],
    #"bootstrap": [True, False],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
    "max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
    #"min_samples_split": range(2, 16, 2),
    "n_jobs": [-1]
}
model = models[7][1]
In [ ]:
%%time
#ExtraTreesRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
# https://www.programcreek.com/python/example/102434/sklearn.ensemble.ExtraTreesRegressor
# This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
param_grid = {
    "n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
    #"criterion": ["mse", "mae"],
    #"bootstrap": [True, False],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
    "max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
    #"max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
    #"min_samples_split": range(2, 16, 2),
    "n_jobs": [-1]
}
model = models[8][1]
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

Load daily data

In [35]:
dfd = get_pm25_data_for_modelling('ml', 'd')
dfd.head()
common.py | 42 | get_pm25_data_for_modelling | 13-Jun-20 11:25:44 | INFO: Dataframe loaded: /Users/ksatola/Documents/git/air-pollution/agh/data/dfpm25_2008-2018_ml_7days_lags.hdf
common.py | 43 | get_pm25_data_for_modelling | 13-Jun-20 11:25:44 | INFO: Dataframe size: (4015, 13)
Out[35]:
t t-1 t-2 t-3 t-4 month day hour dayofyear weekofyear dayofweek quarter season
0 57.312500 42.979167 46.104167 30.958333 53.586957 1 5 0 5 1 5 1 1
1 36.062500 57.312500 42.979167 46.104167 30.958333 1 6 0 6 1 6 1 1
2 46.083333 36.062500 57.312500 42.979167 46.104167 1 7 0 7 2 0 1 1
3 45.041667 46.083333 36.062500 57.312500 42.979167 1 8 0 8 2 1 1 1
4 101.375000 45.041667 46.083333 36.062500 57.312500 1 9 0 9 2 2 1 1

Modelling

In [36]:
df = dfd.copy()
df.head()
Out[36]:
t t-1 t-2 t-3 t-4 month day hour dayofyear weekofyear dayofweek quarter season
0 57.312500 42.979167 46.104167 30.958333 53.586957 1 5 0 5 1 5 1 1
1 36.062500 57.312500 42.979167 46.104167 30.958333 1 6 0 6 1 6 1 1
2 46.083333 36.062500 57.312500 42.979167 46.104167 1 7 0 7 2 0 1 1
3 45.041667 46.083333 36.062500 57.312500 42.979167 1 8 0 8 2 1 1 1
4 101.375000 45.041667 46.083333 36.062500 57.312500 1 9 0 9 2 2 1 1
In [37]:
# Define first past/future cutoff point in time offset (1 year of data)
#cut_off_offset = 365*24 # for hourly data
cut_off_offset = 365 # for daily data
In [38]:
# For this method to work properly, observations in the data frame must be ordered by time (the greater index, the recent data)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)
In [39]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[39]:
((3650, 12), (3650,), (365, 12), (365,))
In [40]:
# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []

for model in reg_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)
('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False))
('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))
('DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'))
('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))
('AdaBoostRegressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None))
('BaggingRegressor', BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
('ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=10, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False))
In [41]:
%%time
# Perform initial ranking - choose best performing models for validation
# We assume, models will predict the best t+1, for subsequent ts the error will grow. That is why, for the initial ranking we will use one day prediction only
scores, results, names = score_ml_models(X_train=X_train,
                                         y_train=y_train,
                                         models=models,
                                         n_splits = 5,
                                         metric='neg_root_mean_squared_error',
                                         metric_label="RMSE", 
                                         seed=123)
for score in scores:
    print(score)
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring LinearRegression started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring ElasticNet started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring SVR started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring DecisionTreeRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring KNeighborsRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring AdaBoostRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:39 | INFO: Scoring BaggingRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:39 | INFO: Scoring RandomForestRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:40 | INFO: Scoring ExtraTreesRegressor started
LinearRegression, RMSE 21.71519880662688, (std. dev. 2.9273799166199206)
ElasticNet, RMSE 21.830784202119624, (std. dev. 2.9573842361855087)
SVR, RMSE 24.045745977510276, (std. dev. 4.441156440786244)
DecisionTreeRegressor, RMSE 31.147068453565886, (std. dev. 2.809053790566411)
KNeighborsRegressor, RMSE 23.707382569743043, (std. dev. 3.3448167988995414)
AdaBoostRegressor, RMSE 27.586419082358997, (std. dev. 3.418986721465014)
BaggingRegressor, RMSE 23.431554189892402, (std. dev. 3.5302630679381473)
RandomForestRegressor, RMSE 23.217160647978343, (std. dev. 3.3641116644360025)
ExtraTreesRegressor, RMSE 24.202871671021512, (std. dev. 3.2927586658114762)
CPU times: user 4 s, sys: 120 ms, total: 4.12 s
Wall time: 3.71 s
In [ ]:
LinearRegression, RMSE 21.71519880662688, (std. dev. 2.9273799166199206)
ElasticNet, RMSE 21.830784202119624, (std. dev. 2.9573842361855087)
SVR, RMSE 24.045745977510276, (std. dev. 4.441156440786244)
DecisionTreeRegressor, RMSE 31.147068453565886, (std. dev. 2.809053790566411)
KNeighborsRegressor, RMSE 23.707382569743043, (std. dev. 3.3448167988995414)
AdaBoostRegressor, RMSE 27.586419082358997, (std. dev. 3.418986721465014)
BaggingRegressor, RMSE 23.431554189892402, (std. dev. 3.5302630679381473)
RandomForestRegressor, RMSE 23.217160647978343, (std. dev. 3.3641116644360025)
ExtraTreesRegressor, RMSE 24.202871671021512, (std. dev. 3.2927586658114762)
CPU times: user 4 s, sys: 120 ms, total: 4.12 s
Wall time: 3.71 s
In [42]:
# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_d.png')
plt.show();

Hyper-parameter Tuning

Based on the best median RMSE metric results, the following algorithms seem to work best with PM2.5 data (cutoff < 22 g/m3) and the best non-linear algorithm:

  • LinearRegression
  • ElasticNet
  • RandomForestRegressor

After hyper-parameter tuning, they will be validated using a common procedure.

In [43]:
models[0][0], models[0][1].get_params().keys()
Out[43]:
('LinearRegression',
 dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize']))
In [44]:
%%time
# LinearRegression
param_grid = {
    "fit_intercept": [True, False],
    "normalize": [True, False],
    "n_jobs": [-1]
}
model = models[0][1]
CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 6.91 µs
In [45]:
%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:34:01 | INFO: Best: -21.68400804843603 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': False}
CPU times: user 215 ms, sys: 23.2 ms, total: 238 ms
Wall time: 83.3 ms
In [ ]:
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:34:01 | INFO: Best: -21.68400804843603 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': False}
CPU times: user 215 ms, sys: 23.2 ms, total: 238 ms
Wall time: 83.3 ms
In [46]:
models[1][0], models[1][1].get_params().keys()
Out[46]:
('ElasticNet',
 dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start']))
In [47]:
%%time
# ElasticNet
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 500, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]
CPU times: user 15 µs, sys: 3 µs, total: 18 µs
Wall time: 21 µs
In [48]:
%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:40:34 | INFO: Best: -21.688890846091354 using {'alpha': 0.0001, 'fit_intercept': True, 'l1_ratio': 0.4, 'max_iter': 10, 'selection': 'random'}
CPU times: user 3min 8s, sys: 20.4 s, total: 3min 28s
Wall time: 52.8 s
In [ ]:
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:40:34 | INFO: Best: -21.688890846091354 using {'alpha': 0.0001, 'fit_intercept': True, 'l1_ratio': 0.4, 'max_iter': 10, 'selection': 'random'}
CPU times: user 3min 8s, sys: 20.4 s, total: 3min 28s
Wall time: 52.8 s
In [49]:
models[7][0], models[7][1].get_params().keys()
Out[49]:
('RandomForestRegressor',
 dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start']))
In [53]:
%%time
#RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset 
# and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples 
# parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
    "criterion": ['mse', 'mae'],
    "min_samples_split": [2, 8, 16],
    "min_samples_leaf": [1, 2, 4],
    "n_estimators": [100, 200],
    "n_jobs": [-1]
}
model = models[7][1]
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs
In [54]:
%%time
# Random, to speed-up
#RandomForestRegressor - Random Search
perform_random_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 14.2min finished
regression.py | 107 | perform_random_search_cv | 13-Jun-20 11:58:16 | INFO: Best: -21.81259741714167 using {'n_jobs': -1, 'n_estimators': 200, 'min_samples_split': 16, 'min_samples_leaf': 4, 'criterion': 'mae'}
CPU times: user 2min 20s, sys: 342 ms, total: 2min 20s
Wall time: 14min 27s
In [ ]:
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 14.2min finished
regression.py | 107 | perform_random_search_cv | 13-Jun-20 11:58:16 | INFO: Best: -21.81259741714167 using {'n_jobs': -1, 'n_estimators': 200, 'min_samples_split': 16, 'min_samples_leaf': 4, 'criterion': 'mae'}
CPU times: user 2min 20s, sys: 342 ms, total: 2min 20s
Wall time: 14min 27s

Evaluate best models on test dataset

In [55]:
best_models = [
    LinearRegression({'fit_intercept': True, 'n_jobs': -1, 'normalize': False}),
    ElasticNet(**{'alpha': 0.0001, 'fit_intercept': True, 'l1_ratio': 0.4, 'max_iter': 10, 'selection': 'random'}),
    RandomForestRegressor(**{'n_jobs': -1, 'n_estimators': 200, 'min_samples_split': 16, 'min_samples_leaf': 4, 'criterion': 'mae'}),
]

models = []

for model in best_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)
('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': False},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.4,
           max_iter=10, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=16, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
In [ ]:
('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': False},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.4,
           max_iter=10, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=16, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
In [56]:
# number of t-n columns in a dataset
#n_lags_in_dataset = 10 # for hourly data
n_lags_in_dataset = 4 # for daily data

# Set datetime format for index
#dt_format = "%Y-%m-%d %H:%M:%S" # for hourly data
dt_format = "%Y-%m-%d" # for daily data

# Create train and validate sets
train_test_split_position = int(len(df)-cut_off_offset)

# Create as many folds as remains till the end of known data
n_folds = len(df) #train_test_split_position+3

# Predict for X points
#n_pred_points = 24 # for hourly data
n_pred_points = 7 # for daily data
In [77]:
%%time
# Validate result on test
# Creates 365*24*24 models for hourly data, or 365*7 models for hourly data
  
# Model 1: LinearRegression
#model_name = models[0][0]
#model = models[0][1]

# Model 2: ElasticNet
#model_name = models[1][0]
#model = models[1][1]

# Model 3: RandomForestRegressor
model_name = models[2][0]
model = models[2][1]

fold_results = walk_forward_ml_model_validation(data=df,
                                                 model=model,
                                                 target_col='t',
                                                 cut_off_offset=cut_off_offset,
                                                 n_pred_points=n_pred_points,
                                                 n_folds=-1,
                                                 n_lags = n_lags_in_dataset)
print(len(fold_results))
print(fold_results[0])
validation.py | 88 | walk_forward_ml_model_validation | 13-Jun-20 12:13:52 | INFO: ('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=16, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)) model validation started
Started fold 000000/000365 - 2020-06-13_12-13-52
Started fold 000010/000365 - 2020-06-13_12-32-38
Started fold 000020/000365 - 2020-06-13_12-51-02
Started fold 000030/000365 - 2020-06-13_13-10-25
Started fold 000040/000365 - 2020-06-13_13-29-00
Started fold 000050/000365 - 2020-06-13_13-47-26
Started fold 000060/000365 - 2020-06-13_14-05-20
Started fold 000070/000365 - 2020-06-13_14-23-52
Started fold 000080/000365 - 2020-06-13_14-42-27
Started fold 000090/000365 - 2020-06-13_15-01-26
Started fold 000100/000365 - 2020-06-13_15-20-27
Started fold 000110/000365 - 2020-06-13_15-42-08
Started fold 000120/000365 - 2020-06-13_16-04-21
Started fold 000130/000365 - 2020-06-13_16-26-12
Started fold 000140/000365 - 2020-06-13_16-50-06
Started fold 000150/000365 - 2020-06-13_17-13-50
Started fold 000160/000365 - 2020-06-13_17-37-07
Started fold 000170/000365 - 2020-06-13_18-00-26
Started fold 000180/000365 - 2020-06-13_18-24-28
Started fold 000190/000365 - 2020-06-13_18-48-49
Started fold 000200/000365 - 2020-06-13_19-12-33
Started fold 000210/000365 - 2020-06-13_19-36-20
Started fold 000220/000365 - 2020-06-13_20-00-22
Started fold 000230/000365 - 2020-06-13_20-24-16
Started fold 000240/000365 - 2020-06-13_20-48-14
Started fold 000250/000365 - 2020-06-13_21-11-32
Started fold 000260/000365 - 2020-06-13_21-36-11
Started fold 000270/000365 - 2020-06-13_21-59-56
Started fold 000280/000365 - 2020-06-13_22-21-27
Started fold 000290/000365 - 2020-06-13_22-45-13
Started fold 000300/000365 - 2020-06-13_23-09-11
Started fold 000310/000365 - 2020-06-13_23-32-40
Started fold 000320/000365 - 2020-06-13_23-56-09
Started fold 000330/000365 - 2020-06-14_00-19-26
Started fold 000340/000365 - 2020-06-14_00-40-22
Started fold 000350/000365 - 2020-06-14_00-59-43
358
       observed  predicted      error  abs_error
3650  67.991848  75.416027   7.424179   7.424179
3651  16.026950  74.369728  58.342778  58.342778
3652  14.590020  26.050605  11.460584  11.460584
3653  22.094854  25.651497   3.556643   3.556643
3654  62.504217  32.889676  29.614540  29.614540
3655  43.929804  57.558177  13.628373  13.628373
3656  22.088192  61.956002  39.867810  39.867810
CPU times: user 3d 21h 17min 45s, sys: 13min 46s, total: 3d 21h 31min 32s
Wall time: 13h 1min 19s
In [ ]:
# Model 3: RandomForestRegressor
validation.py | 88 | walk_forward_ml_model_validation | 13-Jun-20 12:13:52 | INFO: ('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=16, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)) model validation started
Started fold 000000/000365 - 2020-06-13_12-13-52
Started fold 000010/000365 - 2020-06-13_12-32-38
Started fold 000020/000365 - 2020-06-13_12-51-02
Started fold 000030/000365 - 2020-06-13_13-10-25
Started fold 000040/000365 - 2020-06-13_13-29-00
Started fold 000050/000365 - 2020-06-13_13-47-26
Started fold 000060/000365 - 2020-06-13_14-05-20
Started fold 000070/000365 - 2020-06-13_14-23-52
Started fold 000080/000365 - 2020-06-13_14-42-27
Started fold 000090/000365 - 2020-06-13_15-01-26
Started fold 000100/000365 - 2020-06-13_15-20-27
Started fold 000110/000365 - 2020-06-13_15-42-08
Started fold 000120/000365 - 2020-06-13_16-04-21
Started fold 000130/000365 - 2020-06-13_16-26-12
Started fold 000140/000365 - 2020-06-13_16-50-06
Started fold 000150/000365 - 2020-06-13_17-13-50
Started fold 000160/000365 - 2020-06-13_17-37-07
Started fold 000170/000365 - 2020-06-13_18-00-26
Started fold 000180/000365 - 2020-06-13_18-24-28
Started fold 000190/000365 - 2020-06-13_18-48-49
Started fold 000200/000365 - 2020-06-13_19-12-33
Started fold 000210/000365 - 2020-06-13_19-36-20
Started fold 000220/000365 - 2020-06-13_20-00-22
Started fold 000230/000365 - 2020-06-13_20-24-16
Started fold 000240/000365 - 2020-06-13_20-48-14
Started fold 000250/000365 - 2020-06-13_21-11-32
Started fold 000260/000365 - 2020-06-13_21-36-11
Started fold 000270/000365 - 2020-06-13_21-59-56
Started fold 000280/000365 - 2020-06-13_22-21-27
Started fold 000290/000365 - 2020-06-13_22-45-13
Started fold 000300/000365 - 2020-06-13_23-09-11
Started fold 000310/000365 - 2020-06-13_23-32-40
Started fold 000320/000365 - 2020-06-13_23-56-09
Started fold 000330/000365 - 2020-06-14_00-19-26
Started fold 000340/000365 - 2020-06-14_00-40-22
Started fold 000350/000365 - 2020-06-14_00-59-43
358
       observed  predicted      error  abs_error
3650  67.991848  75.416027   7.424179   7.424179
3651  16.026950  74.369728  58.342778  58.342778
3652  14.590020  26.050605  11.460584  11.460584
3653  22.094854  25.651497   3.556643   3.556643
3654  62.504217  32.889676  29.614540  29.614540
3655  43.929804  57.558177  13.628373  13.628373
3656  22.088192  61.956002  39.867810  39.867810
CPU times: user 3d 21h 17min 45s, sys: 13min 46s, total: 3d 21h 31min 32s
Wall time: 13h 1min 19s

# Model 2: ElasticNet
358
       observed  predicted      error  abs_error
3650  67.991848  52.050065  15.941783  15.941783
3651  16.026950  60.887953  44.861003  44.861003
3652  14.590020  21.886631   7.296611   7.296611
3653  22.094854  24.978848   2.883993   2.883993
3654  62.504217  29.182693  33.321524  33.321524
3655  43.929804  56.137409  12.207605  12.207605
3656  22.088192  42.861250  20.773058  20.773058
CPU times: user 1min 3s, sys: 6.68 s, total: 1min 9s
Wall time: 17.6 s


# Model 1: LinearRegression
358
       observed  predicted      error  abs_error
3650  67.991848  56.841447  11.150400  11.150400
3651  16.026950  61.386935  45.359985  45.359985
3652  14.590020  23.431795   8.841774   8.841774
3653  22.094854  31.352299   9.257445   9.257445
3654  62.504217  33.263056  29.241160  29.241160
3655  43.929804  59.087193  15.157389  15.157389
3656  22.088192  43.298562  21.210370  21.210370
CPU times: user 44.7 s, sys: 4.41 s, total: 49.2 s
Wall time: 18.1 s

Serialize output data

In [78]:
from joblib import dump, load

timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")

path = f'results/pm25_ml_{model_name}_results_d_{timestamp}.joblib'

dump(fold_results, path) 
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
358
       observed  predicted      error  abs_error
3650  67.991848  75.416027   7.424179   7.424179
3651  16.026950  74.369728  58.342778  58.342778
3652  14.590020  26.050605  11.460584  11.460584
3653  22.094854  25.651497   3.556643   3.556643
3654  62.504217  32.889676  29.614540  29.614540
3655  43.929804  57.558177  13.628373  13.628373
3656  22.088192  61.956002  39.867810  39.867810

Calculate and visualize results

In [79]:
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
CPU times: user 1.93 s, sys: 6.25 ms, total: 1.93 s
Wall time: 1.94 s
Out[79]:
[9.130201994301995,
 9.262425925925927,
 9.13762849002849,
 9.134805698005698,
 9.326830769230769,
 9.281815099715098,
 9.317902849002849]
In [80]:
print(res)
[9.130201994301995, 9.262425925925927, 9.13762849002849, 9.134805698005698, 9.326830769230769, 9.281815099715098, 9.317902849002849]
In [ ]:
# Model 3: RandomForestRegressor
[9.130201994301995, 9.262425925925927, 9.13762849002849, 9.134805698005698, 9.326830769230769, 9.281815099715098, 9.317902849002849]

# Model 2: ElasticNet
[9.90203247863248, 10.054816239316239, 9.830852706552706, 9.85534358974359, 10.01102792022792, 9.93521396011396, 10.049378347578347]

# Model 1: LinearRegression
[9.462926210826211, 9.488959544159545, 9.400442735042732, 9.458610541310541, 9.551721652421653, 9.50755527065527, 9.564808262108263]
In [81]:
# Show forecasts for n-th point in the future
#show_n_points_of_forecasts = [1, 12, 24] # for hourly data
show_n_points_of_forecasts = [1, 3, 7] # for daily data

# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]

# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]

# File names for plots (format png will be used, do not add .png extension)
#base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
base_file_path = f'images/pm25_obs_vs_pred_365_d_ml_{model_name}' # for daily data
In [82]:
fold_results[0].index
Out[82]:
Int64Index([3650, 3651, 3652, 3653, 3654, 3655, 3656], dtype='int64')
In [83]:
# We need to convert to datetime index for plotting

# Replace integer-based index with datetime-based index

# We remove n_lags_in_dataset, b/c when the dataset with t-n lags was created
# rows conatining NaNs were removed
df_datetime = get_pm25_data_for_modelling('ts', 'd')[n_lags_in_dataset:]
#df_datetime.head()
#df_datetime[0:1].index.strftime(dt_format)[0]

# Integer to datetime index using a mapper (ts version of the dataset)
for i in range(0, len(fold_results)):
    if not isinstance(fold_results[i].index, pd.DatetimeIndex):
        # Create a table of datetime indexes corresponding to integer indexes
        datetime_indexes = []
        integer_indexes = fold_results[i].index
        for ix in integer_indexes:
            datetime_indexes.append(df_datetime[ix:ix+1].index.strftime(dt_format)[0])
        #print(datetime_indexes)
        
        # Apply datetime_indexes as index for fold data
        fold_results[i].index = pd.to_datetime(datetime_indexes)
common.py | 42 | get_pm25_data_for_modelling | 14-Jun-20 09:05:28 | INFO: Dataframe loaded: /Users/ksatola/Documents/git/air-pollution/agh/data/dfpm25_2008-2018_daily.hdf
common.py | 43 | get_pm25_data_for_modelling | 14-Jun-20 09:05:28 | INFO: Dataframe size: (4019, 1)
In [84]:
fold_results[0].index
Out[84]:
DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
               '2018-01-06', '2018-01-07', '2018-01-08'],
              dtype='datetime64[ns]', freq=None)
In [85]:
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
                   start_end_dates=start_end_dates,
                   plot_types=plot_types,
                   base_file_path=base_file_path,
                   fold_results=fold_results, 
                   n_pred_points=n_pred_points, 
                   cut_off_offset=cut_off_offset, 
                   model_name=model_name,
                timestamp=timestamp)


results.py | 92 | visualize_results | 14-Jun-20 09:05:34 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_01_lag-01_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:35 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_01_lag-03_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:36 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_01_lag-07_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:37 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_02_lag-01_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:38 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_02_lag-03_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:39 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_02_lag-07_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:40 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_03_lag-01_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:41 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_03_lag-03_2020-06-14_08-30-46.png


results.py | 92 | visualize_results | 14-Jun-20 09:05:42 | INFO: images/pm25_obs_vs_pred_365_d_ml_RandomForestRegressor_03_lag-07_2020-06-14_08-30-46.png

Model persistence

In [ ]:
from joblib import dump, load
dump(model, 'model.joblib') 
model2 = load('model.joblib')
model2.predict(X[0:20])
In [ ]:
model[0]
In [ ]:
from joblib import dump, load

timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")

for model in models:
    path = f'results/pm25_ml_{model[0]}_model_d_{timestamp}.joblib'
    dump(model, path)
In [ ]:
%%time
path = f'results/pm25_ml_RandomForestRegressor_model_d_2020-06-12_18-06-21.joblib'
model2 = load(path)
print(model2[0], model2[1])
model2_fitted = model2[1].fit(X_train, y_train)
print(model2_fitted.predict(X_test[0:1]))
print(y_test[0:1])
In [ ]:
RandomForestRegressor RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
[74.06882094]
3650    67.991848
Name: t, dtype: float64
CPU times: user 46.5 s, sys: 191 ms, total: 46.7 s
Wall time: 7.12 s
In [ ]: