From: https://github.com/ksatola
Version: 0.1.0

Model - PM2.5 - Machine Learning Modelling - Regression¶

Contents¶

Machine Learning Regression
Hourly prediction
Daily prediction

%load_ext autoreload

%autoreload 2

import sys
sys.path.insert(0, '../src')

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator

from model import (
    get_pm25_data_for_modelling,
    get_models_for_regression,
    split_df_for_ml_modelling_offset,
    perform_grid_search_cv,
    perform_random_search_cv
)

from measure import (
    get_mean_folds_rmse_for_n_prediction_points,
    score_ml_models,
    prepare_data_for_visualization,
    walk_forward_ml_model_validation
)

from plot import (
   visualize_results
)

from utils import (
    get_datetime_identifier
)

from logger import logger
from datetime import datetime

Machine Learning Regression¶

XXXXXXXXX

Linear Algorithms: Logistic Regression.
Nonlinear Algorithms: Classification and Regression Trees (CART), Support Vector Machines (SVM), Gaussian Naive Bayes (NB) and k-Nearest Neighbors (KNN).

Napisac z scikit learn wstep do regresji

wypisac funkcje - zob. ponizej

Hyper based on CV and training data Final check based on test data - TODO

w pracy opisac generalne podejscie do ML, proste, dla najlepszych hyper, i dla nich super learner

XXXX From https://www.degruyter.com/view/journals/sagmb/6/1/article-sagmb.2007.6.1.1309.xml.xml

score models na zbiorze testowym! zrobic final test wybranych modeli ML na zbiorze testowym i policzyc RMSE -> symulacja predykcji jesli linear regression jest najlepsze to sprawdzic OLS assumptions dla zbioru danych

Load hourly data¶

dfh = get_pm25_data_for_modelling('ml', 'h')
dfh.head()

common.py | 42 | get_pm25_data_for_modelling | 12-Jun-20 22:13:04 | INFO: Dataframe loaded: /Users/ksatola/Documents/git/air-pollution/agh/data/dfpm25_2008-2018_ml_24hours_lags.hdf
common.py | 43 | get_pm25_data_for_modelling | 12-Jun-20 22:13:04 | INFO: Dataframe size: (96378, 19)

df = dfh[:]
df.shape

(96378, 19)

df.head()

Base Modelling¶

# Define first past/future cutoff point in time offset (1 year of data)
cut_off_offset = 365*24 # for hourly data
#cut_off_offset = 365 # for daily data

# For this method to work properly, observations in the data frame must be ordered by time (the greater index, the recent data)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((87618, 18), (87618,), (8760, 18), (8760,))

# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []

for model in reg_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)

('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False))
('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))
('DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'))
('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))
('AdaBoostRegressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None))
('BaggingRegressor', BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
('ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=10, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False))

%%time
# Perform initial ranking - choose best performing models for validation
# We assume, models will predict the best t+1, for subsequent ts the error will grow. That is why, for the initial ranking we will use one day prediction only
scores, results, names = score_ml_models(X_train=X_train,
                                         y_train=y_train,
                                         models=models,
                                         n_splits = 5,
                                         metric='neg_root_mean_squared_error',
                                         metric_label="RMSE", 
                                         seed=123)
for score in scores:
    print(score)

ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring LinearRegression started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring ElasticNet started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:13:26 | INFO: Scoring SVR started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:15 | INFO: Scoring DecisionTreeRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:20 | INFO: Scoring KNeighborsRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:26 | INFO: Scoring AdaBoostRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:31:47 | INFO: Scoring BaggingRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:32:19 | INFO: Scoring RandomForestRegressor started
ml_metrics.py | 36 | score_ml_models | 12-Jun-20 22:32:49 | INFO: Scoring ExtraTreesRegressor started

LinearRegression, RMSE 8.454518896700154, (std. dev. 1.0553480373018875)
ElasticNet, RMSE 8.466827600091218, (std. dev. 1.067885147340825)
SVR, RMSE 11.326321281986267, (std. dev. 3.2873896157758)
DecisionTreeRegressor, RMSE 11.935730744140105, (std. dev. 1.212603561790327)
KNeighborsRegressor, RMSE 10.866634500393763, (std. dev. 1.608640951710884)
AdaBoostRegressor, RMSE 16.822330057803956, (std. dev. 2.3545417407828704)
BaggingRegressor, RMSE 8.826528266269595, (std. dev. 1.1106023504757678)
RandomForestRegressor, RMSE 8.805528004207913, (std. dev. 1.118664501467662)
ExtraTreesRegressor, RMSE 8.818717598661783, (std. dev. 1.1022531965880644)
CPU times: user 19min 39s, sys: 4 s, total: 19min 43s
Wall time: 19min 40s

LinearRegression, RMSE 8.454518896700154, (std. dev. 1.0553480373018875)
ElasticNet, RMSE 8.466827600091218, (std. dev. 1.067885147340825)
SVR, RMSE 11.326321281986267, (std. dev. 3.2873896157758)
DecisionTreeRegressor, RMSE 11.935730744140105, (std. dev. 1.212603561790327)
KNeighborsRegressor, RMSE 10.866634500393763, (std. dev. 1.608640951710884)
AdaBoostRegressor, RMSE 16.822330057803956, (std. dev. 2.3545417407828704)
BaggingRegressor, RMSE 8.826528266269595, (std. dev. 1.1106023504757678)
RandomForestRegressor, RMSE 8.805528004207913, (std. dev. 1.118664501467662)
ExtraTreesRegressor, RMSE 8.818717598661783, (std. dev. 1.1022531965880644)
CPU times: user 19min 39s, sys: 4 s, total: 19min 43s
Wall time: 19min 40s

# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_h.png')
plt.show();

Hyper-parameter Tuning¶

Hyper-parameters are parameters that are not directly learnt within estimators. It is possible and recommended to search the hyper-parameter space for the best cross validation score.

While using a grid of parameter settings is currently the most widely used method for parameter optimization (GridSearchCV), other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:

A budget can be chosen independent of the number of parameters and possible values.
Adding parameters that do not influence the performance does not decrease efficiency.

Specifying how parameters should be sampled is done using a dictionary, very similar to specifying parameters for GridSearchCV. Additionally, a computation budget, being the number of sampled candidates or sampling iterations, is specified using the n_iter parameter. For each parameter, either a distribution over possible values or a list of discrete choices (which will be sampled uniformly) can be specified: loguniform(1, 100) can be used instead of [1, 10, 100].

https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624

Grid Search: hyper-parameters tuning¶

https://scikit-learn.org/stable/modules/grid_search.html#grid-search

Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.

Some models allow for specialized, efficient parameter search strategies, outlined below. Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, GridSearchCV exhaustively considers all parameter combinations, while RandomizedSearchCV can sample a given number of candidates from a parameter space with a specified distribution.

Note that it is common that a small subset of those parameters can have a large impact on the predictive or computation performance of the model while others can be left to their default values. It is recommended to read the docstring of the estimator class to get a finer understanding of their expected behavior, possibly by reading the enclosed reference to the literature.

While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:

A budget can be chosen independent of the number of parameters and possible values.
Adding parameters that do not influence the performance does not decrease efficiency.

Cross-validation: evaluating estimator performance¶

https://scikit-learn.org/stable/modules/cross_validation.html

Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test.

When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.

However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.

A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:

A model is trained using of the folds as training data;
the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).

The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary validation set), which is a major advantage in problems such as inverse inference where the number of samples is very small.

dac obrazek ze strony, zrobic source: sklearn

Pipelines and composite estimators¶

https://scikit-learn.org/stable/modules/compose.html

Transformers are usually combined with classifiers, regressors or other estimators to build a composite estimator. The most common tool is a Pipeline.

Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves multiple purposes here:

Convenience and encapsulation You only have to call fit and predict once on your data to fit a whole sequence of estimators.

Joint parameter selection You can grid search over parameters of all estimators in the pipeline at once.

Safety Pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, by ensuring that the same samples are used to train the transformers and predictors.

All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).

Model persistence¶

https://scikit-learn.org/stable/modules/model_persistence.html

After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle.

pickle (and joblib by extension), has some issues regarding maintainability and security. Because of this,

Never unpickle untrusted data as it could lead to malicious code being executed upon loading.
While models saved using one version of scikit-learn might load in other versions, this is entirely unsupported and inadvisable. It should also be kept in mind that operations performed on such data could give different and unexpected results.

In order to rebuild a similar model with future versions of scikit-learn, additional metadata should be saved along the pickled model:

The training data, e.g. a reference to an immutable snapshot
The python source code used to generate the model
The versions of scikit-learn and its dependencies
The cross validation score obtained on the training data
This should make it possible to check that the cross-validation score is in the same range as before.

Since a model internal representation may be different on two different architectures, dumping a model on one architecture and loading it on another architecture is not supported.

more: https://pyvideo.org/pycon-us-2014/pickles-are-for-delis-not-software.html

Based on the best median RMSE metric results, the following algorithms seem to work best with PM2.5 data (cutoff < 8.5 g/m3) and the best non-linear algorithm:

LinearRegression
ElasticNet
RandomForestRegressor

After hyper-parameter tuning, they will be validated using a common procedure.

models[0][0], models[0][1].get_params().keys()

('LinearRegression',
 dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize']))

%%time
# LinearRegression
param_grid = {
    "fit_intercept": [True, False],
    "normalize": [True, False],
    "n_jobs": [-1]
}
model = models[0][1]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs

%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)

regression.py | 70 | perform_grid_search_cv | 12-Jun-20 22:36:53 | INFO: Best: -8.417609654057864 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}

CPU times: user 5.01 s, sys: 1.09 s, total: 6.1 s
Wall time: 925 ms

regression.py | 70 | perform_grid_search_cv | 12-Jun-20 22:36:53 | INFO: Best: -8.417609654057864 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}
CPU times: user 5.01 s, sys: 1.09 s, total: 6.1 s
Wall time: 925 ms

models[1][0], models[1][1].get_params().keys()

('ElasticNet',
 dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start']))

%%time
# ElasticNet
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 500, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]

CPU times: user 17 µs, sys: 3 µs, total: 20 µs
Wall time: 24.8 µs

%%time
perform_grid_search_cv(X_train=X_train, 
                        y_train=y_train, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)

regression.py | 70 | perform_grid_search_cv | 12-Jun-20 23:03:26 | INFO: Best: -8.420425074217084 using {'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}

CPU times: user 1h 40min 6s, sys: 13min 37s, total: 1h 53min 44s
Wall time: 23min 35s

regression.py | 70 | perform_grid_search_cv | 12-Jun-20 23:03:26 | INFO: Best: -8.420425074217084 using {'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}
CPU times: user 1h 40min 6s, sys: 13min 37s, total: 1h 53min 44s
Wall time: 23min 35s

models[7][0], models[7][1].get_params().keys()

('RandomForestRegressor',
 dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start']))

%%time
# RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset 
# and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples 
# parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
    "criterion": ['mse', 'mae'],
    #"min_samples_split": [2, 8, 16],
    #"min_samples_leaf": [1, 2, 4],
    "n_estimators": [100, 200],
    "n_jobs": [-1]
}
model = models[7][1]

CPU times: user 4 µs, sys: 7 µs, total: 11 µs
Wall time: 16.9 µs

# Limit training data to 2 last years of hourly data (for performance reasons)
X_train_limited = X_train[-365*24*2:]
y_train_limited = y_train[-365*24*2:]

X_train_limited.shape, y_train_limited.shape

((17520, 18), (17520,))

%%time
# Random search, to speed-up
perform_random_search_cv(X_train=X_train_limited, 
                        y_train=y_train_limited, 
                        model=model, 
                        param_grid=param_grid, 
                        scoring='neg_root_mean_squared_error', 
                        num_folds=6, 
                        seed=123)

For entire dataset, random grid search was running for more than 6 hours - no results. Stoped and limited training data set to 2 years of observations (17520, comparing to 87618). X_train_limited, y_train_limited 23:30 started with full training dataset 6:15 interrupted manually - no visible progress 6:23 started with limited 10:41 interrupted manually - no visible progress (limited and limited parameters grid) -> I will use standard settings 10:42 started with limited historical data and limited grid params 11:15 decision: do not tune RandomForestRegressor hyper-parameters, use it with default settings

Evaluate best models on test dataset¶

best_models = [
    LinearRegression({'fit_intercept': True, 'n_jobs': -1, 'normalize': True}),
    ElasticNet(**{'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}),
    RandomForestRegressor()]

models = []

for model in best_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)

('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.9,
           max_iter=500, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))

('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.9,
           max_iter=500, normalize=False, positive=False, precompute=False,
           random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))

# number of t-n columns in a dataset
n_lags_in_dataset = 10 # for hourly data
#n_lags_in_dataset = 4 # for daily data

# Set datetime format for index
dt_format = "%Y-%m-%d %H:%M:%S" # for hourly data
#dt_format = "%Y-%m-%d" # for daily data

# Create train and validate sets
train_test_split_position = int(len(df)-cut_off_offset)

# Create as many folds as remains till the end of known data
n_folds = len(df) #train_test_split_position+3

# Predict for X points
n_pred_points = 24 # for hourly data
#n_pred_points = 7 # for daily data

%%time
# Validate result on test
# Creates 365*24*24 models for hourly data, or 365*7 models for hourly data
  
# Model 1: LinearRegression
model_name = models[0][0]
model = models[0][1]

# Model 2: ElasticNet
#model_name = models[1][0]
#model = models[1][1]

# Model 3: RandomForestRegressor
#model_name = models[7][0]
#model = models[7][1]


fold_results = walk_forward_ml_model_validation(data=df,
                                                 model=model,
                                                 target_col='t',
                                                 cut_off_offset=cut_off_offset,
                                                 n_pred_points=n_pred_points,
                                                 n_folds=-1,
                                                 n_lags = n_lags_in_dataset)
print(len(fold_results))
print(fold_results[0])

validation.py | 88 | walk_forward_ml_model_validation | 13-Jun-20 11:17:11 | INFO: ('LinearRegression', LinearRegression(copy_X=True,
                 fit_intercept={'fit_intercept': True, 'n_jobs': -1,
                                'normalize': True},
                 n_jobs=None, normalize=False)) model validation started

Started fold 000000/008760 - 2020-06-13_11-17-11
Started fold 000100/008760 - 2020-06-13_11-18-48
Started fold 000200/008760 - 2020-06-13_11-21-33
Started fold 000300/008760 - 2020-06-13_11-23-21
Started fold 000400/008760 - 2020-06-13_11-25-09

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<timed exec> in <module>

~/Documents/git/air-pollution/src/measure/validation.py in walk_forward_ml_model_validation(data, model, target_col, cut_off_offset, n_pred_points, n_folds, n_lags)
    174             # Calculate t-n column values for the last row and replace them in the history table
    175             for i in range(1, n_lags + 1):
--> 176                 history[-1:]['t-' + str(i)] = df_temp['t'].shift(i)
    177 
    178             # print(f'after: history.tail(2) -> {history.tail(2)}')

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3485         else:
   3486             # set column
-> 3487             self._set_item(key, value)
   3488 
   3489     def _setitem_slice(self, key, value):

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   3569         # value exception to occur first
   3570         if len(self):
-> 3571             self._check_setitem_copy()
   3572 
   3573     def insert(self, loc, column, value, allow_duplicates=False):

KeyboardInterrupt:

Serialize output data¶

from joblib import dump, load

model_name = models[0][0]

timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")

path = f'results/pm25_ml_{model_name}_results_h_{timestamp}.joblib'

dump(fold_results, path) 
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])

Calculate and visualize results¶

%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res

print(res)

# Show forecasts for n-th point in the future
show_n_points_of_forecasts = [1, 12, 24] # for hourly data
#show_n_points_of_forecasts = [1, 3, 7] # for daily data

# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]

# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]

# File names for plots (format png will be used, do not add .png extension)
base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
#base_file_path = f'images/pm25_obs_vs_pred_365_d_ml_{model_name}' # for daily data

fold_results[0].index

# We need to convert to datetime index for plotting

# Replace integer-based index with datetime-based index

# We remove n_lags_in_dataset, b/c when the dataset with t-n lags was created
# rows conatining NaNs were removed
df_datetime = get_pm25_data_for_modelling('ts', 'd')[n_lags_in_dataset:]
#df_datetime.head()
#df_datetime[0:1].index.strftime(dt_format)[0]

# Integer to datetime index using a mapper (ts version of the dataset)
for i in range(0, len(fold_results)):
    if not isinstance(fold_results[i].index, pd.DatetimeIndex):
        # Create a table of datetime indexes corresponding to integer indexes
        datetime_indexes = []
        integer_indexes = fold_results[i].index
        for ix in integer_indexes:
            datetime_indexes.append(df_datetime[ix:ix+1].index.strftime(dt_format)[0])
        print(datetime_indexes)
        
        # Apply datetime_indexes as index for fold data
        fold_results[i].index = pd.to_datetime(datetime_indexes)

fold_results[0].index

visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
                   start_end_dates=start_end_dates,
                   plot_types=plot_types,
                   base_file_path=base_file_path,
                   fold_results=fold_results, 
                   n_pred_points=n_pred_points, 
                   cut_off_offset=cut_off_offset, 
                   model_name=model_name,
                timestamp=timestamp)

Additional resources¶

%%time
#LinearRegression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
# fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]

%%time
#ElasticNet
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
# Linear regression with combined L1 and L2 priors as regularizer.
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
    "max_iter": [1, 10, 100, 1000],
    "fit_intercept": [True, False],
    "selection": ["cyclic", "random"]
}
model = models[1][1]

# Excluded
%%time
#DecisionTreeRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
# Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
param_grid = {
    "criterion": ["mse", "friedman_mse", "mae"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 16, 2),
    "min_samples_split": range(2, 16, 2),
    "min_samples_leaf": range(2, 16, 2),
    "max_features": ["auto", "sqrt", "log2"],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10, 100, 1000]
}

%%time
#BaggingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://www.programcreek.com/python/example/85938/sklearn.ensemble.BaggingRegressor
# A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.
# The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.
param_grid = {
    "base_estimator": [models[0][1]],
    "n_estimators": [100, 200, 500, 1000, 5000, 10000],
    "bootstrap": [True, False],
    "n_jobs": [-1]
}

%%time
#RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
    "n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
    #"criterion": ["mse", "mae"],
    #"bootstrap": [True, False],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
    "max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
    #"min_samples_split": range(2, 16, 2),
    "n_jobs": [-1]
}
model = models[7][1]

%%time
#ExtraTreesRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
# https://www.programcreek.com/python/example/102434/sklearn.ensemble.ExtraTreesRegressor
# This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
param_grid = {
    "n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
    #"criterion": ["mse", "mae"],
    #"bootstrap": [True, False],
    "ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
    "max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
    #"max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
    #"min_samples_split": range(2, 16, 2),
    "n_jobs": [-1]
}
model = models[8][1]

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

Load daily data¶

dfd = get_pm25_data_for_modelling('ml', 'd')
dfd.head()

common.py | 42 | get_pm25_data_for_modelling | 13-Jun-20 11:25:44 | INFO: Dataframe loaded: /Users/ksatola/Documents/git/air-pollution/agh/data/dfpm25_2008-2018_ml_7days_lags.hdf
common.py | 43 | get_pm25_data_for_modelling | 13-Jun-20 11:25:44 | INFO: Dataframe size: (4015, 13)

Modelling¶

df = dfd.copy()
df.head()

# Define first past/future cutoff point in time offset (1 year of data)
#cut_off_offset = 365*24 # for hourly data
cut_off_offset = 365 # for daily data

# For this method to work properly, observations in the data frame must be ordered by time (the greater index, the recent data)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3650, 12), (3650,), (365, 12), (365,))

# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []

for model in reg_models:
    item = (type(model).__name__, model)
    models.append(item)
    print(item)

('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False))
('ElasticNet', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False))
('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))
('DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'))
('KNeighborsRegressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))
('AdaBoostRegressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None))
('BaggingRegressor', BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))
('ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=10, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False))

%%time
# Perform initial ranking - choose best performing models for validation
# We assume, models will predict the best t+1, for subsequent ts the error will grow. That is why, for the initial ranking we will use one day prediction only
scores, results, names = score_ml_models(X_train=X_train,
                                         y_train=y_train,
                                         models=models,
                                         n_splits = 5,
                                         metric='neg_root_mean_squared_error',
                                         metric_label="RMSE", 
                                         seed=123)
for score in scores:
    print(score)

ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring LinearRegression started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring ElasticNet started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:37 | INFO: Scoring SVR started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring DecisionTreeRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring KNeighborsRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:38 | INFO: Scoring AdaBoostRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:39 | INFO: Scoring BaggingRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:39 | INFO: Scoring RandomForestRegressor started
ml_metrics.py | 36 | score_ml_models | 13-Jun-20 11:32:40 | INFO: Scoring ExtraTreesRegressor started

LinearRegression, RMSE 21.71519880662688, (std. dev. 2.9273799166199206)
ElasticNet, RMSE 21.830784202119624, (std. dev. 2.9573842361855087)
SVR, RMSE 24.045745977510276, (std. dev. 4.441156440786244)
DecisionTreeRegressor, RMSE 31.147068453565886, (std. dev. 2.809053790566411)
KNeighborsRegressor, RMSE 23.707382569743043, (std. dev. 3.3448167988995414)
AdaBoostRegressor, RMSE 27.586419082358997, (std. dev. 3.418986721465014)
BaggingRegressor, RMSE 23.431554189892402, (std. dev. 3.5302630679381473)
RandomForestRegressor, RMSE 23.217160647978343, (std. dev. 3.3641116644360025)
ExtraTreesRegressor, RMSE 24.202871671021512, (std. dev. 3.2927586658114762)
CPU times: user 4 s, sys: 120 ms, total: 4.12 s
Wall time: 3.71 s

LinearRegression, RMSE 21.71519880662688, (std. dev. 2.9273799166199206)
ElasticNet, RMSE 21.830784202119624, (std. dev. 2.9573842361855087)
SVR, RMSE 24.045745977510276, (std. dev. 4.441156440786244)
DecisionTreeRegressor, RMSE 31.147068453565886, (std. dev. 2.809053790566411)
KNeighborsRegressor, RMSE 23.707382569743043, (std. dev. 3.3448167988995414)
AdaBoostRegressor, RMSE 27.586419082358997, (std. dev. 3.418986721465014)
BaggingRegressor, RMSE 23.431554189892402, (std. dev. 3.5302630679381473)
RandomForestRegressor, RMSE 23.217160647978343, (std. dev. 3.3641116644360025)
ExtraTreesRegressor, RMSE 24.202871671021512, (std. dev. 3.2927586658114762)
CPU times: user 4 s, sys: 120 ms, total: 4.12 s
Wall time: 3.71 s

# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_d.png')
plt.show();