import os
import sys
sys.path.append("../../../" )
import json
import pickle
import fasttreeshap
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import RepeatedKFold, cross_val_score
% reload_ext autoreload
% autoreload 2
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Load Training Data
# Country code for this run.
COUNTRY_CODE = "ph"

# The rollout date is taken from the name of the parent of the current working
# directory: its first three dash-separated tokens, re-joined with "-"
# (presumably YYYY-MM-DD -- TODO confirm against the directory naming scheme).
# os.path is used rather than splitting on "/" so the lookup does not depend
# on the platform's path separator.
_parent_dir_name = os.path.basename(os.path.dirname(os.getcwd()))
ROLLOUT_DATE = "-".join(_parent_dir_name.split("-")[:3])

# CSV file contains all data
# Metadata JSON file lists the feature columns and label column
# Both files are looked up relative to the current working directory.
data = pd.read_csv(f"{ROLLOUT_DATE}-training-data.csv")
with open(f"{ROLLOUT_DATE}-training-data-columns.json", "r", encoding="utf-8") as file:
    column_metadata = json.load(file)

# Select model inputs and target using the column names from the metadata.
features = data[column_metadata["features"]]
labels = data[column_metadata["label"]]
Cross-Validation
# Set parameters
CV_K_FOLDS = 5  # passed to RepeatedKFold as n_splits
CV_NUM_REPEATS = 5  # passed to RepeatedKFold as n_repeats
RANDOM_SEED = 42  # passed as random_state

print(f"Performing {CV_K_FOLDS}-fold CV...")
cv = RepeatedKFold(
    n_splits=CV_K_FOLDS,
    n_repeats=CV_NUM_REPEATS,
    random_state=RANDOM_SEED,
)
# NOTE: split() returns a generator, so this prints only the generator's repr,
# not the train/test indices themselves.
print(cv.split(features))
Performing 5-fold CV...
<generator object _RepeatedSplits.split at 0x7ff647383970>
from sklearn.ensemble import RandomForestRegressor

# Score a random forest on every split produced by `cv`; the scores are
# reported below as R^2.
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, verbose=0)
R_cv = cross_val_score(model, features.values, labels.values.ravel(), cv=cv)

# Summarise the per-split scores, rounded to two decimals for reporting.
cv_mean = round(np.array(R_cv).mean(), 2)
cv_std = round(np.array(R_cv).std(), 2)

print("Cross validation scores are: ", R_cv)
print(f"Cross validation R^2 mean: {cv_mean}")
print(f"Cross validation R^2 std: {cv_std}")
Cross validation scores are: [0.55682521 0.63091503 0.59480566 0.61487913 0.58103858 0.58468717
0.6176015 0.55499181 0.56994707 0.63015257 0.64765313 0.61932703
0.59837317 0.54697645 0.54584293 0.53601267 0.59513112 0.6302872
0.55665255 0.59870962 0.58082884 0.5416791 0.62810948 0.63953924
0.5935612 ]
Cross validation R^2 mean: 0.59
Cross validation R^2 std: 0.03
Train the final model
For training the final model, we train on all the available data.
# Build the final random forest and fit it on every row of `features` and
# `labels` (no data is held out at this stage).
final_model_params = {
    "n_estimators": 100,
    "random_state": RANDOM_SEED,
    "verbose": 0,
}
model = RandomForestRegressor(**final_model_params)
model.fit(features.values, labels.values.ravel())
RandomForestRegressor(random_state=42) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Pickle the fitted model into the current working directory; the file name
# combines the rollout date and the country code.
model_save_path = f"./{ROLLOUT_DATE}-{COUNTRY_CODE}-single-country-model.pkl"
with open(model_save_path, "wb") as file:
    pickle.dump(model, file)
SHAP
# Compute SHAP values for the fitted model over the full feature table.
explainer = fasttreeshap.TreeExplainer(model, algorithm="auto", n_jobs=-1)
explanation = explainer(features)
shap_values = explanation.values

column_names = features.columns

# First plot: bar-type summary, given the feature DataFrame.
shap.summary_plot(
    shap_values,
    features,
    feature_names=column_names,
    plot_type="bar",
)

# Second plot: default summary plot, given the raw feature array.
shap.summary_plot(
    shap_values,
    features.values,
    feature_names=column_names,
)
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored