# Use Mercator projection because Spline is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())
proj_coords = projection(data.longitude.values, data.latitude.values)
region = vd.get_region((data.longitude, data.latitude))
# The desired grid spacing in degrees (converted to meters using 1 degree approx. 111km)
spacing = 15 / 60
########################################################################################
# Splitting the data
# ------------------
#
# To score a gridder, we can split the data into a training set (used to fit the
# spline) and a testing set (used to compute an R² score) with
# :func:`verde.train_test_split`. The catch is that the score depends on how the split
# is made, which is controlled by the ``random_state`` argument.
# Now do the same with a different seed:
train_other, test_other = vd.train_test_split(
    proj_coords, data.air_temperature_c, test_size=0.3, random_state=1
)
print("R² score with seed 1:", vd.Spline().fit(*train_other).score(*test_other))
########################################################################################
# Changing the random state can lead to very different scores, even though the data and
# the gridder are the same. A single train-test split is therefore not a reliable
# measure of performance.
########################################################################################
# Cross-validation
# ----------------
#
# A more robust way of scoring the gridders is to use the function
# :func:`verde.cross_val_score`, which uses k-fold cross-validation through
# :class:`sklearn.model_selection.KFold` by default. It will split the data *k* times
# and return the score on each *fold*. We can then take the mean of these scores. By
# default, the data is shuffled prior to splitting.
scores = vd.cross_val_score(vd.Spline(), proj_coords, data.air_temperature_c)
print("k-fold scores:", scores)
print("Mean score:", np.mean(scores))
########################################################################################
# You can also use most cross-validation splitter classes from
# :mod:`sklearn.model_selection` by specifying the ``cv`` argument. For example, if we
# want to shuffle then split the data *n* times
# (:class:`sklearn.model_selection.ShuffleSplit`):
from sklearn.model_selection import ShuffleSplit
shuffle = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
scores = vd.cross_val_score(
    vd.Spline(), proj_coords, data.air_temperature_c, cv=shuffle
)
print("shuffle scores:", scores)
print("Mean score:", np.mean(scores))
########################################################################################
# Verde's own cross-validation splitter classes can be passed through the ``cv``
# argument in the same way. This is useful because randomly splitting spatially
# correlated data can lead to inflated scores: testing points often end up right next
# to training points. Verde offers a spatially blocked version of k-fold through
# :class:`verde.BlockKFold`, which assigns entire spatial blocks to the folds:
kfold = vd.BlockKFold(n_splits=5, shuffle=True, random_state=0, spacing=1 * 111000)
scores = vd.cross_val_score(
    vd.Spline(), proj_coords, data.air_temperature_c, cv=kfold
)
print("block k-fold scores:", scores)
print("Mean score:", np.mean(scores))
########################################################################################
# That is not a bad score but we can do better than using the default arguments for
# :class:`~verde.Spline`. We could try different combinations manually until we get a
# good score. A better way is to do this automatically, which is what the Tuning
# section below is about.
########################################################################################
# Visualizing blocked k-fold
# --------------------------
#
# It's easier to understand how blocked k-fold works by visualizing each of the folds.
# Let's plot the train and test sets for a non-randomized blocked k-fold:
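# The plotting code below is a minimal sketch (not the original figure code). It assumes
# that :class:`verde.BlockKFold` follows the scikit-learn splitter interface, with
# ``split`` taking an (n_samples, 2) array of easting and northing coordinates.
import matplotlib.pyplot as plt

kfold_plain = vd.BlockKFold(n_splits=4, shuffle=False, spacing=1 * 111000)
# The splitter operates on a matrix whose columns are the projected coordinates
feature_matrix = np.transpose(proj_coords)
east, north = proj_coords
fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharex=True, sharey=True)
for fold, (ax, (train_index, test_index)) in enumerate(
    zip(axes, kfold_plain.split(feature_matrix))
):
    ax.plot(east[train_index], north[train_index], ".b", markersize=2, label="train")
    ax.plot(east[test_index], north[test_index], ".r", markersize=2, label="test")
    ax.set_title(f"Fold {fold}")
axes[0].legend(loc="lower left")
plt.tight_layout()
plt.show()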
########################################################################################
# Before we begin tuning, let's reiterate what the results were with the default
# parameters.
spline_default = vd.Spline()
score_default = np.mean(
    vd.cross_val_score(spline_default, proj_coords, data.air_temperature_c)
)
spline_default.fit(proj_coords, data.air_temperature_c)
print("R² with defaults:", score_default)
########################################################################################
# Tuning
# ------
#
# :class:`~verde.Spline` has many parameters that can be set to modify the final result.
# The most important are the ``damping`` regularization parameter and the ``mindist``
# "fudge factor", both of which smooth the solution. Would changing the default values
# give us a better score?
#
# We can answer this question by changing the values in our ``spline`` and
# re-evaluating the model score repeatedly for different values of these parameters.
# Let's test the following combinations:
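# Candidate values to sweep over (one reasonable choice; ``damping=None`` disables
# regularization and ``mindist`` is in meters, the same unit as the projected
# coordinates):
dampings = [None, 1e-4, 1e-3, 1e-2]
mindists = [5e3, 10e3, 25e3, 50e3]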
# Use itertools to create a list with all combinations of parameters to test
parameter_sets = [
    dict(damping=combo[0], mindist=combo[1])
    for combo in itertools.product(dampings, mindists)
]
print("Number of combinations:", len(parameter_sets))
print("Combinations:", parameter_sets)
########################################################################################
# Now we can loop over the combinations and collect the scores for each parameter set.
spline = vd.Spline()
scores = []
for params in parameter_sets:
    spline.set_params(**params)
    score = np.mean(vd.cross_val_score(spline, proj_coords, data.air_temperature_c))
    scores.append(score)
print(scores)
########################################################################################
# The parameter combination with the largest score is the best one.
best = np.argmax(scores)
print("Best score:", scores[best])
print("Score with defaults:", score_default)
print("Best parameters:", parameter_sets[best])
########################################################################################
# **That is a big improvement over our previous score!**
#
# This type of tuning is important and should always be performed when using a new
# gridder or a new dataset. However, the above implementation requires a lot of manual
# coding.
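# As a final sketch (assuming the standard :meth:`~verde.Spline.grid` interface), we
# could refit a spline with the best parameters and lay out a grid in geographic
# coordinates using the region, spacing, and projection defined earlier:
spline_best = vd.Spline(**parameter_sets[best])
spline_best.fit(proj_coords, data.air_temperature_c)
grid = spline_best.grid(
    region=region,
    spacing=spacing,
    projection=projection,
    dims=["latitude", "longitude"],
    data_names=["temperature"],
)
print(grid)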