score = spline.score(*test)
print("R² score:", score)
########################################################################################
# That's a good score, meaning that our gridder is able to accurately predict data that
# wasn't used in the gridding algorithm.
#
# .. caution::
#
#    One caveat for this score is that it is highly dependent on the particular split
#    that we made. Changing the random number generator seed in
#    :func:`verde.train_test_split` will result in a different score.
# Use 1 as a seed instead of 0
train_other, test_other = vd.train_test_split(
    proj_coords, data.air_temperature_c, test_size=0.3, random_state=1
)
print("R² score with seed 1:", vd.Spline().fit(*train_other).score(*test_other))
########################################################################################
# Cross-validation
# ----------------
#
# A more robust way of scoring the gridders is to use the function
# :func:`verde.cross_val_score`, which uses `k-fold cross-validation
# <https://en.wikipedia.org/wiki/Cross-validation_(statistics)>`__ by default. It will
# split the data *k* times and return the score on each *fold*. We can then take the
# mean of these scores.
scores = vd.cross_val_score(vd.Spline(), proj_coords, data.air_temperature_c)
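# Each entry in scores is the R² on one fold. As a minimal illustration of combining
# them (assuming numpy is imported as np), we can take their mean:
print("k-fold scores:", scores)
print("Mean score:", np.mean(scores))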
import numpy as np
import pyproj

import verde as vd
# Fetch the wind speed data from Texas.
data = vd.datasets.fetch_texas_wind()
print(data.head())
# Separate out some of the data into utility variables
coordinates = (data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)
# Use a Mercator projection because Spline is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())
# Split the data into a training and testing set. We'll fit the gridder on the training
# set and use the testing set to evaluate how well the gridder is performing.
train, test = vd.train_test_split(
    projection(*coordinates),
    (data.wind_speed_east_knots, data.wind_speed_north_knots),
    random_state=2,
)
# We'll make a 20 arc-minute grid
spacing = 20 / 60
# Chain together a blocked mean to avoid aliasing, a polynomial trend (Spline usually
# requires de-trended data), and finally a Spline for each component. Notice that
# BlockReduce can work on multicomponent data without the use of Vector.
chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing * 111e3)),
        ("trend", vd.Vector([vd.Trend(degree=1) for i in range(2)])),
        # The last step was cut off in this excerpt; per the comment above, a Spline
        # for each component wrapped in Vector is the natural completion (the damping
        # value is illustrative).
        ("spline", vd.Vector([vd.Spline(damping=1e-10) for i in range(2)])),
    ]
)
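# A minimal sketch of the evaluation step described above (using the train/test split
# and chain defined in this excerpt) is to fit on the training set and score on the
# testing set:
chain.fit(*train)
print("Wind vector R² score:", chain.score(*test))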
#
# We can't evaluate a gridder on the data that went into fitting it. The true test of a
# model is if it can correctly predict data that it hasn't seen before. scikit-learn has
# the :func:`sklearn.model_selection.train_test_split` function to separate a dataset
# into two parts: one for fitting the model (called *training* data) and a separate one
# for evaluating the model (called *testing* data). Using it with spatial data would
# involve some tedious array conversions, so Verde implements
# :func:`verde.train_test_split`, which does the same thing but takes coordinates and
# data arrays instead.
#
# The split is done randomly so we specify a seed for the random number generator to
# guarantee that we'll get the same result every time we run this example. You probably
# don't want to do that for real data. We'll keep 30% of the data to use for testing
# (``test_size=0.3``).
train, test = vd.train_test_split(
    proj_coords, bathymetry, test_size=0.3, random_state=0
)
# The test size should be roughly 30% of the available data
print(train[0][0].size, test[0][0].size)
########################################################################################
# The returned ``train`` and ``test`` variables are tuples containing coordinates, data,
# and (optionally) weights arrays. Since we're not using weights, the third element of
# the tuple will be ``None``:
print(train)
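# For example, each tuple can be unpacked into its three parts (the variable names
# below are just illustrative):
train_coords, train_data, train_weights = train
print(train_weights)  # None, since we didn't pass weights to the split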
########################################################################################
# Let's plot the points belonging to the two datasets with different colors:
plt.figure(figsize=(8, 6))
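# The scatter plot itself is not shown in this excerpt; a minimal sketch using plain
# matplotlib (assuming matplotlib.pyplot is imported as plt and using the train and
# test tuples from above) could be:
plt.scatter(*train[0], c="blue", s=1, label="train")
plt.scatter(*test[0], c="red", s=1, label="test")
plt.legend()
plt.show()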
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
import pyproj

import verde as vd
# Fetch the GPS data from the U.S. West coast. We'll grid only the horizontal components
# of the velocities
data = vd.datasets.fetch_california_gps()
coordinates = (data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)
# Use a Mercator projection because VectorSpline2D is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())
# Split the data into a training and testing set. We'll fit the gridder on the training
# set and use the testing set to evaluate how well the gridder is performing.
train, test = vd.train_test_split(
    projection(*coordinates), (data.velocity_east, data.velocity_north), random_state=0
)
# We'll make a 15 arc-minute grid in the end.
spacing = 15 / 60
# Chain together a blocked mean to avoid aliasing, a polynomial trend to take care of
# the increase toward the coast, and finally the vector gridder using Poisson's ratio
# 0.5 to couple the two horizontal components.
chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing * 111e3)),
        ("trend", vd.Vector([vd.Trend(degree=1) for i in range(2)])),
        ("spline", vd.VectorSpline2D(poisson=0.5, mindist=10e3)),
    ]
)
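# A minimal sketch of evaluating this vector gridder (using the train/test split made
# above for the GPS velocities) is again to fit and score:
chain.fit(*train)
print("GPS velocity R² score:", chain.score(*test))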
def generic_gridder(day, df, idx):
    """
    Generic gridding algorithm for easy variables
    """
    data = df[idx].values
    coordinates = (df["lon"].values, df["lat"].values)
    # XAXIS and YAXIS are module-level grid axis arrays defined elsewhere in the
    # original script.
    region = [XAXIS[0], XAXIS[-1], YAXIS[0], YAXIS[-1]]
    projection = pyproj.Proj(proj="merc", lat_ts=df["lat"].mean())
    spacing = 0.5
    # Block-average the data to avoid aliasing, then fit a damped spline.
    chain = vd.Chain(
        [
            ("mean", vd.BlockReduce(np.mean, spacing=spacing * 111e3)),
            ("spline", vd.Spline(damping=1e-10, mindist=100e3)),
        ]
    )
    # Hold out part of the data to score the fitted model.
    train, test = vd.train_test_split(projection(*coordinates), data, random_state=0)
    chain.fit(*train)
    score = chain.score(*test)
    shape = (len(YAXIS), len(XAXIS))
    grid = chain.grid(
        region=region,
        shape=shape,
        projection=projection,
        dims=["latitude", "longitude"],
        data_names=["precip"],
    )
    res = grid.to_array()
    res = np.ma.where(res < 0, 0, res)
    print(
        ("%s %s rows for %s column min:%.3f max:%.3f score: %.3f")
        # The format arguments were cut off in this excerpt; these are illustrative.
        % (day, len(df.index), idx, res.min(), res.max(), score)
    )
# Now we can chain a block weighted mean and weighted spline together. We'll use
# uncertainty propagation to calculate the new weights from block mean because our data
# vary smoothly but have different uncertainties.
spacing = 5 / 60 # 5 arc-minutes
chain = vd.Chain(
    [
        ("mean", vd.BlockMean(spacing=spacing * 111e3, uncertainty=True)),
        ("spline", vd.Spline(damping=1e-10)),
    ]
)
print(chain)
# Split the data into a training and testing set. We'll use the training set to grid the
# data and the testing set to validate our spline model. Weights need to be
# 1/uncertainty**2 for the error propagation in BlockMean to work.
train, test = vd.train_test_split(
    projection(*coordinates),
    data.velocity_up,
    weights=1 / data.std_up ** 2,
    random_state=0,
)
# Fit the model on the training set
chain.fit(*train)
# And calculate an R^2 score coefficient on the testing set. The best possible score
# (perfect prediction) is 1. This can tell us how good our spline is at predicting data
# that was not in the input dataset.
score = chain.score(*test)
print("\nScore: {:.3f}".format(score))
# Create a grid of the vertical velocity and mask it to only show points close to the
# actual data.
region = vd.get_region(coordinates)
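# The gridding and masking code was cut off in this excerpt; a minimal sketch using
# the chain fitted above (the grid spacing, data name, and maxdist values below are
# illustrative) could be:
grid_full = chain.grid(
    region=region,
    spacing=spacing,
    projection=projection,
    dims=["latitude", "longitude"],
    data_names=["velocity_up"],
)
grid = vd.distance_mask(
    coordinates, maxdist=spacing * 2 * 111e3, grid=grid_full, projection=projection
)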
coordinates = (data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)
# Split the data into a training and testing set by picking points at random
# This is NOT the best way to split spatially correlated data and will cease being the
# default in future versions of Verde.
train, test = vd.train_test_split(coordinates, data.bathymetry_m, random_state=0)
print("Train and test data sizes:")
print(train[0][0].size, test[0][0].size)
# Alternatively, we can split the data into blocks and pick blocks at random.
# The advantage of this approach is that it makes sure that the training and testing
# datasets are not spatially correlated, which would bias our model evaluation.
# This will be the default in future versions of Verde.
block_train, block_test = vd.train_test_split(
    coordinates, data.bathymetry_m, method="block", spacing=1, random_state=1
)
print("Blocked train and test data sizes:")
print(block_train[0][0].size, block_test[0][0].size)
fig, axes = plt.subplots(
    1, 2, figsize=(10, 6), subplot_kw=dict(projection=ccrs.Mercator())
)
crs = ccrs.PlateCarree()
ax = axes[0]
ax.set_title("Shuffle Split")
ax.scatter(*train[0], c="blue", s=1, transform=crs, label="train")
ax.scatter(*test[0], c="red", s=1, transform=crs, label="test")
vd.datasets.setup_baja_bathymetry_map(ax)
ax.coastlines()
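# The second panel was cut off in this excerpt; a matching sketch for the blocked
# split (mirroring the shuffle-split panel above) could be:
ax = axes[1]
ax.set_title("Block Split")
ax.scatter(*block_train[0], c="blue", s=1, transform=crs, label="train")
ax.scatter(*block_test[0], c="red", s=1, transform=crs, label="test")
vd.datasets.setup_baja_bathymetry_map(ax)
ax.coastlines()
ax.legend()
plt.show()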
# Now we can chain a blocked mean and spline together. The Spline can be regularized
# by setting the damping coefficient (should be positive). It's also a good idea to set
# the minimum distance to the average data spacing to avoid singularities in the spline.
chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing=spacing * 111e3)),
        ("spline", vd.Spline(damping=1e-10, mindist=100e3)),
    ]
)
print(chain)
# We can evaluate model performance by splitting the data into a training and testing
# set. We'll use the training set to grid the data and the testing set to validate our
# spline model.
train, test = vd.train_test_split(
    projection(*coordinates), data.air_temperature_c, random_state=0
)
# Fit the model on the training set
chain.fit(*train)
# And calculate an R^2 score coefficient on the testing set. The best possible score
# (perfect prediction) is 1. This can tell us how good our spline is at predicting data
# that was not in the input dataset.
score = chain.score(*test)
print("\nScore: {:.3f}".format(score))
# Now we can create a geographic grid of air temperature by providing a projection
# function to the grid method and mask points that are too far from the observations
grid_full = chain.grid(
    region=region,
    # The remaining arguments were cut off in this excerpt; these are illustrative.
    spacing=spacing,
    projection=projection,
)
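# The comment above also mentions masking points that are too far from the
# observations; a minimal sketch using verde.distance_mask (the maxdist value is
# illustrative) could be:
grid = vd.distance_mask(
    coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection
)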
plt.tight_layout()
plt.show()
########################################################################################
# Gridding
# --------
#
# You can use :class:`verde.Vector` to create multi-component gridders out of
# :class:`verde.Spline` the same way as we did for trends. In this case, each component
# is treated separately.
#
# We can start by splitting the data into training and testing sets (see
# :ref:`model_selection`). Notice that :func:`verde.train_test_split` works for
# multicomponent data automatically.
train, test = vd.train_test_split(
    coordinates=proj_coords,
    data=(data.velocity_east, data.velocity_north),
    weights=(1 / data.std_east ** 2, 1 / data.std_north ** 2),
    random_state=1,
)
########################################################################################
# Now we can make a 2-component spline. Since :class:`verde.Vector` implements
# ``fit``, ``predict``, and ``filter``, we can use it in a :class:`verde.Chain` to build
# a pipeline.
#
# We need to use a bit of damping so that the weights can be taken into account. Splines
# without damping provide a perfect fit to the data and ignore the weights as a
# consequence.
chain = vd.Chain(
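    [
        # The chain contents were cut off in this excerpt. Based on the surrounding
        # text, a plausible completion is a blocked mean followed by a damped
        # 2-component spline wrapped in Vector (the spacing and damping values are
        # illustrative).
        ("mean", vd.BlockReduce(np.mean, spacing * 111e3)),
        ("spline", vd.Vector([vd.Spline(damping=1e-10) for i in range(2)])),
    ]
)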