import holoviews as hv
import hvplot
import hvplot.pandas  # noqa
import pandas as pd
import statsmodels.formula.api as smf
# Route pandas' plotting API through holoviews.
pd.options.plotting.backend = "holoviews"

# Benchmarking: Sharding Extension
#
# Read summary of all benchmarking results.
summary = pd.read_parquet(
    "s3://carbonplan-benchmarks/benchmark-data/v0.2/summary.parq"
)

# Subset the data to isolate the impact of Zarr version (using the sharding
# extension for the V3 data) and of chunk size.
# Keep only the runs that share projection 3857, 128 pixels per tile and the
# us-west-2 region, so the remaining rows differ in the variables of interest.
df = summary[
    (summary["projection"] == 3857)
    & (summary["pixels_per_tile"] == 128)
    # Every Zarr V2 row, plus any row whose shard size is 100 (per the
    # narrative, these are the V3 runs that use the sharding extension).
    & ((summary["zarr_version"] == 2) | (summary["shard_size"] == 100))
    & (summary["region"] == "us-west-2")
]

# Two-color palette passed as ``cmap`` to the box plots, which color by
# ``zarr_version``.
cmap = ["#E1BE6A", "#40B0A6"]
# Figure size applied to the single overview plot.
plt_opts = {"width": 600, "height": 400}

# Create a box plot showing how the rendering time depends on Zarr version
# (using the sharding extension for the V3 data) and on chunk size.
# Distribution of ``duration`` for each (chunk size, Zarr version) pair,
# colored by Zarr version. Left as a bare expression so that a notebook cell
# displays the resulting plot.
df.hvplot.box(
    y="duration",
    by=["actual_chunk_size", "zarr_version"],
    c="zarr_version",
    cmap=cmap,
    ylabel="Time to render (ms)",
    xlabel="Chunk size (MB); Zarr Version",
    legend=False,
).opts(**plt_opts)

# Fit a multiple linear regression to the results. The results show that
# rendering Zarr V3 data with the sharding extension is slower than rendering
# Zarr V2 data, but most of the variance in rendering time is unexplained by
# that variable alone.
# Ordinary least squares fit of ``duration`` on chunk size (numeric) and
# Zarr version (treated as categorical via ``C(...)``).
model = smf.ols("duration ~ actual_chunk_size + C(zarr_version)", data=df).fit()
# Bare expression: a notebook cell displays the summary table.
model.summary()

# Recorded output of ``model.summary()``:
#
# | Dep. Variable: | duration | R-squared: | 0.320 |
# | Model: | OLS | Adj. R-squared: | 0.315 |
# | Method: | Least Squares | F-statistic: | 59.65 |
# | Date: | Tue, 29 Aug 2023 | Prob (F-statistic): | 5.99e-22 |
# | Time: | 20:31:22 | Log-Likelihood: | -1988.5 |
# | No. Observations: | 256 | AIC: | 3983. |
# | Df Residuals: | 253 | BIC: | 3994. |
# | Df Model: | 2 | | |
# | Covariance Type: | nonrobust |
#
# | | coef | std err | t | P>|t| | [0.025 | 0.975] |
# | Intercept | 1914.6479 | 64.596 | 29.640 | 0.000 | 1787.434 | 2041.862 |
# | C(zarr_version)[T.3] | 268.8945 | 71.893 | 3.740 | 0.000 | 127.310 | 410.479 |
# | actual_chunk_size | 42.0231 | 4.095 | 10.262 | 0.000 | 33.958 | 50.088 |
#
# | Omnibus: | 7.661 | Durbin-Watson: | 2.198 |
# | Prob(Omnibus): | 0.022 | Jarque-Bera (JB): | 6.705 |
# | Skew: | 0.323 | Prob(JB): | 0.0350 |
# | Kurtosis: | 2.539 | Cond. No. | 31.2 |
#
# Notes:
# [1] Standard Errors assume that the covariance matrix of the errors is
#     correctly specified.
# Show the rendering time at different zoom levels.

# Smaller figure size, since four panels are laid out in a grid below.
plt_opts = {"width": 400, "height": 300}
plts = []
for zoom_level in range(4):
    # One box plot per zoom level, built from the rows at that zoom only.
    df_level = df[df["zoom"] == zoom_level]
    plts.append(
        df_level.hvplot.box(
            y="duration",
            by=["actual_chunk_size", "zarr_version"],
            c="zarr_version",
            cmap=cmap,
            ylabel="Time to render (ms)",
            xlabel="Chunk size (MB); Zarr version",
            legend=False,
            title=f"Zoom level {zoom_level}",
        ).opts(**plt_opts)
    )
# Arrange the panels two per row. Bare expression: a notebook cell displays it.
hv.Layout(plts).cols(2)

# Recorded output (UserWarnings raised from holoviews' bokeh plotting code):
#
# /Users/max/mambaforge/envs/benchmark-maps/lib/python3.10/site-packages/holoviews/plotting/bokeh/plot.py:987: UserWarning: found multiple competing values for 'toolbar.active_drag' property; using the latest value
#   layout_plot = gridplot(
# /Users/max/mambaforge/envs/benchmark-maps/lib/python3.10/site-packages/holoviews/plotting/bokeh/plot.py:987: UserWarning: found multiple competing values for 'toolbar.active_scroll' property; using the latest value
#   layout_plot = gridplot(

# Add a multiplicative interaction term with zoom level to the multiple linear
# regression. The results show that chunk size has a significant impact on
# rendering performance at higher zoom levels, with the most pronounced effect
# at zoom level 3. Zarr V3 data with sharding renders faster than Zarr V2 data
# at zoom level 0 but slower at higher zoom levels.
# Refit with pairwise interactions between chunk size, zoom level and Zarr
# version. In the formula, ``a * b`` expands to ``a + b + a:b``. The formula is
# split across adjacent string literals only to keep the lines short; the
# concatenated string is unchanged.
model = smf.ols(
    "duration ~ actual_chunk_size * C(zoom)"
    " + C(zarr_version) * C(zoom)"
    " + actual_chunk_size * C(zarr_version)",
    data=df,
).fit()
# Bare expression: a notebook cell displays the summary table.
model.summary()

# Recorded output of ``model.summary()``:
#
# | Dep. Variable: | duration | R-squared: | 0.948 |
# | Model: | OLS | Adj. R-squared: | 0.945 |
# | Method: | Least Squares | F-statistic: | 369.4 |
# | Date: | Tue, 29 Aug 2023 | Prob (F-statistic): | 1.74e-148 |
# | Time: | 20:31:23 | Log-Likelihood: | -1659.4 |
# | No. Observations: | 256 | AIC: | 3345. |
# | Df Residuals: | 243 | BIC: | 3391. |
# | Df Model: | 12 | | |
# | Covariance Type: | nonrobust |
#
# | | coef | std err | t | P>|t| | [0.025 | 0.975] |
# | Intercept | 2225.1492 | 38.148 | 58.329 | 0.000 | 2150.006 | 2300.292 |
# | C(zoom)[T.1.0] | 210.8908 | 51.552 | 4.091 | 0.000 | 109.345 | 312.437 |
# | C(zoom)[T.2.0] | -578.2420 | 51.552 | -11.217 | 0.000 | -679.788 | -476.696 |
# | C(zoom)[T.3.0] | -1050.6001 | 51.552 | -20.379 | 0.000 | -1152.146 | -949.054 |
# | C(zarr_version)[T.3] | -34.4019 | 46.388 | -0.742 | 0.459 | -125.775 | 56.971 |
# | C(zarr_version)[T.3]:C(zoom)[T.1.0] | 612.5118 | 57.376 | 10.675 | 0.000 | 499.494 | 725.529 |
# | C(zarr_version)[T.3]:C(zoom)[T.2.0] | 412.6057 | 57.376 | 7.191 | 0.000 | 299.588 | 525.623 |
# | C(zarr_version)[T.3]:C(zoom)[T.3.0] | 539.9608 | 57.376 | 9.411 | 0.000 | 426.943 | 652.978 |
# | actual_chunk_size | 0.5489 | 2.584 | 0.212 | 0.832 | -4.540 | 5.638 |
# | actual_chunk_size:C(zoom)[T.1.0] | 58.0800 | 3.268 | 17.771 | 0.000 | 51.642 | 64.518 |
# | actual_chunk_size:C(zoom)[T.2.0] | 63.2566 | 3.268 | 19.355 | 0.000 | 56.819 | 69.694 |
# | actual_chunk_size:C(zoom)[T.3.0] | 62.6391 | 3.268 | 19.166 | 0.000 | 56.202 | 69.077 |
# | actual_chunk_size:C(zarr_version)[T.3] | -9.0395 | 2.311 | -3.912 | 0.000 | -13.592 | -4.487 |
#
# | Omnibus: | 0.970 | Durbin-Watson: | 1.815 |
# | Prob(Omnibus): | 0.616 | Jarque-Bera (JB): | 0.681 |
# | Skew: | -0.046 | Prob(JB): | 0.712 |
# | Kurtosis: | 3.235 | Cond. No. | 168. |
#
# Notes:
# [1] Standard Errors assume that the covariance matrix of the errors is
#     correctly specified.
