Hands-on - Virtualize NetCDF from EarthData
Step 1: Import necessary functions and classes
import datetime
from urllib.parse import urlparse
import earthaccess
import obstore
import pandas as pd
import xarray as xr
from obstore.auth.earthdata import NasaEarthdataCredentialProvider
from virtualizarr import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr.parsers import HDFParser
from virtualizarr.registry import ObjectStoreRegistry
Zarr can emit a lot of warnings about Numcodecs not being included in the Zarr version 3 specification yet – let's suppress those.
import warnings

warnings.filterwarnings(
    "ignore",
    message="Numcodecs codecs are not in the Zarr version 3 specification*",
    category=UserWarning,
)
Step 2: Search NASA CMR using earthaccess
start_date = datetime.datetime(2022, 4, 1)
end_date = datetime.datetime(2022, 5, 12)
date_array = pd.date_range(start=start_date, end=end_date, freq="D").to_pydatetime()

short_name = "GLDAS_NOAH025_3H"
version = "2.1"
variable = "SoilMoi0_10cm_inst"  # Only select a single variable of interest

print("Retrieving data granules from Earthaccess")
earthaccess.login()
results = earthaccess.search_data(
    short_name=short_name,
    version=version,
    temporal=(start_date, end_date),
    cloud_hosted=True,
)
urls = [g["umm"]["RelatedUrls"][1]["URL"] for g in results]
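Before moving on, an optional sanity check confirms that the search returned granules and that the extracted URLs look reasonable. This snippet is illustrative only and relies on the same RelatedUrls indexing as the list comprehension above.
# Optional sanity check: how many granules were found, and what do the URLs look like?
print(f"Found {len(results)} granules")
for u in urls[:3]:
    print(u)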
Step 3: Create an ObjectStore and an ObjectStoreRegistry
url = urls[0]  # Use the first URL for virtualizarr
parsed = urlparse(url)
bucket = parsed.netloc
scheme = parsed.scheme
credential_url = [
    item["URL"]
    for item in results[0]["umm"]["RelatedUrls"]
    if item["Description"]
    == "api endpoint to retrieve temporary credentials valid for same-region direct s3 access"
][0]
cp = NasaEarthdataCredentialProvider(credential_url)
store = obstore.store.S3Store(bucket=bucket, region="us-west-2", credential_provider=cp)
registry = ObjectStoreRegistry({f"{scheme}://{bucket}": store})
Step 4: Create an instance of the HDFParser
parser = HDFParser()
Step 5: Create a virtual dataset via open_virtual_dataset
vds = open_virtual_dataset(
    url=url,
    parser=parser,
    registry=registry,
)
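As an optional check (not part of the original demo), you can confirm that the dataset really is virtual: for virtualized variables, the underlying .data should be a VirtualiZarr ManifestArray of chunk references rather than an in-memory NumPy array.
# Optional: verify the variable is virtual; its .data should be a ManifestArray
# of chunk references on S3 rather than an in-memory NumPy array.
print(type(vds[variable].data))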
Step 6: Create a virtual datacube via open_virtual_mfdataset
vds = open_virtual_mfdataset(urls=urls[:2], parser=parser, registry=registry)
vds
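Note that the variable selected in Step 2 has not been used yet. Since the virtual datacube is a regular xarray Dataset, you can subset it to just that variable before going any further. A minimal sketch (vds_subset is a name introduced here for illustration):
# Keep only the variable of interest (plus its coordinates); standard xarray
# selection works on the virtual datacube as well.
vds_subset = vds[[variable]]
vds_subset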
Step 7: Load data directly into Xarray
ds = xr.open_zarr(parser(urls[0], registry=registry), zarr_format=3, consolidated=False)
da = ds["Snowf_tavg"]
da

da.load()
da.plot()
TODO
Currently, persisting virtual Zarr stores that access data behind Earthdata Login with Icechunk is quite clunky. Therefore, it has been excluded from the demo. We are actively discussing options for a more user-friendly API with the Icechunk developers.
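For orientation only, the sketch below shows roughly what persisting the virtual dataset to a local Icechunk repository could look like. The accessor name (vds.vz.to_icechunk; older VirtualiZarr releases spelled it vds.virtualize.to_icechunk), the local storage path, and the credential handling are all assumptions here, not the recommended workflow.
# Illustrative sketch only, not the polished workflow referred to above.
import icechunk

storage = icechunk.local_filesystem_storage("./gldas_virtual_store")  # hypothetical path
repo = icechunk.Repository.create(storage)
session = repo.writable_session("main")

# Write the chunk *references* (not the data itself) into the Icechunk store.
vds.vz.to_icechunk(session.store)  # assumed accessor; older releases: vds.virtualize.to_icechunk
session.commit("Add virtual references to GLDAS granules")

# The clunky part: reading those virtual chunks back requires configuring the
# repository with a virtual chunk container and S3 credentials valid for the
# Earthdata bucket, and the temporary Earthdata tokens expire, which is why this
# step is omitted from the demo.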