Hands-on - Virtualize NetCDF from EarthData
Step 1: Import necessary functions and classes
import datetime
from urllib.parse import urlparse

import earthaccess
import obstore
import pandas as pd
import xarray as xr
from obstore.auth.earthdata import NasaEarthdataCredentialProvider
from virtualizarr import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr.parsers import HDFParser
from virtualizarr.registry import ObjectStoreRegistry
Zarr can emit a lot of warnings about Numcodecs codecs not yet being included in the Zarr version 3 specification, so let’s suppress those.
import warnings

warnings.filterwarnings(
    "ignore",
    message="Numcodecs codecs are not in the Zarr version 3 specification*",
    category=UserWarning,
)

Step 2: Search NASA CMR using earthaccess
start_date = datetime.datetime(2022, 4, 1)
end_date = datetime.datetime(2022, 5, 12)
date_array = pd.date_range(start=start_date, end=end_date, freq="D").to_pydatetime()
short_name = "GLDAS_NOAH025_3H"
version = "2.1"
variable = "SoilMoi0_10cm_inst" # Only select a single variable of interest
print("Retrieving data granules from Earthaccess")
earthaccess.login()
results = earthaccess.search_data(
    short_name=short_name,
    version=version,
    temporal=(start_date, end_date),
    cloud_hosted=True,
)
urls = [g["umm"]["RelatedUrls"][1]["URL"] for g in results]
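Hard-coding RelatedUrls[1] assumes the direct S3 link is always the second entry for this collection. As a less position-dependent alternative, here is a minimal sketch that assumes earthaccess's data_links helper (check it against your earthaccess version):
# Sketch: ask earthaccess for each granule's direct (in-region S3) data link
# instead of relying on its position within RelatedUrls.
s3_urls = [g.data_links(access="direct")[0] for g in results]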
Step 3: Create an ObjectStore and an ObjectStoreRegistry
url = urls[0]  # Use the first URL for virtualizarr
parsed = urlparse(url)
bucket = parsed.netloc
scheme = parsed.scheme
credential_url = [
    item["URL"]
    for item in results[0]["umm"]["RelatedUrls"]
    if item["Description"]
    == "api endpoint to retrieve temporary credentials valid for same-region direct s3 access"
][0]
cp = NasaEarthdataCredentialProvider(credential_url)
store = obstore.store.S3Store(bucket=bucket, region="us-west-2", credential_provider=cp)
registry = ObjectStoreRegistry({f"{scheme}://{bucket}": store})
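Before virtualizing, it can help to confirm that the store and the temporary Earthdata credentials can actually reach the object. This optional check is a sketch that assumes obstore exposes a module-level head() returning basic object metadata:
# Optional sanity check (sketch): issue a HEAD request for the granule's key within the bucket.
# obstore.head is assumed here; printing the returned metadata confirms access works.
key = parsed.path.lstrip("/")
print(obstore.head(store, key))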
Step 4: Create an instance of the HDFParser
parser = HDFParser()
Step 5: Create a virtual dataset via open_virtual_dataset
vds = open_virtual_dataset(
    url=url,
    parser=parser,
    registry=registry,
)
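Because vds is a regular xarray Dataset whose variables hold virtual chunk references, you can subset it with plain xarray indexing, no extra API needed. For example, to keep only the variable of interest defined in Step 2 (assuming it is present in these granules):
# Keep only the single variable of interest (plus its coordinates); the result is still fully virtual.
vds_single = vds[[variable]]
vds_single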
Step 6: Create a virtual datacube via open_virtual_mfdataset
vds = open_virtual_mfdataset(urls=urls[:2], parser=parser, registry=registry)
vds
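As a quick, hedged sanity check on the datacube: the two granules should now be combined along the time dimension, and the data variables should still be virtual (manifest-backed) rather than loaded into memory. Plain xarray and Python introspection are enough to verify this:
# Combined dimension sizes of the virtual datacube
print(vds.sizes)
# Data variables should still be virtual references, not in-memory arrays
print(type(vds[variable].data))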
Step 7: Load data directly into Xarray
ds = xr.open_zarr(parser(urls[0], registry=registry), zarr_format=3, consolidated=False)
da = ds[variable]
da
da.load()
da.plot()
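Note that calling .plot() on an array with more than two dimensions falls back to a histogram. For a map view, select a single time step first; this is a sketch that assumes the granule carries a time dimension:
# Plot the first (for a single 3-hourly granule, the only) time step as a map.
da.isel(time=0).plot()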
TODO
Currently, persisting virtual Zarr stores that access data behind Earthdata Login with Icechunk is quite clunky, so it has been excluded from this demo. We are actively discussing options for a more user-friendly API with the Icechunk developers.