Idealized Synthetic Data

Under development

import numpy as np
import pandas as pd
import xarray as xr
from IPython.display import display  # so can run as script too

from melodies_monet import driver
an = driver.analysis()
an.control = "control_idealized.yaml"
an.read_control()
an
analysis(
    control='control_idealized.yaml',
    control_dict=...,
    models={},
    obs={},
    paired={},
    start_time=Timestamp('2019-09-09 00:00:00'),
    end_time=Timestamp('2019-09-10 00:00:00'),
    time_intervals=None,
    download_maps=True,
    output_dir='./output/idealized',
    output_dir_save='./output/idealized',
    output_dir_read='./output/idealized',
    debug=True,
    save={'paired': {'method': 'netcdf', 'prefix': 'asdf', 'data': 'all'}},
    read={'paired': {'method': 'netcdf', 'filenames': {'test_obs_test_model': 'asdf_test_obs_test_model.nc4'}}},
    regrid=False,
)

Generate data

Model

Hide code cell source

rs = np.random.RandomState(42)

control = an.control_dict

nlat = 100
nlon = 200

lon = np.linspace(-161, -60, nlon)
lat = np.linspace(18, 60, nlat)
Lon, Lat = np.meshgrid(lon, lat)

time = pd.date_range(control['analysis']['start_time'], control['analysis']['end_time'], freq="3h")
ntime = time.size

# Generate translating and expanding Gaussian
x_ = np.linspace(-1, 1, lon.size)
y_ = np.linspace(-1, 1, lat.size)
x, y = np.meshgrid(x_, y_)
mu = np.linspace(-0.5, 0.5, ntime)
sigma = np.linspace(0.3, 1, ntime)
g = np.exp(
    -(
        (
            (x[np.newaxis, ...] - mu[:, np.newaxis, np.newaxis])**2
            + y[np.newaxis, ...]**2
        ) / ( 
            2 * sigma[:, np.newaxis, np.newaxis]**2
        ) 
    ) 
)

# Coordinates
lat_da = xr.DataArray(lat, dims="lat", attrs={'longname': 'latitude', 'units': 'degN'}, name="lat")
lon_da = xr.DataArray(lon, dims="lon", attrs={'longname': 'longitude', 'units': 'degE'}, name="lon")
time_da = xr.DataArray(time, dims="time", name="time")

# Generate dataset
field_names = control['model']['test_model']['variables'].keys()
ds_dict = dict()
for field_name in field_names:
    units = control['model']['test_model']['variables'][field_name]['units']
    # data = rs.rand(ntime, nlat, nlon)
    data = g
    da = xr.DataArray(
        data,
        # coords={"lat": lat_da, "lon": lon_da, "time": time_da},
        coords=[time_da, lat_da, lon_da],
        dims=['time', 'lat', 'lon'],
        attrs={'units': units},
    )
    ds_dict[field_name] = da
ds = xr.Dataset(ds_dict).expand_dims("z", axis=1)
ds["z"] = [1]

ds_mod = ds
ds_mod
<xarray.Dataset> Size: 3MB
Dimensions:  (time: 9, lat: 100, lon: 200, z: 1)
Coordinates:
  * time     (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * lat      (lat) float64 800B 18.0 18.42 18.85 19.27 ... 59.15 59.58 60.0
  * lon      (lon) float64 2kB -161.0 -160.5 -160.0 ... -61.02 -60.51 -60.0
  * z        (z) int64 8B 1
Data variables:
    A        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
    B        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
ds.squeeze("z").A.plot(col="time", col_wrap=5, size=3);
../_images/6b9fd5fd2160a718fe24cfb23505ed7c340abc32fb96cc35114fb119ae3f2173.png
ds.to_netcdf(control['model']['test_model']['files'])

Obs

Hide code cell source

from string import ascii_lowercase

# Generate positions
# TODO: only within land boundaries would be cleaner
n = 500
lats = rs.uniform(lat[0], lat[-1], n)
lons = rs.uniform(lon[0], lon[-1], n)
siteid = np.array([f"{x:0{len(str(n))}}" for x in range(n)])[np.newaxis, :]
something_vlen = np.array(
    [
        "".join(rs.choice(list(ascii_lowercase), size=x, replace=True))
        for x in rs.randint(low=2, high=8, size=n)
    ],
    dtype=str,
)[np.newaxis, :]

# Generate dataset
field_names = control['model']['test_model']['variables'].keys()
ds_dict = dict()
for field_name0 in field_names:
    field_name = control['model']['test_model']['mapping']['test_obs'][field_name0]
    units = control['model']['test_model']['variables'][field_name0]['units']
    values = (
        ds_mod.A.squeeze().interp(lat=xr.DataArray(lats), lon=xr.DataArray(lons)).values
        + rs.normal(scale=0.3, size=(ntime, n))
    )[:, np.newaxis]
    da = xr.DataArray(
        values,
        coords={
            "x": ("x", np.arange(n)),  # !!!
            "time": ("time", time),
            "latitude": (("y", "x"), lats[np.newaxis, :], lat_da.attrs),
            "longitude": (("y", "x"), lons[np.newaxis, :], lon_da.attrs),
            "siteid": (("y", "x"), siteid),
            "something_vlen": (("y", "x"), something_vlen),
        },
        dims=("time", "y", "x"),
        attrs={'units': units},
    )
    ds_dict[field_name] = da
ds = xr.Dataset(ds_dict)
ds
<xarray.Dataset> Size: 104kB
Dimensions:         (x: 500, time: 9, y: 1)
Coordinates:
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
    latitude        (y, x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (y, x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (y, x) <U3 6kB '000' '001' '002' '003' ... '497' '498' '499'
    something_vlen  (y, x) <U7 14kB 'xicjs' 'cvoaglq' ... 'dzzfatw' 'poqhzl'
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
Dimensions without coordinates: y
Data variables:
    A_obs           (time, y, x) float64 36kB 0.21 0.1793 ... 0.118 0.6552
    B_obs           (time, y, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
ds.to_netcdf(control['obs']['test_obs']['filename'])

Load

an.open_models()
random
test_model.nc
**** Reading Unspecified model output. Take Caution...
an.models['test_model'].obj

Hide code cell output

<xarray.Dataset> Size: 3MB
Dimensions:  (time: 9, z: 1, lat: 100, lon: 200)
Coordinates:
  * time     (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * z        (z) int64 8B 1
  * lat      (lat) float64 800B 18.0 18.42 18.85 19.27 ... 59.15 59.58 60.0
  * lon      (lon) float64 2kB -161.0 -160.5 -160.0 ... -61.02 -60.51 -60.0
Data variables:
    A        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
    B        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
an.open_obs()
an.obs['test_obs'].obj

Hide code cell output

<xarray.Dataset> Size: 104kB
Dimensions:         (time: 9, y: 1, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
    latitude        (y, x) float64 4kB ...
    longitude       (y, x) float64 4kB ...
    siteid          (y, x) <U3 6kB ...
    something_vlen  (y, x) <U7 14kB ...
Dimensions without coordinates: y
Data variables:
    A_obs           (time, y, x) float64 36kB ...
    B_obs           (time, y, x) float64 36kB ...

Pair

%%time

an.pair_data()

Hide code cell output

1, in pair data
After pairing:             time     A_obs     B_obs   latitude   longitude siteid  \
0    2019-09-09  0.209983 -0.109890  33.730685  -90.485667    000   
1    2019-09-09  0.179324 -0.226945  57.930001 -106.854267    001   
2    2019-09-09  0.173470  0.051573  48.743746 -129.737711    002   
3    2019-09-09 -0.171391 -0.117394  43.143656  -78.806703    003   
4    2019-09-09  0.031642  0.115815  24.552783  -91.842152    004   
...         ...       ...       ...        ...         ...    ...   
4495 2019-09-10  0.839140  0.076721  32.840794 -151.750211    495   
4496 2019-09-10  0.755061  0.721992  42.513557  -68.351329    496   
4497 2019-09-10  0.462036  0.622860  21.264855 -147.181318    497   
4498 2019-09-10  0.117985  0.115536  58.924582  -65.026027    498   
4499 2019-09-10  0.655239  0.636502  59.420851 -115.953417    499   

     something_vlen         A         B  
0             xicjs  0.008033  0.008033  
1           cvoaglq  0.001784  0.001784  
2              uzxt  0.291694  0.291694  
3            edltxr  0.000693  0.000693  
4                qs  0.001035  0.001035  
...             ...       ...       ...  
4495        njcycof  0.401354  0.401354  
4496            qlv  0.930279  0.930279  
4497           txky  0.330805  0.330805  
4498        dzzfatw  0.593909  0.593909  
4499         poqhzl  0.515131  0.515131  

[4500 rows x 9 columns]
saving pair
CPU times: user 89.8 ms, sys: 5.09 ms, total: 94.9 ms
Wall time: 89.9 ms
an.paired
{'test_obs_test_model': pair(
     type='pt_sfc',
     radius_of_influence=1000000.0,
     obs='test_obs',
     model='test_model',
     model_vars=['A', 'B'],
     obs_vars=['A_obs', 'B_obs'],
     filename='test_obs_test_model.nc',
 )}
an.paired['test_obs_test_model'].obj
<xarray.Dataset> Size: 196kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    A_obs           (time, x) float64 36kB 0.21 0.1793 0.1735 ... 0.118 0.6552
    B_obs           (time, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
    something_vlen  (time, x) object 36kB 'xicjs' 'cvoaglq' ... 'poqhzl'
    A               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    B               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    latitude        (x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (x) object 4kB '000' '001' '002' '003' ... '497' '498' '499'
an.paired['test_obs_test_model'].obj.dims
FrozenMappingWarningOnValuesAccess({'time': 9, 'x': 500})

Plot

%%time

an.plotting()
Warning: variables dict for A_obs not provided, so defaults used
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/develop/lib/python3.11/site-packages/cartopy/io/__init__.py:242: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_coastline.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/develop/lib/python3.11/site-packages/cartopy/io/__init__.py:242: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_boundary_lines_land.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/develop/lib/python3.11/site-packages/cartopy/io/__init__.py:242: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_1_states_provinces_lines.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
Warning: variables dict for B_obs not provided, so defaults used
CPU times: user 3.61 s, sys: 207 ms, total: 3.82 s
Wall time: 5.3 s
../_images/b6bb217a01c1cdb37f24f6a9dc07cc5b74ee6da6f52c3f01298af21a3d73ece1.png ../_images/feee1223cd16bc8bbb25bac9b80f87456155a92b97e5a9335bb22da8efeb7cfc.png ../_images/9d640f54aee0ea3c7d27e3b03e6dbf225646a2a8b3318efef9c09233b0694b3b.png ../_images/93e3a43f057b476f2a9a5f1d48167a10340d1351daecbf780a5c47999fface71.png

Save/load paired data – netCDF

And compare to the original pair object.

from copy import deepcopy

p0 = deepcopy(an.paired['test_obs_test_model'].obj)

an.save_analysis()
an.read_analysis()
p1 = deepcopy(an.paired['test_obs_test_model'].obj)
p1.close()

display(p0)
display(p1)
assert p1 is not p0 and p1.equals(p0)
Writing: ./output/idealized/asdf_test_obs_test_model.nc4
Reading: ./output/idealized/asdf_test_obs_test_model.nc4
<xarray.Dataset> Size: 196kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    A_obs           (time, x) float64 36kB 0.21 0.1793 0.1735 ... 0.118 0.6552
    B_obs           (time, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
    something_vlen  (time, x) object 36kB 'xicjs' 'cvoaglq' ... 'poqhzl'
    A               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    B               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    latitude        (x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (x) object 4kB '000' '001' '002' '003' ... '497' '498' '499'
<xarray.Dataset> Size: 288kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    A_obs           (time, x) float64 36kB ...
    B_obs           (time, x) float64 36kB ...
    something_vlen  (time, x) <U7 126kB ...
    A               (time, x) float64 36kB ...
    B               (time, x) float64 36kB ...
    latitude        (x) float64 4kB ...
    longitude       (x) float64 4kB ...
    siteid          (x) <U3 6kB ...
Attributes:
    title:         
    format:        NetCDF-4
    date_created:  2026-06-03
    dict_json:     {\n    "type": "pt_sfc",\n    "radius_of_influence": 10000...
    group_name:    test_obs_test_model

Save/load paired data – Python object

print(an.save)
an.save["paired"]["method"] = "pkl"
del an.save["paired"]["prefix"]
# We could leave `prefix` since unused, but we need to set `output_name`
an.save["paired"]["output_name"] = "asdf.joblib"
print("->", an.save)

print()
print(an.read)
an.read["paired"]["method"] = "pkl"
an.read["paired"]["filenames"] = "asdf.joblib"
print("->", an.read)

print()
an.save_analysis()
an.read_analysis()
p2 = deepcopy(an.paired['test_obs_test_model'].obj)
p2.close()

# display(p0)
display(p2)
assert p2 is not p0 and p2.equals(p0)
{'paired': {'method': 'netcdf', 'prefix': 'asdf', 'data': 'all'}}
-> {'paired': {'method': 'pkl', 'data': 'all', 'output_name': 'asdf.joblib'}}

{'paired': {'method': 'netcdf', 'filenames': {'test_obs_test_model': ['./output/idealized/asdf_test_obs_test_model.nc4']}}}
-> {'paired': {'method': 'pkl', 'filenames': 'asdf.joblib'}}

Writing: ./output/idealized/asdf.joblib
Reading: ./output/idealized/asdf.joblib
<xarray.Dataset> Size: 288kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    A_obs           (time, x) float64 36kB ...
    B_obs           (time, x) float64 36kB ...
    something_vlen  (time, x) <U7 126kB ...
    A               (time, x) float64 36kB ...
    B               (time, x) float64 36kB ...
    latitude        (x) float64 4kB ...
    longitude       (x) float64 4kB ...
    siteid          (x) <U3 6kB ...
Attributes:
    title:         
    format:        NetCDF-4
    date_created:  2026-06-03
    dict_json:     {\n    "type": "pt_sfc",\n    "radius_of_influence": 10000...
    group_name:    test_obs_test_model