Idealized Synthetic Data

Under development

import numpy as np
import pandas as pd
import xarray as xr
from IPython.display import display  # so can run as script too

from melodies_monet import driver
an = driver.analysis()
an.control = "control_idealized.yaml"
an.read_control()
an
analysis(
    control='control_idealized.yaml',
    control_dict=...,
    models={},
    obs={},
    paired={},
    start_time=Timestamp('2019-09-09 00:00:00'),
    end_time=Timestamp('2019-09-10 00:00:00'),
    time_intervals=None,
    download_maps=True,
    output_dir='./output/idealized',
    output_dir_save='./output/idealized',
    output_dir_read='./output/idealized',
    debug=True,
    save={'paired': {'method': 'netcdf', 'prefix': 'asdf', 'data': 'all'}},
    read={'paired': {'method': 'netcdf', 'filenames': {'test_obs_test_model': 'asdf_test_obs_test_model.nc4'}}},
    regrid=False,
)

Generate data

Model

Hide code cell source
rs = np.random.RandomState(42)

control = an.control_dict

nlat = 100
nlon = 200

lon = np.linspace(-161, -60, nlon)
lat = np.linspace(18, 60, nlat)
Lon, Lat = np.meshgrid(lon, lat)

time = pd.date_range(control['analysis']['start_time'], control['analysis']['end_time'], freq="3H")
ntime = time.size

# Generate translating and expanding Gaussian
x_ = np.linspace(-1, 1, lon.size)
y_ = np.linspace(-1, 1, lat.size)
x, y = np.meshgrid(x_, y_)
mu = np.linspace(-0.5, 0.5, ntime)
sigma = np.linspace(0.3, 1, ntime)
g = np.exp(
    -(
        (
            (x[np.newaxis, ...] - mu[:, np.newaxis, np.newaxis])**2
            + y[np.newaxis, ...]**2
        ) / ( 
            2 * sigma[:, np.newaxis, np.newaxis]**2
        ) 
    ) 
)

# Coordinates
lat_da = xr.DataArray(lat, dims="lat", attrs={'longname': 'latitude', 'units': 'degN'}, name="lat")
lon_da = xr.DataArray(lon, dims="lon", attrs={'longname': 'longitude', 'units': 'degE'}, name="lon")
time_da = xr.DataArray(time, dims="time", name="time")

# Generate dataset
field_names = control['model']['test_model']['variables'].keys()
ds_dict = dict()
for field_name in field_names:
    units = control['model']['test_model']['variables'][field_name]['units']
    # data = rs.rand(ntime, nlat, nlon)
    data = g
    da = xr.DataArray(
        data,
        # coords={"lat": lat_da, "lon": lon_da, "time": time_da},
        coords=[time_da, lat_da, lon_da],
        dims=['time', 'lat', 'lon'],
        attrs={'units': units},
    )
    ds_dict[field_name] = da
ds = xr.Dataset(ds_dict).expand_dims("z", axis=1)
ds["z"] = [1]

ds_mod = ds
ds_mod
<xarray.Dataset> Size: 3MB
Dimensions:  (time: 9, lat: 100, lon: 200, z: 1)
Coordinates:
  * time     (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * lat      (lat) float64 800B 18.0 18.42 18.85 19.27 ... 59.15 59.58 60.0
  * lon      (lon) float64 2kB -161.0 -160.5 -160.0 ... -61.02 -60.51 -60.0
  * z        (z) int64 8B 1
Data variables:
    A        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
    B        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
ds.squeeze("z").A.plot(col="time", col_wrap=5, size=3);
../_images/7a7a6190d1f4dc47986ebad35ce0ab2aa746db58e0e5d3d67d66545158c2b4be.png
ds.to_netcdf(control['model']['test_model']['files'])

Obs

Hide code cell source
from string import ascii_lowercase

# Generate positions
# TODO: only within land boundaries would be cleaner
n = 500
lats = rs.uniform(lat[0], lat[-1], n)
lons = rs.uniform(lon[0], lon[-1], n)
siteid = np.array([f"{x:0{len(str(n))}}" for x in range(n)])[np.newaxis, :]
something_vlen = np.array(
    [
        "".join(rs.choice(list(ascii_lowercase), size=x, replace=True))
        for x in rs.randint(low=2, high=8, size=n)
    ],
    dtype=str,
)[np.newaxis, :]

# Generate dataset
field_names = control['model']['test_model']['variables'].keys()
ds_dict = dict()
for field_name0 in field_names:
    field_name = control['model']['test_model']['mapping']['test_obs'][field_name0]
    units = control['model']['test_model']['variables'][field_name0]['units']
    values = (
        ds_mod.A.squeeze().interp(lat=xr.DataArray(lats), lon=xr.DataArray(lons)).values
        + rs.normal(scale=0.3, size=(ntime, n))
    )[:, np.newaxis]
    da = xr.DataArray(
        values,
        coords={
            "x": ("x", np.arange(n)),  # !!!
            "time": ("time", time),
            "latitude": (("y", "x"), lats[np.newaxis, :], lat_da.attrs),
            "longitude": (("y", "x"), lons[np.newaxis, :], lon_da.attrs),
            "siteid": (("y", "x"), siteid),
            "something_vlen": (("y", "x"), something_vlen),
        },
        dims=("time", "y", "x"),
        attrs={'units': units},
    )
    ds_dict[field_name] = da
ds = xr.Dataset(ds_dict)
ds
<xarray.Dataset> Size: 104kB
Dimensions:         (x: 500, time: 9, y: 1)
Coordinates:
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
    latitude        (y, x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (y, x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (y, x) <U3 6kB '000' '001' '002' '003' ... '497' '498' '499'
    something_vlen  (y, x) <U7 14kB 'xicjs' 'cvoaglq' ... 'dzzfatw' 'poqhzl'
Dimensions without coordinates: y
Data variables:
    A_obs           (time, y, x) float64 36kB 0.21 0.1793 ... 0.118 0.6552
    B_obs           (time, y, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
ds.to_netcdf(control['obs']['test_obs']['filename'])

Load

an.open_models()
random
test_model.nc
**** Reading Unspecified model output. Take Caution...
an.models['test_model'].obj
Hide code cell output
<xarray.Dataset> Size: 3MB
Dimensions:  (time: 9, lat: 100, lon: 200, z: 1)
Coordinates:
  * time     (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * lat      (lat) float64 800B 18.0 18.42 18.85 19.27 ... 59.15 59.58 60.0
  * lon      (lon) float64 2kB -161.0 -160.5 -160.0 ... -61.02 -60.51 -60.0
  * z        (z) int64 8B 1
Data variables:
    A        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
    B        (time, z, lat, lon) float64 1MB 0.000964 0.001019 ... 0.5379 0.5353
an.open_obs()
an.obs['test_obs'].obj
Hide code cell output
<xarray.Dataset> Size: 104kB
Dimensions:         (x: 500, time: 9, y: 1)
Coordinates:
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
    latitude        (y, x) float64 4kB ...
    longitude       (y, x) float64 4kB ...
    siteid          (y, x) <U3 6kB ...
    something_vlen  (y, x) <U7 14kB ...
Dimensions without coordinates: y
Data variables:
    A_obs           (time, y, x) float64 36kB ...
    B_obs           (time, y, x) float64 36kB ...

Pair

%%time

an.pair_data()
Hide code cell output
1, in pair data
After pairing:                      time   latitude   longitude siteid something_vlen  \
0    2019-09-09 00:00:00  33.730685  -90.485667    000          xicjs   
1    2019-09-09 03:00:00  33.730685  -90.485667    000          xicjs   
2    2019-09-09 06:00:00  33.730685  -90.485667    000          xicjs   
3    2019-09-09 09:00:00  33.730685  -90.485667    000          xicjs   
4    2019-09-09 12:00:00  33.730685  -90.485667    000          xicjs   
...                  ...        ...         ...    ...            ...   
4495 2019-09-09 12:00:00  59.420851 -115.953417    499         poqhzl   
4496 2019-09-09 15:00:00  59.420851 -115.953417    499         poqhzl   
4497 2019-09-09 18:00:00  59.420851 -115.953417    499         poqhzl   
4498 2019-09-09 21:00:00  59.420851 -115.953417    499         poqhzl   
4499 2019-09-10 00:00:00  59.420851 -115.953417    499         poqhzl   

         A_obs     B_obs         A         B  
0     0.209983 -0.109890  0.008033  0.008033  
1     0.171024  0.289535  0.111157  0.111157  
2    -0.174732  0.634226  0.343373  0.343373  
3     0.518397  0.673546  0.587818  0.587818  
4     1.091830  1.225345  0.769533  0.769533  
...        ...       ...       ...       ...  
4495  0.160357  0.926215  0.316865  0.316865  
4496  0.652020  0.369675  0.394017  0.394017  
4497  0.713842  0.143496  0.450187  0.450187  
4498  0.328453  0.524407  0.489130  0.489130  
4499  0.655239  0.636502  0.515131  0.515131  

[4500 rows x 9 columns]
saving pair
CPU times: user 78.4 ms, sys: 3.97 ms, total: 82.4 ms
Wall time: 84 ms
an.paired
{'test_obs_test_model': pair(
     type='pt_sfc',
     radius_of_influence=1000000.0,
     obs='test_obs',
     model='test_model',
     model_vars=['A', 'B'],
     obs_vars=['A_obs', 'B_obs'],
     filename='test_obs_test_model.nc',
 )}
an.paired['test_obs_test_model'].obj
<xarray.Dataset> Size: 196kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    something_vlen  (time, x) object 36kB 'xicjs' 'cvoaglq' ... 'poqhzl'
    A_obs           (time, x) float64 36kB 0.21 0.1793 0.1735 ... 0.118 0.6552
    B_obs           (time, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
    A               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    B               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    latitude        (x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (x) object 4kB '000' '001' '002' '003' ... '497' '498' '499'
an.paired['test_obs_test_model'].obj.dims
FrozenMappingWarningOnValuesAccess({'time': 9, 'x': 500})

Plot

%%time

an.plotting()
Warning: variables dict for A_obs not provided, so defaults used
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/v1.0/lib/python3.11/site-packages/cartopy/io/__init__.py:241: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_coastline.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/v1.0/lib/python3.11/site-packages/cartopy/io/__init__.py:241: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_boundary_lines_land.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
/home/docs/checkouts/readthedocs.org/user_builds/melodies-monet/conda/v1.0/lib/python3.11/site-packages/cartopy/io/__init__.py:241: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_1_states_provinces_lines.zip
  warnings.warn(f'Downloading: {url}', DownloadWarning)
Warning: variables dict for B_obs not provided, so defaults used
CPU times: user 3.94 s, sys: 170 ms, total: 4.1 s
Wall time: 5.58 s
../_images/0d37e6e99b71fbcf742060b53a923e53c6e0157d200407f23d3d271d48949f11.png ../_images/9c14d1592da969c42e39fb06e813469bbc011684de209a82f345c9e3a1f7d37a.png ../_images/1e75fc56cf95f49c1d4825c264452afad8b62ee21ee9940e4aaa2fcb139e1281.png ../_images/cd5c182dbe95df9f60c6550ecf763054858da4e944907ff8912efd432a08fadd.png

Save/load paired data – netCDF

And compare to the original pair object.

from copy import deepcopy

p0 = deepcopy(an.paired['test_obs_test_model'].obj)

an.save_analysis()
an.read_analysis()
p1 = deepcopy(an.paired['test_obs_test_model'].obj)
p1.close()

display(p0)
display(p1)
assert p1 is not p0 and p1.equals(p0)
Writing: ./output/idealized/asdf_test_obs_test_model.nc4
Reading: ./output/idealized/asdf_test_obs_test_model.nc4
<xarray.Dataset> Size: 196kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    something_vlen  (time, x) object 36kB 'xicjs' 'cvoaglq' ... 'poqhzl'
    A_obs           (time, x) float64 36kB 0.21 0.1793 0.1735 ... 0.118 0.6552
    B_obs           (time, x) float64 36kB -0.1099 -0.2269 ... 0.1155 0.6365
    A               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    B               (time, x) float64 36kB 0.008033 0.001784 ... 0.5939 0.5151
    latitude        (x) float64 4kB 33.73 57.93 48.74 ... 21.26 58.92 59.42
    longitude       (x) float64 4kB -90.49 -106.9 -129.7 ... -65.03 -116.0
    siteid          (x) object 4kB '000' '001' '002' '003' ... '497' '498' '499'
<xarray.Dataset> Size: 288kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    something_vlen  (time, x) <U7 126kB ...
    A_obs           (time, x) float64 36kB ...
    B_obs           (time, x) float64 36kB ...
    A               (time, x) float64 36kB ...
    B               (time, x) float64 36kB ...
    latitude        (x) float64 4kB ...
    longitude       (x) float64 4kB ...
    siteid          (x) <U3 6kB ...
Attributes:
    title:         
    format:        NetCDF-4
    date_created:  2025-06-30
    dict_json:     {\n    "type": "pt_sfc",\n    "radius_of_influence": 10000...
    group_name:    test_obs_test_model

Save/load paired data – Python object

print(an.save)
an.save["paired"]["method"] = "pkl"
del an.save["paired"]["prefix"]
# We could leave `prefix` since unused, but we need to set `output_name`
an.save["paired"]["output_name"] = "asdf.joblib"
print("->", an.save)

print()
print(an.read)
an.read["paired"]["method"] = "pkl"
an.read["paired"]["filenames"] = "asdf.joblib"
print("->", an.read)

print()
an.save_analysis()
an.read_analysis()
p2 = deepcopy(an.paired['test_obs_test_model'].obj)
p2.close()

# display(p0)
display(p2)
assert p2 is not p0 and p2.equals(p0)
{'paired': {'method': 'netcdf', 'prefix': 'asdf', 'data': 'all'}}
-> {'paired': {'method': 'pkl', 'data': 'all', 'output_name': 'asdf.joblib'}}

{'paired': {'method': 'netcdf', 'filenames': {'test_obs_test_model': ['./output/idealized/asdf_test_obs_test_model.nc4']}}}
-> {'paired': {'method': 'pkl', 'filenames': 'asdf.joblib'}}

Writing: ./output/idealized/asdf.joblib
Reading: ./output/idealized/asdf.joblib
<xarray.Dataset> Size: 288kB
Dimensions:         (time: 9, x: 500)
Coordinates:
  * time            (time) datetime64[ns] 72B 2019-09-09 ... 2019-09-10
  * x               (x) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499
Data variables:
    something_vlen  (time, x) <U7 126kB ...
    A_obs           (time, x) float64 36kB ...
    B_obs           (time, x) float64 36kB ...
    A               (time, x) float64 36kB ...
    B               (time, x) float64 36kB ...
    latitude        (x) float64 4kB ...
    longitude       (x) float64 4kB ...
    siteid          (x) <U3 6kB ...
Attributes:
    title:         
    format:        NetCDF-4
    date_created:  2025-06-30
    dict_json:     {\n    "type": "pt_sfc",\n    "radius_of_influence": 10000...
    group_name:    test_obs_test_model