xarray:通过 OPeNDAP 存储的数据的平均值

xarray: mean of data stored via OPeNDAP

我正在使用 xarray 非常酷的 pydap 后端 (http://xarray.pydata.org/en/stable/io.html#opendap) 来读取通过 IRI 的 OPenDAP 存储的数据:

import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
print(remote_data)
#<xarray.DataArray 'zg' (P: 2, S: 6569, M: 3, L: 45, Y: 181, X: 360)>
#[115569730800 values with dtype=float32]
#Coordinates:
# * L        (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
# * Y        (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
# * S        (S) datetime64[ns] 1999-01-07 1999-01-08 1999-01-09 1999-01-10 ...
# * M        (M) float32 1.0 2.0 3.0
# * X        (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
# * P        (P) int32 500 200
#Attributes:
#    level_type:     pressure level
#    standard_name:  geopotential_height
#    long_name:      Geopotential Height
#    units:          m

作为参考,它是次季节预测数据,其中 L 是提前期(45 天预测),S 是初始化日期,M 是合奏。

我想做一个集合平均值,我只对 500 hPa 水平感兴趣。但是,它崩溃并给出 RuntimeError: NetCDF: Access failure:

da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')

RuntimeError                              Traceback (most recent call last)
<ipython-input-46-eca488e9def5> in <module>()
      1 remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models'                                '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
      2 da = remote_data.sel(P=500)
----> 3 da_ensmean = da.mean(dim='M')

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
     20                              keep_attrs=False, **kwargs):
     21                 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22                                    skipna=skipna, allow_lazy=True, **kwargs)
     23         else:
     24             def wrapped_func(self, dim=None, axis=None, keep_attrs=False,

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in reduce(self, func, dim, axis, keep_attrs, **kwargs)
   1359             summarized data and the indicated dimension(s) removed.
   1360         """
-> 1361         var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
   1362         return self._replace_maybe_drop_dims(var)
   1363 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, allow_lazy, **kwargs)
   1264         if dim is not None:
   1265             axis = self.get_axis_num(dim)
-> 1266         data = func(self.data if allow_lazy else self.values,
   1267                     axis=axis, **kwargs)
   1268 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in data(self)
    293             return self._data
    294         else:
--> 295             return self.values
    296 
    297     @data.setter

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in values(self)
    385     def values(self):
    386         """The variable's data as a numpy.ndarray"""
--> 387         return _as_array_or_item(self._data)
    388 
    389     @values.setter

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in _as_array_or_item(data)
    209     TODO: remove this (replace with np.asarray) once these issues are fixed
    210     """
--> 211     data = np.asarray(data)
    212     if data.ndim == 0:
    213         if data.dtype.kind == 'M':

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    622 
    623     def __array__(self, dtype=None):
--> 624         self._ensure_cached()
    625         return np.asarray(self.array, dtype=dtype)
    626 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in _ensure_cached(self)
    619     def _ensure_cached(self):
    620         if not isinstance(self.array, NumpyIndexingAdapter):
--> 621             self.array = NumpyIndexingAdapter(np.asarray(self.array))
    622 
    623     def __array__(self, dtype=None):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    600 
    601     def __array__(self, dtype=None):
--> 602         return np.asarray(self.array, dtype=dtype)
    603 
    604     def __getitem__(self, key):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    506     def __array__(self, dtype=None):
    507         array = as_indexable(self.array)
--> 508         return np.asarray(array[self.key], dtype=None)
    509 
    510     def transpose(self, order):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in __getitem__(self, key)
     64 
     65     def __getitem__(self, key):
---> 66         return self.func(self.array[key])
     67 
     68     def __repr__(self):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in _apply_mask(data, encoded_fill_values, decoded_fill_value, dtype)
    133     for fv in encoded_fill_values:
    134         condition |= data == fv
--> 135     data = np.asarray(data, dtype=dtype)
    136     return np.where(condition, decoded_fill_value, data)
    137 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    506     def __array__(self, dtype=None):
    507         array = as_indexable(self.array)
--> 508         return np.asarray(array[self.key], dtype=None)
    509 
    510     def transpose(self, order):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/netCDF4_.py in __getitem__(self, key)
     63         with self.datastore.ensure_open(autoclose=True):
     64             try:
---> 65                 array = getitem(self.get_array(), key.tuple)
     66             except IndexError:
     67                 # Catch IndexError in netCDF4 and return a more informative

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/common.py in robust_getitem(array, key, catch, max_retries, initial_delay)
    114     for n in range(max_retries + 1):
    115         try:
--> 116             return array[key]
    117         except catch:
    118             if n == max_retries:

netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable.__getitem__()

netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable._get()

netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()

RuntimeError: NetCDF: Access failure

分解计算会删除 RuntimeError。猜猜所有开始时间的计算量太大了。在 S:

上循环应该不会太难
da = remote_data.isel(P=0,S=0)
da_ensmean = da.mean(dim='M')
print(da_ensmean)
<xarray.DataArray 'zg' (L: 45, Y: 181, X: 360)>
array([[[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
        [5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
        ...,
        [5056.2383, 5056.2383, ..., 5056.2383, 5056.2383],
        [5056.2383, 5056.2383, ..., 5056.2383, 5056.2383]],

       [[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
        [5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
        ...,
        [5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ],
        [5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ]],

       ...,

       [[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
        [5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
        ...,
        [5154.2173, 5154.2173, ..., 5154.2173, 5154.2173],
        [5154.2173, 5154.2173, ..., 5154.2173, 5154.2173]],

       [[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
        [5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
        ...,
        [5226.0063, 5226.0063, ..., 5226.0063, 5226.0063],
        [5226.0063, 5226.0063, ..., 5226.0063, 5226.0063]]], dtype=float32)
Coordinates:
  * L        (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
  * Y        (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
    S        datetime64[ns] 1999-01-07
  * X        (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
    P        int32 500

这是使用 dask 进行分块的一个很好的用例,例如,

import xarray as xr
url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
remote_data = xr.open_dataarray(url, chunks={'S': 1, 'L': 1})
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')

此版本将使用许多较小的数据块并行访问数据服务器。下载231GB的数据还是会很慢,但是你的请求成功几率会大很多。