如何按秒对 xarray 进行上采样并包括边界时间
How to upsample xarray by seconds and include bounding hours
我有一个 xarray.DataArray
坐标像
ary["time"] = [
"2000-01-01T03:04:05", # leading records are missing,
"2000-01-01T03:04:06",
"2000-01-01T03:04:08", # some medium records are missing,
"2000-01-01T03:04:09",
"2000-01-01T03:04:11",
...
"2000-01-01T06:54:02",
"2000-01-01T06:54:03" # and trailing records are missing.
]
并想重新索引到
ary["time"] = [
"2000-01-01T03:00:00",
"2000-01-01T03:00:01",
"2000-01-01T03:00:02",
...
"2000-01-01T03:04:06",
"2000-01-01T03:04:07",
"2000-01-01T03:04:08",
"2000-01-01T03:04:09",
...
"2000-01-01T06:59:57",
"2000-01-01T06:59:58",
"2000-01-01T06:59:59"
]
并在所有缺失记录处设置 NaN。
我找到了 ary = ary.resample(time="1S").asfreq()
但它只补上了中间缺失的记录,开头和结尾缺失的记录并没有补上。
如何指定左右边界要对齐到整点(或者整分钟、整天)?
样本(取自gist):
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import xarray as xr
def make_ary():
    """Build a sample DataArray with gaps at the start, middle, and end.

    Timestamps are one-second offsets ``i`` from 2000-01-01T03:00:00 for
    ``i`` in ``range(300, 14000)``, skipping every offset where
    ``i % 3 == 2`` or ``i % 5 == 2``.  The values are random numbers drawn
    with ``np.random.rand`` (unseeded, so they differ between runs).

    Returns:
        A 1-D ``xr.DataArray`` with a single ``time`` coordinate.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    time = [
        start + timedelta(seconds=i)
        for i in range(300, 14000)
        if i % 3 != 2 and i % 5 != 2
    ]
    data = np.random.rand(len(time))
    return xr.DataArray(data=data, coords=[("time", time)], dims=["time"])
def make_expected():
    """Return the desired time index: every second of a four-hour window.

    The window starts at 2000-01-01T03:00:00 and contains ``4 * 60 * 60``
    consecutive one-second steps, so the last entry is 06:59:59.

    Returns:
        The index produced by ``pd.to_datetime`` for those timestamps.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    expected = [start + timedelta(seconds=i) for i in range(4 * 60 * 60)]
    return pd.to_datetime(np.array(expected))
def make_not_expected():
    """Return the time index that only fills the interior gaps.

    This is every second from offset 300 up to (but excluding) offset 14000
    after 2000-01-01T03:00:00: the range covered by the sample data, without
    the missing leading and trailing seconds.

    Returns:
        The index produced by ``pd.to_datetime`` for those timestamps.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    not_expected = [start + timedelta(seconds=i) for i in range(300, 14000)]
    return pd.to_datetime(np.array(not_expected))
def resample(ary):
    """Upsample ``ary`` along ``time`` to one-second steps without filling.

    Args:
        ary: Object exposing ``resample(time=...)`` whose result has
            ``asfreq()``.

    Returns:
        Whatever ``ary.resample(time="1s").asfreq()`` returns.
    """
    # Lowercase "s" is the non-deprecated spelling of the seconds alias.
    return ary.resample(time="1s").asfreq()
def main():
    """Print how the sample's time index compares before and after resampling.

    Three booleans are printed: the raw index against the desired index,
    the resampled index against the desired index, and the resampled index
    against the interior-only index.
    """
    ary = make_ary()
    expected = make_expected()
    not_expected = make_not_expected()

    print(np.array_equal(ary["time"].values, expected))  # False
    ary = resample(ary)
    # Compare plain arrays in all three checks.
    print(np.array_equal(ary["time"].values, expected))  # False
    print(np.array_equal(ary["time"].values, not_expected))  # True, but not expected


main()
一种实现方法是:在数组的开头和结尾各追加一个带有相应时间戳的 NaN 填充值,然后再使用 resample:
start_timestamp = "2000-01-01T03:00:00"
stop_timestamp = "2000-01-01T06:59:59"
# Pad with one NaN sample at each desired boundary so that the resampled
# range starts at start_timestamp and ends at stop_timestamp.
ary2 = xr.concat(
    [
        xr.DataArray(
            data=[np.nan],
            coords=[("time", pd.date_range(start=start_timestamp, freq="1s", periods=1))],
            dims=["time"],
        ),
        ary,
        xr.DataArray(
            data=[np.nan],
            coords=[("time", pd.date_range(start=stop_timestamp, freq="1s", periods=1))],
            dims=["time"],
        ),
    ],
    dim="time",
).resample(time="1s").asfreq()
给你:
# Print the time coordinate of the padded and resampled array.
print(ary2.time)
# Output as reported:
# <xarray.DataArray 'time' (time: 14400)>
# array(['2000-01-01T03:00:00.000000000', '2000-01-01T03:00:01.000000000',
# '2000-01-01T03:00:02.000000000', ..., '2000-01-01T06:59:57.000000000',
# '2000-01-01T06:59:58.000000000', '2000-01-01T06:59:59.000000000'],
# dtype='datetime64[ns]')
# Coordinates:
# * time (time) datetime64[ns] 2000-01-01T03:00:00 ... 2000-01-01T06:59:59
使用DataArray.reindex (Documentation)
在这种情况下,DataArray.reindex 可能是更好的选择。
在下面的代码示例中,目标时间范围用 date_range 指定。注意参数 closed(自 pandas 1.4 起改名为 inclusive)设置为 "left",因为我们不希望结果包含范围的右端点 "2000-01-01T07:00:00"。
start_time = "2000-01-01T03:00:00"
end_time = "2000-01-01T07:00:00"
# inclusive="left" keeps start_time and drops end_time itself.  Older pandas
# spelled this closed="left"; that keyword was removed in pandas 2.0.
target_times = pd.date_range(
    start=start_time, end=end_time, freq="1s", inclusive="left"
)
new_ary = ary.reindex(time=target_times)
# Show the reindexed time coordinate (not the original array).
print(new_ary.time)
这给出了以下输出:
<xarray.DataArray 'time' (time: 14400)>
array(['2000-01-01T03:00:00.000000000', '2000-01-01T03:00:01.000000000',
'2000-01-01T03:00:02.000000000', ..., '2000-01-01T06:59:57.000000000',
'2000-01-01T06:59:58.000000000', '2000-01-01T06:59:59.000000000'],
dtype='datetime64[ns]')
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:00:00 ... 2000-01-01T06:59:59
默认情况下,reindex 用 NaN 填充缺失值。下面测试代码的输出表明,"2000-01-01T03:08:06" 和 "2000-01-01T03:08:09" 之间的缺失值在新数组中被设置为 NaN。
# Two consecutive samples taken from the original array.
original_pair = ary[100:102]
# Non-NaN values start from index 300 in new_ary, so the matching span
# is addressed with different positions there.
reindexed_span = new_ary[486:490]
print(original_pair)
print(reindexed_span)
输出:
<xarray.DataArray (time: 2)>
array([0.25910861, 0.07897777])
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:08:06 2000-01-01T03:08:09
<xarray.DataArray (time: 4)>
array([0.25910861, nan, nan, 0.07897777])
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:08:06 ... 2000-01-01T03:08:09
我有一个 xarray.DataArray
坐标像
ary["time"] = [
"2000-01-01T03:04:05", # leading records are missing,
"2000-01-01T03:04:06",
"2000-01-01T03:04:08", # some medium records are missing,
"2000-01-01T03:04:09",
"2000-01-01T03:04:11",
...
"2000-01-01T06:54:02",
"2000-01-01T06:54:03" # and trailing records are missing.
]
并想重新索引到
ary["time"] = [
"2000-01-01T03:00:00",
"2000-01-01T03:00:01",
"2000-01-01T03:00:02",
...
"2000-01-01T03:04:06",
"2000-01-01T03:04:07",
"2000-01-01T03:04:08",
"2000-01-01T03:04:09",
...
"2000-01-01T06:59:57",
"2000-01-01T06:59:58",
"2000-01-01T06:59:59"
]
并在所有缺失记录处设置 NaN。
我找到了 ary = ary.resample(time="1S").asfreq()
但它只补上了中间缺失的记录,开头和结尾缺失的记录并没有补上。
如何指定左右边界要对齐到整点(或者整分钟、整天)?
样本(取自gist):
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import xarray as xr
def make_ary():
    """Build a sample DataArray with gaps at the start, middle, and end.

    Timestamps are one-second offsets ``i`` from 2000-01-01T03:00:00 for
    ``i`` in ``range(300, 14000)``, skipping every offset where
    ``i % 3 == 2`` or ``i % 5 == 2``.  The values are random numbers drawn
    with ``np.random.rand`` (unseeded, so they differ between runs).

    Returns:
        A 1-D ``xr.DataArray`` with a single ``time`` coordinate.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    time = [
        start + timedelta(seconds=i)
        for i in range(300, 14000)
        if i % 3 != 2 and i % 5 != 2
    ]
    data = np.random.rand(len(time))
    return xr.DataArray(data=data, coords=[("time", time)], dims=["time"])
def make_expected():
    """Return the desired time index: every second of a four-hour window.

    The window starts at 2000-01-01T03:00:00 and contains ``4 * 60 * 60``
    consecutive one-second steps, so the last entry is 06:59:59.

    Returns:
        The index produced by ``pd.to_datetime`` for those timestamps.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    expected = [start + timedelta(seconds=i) for i in range(4 * 60 * 60)]
    return pd.to_datetime(np.array(expected))
def make_not_expected():
    """Return the time index that only fills the interior gaps.

    This is every second from offset 300 up to (but excluding) offset 14000
    after 2000-01-01T03:00:00: the range covered by the sample data, without
    the missing leading and trailing seconds.

    Returns:
        The index produced by ``pd.to_datetime`` for those timestamps.
    """
    start = datetime(2000, 1, 1, 3, 0, 0)
    not_expected = [start + timedelta(seconds=i) for i in range(300, 14000)]
    return pd.to_datetime(np.array(not_expected))
def resample(ary):
    """Upsample ``ary`` along ``time`` to one-second steps without filling.

    Args:
        ary: Object exposing ``resample(time=...)`` whose result has
            ``asfreq()``.

    Returns:
        Whatever ``ary.resample(time="1s").asfreq()`` returns.
    """
    # Lowercase "s" is the non-deprecated spelling of the seconds alias.
    return ary.resample(time="1s").asfreq()
def main():
    """Print how the sample's time index compares before and after resampling.

    Three booleans are printed: the raw index against the desired index,
    the resampled index against the desired index, and the resampled index
    against the interior-only index.
    """
    ary = make_ary()
    expected = make_expected()
    not_expected = make_not_expected()

    print(np.array_equal(ary["time"].values, expected))  # False
    ary = resample(ary)
    # Compare plain arrays in all three checks.
    print(np.array_equal(ary["time"].values, expected))  # False
    print(np.array_equal(ary["time"].values, not_expected))  # True, but not expected


main()
一种实现方法是:在数组的开头和结尾各追加一个带有相应时间戳的 NaN 填充值,然后再使用 resample:
start_timestamp = "2000-01-01T03:00:00"
stop_timestamp = "2000-01-01T06:59:59"
# Pad with one NaN sample at each desired boundary so that the resampled
# range starts at start_timestamp and ends at stop_timestamp.
ary2 = xr.concat(
    [
        xr.DataArray(
            data=[np.nan],
            coords=[("time", pd.date_range(start=start_timestamp, freq="1s", periods=1))],
            dims=["time"],
        ),
        ary,
        xr.DataArray(
            data=[np.nan],
            coords=[("time", pd.date_range(start=stop_timestamp, freq="1s", periods=1))],
            dims=["time"],
        ),
    ],
    dim="time",
).resample(time="1s").asfreq()
给你:
# Print the time coordinate of the padded and resampled array.
print(ary2.time)
# Output as reported:
# <xarray.DataArray 'time' (time: 14400)>
# array(['2000-01-01T03:00:00.000000000', '2000-01-01T03:00:01.000000000',
# '2000-01-01T03:00:02.000000000', ..., '2000-01-01T06:59:57.000000000',
# '2000-01-01T06:59:58.000000000', '2000-01-01T06:59:59.000000000'],
# dtype='datetime64[ns]')
# Coordinates:
# * time (time) datetime64[ns] 2000-01-01T03:00:00 ... 2000-01-01T06:59:59
使用DataArray.reindex (Documentation)
在这种情况下,DataArray.reindex 可能是更好的选择。
在下面的代码示例中,目标时间范围用 date_range 指定。注意参数 closed(自 pandas 1.4 起改名为 inclusive)设置为 "left",因为我们不希望结果包含范围的右端点 "2000-01-01T07:00:00"。
start_time = "2000-01-01T03:00:00"
end_time = "2000-01-01T07:00:00"
# inclusive="left" keeps start_time and drops end_time itself.  Older pandas
# spelled this closed="left"; that keyword was removed in pandas 2.0.
target_times = pd.date_range(
    start=start_time, end=end_time, freq="1s", inclusive="left"
)
new_ary = ary.reindex(time=target_times)
# Show the reindexed time coordinate (not the original array).
print(new_ary.time)
这给出了以下输出:
<xarray.DataArray 'time' (time: 14400)>
array(['2000-01-01T03:00:00.000000000', '2000-01-01T03:00:01.000000000',
'2000-01-01T03:00:02.000000000', ..., '2000-01-01T06:59:57.000000000',
'2000-01-01T06:59:58.000000000', '2000-01-01T06:59:59.000000000'],
dtype='datetime64[ns]')
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:00:00 ... 2000-01-01T06:59:59
默认情况下,reindex 用 NaN 填充缺失值。下面测试代码的输出表明,"2000-01-01T03:08:06" 和 "2000-01-01T03:08:09" 之间的缺失值在新数组中被设置为 NaN。
# Two consecutive samples taken from the original array.
original_pair = ary[100:102]
# Non-NaN values start from index 300 in new_ary, so the matching span
# is addressed with different positions there.
reindexed_span = new_ary[486:490]
print(original_pair)
print(reindexed_span)
输出:
<xarray.DataArray (time: 2)>
array([0.25910861, 0.07897777])
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:08:06 2000-01-01T03:08:09
<xarray.DataArray (time: 4)>
array([0.25910861, nan, nan, 0.07897777])
Coordinates:
* time (time) datetime64[ns] 2000-01-01T03:08:06 ... 2000-01-01T03:08:09