如何将大型多维数组分段写入 HDF5 文件?
How can I write a large multidimensional array to an HDF5 file in parts?
我在 C# 中使用 HDF5DotNet,有一个非常大的数组(几 GB)需要写入 HDF5 文件。它太大,无法整个放进内存,所以我每次只生成其中一个区域并将其写出,但希望读回时它仍然是一个完整的大数组。我知道 HDF5 可以做到这一点,但 .NET API 的文档比较少。
我写了一些简短的示例代码,其中包含一个 5 x 3 数组,其中填充了值 1..15:
// Dimensions of the demo dataset: the sample code builds a ROWS x COLS (5 x 3) ushort array.
const int ROWS = 5;
const int COLS = 3;
/// <summary>
/// Demo entry point: writes the sample array in a single call, writes it again row by row,
/// and finally reads the "array" dataset back from the file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Step 1: write the full array at once.
    WriteWholeArray();

    // Step 2: write the array one row at a time.
    WriteArrayByRows();

    // Step 3: read the dataset back; the result is not used any further here.
    ushort[,] roundTripped = ReadWholeArray();
}
/// <summary>
/// Creates (truncating) test.h5 on the desktop and writes a ROWS x COLS ushort array,
/// filled with 1, 2, 3, ... in row-major order, to the "array" dataset in one H5D.write call.
/// </summary>
/// <remarks>
/// Every HDF5 handle opened here (file, dataspace, dataset) is released in a finally block,
/// so nothing leaks when one of the HDF5 calls throws.
/// </remarks>
static void WriteWholeArray()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.create(path, H5F.CreateMode.ACC_TRUNC);
    try
    {
        H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
        try
        {
            H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
            try
            {
                // Fill the in-memory array with consecutive values starting at 1.
                ushort[,] array = new ushort[ROWS, COLS];
                ushort value = 1;
                for (int i = 0; i < array.GetLength(0); i++)
                {
                    for (int j = 0; j < array.GetLength(1); j++)
                    {
                        array[i, j] = value++;
                    }
                }

                H5D.write<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), new H5Array<ushort>(array));
            }
            finally
            {
                H5D.close(dataset);
            }
        }
        finally
        {
            // The original never closed the dataspace handle.
            H5S.close(dsi);
        }
    }
    finally
    {
        H5F.close(h5);
    }
}
// First attempt at writing the "array" dataset one row at a time.
// Per the accompanying text, reading the file back after this version yields
// misplaced values, zeros and garbage; it is kept as the failing example.
static void WriteArrayByRows()
{
H5FileId h5 = H5F.create(Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5"), H5F.CreateMode.ACC_TRUNC);
H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
// Build the full array in memory (values 1, 2, 3, ... in row-major order).
ushort[,] array = new ushort[ROWS, COLS];
ushort value = 1;
for (int i = 0; i < array.GetLength(0); i++)
{
for (int j = 0; j < array.GetLength(1); j++)
{
array[i, j] = value++;
}
}
for(int i = 0; i < array.GetLength(0); i++)
{
// Select a region of dsi using the arrays { i, 0 } and { 1, column count }
// (presumably start offset and extent of row i; confirm against the HDF5DotNet docs).
H5S.selectHyperslab(dsi, H5S.SelectOperator.SET, new long[] { i, 0 }, new long[] { 1, array.GetLength(1) });
// Copy row i into a separate 1 x columns buffer.
ushort[,] row = new ushort[1, array.GetLength(1)];
for(int j = 0; j < array.GetLength(1); j++)
{
row[0, j] = array[i, j];
}
// NOTE(review): this overload is given neither dsi nor a memory dataspace, so the
// hyperslab selected above is never passed to the write. This is presumably why the
// data ends up in the wrong place; confirm against the H5D.write overload documentation.
H5D.write<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), new H5Array<ushort>(row));
}
H5D.close(dataset);
H5F.close(h5);
}
/// <summary>
/// Opens test.h5 on the desktop read-only and reads the "array" dataset into a
/// ROWS x COLS ushort array.
/// </summary>
/// <returns>The managed array that was passed to H5D.read as the destination buffer.</returns>
/// <remarks>
/// The dataset and file handles are released in finally blocks so they do not leak
/// when opening or reading the dataset throws.
/// </remarks>
static ushort[,] ReadWholeArray()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.open(path, H5F.OpenMode.ACC_RDONLY);
    try
    {
        // H5Array wraps the managed array handed to H5D.read as the read target.
        ushort[,] array = new ushort[ROWS, COLS];
        H5Array<ushort> h5_array = new H5Array<ushort>(array);

        H5DataSetId dataset = H5D.open(h5, "array");
        try
        {
            H5D.read<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), h5_array);
        }
        finally
        {
            H5D.close(dataset);
        }

        return array;
    }
    finally
    {
        H5F.close(h5);
    }
}
当我一次写入整个数组时,它可以正常读取。当我尝试按行写入时,我读回的数组有一些正确的值(在错误的元素中)、一些零和一些疯狂的值(例如 43440)。有人可以告诉我如何正确执行此操作吗?
我弄明白了。显然,在写入数组的超切片(hyperslab)时,还需要第二个数据空间,用来描述内存中正在写入的那个数组。以下是更正后的函数:
/// <summary>
/// Creates (truncating) test.h5 on the desktop and writes the ROWS x COLS "array" dataset
/// one row at a time. For each row a hyperslab is selected in the file dataspace and the
/// row buffer is written through the H5D.write overload that takes both a memory
/// dataspace and a file dataspace.
/// </summary>
/// <remarks>
/// The memory dataspace, the transfer property list and the row buffer do not change
/// between iterations, so they are created once instead of once per row. All dataspace,
/// dataset and file handles are released in finally blocks.
/// </remarks>
static void WriteArrayByRows()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.create(path, H5F.CreateMode.ACC_TRUNC);
    try
    {
        // File dataspace: describes the full ROWS x COLS dataset.
        H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
        try
        {
            H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
            try
            {
                // Build the source data (values 1, 2, 3, ... in row-major order).
                ushort[,] array = new ushort[ROWS, COLS];
                ushort value = 1;
                for (int i = 0; i < array.GetLength(0); i++)
                {
                    for (int j = 0; j < array.GetLength(1); j++)
                    {
                        array[i, j] = value++;
                    }
                }

                int rowCount = array.GetLength(0);
                int colCount = array.GetLength(1);

                H5DataTypeId memType = new H5DataTypeId(H5T.H5Type.NATIVE_USHORT);
                H5PropertyListId pli = new H5PropertyListId(H5P.Template.DEFAULT);
                ushort[,] row = new ushort[1, colCount];

                // Memory dataspace: describes the 1 x colCount row buffer being written.
                H5DataSpaceId dsi2 = H5S.create_simple(2, new long[] { 1, colCount });
                try
                {
                    for (int i = 0; i < rowCount; i++)
                    {
                        // Point the file dataspace selection at row i.
                        H5S.selectHyperslab(dsi, H5S.SelectOperator.SET, new long[] { i, 0 }, new long[] { 1, colCount });

                        for (int j = 0; j < colCount; j++)
                        {
                            row[0, j] = array[i, j];
                        }

                        H5D.write<ushort>(dataset, memType, dsi2, dsi, pli, new H5Array<ushort>(row));
                    }
                }
                finally
                {
                    // The original created a new memory dataspace per row and never closed any of them.
                    H5S.close(dsi2);
                }
            }
            finally
            {
                H5D.close(dataset);
            }
        }
        finally
        {
            H5S.close(dsi);
        }
    }
    finally
    {
        H5F.close(h5);
    }
}
我还发现,写入大型数组时,分块(chunking)对获得良好的性能非常有帮助,下面是一个例子:
// Create the dataset with a chunked layout: each chunk is 1 x COLS.
// The property lists are kept in named locals so that they can be closed once the
// dataset has been created (the original left all three open).
H5PropertyListId pli = H5P.create(H5P.PropertyListClass.DATASET_CREATE); // added
H5PropertyListId linkCreate = H5P.create(H5P.PropertyListClass.LINK_CREATE);
H5PropertyListId datasetAccess = H5P.create(H5P.PropertyListClass.DATASET_ACCESS);
H5DataSetId dataset;
try
{
    H5P.setChunk(pli, new long[] { 1, COLS }); // added
    dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi, linkCreate, pli, datasetAccess); // modified
}
finally
{
    H5P.close(datasetAccess);
    H5P.close(linkCreate);
    H5P.close(pli);
}
我在 C# 中使用 HDF5DotNet,有一个非常大的数组(几 GB)需要写入 HDF5 文件。它太大,无法整个放进内存,所以我每次只生成其中一个区域并将其写出,但希望读回时它仍然是一个完整的大数组。我知道 HDF5 可以做到这一点,但 .NET API 的文档比较少。
我写了一些简短的示例代码,其中包含一个 5 x 3 数组,其中填充了值 1..15:
// Dimensions of the demo dataset: the sample code builds a ROWS x COLS (5 x 3) ushort array.
const int ROWS = 5;
const int COLS = 3;
/// <summary>
/// Demo entry point: writes the sample array in a single call, writes it again row by row,
/// and finally reads the "array" dataset back from the file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Step 1: write the full array at once.
    WriteWholeArray();

    // Step 2: write the array one row at a time.
    WriteArrayByRows();

    // Step 3: read the dataset back; the result is not used any further here.
    ushort[,] roundTripped = ReadWholeArray();
}
/// <summary>
/// Creates (truncating) test.h5 on the desktop and writes a ROWS x COLS ushort array,
/// filled with 1, 2, 3, ... in row-major order, to the "array" dataset in one H5D.write call.
/// </summary>
/// <remarks>
/// Every HDF5 handle opened here (file, dataspace, dataset) is released in a finally block,
/// so nothing leaks when one of the HDF5 calls throws.
/// </remarks>
static void WriteWholeArray()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.create(path, H5F.CreateMode.ACC_TRUNC);
    try
    {
        H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
        try
        {
            H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
            try
            {
                // Fill the in-memory array with consecutive values starting at 1.
                ushort[,] array = new ushort[ROWS, COLS];
                ushort value = 1;
                for (int i = 0; i < array.GetLength(0); i++)
                {
                    for (int j = 0; j < array.GetLength(1); j++)
                    {
                        array[i, j] = value++;
                    }
                }

                H5D.write<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), new H5Array<ushort>(array));
            }
            finally
            {
                H5D.close(dataset);
            }
        }
        finally
        {
            // The original never closed the dataspace handle.
            H5S.close(dsi);
        }
    }
    finally
    {
        H5F.close(h5);
    }
}
// First attempt at writing the "array" dataset one row at a time.
// Per the accompanying text, reading the file back after this version yields
// misplaced values, zeros and garbage; it is kept as the failing example.
static void WriteArrayByRows()
{
H5FileId h5 = H5F.create(Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5"), H5F.CreateMode.ACC_TRUNC);
H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
// Build the full array in memory (values 1, 2, 3, ... in row-major order).
ushort[,] array = new ushort[ROWS, COLS];
ushort value = 1;
for (int i = 0; i < array.GetLength(0); i++)
{
for (int j = 0; j < array.GetLength(1); j++)
{
array[i, j] = value++;
}
}
for(int i = 0; i < array.GetLength(0); i++)
{
// Select a region of dsi using the arrays { i, 0 } and { 1, column count }
// (presumably start offset and extent of row i; confirm against the HDF5DotNet docs).
H5S.selectHyperslab(dsi, H5S.SelectOperator.SET, new long[] { i, 0 }, new long[] { 1, array.GetLength(1) });
// Copy row i into a separate 1 x columns buffer.
ushort[,] row = new ushort[1, array.GetLength(1)];
for(int j = 0; j < array.GetLength(1); j++)
{
row[0, j] = array[i, j];
}
// NOTE(review): this overload is given neither dsi nor a memory dataspace, so the
// hyperslab selected above is never passed to the write. This is presumably why the
// data ends up in the wrong place; confirm against the H5D.write overload documentation.
H5D.write<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), new H5Array<ushort>(row));
}
H5D.close(dataset);
H5F.close(h5);
}
/// <summary>
/// Opens test.h5 on the desktop read-only and reads the "array" dataset into a
/// ROWS x COLS ushort array.
/// </summary>
/// <returns>The managed array that was passed to H5D.read as the destination buffer.</returns>
/// <remarks>
/// The dataset and file handles are released in finally blocks so they do not leak
/// when opening or reading the dataset throws.
/// </remarks>
static ushort[,] ReadWholeArray()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.open(path, H5F.OpenMode.ACC_RDONLY);
    try
    {
        // H5Array wraps the managed array handed to H5D.read as the read target.
        ushort[,] array = new ushort[ROWS, COLS];
        H5Array<ushort> h5_array = new H5Array<ushort>(array);

        H5DataSetId dataset = H5D.open(h5, "array");
        try
        {
            H5D.read<ushort>(dataset, new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), h5_array);
        }
        finally
        {
            H5D.close(dataset);
        }

        return array;
    }
    finally
    {
        H5F.close(h5);
    }
}
当我一次写入整个数组时,它可以正常读取。当我尝试按行写入时,我读回的数组有一些正确的值(在错误的元素中)、一些零和一些疯狂的值(例如 43440)。有人可以告诉我如何正确执行此操作吗?
我弄明白了。显然,在写入数组的超切片(hyperslab)时,还需要第二个数据空间,用来描述内存中正在写入的那个数组。以下是更正后的函数:
/// <summary>
/// Creates (truncating) test.h5 on the desktop and writes the ROWS x COLS "array" dataset
/// one row at a time. For each row a hyperslab is selected in the file dataspace and the
/// row buffer is written through the H5D.write overload that takes both a memory
/// dataspace and a file dataspace.
/// </summary>
/// <remarks>
/// The memory dataspace, the transfer property list and the row buffer do not change
/// between iterations, so they are created once instead of once per row. All dataspace,
/// dataset and file handles are released in finally blocks.
/// </remarks>
static void WriteArrayByRows()
{
    string path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test.h5");

    H5FileId h5 = H5F.create(path, H5F.CreateMode.ACC_TRUNC);
    try
    {
        // File dataspace: describes the full ROWS x COLS dataset.
        H5DataSpaceId dsi = H5S.create_simple(2, new long[] { ROWS, COLS });
        try
        {
            H5DataSetId dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi);
            try
            {
                // Build the source data (values 1, 2, 3, ... in row-major order).
                ushort[,] array = new ushort[ROWS, COLS];
                ushort value = 1;
                for (int i = 0; i < array.GetLength(0); i++)
                {
                    for (int j = 0; j < array.GetLength(1); j++)
                    {
                        array[i, j] = value++;
                    }
                }

                int rowCount = array.GetLength(0);
                int colCount = array.GetLength(1);

                H5DataTypeId memType = new H5DataTypeId(H5T.H5Type.NATIVE_USHORT);
                H5PropertyListId pli = new H5PropertyListId(H5P.Template.DEFAULT);
                ushort[,] row = new ushort[1, colCount];

                // Memory dataspace: describes the 1 x colCount row buffer being written.
                H5DataSpaceId dsi2 = H5S.create_simple(2, new long[] { 1, colCount });
                try
                {
                    for (int i = 0; i < rowCount; i++)
                    {
                        // Point the file dataspace selection at row i.
                        H5S.selectHyperslab(dsi, H5S.SelectOperator.SET, new long[] { i, 0 }, new long[] { 1, colCount });

                        for (int j = 0; j < colCount; j++)
                        {
                            row[0, j] = array[i, j];
                        }

                        H5D.write<ushort>(dataset, memType, dsi2, dsi, pli, new H5Array<ushort>(row));
                    }
                }
                finally
                {
                    // The original created a new memory dataspace per row and never closed any of them.
                    H5S.close(dsi2);
                }
            }
            finally
            {
                H5D.close(dataset);
            }
        }
        finally
        {
            H5S.close(dsi);
        }
    }
    finally
    {
        H5F.close(h5);
    }
}
我还发现,写入大型数组时,分块(chunking)对获得良好的性能非常有帮助,下面是一个例子:
// Create the dataset with a chunked layout: each chunk is 1 x COLS.
// The property lists are kept in named locals so that they can be closed once the
// dataset has been created (the original left all three open).
H5PropertyListId pli = H5P.create(H5P.PropertyListClass.DATASET_CREATE); // added
H5PropertyListId linkCreate = H5P.create(H5P.PropertyListClass.LINK_CREATE);
H5PropertyListId datasetAccess = H5P.create(H5P.PropertyListClass.DATASET_ACCESS);
H5DataSetId dataset;
try
{
    H5P.setChunk(pli, new long[] { 1, COLS }); // added
    dataset = H5D.create(h5, "array", new H5DataTypeId(H5T.H5Type.NATIVE_USHORT), dsi, linkCreate, pli, datasetAccess); // modified
}
finally
{
    H5P.close(datasetAccess);
    H5P.close(linkCreate);
    H5P.close(pli);
}