如何从 C++ 中的 hdf5 文件中读取数据块?

How to read chunk of the data from a hdf5 file in c++?

我想读取一大块数据,它只是存储在一个数据集中的许多帧中的一帧。整个数据集的shape为(10, 11214,3),10帧,每帧11214行4列。这里是file。我要读取的块的形状为 (11214,3)。我可以使用打印预定义数组,但我不确定如何从 hdf5 文件中读取数据。这是我的代码,

#include <h5xx/h5xx.hpp>
#include <boost/multi_array.hpp>
#include <iostream>
#include <vector>
#include <cstdio>

typedef boost::multi_array<int, 2> array_2d_t;

const int NI=10;
const int NJ=NI;

void print_array(array_2d_t const& array)
{
    for (unsigned int j = 0; j < array.shape()[1]; j++)
    {
        for (unsigned int i = 0; i < array.shape()[0]; i++)
        {
            printf("%2d ", array[j][i]);
        }
        printf("\n");
    }
}
void write_int_data(std::string const& filename, array_2d_t const& array)
{
    h5xx::file file(filename, h5xx::file::trunc);
    std::string name;

    {
        // --- create dataset and fill it with the default array data (positive values)
        name = "integer array";
        h5xx::create_dataset(file, name, array);
        h5xx::write_dataset(file, name, array);

        // --- create a slice object (aka hyperslab) to specify the location in the dataset to be overwritten
        std::vector<int> offset; int offset_raw[2] = {4,4}; offset.assign(offset_raw, offset_raw + 2);
        std::vector<int> count;  int count_raw[2] = {2,2}; count.assign(count_raw, count_raw + 2);
        h5xx::slice slice(offset, count);
    }
}
void read_int_data(std::string const& filename)
{
    h5xx::file file(filename, h5xx::file::in);
    std::string name = "integer array";

    // read and print the full dataset
    {
        array_2d_t array;
        // --- read the complete dataset into array, the array is resized and overwritten internally
        h5xx::read_dataset(file, name, array);
        printf("original integer array read from file, negative number patch was written using a slice\n");
        print_array(array);
        printf("\n");
    }
}
int main(int argc, char** argv)
{
    std::string filename = argv[0];
    filename.append(".h5");

    // --- do a few demos/tests using integers
    {
        array_2d_t array(boost::extents[NJ][NI]);
        {
            const int nelem = NI*NJ;
            int data[nelem];
            for (int i = 0; i < nelem; i++)
                data[i] = i;
            array.assign(data, data + nelem);
        }

        write_int_data(filename, array);

        read_int_data(filename);
    }

    return 0;
}

我正在使用 h5xx — HDF5 库 link 和 boost 库的基于模板的 C++ 包装器。 数据集存储在 particles/lipids/box/positions 路径中。数据集名称值包含帧。

  1. argv[0] 不是你想要的(参数从 1 开始,0 是程序名称)。还要考虑边界检查:

    std::vector<std::string> const args(argv, argv + argc);
    std::string const              filename = args.at(1) + ".h5";
    
  2. 直接初始化就可以了,不需要临时数组(multi_array干什么,不然?)

    for (size_t i = 0; i < array.num_elements(); i++)
        array.data()[i] = i;
    
  3. 或者实际上,让它成为一个算法:

     std::iota(array.data(), array.data() + array.num_elements(), 0);
    
  4. 与矢量相同:

    std::vector<int> offset; int offset_raw[2] = {4,4}; offset.assign(offset_raw, offset_raw + 2);
    std::vector<int> count;  int count_raw[2] = {2,2}; count.assign(count_raw, count_raw + 2);
    

    除了格式混乱之外,还可以简单地

    std::vector offset{4,4}, count{2,2};
    h5xx::slice slice(offset, count);
    

进入真正的问题

代码与文件无关。完全没有。我创建了一些 debug/tracing 代码来转储文件内容:

void dump(h5xx::group const& g, std::string indent = "") {
    auto dd = g.datasets();
    auto gg = g.groups();

    for (auto it = dd.begin(); it != dd.end(); ++it) {
        std::cout << indent << " ds:" << it.get_name() << "\n";
    }

    for (auto it = gg.begin(); it != gg.end(); ++it) {
        dump(*it, indent + "/" + it.get_name());
    }
}

int main()
{
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    dump(xaa);
}

版画

/particles/lipids/box/edges ds:box_size
/particles/lipids/box/edges ds:step
/particles/lipids/box/edges ds:time
/particles/lipids/box/edges ds:value
/particles/lipids/box/positions ds:step
/particles/lipids/box/positions ds:time
/particles/lipids/box/positions ds:value

现在我们可以深入到数据集。让我们看看是否可以找出正确的类型。它肯定不是 array_2d_t:

h5xx::dataset ds(xaa, "particles/lipids/box/positions/value");

array_2d_t a;
h5xx::datatype detect(a);
std::cout << "type:   " << std::hex << ds.get_type() << std::dec << "\n";
std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << "\n";

版画

type:   30000000000013b
detect: 30000000000000c

这是类型不匹配。我想我也必须学会阅读那些乱码...

让我们添加一些诊断:

void diag_type(hid_t type)
{
    std::cout << " Class     " << ::H5Tget_class(type)       << std::endl;
    std::cout << " Size      " << ::H5Tget_size(type)        << std::endl;
    std::cout << " Sign      " << ::H5Tget_sign(type)        << std::endl;
    std::cout << " Order     " << ::H5Tget_order(type)       << std::endl;
    std::cout << " Precision " << ::H5Tget_precision(type)   << std::endl;
    std::cout << " NDims     " << ::H5Tget_array_ndims(type) << std::endl;
    std::cout << " NMembers  " << ::H5Tget_nmembers(type)    << std::endl;
}

int main()
{
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
    // dump(xaa);

    {
        h5xx::group g(xaa, "particles/lipids/box/positions");
        h5xx::dataset ds(g, "value");
        std::cout << "dataset:   " << std::hex << ds.get_type() << std::dec << std::endl;
        diag_type(ds.get_type());
    }

    {
        array_2d_t     a(boost::extents[NJ][NI]);
        h5xx::datatype detect(a);
        std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << std::endl;
        diag_type(detect.get_type_id());
    }
}

版画

dataset:   30000000000013b
 Class     1
 Size      4
 Sign      -1
 Order     0
 Precision 32
 NDims     -1
 NMembers  -1
detect: 30000000000000c
 Class     0
 Size      4
 Sign      1
 Order     0
 Precision 32
 NDims     -1
 NMembers  -1

至少我们知道 HST_FLOAT (class 1) 是必需的。让我们修改 array_2d_t:

using array_2d_t = boost::multi_array<float, 2>;
array_2d_t a(boost::extents[11214][3]);

这至少使数据看起来相似。让我们...天真地尝试阅读:

h5xx::read_dataset(ds, a);

哎呀,可预见的抛出

terminate called after throwing an instance of 'h5xx::error'
  what():  /home/sehe/Projects/Whosebug/deps/h5xx/h5xx/dataset/boost_multi_array.hpp:176:read_dataset(): dataset "/particles/lipi
ds/box/positions/value" and target array have mismatching dimensions

不用担心,我们可以猜到:

using array_3d_t = boost::multi_array<float, 3>;
array_3d_t     a(boost::extents[10][11214][3]);
h5xx::read_dataset(ds, a);

至少这个确实有效。调整打印功能:

template <typename T> void print_array(T const& array) {
    for (auto const& row : array) {
        for (auto v : row) printf("%5f ", v);
        printf("\n");
    }
}

现在我们可以打印第一帧了:

h5xx::read_dataset(ds, a);
print_array(*a.begin()); // print the first frame

这会打印:

80.480003 35.360001 4.250000
37.450001 3.920000 3.960000
18.530001 -9.690000 4.680000
55.389999 74.339996 4.600000
22.110001 68.709999 3.850000
-4.130000 24.040001 3.730000
40.160000 6.390000 4.730000
-5.400000 35.730000 4.850000
36.669998 22.450001 4.080000
-3.680000 -10.660000 4.180000
(...)

结帐 h5ls -r -d xaa.h5/particles/lipids/box/positions/value:

particles/lipids/box/positions/value Dataset {75/Inf, 11214, 3}
    Data:
        (0,0,0) 80.48, 35.36, 4.25, 37.45, 3.92, 3.96, 18.53, -9.69, 4.68,
        (0,3,0) 55.39, 74.34, 4.6, 22.11, 68.71, 3.85, -4.13, 24.04, 3.73,
        (0,6,0) 40.16, 6.39, 4.73, -5.4, 35.73, 4.85, 36.67, 22.45, 4.08, -3.68,
        (0,9,1) -10.66, 4.18, 35.95, 36.43, 5.15, 57.17, 3.88, 5.08, -23.64,
        (0,12,1) 50.44, 4.32, 6.78, 8.24, 4.36, 21.34, 50.63, 5.21, 16.29,
        (0,15,1) -1.34, 5.28, 22.26, 71.25, 5.4, 19.76, 10.38, 5.34, 78.62,
        (0,18,1) 11.13, 5.69, 22.14, 59.7, 4.92, 15.65, 47.28, 5.22, 82.41,
        (0,21,1) 2.09, 5.24, 16.87, -11.68, 5.35, 15.54, -0.63, 5.2, 81.25,
(...)

最后一步:添加切片

array_2d_t read_frame(int frame_no) {
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    h5xx::group   g(xaa, "particles/lipids/box/positions");
    h5xx::dataset ds(g, "value");

    array_2d_t a(boost::extents[11214][3]);

    std::vector offsets{frame_no, 0, 0}, counts{1, 11214, 3};
    h5xx::slice slice(offsets, counts);

    h5xx::read_dataset(ds, a, slice);
    return a;
}

给你。现在我们可以打印任何帧了:

print_array(read_frame(0));

打印和以前一样。让我们试试最后一帧:

print_array(read_frame(9));

版画

79.040001 36.349998 3.990000
37.250000 3.470000 4.140000
18.600000 -9.270000 4.900000
55.669998 75.070000 5.370000
21.920000 67.709999 3.790000
-4.670000 24.770000 3.690000
40.000000 6.060000 5.240000
-5.340000 36.320000 5.410000
36.369999 22.490000 4.130000
-3.520000 -10.430000 4.280000
(...)

再次检查 h5ls -r -d xaa.h5/particles/lipids/box/positions/value |& grep '(9' | head 确认:

(9,0,0) 79.04, 36.35, 3.99, 37.25, 3.47, 4.14, 18.6, -9.27, 4.9, 55.67,
(9,3,1) 75.07, 5.37, 21.92, 67.71, 3.79, -4.67, 24.77, 3.69, 40, 6.06,
(9,6,2) 5.24, -5.34, 36.32, 5.41, 36.37, 22.49, 4.13, -3.52, -10.43,
(9,9,2) 4.28, 35.8, 36.43, 4.99, 56.6, 4.09, 5.04, -23.37, 49.42, 3.81,
(9,13,0) 6.31, 8.83, 4.56, 22.01, 50.38, 5.43, 16.3, -2.92, 5.4, 22.02,
(9,16,1) 70.09, 5.36, 20.23, 11.12, 5.66, 78.48, 11.34, 6.09, 20.26,
(9,19,1) 61.45, 5.35, 14.25, 48.32, 5.35, 79.95, 1.71, 5.38, 17.56,
(9,22,1) -11.61, 5.39, 15.64, -0.19, 5.06, 80.43, 71.77, 5.29, 75.54,
(9,25,1) 35.14, 5.26, 22.45, 56.86, 5.56, 16.47, 52.97, 6.16, 20.62,
(9,28,1) 65.12, 5.26, 19.68, 71.2, 5.52, 23.39, 49.84, 5.28, 22.7,

完整列表

#include <boost/multi_array.hpp>
#include <h5xx/h5xx.hpp>
#include <iostream>

using array_2d_t = boost::multi_array<float, 2>;

template <typename T> void print_array(T const& array)
{
    for (auto const& row : array) { for (auto v : row)
            printf("%5f ", v);
        printf("\n");
    }
}

void dump(h5xx::group const& g, std::string indent = "") {
    auto dd = g.datasets();
    auto gg = g.groups();

    for (auto it = dd.begin(); it != dd.end(); ++it) {
        std::cout << indent << " ds:" << it.get_name() << std::endl;
    }

    for (auto it = gg.begin(); it != gg.end(); ++it) {
        dump(*it, indent + "/" + it.get_name());
    }
}

array_2d_t read_frame(int frame_no) {
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    h5xx::group   g(xaa, "particles/lipids/box/positions");
    h5xx::dataset ds(g, "value");

    array_2d_t arr(boost::extents[11214][3]);

    std::vector offsets{frame_no, 0, 0}, counts{1, 11214, 3};
    h5xx::slice slice(offsets, counts);

    h5xx::read_dataset(ds, arr, slice);
    return arr;
}

int main()
{
    print_array(read_frame(9));
}