如何在 HDF5 中写入固定长度的字符串?
How to write fixed length strings in HDF5?
我有一些 C++ 代码调用 HDF5 C API 来写出固定长度的字符串。出于某种原因,结果完全是垃圾。这是 h5dump 的示例:
DATASET "simple" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 100 ) / ( 100 ) }
DATA {
(0): "[=10=]16", "", "7777776357", "", "y6", "",
(6): "7777776357", "", "7777777616", "",
(10): "7777776357", "", "i7", "", "7777776357", "",
(16): "7777777417", "", "7777776357", "", "Y0", "",
(22): "7777776357", "", "7777777210", "",
(26): "7777776357", "", "I1", "", "7777776357", "",
(32): "7777777011", "", "7777776357", "", "92", "",
但是如果我将大小更改为 H5Tset_size (datatype_id, H5T_VARIABLE);
,输出看起来符合预期。
--- 编辑 ----
这是一个只有 C 语言的小例子,它有同样的问题。它可以用 h5cc
:
构建
#include <string.h>
#include <stdio.h>
#include "hdf5.h"
#define FILE "chard.h5"
int main() {
hid_t file_id, dataset_id, dataspace_id; /* identifiers */
herr_t status;
hid_t dtype;
size_t size;
file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hsize_t dims[1] = {4};
dataspace_id = H5Screate_simple(1, dims, NULL);
dtype = H5Tcopy (H5T_C_S1);
size = 5;
status = H5Tset_size (dtype, size);
char *strs[4] = {
"this",
"this",
"this",
"this"};
dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);
status = H5Dclose(dataset_id);
status = H5Sclose(dataspace_id);
status = H5Fclose(file_id);
}
--- 结束编辑 ---
这是我的代码:
#include <iostream>
#include <assert.h>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "hdf5.h"
const char** vec_to_ptr(const std::vector<std::string>& v) {
const char** ret;
ret = new const char*[v.size()];
for (size_t i = 0; i < v.size(); ++i) {
ret[i] = v[i].c_str();
}
return ret;
}
hid_t get_datatype_id(const std::vector<std::string>& v) {
hid_t datatype_id = H5Tcopy (H5T_C_S1);
/* H5Tset_size (datatype_id, H5T_VARIABLE); */
// If I replace the H5Tset_size line below with the commented one above, this
// code works well... but compression isn't nearly as good since almost all
// my strings are of the same length
herr_t status = H5Tset_size (datatype_id, 10);
assert( status >= 0 );
v.size(); // shutup, compiler
return datatype_id;
}
// str_vec: a vector of string to be written out
// group_id: a group_id which has already been opened
// dataset_name: the to write out to
// release_type: if 'true', release the datatype
// compression_level: the level of compression (6 seems reasonable)
//
// return: the status of H5Dwrite (last H5 operation)
template <typename T>
herr_t data_to_h5(
const std::vector<T>& str_vec,
hid_t group_id,
const std::string& dataset_name,
bool release_type,
uint compression_level = 6
) {
herr_t status;
hsize_t dims[1] = {str_vec.size()};
// create the propery which allows for compression
hid_t prop_id = H5Pcreate(H5P_DATASET_CREATE);
// chunk size is same size as vector
status = H5Pset_chunk(prop_id, 1, dims);
assert( status >= 0 );
// compress using gzip
status = H5Pset_deflate(prop_id, compression_level);
assert( status >= 0 );
// create the data type
hid_t datatype_id = get_datatype_id(str_vec);
// create the dataspace
hid_t dataspace_id = H5Screate_simple(1, dims, NULL);
// create the dataset
hid_t dataset_id = H5Dcreate(group_id, dataset_name.c_str(), datatype_id,
dataspace_id, H5P_DEFAULT, prop_id, H5P_DEFAULT);
// get the ptrs from the string and write out
auto ptr = vec_to_ptr(str_vec);
status = H5Dwrite(dataset_id, datatype_id, H5S_ALL, H5S_ALL, H5P_DEFAULT,
ptr);
assert( status >= 0 );
status = H5Pclose(prop_id);
assert( status >= 0 );
status = H5Dclose(dataset_id);
assert( status >= 0 );
status = H5Sclose(dataspace_id);
assert( status >= 0 );
if (release_type) {
status = H5Tclose(datatype_id);
assert( status >= 0 );
delete [] ptr;
}
return status;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
std::cerr << "Usage: h5_string output.h5" << std::endl;
return 1;
}
std::string fname(argv[1]);
std::cout << "Opening file " << fname << std::endl;
std::vector<std::string> str_vec;
// make a bunch of strings
const auto NSTR = 100;
for (auto i = 0; i < NSTR; ++i) {
std::string cur_str = std::to_string( i );
str_vec.push_back( cur_str );
}
hid_t file_id;
file_id = H5Fcreate(fname.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
herr_t status;
hid_t root;
root = H5Gopen(file_id, "/", H5P_DEFAULT);
assert(status >= 0);
status = data_to_h5(str_vec, root, "simple", true, 6);
assert(status >= 0);
status = H5Gclose(root);
assert(status >= 0);
status = H5Fclose(file_id);
assert(status >= 0);
return 0;
}
和对应的CMakeLists.txt:
find_package(HDF5)
add_executable(h5_string h5_string.cpp)
if(HDF5_FOUND)
include_directories( ${HDF5_INCLUDE_DIR} )
target_link_libraries( h5_string ${HDF5_LIBRARIES} )
else()
message(FATAL_ERROR "HDF5 not found. Required to output files")
endif()
非常感谢任何帮助。提前致谢!
您的代码在设置大小方面工作正常(但我认为您偏离了一个)。真正的问题在于如何定义字符串。
char *strs[4] = {
"this",
"this",
"this",
"this"};
您将它们定义为指针数组与二维数组:
char strs[4][4] = {
"this",
"this",
"this",
"this"};
并且还修复了一个:
dtype = H5Tcopy (H5T_C_S1);
size = 4;
status = H5Tset_size (dtype, size);
我也倾向于关闭数据类型。
status = H5Tclose(dtype);
这是您的完整测试程序:
#include <string.h>
#include <stdio.h>
#include "hdf5.h"
#define FILE "chard.h5"
int main() {
hid_t file_id, dataset_id, dataspace_id; /* identifiers */
herr_t status;
hid_t dtype;
size_t size;
file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hsize_t dims[1] = {4};
dataspace_id = H5Screate_simple(1, dims, NULL);
dtype = H5Tcopy (H5T_C_S1);
size = 4 * sizeof(char);
status = H5Tset_size (dtype, size);
char strs[4][4] = {
"this",
"this",
"this",
"this"};
dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);
status = H5Dclose(dataset_id);
status = H5Sclose(dataspace_id);
status = H5Tclose(dtype);
status = H5Fclose(file_id);
}
如果编译,运行 并转储输出:
~$ h5pcc -g -O0 -Wall -o foo foo.c
~$ ./foo
~$ h5dump chard.h5
HDF5 "chard.h5" {
GROUP "/" {
DATASET "simp" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 4 ) / ( 4 ) }
DATA {
(0): "this", "this", "this", "this"
}
}
}
}
在 C++ 中的 HDF5 中编写固定长度字符串的另一种(更容易)方法是使用 HDFql。使用 HDFql,可以按如下方式执行此操作:
// declare variable 'data'
char data[4][4];
// populate variable 'data' with values
memcpy(data[0], "this", 4);
memcpy(data[1], "that", 4);
memcpy(data[2], "what", 4);
memcpy(data[3], "when", 4);
// create an HDF5 file named 'chard.h5' and use (i.e. open) it
HDFql::execute("create and use file chard.h5");
// register variable 'data' for subsequent usage (by HDFql)
HDFql::variableRegister(data);
// create a dataset named 'simp' of fixed-length string and write content of 'data' into it
HDFql::execute("create dataset simp as char(4, 4) values from memory 0");
我有一些 C++ 代码调用 HDF5 C API 来写出固定长度的字符串。出于某种原因,结果完全是垃圾。这是 h5dump 的示例:
DATASET "simple" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 100 ) / ( 100 ) }
DATA {
(0): "[=10=]16", "", "7777776357", "", "y6", "",
(6): "7777776357", "", "7777777616", "",
(10): "7777776357", "", "i7", "", "7777776357", "",
(16): "7777777417", "", "7777776357", "", "Y0", "",
(22): "7777776357", "", "7777777210", "",
(26): "7777776357", "", "I1", "", "7777776357", "",
(32): "7777777011", "", "7777776357", "", "92", "",
但是如果我将大小更改为 H5Tset_size (datatype_id, H5T_VARIABLE);
,输出看起来符合预期。
--- 编辑 ----
这是一个只有 C 语言的小例子,它有同样的问题。它可以用 h5cc
:
#include <string.h>
#include <stdio.h>
#include "hdf5.h"
#define FILE "chard.h5"
int main() {
hid_t file_id, dataset_id, dataspace_id; /* identifiers */
herr_t status;
hid_t dtype;
size_t size;
file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hsize_t dims[1] = {4};
dataspace_id = H5Screate_simple(1, dims, NULL);
dtype = H5Tcopy (H5T_C_S1);
size = 5;
status = H5Tset_size (dtype, size);
char *strs[4] = {
"this",
"this",
"this",
"this"};
dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);
status = H5Dclose(dataset_id);
status = H5Sclose(dataspace_id);
status = H5Fclose(file_id);
}
--- 结束编辑 ---
这是我的代码:
#include <iostream>
#include <assert.h>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "hdf5.h"
const char** vec_to_ptr(const std::vector<std::string>& v) {
const char** ret;
ret = new const char*[v.size()];
for (size_t i = 0; i < v.size(); ++i) {
ret[i] = v[i].c_str();
}
return ret;
}
hid_t get_datatype_id(const std::vector<std::string>& v) {
hid_t datatype_id = H5Tcopy (H5T_C_S1);
/* H5Tset_size (datatype_id, H5T_VARIABLE); */
// If I replace the H5Tset_size line below with the commented one above, this
// code works well... but compression isn't nearly as good since almost all
// my strings are of the same length
herr_t status = H5Tset_size (datatype_id, 10);
assert( status >= 0 );
v.size(); // shutup, compiler
return datatype_id;
}
// str_vec: a vector of string to be written out
// group_id: a group_id which has already been opened
// dataset_name: the to write out to
// release_type: if 'true', release the datatype
// compression_level: the level of compression (6 seems reasonable)
//
// return: the status of H5Dwrite (last H5 operation)
template <typename T>
herr_t data_to_h5(
const std::vector<T>& str_vec,
hid_t group_id,
const std::string& dataset_name,
bool release_type,
uint compression_level = 6
) {
herr_t status;
hsize_t dims[1] = {str_vec.size()};
// create the propery which allows for compression
hid_t prop_id = H5Pcreate(H5P_DATASET_CREATE);
// chunk size is same size as vector
status = H5Pset_chunk(prop_id, 1, dims);
assert( status >= 0 );
// compress using gzip
status = H5Pset_deflate(prop_id, compression_level);
assert( status >= 0 );
// create the data type
hid_t datatype_id = get_datatype_id(str_vec);
// create the dataspace
hid_t dataspace_id = H5Screate_simple(1, dims, NULL);
// create the dataset
hid_t dataset_id = H5Dcreate(group_id, dataset_name.c_str(), datatype_id,
dataspace_id, H5P_DEFAULT, prop_id, H5P_DEFAULT);
// get the ptrs from the string and write out
auto ptr = vec_to_ptr(str_vec);
status = H5Dwrite(dataset_id, datatype_id, H5S_ALL, H5S_ALL, H5P_DEFAULT,
ptr);
assert( status >= 0 );
status = H5Pclose(prop_id);
assert( status >= 0 );
status = H5Dclose(dataset_id);
assert( status >= 0 );
status = H5Sclose(dataspace_id);
assert( status >= 0 );
if (release_type) {
status = H5Tclose(datatype_id);
assert( status >= 0 );
delete [] ptr;
}
return status;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
std::cerr << "Usage: h5_string output.h5" << std::endl;
return 1;
}
std::string fname(argv[1]);
std::cout << "Opening file " << fname << std::endl;
std::vector<std::string> str_vec;
// make a bunch of strings
const auto NSTR = 100;
for (auto i = 0; i < NSTR; ++i) {
std::string cur_str = std::to_string( i );
str_vec.push_back( cur_str );
}
hid_t file_id;
file_id = H5Fcreate(fname.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
herr_t status;
hid_t root;
root = H5Gopen(file_id, "/", H5P_DEFAULT);
assert(status >= 0);
status = data_to_h5(str_vec, root, "simple", true, 6);
assert(status >= 0);
status = H5Gclose(root);
assert(status >= 0);
status = H5Fclose(file_id);
assert(status >= 0);
return 0;
}
和对应的CMakeLists.txt:
find_package(HDF5)
add_executable(h5_string h5_string.cpp)
if(HDF5_FOUND)
include_directories( ${HDF5_INCLUDE_DIR} )
target_link_libraries( h5_string ${HDF5_LIBRARIES} )
else()
message(FATAL_ERROR "HDF5 not found. Required to output files")
endif()
非常感谢任何帮助。提前致谢!
您的代码在设置大小方面工作正常(但我认为您偏离了一个)。真正的问题在于如何定义字符串。
char *strs[4] = {
"this",
"this",
"this",
"this"};
您将它们定义为指针数组与二维数组:
char strs[4][4] = {
"this",
"this",
"this",
"this"};
并且还修复了一个:
dtype = H5Tcopy (H5T_C_S1);
size = 4;
status = H5Tset_size (dtype, size);
我也倾向于关闭数据类型。
status = H5Tclose(dtype);
这是您的完整测试程序:
#include <string.h>
#include <stdio.h>
#include "hdf5.h"
#define FILE "chard.h5"
int main() {
hid_t file_id, dataset_id, dataspace_id; /* identifiers */
herr_t status;
hid_t dtype;
size_t size;
file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hsize_t dims[1] = {4};
dataspace_id = H5Screate_simple(1, dims, NULL);
dtype = H5Tcopy (H5T_C_S1);
size = 4 * sizeof(char);
status = H5Tset_size (dtype, size);
char strs[4][4] = {
"this",
"this",
"this",
"this"};
dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);
status = H5Dclose(dataset_id);
status = H5Sclose(dataspace_id);
status = H5Tclose(dtype);
status = H5Fclose(file_id);
}
如果编译,运行 并转储输出:
~$ h5pcc -g -O0 -Wall -o foo foo.c
~$ ./foo
~$ h5dump chard.h5
HDF5 "chard.h5" {
GROUP "/" {
DATASET "simp" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 4 ) / ( 4 ) }
DATA {
(0): "this", "this", "this", "this"
}
}
}
}
在 C++ 中的 HDF5 中编写固定长度字符串的另一种(更容易)方法是使用 HDFql。使用 HDFql,可以按如下方式执行此操作:
// declare variable 'data'
char data[4][4];
// populate variable 'data' with values
memcpy(data[0], "this", 4);
memcpy(data[1], "that", 4);
memcpy(data[2], "what", 4);
memcpy(data[3], "when", 4);
// create an HDF5 file named 'chard.h5' and use (i.e. open) it
HDFql::execute("create and use file chard.h5");
// register variable 'data' for subsequent usage (by HDFql)
HDFql::variableRegister(data);
// create a dataset named 'simp' of fixed-length string and write content of 'data' into it
HDFql::execute("create dataset simp as char(4, 4) values from memory 0");