如何读取ORC文件列数据
How to read ORC file column data
我已经下载了 ORC c++ API 并在我的 Ubuntu 上构建了它。现在我正在尝试批量读取它的列数据。在这个 reference 中提到 orc::ColumnVectorBatch
可以 dynamic_cast
到特定的列数据类型 batch 像:orc::Decimal64VectorBatch
。但实际上,动态转换得到的结果却是空指针。
下面是我的代码:
// Orc Reader.
#include <memory>
#include <iostream>
#include <vector>
#include <list>
#include <fstream>
#include <orc/Reader.hh>
#include <orc/ColumnPrinter.hh>
#include <orc/Exceptions.hh>
#include <orc/OrcFile.hh>
int main(int argc, char const *argv[])
{
std::list<uint64_t> read_cols = {4};
std::string file_path = "~/trades_data.zlib.orc";
std::ifstream in_file(file_path.c_str(), std::ios::binary);
in_file.seekg(0, std::ios::end);
int file_size = in_file.tellg();
std::cout << "Size of the file is" << " " << file_size << " " << "bytes";
orc::RowReaderOptions row_reader_opts;
row_reader_opts.include(read_cols);
orc::ReaderOptions reader_opts;
std::unique_ptr<orc::Reader> reader;
std::unique_ptr<orc::RowReader> row_reader;
reader = orc::createReader(orc::readFile(file_path), reader_opts);
row_reader = reader->createRowReader(row_reader_opts);
std::unique_ptr<orc::ColumnVectorBatch> batch = row_reader->createRowBatch(1000);
while (row_reader->next(*batch))
{
// BELOW LINE OF CODE IS GIVING NULLPOINTER.
orc::Decimal64VectorBatch *dec_vec = dynamic_cast<orc::Decimal64VectorBatch*>(batch.get());
}
return 0;
}
如果有人能指出错误,对我帮助很大
我刚才已经解决了这个问题,现在我正在写一个我自己的问题的答案。希望也有助于您的代码。
在上面的代码中,它试图将它从 row_reader
读取的 batch 转换为 orc::Decimal64VectorBatch
,但应该首先将批处理转换为 orc::StructVectorBatch
。然后使用列的索引号可以很容易地转换成所需的列数据。
const int time_idx = 0; // Index of the column that holds the time in decimal64 format.
while (row_reader->next(*batch))
{
    // The batch filled by the row reader must first be cast to StructVectorBatch.
    // The reference form of dynamic_cast throws std::bad_cast on a type mismatch
    // instead of silently producing a null pointer.
    const auto &struct_batch = dynamic_cast<const orc::StructVectorBatch&>(*batch);
    // The child field at time_idx can then be cast to the concrete column batch type.
    const auto &dec_batch = dynamic_cast<const orc::Decimal64VectorBatch&>(*struct_batch.fields[time_idx]);
    // Pointer to the column's value buffer; batch->numElements entries were read.
    const auto *dec_vec = dec_batch.values.data();
}
这是我的方法,希望对你有帮助。
完整代码演示:https://github.com/harbby/cmake_ExternalProject_demo
// The batch filled by the row reader is a struct batch whose children are the columns.
// Reference-form dynamic_cast throws std::bad_cast on a type mismatch (the pointer form
// would return a null pointer that is then dereferenced), and at() throws
// std::out_of_range when the field index does not exist.
auto &root = dynamic_cast<orc::StructVectorBatch &>(*batch);
// double field
auto &col0 = dynamic_cast<orc::DoubleVectorBatch &>(*root.fields.at(0));
// string field
auto &col4 = dynamic_cast<orc::StringVectorBatch &>(*root.fields.at(4));
while (row_reader->next(*batch)) {
    // Fetch the buffers after every read so they refer to the current batch contents.
    const double *buffer1 = col0.data.data();
    char *const *buffer2 = col4.data.data();
    // `auto` keeps the buffer's own element type instead of assuming it is `long`.
    const auto *lengths = col4.length.data();
    // NOTE(review): assumes neither column contains nulls; for nullable columns check
    // hasNulls / notNull[r] before reading a row - TODO confirm against the file schema.
    for (uint64_t r = 0; r < batch->numElements; ++r) {
        std::cout << "line " << buffer1[r] << ","
                  << std::string(buffer2[r], static_cast<std::string::size_type>(lengths[r])) << "\n";
    }
    //std::cout << "this batch nums" << " " << batch->numElements << " " << "lines\n";
}
我已经下载了 ORC c++ API 并在我的 Ubuntu 上构建了它。现在我正在尝试批量读取它的列数据。在这个 reference 中提到 orc::ColumnVectorBatch
可以 dynamic_cast
到特定的列数据类型 batch 像:orc::Decimal64VectorBatch
。但实际上,动态转换得到的结果却是空指针。
下面是我的代码:
// Orc Reader.
#include <memory>
#include <iostream>
#include <vector>
#include <list>
#include <fstream>
#include <orc/Reader.hh>
#include <orc/ColumnPrinter.hh>
#include <orc/Exceptions.hh>
#include <orc/OrcFile.hh>
int main(int argc, char const *argv[])
{
std::list<uint64_t> read_cols = {4};
std::string file_path = "~/trades_data.zlib.orc";
std::ifstream in_file(file_path.c_str(), std::ios::binary);
in_file.seekg(0, std::ios::end);
int file_size = in_file.tellg();
std::cout << "Size of the file is" << " " << file_size << " " << "bytes";
orc::RowReaderOptions row_reader_opts;
row_reader_opts.include(read_cols);
orc::ReaderOptions reader_opts;
std::unique_ptr<orc::Reader> reader;
std::unique_ptr<orc::RowReader> row_reader;
reader = orc::createReader(orc::readFile(file_path), reader_opts);
row_reader = reader->createRowReader(row_reader_opts);
std::unique_ptr<orc::ColumnVectorBatch> batch = row_reader->createRowBatch(1000);
while (row_reader->next(*batch))
{
// BELOW LINE OF CODE IS GIVING NULLPOINTER.
orc::Decimal64VectorBatch *dec_vec = dynamic_cast<orc::Decimal64VectorBatch*>(batch.get());
}
return 0;
}
如果有人能指出错误,对我帮助很大
我刚才已经解决了这个问题,现在我正在写一个我自己的问题的答案。希望也有助于您的代码。
在上面的代码中,它试图将它从 row_reader
读取的 batch 转换为 orc::Decimal64VectorBatch
,但应该首先将批处理转换为 orc::StructVectorBatch
。然后使用列的索引号可以很容易地转换成所需的列数据。
const int time_idx = 0; // Index of the column that holds the time in decimal64 format.
while (row_reader->next(*batch))
{
    // The batch filled by the row reader must first be cast to StructVectorBatch.
    // The reference form of dynamic_cast throws std::bad_cast on a type mismatch
    // instead of silently producing a null pointer.
    const auto &struct_batch = dynamic_cast<const orc::StructVectorBatch&>(*batch);
    // The child field at time_idx can then be cast to the concrete column batch type.
    const auto &dec_batch = dynamic_cast<const orc::Decimal64VectorBatch&>(*struct_batch.fields[time_idx]);
    // Pointer to the column's value buffer; batch->numElements entries were read.
    const auto *dec_vec = dec_batch.values.data();
}
这是我的方法,希望对你有帮助。
完整代码演示:https://github.com/harbby/cmake_ExternalProject_demo
// The batch filled by the row reader is a struct batch whose children are the columns.
// Reference-form dynamic_cast throws std::bad_cast on a type mismatch (the pointer form
// would return a null pointer that is then dereferenced), and at() throws
// std::out_of_range when the field index does not exist.
auto &root = dynamic_cast<orc::StructVectorBatch &>(*batch);
// double field
auto &col0 = dynamic_cast<orc::DoubleVectorBatch &>(*root.fields.at(0));
// string field
auto &col4 = dynamic_cast<orc::StringVectorBatch &>(*root.fields.at(4));
while (row_reader->next(*batch)) {
    // Fetch the buffers after every read so they refer to the current batch contents.
    const double *buffer1 = col0.data.data();
    char *const *buffer2 = col4.data.data();
    // `auto` keeps the buffer's own element type instead of assuming it is `long`.
    const auto *lengths = col4.length.data();
    // NOTE(review): assumes neither column contains nulls; for nullable columns check
    // hasNulls / notNull[r] before reading a row - TODO confirm against the file schema.
    for (uint64_t r = 0; r < batch->numElements; ++r) {
        std::cout << "line " << buffer1[r] << ","
                  << std::string(buffer2[r], static_cast<std::string::size_type>(lengths[r])) << "\n";
    }
    //std::cout << "this batch nums" << " " << batch->numElements << " " << "lines\n";
}