How do I troubleshoot ValueError: array is of length %s, while the length of the DataFrame is %s?
I am trying to follow along with the example in this notebook.
As suggested in this github thread:
- I have raised the ulimit to 9999.
- I have already converted the csv files to hdf5.
My code fails when trying to open a single hdf5 file into a dataframe:
df = vaex.open('data/chat_history_00.hdf5')
Here is the rest of the code:
import re
import glob
import vaex
import numpy as np


def tryint(s):
    try:
        return int(s)
    except:
        return s


def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [tryint(c) for c in re.split('([0-9]+)', s)]


hdf5_list = glob.glob('data/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
assert len(hdf5_list) == 11, "Incorrect number of files"

# Check how the single file looks like:
df = vaex.open('data/chat_history_10.hdf5')
df
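For context (my own quick check, not part of the notebook; the file names are just examples), this is what alphanum_key returns, which is why the list gets a natural sort:

print(alphanum_key('z23a'))
# ['z', 23, 'a']
print(sorted(['data/chat_history_10.hdf5', 'data/chat_history_2.hdf5'], key=alphanum_key))
# ['data/chat_history_2.hdf5', 'data/chat_history_10.hdf5']  (numeric chunks compare as ints)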
which produces the error:
ERROR:MainThread:vaex:error opening 'data/chat_history_00.hdf5'
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
      1 # Check how the single file looks like:
----> 2 df = vaex.open('data/chat_history_10.hdf5')
      3 df

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/__init__.py in open(path, convert, shuffle, copy_index, *args, **kwargs)
    207             ds = from_csv(path, copy_index=copy_index, **kwargs)
    208         else:
--> 209             ds = vaex.file.open(path, *args, **kwargs)
    210         if convert and ds:
    211             ds.export_hdf5(filename_hdf5, shuffle=shuffle)

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/file/__init__.py in open(path, *args, **kwargs)
     39             break
     40     if dataset_class:
---> 41         dataset = dataset_class(path, *args, **kwargs)
     42     return dataset
     43

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in __init__(self, filename, write)
     84         self.h5table_root_name = None
     85         self._version = 1
---> 86         self._load()
     87
     88     def write_meta(self):

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in _load(self)
    182     def _load(self):
    183         if "data" in self.h5file:
--> 184             self._load_columns(self.h5file["/data"])
    185             self.h5table_root_name = "/data"
    186         if "table" in self.h5file:

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in _load_columns(self, h5data, first)
    348                     self.add_column(column_name, self._map_hdf5_array(data, column['mask']))
    349                 else:
--> 350                     self.add_column(column_name, self._map_hdf5_array(data))
    351             else:
    352                 transposed = shape1 < shape[0]

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/dataframe.py in add_column(self, name, f_or_array, dtype)
   2929             if len(self) == len(ar):
   2930                 raise ValueError("Array is of length %s, while the length of the DataFrame is %s due to the filtering, the (unfiltered) length is %s." % (len(ar), len(self), self.length_unfiltered()))
-> 2931             raise ValueError("array is of length %s, while the length of the DataFrame is %s" % (len(ar), self.length_original()))
   2932         # assert self.length_unfiltered() == len(data), "columns should be of equal length, length should be %d, while it is %d" % (self.length_unfiltered(), len(data))
   2933         valid_name = vaex.utils.find_valid_name(name)

ValueError: array is of length 2578961, while the length of the DataFrame is 6
What does this mean and how do I solve it? All of the files have 6 columns.
Edit:
This is how I created the hdf5 files:
pd.read_csv(r'G:/path/to/file/data/chat_history-00.csv').to_hdf(r'data/chat_history_00.hdf5', key='data')
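For anyone hitting the same thing: you can look at the layout pandas .to_hdf actually wrote with h5py. This snippet is my own addition (it assumes h5py is installed); it just prints every group and dataset in the file, which in my case showed the PyTables-style structure rather than one dataset per column, matching the length mismatch in the error:

import h5py

# Print every group/dataset in the file, with its shape for datasets.
with h5py.File('data/chat_history_00.hdf5', 'r') as f:
    f.visititems(lambda name, obj: print(name, getattr(obj, 'shape', '')))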
The question was answered by Jovan of vaex on Github:
You should not use pandas .to_hdf if you want to read the data with
vaex in a memory-mapped way. Please see this link for more details.
I used this instead:
vdf = vaex.from_pandas(df, copy_index=False)
vdf.export_hdf5('chat_history_00.hdf5')
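For completeness, a rough sketch of the conversion loop I ended up with (the exact csv paths and the 00-10 numbering are my assumptions based on the naming above, adjust to your own files):

import glob
import pandas as pd
import vaex

for csv_path in sorted(glob.glob(r'G:/path/to/file/data/chat_history-*.csv')):
    # e.g. 'chat_history-00.csv' -> 'data/chat_history_00.hdf5'
    number = csv_path.rsplit('-', 1)[-1].replace('.csv', '')
    vdf = vaex.from_pandas(pd.read_csv(csv_path), copy_index=False)
    vdf.export_hdf5('data/chat_history_{}.hdf5'.format(number))

# The exported files now open fine, individually or all together
# (zero-padded numbers already sort correctly with plain sorted()):
df = vaex.open('data/chat_history_00.hdf5')
df_all = vaex.open_many(sorted(glob.glob('data/chat_history_*.hdf5')))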