Recover data from corrupted file
I have an HDF5 file that got corrupted somehow. I am trying to retrieve the parts of the file that are still intact. I can read all datasets from groups that contain no corrupted fields just fine. But from a group in which one dataset is corrupted, I cannot read even the datasets that are not corrupted.

Interestingly, I can read those datasets without any problem using HDFView, i.e. I can open them and see all the values. The only dataset that I cannot read, even with HDFView, is the corrupted one itself.

My question: how can I exploit this to retrieve as much data as possible?
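To map out which parts of the file are affected, a small probe along these lines can classify every path as readable or broken. This is only a diagnostic sketch: 'data.hdf5' stands in for the real file name, and the recursion makes no attempt to be efficient.

import h5py

readable, broken = [], []

def classify(archive, key='/'):
    """Record which paths open cleanly and which raise errors."""
    for name in archive[key]:
        path = key.rstrip('/') + '/' + name
        try:
            obj = archive[path]
        except Exception:
            broken.append(path)  # opening this object fails
            continue
        readable.append(path)
        if isinstance(obj, h5py.Group):
            try:
                classify(archive, path)  # descend into subgroups
            except Exception:
                broken.append(path)  # iterating this group fails

with h5py.File('data.hdf5', 'r') as f:  # placeholder file name
    classify(f)

print('readable:', readable)
print('broken:  ', broken)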
When reading with h5py:
Traceback (most recent call last):
  File "repair.py", line 44, in <module>
    print(data['/dt_yield/000000'][...])
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/usr/local/lib/python3.6/site-packages/h5py/_hl/group.py", line 167, in __getitem__
    oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py/h5o.pyx", line 190, in h5py.h5o.open
KeyError: 'Unable to open object (bad heap free list)'
When reading with C++:
HDF5-DIAG: Error detected in HDF5 (1.10.1) thread 0:
  #000: H5D.c line 294 in H5Dopen2(): unable to open dataset
    major: Dataset
    minor: Can't open object
  #001: H5Dint.c line 1362 in H5D__open_name(): not found
    major: Dataset
    minor: Object not found
  #002: H5Gloc.c line 428 in H5G_loc_find(): can't find object
    major: Symbol table
    minor: Object not found
  #003: H5Gtraverse.c line 867 in H5G_traverse(): internal path traversal failed
    major: Symbol table
    minor: Object not found
  #004: H5Gtraverse.c line 594 in H5G_traverse_real(): can't look up component
    major: Symbol table
    minor: Object not found
  #005: H5Gobj.c line 1156 in H5G__obj_lookup(): can't locate object
    major: Symbol table
    minor: Object not found
  #006: H5Gstab.c line 890 in H5G__stab_lookup(): unable to protect symbol table heap
    major: Symbol table
    minor: Protected metadata error
  #007: H5HL.c line 364 in H5HL_protect(): unable to load heap data block
    major: Heap
    minor: Unable to protect metadata
  #008: H5AC.c line 1763 in H5AC_protect(): H5C_protect() failed
    major: Object cache
    minor: Unable to protect metadata
  #009: H5C.c line 2561 in H5C_protect(): can't load entry
    major: Object cache
    minor: Unable to load metadata into cache
  #010: H5C.c line 6877 in H5C_load_entry(): Can't deserialize image
    major: Object cache
    minor: Unable to load metadata into cache
  #011: H5HLcache.c line 763 in H5HL__cache_datablock_deserialize(): can't initialize free list
    major: Heap
    minor: Unable to initialize object
  #012: H5HLcache.c line 241 in H5HL__fl_deserialize(): bad heap free list
    major: Heap
    minor: Out of range
libc++abi.dylib: terminating with uncaught exception of type H5::FileIException
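The bottom frames of this stack are informative: the failure occurs while deserializing the free list of the group's local heap (H5HL), i.e. the group's own link metadata is damaged, not necessarily the data of the datasets stored below it. That would also explain why HDFView, which apparently tolerates the broken free list, can still display the values.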
Recovery script (using h5py)

Here is my implementation so far; it recovers at least everything that h5py can read directly:
import numpy as np
import h5py

def getdatasets(key, archive):
    """Recursively list the paths of all datasets that h5py can reach below `key`."""
    if key[-1] != '/':
        key += '/'
    out = []
    for name in archive[key]:
        path = key + name
        if isinstance(archive[path], h5py.Dataset):
            out += [path]
        else:
            try:
                out += getdatasets(path, archive)
            except Exception:
                pass  # skip groups that cannot be traversed
    return out

data  = h5py.File('data.hdf5', 'r')
fixed = h5py.File('fixed.hdf5', 'w')

# list all datasets that are still reachable
datasets = getdatasets('/', data)

# extract the parent group of each dataset, without duplicates or the root
groups = list(set(path.rsplit('/', 1)[0] for path in datasets))
groups = [group for group in groups if len(group) > 0]

# sort the groups by depth, so parents are created before their children
idx = np.argsort(np.array([len(group.split('/')) for group in groups]))
groups = [groups[i] for i in idx]

# create all groups in the new file
for group in groups:
    fixed.create_group(group)

for path in datasets:
    # - check path
    if path not in data:
        continue
    # - try reading
    try:
        data[path]
    except Exception:
        continue
    # - get group name
    group = path.rsplit('/', 1)[0]
    # - minimum group name
    if len(group) == 0:
        group = '/'
    # - copy data
    data.copy(path, fixed[group])
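Note that data.copy preserves the attributes of each copied dataset, but the groups created with create_group start out without attributes. A small extension, a sketch that reuses data, fixed and groups from the script above, could try to carry the group attributes over as well:

# sketch: also copy group attributes where they are still readable
for group in groups:
    try:
        for name, value in data[group].attrs.items():
            fixed[group].attrs[name] = value
    except Exception:
        pass  # attributes stored in a damaged heap may be unreadable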
I found a neat way to recover all top-level groups that contain no corrupted nodes. It can be extended to lower-level groups simply via recursive calls:
import h5py as h5

def RecoverFile(f1, f2):
    """Recover datasets from the read-open HDF5 group/file `f1` into the write-open `f2`."""
    names = []
    f1.visit(names.append)  # collect the names of all reachable objects
    for n in names:
        try:
            f2.create_dataset(n, data=f1[n][()])
        except Exception:
            pass  # skip anything that cannot be read

# `file_broken` and `file_recover` hold the paths of the corrupted input
# file and of the new file that receives the recovered data
with h5.File(file_broken, 'r') as fb:
    with h5.File(file_recover, 'w') as fr:
        for key in fb.keys():
            try:
                # the top-level object may itself be a dataset
                fr.create_dataset(key, data=fb[key][()])
            except Exception:
                try:
                    # otherwise treat it as a group and recover its contents
                    fr.create_group(key)
                    RecoverFile(fb[key], fr[key])
                except Exception:
                    del fr[key]  # give up on this group entirely
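As written, RecoverFile copies only the dataset values, so attributes attached to those datasets are lost. A variant with the same structure, assuming the attributes themselves are still readable, could copy them too; this is a sketch, not part of the original answer:

def RecoverFileWithAttrs(f1, f2):
    """Like RecoverFile above, but also tries to copy each dataset's attributes."""
    names = []
    f1.visit(names.append)
    for n in names:
        try:
            dset = f2.create_dataset(n, data=f1[n][()])
            for key, value in f1[n].attrs.items():
                dset.attrs[key] = value  # copy attribute by attribute
        except Exception:
            pass  # skip anything that cannot be read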