删除符合特定条件的 hdf5 文件中的特定索引
Deleting specific indices in an hdf5 file corresponding to certain criteria
我有一个大约 735MB 的 hdf5 文件,其结构如下所述。我需要过滤掉不符合特定条件的数据,但在具体操作上遇到了问题。
数据集 tracks_from_jet 中有若干变量,我想对它们施加选择条件(selection cut,例如要求某个变量 >= 500),并且必须通过 jets 数据集中的索引,删除那些不满足条件(对应变量 >= 500)的相应 records/data。
tracks_from_jet 数据集的第一个索引与 jets 的索引一一对应。
如何从 jets 数据集中删除与我在 tracks_from_jet 数据集中的 selection 标准相对应的记录?
h5文件结构为:
jets Dataset {679015/Inf}
Location: 1:800
Links: 1
Chunks: {2048} 671744 bytes
Storage: 222716920 logical bytes, 110070578 allocated bytes, 202.34% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"pt_btagJes" +0 native float
"eta_btagJes" +4 native float
"absEta_btagJes" +8 native float
"JetFitter_energyFraction" +12 native float
"JetFitter_mass" +16 native float
"JetFitter_significance3d" +20 native float
"JetFitter_deltaphi" +24 native float
"JetFitter_deltaeta" +28 native float
"JetFitter_massUncorr" +32 native float
"JetFitter_dRFlightDir" +36 native float
"SV1_masssvx" +40 native float
"SV1_efracsvx" +44 native float
"SV1_significance3d" +48 native float
"SV1_correctSignificance3d" +52 native float
"SV1_dstToMatLay" +56 native float
"SV1_deltaR" +60 native float
"SV1_Lxy" +64 native float
"SV1_L3d" +68 native float
"JetFitter_deltaR" +72 native float
"JetFitterSecondaryVertex_displacement3d" +76 native float
"JetFitterSecondaryVertex_displacement2d" +80 native float
"JetFitterSecondaryVertex_mass" +84 native float
"JetFitterSecondaryVertex_energy" +88 native float
"JetFitterSecondaryVertex_energyFraction" +92 native float
"JetFitterSecondaryVertex_minimumTrackRelativeEta" +96 native float
"JetFitterSecondaryVertex_maximumTrackRelativeEta" +100 native float
"JetFitterSecondaryVertex_averageTrackRelativeEta" +104 native float
"JetFitterSecondaryVertex_maximumAllJetTrackRelativeEta" +108 native float
"JetFitterSecondaryVertex_minimumAllJetTrackRelativeEta" +112 native float
"JetFitterSecondaryVertex_averageAllJetTrackRelativeEta" +116 native float
"IP2D_pu" +120 native float
"IP2D_pc" +124 native float
"IP2D_pb" +128 native float
"IP3D_pu" +132 native float
"IP3D_pc" +136 native float
"IP3D_pb" +140 native float
"IP2D_cu" +144 native float
"IP2D_bu" +148 native float
"IP2D_bc" +152 native float
"IP3D_cu" +156 native float
"IP3D_bu" +160 native float
"IP3D_bc" +164 native float
"rnnip_pu" +168 native float
"rnnip_pc" +172 native float
"rnnip_pb" +176 native float
"DL1r_pu" +180 native float
"DL1r_pc" +184 native float
"DL1r_pb" +188 native float
"IP2D_isDefaults" +192 native int
"IP3D_isDefaults" +196 native int
"JetFitter_isDefaults" +200 native int
"SV1_isDefaults" +204 native int
"JetFitterSecondaryVertex_isDefaults" +208 native int
"rnnip_isDefaults" +212 native int
"JetFitter_nVTX" +216 native float
"JetFitter_nSingleTracks" +220 native float
"JetFitter_nTracksAtVtx" +224 native float
"JetFitter_N2Tpair" +228 native float
"SV1_N2Tpair" +232 native float
"SV1_NGTinSvx" +236 native float
"JetFitterSecondaryVertex_nTracks" +240 native float
"IP2D_nTrks" +244 native float
"IP3D_nTrks" +248 native float
"pt" +252 native float
"eta" +256 native float
"energy" +260 native float
"mass" +264 native float
"GhostBHadronsFinalPt" +268 native float
"bTagJVT" +272 native float
"GhostBHadronsFinalCount" +276 native int
"GhostCHadronsFinalCount" +280 native int
"HadronConeExclTruthLabelID" +284 native int
"HadronConeExclExtendedTruthLabelID" +288 native int
"PartonTruthLabelID" +292 native int
"jetPtRank" +296 native int
"mcEventWeight" +300 native float
"eventNumber" +304 native long
"averageInteractionsPerCrossing" +312 native float
"actualInteractionsPerCrossing" +316 native float
"nPrimaryVertices" +320 native int
"beamSpotWeight" +324 native float
} 328 bytes
tracks_from_jet Dataset {679015/Inf, 40/40}
Location: 1:7832
Links: 1
Chunks: {2048, 40} 9338880 bytes
Storage: 3096308400 logical bytes, 661050378 allocated bytes, 468.39% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"chiSquared" +0 native float
"numberDoF" +4 native float
"radiusOfFirstHit" +8 native float
"IP3D_signed_d0" +12 native float
"IP2D_signed_d0" +16 native float
"IP3D_signed_z0" +20 native float
"theta" +24 native float
"qOverP" +28 native float
"numberOfInnermostPixelLayerHits" +32 native unsigned char
"numberOfNextToInnermostPixelLayerHits" +33 native unsigned char
"numberOfInnermostPixelLayerSharedHits" +34 native unsigned char
"numberOfInnermostPixelLayerSplitHits" +35 native unsigned char
"numberOfPixelHits" +36 native unsigned char
"numberOfPixelHoles" +37 native unsigned char
"numberOfPixelSharedHits" +38 native unsigned char
"numberOfPixelSplitHits" +39 native unsigned char
"numberOfSCTHits" +40 native unsigned char
"numberOfSCTHoles" +41 native unsigned char
"numberOfSCTSharedHits" +42 native unsigned char
"expectNextToInnermostPixelLayerHit" +43 native unsigned char
"expectInnermostPixelLayerHit" +44 native unsigned char
"d0" +45 native float
"z0SinTheta" +49 native float
"d0Uncertainty" +53 native float
"z0SinThetaUncertainty" +57 native float
"IP3D_signed_d0_significance" +61 native float
"IP3D_signed_z0_significance" +65 native float
"pt" +69 native float
"eta" +73 native float
"phiUncertainty" +77 native float
"thetaUncertainty" +81 native float
"qOverPUncertainty" +85 native float
"deta" +89 native float
"dphi" +93 native float
"dr" +97 native float
"ptfrac" +101 native float
"z0RelativeToBeamspot" +105 native float
"z0RelativeToBeamspotUncertainty" +109 native float
"valid" +113 enum native signed char {
TRUE = 1
FALSE = 0
}
} 114 bytes
这是一个可以运行的原型。它的内存效率不高,但在高配置的系统上(例如 i9、5.3GHz、24GB 显存的 NVIDIA RTX-3090 GPU 和 32GB RAM),每次迭代大约需要 30 秒。不过,在普通笔记本电脑的资源下它确实会崩溃。
import h5py
import numpy as np
import os
import glob
# Selection thresholds taken from the original prototype: a jet is dropped
# when ANY of its tracks has |d0| > D0_MAX or |z0SinTheta| > Z0_SIN_THETA_MAX.
D0_MAX = 1
Z0_SIN_THETA_MAX = 1.5

# Input files to process; switch to 'lrt/*output.h5' (and pass PATH_LRT to
# refine_file) to process the lrt sample instead.
INPUT_PATTERN = 'std/*output.h5'
PATH_STD = '../ttbar/std/'
PATH_LRT = '../ttbar/lrt/'

# Number of jets copied per read/write step; bounds peak memory use.
ROWS_PER_STEP = 16384


def jets_failing_cuts(d0, z0_sin_theta, d0_max=D0_MAX,
                      z0_max=Z0_SIN_THETA_MAX):
    """Return the sorted, unique jet indices that fail the track cuts.

    Both inputs are 2-D arrays indexed as [jet, track]. A jet fails when at
    least one of its tracks has ``|d0| > d0_max`` or
    ``|z0_sin_theta| > z0_max``. An empty index array is returned when every
    jet passes.

    The prototype used ``np.argwhere(...)[0]``, which yields only the first
    (jet, track) pair rather than the jet index of every failing track, and
    raises IndexError when nothing fails.
    """
    failing = (np.abs(d0) > d0_max) | (np.abs(z0_sin_theta) > z0_max)
    # Collapse the track axis: one flag per jet.
    return np.flatnonzero(failing.any(axis=1))


def ensure_directory(path, label):
    """Create ``path`` if it is missing and report that it was created."""
    if not os.path.exists(path):
        os.makedirs(path)
        print("The new " + label + "-directory is created!\n")


def copy_kept_rows(source, destination, keep, rows_per_step=ROWS_PER_STEP):
    """Copy the rows of ``source`` flagged in ``keep`` into ``destination``.

    ``source`` and ``destination`` are sliceable along their first axis
    (h5py datasets here); ``keep`` is a boolean array with one entry per
    source row. Rows are moved ``rows_per_step`` at a time so the whole
    dataset is never held in memory at once.
    """
    written = 0
    total = source.shape[0]
    for start in range(0, total, rows_per_step):
        stop = min(start + rows_per_step, total)
        selected = keep[start:stop]
        count = int(np.count_nonzero(selected))
        if count == 0:
            continue
        destination[written:written + count] = source[start:stop][selected]
        written += count


def refine_file(in_path, out_dir):
    """Write a copy of ``in_path`` without the jets that fail the track cuts.

    The output is ``out_dir``/'refined' + basename(in_path) and contains the
    'jets' and 'tracks_from_jet' datasets with the failing jets removed from
    both. As in the prototype, no output file is written when every jet
    passes.

    Returns the output path, or None when nothing had to be removed.
    Raises ValueError when the two datasets disagree on the number of jets.
    """
    print("Running cuts on:" + str(in_path) + "\n")
    with h5py.File(in_path, "r") as h5r:
        jets = h5r['jets']
        tracks = h5r['tracks_from_jet']
        print(jets.shape)
        print(tracks.shape)
        print(tracks.shape[0])
        print(tracks.shape[1])
        if jets.shape[0] != tracks.shape[0]:
            raise ValueError(
                "'jets' and 'tracks_from_jet' have different lengths in "
                + str(in_path))

        # Read only the two cut variables, not the full compound records.
        indices_to_remove = jets_failing_cuts(tracks['d0'],
                                              tracks['z0SinTheta'])
        if indices_to_remove.size == 0:
            return None
        print(indices_to_remove)

        keep = np.ones(tracks.shape[0], dtype=bool)
        keep[indices_to_remove] = False
        n_keep = int(np.count_nonzero(keep))

        out_path = os.path.join(out_dir,
                                'refined' + os.path.basename(in_path))
        with h5py.File(out_path, 'w') as fwrite:
            for name, source in (('jets', jets), ('tracks_from_jet', tracks)):
                destination = fwrite.create_dataset(
                    name,
                    shape=(n_keep,) + source.shape[1:],
                    dtype=source.dtype,
                    compression='gzip',
                    compression_opts=7)
                copy_kept_rows(source, destination, keep)
    return out_path


def main():
    """Apply the track cuts to every file matching INPUT_PATTERN."""
    ensure_directory(PATH_STD, "std")
    ensure_directory(PATH_LRT, "lrt")
    for in_path in glob.glob(INPUT_PATTERN, recursive=True):
        refine_file(in_path, PATH_STD)


if __name__ == "__main__":
    main()
我有一个大约 735MB 的 hdf5 文件,其结构如下所述。我需要过滤掉不符合特定条件的数据,但在具体操作上遇到了问题。
数据集 tracks_from_jet 中有若干变量,我想对它们施加选择条件(selection cut,例如要求某个变量 >= 500),并且必须通过 jets 数据集中的索引,删除那些不满足条件(对应变量 >= 500)的相应 records/data。
tracks_from_jet 数据集的第一个索引与 jets 的索引一一对应。
如何从 jets 数据集中删除与我在 tracks_from_jet 数据集中的 selection 标准相对应的记录?
h5文件结构为:
jets Dataset {679015/Inf}
Location: 1:800
Links: 1
Chunks: {2048} 671744 bytes
Storage: 222716920 logical bytes, 110070578 allocated bytes, 202.34% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"pt_btagJes" +0 native float
"eta_btagJes" +4 native float
"absEta_btagJes" +8 native float
"JetFitter_energyFraction" +12 native float
"JetFitter_mass" +16 native float
"JetFitter_significance3d" +20 native float
"JetFitter_deltaphi" +24 native float
"JetFitter_deltaeta" +28 native float
"JetFitter_massUncorr" +32 native float
"JetFitter_dRFlightDir" +36 native float
"SV1_masssvx" +40 native float
"SV1_efracsvx" +44 native float
"SV1_significance3d" +48 native float
"SV1_correctSignificance3d" +52 native float
"SV1_dstToMatLay" +56 native float
"SV1_deltaR" +60 native float
"SV1_Lxy" +64 native float
"SV1_L3d" +68 native float
"JetFitter_deltaR" +72 native float
"JetFitterSecondaryVertex_displacement3d" +76 native float
"JetFitterSecondaryVertex_displacement2d" +80 native float
"JetFitterSecondaryVertex_mass" +84 native float
"JetFitterSecondaryVertex_energy" +88 native float
"JetFitterSecondaryVertex_energyFraction" +92 native float
"JetFitterSecondaryVertex_minimumTrackRelativeEta" +96 native float
"JetFitterSecondaryVertex_maximumTrackRelativeEta" +100 native float
"JetFitterSecondaryVertex_averageTrackRelativeEta" +104 native float
"JetFitterSecondaryVertex_maximumAllJetTrackRelativeEta" +108 native float
"JetFitterSecondaryVertex_minimumAllJetTrackRelativeEta" +112 native float
"JetFitterSecondaryVertex_averageAllJetTrackRelativeEta" +116 native float
"IP2D_pu" +120 native float
"IP2D_pc" +124 native float
"IP2D_pb" +128 native float
"IP3D_pu" +132 native float
"IP3D_pc" +136 native float
"IP3D_pb" +140 native float
"IP2D_cu" +144 native float
"IP2D_bu" +148 native float
"IP2D_bc" +152 native float
"IP3D_cu" +156 native float
"IP3D_bu" +160 native float
"IP3D_bc" +164 native float
"rnnip_pu" +168 native float
"rnnip_pc" +172 native float
"rnnip_pb" +176 native float
"DL1r_pu" +180 native float
"DL1r_pc" +184 native float
"DL1r_pb" +188 native float
"IP2D_isDefaults" +192 native int
"IP3D_isDefaults" +196 native int
"JetFitter_isDefaults" +200 native int
"SV1_isDefaults" +204 native int
"JetFitterSecondaryVertex_isDefaults" +208 native int
"rnnip_isDefaults" +212 native int
"JetFitter_nVTX" +216 native float
"JetFitter_nSingleTracks" +220 native float
"JetFitter_nTracksAtVtx" +224 native float
"JetFitter_N2Tpair" +228 native float
"SV1_N2Tpair" +232 native float
"SV1_NGTinSvx" +236 native float
"JetFitterSecondaryVertex_nTracks" +240 native float
"IP2D_nTrks" +244 native float
"IP3D_nTrks" +248 native float
"pt" +252 native float
"eta" +256 native float
"energy" +260 native float
"mass" +264 native float
"GhostBHadronsFinalPt" +268 native float
"bTagJVT" +272 native float
"GhostBHadronsFinalCount" +276 native int
"GhostCHadronsFinalCount" +280 native int
"HadronConeExclTruthLabelID" +284 native int
"HadronConeExclExtendedTruthLabelID" +288 native int
"PartonTruthLabelID" +292 native int
"jetPtRank" +296 native int
"mcEventWeight" +300 native float
"eventNumber" +304 native long
"averageInteractionsPerCrossing" +312 native float
"actualInteractionsPerCrossing" +316 native float
"nPrimaryVertices" +320 native int
"beamSpotWeight" +324 native float
} 328 bytes
tracks_from_jet Dataset {679015/Inf, 40/40}
Location: 1:7832
Links: 1
Chunks: {2048, 40} 9338880 bytes
Storage: 3096308400 logical bytes, 661050378 allocated bytes, 468.39% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"chiSquared" +0 native float
"numberDoF" +4 native float
"radiusOfFirstHit" +8 native float
"IP3D_signed_d0" +12 native float
"IP2D_signed_d0" +16 native float
"IP3D_signed_z0" +20 native float
"theta" +24 native float
"qOverP" +28 native float
"numberOfInnermostPixelLayerHits" +32 native unsigned char
"numberOfNextToInnermostPixelLayerHits" +33 native unsigned char
"numberOfInnermostPixelLayerSharedHits" +34 native unsigned char
"numberOfInnermostPixelLayerSplitHits" +35 native unsigned char
"numberOfPixelHits" +36 native unsigned char
"numberOfPixelHoles" +37 native unsigned char
"numberOfPixelSharedHits" +38 native unsigned char
"numberOfPixelSplitHits" +39 native unsigned char
"numberOfSCTHits" +40 native unsigned char
"numberOfSCTHoles" +41 native unsigned char
"numberOfSCTSharedHits" +42 native unsigned char
"expectNextToInnermostPixelLayerHit" +43 native unsigned char
"expectInnermostPixelLayerHit" +44 native unsigned char
"d0" +45 native float
"z0SinTheta" +49 native float
"d0Uncertainty" +53 native float
"z0SinThetaUncertainty" +57 native float
"IP3D_signed_d0_significance" +61 native float
"IP3D_signed_z0_significance" +65 native float
"pt" +69 native float
"eta" +73 native float
"phiUncertainty" +77 native float
"thetaUncertainty" +81 native float
"qOverPUncertainty" +85 native float
"deta" +89 native float
"dphi" +93 native float
"dr" +97 native float
"ptfrac" +101 native float
"z0RelativeToBeamspot" +105 native float
"z0RelativeToBeamspotUncertainty" +109 native float
"valid" +113 enum native signed char {
TRUE = 1
FALSE = 0
}
} 114 bytes
这是一个可以运行的原型。它的内存效率不高,但在高配置的系统上(例如 i9、5.3GHz、24GB 显存的 NVIDIA RTX-3090 GPU 和 32GB RAM),每次迭代大约需要 30 秒。不过,在普通笔记本电脑的资源下它确实会崩溃。
import h5py
import numpy as np
import os
import glob
# Selection thresholds taken from the original prototype: a jet is dropped
# when ANY of its tracks has |d0| > D0_MAX or |z0SinTheta| > Z0_SIN_THETA_MAX.
D0_MAX = 1
Z0_SIN_THETA_MAX = 1.5

# Input files to process; switch to 'lrt/*output.h5' (and pass PATH_LRT to
# refine_file) to process the lrt sample instead.
INPUT_PATTERN = 'std/*output.h5'
PATH_STD = '../ttbar/std/'
PATH_LRT = '../ttbar/lrt/'

# Number of jets copied per read/write step; bounds peak memory use.
ROWS_PER_STEP = 16384


def jets_failing_cuts(d0, z0_sin_theta, d0_max=D0_MAX,
                      z0_max=Z0_SIN_THETA_MAX):
    """Return the sorted, unique jet indices that fail the track cuts.

    Both inputs are 2-D arrays indexed as [jet, track]. A jet fails when at
    least one of its tracks has ``|d0| > d0_max`` or
    ``|z0_sin_theta| > z0_max``. An empty index array is returned when every
    jet passes.

    The prototype used ``np.argwhere(...)[0]``, which yields only the first
    (jet, track) pair rather than the jet index of every failing track, and
    raises IndexError when nothing fails.
    """
    failing = (np.abs(d0) > d0_max) | (np.abs(z0_sin_theta) > z0_max)
    # Collapse the track axis: one flag per jet.
    return np.flatnonzero(failing.any(axis=1))


def ensure_directory(path, label):
    """Create ``path`` if it is missing and report that it was created."""
    if not os.path.exists(path):
        os.makedirs(path)
        print("The new " + label + "-directory is created!\n")


def copy_kept_rows(source, destination, keep, rows_per_step=ROWS_PER_STEP):
    """Copy the rows of ``source`` flagged in ``keep`` into ``destination``.

    ``source`` and ``destination`` are sliceable along their first axis
    (h5py datasets here); ``keep`` is a boolean array with one entry per
    source row. Rows are moved ``rows_per_step`` at a time so the whole
    dataset is never held in memory at once.
    """
    written = 0
    total = source.shape[0]
    for start in range(0, total, rows_per_step):
        stop = min(start + rows_per_step, total)
        selected = keep[start:stop]
        count = int(np.count_nonzero(selected))
        if count == 0:
            continue
        destination[written:written + count] = source[start:stop][selected]
        written += count


def refine_file(in_path, out_dir):
    """Write a copy of ``in_path`` without the jets that fail the track cuts.

    The output is ``out_dir``/'refined' + basename(in_path) and contains the
    'jets' and 'tracks_from_jet' datasets with the failing jets removed from
    both. As in the prototype, no output file is written when every jet
    passes.

    Returns the output path, or None when nothing had to be removed.
    Raises ValueError when the two datasets disagree on the number of jets.
    """
    print("Running cuts on:" + str(in_path) + "\n")
    with h5py.File(in_path, "r") as h5r:
        jets = h5r['jets']
        tracks = h5r['tracks_from_jet']
        print(jets.shape)
        print(tracks.shape)
        print(tracks.shape[0])
        print(tracks.shape[1])
        if jets.shape[0] != tracks.shape[0]:
            raise ValueError(
                "'jets' and 'tracks_from_jet' have different lengths in "
                + str(in_path))

        # Read only the two cut variables, not the full compound records.
        indices_to_remove = jets_failing_cuts(tracks['d0'],
                                              tracks['z0SinTheta'])
        if indices_to_remove.size == 0:
            return None
        print(indices_to_remove)

        keep = np.ones(tracks.shape[0], dtype=bool)
        keep[indices_to_remove] = False
        n_keep = int(np.count_nonzero(keep))

        out_path = os.path.join(out_dir,
                                'refined' + os.path.basename(in_path))
        with h5py.File(out_path, 'w') as fwrite:
            for name, source in (('jets', jets), ('tracks_from_jet', tracks)):
                destination = fwrite.create_dataset(
                    name,
                    shape=(n_keep,) + source.shape[1:],
                    dtype=source.dtype,
                    compression='gzip',
                    compression_opts=7)
                copy_kept_rows(source, destination, keep)
    return out_path


def main():
    """Apply the track cuts to every file matching INPUT_PATTERN."""
    ensure_directory(PATH_STD, "std")
    ensure_directory(PATH_LRT, "lrt")
    for in_path in glob.glob(INPUT_PATTERN, recursive=True):
        refine_file(in_path, PATH_STD)


if __name__ == "__main__":
    main()