删除 hdf5 文件中符合特定条件的特定索引

Deleting specific indices in an hdf5 file corresponding to certain criteria

我有一个大约 735MB 的 hdf5 文件,其结构如下所述。我需要按特定条件过滤数据,但在具体操作上遇到了问题。数据集 tracks_from_jet 中有若干变量,我想对它们施加选择条件(selection cuts,例如要求某个变量 >= 500),并且必须通过索引删除 jets 数据集中不满足该条件(对应变量 >= 500)的相应记录/数据。

tracks_from_jet 数据集的第一维索引与 jets 的索引一一对应。如何从 jets 数据集中删除与我在 tracks_from_jet 数据集中的选择条件(selection criteria)相对应的记录?

h5文件结构为:

jets                     Dataset {679015/Inf}
    Location:  1:800
    Links:     1
    Chunks:    {2048} 671744 bytes
    Storage:   222716920 logical bytes, 110070578 allocated bytes, 202.34% utilization
    Filter-0:  deflate-1 OPT {7}
    Type:      struct {
                   "pt_btagJes"       +0    native float
                   "eta_btagJes"      +4    native float
                   "absEta_btagJes"   +8    native float
                   "JetFitter_energyFraction" +12   native float
                   "JetFitter_mass"   +16   native float
                   "JetFitter_significance3d" +20   native float
                   "JetFitter_deltaphi" +24   native float
                   "JetFitter_deltaeta" +28   native float
                   "JetFitter_massUncorr" +32   native float
                   "JetFitter_dRFlightDir" +36   native float
                   "SV1_masssvx"      +40   native float
                   "SV1_efracsvx"     +44   native float
                   "SV1_significance3d" +48   native float
                   "SV1_correctSignificance3d" +52   native float
                   "SV1_dstToMatLay"  +56   native float
                   "SV1_deltaR"       +60   native float
                   "SV1_Lxy"          +64   native float
                   "SV1_L3d"          +68   native float
                   "JetFitter_deltaR" +72   native float
                   "JetFitterSecondaryVertex_displacement3d" +76   native float
                   "JetFitterSecondaryVertex_displacement2d" +80   native float
                   "JetFitterSecondaryVertex_mass" +84   native float
                   "JetFitterSecondaryVertex_energy" +88   native float
                   "JetFitterSecondaryVertex_energyFraction" +92   native float
                   "JetFitterSecondaryVertex_minimumTrackRelativeEta" +96   native float
                   "JetFitterSecondaryVertex_maximumTrackRelativeEta" +100  native float
                   "JetFitterSecondaryVertex_averageTrackRelativeEta" +104  native float
                   "JetFitterSecondaryVertex_maximumAllJetTrackRelativeEta" +108  native float
                   "JetFitterSecondaryVertex_minimumAllJetTrackRelativeEta" +112  native float
                   "JetFitterSecondaryVertex_averageAllJetTrackRelativeEta" +116  native float
                   "IP2D_pu"          +120  native float
                   "IP2D_pc"          +124  native float
                   "IP2D_pb"          +128  native float
                   "IP3D_pu"          +132  native float
                   "IP3D_pc"          +136  native float
                   "IP3D_pb"          +140  native float
                   "IP2D_cu"          +144  native float
                   "IP2D_bu"          +148  native float
                   "IP2D_bc"          +152  native float
                   "IP3D_cu"          +156  native float
                   "IP3D_bu"          +160  native float
                   "IP3D_bc"          +164  native float
                   "rnnip_pu"         +168  native float
                   "rnnip_pc"         +172  native float
                   "rnnip_pb"         +176  native float
                   "DL1r_pu"          +180  native float
                   "DL1r_pc"          +184  native float
                   "DL1r_pb"          +188  native float
                   "IP2D_isDefaults"  +192  native int
                   "IP3D_isDefaults"  +196  native int
                   "JetFitter_isDefaults" +200  native int
                   "SV1_isDefaults"   +204  native int
                   "JetFitterSecondaryVertex_isDefaults" +208  native int
                   "rnnip_isDefaults" +212  native int
                   "JetFitter_nVTX"   +216  native float
                   "JetFitter_nSingleTracks" +220  native float
                   "JetFitter_nTracksAtVtx" +224  native float
                   "JetFitter_N2Tpair" +228  native float
                   "SV1_N2Tpair"      +232  native float
                   "SV1_NGTinSvx"     +236  native float
                   "JetFitterSecondaryVertex_nTracks" +240  native float
                   "IP2D_nTrks"       +244  native float
                   "IP3D_nTrks"       +248  native float
                   "pt"               +252  native float
                   "eta"              +256  native float
                   "energy"           +260  native float
                   "mass"             +264  native float
                   "GhostBHadronsFinalPt" +268  native float
                   "bTagJVT"          +272  native float
                   "GhostBHadronsFinalCount" +276  native int
                   "GhostCHadronsFinalCount" +280  native int
                   "HadronConeExclTruthLabelID" +284  native int
                   "HadronConeExclExtendedTruthLabelID" +288  native int
                   "PartonTruthLabelID" +292  native int
                   "jetPtRank"        +296  native int
                   "mcEventWeight"    +300  native float
                   "eventNumber"      +304  native long
                   "averageInteractionsPerCrossing" +312  native float
                   "actualInteractionsPerCrossing" +316  native float
                   "nPrimaryVertices" +320  native int
                   "beamSpotWeight"   +324  native float
               } 328 bytes
tracks_from_jet          Dataset {679015/Inf, 40/40}
    Location:  1:7832
    Links:     1
    Chunks:    {2048, 40} 9338880 bytes
    Storage:   3096308400 logical bytes, 661050378 allocated bytes, 468.39% utilization
    Filter-0:  deflate-1 OPT {7}
    Type:      struct {
                   "chiSquared"       +0    native float
                   "numberDoF"        +4    native float
                   "radiusOfFirstHit" +8    native float
                   "IP3D_signed_d0"   +12   native float
                   "IP2D_signed_d0"   +16   native float
                   "IP3D_signed_z0"   +20   native float
                   "theta"            +24   native float
                   "qOverP"           +28   native float
                   "numberOfInnermostPixelLayerHits" +32   native unsigned char
                   "numberOfNextToInnermostPixelLayerHits" +33   native unsigned char
                   "numberOfInnermostPixelLayerSharedHits" +34   native unsigned char
                   "numberOfInnermostPixelLayerSplitHits" +35   native unsigned char
                   "numberOfPixelHits" +36   native unsigned char
                   "numberOfPixelHoles" +37   native unsigned char
                   "numberOfPixelSharedHits" +38   native unsigned char
                   "numberOfPixelSplitHits" +39   native unsigned char
                   "numberOfSCTHits"  +40   native unsigned char
                   "numberOfSCTHoles" +41   native unsigned char
                   "numberOfSCTSharedHits" +42   native unsigned char
                   "expectNextToInnermostPixelLayerHit" +43   native unsigned char
                   "expectInnermostPixelLayerHit" +44   native unsigned char
                   "d0"               +45   native float
                   "z0SinTheta"       +49   native float
                   "d0Uncertainty"    +53   native float
                   "z0SinThetaUncertainty" +57   native float
                   "IP3D_signed_d0_significance" +61   native float
                   "IP3D_signed_z0_significance" +65   native float
                   "pt"               +69   native float
                   "eta"              +73   native float
                   "phiUncertainty"   +77   native float
                   "thetaUncertainty" +81   native float
                   "qOverPUncertainty" +85   native float
                   "deta"             +89   native float
                   "dphi"             +93   native float
                   "dr"               +97   native float
                   "ptfrac"           +101  native float
                   "z0RelativeToBeamspot" +105  native float
                   "z0RelativeToBeamspotUncertainty" +109  native float
                   "valid"            +113  enum native signed char {
                       TRUE             = 1
                       FALSE            = 0
                   }
               } 114 bytes

这是一个可以运行的原型。它的内存效率不高,但在配置较高的系统上(比如 i9、5.3GHz、24GB 显存的 NVIDIA RTX-3090 GPU 和 32GB RAM),每次迭代大约需要 30 秒。不过,在普通笔记本电脑的资源下它确实会崩溃。

import h5py
import numpy as np
import os
import glob

# Input files: every '*output.h5' matched by the pattern below.
# To process the other sample, swap the pattern for 'lrt/*output.h5'.
# NOTE(review): the pattern contains no '**', so recursive=True looks like a
# no-op here -- confirm before relying on sub-directory matches.
results = glob.glob('std/*output.h5', recursive=True)

# Output directories for the refined files.
path_std = '../ttbar/std/'
path_lrt = '../ttbar/lrt/'


def _ensure_directory(path, created_message):
    """Create *path* if it is missing and report whether it already existed.

    When the directory has to be created, *created_message* is printed
    afterwards.  Returns True if *path* existed before the call, else False.
    """
    already_there = os.path.exists(path)
    if already_there:
        return already_there
    os.makedirs(path)
    print(created_message)
    return already_there


isExist_std = _ensure_directory(path_std, "The new std-directory is created!\n")
isExist_lrt = _ensure_directory(path_lrt, "The new lrt-directory is created!\n")

def _jet_indices_failing_cuts(d0, z0, d0_max=1, z0_max=1.5):
    """Return the sorted, unique first-axis indices that fail the track cuts.

    An entry fails when ``|d0| > d0_max`` or ``|z0| > z0_max``.  A first-axis
    index (one jet) is reported as soon as any of its entries fails.

    Parameters
    ----------
    d0, z0 : array-like of identical shape whose first axis indexes jets.
    d0_max, z0_max : thresholds for the absolute values of *d0* and *z0*.

    Returns
    -------
    numpy.ndarray
        1-D integer array of jet indices to remove (empty if none fail).
    """
    failing = (np.abs(d0) > d0_max) | (np.abs(z0) > z0_max)
    # Reduce over every axis except the first so each jet gets a single flag.
    # (The previous implementation used np.argwhere(...)[0], which is the
    # first (jet, track) coordinate pair rather than the column of jet
    # indices, and raised IndexError when nothing failed.)
    per_jet = failing.any(axis=tuple(range(1, failing.ndim)))
    return np.flatnonzero(per_jet)


for f in results:
    print("Running cuts on:"+str(f)+"\n")
    with h5py.File(f, "r") as h5r:
        jets_dset = h5r['jets']
        tracks_dset = h5r['tracks_from_jet']
        # Shapes come from the dataset handles, so nothing large is loaded yet.
        print(jets_dset.shape)
        print(tracks_dset.shape)
        print(tracks_dset.shape[0])
        print(tracks_dset.shape[1])

        # Read only the two fields needed for the selection.
        indices_to_remove = _jet_indices_failing_cuts(
            tracks_dset['d0'], tracks_dset['z0SinTheta']
        )
        if indices_to_remove.size == 0:
            # Nothing fails the cuts: as before, no refined file is written.
            continue
        print(indices_to_remove)

        newfile = 'refined' + os.path.basename(f)
        completeName = os.path.join(path_std, newfile)
        #completeName = os.path.join(path_lrt, newfile)
        with h5py.File(completeName, 'w') as fwrite:
            # Load, filter and write one dataset at a time so that only one
            # full dataset (plus its filtered copy) is in memory at once.
            for name in ('jets', 'tracks_from_jet'):
                kept = np.delete(h5r[name][...], indices_to_remove, axis=0)
                fwrite.create_dataset(name, data=kept, compression='gzip', compression_opts=7)
                del kept