Keras/HDF5 TypeError: PointSelection __getitem__ only works with bool arrays

When I try to load training data from an HDF5 file with fit_generator and a generator function, I get a ValueError that is triggered by a TypeError raised inside h5py's PointSelection:

Epoch 1/10
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 763, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 429, in data_generator_task
    generator_output = next(self._generator)
  File "osr.py", line 108, in generate_training_sequences
    X = training_save_file["X"][batch_idxs]
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/dataset.py", line 462, in __getitem__
    selection = sel.select(self.shape, args, dsid=self.id)
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/selections.py", line 72, in select
    sel[arg]
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/selections.py", line 210, in __getitem__
    raise TypeError("PointSelection __getitem__ only works with bool arrays")
TypeError: PointSelection __getitem__ only works with bool arrays

Traceback (most recent call last):
  File "osr.py", line 359, in <module>
    osr.train_osr_model()
  File "osr.py", line 89, in train_osr_model
    nb_worker=1)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1532, in fit_generator
    str(generator_output))
ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None

I researched this error, and some people mention that it can be caused by duplicate indices, but that does not appear to be the case here. These are the row indices being accessed:

[581 305  67 510 631 832 340 663 689 801 579 701 831 879 382 844  15 798
 342 329 118 657 503 129 602   2 528 157 341 299 731 539]
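
As a quick sanity check (a rough sketch using NumPy on the batch printed above), these indices are indeed all unique:

    import numpy as np

    # the batch of row indices printed above
    batch_idxs = np.array([581, 305,  67, 510, 631, 832, 340, 663, 689, 801,
                           579, 701, 831, 879, 382, 844,  15, 798, 342, 329,
                           118, 657, 503, 129, 602,   2, 528, 157, 341, 299,
                           731, 539])

    # np.unique drops duplicates, so equal lengths mean every index is unique
    print len(np.unique(batch_idxs)) == len(batch_idxs)   # prints True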

Here are the training and generator functions:

    def train_osr_model(self):
        """ Train the optical speech recognizer
        """
        print "\nTraining OSR"
        validation_ratio = 0.3
        batch_size = 32
        with h5py.File(self.training_save_fn, "r") as training_save_file:
            sample_count = int(training_save_file.attrs["sample_count"])
            sample_idxs = range(0, sample_count)
            sample_idxs = np.random.permutation(sample_idxs)
            training_sample_idxs = sample_idxs[0:int((1-validation_ratio)*sample_count)]
            validation_sample_idxs = sample_idxs[int((1-validation_ratio)*sample_count):]
            training_sequence_generator = self.generate_training_sequences(batch_size=batch_size, 
                                                                           training_save_file=training_save_file,
                                                                           training_sample_idxs=training_sample_idxs)
            validation_sequence_generator = self.generate_validation_sequences(batch_size=batch_size, 
                                                                               training_save_file=training_save_file,
                                                                               validation_sample_idxs=validation_sample_idxs)

            print "Sample Idxs: {0}\n".format(sample_idxs) # FOR DEBUG ONLY
            print "Training Idxs: {0}\n".format(training_sample_idxs) # FOR DEBUG ONLY
            print "Validation Idxs: {0}\n".format(validation_sample_idxs) # FOR DEBUG ONLY

            pbi = ProgressDisplay()
            self.osr.fit_generator(generator=training_sequence_generator,
                                   validation_data=validation_sequence_generator,
                                   samples_per_epoch=len(training_sample_idxs),
                                   nb_val_samples=len(validation_sample_idxs),
                                   nb_epoch=10,
                                   max_q_size=1,
                                   verbose=2,
                                   callbacks=[pbi],
                                   class_weight=None,
                                   nb_worker=1)

    def generate_training_sequences(self, batch_size, training_save_file, training_sample_idxs):
        """ Generates training sequences from HDF5 file on demand
        """
        while True:
            # generate sequences for training
            training_sample_count = len(training_sample_idxs)
            batches = int(training_sample_count/batch_size)
            remainder_samples = training_sample_count%batch_size
            if remainder_samples:
                batches = batches + 1
            # generate batches of samples
            for idx in xrange(0, batches):
                if idx == batches - 1:
                    batch_idxs = training_sample_idxs[idx*batch_size:]
                else:
                    batch_idxs = training_sample_idxs[idx*batch_size:idx*batch_size+batch_size]

                print batch_idxs # FOR DEBUG ONLY

                X = training_save_file["X"][batch_idxs]
                Y = training_save_file["Y"][batch_idxs]

                yield (np.array(X), np.array(Y))

    def generate_validation_sequences(self, batch_size, training_save_file, validation_sample_idxs):
        while True:
            # generate sequences for validation
            validation_sample_count = len(validation_sample_idxs)
            batches = int(validation_sample_count/batch_size)
            remainder_samples = validation_sample_count%batch_size
            if remainder_samples:
                batches = batches + 1
            # generate batches of samples
            for idx in xrange(0, batches):
                if idx == batches - 1:
                    batch_idxs = validation_sample_idxs[idx*batch_size:]
                else:
                    batch_idxs = validation_sample_idxs[idx*batch_size:idx*batch_size+batch_size]

                print batch_idxs # FOR DEBUG ONLY

                X = training_save_file["X"][batch_idxs]
                Y = training_save_file["Y"][batch_idxs]

                yield (np.array(X), np.array(Y))

Here are the functions that preprocess the training data and save it into an HDF5 file:

def process_training_data(self):
    """ Preprocesses training data and saves them into an HDF5 file
    """
    # load training metadata from config file
    training_metadata = {}
    training_classes = []
    with open(self.config_file) as training_config:
        training_metadata = json.load(training_config)
        training_classes = sorted(list(training_metadata.keys()))

        print "".join(["\n",
                       "Found {0} training classes!\n".format(len(training_classes)),
                       "-"*40])
        for class_label, training_class in enumerate(training_classes):
            print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class])
        print ""

    # count number of samples
    sample_count = 0
    sample_count_by_class = [0]*len(training_classes)
    for class_label, training_class in enumerate(training_classes):
        # get training class sequence paths
        training_class_data_path = training_metadata[training_class]
        training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                         for file_name in os.listdir(training_class_data_path)
                                         if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                             and ".mov" in file_name)]
        # update sample count
        sample_count += len(training_class_sequence_paths)
        sample_count_by_class[class_label] = len(training_class_sequence_paths)

    print "".join(["\n",
                   "Found {0} training samples!\n".format(sample_count),
                   "-"*40])
    for class_label, training_class in enumerate(training_classes):
        print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label])
    print ""

    # initialize HDF5 save file, but clear older duplicate first if it exists
    try:
        print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(self.training_save_fn)
        os.remove(self.training_save_fn)
    except OSError:
        pass

    # process and save training data into HDF5 file
    print "Generating {0} samples from {1} samples via data augmentation\n".format(sample_count*self.samples_generated_per_sample,
                                                                                   sample_count)
    sample_count = sample_count*self.samples_generated_per_sample
    with h5py.File(self.training_save_fn, "w") as training_save_file:
        training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes))
        training_save_file.attrs["sample_count"] = sample_count
        x_training_dataset = training_save_file.create_dataset("X", 
                                                               shape=(sample_count, self.frames_per_sequence, 3, self.rows, self.columns),
                                                               dtype="f")
        y_training_dataset = training_save_file.create_dataset("Y",
                                                               shape=(sample_count, len(training_classes)),
                                                               dtype="i")

        # iterate through each class data
        sample_idx = 0
        for class_label, training_class in enumerate(training_classes):
            # get training class sequence paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # iterate through each sequence
            for idx, training_class_sequence_path in enumerate(training_class_sequence_paths):
                sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r"
                                 .format(training_class, idx+1, len(training_class_sequence_paths)))
                sys.stdout.flush()

                # accumulate samples and labels
                samples_batch = self.process_frames(training_class_sequence_path)
                label = [0]*len(training_classes)
                label[class_label] = 1
                label = np.array(label).astype("int32")

                for sample in samples_batch:
                    x_training_dataset[sample_idx] = sample
                    y_training_dataset[sample_idx] = label

                    # update sample index
                    sample_idx += 1

            print "\n"

        training_save_file.close()

        print "Training data processed and saved to {0}".format(self.training_save_fn)

def process_frames(self, video_file_path):
    """ Preprocesses sequence frames
    """
    # haar cascades for localizing oral region
    face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
    mouth_cascade = cv2.CascadeClassifier('haarcascade_mcs_mouth.xml')

    video = cv2.VideoCapture(video_file_path)
    success, frame = video.read()

    frames = []
    success = True

    # convert to grayscale, localize oral region, equalize frame dimensions, and accumulate valid frames 
    while success:
      success, frame = video.read()
      if success:
        # convert to grayscale
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # localize single facial region
        faces_coords = face_cascade.detectMultiScale(frame, 1.3, 5)
        if len(faces_coords) == 1:
          face_x, face_y, face_w, face_h = faces_coords[0]
          frame = frame[face_y:face_y + face_h, face_x:face_x + face_w]

          # localize oral region
          mouth_coords = mouth_cascade.detectMultiScale(frame, 1.3, 5)
          threshold = 0
          for (mouth_x, mouth_y, mouth_w, mouth_h) in mouth_coords:
            if (mouth_y > threshold):
                threshold = mouth_y
                valid_mouth_coords = (mouth_x, mouth_y, mouth_w, mouth_h)
            else:
                pass
          mouth_x, mouth_y, mouth_w, mouth_h = valid_mouth_coords
          frame = frame[mouth_y:mouth_y + mouth_h, mouth_x:mouth_x + mouth_w]

          # equalize frame dimensions
          frame = cv2.resize(frame, (self.columns, self.rows)).astype('float32')

          # accumulate frames
          frames.append(frame)

        # ignore multiple facial region detections
        else:
            pass

    # equalize sequence lengths 
    if len(frames) < self.frames_per_sequence:
        frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames
    frames = np.array(frames[-self.frames_per_sequence:])

    # function to normalize and add channel dimension to each frame
    proc_frame = lambda frame: np.array([frame / 255.0]*3)

    samples_batch = [np.array(map(proc_frame, frames))]
    # random transformations for data augmentation
    for _ in xrange(0, self.samples_generated_per_sample-1):
        rotated_frames = random_rotation(frames, rg=4.5)
        shifted_frames = random_shift(rotated_frames, wrg=0.05, hrg=0.05)
        sheared_frames = random_shear(shifted_frames, intensity=0.08)
        zoomed_frames = random_zoom(sheared_frames, zoom_range=(1.05, 1.05))
        samples_batch.append(np.array(map(proc_frame, zoomed_frames)))

    return samples_batch

The error comes from two things:

  • The one you are seeing is raised because batch_idxs is a NumPy array, not a list. h5py objects accept list indexing. But even if you change the line to

    X = training_save_file["X"][list(batch_idxs)]
    

    you will still get an error. It comes from some restrictions on list indexing, which brings us to the second point.

  • If you read the doc you sent me, it says:

    The following restrictions exist:

    • List selections may not be empty
    • Selection coordinates must be given in increasing order
    • Duplicate selections are ignored
    • Very long lists (> 1000 elements) may produce poor performance

    The second bullet is our problem: the random shuffling you do when creating training_sample_idxs puts the indices in random order, while the dataset expects them in increasing order. This is a restriction you have to live with, but it is not very limiting, because the order inside a batch does not matter; the model optimizes over the whole batch anyway. A minimal workaround is sketched below.
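
For example, a minimal sketch of that workaround inside generate_training_sequences (the name sorted_batch_idxs is mine) sorts each batch of indices and converts the NumPy array to a plain Python list before reading from the file:

    import numpy as np

    # inside generate_training_sequences, right after batch_idxs has been
    # sliced from training_sample_idxs for the current batch:

    # h5py fancy indexing wants unique indices in increasing order, given as
    # a list, so sort the shuffled batch and convert the array to a list
    sorted_batch_idxs = np.sort(batch_idxs).tolist()

    X = training_save_file["X"][sorted_batch_idxs]
    Y = training_save_file["Y"][sorted_batch_idxs]

    yield (np.array(X), np.array(Y))

Because Y is read with the same sorted list, each sample still lines up with its label; only the order of samples inside the batch changes, which, as said above, does not affect training.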

Hope this helps!