Python: Generate a unique batch from given dataset

I am applying a CNN to classify a given dataset.

我的函数:

import cv2

def batch_generator(dataset, input_shape = (256, 256), batch_size = 32):
    dataset_images = []
    dataset_labels = []
    for i in range(0, len(dataset)):
        dataset_images.append(cv2.resize(cv2.imread(dataset[i], cv2.IMREAD_COLOR), 
                     input_shape, interpolation = cv2.INTER_AREA))
        # `labels` is a global dict mapping a class-folder name to its integer label
        dataset_labels.append(labels[dataset[i].split('/')[-2]])
    return dataset_images, dataset_labels

This function is supposed to be called every epoch, and it should return a unique batch of size 'batch_size', consisting of dataset_images (each image being 256x256) and the corresponding dataset_labels from the labels dictionary.

The input 'dataset' contains the paths to all the images, so I open them and resize them to 256x256. Can someone help me extend this code so that it returns the desired batches?
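To make the goal concrete, here is a rough sketch of the behaviour I have in mind: shuffle the paths once per epoch, then hand out consecutive slices so that every batch within the epoch is unique. This is only a sketch, not working code; `unique_batch_generator` is a hypothetical name, and it assumes `labels` is the dict described above.

import random
import cv2

def unique_batch_generator(dataset, labels, input_shape=(256, 256), batch_size=32):
    # Shuffle the image paths once, then yield consecutive slices,
    # so every batch within an epoch is unique and each image appears once.
    paths = list(dataset)
    random.shuffle(paths)
    for start in range(0, len(paths), batch_size):
        batch_paths = paths[start:start + batch_size]
        images = [cv2.resize(cv2.imread(p, cv2.IMREAD_COLOR),
                             input_shape, interpolation=cv2.INTER_AREA)
                  for p in batch_paths]
        batch_labels = [labels[p.split('/')[-2]] for p in batch_paths]
        yield images, batch_labels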

As @jodag suggested, using DataLoaders is a good idea.

I have a snippet that I use for some of my CNNs in PyTorch:

from torch.utils.data import Dataset, DataLoader
import torch
class Data(Dataset):
    """
    Constructs a Dataset to be parsed into a DataLoader
    """
    def __init__(self,X,y):
        X = torch.from_numpy(X).float()

        #Transpose to fit dimensions of my network
        X = torch.transpose(X,1,2)

        y = torch.from_numpy(y).float()
        self.X,self.y = X,y

    def __getitem__(self, i):
        return self.X[i],self.y[i]

    def __len__(self):
        return self.X.shape[0]

def create_data_loader(X,y,batch_size,**kwargs):
    """
    Creates a data-loader for the data X and y

    params:
    -------

    X: np.array
        - numpy array of size "n" x "k", where "n" is the number of samples and "k" is the number of features

    y: np.array
        - numpy array of size "n"

    batch_size: int
        - the batch size handed to the DataLoader

    kwargs:
        - Additional keyword-arguments for "DataLoader"

    return
    ------

    dl: torch.utils.data.DataLoader object
    """

    data = Data(X, y)

    dl = DataLoader(data, batch_size=batch_size, num_workers=0, **kwargs)
    return dl

It is used like this:

from create_data_loader import create_data_loader

train_data_loader = create_data_loader(X_train, y_train, batch_size=32, shuffle=True)  # DataLoader defaults to shuffle=False, so pass shuffle=True for training
val_data_loader = create_data_loader(X_val, y_val, batch_size=32, shuffle=False)  # keep the indexes in the same order, e.g. for cross-validation


for x_train, y_train in train_data_loader:
    logit = net(x_train)  # only the data is passed to the network, not the labels
    .
    .
    net.eval()
    for x_val, y_val in val_data_loader:
        logit = net(x_val)
        classes_pred = logit.argmax(dim=1)
        # cast the boolean tensor to float before taking the mean
        print(f"Val accuracy: {(y_val == classes_pred).float().mean()}")

PyTorch has two abstractions for loading data that sound similar but are quite different. I highly recommend reading the documentation on data loaders here. To summarize:

  1. A Dataset is an object that you usually implement yourself and that returns an individual sample (data + label).
  2. A DataLoader is a built-in PyTorch class that samples batches of samples from a Dataset (potentially in parallel).

A (map-style) Dataset is a simple object that just implements two mandatory methods: __getitem__ and __len__. __getitem__ is the method that is called on your object when you use the square-bracket operator, i.e. dataset[i], and __len__ is the method that is invoked when you use the Python built-in len on your object, i.e. len(dataset).
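As a quick illustration of those two methods (a toy example, not from the original code):

from torch.utils.data import Dataset

class Squares(Dataset):
    # toy map-style dataset: sample i is the pair (i, i**2)
    def __getitem__(self, i):
        return i, i ** 2

    def __len__(self):
        return 10

ds = Squares()
print(ds[3])    # square brackets dispatch to __getitem__ -> (3, 9)
print(len(ds))  # len() dispatches to __len__ -> 10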

For PyTorch you usually want __getitem__ to return a tuple containing both the data and the label for a single item in your dataset. For example, based on what you provided, something like this should suit your needs:

from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as F
import cv2

class CustomDataset(Dataset):
    def __init__(self, image_paths, labels, input_shape=(256, 256)):
        # `image_paths` is what you called `dataset` in your example.
        #               I'm assuming this is a list of image paths.
        # `labels` isn't defined in your script, but I assume it's a
        #          dict that maps image names to an integer label
        #          between 0 and num classes minus 1
        self.image_paths = image_paths
        self.labels = labels
        self.input_shape = input_shape

    def __getitem__(self, index):
        # return the data and label for the specified index
        image_path = self.image_paths[index]
        data = cv2.resize(cv2.imread(image_path, cv2.IMREAD_COLOR), 
                          self.input_shape, interpolation = cv2.INTER_AREA)
        label = self.labels[image_path.split('/')[-2]]

        # convert data to PyTorch tensor
        # This converts data from a uint8 np.array of shape HxWxC
        # between 0 and 255 to a pytorch float32 tensor of shape CxHxW
        # between 0.0 and 1.0.
        data = F.to_tensor(data)

        return data, label

    def __len__(self):
        return len(self.image_paths)

...
# using what you call "dataset" and "labels"
# num_workers > 0 allows you to load data in parallel while network is running
dataloader = DataLoader(
    CustomDataset(dataset, labels, (256, 256)),
    batch_size=32,
    shuffle=True,    # shuffle tells us to randomly sample the
                     # dataset without replacement
    num_workers=4    # num workers is the number of worker processes
                     # that load from dataset in parallel while your
                     # model is processing stuff
)

# training loop
for epoch in range(num_epochs):
    # iterates over all data in your dataset in a random order
    # in batches of size 32 each time this loop is run
    for data_batch, label_batch in dataloader:
        # data_batch is a pytorch FloatTensor of shape 32x3x256x256
        # label_batch is a pytorch LongTensor of shape 32

        # if using GPU acceleration now is the time to move data_batch and label_batch to GPU
        # data_batch = data_batch.cuda()
        # label_batch = label_batch.cuda()

        # zero the gradients, pass data through your model, backprop, and step the optimizer
        ...
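For completeness, the elided update step is the standard PyTorch pattern. Below is a minimal sketch, assuming a classification loss and an optimizer have been set up; `net`, `criterion`, and `optimizer` are placeholder names, not part of the answer above.

import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()                 # assumes integer class labels
optimizer = optim.SGD(net.parameters(), lr=0.01)  # any optimizer works here

for epoch in range(num_epochs):
    for data_batch, label_batch in dataloader:
        optimizer.zero_grad()                  # zero the gradients
        logits = net(data_batch)               # forward pass
        loss = criterion(logits, label_batch)  # compute the loss
        loss.backward()                        # backprop
        optimizer.step()                       # update the weights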