Python: Generate a unique batch from given dataset
I am applying a CNN to classify a given dataset.
My function:
def batch_generator(dataset, input_shape=(256, 256), batch_size=32):
    dataset_images = []
    dataset_labels = []
    for i in range(0, len(dataset)):
        dataset_images.append(cv2.resize(cv2.imread(dataset[i], cv2.IMREAD_COLOR),
                                         input_shape, interpolation=cv2.INTER_AREA))
        dataset_labels.append(labels[dataset[i].split('/')[-2]])
    return dataset_images, dataset_labels
This function should be called every epoch, and it should return a unique batch of size 'batch_size' containing dataset_images (each image being 256x256) and the corresponding dataset_labels from the labels dictionary.
The input 'dataset' contains the paths to all the images, so I open them and resize them to 256x256. Can someone help me add to this code so that it returns the desired batches?
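For reference, here is a minimal plain-Python sketch of such a generator, assuming (as in the code above) that `labels` is a dict mapping each image's parent folder name to an integer class. It reshuffles the paths once per epoch, so every batch within an epoch is unique:

import random
import cv2

def batch_generator(dataset, input_shape=(256, 256), batch_size=32):
    # Shuffle a copy of the paths so each epoch sees a new order
    paths = list(dataset)
    random.shuffle(paths)
    for start in range(0, len(paths), batch_size):
        batch_paths = paths[start:start + batch_size]
        batch_images = [cv2.resize(cv2.imread(p, cv2.IMREAD_COLOR),
                                   input_shape, interpolation=cv2.INTER_AREA)
                        for p in batch_paths]
        # `labels` is the global dict from the question
        batch_labels = [labels[p.split('/')[-2]] for p in batch_paths]
        yield batch_images, batch_labels  # one unique batch per iteration

That said, the DataLoader-based approaches in the answers below are the more idiomatic way to do this in PyTorch.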
As @jodag suggested, using DataLoaders is a good idea.
Here is a snippet I use for some of my CNNs in PyTorch:
from torch.utils.data import Dataset, DataLoader
import torch

class Data(Dataset):
    """
    Constructs a Dataset to be parsed into a DataLoader
    """
    def __init__(self, X, y):
        X = torch.from_numpy(X).float()
        # Transpose to fit the dimensions of my network
        X = torch.transpose(X, 1, 2)
        y = torch.from_numpy(y).float()
        self.X, self.y = X, y

    def __getitem__(self, i):
        return self.X[i], self.y[i]

    def __len__(self):
        return self.X.shape[0]
def create_data_loader(X, y, batch_size, **kwargs):
    """
    Creates a data loader for the data X and y

    params:
    -------
    X: np.array
        - numpy array of size "n" x "k", where "n" is the number of
          samples and "k" is the number of features
    y: np.array
        - numpy array of size "n"
    batch_size: int
        - number of samples per batch
    kwargs:
        - additional keyword arguments forwarded to "DataLoader"

    return
    ------
    dl: torch.utils.data.DataLoader object
    """
    data = Data(X, y)
    dl = DataLoader(data, batch_size=batch_size, num_workers=0, **kwargs)
    return dl
It is used like this:
from create_data_loader import create_data_loader

train_data_loader = create_data_loader(X_train, y_train, batch_size=32, shuffle=True)  # Note: DataLoader defaults to shuffle=False, so pass shuffle=True for training
val_data_loader = create_data_loader(X_val, y_val, batch_size=32, shuffle=False)  # keep indices in the same order, e.g. for cross-validation
for x_train, y_train in train_data_loader:
    logit = net(x_train)  # the forward pass only takes the inputs
    ...

net.eval()
for x_val, y_val in val_data_loader:
    logit = net(x_val)
    classes_pred = logit.argmax(axis=1)
    print(f"Val accuracy: {(y_val == classes_pred).float().mean()}")
PyTorch has two abstractions for loading data that sound similar but are quite different. I highly recommend reading the documentation on dataloaders here. To summarize:
- A Dataset is an object you generally implement yourself which returns an individual sample (data + label).
- A DataLoader is a built-in class in PyTorch that samples batches of samples from a Dataset (potentially in parallel).
A (map-style) Dataset is a simple object that just implements two mandatory methods: __getitem__ and __len__. __getitem__ is the method invoked on the object when you use the square-bracket operator, i.e. dataset[i], and __len__ is the method invoked when you use the Python built-in len on your object, i.e. len(dataset).
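As a quick illustration of that protocol (a hypothetical ToySquares class, not part of the original answer):

from torch.utils.data import Dataset

class ToySquares(Dataset):
    def __init__(self, n):
        self.n = n
    def __getitem__(self, i):
        return i, i * i   # a (data, label) tuple
    def __len__(self):
        return self.n

ds = ToySquares(5)
print(ds[3])     # calls __getitem__ -> (3, 9)
print(len(ds))   # calls __len__ -> 5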
For PyTorch you usually want __getitem__ to return a tuple containing both the data and the label for a single item in your dataset. For example, based on what you provided, something like this should suit your needs:
import cv2
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as F

class CustomDataset(Dataset):
    def __init__(self, image_paths, labels, input_shape=(256, 256)):
        # `image_paths` is what you called `dataset` in your example.
        # I assume this is a list of image paths.
        # `labels` isn't defined in your script but I assume it's a
        # dict that maps each image's parent folder name to an integer
        # label between 0 and num classes minus 1.
        self.image_paths = image_paths
        self.labels = labels
        self.input_shape = input_shape

    def __getitem__(self, index):
        # return the data and label for the specified index
        image_path = self.image_paths[index]
        data = cv2.resize(cv2.imread(image_path, cv2.IMREAD_COLOR),
                          self.input_shape, interpolation=cv2.INTER_AREA)
        label = self.labels[image_path.split('/')[-2]]

        # convert data to a PyTorch tensor.
        # This converts data from a uint8 np.array of shape HxWxC
        # between 0 and 255 to a pytorch float32 tensor of shape CxHxW
        # between 0.0 and 1.0.
        data = F.to_tensor(data)

        return data, label

    def __len__(self):
        return len(self.image_paths)
...
# using what you call "dataset" and "labels"
# num_workers > 0 allows you to load data in parallel while the network is running
dataloader = DataLoader(
    CustomDataset(dataset, labels, (256, 256)),
    batch_size=32,
    shuffle=True,   # shuffle tells the loader to randomly sample the
                    # dataset without replacement
    num_workers=4   # num_workers is the number of worker processes
                    # that load from the dataset in parallel while your
                    # model is processing stuff
)
# training loop
for epoch in range(num_epochs):
    # iterates over all data in your dataset in a random order
    # in batches of size 32 each time this loop is run
    for data_batch, label_batch in dataloader:
        # data_batch is a pytorch FloatTensor of shape 32x3x256x256
        # label_batch is a pytorch LongTensor of shape 32
        # if using GPU acceleration, now is the time to move data_batch and label_batch to the GPU
        # data_batch = data_batch.cuda()
        # label_batch = label_batch.cuda()

        # zero the gradients, pass data through your model, backprop, and step the optimizer
        ...
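For completeness, a sketch of what that inner step typically looks like. The `net`, `criterion`, and `optimizer` objects are assumptions here; they are not defined in the original answer:

import torch.nn as nn
import torch.optim as optim

# Hypothetical loss and optimizer; substitute your own network `net`
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(num_epochs):
    for data_batch, label_batch in dataloader:
        optimizer.zero_grad()                  # zero the gradients
        logits = net(data_batch)               # forward pass
        loss = criterion(logits, label_batch)  # compute the loss
        loss.backward()                        # backprop
        optimizer.step()                       # update the weights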