MNIST Shard Descriptor: IndexError: list index out of range
MNIST Shard Descriptor: IndexError: list index out of range
我正在使用英特尔 OpenFL 进行联邦学习实验。我想使用不同的非独立性场景来分发我的数据集 (MNIST)。
我正在关注他们的官方文档:https://openfl.readthedocs.io/en/latest/source/utilities/splitters_data.html
这是我原来的工作代码:
"""Mnist Shard Descriptor."""
import logging
import os
from typing import List
import numpy as np
import requests
from openfl.interface.interactive_api.shard_descriptor import ShardDataset
from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor
logger = logging.getLogger(__name__)
class MnistShardDataset(ShardDataset):
"""Mnist Shard dataset class."""
def __init__(self, x, y, data_type, rank=1, worldsize=1):
"""Initialize MNISTDataset."""
self.data_type = data_type
self.rank = rank
self.worldsize = worldsize
self.x = x[self.rank - 1::self.worldsize]
self.y = y[self.rank - 1::self.worldsize]
def __getitem__(self, index: int):
"""Return an item by the index."""
return self.x[index], self.y[index]
def __len__(self):
"""Return the len of the dataset."""
return len(self.x)
class MnistShardDescriptor(ShardDescriptor):
"""Mnist Shard descriptor class."""
def __init__(
self,
rank_worldsize: str = '1, 1',
**kwargs
):
"""Initialize MnistShardDescriptor."""
self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(','))
(x_train, y_train), (x_test, y_test) = self.download_data()
self.data_by_type = {
'train': (x_train, y_train),
'val': (x_test, y_test)
}
def get_shard_dataset_types(self) -> List[str]:
"""Get available shard dataset types."""
return list(self.data_by_type)
def get_dataset(self, dataset_type='train'):
"""Return a shard dataset by type."""
if dataset_type not in self.data_by_type:
raise Exception(f'Wrong dataset type: {dataset_type}')
return MnistShardDataset(
*self.data_by_type[dataset_type],
data_type=dataset_type,
rank=self.rank,
worldsize=self.worldsize
)
@property
def sample_shape(self):
"""Return the sample shape info."""
return ['28', '28', '1']
@property
def target_shape(self):
"""Return the target shape info."""
return ['28', '28', '1']
@property
def dataset_description(self) -> str:
"""Return the dataset description."""
return (f'Mnist dataset, shard number {self.rank}'
f' out of {self.worldsize}')
def download_data(self):
"""Download prepared dataset."""
local_file_path = 'mnist.npz'
mnist_url = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
response = requests.get(mnist_url)
with open(local_file_path, 'wb') as f:
f.write(response.content)
with np.load(local_file_path) as f:
x_train, y_train = f['x_train'], f['y_train']
x_test, y_test = f['x_test'], f['y_test']
#x_train = np.reshape(x_train, (-1, 784))
#x_test = np.reshape(x_test, (-1, 784))
os.remove(local_file_path) # remove mnist.npz
print('Mnist data was loaded!')
return (x_train, y_train), (x_test, y_test)
基本上,我以这种方式更改了我的两个联邦节点中的 MnistShardDescriptor class:
...
class MnistShardDescriptor(ShardDescriptor):
"""Mnist Shard descriptor class."""
def __init__(
self,
rank_worldsize: str = '1, 1',
**kwargs
):
"""Initialize MnistShardDescriptor."""
self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(','))
(x_train, y_train), (x_test, y_test) = self.download_data()
train_splitter = RandomNumPyDataSplitter()
test_splitter = RandomNumPyDataSplitter()
train_idx = train_splitter.split(y_train, self.worldsize)[self.rank]
test_idx = test_splitter.split(y_test, self.worldsize)[self.rank]
x_train_shard = x_train[train_idx]
x_test_shard = x_test[test_idx]
self.data_by_type = {
'train': (x_train, y_train),
'val': (x_test, y_test)
}
...
我在行 train_idx
:IndexError: list index out of range
中遇到此错误,但仅在 2 个节点之一中。我不知道为什么,因为代码在我的联邦的两个节点上完全相同。
编辑:我更改了上面编写的代码的位置,特别是我在 class MnistShardDataset 而不是 MnistShardDescriptor 中编写:
class MnistShardDataset(ShardDataset):
"""Mnist Shard dataset class."""
def __init__(self, x, y, data_type, rank=1, worldsize=1):
"""Initialize MNISTDataset."""
self.data_type = data_type
self.rank = rank
self.worldsize = worldsize
self.x = x[self.rank - 1::self.worldsize]
self.y = y[self.rank - 1::self.worldsize]
train_splitter = RandomNumPyDataSplitter()
#test_splitter = RandomNumPyDataSplitter()
train_idx = train_splitter.split(self.y, self.worldsize)[self.rank]
#test_idx = test_splitter.split(self.y, self.worldsize)[self.rank]
x_train_shard = self.x[train_idx]
#x_test_shard = self.x[test_idx]
self.x = x_train_shard
有了这个,我可以创建联盟,并且在 director 的同一个节点中,客户端开始训练,并且分裂是真正的 运行dom 因为我 运行 实验 2次,每次特使都有不同数量的样本。但是在另一个节点中(因为我使用了 2 个节点,每个节点一个)与 envoy(openFL 调用 envoy 客户端上的工作人员)我有相同的错误 Index out of 运行ge ...
EDIT2:这里是使用 openFL 的数据拆分示例:https://github.com/intel/openfl/blob/develop/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/kvasir_shard_descriptor_with_data_splitter.py
但是我的数据集不同,我没有成功采用这个解决方案。关于像 MNIST 这样的数据集的分片,你能给我推荐任何其他例子吗?要遵循的教程?
整个错误:
File "/home/lmancuso/envoymnist/mnist_shard_descriptor_with_data_splitter.py", line 61, in __init__
train_idx = train_splitter.split(y_train, self.worldsize)[self.rank]
IndexError: list index out of range
编辑:有趣的一点:如果我改变联邦的维度,将 envoy_config.yaml 中的 rank_worldsize 从 2 增加到 3,训练开始(并且数据集分为 运行dom方式,所以它有效,因为每个节点都有不同数量的样本)。但是,它之所以有效,是因为我有 2 个节点,但我创建了一个没有 3 节点的 3 联合。实际上,一个节点的样本是 8064,另一个节点的样本是 9856。然而考虑到我在 MNIST 中有 60000 个训练样本,所有剩余的样本都丢失了,因为它们应该在最后一个节点(不存在)。
到目前为止我找到的唯一解决方案是降低每个特使的等级:
train_idx = train_splitter.split(self.y, self.worldsize)[self.rank-1]
我正在使用英特尔 OpenFL 进行联邦学习实验。我想使用不同的非独立性场景来分发我的数据集 (MNIST)。 我正在关注他们的官方文档:https://openfl.readthedocs.io/en/latest/source/utilities/splitters_data.html
这是我原来的工作代码:
"""Mnist Shard Descriptor."""
import logging
import os
from typing import List
import numpy as np
import requests
from openfl.interface.interactive_api.shard_descriptor import ShardDataset
from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor
logger = logging.getLogger(__name__)
class MnistShardDataset(ShardDataset):
"""Mnist Shard dataset class."""
def __init__(self, x, y, data_type, rank=1, worldsize=1):
"""Initialize MNISTDataset."""
self.data_type = data_type
self.rank = rank
self.worldsize = worldsize
self.x = x[self.rank - 1::self.worldsize]
self.y = y[self.rank - 1::self.worldsize]
def __getitem__(self, index: int):
"""Return an item by the index."""
return self.x[index], self.y[index]
def __len__(self):
"""Return the len of the dataset."""
return len(self.x)
class MnistShardDescriptor(ShardDescriptor):
"""Mnist Shard descriptor class."""
def __init__(
self,
rank_worldsize: str = '1, 1',
**kwargs
):
"""Initialize MnistShardDescriptor."""
self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(','))
(x_train, y_train), (x_test, y_test) = self.download_data()
self.data_by_type = {
'train': (x_train, y_train),
'val': (x_test, y_test)
}
def get_shard_dataset_types(self) -> List[str]:
"""Get available shard dataset types."""
return list(self.data_by_type)
def get_dataset(self, dataset_type='train'):
"""Return a shard dataset by type."""
if dataset_type not in self.data_by_type:
raise Exception(f'Wrong dataset type: {dataset_type}')
return MnistShardDataset(
*self.data_by_type[dataset_type],
data_type=dataset_type,
rank=self.rank,
worldsize=self.worldsize
)
@property
def sample_shape(self):
"""Return the sample shape info."""
return ['28', '28', '1']
@property
def target_shape(self):
"""Return the target shape info."""
return ['28', '28', '1']
@property
def dataset_description(self) -> str:
"""Return the dataset description."""
return (f'Mnist dataset, shard number {self.rank}'
f' out of {self.worldsize}')
def download_data(self):
"""Download prepared dataset."""
local_file_path = 'mnist.npz'
mnist_url = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
response = requests.get(mnist_url)
with open(local_file_path, 'wb') as f:
f.write(response.content)
with np.load(local_file_path) as f:
x_train, y_train = f['x_train'], f['y_train']
x_test, y_test = f['x_test'], f['y_test']
#x_train = np.reshape(x_train, (-1, 784))
#x_test = np.reshape(x_test, (-1, 784))
os.remove(local_file_path) # remove mnist.npz
print('Mnist data was loaded!')
return (x_train, y_train), (x_test, y_test)
基本上,我以这种方式更改了我的两个联邦节点中的 MnistShardDescriptor class:
...
class MnistShardDescriptor(ShardDescriptor):
"""Mnist Shard descriptor class."""
def __init__(
self,
rank_worldsize: str = '1, 1',
**kwargs
):
"""Initialize MnistShardDescriptor."""
self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(','))
(x_train, y_train), (x_test, y_test) = self.download_data()
train_splitter = RandomNumPyDataSplitter()
test_splitter = RandomNumPyDataSplitter()
train_idx = train_splitter.split(y_train, self.worldsize)[self.rank]
test_idx = test_splitter.split(y_test, self.worldsize)[self.rank]
x_train_shard = x_train[train_idx]
x_test_shard = x_test[test_idx]
self.data_by_type = {
'train': (x_train, y_train),
'val': (x_test, y_test)
}
...
我在行 train_idx
:IndexError: list index out of range
中遇到此错误,但仅在 2 个节点之一中。我不知道为什么,因为代码在我的联邦的两个节点上完全相同。
编辑:我更改了上面编写的代码的位置,特别是我在 class MnistShardDataset 而不是 MnistShardDescriptor 中编写:
class MnistShardDataset(ShardDataset):
"""Mnist Shard dataset class."""
def __init__(self, x, y, data_type, rank=1, worldsize=1):
"""Initialize MNISTDataset."""
self.data_type = data_type
self.rank = rank
self.worldsize = worldsize
self.x = x[self.rank - 1::self.worldsize]
self.y = y[self.rank - 1::self.worldsize]
train_splitter = RandomNumPyDataSplitter()
#test_splitter = RandomNumPyDataSplitter()
train_idx = train_splitter.split(self.y, self.worldsize)[self.rank]
#test_idx = test_splitter.split(self.y, self.worldsize)[self.rank]
x_train_shard = self.x[train_idx]
#x_test_shard = self.x[test_idx]
self.x = x_train_shard
有了这个,我可以创建联盟,并且在 director 的同一个节点中,客户端开始训练,并且分裂是真正的 运行dom 因为我 运行 实验 2次,每次特使都有不同数量的样本。但是在另一个节点中(因为我使用了 2 个节点,每个节点一个)与 envoy(openFL 调用 envoy 客户端上的工作人员)我有相同的错误 Index out of 运行ge ...
EDIT2:这里是使用 openFL 的数据拆分示例:https://github.com/intel/openfl/blob/develop/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/kvasir_shard_descriptor_with_data_splitter.py
但是我的数据集不同,我没有成功采用这个解决方案。关于像 MNIST 这样的数据集的分片,你能给我推荐任何其他例子吗?要遵循的教程?
整个错误:
File "/home/lmancuso/envoymnist/mnist_shard_descriptor_with_data_splitter.py", line 61, in __init__
train_idx = train_splitter.split(y_train, self.worldsize)[self.rank]
IndexError: list index out of range
编辑:有趣的一点:如果我改变联邦的维度,将 envoy_config.yaml 中的 rank_worldsize 从 2 增加到 3,训练开始(并且数据集分为 运行dom方式,所以它有效,因为每个节点都有不同数量的样本)。但是,它之所以有效,是因为我有 2 个节点,但我创建了一个没有 3 节点的 3 联合。实际上,一个节点的样本是 8064,另一个节点的样本是 9856。然而考虑到我在 MNIST 中有 60000 个训练样本,所有剩余的样本都丢失了,因为它们应该在最后一个节点(不存在)。
到目前为止我找到的唯一解决方案是降低每个特使的等级:
train_idx = train_splitter.split(self.y, self.worldsize)[self.rank-1]