编写性能与数组 foreach 相当的 IEnumerator
Writing an IEnumerator with performance comparable to array foreach
要向自定义集合添加 foreach
支持,您需要实施 IEnumerable
。然而,数组的特殊之处在于它们本质上编译为基于范围的 for 循环,这比使用 IEnumerable 快 多 。一个简单的基准确认:
number of elements: 20,000,000
byte[]: 6.860ms
byte[] as IEnumerable<byte>: 89.444ms
CustomCollection.IEnumerator<byte>: 89.667ms
基准:
private byte[] byteArray = new byte[20000000];
private CustomCollection<byte> collection = new CustomCollection<T>( 20000000 );
[Benchmark]
public void enumerateByteArray()
{
var counter = 0;
foreach( var item in byteArray )
counter += item;
}
[Benchmark]
public void enumerateByteArrayAsIEnumerable()
{
var counter = 0;
var casted = (IEnumerable<byte>) byteArray;
foreach( var item in casted )
counter += item;
}
[Benchmark]
public void enumerateCollection()
{
var counter = 0;
foreach( var item in collection )
counter += item;
}
和实施:
public class CustomCollectionEnumerator : IEnumerable<T> where T : unmanaged
{
private CustomCollection<T> _collection;
private int _index;
private int _endIndex;
public CustomCollectionEnumerator( CustomCollection<T> collection )
{
_collection = collection;
_index = -1;
_endIndex = collection.Length;
}
public bool MoveNext()
{
if ( _index < _endIndex )
{
_index++;
return ( _index < _endIndex );
}
return false;
}
public T Current => _collection[ _index ];
object IEnumerator.Current => _collection[ _index ];
public void Reset() { _index = -1; }
public void Dispose() { }
}
public class CustomCollection<T> : IEnumerable<T> where T : unmanaged
{
private T* _ptr;
public int Length { get; private set; }
public T this[ int index ]
{
[MethodImpl( MethodImplOptions.AggressiveInlining )]
get => *_ptr[ index ];
[MethodImpl( MethodImplOptions.AggressiveInlining )]
set => *_ptr[ index ] = value;
}
public IEnumerator<T> GetEnumerator()
{
return new CustomCollectionEnumerator<T>( this );
}
}
因为数组得到编译器的特殊处理,所以它们将 IEnumerable
集合抛在脑后。由于 C# 非常注重类型安全,我可以理解为什么会这样,但它仍然会产生大量的开销,尤其是对于我的自定义集合,它在 中以完全相同的方式 进行枚举就像数组一样。事实上,我的自定义集合比基于范围的 for 循环中的字节数组更快,因为它使用指针算法来跳过 CLR 的数组范围检查。
所以我的问题是:有没有一种方法可以自定义 foreach
循环的行为,以便我可以获得与数组相当的性能? 也许通过编译器内在函数还是用 IL 手动编译委托?
当然,我总是可以只使用基于范围的 for 循环。 我很好奇是否有任何可能的方式来自定义 foreach
循环的低级行为,类似于编译器处理数组的方式。
类型实际上不需要实现 IEnumerable
/IEnumerable<T>
即可在 foreach
语句中使用。 foreach
语句是鸭子类型的,这意味着编译器首先查找具有正确签名(GetEnumerator()
、MoveNext()
和 Current
)的 public 方法,而不管它们是否是这些接口的实现,并且只在必要时回退到接口。
这为一些优化打开了大门,这些优化可以在紧密循环中产生显着差异:GetEnumerator()
可以 return 一个具体类型而不是 IEnumerator<T>
,然后允许 foreach
使用非虚拟和潜在内联调用构建的循环,以及使枚举器成为 struct
以避免 GC 开销。 List<T>
等某些框架集合也利用了这一点。
结合其他一些优化,这个基于您的 CustomCollection
的枚举器非常接近微基准测试中的原始数组循环:
public Enumerator GetEnumerator() => new Enumerator(this);
// Being a ref struct makes it less likely to mess up the pointer usage,
// but doesn't affect the foreach loop
// There is no technical reason why this couldn't implement IEnumerator
// as long as lifetime issues are considered
public unsafe ref struct Enumerator
{
// Storing the pointer directly instead of the collection reference to reduce indirection
// Assuming it's immutable for the lifetime of the enumerator
private readonly T* _ptr;
private uint _index;
private readonly uint _endIndex;
public T Current
{
get
{
// This check could be omitted at the cost of safety if consumers are
// expected to never manually use the enumerator in an incorrect order
if (_index >= _endIndex)
ThrowInvalidOp();
// Without the (int) cast Desktop x86 generates much worse code,
// but only if _ptr is generic. Not sure why.
return _ptr[(int)_index];
}
}
internal Enumerator(CustomCollection<T> collection)
{
_ptr = collection._ptr;
_index = UInt32.MaxValue;
_endIndex = (uint)collection.Length;
}
// Technically this could unexpectedly reset the enumerator if someone were to
// manually call MoveNext() countless times after it returns false for some reason
public bool MoveNext() => unchecked(++_index) < _endIndex;
// Pulling this out of the getter improves inlining of Current
private static void ThrowInvalidOp() => throw new InvalidOperationException();
}
要向自定义集合添加 foreach
支持,您需要实施 IEnumerable
。然而,数组的特殊之处在于它们本质上编译为基于范围的 for 循环,这比使用 IEnumerable 快 多 。一个简单的基准确认:
number of elements: 20,000,000
byte[]: 6.860ms
byte[] as IEnumerable<byte>: 89.444ms
CustomCollection.IEnumerator<byte>: 89.667ms
基准:
private byte[] byteArray = new byte[20000000];
private CustomCollection<byte> collection = new CustomCollection<T>( 20000000 );
[Benchmark]
public void enumerateByteArray()
{
var counter = 0;
foreach( var item in byteArray )
counter += item;
}
[Benchmark]
public void enumerateByteArrayAsIEnumerable()
{
var counter = 0;
var casted = (IEnumerable<byte>) byteArray;
foreach( var item in casted )
counter += item;
}
[Benchmark]
public void enumerateCollection()
{
var counter = 0;
foreach( var item in collection )
counter += item;
}
和实施:
public class CustomCollectionEnumerator : IEnumerable<T> where T : unmanaged
{
private CustomCollection<T> _collection;
private int _index;
private int _endIndex;
public CustomCollectionEnumerator( CustomCollection<T> collection )
{
_collection = collection;
_index = -1;
_endIndex = collection.Length;
}
public bool MoveNext()
{
if ( _index < _endIndex )
{
_index++;
return ( _index < _endIndex );
}
return false;
}
public T Current => _collection[ _index ];
object IEnumerator.Current => _collection[ _index ];
public void Reset() { _index = -1; }
public void Dispose() { }
}
public class CustomCollection<T> : IEnumerable<T> where T : unmanaged
{
private T* _ptr;
public int Length { get; private set; }
public T this[ int index ]
{
[MethodImpl( MethodImplOptions.AggressiveInlining )]
get => *_ptr[ index ];
[MethodImpl( MethodImplOptions.AggressiveInlining )]
set => *_ptr[ index ] = value;
}
public IEnumerator<T> GetEnumerator()
{
return new CustomCollectionEnumerator<T>( this );
}
}
因为数组得到编译器的特殊处理,所以它们将 IEnumerable
集合抛在脑后。由于 C# 非常注重类型安全,我可以理解为什么会这样,但它仍然会产生大量的开销,尤其是对于我的自定义集合,它在 中以完全相同的方式 进行枚举就像数组一样。事实上,我的自定义集合比基于范围的 for 循环中的字节数组更快,因为它使用指针算法来跳过 CLR 的数组范围检查。
所以我的问题是:有没有一种方法可以自定义 foreach
循环的行为,以便我可以获得与数组相当的性能? 也许通过编译器内在函数还是用 IL 手动编译委托?
当然,我总是可以只使用基于范围的 for 循环。 我很好奇是否有任何可能的方式来自定义 foreach
循环的低级行为,类似于编译器处理数组的方式。
类型实际上不需要实现 IEnumerable
/IEnumerable<T>
即可在 foreach
语句中使用。 foreach
语句是鸭子类型的,这意味着编译器首先查找具有正确签名(GetEnumerator()
、MoveNext()
和 Current
)的 public 方法,而不管它们是否是这些接口的实现,并且只在必要时回退到接口。
这为一些优化打开了大门,这些优化可以在紧密循环中产生显着差异:GetEnumerator()
可以 return 一个具体类型而不是 IEnumerator<T>
,然后允许 foreach
使用非虚拟和潜在内联调用构建的循环,以及使枚举器成为 struct
以避免 GC 开销。 List<T>
等某些框架集合也利用了这一点。
结合其他一些优化,这个基于您的 CustomCollection
的枚举器非常接近微基准测试中的原始数组循环:
public Enumerator GetEnumerator() => new Enumerator(this);
// Being a ref struct makes it less likely to mess up the pointer usage,
// but doesn't affect the foreach loop
// There is no technical reason why this couldn't implement IEnumerator
// as long as lifetime issues are considered
public unsafe ref struct Enumerator
{
// Storing the pointer directly instead of the collection reference to reduce indirection
// Assuming it's immutable for the lifetime of the enumerator
private readonly T* _ptr;
private uint _index;
private readonly uint _endIndex;
public T Current
{
get
{
// This check could be omitted at the cost of safety if consumers are
// expected to never manually use the enumerator in an incorrect order
if (_index >= _endIndex)
ThrowInvalidOp();
// Without the (int) cast Desktop x86 generates much worse code,
// but only if _ptr is generic. Not sure why.
return _ptr[(int)_index];
}
}
internal Enumerator(CustomCollection<T> collection)
{
_ptr = collection._ptr;
_index = UInt32.MaxValue;
_endIndex = (uint)collection.Length;
}
// Technically this could unexpectedly reset the enumerator if someone were to
// manually call MoveNext() countless times after it returns false for some reason
public bool MoveNext() => unchecked(++_index) < _endIndex;
// Pulling this out of the getter improves inlining of Current
private static void ThrowInvalidOp() => throw new InvalidOperationException();
}