展开嵌套循环 C++
Unrolling nested loops c++
我正在尝试在 C++ 中展开一个嵌套循环,该循环把数据写入一块动态分配的二维内存。不过,我不太确定该怎么做。这是展开之前的原始循环:
int steps[1];
Ipp32f* vectx = ippiMalloc_32f_C1(size0, size1, &(steps[0]));
for (int i = 0; i < size0; i++){
for (int j = 0; j < size1; j++){
Ipp32f* pointer = (Ipp32f*)((Ipp8u*)vectx + steps[0]*j + sizeof(Ipp32f)*i);
*pointer = datax[i];
}
}
datax 是一个存放数值的数组;在我的程序中,size0 = 30,size1 = 10000。
我尝试了下面的写法,但不幸的是,各个位置上得到的值与原始循环的结果并不相同。有人可以帮助我吗?
// Attempted 4x unrolling of the fill loop, kept exactly as posted.
// It does NOT produce the same result as the original loop:
//  - In `steps[0] * j + k` and `sizeof(Ipp32f) * i+k` the multiplication is
//    evaluated first, so k is added to the byte offset instead of to the
//    row index j / column index i.
//  - i advances by 4 while the body uses i+1 .. i+3; with size0 = 30 the last
//    outer iteration (i = 28) uses indices 30 and 31, which are past size0 - 1.
for (int i = 0; i < size0; i+=4) {
// Column i: intended to cover rows j .. j+3.
for (int j = 0; j < size1; j+=4) {
*((Ipp32f*)((Ipp8u*)vectx+ (steps[0] * j +0)+ (sizeof(Ipp32f) * i ))) = datax[i];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i ))) = datax[i ];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i ))) = datax[i ];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i ))) = datax[i ];
}
// Column i+1 (as written, `* i+1` adds one byte, not one element).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
}
// Column i+2 (same precedence problem, offset of two bytes).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
}
// Column i+3 (same precedence problem, offset of three bytes).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
}
}
您没有考虑运算符优先级
// As posted: `* j + 1` and `* i+1` multiply first, then add 1 to the byte offset.
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
^^^^^^--here ^^^--and here
你应该添加 ()
// Parenthesized: 1 is added to j and to i before the multiplication.
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * (j + 1)) + (sizeof(Ipp32f) * (i+1)))) = datax[i+1];
^^^^^^ ^^^
显然你应该在所有行中这样做
顺便说一句,size0 = 30,如果你按 4 为步长展开循环,那么在外层(第一个)循环的最后一次迭代中(i = 28 时会访问下标 30 和 31)就会越界。展开步长应该取 size0 的约数,例如 5 或 6。
最好使用这个 C++20 展开助手:
#pragma once
#include <utility>
#include <concepts>
#include <iterator>
/// Calls `fn(0)`, `fn(1)`, ..., `fn(N - 1)` in that order, emitted as a
/// compile-time expanded sequence of calls instead of a runtime loop.
///
/// @tparam N   number of calls to emit; the constraint requires N >= 1.
/// @tparam Fn  callable that accepts a `std::size_t` and returns exactly `void`.
/// @param  fn  callable to invoke; taken by value and invoked through a reference.
template<std::size_t N, typename Fn>
    requires (N >= 1) && requires( Fn fn, std::size_t i ) { { fn( i ) } -> std::same_as<void>; }
inline
void unroll( Fn fn )
{
    // The templated lambda exists only to deduce the index pack from the
    // index_sequence argument; the comma fold evaluates left to right, so the
    // calls happen in ascending index order.
    auto unroll_n = [&]<std::size_t ... Indices>( std::index_sequence<Indices ...> )
    {
        (fn( Indices ), ...);
    };
    unroll_n( std::make_index_sequence<N>() );
}
/// Applies `fn` to every element of `[begin, end)` in order. Elements are
/// processed N at a time through unroll<N>; the remaining (fewer than N)
/// elements are then processed one by one.
///
/// @tparam N             elements handled per unrolled step; for N <= 1 only
///                       the element-by-element loop runs.
/// @tparam RandomIt      random-access iterator type.
/// @tparam UnaryFunction callable invocable with an element of the range.
/// @param  begin         first element to process.
/// @param  end           one past the last element to process.
/// @param  fn            callable applied to each element.
/// @return the iterator at which processing stopped (the first position for
///         which `it < end` is false).
template<std::size_t N, typename RandomIt, typename UnaryFunction>
    requires std::random_access_iterator<RandomIt>
    && requires( UnaryFunction fn, typename std::iterator_traits<RandomIt>::value_type elem ) { { fn( elem ) }; }
inline
RandomIt unroll_for_xeach( RandomIt begin, RandomIt end, UnaryFunction fn )
{
    using difference_type = std::iter_difference_t<RandomIt>;
    RandomIt it = begin;
    if constexpr( N > 1 )
    {
        const auto stride = static_cast<difference_type>( N );
        // Compare the remaining distance instead of forming `it + N`: that
        // sum may lie beyond one-past-the-end when fewer than N elements are
        // left, which is undefined for pointers and rejected by checked iterators.
        for( ; end - it >= stride; it += stride )
            unroll<N>( [&]( std::size_t i ) { fn( it[static_cast<difference_type>( i )] ); } );
    }
    // Tail: whatever is left after the last full group of N.
    for( ; it < end; ++it )
        fn( *it );
    return it;
}
我正在尝试在 C++ 中展开一个嵌套循环,该循环把数据写入一块动态分配的二维内存。不过,我不太确定该怎么做。这是展开之前的原始循环:
int steps[1];
Ipp32f* vectx = ippiMalloc_32f_C1(size0, size1, &(steps[0]));
for (int i = 0; i < size0; i++){
for (int j = 0; j < size1; j++){
Ipp32f* pointer = (Ipp32f*)((Ipp8u*)vectx + steps[0]*j + sizeof(Ipp32f)*i);
*pointer = datax[i];
}
}
datax 是一个存放数值的数组;在我的程序中,size0 = 30,size1 = 10000。我尝试了下面的写法,但不幸的是,各个位置上得到的值与原始循环的结果并不相同。有人可以帮助我吗?
// Attempted 4x unrolling of the fill loop, kept exactly as posted.
// It does NOT produce the same result as the original loop:
//  - In `steps[0] * j + k` and `sizeof(Ipp32f) * i+k` the multiplication is
//    evaluated first, so k is added to the byte offset instead of to the
//    row index j / column index i.
//  - i advances by 4 while the body uses i+1 .. i+3; with size0 = 30 the last
//    outer iteration (i = 28) uses indices 30 and 31, which are past size0 - 1.
for (int i = 0; i < size0; i+=4) {
// Column i: intended to cover rows j .. j+3.
for (int j = 0; j < size1; j+=4) {
*((Ipp32f*)((Ipp8u*)vectx+ (steps[0] * j +0)+ (sizeof(Ipp32f) * i ))) = datax[i];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i ))) = datax[i ];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i ))) = datax[i ];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i ))) = datax[i ];
}
// Column i+1 (as written, `* i+1` adds one byte, not one element).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
}
// Column i+2 (same precedence problem, offset of two bytes).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
}
// Column i+3 (same precedence problem, offset of three bytes).
for (int j = 0; j < size1; j += 4) {
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
}
}
您没有考虑运算符优先级
// As posted: `* j + 1` and `* i+1` multiply first, then add 1 to the byte offset.
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
^^^^^^--here ^^^--and here
你应该添加 ()
// Parenthesized: 1 is added to j and to i before the multiplication.
*((Ipp32f*)((Ipp8u*)vectx + (steps[0] * (j + 1)) + (sizeof(Ipp32f) * (i+1)))) = datax[i+1];
^^^^^^ ^^^
显然你应该在所有行中这样做
顺便说一句,size0 = 30,如果你按 4 为步长展开循环,那么在外层(第一个)循环的最后一次迭代中(i = 28 时会访问下标 30 和 31)就会越界。展开步长应该取 size0 的约数,例如 5 或 6。
最好使用这个 C++20 展开助手:
#pragma once
#include <utility>
#include <concepts>
#include <iterator>
/// Calls `fn(0)`, `fn(1)`, ..., `fn(N - 1)` in that order, emitted as a
/// compile-time expanded sequence of calls instead of a runtime loop.
///
/// @tparam N   number of calls to emit; the constraint requires N >= 1.
/// @tparam Fn  callable that accepts a `std::size_t` and returns exactly `void`.
/// @param  fn  callable to invoke; taken by value and invoked through a reference.
template<std::size_t N, typename Fn>
    requires (N >= 1) && requires( Fn fn, std::size_t i ) { { fn( i ) } -> std::same_as<void>; }
inline
void unroll( Fn fn )
{
    // The templated lambda exists only to deduce the index pack from the
    // index_sequence argument; the comma fold evaluates left to right, so the
    // calls happen in ascending index order.
    auto unroll_n = [&]<std::size_t ... Indices>( std::index_sequence<Indices ...> )
    {
        (fn( Indices ), ...);
    };
    unroll_n( std::make_index_sequence<N>() );
}
/// Applies `fn` to every element of `[begin, end)` in order. Elements are
/// processed N at a time through unroll<N>; the remaining (fewer than N)
/// elements are then processed one by one.
///
/// @tparam N             elements handled per unrolled step; for N <= 1 only
///                       the element-by-element loop runs.
/// @tparam RandomIt      random-access iterator type.
/// @tparam UnaryFunction callable invocable with an element of the range.
/// @param  begin         first element to process.
/// @param  end           one past the last element to process.
/// @param  fn            callable applied to each element.
/// @return the iterator at which processing stopped (the first position for
///         which `it < end` is false).
template<std::size_t N, typename RandomIt, typename UnaryFunction>
    requires std::random_access_iterator<RandomIt>
    && requires( UnaryFunction fn, typename std::iterator_traits<RandomIt>::value_type elem ) { { fn( elem ) }; }
inline
RandomIt unroll_for_xeach( RandomIt begin, RandomIt end, UnaryFunction fn )
{
    using difference_type = std::iter_difference_t<RandomIt>;
    RandomIt it = begin;
    if constexpr( N > 1 )
    {
        const auto stride = static_cast<difference_type>( N );
        // Compare the remaining distance instead of forming `it + N`: that
        // sum may lie beyond one-past-the-end when fewer than N elements are
        // left, which is undefined for pointers and rejected by checked iterators.
        for( ; end - it >= stride; it += stride )
            unroll<N>( [&]( std::size_t i ) { fn( it[static_cast<difference_type>( i )] ); } );
    }
    // Tail: whatever is left after the last full group of N.
    for( ; it < end; ++it )
        fn( *it );
    return it;
}