如何在 C 语言中对这个函数进行矢量化和优化?
How to vectorize and optimize this functions in C?
我有这个函数,结果是正确的,但编译器没有向量化它。
我如何才能实现编译器对其进行矢量化以及如何优化这些代码?
void LongNumSet( char *L, unsigned N, char digit )
{
for (int i = 0; i < N; ++i){
L[i] = digit;
}
}
void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
for ( int i=0; i< N; ++i )
{
Vout[i] = Vin[i];
}
}
char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
char CARRY = 0,R,aux;
Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);
for ( int i=0; i< N; ++i )
{
char R = Vin1[i] + Vin2[i] + CARRY;
aux = R <= 9;
Vout[i] = (aux) ? R:R-ten;
CARRY = (aux) ? 0:1;
}
return CARRY;
}
char LongNumAddDigit( char *V, char digit, unsigned N )
{
int i=0;
char R = V[0] + digit;
if ( R < ten){
V[0] = R;
return 0;
}
V[0] = R-ten;
// add carry, maybe iteratively for all digits
char CARRY = 1;
i = 1;
while ( CARRY && i < N )
{
if ( V[i] < 9 )
{
V[i]++;
CARRY = 0;
}
else
{
V[i] = 0;
i++; // CARRY remains set to 1
}
}
return CARRY;
}
我使用命令 gcc -O3 -ffast-math -msse -funroll-all-loops -ftree-vectorizer-verbose=25 -lm -g $1 -o ${2}.O3 并执行程序在 55 秒内
这是全部代码:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
// Variable used to generate pseudo-random numbers
unsigned int seed;
unsigned int temp;
unsigned int var1 = 214013;
unsigned int var2 = 2531011;
#define val13 13
#define ten 10
// Function to generate pseudo-random numbers
inline int myRandom() {
temp = var1*seed;
seed = temp + var2;
return (seed>>val13);
}
void LongNumInit( char *L, unsigned N )
{
for ( int i=0; i<N;++i )
{
L[i] = myRandom() % ten; // digito decimal
}
}
void LongNumPrint( char *L, unsigned N, char *Name )
{
printf("%s:", Name);
for ( int i=N; i>0; i-- )
{
printf("%d", L[i-1]);
}
printf("\n");
}
void LongNumSet( char *L, unsigned N, char digit )
{
for (int i = 0; i < N; ++i){
L[i] = digit;
}
}
void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
for ( int i=0; i< N; ++i )
{
Vout[i] = Vin[i];
}
}
char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
char CARRY = 0,R,aux;
Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);
for ( int i=0; i< N; ++i )
{
char R = Vin1[i] + Vin2[i] + CARRY;
aux = R <= 9;
Vout[i] = (aux) ? R:R-ten;
CARRY = (aux) ? 0:1;
}
return CARRY;
}
char LongNumAddDigit( char *V, char digit, unsigned N )
{
int i=0;
char R = V[0] + digit;
if ( R < ten){
V[0] = R;
return 0;
}
V[0] = R-ten;
// add carry, maybe iteratively for all digits
char CARRY = 1;
i = 1;
while ( CARRY && i < N )
{
if ( V[i] < 9 )
{
V[i]++;
CARRY = 0;
}
else
{
V[i] = 0;
i++; // CARRY remains set to 1
}
}
return CARRY;
}
char LongNumHorizAdd( char *Vin, char *Vout, unsigned N )
{
char CARRY = 0;
LongNumSet ( Vout, N, 0 );
for ( int i=0; i< N; ++i )
{
LongNumAddDigit ( Vout, Vin[i], N );
}
return 0; // CARRY can never be set
}
char LongNumConstMult( char *V, unsigned N, char digit )
{
char CARRY = 0;
char ja = 0;
for ( int i=0; i< N; ++i )
{
char aux = V[i] * digit;
char R = aux + CARRY;
CARRY = ((u_int32_t)R * (u_int32_t)0xCCCD) >> 19;
ja = (CARRY << 3) + 2*CARRY;
R -= ja;
V[i] = R;
}
return CARRY; // may be from 0 to 9
}
void LongNumMultiply( char *Vin1, char *Vin2, char *VoutH, char *VoutL, unsigned N )
{
// Create Temporal Long Integer with double size
unsigned char *TEMP = (unsigned char*) calloc(2*N,sizeof(unsigned char));
unsigned char *RES = (unsigned char*) calloc( 2*N,sizeof(unsigned char) );
LongNumSet ( RES, 2*N, 0 ); // Set RES to 0
for ( int i=0; i<N; ++i )
{
LongNumSet ( TEMP, 2*N, 0 ); // Set TEMP to 0
LongNumCopy ( Vin1, TEMP+i, N ); // Copy Vin1 -> TEMP, with offset i
LongNumConstMult( TEMP, 2*N, Vin2[i] ); // TEMP * Vin2[i] -> TEMP
LongNumAddition ( TEMP, RES, RES, 2*N ); // TEMP + RES -> RES
}
// Result goes to VoutH-VoutL
LongNumCopy ( RES, VoutL, N ); // Copy RES -> VoutL
LongNumCopy ( RES+N, VoutH, N ); // Copy RES+N -> VoutH
}
int main (int argc, char **argv)
{
int i, sum1, sum2, sum3, N=10000, Rep=50;
seed = 12345;
// obtain parameters at run time
if (argc>1) { N = atoi(argv[1]); }
if (argc>2) { Rep = atoi(argv[2]); }
printf("Challenge #3: Vector size is %d. Repeat %d times\n", N, Rep);
// Create Long Nums
unsigned char *V1= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V2= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V3= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V4= (unsigned char*) malloc( N*sizeof(unsigned char) );
LongNumInit ( V1, N ); LongNumInit ( V2, N ); LongNumInit ( V3, N );
// Repeat
for (i=0; i<Rep; i++)
{
LongNumAddition ( V1, V2, V4, N );
LongNumMultiply ( V3, V4, V2, V1, N );
LongNumHorizAdd ( V1, V2, N );
LongNumAddDigit ( V3, V2[0], N );
}
// Print last 32 digits of Long Numbers
LongNumPrint( V1, 32, "V1" );
LongNumPrint( V2, 32, "V2" );
LongNumPrint( V3, 32, "V3" );
LongNumPrint( V4, 32, "V4" );
free(V1); free(V2); free(V3); free(V4);
return 0;
}
根据您的使用情况,您可以创建和使用 LongNumClear(改进不大)而不是 LongNumSet。
以下是您的某些功能的其他一些潜在重写。我认为您应该注意到一些改进。对我来说,它大约是 44%。我还将类型从 char 更改为 unsigned。
#include <string.h>
void LongNumClear(uint8_t *L, size_t N) {
memset (L, 0, N);
}
void LongNumCopy(const uint8_t *Vin, uint8_t *Vout, size_t N) {
memcpy(Vout, Vin, N);
}
uint8_t LongNumAddition(uint8_t * Vin1, uint8_t * Vin2, uint8_t * Vout, size_t N) {
uint8_t carry = 0;
for (size_t i=0; i < N; ++i) {
Vout[i] = Vin1[i] + Vin2[i] + carry;
carry = (Vout[i] > 9);
if (carry) {
Vout[i] -= ten;
}
}
return carry;
}
uint8_t LongNumAddDigit(uint8_t *V, uint8_t digit, size_t N) {
size_t i=0;
V[0] += digit;
if (V[0] < ten) {
return 0;
}
V[0] -= ten;
while ((++i < N) && (V[i] >= 9)) {
V[i] = 0;
}
if ((i != N) && (V[i] < 9)) {
V[i]++;
return 0;
}
return 1;
}
uint8_t LongNumConstMult(uint8_t *V, size_t N, uint8_t digit) {
uint8_t carry = 0;
for (size_t i = 0; i < N; ++i ) {
V[i] = V[i] * digit + carry;
carry = ((u_int32_t)V[i] * (u_int32_t)0xCCCD) >> 19; // divide by 10
V[i] -= ((carry << 3) + (carry << 1));
}
return carry;
}
我有这个函数,结果是正确的,但编译器没有向量化它。 我如何才能实现编译器对其进行矢量化以及如何优化这些代码?
void LongNumSet( char *L, unsigned N, char digit )
{
for (int i = 0; i < N; ++i){
L[i] = digit;
}
}
void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
for ( int i=0; i< N; ++i )
{
Vout[i] = Vin[i];
}
}
char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
char CARRY = 0,R,aux;
Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);
for ( int i=0; i< N; ++i )
{
char R = Vin1[i] + Vin2[i] + CARRY;
aux = R <= 9;
Vout[i] = (aux) ? R:R-ten;
CARRY = (aux) ? 0:1;
}
return CARRY;
}
char LongNumAddDigit( char *V, char digit, unsigned N )
{
int i=0;
char R = V[0] + digit;
if ( R < ten){
V[0] = R;
return 0;
}
V[0] = R-ten;
// add carry, maybe iteratively for all digits
char CARRY = 1;
i = 1;
while ( CARRY && i < N )
{
if ( V[i] < 9 )
{
V[i]++;
CARRY = 0;
}
else
{
V[i] = 0;
i++; // CARRY remains set to 1
}
}
return CARRY;
}
我使用命令 gcc -O3 -ffast-math -msse -funroll-all-loops -ftree-vectorizer-verbose=25 -lm -g $1 -o ${2}.O3 并执行程序在 55 秒内 这是全部代码:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
// Variable used to generate pseudo-random numbers
unsigned int seed;
unsigned int temp;
unsigned int var1 = 214013;
unsigned int var2 = 2531011;
#define val13 13
#define ten 10
// Function to generate pseudo-random numbers
inline int myRandom() {
temp = var1*seed;
seed = temp + var2;
return (seed>>val13);
}
void LongNumInit( char *L, unsigned N )
{
for ( int i=0; i<N;++i )
{
L[i] = myRandom() % ten; // digito decimal
}
}
void LongNumPrint( char *L, unsigned N, char *Name )
{
printf("%s:", Name);
for ( int i=N; i>0; i-- )
{
printf("%d", L[i-1]);
}
printf("\n");
}
void LongNumSet( char *L, unsigned N, char digit )
{
for (int i = 0; i < N; ++i){
L[i] = digit;
}
}
void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
for ( int i=0; i< N; ++i )
{
Vout[i] = Vin[i];
}
}
char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
char CARRY = 0,R,aux;
Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);
for ( int i=0; i< N; ++i )
{
char R = Vin1[i] + Vin2[i] + CARRY;
aux = R <= 9;
Vout[i] = (aux) ? R:R-ten;
CARRY = (aux) ? 0:1;
}
return CARRY;
}
char LongNumAddDigit( char *V, char digit, unsigned N )
{
int i=0;
char R = V[0] + digit;
if ( R < ten){
V[0] = R;
return 0;
}
V[0] = R-ten;
// add carry, maybe iteratively for all digits
char CARRY = 1;
i = 1;
while ( CARRY && i < N )
{
if ( V[i] < 9 )
{
V[i]++;
CARRY = 0;
}
else
{
V[i] = 0;
i++; // CARRY remains set to 1
}
}
return CARRY;
}
char LongNumHorizAdd( char *Vin, char *Vout, unsigned N )
{
char CARRY = 0;
LongNumSet ( Vout, N, 0 );
for ( int i=0; i< N; ++i )
{
LongNumAddDigit ( Vout, Vin[i], N );
}
return 0; // CARRY can never be set
}
char LongNumConstMult( char *V, unsigned N, char digit )
{
char CARRY = 0;
char ja = 0;
for ( int i=0; i< N; ++i )
{
char aux = V[i] * digit;
char R = aux + CARRY;
CARRY = ((u_int32_t)R * (u_int32_t)0xCCCD) >> 19;
ja = (CARRY << 3) + 2*CARRY;
R -= ja;
V[i] = R;
}
return CARRY; // may be from 0 to 9
}
void LongNumMultiply( char *Vin1, char *Vin2, char *VoutH, char *VoutL, unsigned N )
{
// Create Temporal Long Integer with double size
unsigned char *TEMP = (unsigned char*) calloc(2*N,sizeof(unsigned char));
unsigned char *RES = (unsigned char*) calloc( 2*N,sizeof(unsigned char) );
LongNumSet ( RES, 2*N, 0 ); // Set RES to 0
for ( int i=0; i<N; ++i )
{
LongNumSet ( TEMP, 2*N, 0 ); // Set TEMP to 0
LongNumCopy ( Vin1, TEMP+i, N ); // Copy Vin1 -> TEMP, with offset i
LongNumConstMult( TEMP, 2*N, Vin2[i] ); // TEMP * Vin2[i] -> TEMP
LongNumAddition ( TEMP, RES, RES, 2*N ); // TEMP + RES -> RES
}
// Result goes to VoutH-VoutL
LongNumCopy ( RES, VoutL, N ); // Copy RES -> VoutL
LongNumCopy ( RES+N, VoutH, N ); // Copy RES+N -> VoutH
}
int main (int argc, char **argv)
{
int i, sum1, sum2, sum3, N=10000, Rep=50;
seed = 12345;
// obtain parameters at run time
if (argc>1) { N = atoi(argv[1]); }
if (argc>2) { Rep = atoi(argv[2]); }
printf("Challenge #3: Vector size is %d. Repeat %d times\n", N, Rep);
// Create Long Nums
unsigned char *V1= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V2= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V3= (unsigned char*) malloc( N*sizeof(unsigned char) );
unsigned char *V4= (unsigned char*) malloc( N*sizeof(unsigned char) );
LongNumInit ( V1, N ); LongNumInit ( V2, N ); LongNumInit ( V3, N );
// Repeat
for (i=0; i<Rep; i++)
{
LongNumAddition ( V1, V2, V4, N );
LongNumMultiply ( V3, V4, V2, V1, N );
LongNumHorizAdd ( V1, V2, N );
LongNumAddDigit ( V3, V2[0], N );
}
// Print last 32 digits of Long Numbers
LongNumPrint( V1, 32, "V1" );
LongNumPrint( V2, 32, "V2" );
LongNumPrint( V3, 32, "V3" );
LongNumPrint( V4, 32, "V4" );
free(V1); free(V2); free(V3); free(V4);
return 0;
}
根据您的使用情况,您可以创建和使用 LongNumClear(改进不大)而不是 LongNumSet。 以下是您的某些功能的其他一些潜在重写。我认为您应该注意到一些改进。对我来说,它大约是 44%。我还将类型从 char 更改为 unsigned。
#include <string.h>
void LongNumClear(uint8_t *L, size_t N) {
memset (L, 0, N);
}
void LongNumCopy(const uint8_t *Vin, uint8_t *Vout, size_t N) {
memcpy(Vout, Vin, N);
}
uint8_t LongNumAddition(uint8_t * Vin1, uint8_t * Vin2, uint8_t * Vout, size_t N) {
uint8_t carry = 0;
for (size_t i=0; i < N; ++i) {
Vout[i] = Vin1[i] + Vin2[i] + carry;
carry = (Vout[i] > 9);
if (carry) {
Vout[i] -= ten;
}
}
return carry;
}
uint8_t LongNumAddDigit(uint8_t *V, uint8_t digit, size_t N) {
size_t i=0;
V[0] += digit;
if (V[0] < ten) {
return 0;
}
V[0] -= ten;
while ((++i < N) && (V[i] >= 9)) {
V[i] = 0;
}
if ((i != N) && (V[i] < 9)) {
V[i]++;
return 0;
}
return 1;
}
uint8_t LongNumConstMult(uint8_t *V, size_t N, uint8_t digit) {
uint8_t carry = 0;
for (size_t i = 0; i < N; ++i ) {
V[i] = V[i] * digit + carry;
carry = ((u_int32_t)V[i] * (u_int32_t)0xCCCD) >> 19; // divide by 10
V[i] -= ((carry << 3) + (carry << 1));
}
return carry;
}