使用 clang 从 C 代码生成 SIMD 代码
Generate SIMD code from C code using clang
我正在尝试从一个简单的 C 程序中获取 SIMD 代码:
#include <stdio.h>
const int N=20000;
int main()
{
    /* Working storage: two input arrays and their element-wise sum. */
    int a[N], b[N];
    int c[N];
    int k;

    /* First input: repeating ramp 0..499. */
    for (k = 0; k < N; ++k) {
        a[k] = k % 500;
    }

    /* Second input: repeating ramp 0..199. */
    for (k = 0; k < N; ++k) {
        b[k] = k % 200;
    }

    /* Output: c = a + b, one element at a time. */
    for (k = 0; k < N; ++k) {
        c[k] = b[k] + a[k];
    }

    /* Print one sum per line. */
    for (k = 0; k < N; ++k) {
        printf("%d\n", c[k]);
    }

    return 0;
}
首先,我使用命令行标志通过 clang 禁用循环矢量化器并生成汇编代码:
clang -S -fno-vectorize sum_vec.c -o sum_scalar.s
现在我使用命令行标志 -force-vector-width 设置矢量化 SIMD 宽度并生成汇编代码:
clang -S -mllvm -force-vector-width=8 sum_vec.c -o sum_simd.s
但是,生成的代码都是标量的。如何生成 SIMD 代码?
can you post your code? – muiloo
使用 gcc 8.3.1 和 cc -O3 -S -o gvec.s -fverbose-asm fix1.c 编译[我将您的 const int 更改为 enum],得到如下矢量化的汇编代码。注意这里启用了 -O3;问题中的 clang 命令没有指定 -O 选项,默认以 -O0 编译,此时循环矢量化器不会运行:
.file "fix1.c"
# GNU C17 (GCC) version 8.3.1 20190223 (Red Hat 8.3.1-2) (x86_64-redhat-linux)
# compiled by GNU C version 8.3.1 20190223 (Red Hat 8.3.1-2), GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.1.0, isl version none
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed: fix1.c -mtune=generic -march=x86-64
# -auxbase-strip gvec.s -O3 -fverbose-asm
# options enabled: -faggressive-loop-optimizations -falign-labels
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
# -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
# -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
# -fchkp-use-static-const-bounds -fchkp-use-wrappers -fcode-hoisting
# -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -fexpensive-optimizations -fforward-propagate -ffp-int-builtin-inexact
# -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm -fgnu-runtime
# -fgnu-unique -fguess-branch-probability -fhoist-adjacent-loads -fident
# -fif-conversion -fif-conversion2 -findirect-inlining -finline
# -finline-atomics -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
# -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
# -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -floop-interchange
# -floop-unroll-and-jam -flra-remat -flto-odr-type-merging -fmath-errno
# -fmerge-constants -fmerge-debug-strings -fmove-loop-invariants
# -fomit-frame-pointer -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2 -fplt
# -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
# -freorder-blocks -freorder-blocks-and-partition -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
# -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
# -fssa-phiopt -fstdarg-opt -fstore-merging -fstrict-aliasing
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim
# -ftree-dce -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
# -ftree-loop-distribute-patterns -ftree-loop-distribution
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
# -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
# -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
# -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
# -malign-stringops -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
# -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
# -msse -msse2 -mstv -mtls-direct-seg-refs -mvzeroupper
.text
# Read-only string section holding the printf format string; main passes
# the address of .LC4 in %edi before each `call printf`.
.section .rodata.str1.1,"aMS",@progbits,1
.LC4:
.string "%d\n"
# ----------------------------------------------------------------------
# int main() -- GCC 8.3.1 -O3 output for fix1.c
# Syntax: GAS / AT&T.  Target: x86-64 Linux (SSE2 only, -march=x86-64).
#
# Stack frame after the prologue (byte offsets from %rsp):
#        0 ..     15   scratch slot written in .L4 (%sfp)
#       16 ..  80015   a[20000]
#    80016 .. 160015   b[20000]
#   160016 .. 240015   c[20000]
#
# .L2  a[i] = i % 500      vectorized, 4 ints per iteration
# .L3  b[i] = i % 200      vectorized, 4 ints per iteration
# .L4  c[i] = a[i] + b[i]  vectorized, 4 ints per iteration
# .L5  printf loop         scalar
#
# Saves/restores %rbp and %rbx (both used across the printf calls).
# Returns 0 in %eax.
#
# NOTE: the numeric `$` immediates had been stripped from this listing
# (e.g. "addq , %rax", "subq 0024, %rsp").  They are restored here; each
# value is fixed by the surrounding code: the CFA offset (240048), the
# array size (80000 bytes), the 16-byte vector stride, and the
# multiply-by-reciprocal sequences for % 500 and % 200.
# ----------------------------------------------------------------------
    .section .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl main
    .type main, @function
main:
.LFB11:
    .cfi_startproc
    pushq %rbp #
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
# fix1.c:12: a[i] = i % 500;
    pxor %xmm6, %xmm6 # tmp120
# fix1.c:7: {
    pushq %rbx #
    .cfi_def_cfa_offset 24
    .cfi_offset 3, -24
# fix1.c:12: a[i] = i % 500;
    movdqa %xmm6, %xmm7 # tmp120, tmp124
# fix1.c:7: {
# Reserve the three arrays plus padding: 8 (return address) + 16 (two
# pushes) + 240024 = 240048, which keeps %rsp 16-byte aligned for the
# movaps/movdqa accesses below.
    subq $240024, %rsp #,
    .cfi_def_cfa_offset 240048
    movdqa .LC2(%rip), %xmm5 #, tmp200
# fix1.c:7: {
    movdqa .LC0(%rip), %xmm3 #, vect_vec_iv_.9
    movdqa .LC1(%rip), %xmm2 #, tmp199
    leaq 16(%rsp), %rax #, ivtmp.49
    leaq 80016(%rsp), %rdx #, _47
# fix1.c:12: a[i] = i % 500;
    pcmpgtd %xmm5, %xmm7 # tmp200, tmp124
# fix1.c:7: {
    movdqa %xmm3, %xmm4 # vect_vec_iv_.9, vect_vec_iv_.16
# Loop 1: a[i] = i % 500.
#   xmm4 = {i, i+1, i+2, i+3}; xmm5 = .LC2 multiplier; xmm2 = step of 4;
#   xmm6 = 0; rax = store cursor; rdx = end of a[].
#   q = (i * .LC2) >> 37 is built from 32x32->64 pmuludq products: the
#   pcmpgtd masks and the `psllq $32` cross terms extend them to a signed
#   product, shufps $221 collects the high dword of each 64-bit product,
#   and psrad $5 completes the shift.
#   Then 500*q = ((((q << 5) - q) << 2) + q) << 2 and the result is i - 500*q.
    .p2align 4,,10
    .p2align 3
.L2:
# fix1.c:12: a[i] = i % 500;
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, tmp117
    movdqa %xmm6, %xmm0 # tmp120, tmp121
    movdqa %xmm7, %xmm9 # tmp124, tmp126
    addq $16, %rax #, ivtmp.49
    punpckldq %xmm4, %xmm1 # vect_vec_iv_.16, tmp117
    pcmpgtd %xmm1, %xmm0 # tmp117, tmp121
    pmuludq %xmm1, %xmm9 # tmp117, tmp126
    movdqa %xmm0, %xmm8 # tmp121, tmp125
    movdqa %xmm1, %xmm0 # tmp117, tmp127
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, tmp130
    pmuludq %xmm5, %xmm8 # tmp200, tmp125
    pmuludq %xmm5, %xmm0 # tmp200, tmp127
    punpckhdq %xmm4, %xmm1 # vect_vec_iv_.16, tmp130
    paddq %xmm9, %xmm8 # tmp126, tmp125
    movdqa %xmm7, %xmm9 # tmp124, tmp139
    psllq $32, %xmm8 #, tmp125
    pmuludq %xmm1, %xmm9 # tmp130, tmp139
    paddq %xmm8, %xmm0 # tmp125, tmp115
    movdqa %xmm6, %xmm8 # tmp120, tmp134
    pcmpgtd %xmm1, %xmm8 # tmp130, tmp134
    pmuludq %xmm5, %xmm1 # tmp200, tmp140
    pmuludq %xmm5, %xmm8 # tmp200, tmp138
    paddq %xmm9, %xmm8 # tmp139, tmp138
    psllq $32, %xmm8 #, tmp138
    paddq %xmm8, %xmm1 # tmp138, tmp128
    shufps $221, %xmm1, %xmm0 #, tmp128, vect_patt_65.17
    psrad $5, %xmm0 #, vect_patt_66.18
    movdqa %xmm0, %xmm1 # vect_patt_66.18, tmp146
    pslld $5, %xmm1 #, tmp146
    psubd %xmm0, %xmm1 # vect_patt_66.18, tmp147
    pslld $2, %xmm1 #, tmp148
    paddd %xmm1, %xmm0 # tmp148, vect_patt_67.19
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, vect_patt_68.20
    paddd %xmm2, %xmm4 # tmp199, vect_vec_iv_.16
    pslld $2, %xmm0 #, tmp150
    psubd %xmm0, %xmm1 # tmp150, vect_patt_68.20
    movaps %xmm1, -16(%rax) # vect_patt_68.20, MEM[base: _49, offset: 0B]
    cmpq %rdx, %rax # _47, ivtmp.49
    jne .L2 #,
    movdqa .LC3(%rip), %xmm4 #, tmp201
# fix1.c:16: b[i] = i % 200;
    pxor %xmm5, %xmm5 # tmp158
    leaq 80016(%rsp), %rax #, tmp214
    movdqa %xmm5, %xmm6 # tmp158, tmp162
    leaq 80000(%rax), %rdx #, _4
    pcmpgtd %xmm4, %xmm6 # tmp201, tmp162
# Loop 2: b[i] = i % 200.  Same scheme as loop 1, with
#   xmm3 = {i, i+1, i+2, i+3}; xmm4 = .LC3 multiplier; xmm5 = 0.
#   q = (i * .LC3) >> 38 (high dword, then psrad $6).
#   200*q = ((((q << 1) + q) << 3) + q) << 3 and the result is i - 200*q.
    .p2align 4,,10
    .p2align 3
.L3:
# fix1.c:16: b[i] = i % 200;
    movdqa %xmm3, %xmm1 # vect_vec_iv_.9, tmp155
    movdqa %xmm5, %xmm0 # tmp158, tmp159
    movdqa %xmm6, %xmm8 # tmp162, tmp164
    addq $16, %rax #, ivtmp.43
    punpckldq %xmm3, %xmm1 # vect_vec_iv_.9, tmp155
    pcmpgtd %xmm1, %xmm0 # tmp155, tmp159
    pmuludq %xmm1, %xmm8 # tmp155, tmp164
    movdqa %xmm0, %xmm7 # tmp159, tmp163
    movdqa %xmm1, %xmm0 # tmp155, tmp165
    movdqa %xmm3, %xmm1 # vect_vec_iv_.9, tmp168
    pmuludq %xmm4, %xmm7 # tmp201, tmp163
    pmuludq %xmm4, %xmm0 # tmp201, tmp165
    punpckhdq %xmm3, %xmm1 # vect_vec_iv_.9, tmp168
    paddq %xmm8, %xmm7 # tmp164, tmp163
    movdqa %xmm6, %xmm8 # tmp162, tmp177
    psllq $32, %xmm7 #, tmp163
    pmuludq %xmm1, %xmm8 # tmp168, tmp177
    paddq %xmm7, %xmm0 # tmp163, tmp153
    movdqa %xmm5, %xmm7 # tmp158, tmp172
    pcmpgtd %xmm1, %xmm7 # tmp168, tmp172
    pmuludq %xmm4, %xmm1 # tmp201, tmp178
    pmuludq %xmm4, %xmm7 # tmp201, tmp176
    paddq %xmm8, %xmm7 # tmp177, tmp176
    psllq $32, %xmm7 #, tmp176
    paddq %xmm7, %xmm1 # tmp176, tmp166
    movdqa %xmm3, %xmm7 # vect_vec_iv_.9, vect_patt_50.13
    paddd %xmm2, %xmm3 # tmp199, vect_vec_iv_.9
    shufps $221, %xmm1, %xmm0 #, tmp166, vect_patt_47.10
    psrad $6, %xmm0 #, vect_patt_48.11
    movdqa %xmm0, %xmm1 # vect_patt_48.11, tmp184
    pslld $1, %xmm1 #, tmp184
    paddd %xmm0, %xmm1 # vect_patt_48.11, tmp185
    pslld $3, %xmm1 #, tmp186
    paddd %xmm1, %xmm0 # tmp186, vect_patt_49.12
    pslld $3, %xmm0 #, tmp188
    psubd %xmm0, %xmm7 # tmp188, vect_patt_50.13
    movaps %xmm7, -16(%rax) # vect_patt_50.13, MEM[base: _10, offset: 0B]
    cmpq %rdx, %rax # _4, ivtmp.43
    jne .L3 #,
    xorl %eax, %eax # ivtmp.34
# Loop 3: c[i] = a[i] + b[i], four ints per iteration.
#   rax = byte offset into the arrays, 0 .. 80000 in steps of 16.
    .p2align 4,,10
    .p2align 3
.L4:
# fix1.c:23: c[i] = a[i] + b[i];
    movdqa 80016(%rsp,%rax), %xmm0 # MEM[symbol: b, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    movdqa 16(%rsp,%rax), %xmm2 # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    paddd %xmm2, %xmm0 # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
# Compiler-emitted store to the frame scratch slot; nothing in this
# listing reads it back.
    movaps %xmm2, (%rsp) # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], %sfp
# fix1.c:23: c[i] = a[i] + b[i];
    movaps %xmm0, 160016(%rsp,%rax) # vect__5.6, MEM[symbol: c, index: ivtmp.34_1, offset: 0B]
    addq $16, %rax #, ivtmp.34
    cmpq $80000, %rax #, ivtmp.34
    jne .L4 #,
    leaq 160016(%rsp), %rbx #, tmp229
    leaq 240016(%rsp), %rbp #, _39
# Loop 4: print c[i], one int per call.
#   rbx = &c[i], rbp = &c[N]; both are callee-saved, so they survive printf.
    .p2align 4,,10
    .p2align 3
.L5:
# fix1.c:27: printf("%d\n", c[i]);
    movl (%rbx), %esi # MEM[base: _40, offset: 0B],
    movl $.LC4, %edi #,
# %al = 0: no vector-register arguments for the variadic call.
    xorl %eax, %eax #
    addq $4, %rbx #, ivtmp.29
    call printf #
# fix1.c:26: for (int i = 0; i < N; i++) {
    cmpq %rbx, %rbp # ivtmp.29, _39
    jne .L5 #,
# fix1.c:31: }
    addq $240024, %rsp #,
    .cfi_def_cfa_offset 24
    xorl %eax, %eax #
    popq %rbx #
    .cfi_def_cfa_offset 16
    popq %rbp #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size main, .-main
# 16-byte vector constants that main loads with movdqa (hence .align 16).
.section .rodata.cst16,"aM",@progbits,16
.align 16
# .LC0: initial lanes {0, 1, 2, 3} of the vectorized loop counter.
.LC0:
.long 0
.long 1
.long 2
.long 3
.align 16
# .LC1: per-iteration step {4, 4, 4, 4} added to the loop-counter vector.
.LC1:
.long 4
.long 4
.long 4
.long 4
.align 16
# .LC2: multiplier used by the `i % 500` loop; 274877907 == ceil(2^37 / 500).
.LC2:
.long 274877907
.long 274877907
.long 274877907
.long 274877907
.align 16
# .LC3: multiplier used by the `i % 200` loop; 1374389535 == ceil(2^38 / 200).
.LC3:
.long 1374389535
.long 1374389535
.long 1374389535
.long 1374389535
.ident "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
.section .note.GNU-stack,"",@progbits
我正在尝试从一个简单的 C 程序中获取 SIMD 代码:
#include <stdio.h>
const int N=20000;
int main()
{
    /* Working storage: two input arrays and their element-wise sum. */
    int a[N], b[N];
    int c[N];
    int k;

    /* First input: repeating ramp 0..499. */
    for (k = 0; k < N; ++k) {
        a[k] = k % 500;
    }

    /* Second input: repeating ramp 0..199. */
    for (k = 0; k < N; ++k) {
        b[k] = k % 200;
    }

    /* Output: c = a + b, one element at a time. */
    for (k = 0; k < N; ++k) {
        c[k] = b[k] + a[k];
    }

    /* Print one sum per line. */
    for (k = 0; k < N; ++k) {
        printf("%d\n", c[k]);
    }

    return 0;
}
首先,我使用命令行标志通过 clang 禁用循环矢量化器并生成汇编代码:
clang -S -fno-vectorize sum_vec.c -o sum_scalar.s
现在我使用命令行标志 -force-vector-width 设置矢量化 SIMD 宽度并生成汇编代码:
clang -S -mllvm -force-vector-width=8 sum_vec.c -o sum_simd.s
但是,生成的代码都是标量的。如何生成 SIMD 代码?
can you post your code? – muiloo
使用 gcc 8.3.1 和 cc -O3 -S -o gvec.s -fverbose-asm fix1.c 编译[我将您的 const int 更改为 enum],得到如下矢量化的汇编代码。注意这里启用了 -O3;问题中的 clang 命令没有指定 -O 选项,默认以 -O0 编译,此时循环矢量化器不会运行:
.file "fix1.c"
# GNU C17 (GCC) version 8.3.1 20190223 (Red Hat 8.3.1-2) (x86_64-redhat-linux)
# compiled by GNU C version 8.3.1 20190223 (Red Hat 8.3.1-2), GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.1.0, isl version none
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed: fix1.c -mtune=generic -march=x86-64
# -auxbase-strip gvec.s -O3 -fverbose-asm
# options enabled: -faggressive-loop-optimizations -falign-labels
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
# -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
# -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
# -fchkp-use-static-const-bounds -fchkp-use-wrappers -fcode-hoisting
# -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -fexpensive-optimizations -fforward-propagate -ffp-int-builtin-inexact
# -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm -fgnu-runtime
# -fgnu-unique -fguess-branch-probability -fhoist-adjacent-loads -fident
# -fif-conversion -fif-conversion2 -findirect-inlining -finline
# -finline-atomics -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
# -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
# -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -floop-interchange
# -floop-unroll-and-jam -flra-remat -flto-odr-type-merging -fmath-errno
# -fmerge-constants -fmerge-debug-strings -fmove-loop-invariants
# -fomit-frame-pointer -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2 -fplt
# -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
# -freorder-blocks -freorder-blocks-and-partition -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
# -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
# -fssa-phiopt -fstdarg-opt -fstore-merging -fstrict-aliasing
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim
# -ftree-dce -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
# -ftree-loop-distribute-patterns -ftree-loop-distribution
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
# -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
# -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
# -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
# -malign-stringops -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
# -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
# -msse -msse2 -mstv -mtls-direct-seg-refs -mvzeroupper
.text
# Read-only string section holding the printf format string; main passes
# the address of .LC4 in %edi before each `call printf`.
.section .rodata.str1.1,"aMS",@progbits,1
.LC4:
.string "%d\n"
# ----------------------------------------------------------------------
# int main() -- GCC 8.3.1 -O3 output for fix1.c
# Syntax: GAS / AT&T.  Target: x86-64 Linux (SSE2 only, -march=x86-64).
#
# Stack frame after the prologue (byte offsets from %rsp):
#        0 ..     15   scratch slot written in .L4 (%sfp)
#       16 ..  80015   a[20000]
#    80016 .. 160015   b[20000]
#   160016 .. 240015   c[20000]
#
# .L2  a[i] = i % 500      vectorized, 4 ints per iteration
# .L3  b[i] = i % 200      vectorized, 4 ints per iteration
# .L4  c[i] = a[i] + b[i]  vectorized, 4 ints per iteration
# .L5  printf loop         scalar
#
# Saves/restores %rbp and %rbx (both used across the printf calls).
# Returns 0 in %eax.
#
# NOTE: the numeric `$` immediates had been stripped from this listing
# (e.g. "addq , %rax", "subq 0024, %rsp").  They are restored here; each
# value is fixed by the surrounding code: the CFA offset (240048), the
# array size (80000 bytes), the 16-byte vector stride, and the
# multiply-by-reciprocal sequences for % 500 and % 200.
# ----------------------------------------------------------------------
    .section .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl main
    .type main, @function
main:
.LFB11:
    .cfi_startproc
    pushq %rbp #
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
# fix1.c:12: a[i] = i % 500;
    pxor %xmm6, %xmm6 # tmp120
# fix1.c:7: {
    pushq %rbx #
    .cfi_def_cfa_offset 24
    .cfi_offset 3, -24
# fix1.c:12: a[i] = i % 500;
    movdqa %xmm6, %xmm7 # tmp120, tmp124
# fix1.c:7: {
# Reserve the three arrays plus padding: 8 (return address) + 16 (two
# pushes) + 240024 = 240048, which keeps %rsp 16-byte aligned for the
# movaps/movdqa accesses below.
    subq $240024, %rsp #,
    .cfi_def_cfa_offset 240048
    movdqa .LC2(%rip), %xmm5 #, tmp200
# fix1.c:7: {
    movdqa .LC0(%rip), %xmm3 #, vect_vec_iv_.9
    movdqa .LC1(%rip), %xmm2 #, tmp199
    leaq 16(%rsp), %rax #, ivtmp.49
    leaq 80016(%rsp), %rdx #, _47
# fix1.c:12: a[i] = i % 500;
    pcmpgtd %xmm5, %xmm7 # tmp200, tmp124
# fix1.c:7: {
    movdqa %xmm3, %xmm4 # vect_vec_iv_.9, vect_vec_iv_.16
# Loop 1: a[i] = i % 500.
#   xmm4 = {i, i+1, i+2, i+3}; xmm5 = .LC2 multiplier; xmm2 = step of 4;
#   xmm6 = 0; rax = store cursor; rdx = end of a[].
#   q = (i * .LC2) >> 37 is built from 32x32->64 pmuludq products: the
#   pcmpgtd masks and the `psllq $32` cross terms extend them to a signed
#   product, shufps $221 collects the high dword of each 64-bit product,
#   and psrad $5 completes the shift.
#   Then 500*q = ((((q << 5) - q) << 2) + q) << 2 and the result is i - 500*q.
    .p2align 4,,10
    .p2align 3
.L2:
# fix1.c:12: a[i] = i % 500;
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, tmp117
    movdqa %xmm6, %xmm0 # tmp120, tmp121
    movdqa %xmm7, %xmm9 # tmp124, tmp126
    addq $16, %rax #, ivtmp.49
    punpckldq %xmm4, %xmm1 # vect_vec_iv_.16, tmp117
    pcmpgtd %xmm1, %xmm0 # tmp117, tmp121
    pmuludq %xmm1, %xmm9 # tmp117, tmp126
    movdqa %xmm0, %xmm8 # tmp121, tmp125
    movdqa %xmm1, %xmm0 # tmp117, tmp127
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, tmp130
    pmuludq %xmm5, %xmm8 # tmp200, tmp125
    pmuludq %xmm5, %xmm0 # tmp200, tmp127
    punpckhdq %xmm4, %xmm1 # vect_vec_iv_.16, tmp130
    paddq %xmm9, %xmm8 # tmp126, tmp125
    movdqa %xmm7, %xmm9 # tmp124, tmp139
    psllq $32, %xmm8 #, tmp125
    pmuludq %xmm1, %xmm9 # tmp130, tmp139
    paddq %xmm8, %xmm0 # tmp125, tmp115
    movdqa %xmm6, %xmm8 # tmp120, tmp134
    pcmpgtd %xmm1, %xmm8 # tmp130, tmp134
    pmuludq %xmm5, %xmm1 # tmp200, tmp140
    pmuludq %xmm5, %xmm8 # tmp200, tmp138
    paddq %xmm9, %xmm8 # tmp139, tmp138
    psllq $32, %xmm8 #, tmp138
    paddq %xmm8, %xmm1 # tmp138, tmp128
    shufps $221, %xmm1, %xmm0 #, tmp128, vect_patt_65.17
    psrad $5, %xmm0 #, vect_patt_66.18
    movdqa %xmm0, %xmm1 # vect_patt_66.18, tmp146
    pslld $5, %xmm1 #, tmp146
    psubd %xmm0, %xmm1 # vect_patt_66.18, tmp147
    pslld $2, %xmm1 #, tmp148
    paddd %xmm1, %xmm0 # tmp148, vect_patt_67.19
    movdqa %xmm4, %xmm1 # vect_vec_iv_.16, vect_patt_68.20
    paddd %xmm2, %xmm4 # tmp199, vect_vec_iv_.16
    pslld $2, %xmm0 #, tmp150
    psubd %xmm0, %xmm1 # tmp150, vect_patt_68.20
    movaps %xmm1, -16(%rax) # vect_patt_68.20, MEM[base: _49, offset: 0B]
    cmpq %rdx, %rax # _47, ivtmp.49
    jne .L2 #,
    movdqa .LC3(%rip), %xmm4 #, tmp201
# fix1.c:16: b[i] = i % 200;
    pxor %xmm5, %xmm5 # tmp158
    leaq 80016(%rsp), %rax #, tmp214
    movdqa %xmm5, %xmm6 # tmp158, tmp162
    leaq 80000(%rax), %rdx #, _4
    pcmpgtd %xmm4, %xmm6 # tmp201, tmp162
# Loop 2: b[i] = i % 200.  Same scheme as loop 1, with
#   xmm3 = {i, i+1, i+2, i+3}; xmm4 = .LC3 multiplier; xmm5 = 0.
#   q = (i * .LC3) >> 38 (high dword, then psrad $6).
#   200*q = ((((q << 1) + q) << 3) + q) << 3 and the result is i - 200*q.
    .p2align 4,,10
    .p2align 3
.L3:
# fix1.c:16: b[i] = i % 200;
    movdqa %xmm3, %xmm1 # vect_vec_iv_.9, tmp155
    movdqa %xmm5, %xmm0 # tmp158, tmp159
    movdqa %xmm6, %xmm8 # tmp162, tmp164
    addq $16, %rax #, ivtmp.43
    punpckldq %xmm3, %xmm1 # vect_vec_iv_.9, tmp155
    pcmpgtd %xmm1, %xmm0 # tmp155, tmp159
    pmuludq %xmm1, %xmm8 # tmp155, tmp164
    movdqa %xmm0, %xmm7 # tmp159, tmp163
    movdqa %xmm1, %xmm0 # tmp155, tmp165
    movdqa %xmm3, %xmm1 # vect_vec_iv_.9, tmp168
    pmuludq %xmm4, %xmm7 # tmp201, tmp163
    pmuludq %xmm4, %xmm0 # tmp201, tmp165
    punpckhdq %xmm3, %xmm1 # vect_vec_iv_.9, tmp168
    paddq %xmm8, %xmm7 # tmp164, tmp163
    movdqa %xmm6, %xmm8 # tmp162, tmp177
    psllq $32, %xmm7 #, tmp163
    pmuludq %xmm1, %xmm8 # tmp168, tmp177
    paddq %xmm7, %xmm0 # tmp163, tmp153
    movdqa %xmm5, %xmm7 # tmp158, tmp172
    pcmpgtd %xmm1, %xmm7 # tmp168, tmp172
    pmuludq %xmm4, %xmm1 # tmp201, tmp178
    pmuludq %xmm4, %xmm7 # tmp201, tmp176
    paddq %xmm8, %xmm7 # tmp177, tmp176
    psllq $32, %xmm7 #, tmp176
    paddq %xmm7, %xmm1 # tmp176, tmp166
    movdqa %xmm3, %xmm7 # vect_vec_iv_.9, vect_patt_50.13
    paddd %xmm2, %xmm3 # tmp199, vect_vec_iv_.9
    shufps $221, %xmm1, %xmm0 #, tmp166, vect_patt_47.10
    psrad $6, %xmm0 #, vect_patt_48.11
    movdqa %xmm0, %xmm1 # vect_patt_48.11, tmp184
    pslld $1, %xmm1 #, tmp184
    paddd %xmm0, %xmm1 # vect_patt_48.11, tmp185
    pslld $3, %xmm1 #, tmp186
    paddd %xmm1, %xmm0 # tmp186, vect_patt_49.12
    pslld $3, %xmm0 #, tmp188
    psubd %xmm0, %xmm7 # tmp188, vect_patt_50.13
    movaps %xmm7, -16(%rax) # vect_patt_50.13, MEM[base: _10, offset: 0B]
    cmpq %rdx, %rax # _4, ivtmp.43
    jne .L3 #,
    xorl %eax, %eax # ivtmp.34
# Loop 3: c[i] = a[i] + b[i], four ints per iteration.
#   rax = byte offset into the arrays, 0 .. 80000 in steps of 16.
    .p2align 4,,10
    .p2align 3
.L4:
# fix1.c:23: c[i] = a[i] + b[i];
    movdqa 80016(%rsp,%rax), %xmm0 # MEM[symbol: b, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    movdqa 16(%rsp,%rax), %xmm2 # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    paddd %xmm2, %xmm0 # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
# Compiler-emitted store to the frame scratch slot; nothing in this
# listing reads it back.
    movaps %xmm2, (%rsp) # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], %sfp
# fix1.c:23: c[i] = a[i] + b[i];
    movaps %xmm0, 160016(%rsp,%rax) # vect__5.6, MEM[symbol: c, index: ivtmp.34_1, offset: 0B]
    addq $16, %rax #, ivtmp.34
    cmpq $80000, %rax #, ivtmp.34
    jne .L4 #,
    leaq 160016(%rsp), %rbx #, tmp229
    leaq 240016(%rsp), %rbp #, _39
# Loop 4: print c[i], one int per call.
#   rbx = &c[i], rbp = &c[N]; both are callee-saved, so they survive printf.
    .p2align 4,,10
    .p2align 3
.L5:
# fix1.c:27: printf("%d\n", c[i]);
    movl (%rbx), %esi # MEM[base: _40, offset: 0B],
    movl $.LC4, %edi #,
# %al = 0: no vector-register arguments for the variadic call.
    xorl %eax, %eax #
    addq $4, %rbx #, ivtmp.29
    call printf #
# fix1.c:26: for (int i = 0; i < N; i++) {
    cmpq %rbx, %rbp # ivtmp.29, _39
    jne .L5 #,
# fix1.c:31: }
    addq $240024, %rsp #,
    .cfi_def_cfa_offset 24
    xorl %eax, %eax #
    popq %rbx #
    .cfi_def_cfa_offset 16
    popq %rbp #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size main, .-main
# 16-byte vector constants that main loads with movdqa (hence .align 16).
.section .rodata.cst16,"aM",@progbits,16
.align 16
# .LC0: initial lanes {0, 1, 2, 3} of the vectorized loop counter.
.LC0:
.long 0
.long 1
.long 2
.long 3
.align 16
# .LC1: per-iteration step {4, 4, 4, 4} added to the loop-counter vector.
.LC1:
.long 4
.long 4
.long 4
.long 4
.align 16
# .LC2: multiplier used by the `i % 500` loop; 274877907 == ceil(2^37 / 500).
.LC2:
.long 274877907
.long 274877907
.long 274877907
.long 274877907
.align 16
# .LC3: multiplier used by the `i % 200` loop; 1374389535 == ceil(2^38 / 200).
.LC3:
.long 1374389535
.long 1374389535
.long 1374389535
.long 1374389535
.ident "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
.section .note.GNU-stack,"",@progbits