# sysy-data/performance_c/asm/stencil0.s
#
# Size: 279 lines, 4.1 KiB
# Language: RISC-V (rv64) assembly, GNU as syntax (the listing labelled it "ArmAsm")

# File-level directives for the whole translation unit.
# Source file name recorded in the object file.
.file "stencil0.sy"
# Assemble as position-independent code.
.option pic
# Target ISA string recorded as an ELF attribute: RV64 with the I, M, A, F, D
# and C extensions plus Zicsr and Zifencei.
.attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0"
# Attribute value 0: this code is not meant to perform unaligned accesses.
.attribute unaligned_access, 0
# The code keeps the stack pointer aligned to 16 bytes.
.attribute stack_align, 16
# -----------------------------------------------------------------------
# cutout: clamp a value to the range [0, 255].
#   In:    a0 = value
#   Out:   a0 = 255 if value > 255, 0 if value < 0, otherwise value
#   Clobb: a5
# -----------------------------------------------------------------------
        .text
        .align  1
        .globl  cutout
        .type   cutout, @function
cutout:
        li      a5,255
        bgt     a0,a5,.L2               # above the ceiling: saturate
        sext.w  a0,a0                   # work on the low 32 bits, sign-extended
        slti    a5,a0,0                 # a5 = 1 when negative, else 0
        addi    a5,a5,-1                # a5 = 0 when negative, else all ones
        and     a0,a0,a5                # negative values collapse to 0
        ret
.L2:
        li      a0,255
        ret
        .size   cutout, .-cutout
# -----------------------------------------------------------------------
# main: calls getarray(image_in), computes a clamped 3x3 filter of
# image_in into image_out, copies the border elements unchanged, calls
# putarray(524288, image_out) and returns the value getarray returned.
#
# Layout implied by the address arithmetic below: 4-byte elements, a row
# stride of 2048 bytes (512 elements) and 1024 rows (2097152 bytes).
# For every non-border element:
#     out = clamp(8*centre - sum of the 8 surrounding elements, 0, 255)
# The clamp is the same sequence cutout uses, expanded in place.
#
# Frame: 80 bytes holding ra and s0-s8. ra is clobbered by the calls and
# is also reused as a scratch register (a loop bound) further down.
# -----------------------------------------------------------------------
.section .text.startup,"ax",@progbits
.align 1
.globl main
.type main, @function
main:
# Prologue: allocate the frame and spill ra and s0-s8. The argument for
# getarray (a0 = address of image_in) is set up in between.
addi sp,sp,-80
lla a0,image_in
sd ra,72(sp)
sd s8,0(sp)
sd s0,64(sp)
sd s1,56(sp)
sd s2,48(sp)
sd s3,40(sp)
sd s4,32(sp)
sd s5,24(sp)
sd s6,16(sp)
sd s7,8(sp)
call getarray@plt
# s8 keeps getarray's result; it is main's return value at the end.
mv s8,a0
# _sysy_starttime(25). The meaning of the argument is not visible here
# (presumably a source line number - TODO confirm).
li a0,25
call _sysy_starttime@plt
# Constants and row cursors for the interior pass (current row starts at 1):
#   t2 = -4084 (-4096 + 12)    offset from a6 to element 2 of the row above
#   t4 = 2048 (4096 - 2048)    row stride in bytes
#   t0 = 523264 (524288 - 1024) final value of the counter t6
#   t3 = image_in + 2036       element 509 of the row above the current row
#   t5 = image_out + 4084      output element 509 of the current row
#   t1 = image_in              start pointer for the column copy further down
#   a6 = image_in + 4092       element 511 (last) of the current input row
#   t6 = 0                     counter, advanced by 512 per processed row
#   a7 = 255                   clamp ceiling
#   ra = 509                   exit value for the unrolled-loop index s5
li t2,-4096
li t4,4096
li t0,524288
lla t3,image_in+2036
lla t5,image_out+4084
lla t1,image_in
lla a6,image_in+4092
li t6,0
addi t2,t2,12
li a7,255
li ra,509
addi t4,t4,-2048
addi t0,t0,-1024
# ---- Outer loop: one pass per interior row, rows 1 to 1022 ----
.L6:
# Preload columns 0 and 1 of the 3x3 window:
#   s4, s3 = row below     s2, s1 = current row     s0, a0 = row above
lw s4,4(a6)
lw s3,8(a6)
lw s2,-2044(a6)
lw s1,-2040(a6)
lw s0,-2036(t3)
lw a0,-2032(t3)
addiw t6,t6,512
# Cursors for the unrolled loop: a1 / a2 / a3 point at column 2 of the row
# above / the current row / the row below, a4 at output column 1.
add a1,a6,t2
addi a2,a6,-2036
addi a3,a6,12
addi a4,t5,-2032
li s5,1
# ---- Inner loop, unrolled by two: output columns j and j+1 per pass ----
# On entry s0/a0, s2/s1 and s4/s3 hold columns j-1 and j of the row above,
# the current row and the row below, and a1/a2/a3 point at column j+1.
# Loads of the next columns are interleaved with the arithmetic, so the
# instruction order matters: s0, s2 and s4 are reloaded in the middle of
# the first sum, after their old values have been subtracted.
.L15:
# First element: a5 = 8*centre (s1) minus the eight neighbours.
slliw a5,s1,3
subw a5,a5,s0
lw s0,0(a1)
subw a5,a5,a0
addiw s5,s5,2
subw a5,a5,s0
subw a5,a5,s2
lw s2,0(a2)
subw a5,a5,s2
subw a5,a5,s4
lw s4,0(a3)
subw a5,a5,s3
# s6 starts accumulating the second element: 8*centre (the new s2).
slliw s6,s2,3
subw a5,a5,s4
subw s6,s6,a0
sext.w s7,a5
subw s6,s6,s0
# Upper clamp: values above 255 become 255.
ble s7,a7,.L11
li a5,255
li s7,255
.L11:
lw a0,4(a1)
# Lower clamp: s7 becomes all ones when the value is non-negative and 0
# when it is negative, so the AND below turns negative results into 0.
not s7,s7
srai s7,s7,63
subw s6,s6,a0
subw s6,s6,s1
lw s1,4(a2)
and a5,a5,s7
# Store output column j.
sw a5,0(a4)
# Finish the second element with the remaining neighbours.
subw a5,s6,s1
subw a5,a5,s3
lw s3,4(a3)
subw a5,a5,s4
addi a1,a1,8
subw a5,a5,s3
sext.w s6,a5
# Same clamp as above, using s6 for the mask.
ble s6,a7,.L13
li a5,255
li s6,255
.L13:
not s6,s6
srai s6,s6,63
and a5,a5,s6
# Store output column j+1 and advance the cursors by two elements.
sw a5,4(a4)
addi a2,a2,8
addi a3,a3,8
addi a4,a4,8
# s5 runs 1, 3, ... and the loop exits at 509, after columns 1 to 508.
bne s5,ra,.L15
# ---- Remainder loop: columns 509 and 510, one element per pass ----
#   a3 = current row, a1 = row below, a2 = row above (from t3),
#   a0 = output (from t5); all four point at column 509.
addi a3,a6,-8
addi a1,a6,2040
mv a0,t5
mv a2,t3
.L9:
# a5 = 8*centre minus the three elements above, the left and right
# neighbours, and the three elements below.
lw a5,0(a3)
lw s2,-4(a2)
lw s1,0(a2)
lw s0,4(a2)
slliw a5,a5,3
lw a4,-4(a3)
subw a5,a5,s2
subw a5,a5,s1
lw s2,4(a3)
subw a5,a5,s0
lw s1,-4(a1)
subw a5,a5,a4
lw s0,0(a1)
lw a4,4(a1)
subw a5,a5,s2
subw a5,a5,s1
subw a5,a5,s0
subw a5,a5,a4
sext.w a4,a5
addi a3,a3,4
# Clamp to [0, 255] exactly as in the unrolled loop, with a4 as the mask.
ble a4,a7,.L7
li a5,255
li a4,255
.L7:
not a4,a4
srai a4,a4,63
and a5,a5,a4
sw a5,0(a0)
addi a2,a2,4
addi a1,a1,4
addi a0,a0,4
# Stop when a3 reaches a6, i.e. column 511 (the right border).
bne a3,a6,.L9
# Advance the row cursors by one stride. a3 equals a6 at this point, so
# a6 = a3 + t4 moves a6 to the next row as well.
add t3,t3,t4
add t5,t5,t4
add a6,a3,t4
bne t6,t0,.L6
# ---- Border copy, columns: first and last element of every row ----
#   a4 = 2048 (row stride), a5 = output cursor, t1 = input cursor,
#   a1 = image_in + 2097152 (end of image_in, also reused by .L20 below).
li a4,4096
lla a5,image_out
lla a1,image_in+2097152
addi a4,a4,-2048
.L18:
lw a2,0(t1)
lw a3,2044(t1)
add t1,t1,a4
sw a2,0(a5)
sw a3,2044(a5)
add a5,a5,a4
bne a1,t1,.L18
# ---- Border copy, first row: bytes 0 to 2047, 32 bytes per pass ----
lla a5,image_in
lla a4,image_out
lla a3,image_in+2048
.L19:
ld a7,0(a5)
ld a6,8(a5)
ld a0,16(a5)
ld a2,24(a5)
sd a7,0(a4)
sd a6,8(a4)
sd a0,16(a4)
sd a2,24(a4)
addi a5,a5,32
addi a4,a4,32
bne a5,a3,.L19
# ---- Border copy, last row: bytes 2095104 to the end, 32 per pass ----
lla a5,image_in+2095104
lla a4,image_out+2095104
.L20:
ld a6,0(a5)
ld a0,8(a5)
ld a2,16(a5)
ld a3,24(a5)
sd a6,0(a4)
sd a0,8(a4)
sd a2,16(a4)
sd a3,24(a4)
addi a5,a5,32
addi a4,a4,32
bne a5,a1,.L20
# _sysy_stoptime(61); the argument's meaning is not visible here
# (presumably a source line number - TODO confirm).
li a0,61
call _sysy_stoptime@plt
# putarray(524288, image_out): element count first, then the array address.
li a0,524288
lla a1,image_out
call putarray@plt
# Epilogue: restore ra and s0-s8, return the saved getarray result.
ld ra,72(sp)
ld s0,64(sp)
ld s1,56(sp)
ld s2,48(sp)
ld s3,40(sp)
ld s4,32(sp)
ld s5,24(sp)
ld s6,16(sp)
ld s7,8(sp)
mv a0,s8
ld s8,0(sp)
addi sp,sp,80
jr ra
.size main, .-main
# -----------------------------------------------------------------------
# Zero-initialised global objects, laid out in .bss in a fixed order.
# -----------------------------------------------------------------------

# bss_object NAME, BYTES
#   Exports NAME as an object symbol of size BYTES and reserves BYTES
#   zero bytes for it at the current location.
        .macro  bss_object name, bytes
        .globl  \name
        .type   \name, @object
        .size   \name, \bytes
\name:
        .zero   \bytes
        .endm

        .bss
        .align  3

        # Image buffers: 2097152 bytes each.
        bss_object image_out, 2097152
        bss_object image_in, 2097152

        # Timing bookkeeping symbols.
        bss_object _sysy_idx, 4
        .zero   4                       # padding after the 4-byte _sysy_idx
        bss_object _sysy_us, 4096
        bss_object _sysy_s, 4096
        bss_object _sysy_m, 4096
        bss_object _sysy_h, 4096
        bss_object _sysy_l2, 4096
        bss_object _sysy_l1, 4096
        bss_object _sysy_end, 16
        bss_object _sysy_start, 16

        .ident  "GCC: (Debian 12.2.0-13) 12.2.0"
        .section        .note.GNU-stack,"",@progbits