; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s

; Test prolog sequences for stack probing when SVE objects are involved.

; The space for SVE objects needs probing in the general case, because
; the stack adjustment may happen to be too big (i.e. greater than the
; probe size) to allocate with a single `addvl`.
; When we do know that the stack adjustment cannot exceed the probe size
; we can avoid emitting a probe loop and emit a simple `addvl; str`
; sequence instead.

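; As a rough sketch (illustrative only, not matched by the assertions below),
; the two prologue shapes being tested are:
;   addvl sp, sp, #-N   ; adjustment known to fit within the probe size
;   str xzr, [sp]       ; single probe of the lowest allocated address
; versus a loop that repeatedly subtracts the probe size from sp, probing
; each step, until sp reaches a target address precomputed in x9.
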
define void @sve_1_vector(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 4 SVE vectors of stack space.
define void @sve_4_vector(ptr %out) #0 {
; CHECK-LABEL: sve_4_vector:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 16 SVE vectors of stack space.
; The stack adjustment is less than or equal to 16 x 256 = 4096, so
; we can allocate the locals at once.
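; (Each <vscale x 4 x float> slot occupies 16 * vscale bytes; vscale is at
; most 16 for SVE, so a slot is at most 256 bytes, which is where the
; 16 x 256 bound above comes from.)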
define void @sve_16_vector(ptr %out) #0 {
; CHECK-LABEL: sve_16_vector:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-16
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: addvl sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 17 SVE vectors of stack space. Now we need
; a probing loop, since the stack adjustment may be greater than
; the probe size (17 x 256 = 4352 bytes).
; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]`
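; For illustration, the unrolled form suggested by the TODO above would be,
; for 17 = 1*16 + 1 vectors (this is not what is currently generated):
;   addvl sp, sp, #-16
;   str xzr, [sp]
;   addvl sp, sp, #-1
;   str xzr, [sp]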
define void @sve_17_vector(ptr %out) #0 {
; CHECK-LABEL: sve_17_vector:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl x9, sp, #-17
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
; CHECK-NEXT: .LBB3_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB3_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB3_1
; CHECK-NEXT: .LBB3_3: // %entry
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: .cfi_def_cfa_register wsp
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  %vec17 = alloca <vscale x 4 x float>, align 16
  ret void
}

; Space for callee-saved SVE registers is allocated similarly to
; space for SVE locals. When we know the stack adjustment cannot exceed the
; probe size we can skip the explicit probe, since saving SVE registers serves
; as an implicit probe.
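; For example, in the prologue below the callee-save spill `str z8, [sp]`
; writes to the lowest byte of the `addvl sp, sp, #-1` allocation, touching
; the stack exactly where an explicit `str xzr, [sp]` probe would.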
define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_1v_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{z8}" ()
  ret void
}

define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_4v_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: .cfi_restore z9
; CHECK-NEXT: .cfi_restore z10
; CHECK-NEXT: .cfi_restore z11
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
  ret void
}

define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_16v_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-16
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: .cfi_restore z9
; CHECK-NEXT: .cfi_restore z10
; CHECK-NEXT: .cfi_restore z11
; CHECK-NEXT: .cfi_restore z12
; CHECK-NEXT: .cfi_restore z13
; CHECK-NEXT: .cfi_restore z14
; CHECK-NEXT: .cfi_restore z15
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
  ret void
}

define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_1p_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{p8}" ()
  ret void
}

define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_4p_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
  ret void
}

define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_16v_1p_csr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl x9, sp, #-17
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
; CHECK-NEXT: .LBB9_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB9_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB9_1
; CHECK-NEXT: .LBB9_3: // %entry
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: .cfi_def_cfa_register wsp
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: .cfi_restore z9
; CHECK-NEXT: .cfi_restore z10
; CHECK-NEXT: .cfi_restore z11
; CHECK-NEXT: .cfi_restore z12
; CHECK-NEXT: .cfi_restore z13
; CHECK-NEXT: .cfi_restore z14
; CHECK-NEXT: .cfi_restore z15
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
  ret void
}

; An SVE vector and a 16-byte fixed-size object.
define void @sve_1_vector_16_arr(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_16_arr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_def_cfa wsp, 32
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  %arr = alloca i8, i64 16, align 1
  ret void
}

; A large SVE stack object and a large stack slot, both of which need probing.
; TODO: This could be optimised by combining the fixed-size offset into the
; loop.
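; (Note: the loop target below is computed with two `addvl x9, x9, #-32`
; steps because the addvl immediate is limited to the range [-32, 31].)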
define void @sve_1_vector_4096_arr(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_4096_arr:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288
; CHECK-NEXT: .cfi_def_cfa w9, 12304
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
; CHECK-NEXT: .LBB11_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB11_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB11_1
; CHECK-NEXT: .LBB11_3: // %entry
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: .cfi_def_cfa_register wsp
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_def_cfa wsp, 12304
; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 256 x float>, align 16
  %arr = alloca i8, i64 12288, align 1
  ret void
}

; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
; supported even without stack-probing.

; An SVE vector and a 16-byte fixed-size object which
; has a large alignment requirement.
define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
; CHECK-NEXT: sub x9, x9, #4080
; CHECK-NEXT: addvl x9, x9, #-1
; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
; CHECK-NEXT: .LBB12_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB12_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB12_1
; CHECK-NEXT: .LBB12_3: // %entry
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  %arr = alloca i8, i64 16, align 8192
  ret void
}

; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
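; Informal sizing: <vscale x 1024 x float> is 4096 * vscale bytes and vscale
; is at most 16, so the allocation is at most 4096 * 16 = 65536 bytes, i.e.
; it cannot exceed the 64 KiB probe size and a single trailing probe store
; suffices.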
define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
; CHECK-LABEL: sve_1024_64k_guard:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
; CHECK-NEXT: addvl sp, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 1024 x float>, align 16
  ret void
}

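; As above, plus one extra <vscale x 4 x float>. The worst-case allocation
; grows to (4096 + 16) * 16 = 65792 bytes, which can exceed the 64 KiB probe
; size, so a probing loop is needed.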
define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
; CHECK-LABEL: sve_1028_64k_guard:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl x9, sp, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
; CHECK-NEXT: addvl x9, x9, #-32
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
; CHECK-NEXT: addvl x9, x9, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
; CHECK-NEXT: .LBB14_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB14_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB14_1
; CHECK-NEXT: .LBB14_3: // %entry
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: .cfi_def_cfa_register wsp
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
; CHECK-NEXT: addvl sp, sp, #31
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec = alloca <vscale x 1024 x float>, align 16
  %vec1 = alloca <vscale x 4 x float>, align 16
  ret void
}

; With 5 SVE vectors of stack space, the unprobed area
; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
; hence we need to issue a probe.
define void @sve_5_vector(ptr %out) #0 {
; CHECK-LABEL: sve_5_vector:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-5
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: addvl sp, sp, #5
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  ret void
}

; Test with 14 scalable bytes (so up to 14 * 16 = 224 bytes) of unprobed
; area below the save location of `p9`.
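; Informal accounting: `p9` is spilled at `[sp, #7, mul vl]`, which is
; 7 * 2 = 14 scalable bytes above sp, so the gap below the save is at most
; 14 * 16 = 224 bytes when vscale is 16, comfortably under the 1024-byte
; unprobed limit.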
define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-LABEL: sve_unprobed_area:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: .cfi_restore z9
; CHECK-NEXT: .cfi_restore z10
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()

  %v0 = alloca <vscale x 4 x float>, align 16
  %v1 = alloca <vscale x 4 x float>, align 16
  %v2 = alloca <vscale x 4 x float>, align 16
  %v3 = alloca <vscale x 4 x float>, align 16

  ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
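
; To regenerate the assertions above after a codegen change, rerun
; utils/update_llc_test_checks.py (the script named in the NOTE at the top)
; on this file; the exact invocation depends on your checkout and build.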