455 lines
23 KiB
LLVM
455 lines
23 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
|
|
; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
|
|
|
|
; Test end-to-end codegen for outgoing arguments passed on the
|
|
; stack. This test is likely redundant when all DAG and GlobalISel
|
|
; tests are unified.
|
|
|
|
; External callee taking two <16 x i32> operands followed by one <4 x i32>
; operand. Called by kernel_caller_stack and func_caller_stack.
declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
|
|
; External callee taking a single addrspace(5) pointer with a
; byval([16 x i32]) attribute. Called by kernel_caller_byval and
; func_caller_byval.
declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32])) #0
|
|
|
|
; kernel_caller_stack: amdgpu_kernel entry point that calls
; @external_void_func_v16i32_v16i32_v4i32 with two undef <16 x i32> operands
; and the constant vector <9, 10, 11, 12>. The expected output below writes
; the four constants to offsets 4, 8, 12 and 16 from s32 before the call,
; i.e. the <4 x i32> operand is passed in memory.
; NOTE(review): presumably the two <16 x i32> operands exhaust the argument
; VGPRs, which is why the last operand ends up on the stack - confirm against
; the AMDGPU calling convention.
define amdgpu_kernel void @kernel_caller_stack() {
|
|
; MUBUF-LABEL: kernel_caller_stack:
|
|
; MUBUF: ; %bb.0:
|
|
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
|
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_mov_b32 s32, 0
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
|
|
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
|
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
|
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
|
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: kernel_caller_stack:
|
|
; FLATSCR: ; %bb.0:
|
|
; FLATSCR-NEXT: s_mov_b32 s32, 0
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
|
|
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
|
|
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
|
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s2
|
|
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; FLATSCR-NEXT: s_endpgm
|
|
; Only the last operand carries defined values; the first two are undef, and
; the expected output above contains no stores for them.
call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
|
|
ret void
|
|
}
|
|
|
|
; kernel_caller_byval: amdgpu_kernel entry point that zero-fills an
; addrspace(5) [16 x i32] alloca with llvm.memset and passes it to
; @external_void_func_byval as a byval([16 x i32]) argument. The expected
; output loads 64 bytes back from the alloca (offsets 8..68) and stores them
; at offsets 0..60 from s32 before the call.
; NOTE(review): the memset length is 128 bytes while [16 x i32] occupies only
; 64 bytes; the expected output contains zero stores covering the full 128
; bytes (offsets 8..132). Confirm whether this is intentional before changing
; it - the autogenerated assertions would have to be regenerated.
define amdgpu_kernel void @kernel_caller_byval() {
|
|
; MUBUF-LABEL: kernel_caller_byval:
|
|
; MUBUF: ; %bb.0:
|
|
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
|
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
|
|
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
|
|
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
|
|
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
|
|
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
|
|
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
|
|
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
|
|
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
|
|
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
|
|
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
|
|
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
|
|
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
|
|
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
|
|
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
|
|
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
|
|
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
|
|
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
|
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
|
|
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
|
; MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60
|
|
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: kernel_caller_byval:
|
|
; FLATSCR: ; %bb.0:
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
|
|
; FLATSCR-NEXT: s_nop 0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
|
|
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
|
|
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
|
|
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
|
|
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
|
|
; FLATSCR-NEXT: s_add_u32 s2, s32, 8
|
|
; FLATSCR-NEXT: s_add_u32 s3, s32, 16
|
|
; FLATSCR-NEXT: s_add_u32 s4, s32, 24
|
|
; FLATSCR-NEXT: s_add_u32 s5, s32, 32
|
|
; FLATSCR-NEXT: s_add_u32 s6, s32, 40
|
|
; FLATSCR-NEXT: s_add_u32 s7, s32, 48
|
|
; FLATSCR-NEXT: s_add_u32 s8, s32, 56
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s3
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[6:7], s4
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[8:9], s5
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[10:11], s6
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[12:13], s7
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[14:15], s8
|
|
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; FLATSCR-NEXT: s_endpgm
|
|
; Local buffer that is zero-filled and then handed to the callee by value.
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
; NOTE(review): length operand is 128 although the alloca above is 64 bytes.
call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 0, i32 128, i1 false)
|
|
call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %alloca)
|
|
ret void
|
|
}
|
|
|
|
; func_caller_stack: issues the same call as kernel_caller_stack, but from a
; function defined without the amdgpu_kernel calling convention. Besides the
; four argument stores at offsets 4..16 from s32, the expected output saves
; s30, s31 and the incoming s33 value in lanes 0..2 of v40 around the call,
; spills and reloads v40 itself, adjusts s32 before and after, and returns
; with s_setpc_b64 instead of s_endpgm.
define void @func_caller_stack() {
|
|
; MUBUF-LABEL: func_caller_stack:
|
|
; MUBUF: ; %bb.0:
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: s_mov_b32 s4, s33
|
|
; MUBUF-NEXT: s_mov_b32 s33, s32
|
|
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
|
|
; MUBUF-NEXT: s_addk_i32 s32, 0x400
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
|
|
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
|
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
|
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
|
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
|
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
|
|
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
|
|
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
|
|
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
|
|
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
|
|
; MUBUF-NEXT: s_mov_b32 s33, s4
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: func_caller_stack:
|
|
; FLATSCR: ; %bb.0:
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: s_mov_b32 s0, s33
|
|
; FLATSCR-NEXT: s_mov_b32 s33, s32
|
|
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
|
|
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
|
|
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
|
|
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
|
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
|
|
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
|
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
|
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
|
|
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
|
|
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
|
|
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
|
|
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
|
|
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
|
|
; FLATSCR-NEXT: s_mov_b32 s33, s0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
; Same operands as in kernel_caller_stack: two undef vectors followed by the
; constant <9, 10, 11, 12>.
call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
|
|
ret void
|
|
}
|
|
|
|
; func_caller_byval: forwards the incoming addrspace(5) pointer %argptr to
; @external_void_func_byval as a byval([16 x i32]) argument. The expected
; output copies 64 bytes, using v0 as the source address (v0 presumably holds
; %argptr - confirm against the calling convention), to offsets 0..60 from
; s32 before the call. As in func_caller_stack, s30, s31 and the incoming s33
; value are kept in lanes of v40 across the call.
define void @func_caller_byval(ptr addrspace(5) %argptr) {
|
|
; MUBUF-LABEL: func_caller_byval:
|
|
; MUBUF: ; %bb.0:
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: s_mov_b32 s4, s33
|
|
; MUBUF-NEXT: s_mov_b32 s33, s32
|
|
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; MUBUF-NEXT: s_addk_i32 s32, 0x400
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
|
|
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
|
|
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
|
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
|
|
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:48
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52
|
|
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
|
|
; MUBUF-NEXT: s_nop 0
|
|
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
|
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
|
|
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
|
|
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
|
|
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
|
|
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
|
|
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
|
|
; MUBUF-NEXT: s_mov_b32 s33, s4
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: func_caller_byval:
|
|
; FLATSCR: ; %bb.0:
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: s_mov_b32 s0, s33
|
|
; FLATSCR-NEXT: s_mov_b32 s33, s32
|
|
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
|
|
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
|
|
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
|
|
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
|
|
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 16, v0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 24, v0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 24
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 32, v0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 32
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 40, v0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 40
|
|
; FLATSCR-NEXT: v_add_u32_e32 v3, 48, v0
|
|
; FLATSCR-NEXT: v_add_u32_e32 v0, 56, v0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
|
|
; FLATSCR-NEXT: s_add_u32 s0, s32, 48
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
|
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
|
|
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
|
|
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2
|
|
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
|
|
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
|
|
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
|
|
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
|
|
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
|
|
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
|
|
; FLATSCR-NEXT: s_mov_b32 s33, s0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
; The caller-provided pointer is passed straight through; the byval attribute
; on the call is what produces the copy seen in the expected output above.
call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %argptr)
|
|
ret void
|
|
}
|
|
|
|
; memset intrinsic overload for an addrspace(5) destination with an i32
; length; used by kernel_caller_byval to zero-fill its alloca.
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #1
|
|
|
|
; Attribute group #0: attached to the two external callee declarations
; (@external_void_func_v16i32_v16i32_v4i32 and @external_void_func_byval).
; NOTE(review): the "amdgpu-no-*" strings presumably tell the backend the
; callees do not need the corresponding implicit inputs - confirm against the
; AMDGPU backend documentation before editing, as they can affect the
; expected output.
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
|
|
; Attribute group #1: attached to the @llvm.memset.p5.i32 declaration.
attributes #1 = { argmemonly nofree nounwind willreturn writeonly }
|