; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s

; Intrinsic declarations.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; All NaN values are converted to 0xffffffff.
; The NaN select operand is expected to appear as the inline constant -1 in
; src0 of v_cndmask_b32, with the integer compare inverted (s_cmp_eq_u32 for
; the icmp ne). The other select operand is loaded from memory, so it is
; already in a VGPR when it reaches the v_cndmask.
define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cnd_nan_nosgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s8, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_cmp_eq_u32 s8, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cnd_nan_nosgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cnd_nan_nosgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan_nosgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
  %f = load float, ptr addrspace(1) %f.gep
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, ptr addrspace(1) %out
  ret void
}

; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However, on GFX10 the constant bus is limited to two scalar operands, not one.
; All NaN values are converted to 0xffffffff.
; Here %f is a kernel argument. The SI and VI checks expect it to be copied to
; a VGPR (v_mov_b32) before a VOP2 v_cndmask_b32 on vcc; the GFX10 and GFX11
; checks expect the e64 v_cndmask_b32 to read it directly from an SGPR.
define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
; SI-LABEL: v_cnd_nan:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cnd_nan:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cnd_nan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, ptr addrspace(1) %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)
; The fcmp one against 0.0 is expected to be emitted in inverted form
; (v_cmp_nlg_f32), which leaves the constant 1.0 in src0 of v_cndmask_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, 0.0)), 1.0, sgprX)
; The select's false operand is the same SGPR kernel argument that is compared.
; The GFX10 and GFX11 checks expect that one SGPR to be read by both the
; v_cmp_nlg_f32 and the e64 v_cndmask_b32; the SI and VI checks expect a VGPR
; copy for the v_cndmask_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, 0.0)), 0.0, sgprZ)
; The select constant equals the compare constant (0.0). The checks expect it
; as the inline constant 0 in src0 of v_cndmask_b32, with the fcmp one emitted
; in inverted form (v_cmp_nlg_f32).
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, 0.0)), 0.0, sgprX)
; Both the compare and the select use the constant 0.0, and the select's false
; operand is the compared SGPR itself. The checks expect the inline constant 0
; in src0 of v_cndmask_b32 and an inverted compare (v_cmp_nlg_f32).
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, 0.0)), 0.0, vgprZ)
; %x is a kernel argument and %z is loaded per work-item from %z.ptr. Every
; target's checks expect the VOP2 form of v_cndmask_b32 on vcc, with the inline
; constant 0 in src0 and the loaded value in src1.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %z = load float, ptr addrspace(1) %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (sgprX, 0.0)), 1.0, vgprZ)
; %x is a kernel argument and %z is loaded per work-item from %z.ptr. Every
; target's checks expect the VOP2 form of v_cndmask_b32 on vcc, with the inline
; constant 1.0 in src0 and the loaded value in src1.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %z = load float, ptr addrspace(1) %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (vgprX, 0.0)), 1.0, sgprZ)
; %x is loaded per work-item from %x.ptr and %z is a kernel argument. The
; fcmp olt %x, 0.0 is expected as the inverted, operand-swapped
; v_cmp_ngt_f32 0, x so that 1.0 stays in src0 of v_cndmask_b32. The SI and VI
; checks expect %z to be copied to a VGPR first; the GFX10 and GFX11 checks
; expect the e64 v_cndmask_b32 to read it from an SGPR.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s8, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %x = load float, ptr addrspace(1) %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; (select (cmp (vgprX, 0.0)), 1.0, vgprZ)
; Both %x and %z are volatile per-work-item loads. The fcmp ult %x, 0.0 is
; expected as the inverted, operand-swapped v_cmp_le_f32 0, x, which leaves the
; constant 1.0 in src0 of a VOP2 v_cndmask_b32 on every target.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}

; i32 compare against 0 feeding a select between the constant 2 and a loaded
; value: out[tid] = (x[tid] < 0) ? 2 : z[tid]. Both inputs are volatile loads,
; so they stay in program order. On every run line the checks expect the
; inverted compare (v_cmp_lt_i32 -1, x) and a single v_cndmask_b32 whose first
; source operand is the inline constant 2.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile i32, ptr addrspace(1) %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; 64-bit variant of the integer compare/select test:
; out[tid] = (x[tid] < 0) ? 2 : z[tid] with i64 operands. The checks expect one
; inverted 64-bit compare (v_cmp_lt_i64 -1, x) followed by one v_cndmask_b32 per
; 32-bit register of the result, using the inline constants 0 and 2 as the
; first source operand.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile i64, ptr addrspace(1) %x.gep
  %z = load volatile i64, ptr addrspace(1) %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Scalar float compare (fcmp ugt x, 4.0) selecting a <4 x float>: the loaded
; vector %z when the compare is true, otherwise the constant vector
; <1.0, 2.0, -0.5, 4.0>. The checks expect a single v_cmp_nge_f32 4.0, x and
; four v_cndmask_b32, each with its inline constant as the first source and the
; matching element of %z as the second.
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Same scalar compare as the vgprZ_k1 variant (fcmp ugt x, 4.0), but with the
; select operands swapped: the constant vector <1.0, 2.0, -0.5, 4.0> is the
; true value and the loaded vector %z the false value. The checks expect the
; compare to be emitted as v_cmp_ge_f32 4.0, x, so the four v_cndmask_b32
; still take the inline constant as the first source and %z as the second.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; This must be swapped as a vector type before the condition has
; multiple uses.
;
; The compare has its constant on the left-hand side (fcmp ugt 4.0, x) and the
; select takes the constant vector <1.0, 2.0, -0.5, 4.0> when it is true,
; otherwise the loaded vector %z. The checks expect one v_cmp_le_f32 4.0, x
; whose result is shared by four v_cndmask_b32, each with its inline constant
; as the first source operand.
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; i1-typed select: out[tid] = (x[tid] < 0) ? true : z[tid], where %z is a
; volatile i1 load. The checks expect no value select between the operands.
; Instead the sign test (v_cmp_gt_i32 0, x) is OR-ed with a test of bit 0 of the
; loaded byte (v_and_b32 1 + v_cmp_eq_u32 1) using s_or_b64, and one
; v_cndmask_b32_e64 0, 1 materializes the result for the byte store.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_and_b32_e32 v3, 1, v3
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ubyte v3, v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v3, 1, v3
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile i1, ptr addrspace(1) %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Different types compared vs. selected
;
; An f32 compare (fcmp ult x, 0.0) drives a select between double 1.0 and a
; loaded double %z. The checks expect v_cmp_le_f32 0, x and two v_cndmask_b32:
; one using 0x3ff00000 (moved into a VGPR first on the two older targets, a
; literal operand on GFX10 and GFX11) and one using the inline constant 0.
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile double, ptr addrspace(1) %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Different types compared vs. selected
;
; Per work-item: volatile-load float %x and i64 %z, then store
; `(%x one 0.0) ? 3 : %z` as an i64 to %out[tid].
; The compare is f32 while the select is 64 bits wide, so the checks expect a
; single v_cmp feeding two 32-bit v_cndmask halves. Every run expects the
; inverted predicate (v_cmp_nlg for `one`) with the constant halves 0 (high)
; and 3 (low) placed in src0 of the e32-encoded v_cndmask.
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    v_mov_b32_e32 v4, v2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v5
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT:    flat_load_dword v6, v[1:2] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v5
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v6
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v2, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v4
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Index all three buffers by the work-item id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: the checks expect each one emitted with glc and followed
  ; by its own s_waitcnt vmcnt(0).
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile i64, ptr addrspace(1) %z.gep
  ; f32 compare driving an i64 select with a constant true operand.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Different types compared vs. selected
;
; Per work-item: volatile-load i32 %x and float %z, then store
; `(%x ugt 1) ? 4.0 : %z` to %out[tid].
; The compare is an integer compare while the selected value is f32. Every run
; expects the inverted compare (v_cmp_gt_u32 vcc, 2, x, i.e. x < 2) so that the
; constant 4.0 sits in src0 of an e32-encoded v_cndmask.
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v3, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v2, vcc
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Index all three buffers by the work-item id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: the checks expect each one emitted with glc and followed
  ; by its own s_waitcnt vmcnt(0).
  %x = load volatile i32, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  ; Integer compare driving an f32 select with a constant true operand.
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; FIXME: Should be able to handle multiple uses
;
; Per work-item: volatile-load floats %x and %z; one `fcmp ugt 4.0, %x` result
; feeds two selects (-1.0 vs %z and -2.0 vs %z), and both results are stored
; volatile to the same %out[tid] slot.
; The checks record the current output for a compare with two users: the
; predicate is kept as written (v_cmp_nle for `ugt`) and each select uses the
; e64 encoding of v_cndmask with the constant in src1, rather than an inverted
; compare with the constant in src0 of an e32 v_cndmask.
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v2
; SI-NEXT:    v_cndmask_b32_e64 v2, v3, -1.0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v3, v3, -2.0, vcc
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v5
; VI-NEXT:    v_cndmask_b32_e64 v3, v2, -1.0, vcc
; VI-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; VI-NEXT:    flat_store_dword v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_store_dword v0, v2, s[4:5]
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Index all three buffers by the work-item id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile float, ptr addrspace(1) %z.gep
  ; One compare, two users: this is the multiple-use case the FIXME refers to.
  %setcc = fcmp ugt float 4.0, %x
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  ; Volatile stores to the same address: the checks expect both to be emitted,
  ; each followed by a wait.
  store volatile float %select0, ptr addrspace(1) %out.gep
  store volatile float %select1, ptr addrspace(1) %out.gep
  ret void
}
|
|
|
|
; Source modifiers abs/neg only work for f32
;
; Loads half %fptr[tid] and stores `(%c != 0) ? fabs(%f) : fneg(%f)` to %out.
; f16 case: the checks expect no abs/neg modifiers on the v_cndmask itself.
;  - The SI run expects the value widened with v_cvt_f32_f16_e64, with |v0| and
;    -v0 applied on the two conversions, then a plain v_cndmask and a
;    v_cvt_f16_f32 back.
;  - The VI, GFX10 and GFX11 runs expect explicit bit operations
;    (v_and 0x7fff for abs, v_xor 0x8000 for neg) feeding a plain v_cndmask.
define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX10-NEXT:    global_store_short v2, v0, s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the input is indexed by the work-item id; %out is written directly.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
  %f = load half, ptr addrspace(1) %f.gep
  ; Both select operands are abs/neg of the same loaded value.
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads float %fptr[tid] and stores `(%c != 0) ? fabs(%f) : fneg(%f)` to %out.
; f32 case: every run expects fabs and fneg to be folded into one
; v_cndmask_b32_e64 as source modifiers (-v0, |v0|), with the condition taken
; from an SGPR pair produced by s_cmp_lg_u32 + s_cselect_b64.
define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s8, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the input is indexed by the work-item id; %out is written directly.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
  %f = load float, ptr addrspace(1) %f.gep
  ; Both select operands are abs/neg of the same loaded value.
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads double %fptr[tid] and stores `(%c != 0) ? fabs(%f) : fneg(%f)` to %out.
; f64 case: the checks expect no abs/neg modifiers on the v_cndmask. Every run
; expects the sign handling done on the high dword only (v_and 0x7fffffff for
; abs, v_xor 0x80000000 for neg on v1) followed by two 32-bit v_cndmask.
; Note: the low-dword v_cndmask selects between identical sources (v0, v0);
; the checks record that as the current output.
define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Only the input is indexed by the work-item id; %out is written directly.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
  %f = load double, ptr addrspace(1) %f.gep
  ; Both select operands are abs/neg of the same loaded value.
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  ; The condition comes from the scalar kernel argument %c.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Attribute groups referenced by the tests in this file:
;   #0 - attached to the amdgpu_kernel definitions.
;   #1 - attached to the llvm.amdgcn.workitem.id.x declaration and its calls.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }