; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s

; Test using saddr addressing mode of global_* flat atomic instructions.

; --------------------------------------------------------------------------------
; atomicrmw max
; --------------------------------------------------------------------------------

; In every function of this section the address is the inreg pointer %sbase plus
; the zero-extended 32-bit %voffset; the *_neg128 variants then add a constant
; -128 byte offset. The expected code uses the saddr operand form only on the
; initial global_load; the atomicrmw itself is checked as a
; global_atomic_cmpswap retry loop addressed through a VGPR pair ("off").

; i32 max, old value returned (bitcast to float).
define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 max at constant offset -128, old value returned (bitcast to float).
define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 max, result of the atomicrmw unused.
define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; i32 max at constant offset -128, result of the atomicrmw unused.
define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; i64 max, old value returned (bitcast to <2 x float>).
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; i64 max at constant offset -128, old value returned (bitcast to <2 x float>).
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; i64 max, result of the atomicrmw unused.
define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; i64 max at constant offset -128, result of the atomicrmw unused.
define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw min
; --------------------------------------------------------------------------------

; i32 min, old value returned (bitcast to float).
define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 min at constant offset -128, old value returned (bitcast to float).
define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
;
GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i32_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; 
GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst ret void } define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst ret void } define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_rtn: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i64_rtn: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i64_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 
v[3:4], v0, s[2:3] ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB13_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
global_load_b64 v[3:4], v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB14_1: ; 
%atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i64_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst ret void } define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw umax ; -------------------------------------------------------------------------------- ; Unsigned-max counterpart of the min tests above: the checks show the same global load plus global_atomic_cmpswap retry loop at %atomicrmw.start, with v_max_u32 computing the new i32 value. define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_rtn: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, 
v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = 
atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: 
s_cbranch_execnz .LBB17_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB17_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 `atomicrmw umax` (seq_cst) whose result is unused; the function returns
; void. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output for every target is an initial global load followed by
; a v_max_u32 + global_atomic_cmpswap retry loop that ends in s_endpgm.
define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; No-return i32 umax variant whose expected load and cmpswap both carry an
; immediate `offset:-128`.
define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB19_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0,
v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB19_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; i64 `atomicrmw umax` (seq_cst) whose old value is bitcast to <2 x float> and
; returned. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output forms the maximum with a 64-bit unsigned compare and two
; v_cndmask selects, feeds it to a 64-bit cmpswap retry loop, and copies the
; result pair into v0/v1 after the loop.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB20_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB20_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Returning i64 umax variant whose expected initial load carries an immediate
; `offset:-128`.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:
s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; 
GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; i64 `atomicrmw umax` (seq_cst) whose result is unused; the function returns
; void. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output is a 64-bit load followed by a compare/select +
; 64-bit cmpswap retry loop that ends in s_endpgm.
define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB22_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB22_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; No-return i64 umax variant whose expected load and cmpswap both carry an
; immediate `offset:-128`.
define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc,
v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off 
offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB23_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw umin
; --------------------------------------------------------------------------------

; i32 `atomicrmw umin` (seq_cst) whose old value is bitcast to float and
; returned. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output is an initial global load followed by a v_min_u32 +
; global_atomic_cmpswap retry loop, with exec restored after the loop.
define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB24_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Returning i32 umin variant whose expected load and cmpswap both carry an
; immediate `offset:-128`.
define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB25_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32
vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 `atomicrmw umin` (seq_cst) whose result is unused; the function returns
; void. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output is an initial global load followed by a v_min_u32 +
; global_atomic_cmpswap retry loop that ends in s_endpgm.
define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB26_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; No-return i32 umin variant whose expected load and cmpswap both carry an
; immediate `offset:-128`.
define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB27_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; i64 `atomicrmw umin` (seq_cst) whose old value is bitcast to <2 x float> and
; returned. The address is the inreg %sbase plus the zero-extended %voffset.
; The expected output forms the minimum with a 64-bit unsigned compare
; (v_cmp_le_u64) and two v_cndmask selects, feeds it to a 64-bit cmpswap retry
; loop, and copies the result pair into v0/v1 after the loop.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v10, v4
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB28_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_waitcnt_depctr 0xfffe
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v10, v4
; GFX11-NEXT: v_mov_b32_e32 v9, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:
v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB29_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_nortn: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB30_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i64_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i64_nortn: ; GFX11: ; %bb.0: 
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst ret void } define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], 
v[3:6], off offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB31_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, 
v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst ret void } attributes #0 = { argmemonly nounwind willreturn }