; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s

; Codegen test for `atomicrmw fadd` of a float through a generic `ptr`,
; compiled for gfx908, gfx90a, gfx940, gfx1100 and gfx1200. Each function
; varies one of - the syncscope, whether the result is used, and whether the
; "amdgpu-unsafe-fp-atomics" attribute (attribute group #0) is present.
;
; The per-target assertion lines inside each function are tool-generated (see
; the first line of this file); regenerate them with the script instead of
; editing them by hand.

; Default (no explicit) syncscope, seq_cst, result returned, with #0.
; Per the checks below, gfx908, gfx90a, gfx1100 and gfx1200 are expected to
; emit a flat_atomic_cmpswap retry loop, while gfx940 is expected to emit a
; single flat_atomic_add_f32.
define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_system:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_system:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_system:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_wbl2 sc0 sc1
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc0 sc1
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_system:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v4, v3
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    buffer_gl1_inv
; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    s_cbranch_execnz .LBB0_1
; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_system:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
; GFX1200-NEXT:    s_mov_b32 s0, 0
; GFX1200-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SYS
; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB0_1
; GFX1200-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX1200-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val seq_cst
  ret float %res
}

; syncscope("workgroup"), seq_cst, result returned, with #0.
; Per the checks below, gfx908 is expected to emit a flat_atomic_cmpswap
; retry loop; gfx90a is expected to branch on src_shared_base and
; src_private_base into separate shared (ds_add_rtn_f32), private (buffer
; load/add/store) and global (global_atomic_add_f32) paths; gfx940, gfx1100
; and gfx1200 are expected to emit a single returning flat_atomic_add_f32.
define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_workgroup_rtn:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_rtn:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_6
; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_3
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB1_3: ; %Flow
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_5
; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:  .LBB1_5: ; %Flow1
; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB1_6: ; %Flow2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_8
; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:  .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_workgroup_rtn:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_workgroup_rtn:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_workgroup_rtn:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret float %res
}

; syncscope("workgroup"), seq_cst, with #0; %res is deliberately left unused
; and the function returns void, so this covers the no-return form.
; Per the checks below, gfx908 and gfx90a are both expected to branch on
; src_shared_base and src_private_base into shared (ds_add_f32), private
; (buffer load/add/store) and global (global_atomic_add_f32) paths; gfx940,
; gfx1100 and gfx1200 are expected to emit a single non-returning
; flat_atomic_add_f32.
define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_workgroup_nortn:
; GFX908:       ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
; GFX908-NEXT:  ; %bb.1: ; %Flow2
; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB2_8
; GFX908-NEXT:  .LBB2_2: ; %atomicrmw.phi
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.check.private
; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX908-NEXT:    s_cbranch_execz .LBB2_5
; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT:    ; implicit-def: $vgpr2
; GFX908-NEXT:  .LBB2_5: ; %Flow
; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX908-NEXT:    s_cbranch_execz .LBB2_7
; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT:  .LBB2_7: ; %Flow1
; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT:    ; implicit-def: $vgpr2
; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_cbranch_execz .LBB2_2
; GFX908-NEXT:  .LBB2_8: ; %atomicrmw.shared
; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT:    ds_add_f32 v0, v2
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_nortn:
; GFX90A:       ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
; GFX90A-NEXT:  ; %bb.1: ; %Flow2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB2_8
; GFX90A-NEXT:  .LBB2_2: ; %atomicrmw.phi
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.check.private
; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_5
; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB2_5: ; %Flow
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_7
; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:  .LBB2_7: ; %Flow1
; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
; GFX90A-NEXT:  .LBB2_8: ; %atomicrmw.shared
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    ds_add_f32 v0, v2
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_workgroup_nortn:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_workgroup_nortn:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_add_f32 v[0:1], v2
; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_workgroup_nortn:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2
; GFX1200-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret void
}

; Same operation as @syncscope_workgroup_rtn (workgroup scope, seq_cst,
; result returned) but WITHOUT attribute group #0.
; Per the checks below, gfx908, gfx90a, gfx1100 and gfx1200 are expected to
; emit a flat_atomic_cmpswap retry loop; gfx940 is still expected to emit a
; single flat_atomic_add_f32.
define float @no_unsafe(ptr %addr, float %val) {
; GFX908-LABEL: no_unsafe:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_unsafe:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: no_unsafe:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: no_unsafe:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v4, v3
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    s_cbranch_execnz .LBB3_1
; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: no_unsafe:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
; GFX1200-NEXT:    s_mov_b32 s0, 0
; GFX1200-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB3_1
; GFX1200-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX1200-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret float %res
}

; Attribute group applied to every function above except @no_unsafe.
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }