; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s

; Each kernel below copies N consecutive half values from %0 to %1 using one
; scalar-typed load and one scalar-typed store per element. The autogenerated
; assertions expect a single wide load and a single wide store per kernel on
; every tested subtarget.

; half8: eight half elements, every access marked align 2.
; Expected code: one global_load_dwordx4 and one global_store_dwordx4.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half8:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v4, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half8:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT:    s_endpgm
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
  %gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
  %gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
  %l0 = load half, ptr addrspace(1) %gep0, align 2
  %l1 = load half, ptr addrspace(1) %gep1, align 2
  %l2 = load half, ptr addrspace(1) %gep2, align 2
  %l3 = load half, ptr addrspace(1) %gep3, align 2
  %l4 = load half, ptr addrspace(1) %gep4, align 2
  %l5 = load half, ptr addrspace(1) %gep5, align 2
  %l6 = load half, ptr addrspace(1) %gep6, align 2
  %l7 = load half, ptr addrspace(1) %gep7, align 2
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
  %sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
  %sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
  store half %l0, ptr addrspace(1) %sgep0, align 2
  store half %l1, ptr addrspace(1) %sgep1, align 2
  store half %l2, ptr addrspace(1) %sgep2, align 2
  store half %l3, ptr addrspace(1) %sgep3, align 2
  store half %l4, ptr addrspace(1) %sgep4, align 2
  store half %l5, ptr addrspace(1) %sgep5, align 2
  store half %l6, ptr addrspace(1) %sgep6, align 2
  store half %l7, ptr addrspace(1) %sgep7, align 2
  ret void
}

; half6: six half elements, every access marked align 1.
; Expected code: one global_load_dwordx3 and one global_store_dwordx3.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half6:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v3, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half6:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX1030-NEXT:    s_endpgm
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
  %l0 = load half, ptr addrspace(1) %gep0, align 1
  %l1 = load half, ptr addrspace(1) %gep1, align 1
  %l2 = load half, ptr addrspace(1) %gep2, align 1
  %l3 = load half, ptr addrspace(1) %gep3, align 1
  %l4 = load half, ptr addrspace(1) %gep4, align 1
  %l5 = load half, ptr addrspace(1) %gep5, align 1
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
  store half %l0, ptr addrspace(1) %sgep0, align 1
  store half %l1, ptr addrspace(1) %sgep1, align 1
  store half %l2, ptr addrspace(1) %sgep2, align 1
  store half %l3, ptr addrspace(1) %sgep3, align 1
  store half %l4, ptr addrspace(1) %sgep4, align 1
  store half %l5, ptr addrspace(1) %sgep5, align 1
  ret void
}

; half4: four half elements, every access marked align 4.
; Expected code: the source is read with a scalar s_load_dwordx2, the result
; is moved into v[0:1] (a single v_pk_mov_b32 on gfx90a, two v_mov_b32 on the
; other subtargets) and written with one global_store_dwordx2.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half4:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s0
; GFX908-NEXT:    v_mov_b32_e32 v1, s1
; GFX908-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half4:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT:    s_endpgm
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  %l0 = load half, ptr addrspace(1) %gep0, align 4
  %l1 = load half, ptr addrspace(1) %gep1, align 4
  %l2 = load half, ptr addrspace(1) %gep2, align 4
  %l3 = load half, ptr addrspace(1) %gep3, align 4
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  store half %l0, ptr addrspace(1) %sgep0, align 4
  store half %l1, ptr addrspace(1) %sgep1, align 4
  store half %l2, ptr addrspace(1) %sgep2, align 4
  store half %l3, ptr addrspace(1) %sgep3, align 4
  ret void
}

; half2: two half elements, no explicit alignment on any access.
; Expected code: one global_load_dword and one global_store_dword.
; Function Attrs: mustprogress nounwind willreturn
define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half2:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half2:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1030-NEXT:    s_endpgm
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %l0 = load half, ptr addrspace(1) %gep0
  %l1 = load half, ptr addrspace(1) %gep1
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  store half %l0, ptr addrspace(1) %sgep0
  store half %l1, ptr addrspace(1) %sgep1
  ret void
}