; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s

; vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}

; sgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}

; sgpr + vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %gep0, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %gep0, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}