2373 lines
100 KiB
LLVM
2373 lines
100 KiB
LLVM
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
||
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
|
||
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
|
||
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
|
||
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
|
||
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
|
||
|
|
||
|
; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
|
||
|
|
||
|
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
|
||
|
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
|
||
|
; for correctness.
|
||
|
|
||
|
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||
|
declare double @llvm.fabs.f64(double) #0
|
||
|
declare double @llvm.fma.f64(double, double, double) #0
|
||
|
declare float @llvm.fma.f32(float, float, float) #0
|
||
|
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
|
||
|
|
||
|
; (fadd (fmul x, y), z) -> (fma x, y, z)
;
; Reads a, b, c as volatile doubles from in[tid], in[tid + 1], in[tid + 2] and
; stores a * b + c to out[tid]. The SI and GFX11 checks both expect one
; v_fma_f64 and no separate multiply.
define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: three consecutive doubles starting at in[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  ; Volatile loads: the checks above show three separate loads, each followed
  ; by its own wait.
  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  ; Pattern under test: the product is the left operand of the fadd.
  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; (fadd (fmul x, y), z) -> (fma x, y, z)
;
; Two-use variant: the same product a * b feeds two fadds (with c and with d).
; Inputs are volatile loads from in[tid .. tid + 3]; results go to out[tid] and
; out[tid + 1] through volatile stores. The checks expect two v_fma_f64
; instructions that repeat the multiply operands, with no standalone multiply.
define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_0_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_0_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: four consecutive inputs and two consecutive outputs.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  ; %mul has two users; each fadd is expected to become its own fma.
  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}
|
||
|
|
||
|
; (fadd x, (fmul y, z)) -> (fma y, z, x)
;
; Commuted form of the basic fadd pattern: the product is the right operand of
; the fadd. Reads a, b, c as volatile doubles from in[tid .. tid + 2] and
; stores c + a * b to out[tid]. The checks expect the same single v_fma_f64
; as the non-commuted form.
define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: three consecutive doubles starting at in[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  ; Pattern under test: the product is the right operand of the fadd.
  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
;
; Reads a, b, c as volatile doubles from in[tid .. tid + 2] and stores
; a * b - c to out[tid]. The checks expect a single v_fma_f64 whose addend
; operand carries the negate source modifier.
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_0_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_0_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: three consecutive doubles starting at in[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  ; Pattern under test: the product is the minuend of the fsub.
  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
;
; Two-use variant: the product a * b is the minuend of two fsubs (minus c and
; minus d). Inputs are volatile loads from in[tid .. tid + 3]; results go to
; out[tid] and out[tid + 1] through volatile stores. The checks expect two
; v_fma_f64 instructions, each with a negated addend, and no standalone
; multiply.
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_f64_0_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: four consecutive inputs and two consecutive outputs.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  ; %mul has two users; each fsub is expected to become its own fma.
  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}
|
||
|
|
||
|
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
;
; Reads a, b, c as volatile doubles from in[tid .. tid + 2] and stores
; c - a * b to out[tid]. The checks expect a single v_fma_f64 whose first
; multiplicand carries the negate source modifier.
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_1_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_1_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: three consecutive doubles starting at in[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  ; Pattern under test: the product is the subtrahend of the fsub.
  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
;
; Two-use variant: the product a * b is the subtrahend of two fsubs (from c and
; from d). Inputs are volatile loads from in[tid .. tid + 3]; results go to
; out[tid] and out[tid + 1] through volatile stores. The checks expect two
; v_fma_f64 instructions, each negating the first multiplicand, and no
; standalone multiply.
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_1_f64_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: four consecutive inputs and two consecutive outputs.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  ; %mul has two users; each fsub is expected to become its own fma.
  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}
|
||
|
|
||
|
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
;
; Reads a, b, c as volatile doubles from in[tid .. tid + 2] and stores
; -(a * b) - c to out[tid]. The negation is written as fsub from -0.0. The
; checks expect a single v_fma_f64 with the negate modifier on both the first
; multiplicand and the addend.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: three consecutive doubles starting at in[tid].
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  ; Pattern under test: the negated product is the minuend of the fsub.
  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
;
; Variant where the negated product has two users: -(a * b) is the minuend of
; two fsubs (minus c and minus d). Inputs are volatile loads from
; in[tid .. tid + 3]; results go to out[tid] and out[tid + 1] through volatile
; stores. The checks expect two v_fma_f64 instructions, each negating the
; first multiplicand and the addend, with no standalone multiply.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: four consecutive inputs and two consecutive outputs.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  ; %mul has one user (the negation); %mul.neg has two users.
  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}
|
||
|
|
||
|
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
;
; Variant where the product itself has two users: the negation (which feeds
; the fsub with c) and a direct fsub with d. Inputs are volatile loads from
; in[tid .. tid + 3]; results go to out[tid] and out[tid + 1] through volatile
; stores. The checks expect two v_fma_f64 instructions and no standalone
; multiply: the first negates the first multiplicand and the addend, the
; second negates only the addend.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  ; Per-lane addresses: four consecutive inputs and two consecutive outputs.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  ; %mul has two users: the negation below and the second fsub.
  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}
|
||
|
|
||
|
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
|
||
|
; Each work item reads five doubles (x, y, z, u, v) with volatile loads from
; the consecutive slots %in[tid] .. %in[tid + 4], evaluates
; fma(x, y, u * v) - z, and writes the result to %out[tid].
;
; What the autogenerated assertions below expect:
;   * -NOFMA configurations keep the three separate operations
;     (v_mul_f64, v_fma_f64, then v_add_f64 with a negated z).
;   * -FMA configurations fold the trailing subtraction into the multiply,
;     producing fma(u, v, -z) that feeds fma(x, y, ...).
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s6, 0
; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
; SI-FMA-NEXT: s_mov_b32 s6, 0
; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-FMA-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
; SI-FMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %x = load volatile double, ptr addrspace(1) %gep.0
  %y = load volatile double, ptr addrspace(1) %gep.1
  %z = load volatile double, ptr addrspace(1) %gep.2
  %u = load volatile double, ptr addrspace(1) %gep.3
  %v = load volatile double, ptr addrspace(1) %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))
|
||
|
; Each work item reads five doubles (x, y, z, u, v) with volatile loads from
; the consecutive slots %in[tid] .. %in[tid + 4], evaluates
; x - fma(y, z, u * v) with the nsz flag on the fmul, the fma call and the
; fsub, and writes the result to %out[tid].
;
; What the autogenerated assertions below expect:
;   * -NOFMA configurations keep v_mul_f64, v_fma_f64 and a final v_add_f64
;     of x with the negated fma result.
;   * -FMA configurations emit two chained fmas with negated multiplicands:
;     fma(-u, v, x) followed by fma(-y, z, ...).
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s6, 0
; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
; SI-FMA-NEXT: s_mov_b32 s6, 0
; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3]
; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3]
; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %x = load volatile double, ptr addrspace(1) %gep.0
  %y = load volatile double, ptr addrspace(1) %gep.1
  %z = load volatile double, ptr addrspace(1) %gep.2
  %u = load volatile double, ptr addrspace(1) %gep.3
  %v = load volatile double, ptr addrspace(1) %gep.4

  ; nsz flag is needed since this combine may change sign of zero
  %tmp0 = fmul nsz double %u, %v
  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub nsz double %x, %tmp1

  store double %tmp2, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
|
||
|
|
||
|
; Computes (x + 1.0) * y, with the sum as the first fmul operand. x and y are
; floats read with volatile loads from %in1 and %in2; the product is stored
; to %out.
;
; What the autogenerated assertions below expect:
;   * -NOFMA configurations keep v_add_f32 followed by v_mul_f32.
;   * -FMA configurations emit a single fused op computing x * y + y
;     (v_fma_f32 on SI, v_fmac_f32 on GFX11).
; The assertions sit between the first parameter and the remaining ones; that
; placement is valid because they are ordinary IR comments.
define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_add_x_one_y:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
; SI-FMA-NEXT: s_mov_b32 s2, -1
; SI-FMA-NEXT: s_mov_b32 s14, s2
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s12, s6
; SI-FMA-NEXT: s_mov_b32 s13, s7
; SI-FMA-NEXT: s_mov_b32 s15, s3
; SI-FMA-NEXT: s_mov_b32 s10, s2
; SI-FMA-NEXT: s_mov_b32 s11, s3
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s0, s4
; SI-FMA-NEXT: s_mov_b32 s1, s5
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
        ptr addrspace(1) %in1,
        ptr addrspace(1) %in2) {
  %x = load volatile float, ptr addrspace(1) %in1
  %y = load volatile float, ptr addrspace(1) %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Computes y * (x + 1.0), with the sum as the second fmul operand. x and y
; are floats read with volatile loads from %in1 and %in2; the product is
; stored to %out.
;
; What the autogenerated assertions below expect:
;   * -NOFMA configurations keep v_add_f32 followed by v_mul_f32.
;   * -FMA configurations emit a single fused op computing x * y + y
;     (v_fma_f32 on SI, v_fmac_f32 on GFX11).
; The assertions sit between the first parameter and the remaining ones; that
; placement is valid because they are ordinary IR comments.
define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_y_add_x_one:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
; SI-FMA-NEXT: s_mov_b32 s2, -1
; SI-FMA-NEXT: s_mov_b32 s14, s2
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s12, s6
; SI-FMA-NEXT: s_mov_b32 s13, s7
; SI-FMA-NEXT: s_mov_b32 s15, s3
; SI-FMA-NEXT: s_mov_b32 s10, s2
; SI-FMA-NEXT: s_mov_b32 s11, s3
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s0, s4
; SI-FMA-NEXT: s_mov_b32 s1, s5
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
        ptr addrspace(1) %in1,
        ptr addrspace(1) %in2) {
  %x = load volatile float, ptr addrspace(1) %in1
  %y = load volatile float, ptr addrspace(1) %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Computes (x + -1.0) * y, with the sum as the first fmul operand. x and y
; are floats read with plain (non-volatile) loads from %in1 and %in2; the
; product is stored to %out.
;
; What the autogenerated assertions below expect:
;   * Both loads are issued back to back before any wait.
;   * -NOFMA configurations keep v_add_f32 (with -1.0) followed by v_mul_f32.
;   * -FMA configurations emit a single v_fma_f32 computing x * y - y.
; The assertions sit between the first parameter and the remaining ones; that
; placement is valid because they are ordinary IR comments.
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_add_x_negone_y:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
; SI-FMA-NEXT: s_mov_b32 s2, -1
; SI-FMA-NEXT: s_mov_b32 s14, s2
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s12, s6
; SI-FMA-NEXT: s_mov_b32 s13, s7
; SI-FMA-NEXT: s_mov_b32 s15, s3
; SI-FMA-NEXT: s_mov_b32 s10, s2
; SI-FMA-NEXT: s_mov_b32 s11, s3
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT: s_mov_b32 s0, s4
; SI-FMA-NEXT: s_mov_b32 s1, s5
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
        ptr addrspace(1) %in1,
        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Computes y * (x + -1.0), with the sum as the second fmul operand. x and y
; are floats read with plain (non-volatile) loads from %in1 and %in2; the
; product is stored to %out.
;
; What the autogenerated assertions below expect:
;   * Both loads are issued back to back before any wait.
;   * -NOFMA configurations keep v_add_f32 (with -1.0) followed by v_mul_f32.
;   * -FMA configurations emit a single v_fma_f32 computing x * y - y.
; The assertions sit between the first parameter and the remaining ones; that
; placement is valid because they are ordinary IR comments.
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_y_add_x_negone:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
; SI-FMA-NEXT: s_mov_b32 s2, -1
; SI-FMA-NEXT: s_mov_b32 s14, s2
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s12, s6
; SI-FMA-NEXT: s_mov_b32 s13, s7
; SI-FMA-NEXT: s_mov_b32 s15, s3
; SI-FMA-NEXT: s_mov_b32 s10, s2
; SI-FMA-NEXT: s_mov_b32 s11, s3
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT: s_mov_b32 s0, s4
; SI-FMA-NEXT: s_mov_b32 s1, s5
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
        ptr addrspace(1) %in1,
        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores (1.0 - x) * y to %out.
; The NOFMA checks expect a separate v_sub_f32 and v_mul_f32; the FMA checks
; expect a single v_fma_f32 with operands (-x, y, y).
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_sub_one_x_y:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float 1.0, %x
|
||
|
%m = fmul float %s, %y
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores y * (1.0 - x) to %out
; (the subtraction is the right-hand fmul operand here).
; The NOFMA checks expect a separate v_sub_f32 and v_mul_f32; the FMA checks
; expect a single v_fma_f32 with operands (-x, y, y).
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_y_sub_one_x:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float 1.0, %x
|
||
|
%m = fmul float %y, %s
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores (-1.0 - x) * y to %out.
; The NOFMA checks expect a separate v_sub_f32 and v_mul_f32; the FMA checks
; expect a single v_fma_f32 with operands (-x, y, -y).
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float -1.0, %x
|
||
|
%m = fmul float %s, %y
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores y * (-1.0 - x) to %out
; (the subtraction is the right-hand fmul operand here).
; The NOFMA checks expect a separate v_sub_f32 and v_mul_f32; the FMA checks
; expect a single v_fma_f32 with operands (-x, y, -y).
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float -1.0, %x
|
||
|
%m = fmul float %y, %s
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores (x - 1.0) * y to %out.
; The NOFMA checks expect v_add_f32 with a -1.0 operand followed by v_mul_f32;
; the FMA checks expect a single v_fma_f32 with operands (x, y, -y).
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_sub_x_one_y:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float %x, 1.0
|
||
|
%m = fmul float %s, %y
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores y * (x - 1.0) to %out
; (the subtraction is the right-hand fmul operand here).
; The NOFMA checks expect v_add_f32 with a -1.0 operand followed by v_mul_f32;
; the FMA checks expect a single v_fma_f32 with operands (x, y, -y).
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_y_sub_x_one:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float %x, 1.0
|
||
|
%m = fmul float %y, %s
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores (x - (-1.0)) * y to %out.
; The NOFMA checks expect v_add_f32 with a 1.0 operand followed by v_mul_f32.
; With FMA enabled, the tahiti checks expect v_fma_f32 with operands (x, y, y),
; while the gfx1100 checks expect v_fmac_f32 accumulating x * y into the
; register that holds y.
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float %x, -1.0
|
||
|
%m = fmul float %s, %y
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Loads x from %in1 and y from %in2, then stores y * (x - (-1.0)) to %out
; (the subtraction is the right-hand fmul operand here).
; The NOFMA checks expect v_add_f32 with a 1.0 operand followed by v_mul_f32.
; With FMA enabled, the tahiti checks expect v_fma_f32 with operands (x, y, y),
; while the gfx1100 checks expect v_fmac_f32 accumulating x * y into the
; register that holds y.
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
|
||
|
; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
|
||
|
; SI-NOFMA: ; %bb.0:
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-NOFMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
|
||
|
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
|
||
|
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone:
|
||
|
; SI-FMA: ; %bb.0:
|
||
|
; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||
|
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
|
||
|
; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
|
||
|
; SI-FMA-NEXT: s_mov_b32 s2, -1
|
||
|
; SI-FMA-NEXT: s_mov_b32 s14, s2
|
||
|
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; SI-FMA-NEXT: s_mov_b32 s12, s6
|
||
|
; SI-FMA-NEXT: s_mov_b32 s13, s7
|
||
|
; SI-FMA-NEXT: s_mov_b32 s15, s3
|
||
|
; SI-FMA-NEXT: s_mov_b32 s10, s2
|
||
|
; SI-FMA-NEXT: s_mov_b32 s11, s3
|
||
|
; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
||
|
; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
|
||
|
; SI-FMA-NEXT: s_mov_b32 s0, s4
|
||
|
; SI-FMA-NEXT: s_mov_b32 s1, s5
|
||
|
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
|
||
|
; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||
|
; SI-FMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
|
||
|
; GFX11-NOFMA: ; %bb.0:
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_clause 0x1
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
|
||
|
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||
|
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||
|
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
|
||
|
; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
|
||
|
; GFX11-NOFMA-NEXT: s_nop 0
|
||
|
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-NOFMA-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
|
||
|
; GFX11-FMA: ; %bb.0:
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
|
||
|
; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
|
||
|
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
|
||
|
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX11-FMA-NEXT: s_clause 0x1
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
|
||
|
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
|
||
|
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
|
||
|
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
|
||
|
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
|
||
|
; GFX11-FMA-NEXT: s_nop 0
|
||
|
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
; GFX11-FMA-NEXT: s_endpgm
|
||
|
ptr addrspace(1) %in1,
|
||
|
ptr addrspace(1) %in2) {
|
||
|
%x = load float, ptr addrspace(1) %in1
|
||
|
%y = load float, ptr addrspace(1) %in2
|
||
|
%s = fsub float %x, -1.0
|
||
|
%m = fmul float %y, %s
|
||
|
store float %m, ptr addrspace(1) %out
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; Kernel under test: loads x, y and t from %in1, %in2 and %in3 and stores
; x * t + y * (1.0 - t) to %out.
; With the no-FMA run lines the assertions expect an explicit 1.0 - t
; subtract, a multiply by y, and a multiply-accumulate of x * t.
; With the FMA run lines they expect y * (1.0 - t) to be emitted as a fused
; -t * y + y, followed by a second fused multiply-add of x * t.
define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_interp:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s10, -1
; SI-NOFMA-NEXT: s_mov_b32 s14, s10
; SI-NOFMA-NEXT: s_mov_b32 s15, s11
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s16, s4
; SI-NOFMA-NEXT: s_mov_b32 s17, s5
; SI-NOFMA-NEXT: s_mov_b32 s4, s6
; SI-NOFMA-NEXT: s_mov_b32 s5, s7
; SI-NOFMA-NEXT: s_mov_b32 s6, s10
; SI-NOFMA-NEXT: s_mov_b32 s7, s11
; SI-NOFMA-NEXT: s_mov_b32 s12, s2
; SI-NOFMA-NEXT: s_mov_b32 s13, s3
; SI-NOFMA-NEXT: s_mov_b32 s18, s10
; SI-NOFMA-NEXT: s_mov_b32 s19, s11
; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NOFMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-NOFMA-NEXT: s_mov_b32 s8, s0
; SI-NOFMA-NEXT: s_mov_b32 s9, s1
; SI-NOFMA-NEXT: s_waitcnt vmcnt(2)
; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v3
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v2, v0
; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_interp:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-FMA-NEXT: s_mov_b32 s11, 0xf000
; SI-FMA-NEXT: s_mov_b32 s10, -1
; SI-FMA-NEXT: s_mov_b32 s18, s10
; SI-FMA-NEXT: s_mov_b32 s19, s11
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s16, s4
; SI-FMA-NEXT: s_mov_b32 s17, s5
; SI-FMA-NEXT: s_mov_b32 s14, s10
; SI-FMA-NEXT: s_mov_b32 s12, s2
; SI-FMA-NEXT: s_mov_b32 s13, s3
; SI-FMA-NEXT: s_mov_b32 s15, s11
; SI-FMA-NEXT: s_mov_b32 s4, s6
; SI-FMA-NEXT: s_mov_b32 s5, s7
; SI-FMA-NEXT: s_mov_b32 s6, s10
; SI-FMA-NEXT: s_mov_b32 s7, s11
; SI-FMA-NEXT: buffer_load_dword v0, off, s[16:19], 0
; SI-FMA-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-FMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-FMA-NEXT: s_mov_b32 s8, s0
; SI-FMA-NEXT: s_mov_b32 s9, s1
; SI-FMA-NEXT: s_waitcnt vmcnt(1)
; SI-FMA-NEXT: v_fma_f32 v0, -v1, v0, v0
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: v_fma_f32 v0, v2, v1, v0
; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_interp:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_interp:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
    ptr addrspace(1) %in1,
    ptr addrspace(1) %in2,
    ptr addrspace(1) %in3) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %t = load float, ptr addrspace(1) %in3
  ; %t1 = 1.0 - t is the subtract the FMA run lines are expected to fold away.
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Double-precision interpolation kernel: loads x, y and t from %in1, %in2 and
; %in3 and stores x * t + y * (1.0 - t) to %out.
; The FMA run lines are expected to emit y * (1.0 - t) as a fused -t * y + y
; and then fuse x * t on top of it. The GFX11 no-FMA run line is expected to
; keep the 1.0 - t add and the multiply by y, but still uses v_fma_f64 for the
; final x * t + (...) step.
; No assertions for the SI no-FMA run lines are present for this function
; (presumably the check generator could not emit a common block for them;
; TODO confirm by regenerating the checks).
define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; SI-FMA-LABEL: test_f64_interp:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-FMA-NEXT: s_mov_b32 s11, 0xf000
; SI-FMA-NEXT: s_mov_b32 s10, -1
; SI-FMA-NEXT: s_mov_b32 s18, s10
; SI-FMA-NEXT: s_mov_b32 s19, s11
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s16, s4
; SI-FMA-NEXT: s_mov_b32 s17, s5
; SI-FMA-NEXT: s_mov_b32 s4, s6
; SI-FMA-NEXT: s_mov_b32 s5, s7
; SI-FMA-NEXT: s_mov_b32 s6, s10
; SI-FMA-NEXT: s_mov_b32 s7, s11
; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-FMA-NEXT: s_mov_b32 s14, s10
; SI-FMA-NEXT: s_mov_b32 s12, s2
; SI-FMA-NEXT: s_mov_b32 s13, s3
; SI-FMA-NEXT: s_mov_b32 s15, s11
; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0
; SI-FMA-NEXT: s_mov_b32 s8, s0
; SI-FMA-NEXT: s_mov_b32 s9, s1
; SI-FMA-NEXT: s_waitcnt vmcnt(1)
; SI-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
; SI-FMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-FMA-NEXT: s_endpgm
;
; GFX11-NOFMA-LABEL: test_f64_interp:
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f64_interp:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
    ptr addrspace(1) %in1,
    ptr addrspace(1) %in2,
    ptr addrspace(1) %in3) {
  %x = load double, ptr addrspace(1) %in1
  %y = load double, ptr addrspace(1) %in2
  %t = load double, ptr addrspace(1) %in3
  ; %t1 = 1.0 - t is the subtract the FMA run lines are expected to fold away.
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, ptr addrspace(1) %out
  ret void
}
|
||
|
|
||
|
; Make sure negative constant cancels out fneg
;
; The kernel computes fma(-2.0, -r1, r2), where r1 and r2 are two consecutive
; floats read with volatile loads at index %tid and %tid + 1. The assertions
; expect a single fused op with a positive 2.0 operand and no negate modifier
; on r1. Both loads and the store address %out; %in is not used by the body.
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: fma_neg_2.0_neg_a_b_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v2, 2.0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %r1 = load volatile float, ptr addrspace(1) %gep.0
  %r2 = load volatile float, ptr addrspace(1) %gep.1

  %r1.fneg = fneg float %r1

  ; -2.0 * -r1: the two negations are expected to cancel in the selected code.
  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; Counterpart of the negative-constant case: computes fma(2.0, -r1, r2), where
; r1 and r2 are two consecutive floats read with volatile loads at index %tid
; and %tid + 1. The assertions expect the fneg to be absorbed into the
; constant, giving a single fused op with a -2.0 operand and no negate
; modifier on r1. Both loads and the store address %out; %in is not used by
; the body.
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: fma_2.0_neg_a_b_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v2, -2.0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; GFX11-LABEL: fma_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %r1 = load volatile float, ptr addrspace(1) %gep.0
  %r2 = load volatile float, ptr addrspace(1) %gep.1

  %r1.fneg = fneg float %r1

  ; 2.0 * -r1: the negation is expected to move onto the constant operand.
  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; Vector case: computes fma(tmp2, -tmp0, -tmp1) on <4 x float> values and
; stores the result at %out[%tid]. tmp0, tmp1 and tmp2 are loaded from %in at
; vector indices %tid, %tid + 1 and %tid + 3 (%gep.2 steps two vectors past
; %gep.1), which matches the byte offsets 0, 16 and 48 in the assertions.
; The assertions expect four scalar v_fma_f32 ops, each with negate modifiers
; on the second and third source operands. The fneg and fma carry the `fast`
; flag and the function uses attribute group #2.
define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
; SI-LABEL: fma_neg_b_c_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; SI-NEXT: v_mov_b32_e32 v13, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16
; SI-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v3, v11, -v3, -v7
; SI-NEXT: v_fma_f32 v2, v10, -v2, -v6
; SI-NEXT: v_fma_f32 v1, v9, -v1, -v5
; SI-NEXT: v_fma_f32 v0, v8, -v0, -v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[12:13], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; GFX11-LABEL: fma_neg_b_c_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr <4 x float>, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr <4 x float>, ptr addrspace(1) %gep.1, i32 2
  %gep.out = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid

  %tmp0 = load <4 x float>, ptr addrspace(1) %gep.0
  %tmp1 = load <4 x float>, ptr addrspace(1) %gep.1
  %tmp2 = load <4 x float>, ptr addrspace(1) %gep.2

  %fneg0 = fneg fast <4 x float> %tmp0
  %fneg1 = fneg fast <4 x float> %tmp1
  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)

  store <4 x float> %fma0, ptr addrspace(1) %gep.out
  ret void
}
|
||
|
|
||
|
; Attribute groups referenced by the declarations and kernels in this file.
; #0 is attached to the intrinsic declarations at the top of the file and to
; the two scalar fma-with-fneg kernels.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
; #2 additionally enables no-signed-zeros FP math; the <4 x float> fma kernel
; uses it.
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
|