; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

; Every function below feeds the result of llvm.amdgcn.mov.dpp.i32 into one of
; the fp8/bf8 conversion intrinsics. The mov.dpp operands are always
; (228, 15, 15, true), which the expected assembly prints as
; quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1.
;
; In the expected output, the cases that select byte 0 / word 0 show a single
; DPP-form convert and no separate v_mov_b32_dpp. Every other selector shows
; v_mov_b32_dpp followed by a convert carrying an op_sel modifier.

; cvt.f32.bf8, byte selector 0: expected output is one v_cvt_f32_bf8_dpp.
define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    ; return to shader part epilog
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %cvt = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %dpp, i32 0)
  ret float %cvt
}

; cvt.f32.bf8, byte selector 1: expected output keeps the DPP move and uses
; op_sel:[1,0] on the convert.
define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
; GFX12-NEXT:    ; return to shader part epilog
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %cvt = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %dpp, i32 1)
  ret float %cvt
}

; cvt.f32.bf8, byte selector 2: expected output keeps the DPP move and uses
; op_sel:[0,1] on the convert.
define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT:    ; return to shader part epilog
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %cvt = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %dpp, i32 2)
  ret float %cvt
}

; cvt.f32.fp8 (note: fp8, not bf8), byte selector 3: expected output keeps the
; DPP move and uses op_sel:[1,1] on the convert.
define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
; GFX12-NEXT:    ; return to shader part epilog
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %cvt = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %dpp, i32 3)
  ret float %cvt
}

; cvt.pk.bf8.f32 with the final i1 operand false: expected output is one
; v_cvt_pk_bf8_f32_e64_dpp. The DPP result is bitcast to float to form the
; first source operand; the packed i32 result is stored to %out.
define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %dpp.f32 = bitcast i32 %dpp to float
  %packed = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %dpp.f32, float %y, i32 %old, i1 false)
  store i32 %packed, ptr addrspace(1) %out
  ret void
}

; cvt.pk.fp8.f32 with the final i1 operand true: expected output keeps the DPP
; move and uses op_sel:[0,0,1] on the convert.
define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %dpp.f32 = bitcast i32 %dpp to float
  %packed = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %dpp.f32, float %y, i32 %old, i1 true)
  store i32 %packed, ptr addrspace(1) %out
  ret void
}

; cvt.sr.bf8.f32, byte selector 0: expected output is one
; v_cvt_sr_bf8_f32_e64_dpp.
define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %dpp.f32 = bitcast i32 %dpp to float
  %packed = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %dpp.f32, i32 %r, i32 %old, i32 0)
  store i32 %packed, ptr addrspace(1) %out
  ret void
}

; cvt.sr.fp8.f32, byte selector 1: expected output keeps the DPP move and uses
; op_sel:[0,0,1,0] on the convert.
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %dpp.f32 = bitcast i32 %dpp to float
  %packed = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %dpp.f32, i32 %r, i32 %old, i32 1)
  store i32 %packed, ptr addrspace(1) %out
  ret void
}

; cvt.sr.fp8.f32, byte selector 2: expected output keeps the DPP move and uses
; op_sel:[0,0,0,1] on the convert.
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %dpp = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
  %dpp.f32 = bitcast i32 %dpp to float
  %packed = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %dpp.f32, i32 %r, i32 %old, i32 2)
  store i32 %packed, ptr addrspace(1) %out
  ret void
}

; Conversion intrinsics exercised above.
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)

; DPP intrinsics. No function in this file calls mov.dpp8; only its
; declaration is present.
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1

; Attribute group #0 is defined but not referenced by anything in this file.
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }