; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare i16 @llvm.umax.i16(i16, i16) declare i64 @llvm.umin.i64(i64, i64) declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; VI-LABEL: fmul_pow2_4xfloat: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 1 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 1 ; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v3, v3 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; VI-NEXT: v_mul_f32_e32 v1, 0x41100000, v1 ; VI-NEXT: v_mul_f32_e32 v2, 0x41100000, v2 ; VI-NEXT: v_mul_f32_e32 v3, 0x41100000, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow2_4xfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 1 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x41100000, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x41100000, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x41100000, v3 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow2_4xfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 1 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 1 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1 ; GFX11-NEXT: v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: 9.0 * uitofp(1 << %i); both constants are splats. %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f ret <4 x float> %r } define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; VI-LABEL: fmul_pow2_ldexp_4xfloat: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, 0x41100000 ; VI-NEXT: v_ldexp_f32 v0, s4, v0 ; VI-NEXT: v_ldexp_f32 v1, s4, v1 ; VI-NEXT: v_ldexp_f32 v2, s4, v2 ; VI-NEXT: v_ldexp_f32 v3, s4, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow2_ldexp_4xfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ldexp_f32 v0, 0x41100000, v0 ; GFX10-NEXT: v_ldexp_f32 v1, 0x41100000, v1 ; GFX10-NEXT: v_ldexp_f32 v2, 0x41100000, v2 ; GFX10-NEXT: v_ldexp_f32 v3, 0x41100000, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow2_ldexp_4xfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_ldexp_f32 v0, 0x41100000, v0 ; GFX11-NEXT: v_ldexp_f32 v1, 0x41100000, v1 ; GFX11-NEXT: v_ldexp_f32 v2, 0x41100000, v2 ; 
GFX11-NEXT: v_ldexp_f32 v3, 0x41100000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: ldexp(9.0, %i) with a splat 9.0 first operand. %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i) ret <4 x float> %r } define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) { ; VI-LABEL: fdiv_pow2_4xfloat: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v1 ; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2 ; VI-NEXT: v_lshlrev_b32_e32 v3, 23, v3 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41100000, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 0x41100000, v1 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41100000, v2 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 0x41100000, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow2_4xfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 23, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 23, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow2_4xfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 23, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 23, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: 9.0 / uitofp(1 << %i); both constants are splats. %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fdiv <4 x 
float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f ret <4 x float> %r } declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; VI-LABEL: fmul_pow2_8xhalf: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, 1 ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, 1 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e64 v6, v2, 1 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e64 v7, v1, 1 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e64 v8, v0, 1 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 ; VI-NEXT: v_cvt_f16_u16_e32 v5, v8 ; VI-NEXT: v_cvt_f16_u16_e32 v1, v1 ; VI-NEXT: v_cvt_f16_u16_e32 v7, v7 ; VI-NEXT: v_cvt_f16_u16_e32 v2, v2 ; VI-NEXT: v_cvt_f16_u16_e32 v6, v6 ; VI-NEXT: v_cvt_f16_u16_e32 v3, v3 ; VI-NEXT: v_cvt_f16_u16_e32 v4, v4 ; VI-NEXT: v_mov_b32_e32 v8, 0x7000 ; VI-NEXT: v_mul_f16_e32 v4, 0x7000, v4 ; VI-NEXT: v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v6, 0x7000, v6 ; VI-NEXT: v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v7, 0x7000, v7 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v5, 0x7000, v5 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: v_or_b32_e32 v1, v7, v1 ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow2_8xhalf: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3 ; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2 ; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1 ; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0 ; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1 ; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2 ; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow2_8xhalf: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f16_u16_e32 v4, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_cvt_f16_u16_e32 v5, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_u16_e32 v6, v6 ; GFX11-NEXT: 
v_cvt_f16_u16_e32 v7, v7 ; GFX11-NEXT: v_cvt_f16_u16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f16_u16_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2 ; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: 0xH7000 (8192.0) * uitofp(1 << %i); both constants are splats. %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i %p2_f = uitofp <8 x i16> %p2 to <8 x half> %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f ret <8 x half> %r } define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; VI-LABEL: fmul_pow2_ldexp_8xhalf: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, 0x7000 ; VI-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3 ; VI-NEXT: v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2 ; VI-NEXT: v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1 ; VI-NEXT: v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0 ; VI-NEXT: v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 ; VI-NEXT: v_or_b32_e32 v1, v7, v1 ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow2_ldexp_8xhalf: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000 ; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3 ; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2 ; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1 ; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0 ; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0 ; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1 ; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2 ; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow2_ldexp_8xhalf: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_ldexp_f16_e32 v5, 0x7000, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_ldexp_f16_e32 v1, 0x7000, v1 ; GFX11-NEXT: v_ldexp_f16_e32 v0, 0x7000, v0 ; GFX11-NEXT: v_ldexp_f16_e32 v6, 0x7000, v6 ; GFX11-NEXT: v_ldexp_f16_e32 v7, 0x7000, v7 ; GFX11-NEXT: v_ldexp_f16_e32 v2, 0x7000, v2 ; GFX11-NEXT: v_ldexp_f16_e32 v3, 0x7000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2 ; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: ldexp(0xH7000 (8192.0), %i) with a splat first operand. %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> 
%i) ret <8 x half> %r } define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; VI-LABEL: fdiv_pow2_8xhalf: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, 10 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mov_b32_e32 v6, 0x7000 ; VI-NEXT: v_lshlrev_b16_e32 v3, 10, v3 ; VI-NEXT: v_lshlrev_b16_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_e32 v2, 10, v2 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_e32 v1, 10, v1 ; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 ; VI-NEXT: v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0 ; VI-NEXT: v_sub_u16_e32 v1, 0x7000, v1 ; VI-NEXT: v_sub_u16_e32 v2, 0x7000, v2 ; VI-NEXT: v_sub_u16_e32 v3, 0x7000, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v2, v2, v7 ; VI-NEXT: v_or_b32_e32 v3, v3, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow2_8xhalf: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1] ; GFX10-NEXT: 
v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow2_8xhalf: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; Per lane: 0xH7000 (8192.0) / uitofp(1 << %i); both constants are splats. %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i %p2_f = uitofp <8 x i16> %p2 to <8 x half> %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f ret <8 x half> %r } define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rdi, %rcx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: movq %rax, %xmm1 ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rdi, %rcx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: vmovq %rax, 
%xmm0 ; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; VI-NEXT: s_mov_b32 s5, 0x40220000 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; 9.0 * uitofp(1 << %cnt) as double, with an i64 shift. %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv ret double %mul } define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rdi, %rcx ; CHECK-SSE-NEXT: movl $2, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: movq %rax, %xmm1 ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rdi, %rcx ; CHECK-AVX2-NEXT: movl $2, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 ; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt2: ; CHECK-NO-FASTFMA: # %bb.0: ; 
CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; CHECK-NO-FASTFMA-NEXT: movl $2, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $2, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; VI-NEXT: s_mov_b32 s5, 0xc0220000 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -9.0 * uitofp(2 << %cnt) as double: shift base 2 and a negative FP constant. %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double %mul = fmul double -9.000000e+00, %conv ret double %mul } define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: andl $1, %esi ; CHECK-SSE-NEXT: movl $2, %eax ; CHECK-SSE-NEXT: subl %esi, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_select: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: andl $1, %esi ; CHECK-AVX2-NEXT: movl $2, %eax ; CHECK-AVX2-NEXT: subl %esi, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_select: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: andl $1, %esi ; CHECK-NO-FASTFMA-NEXT: movl $2, %eax ; CHECK-NO-FASTFMA-NEXT: subl %esi, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_select: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: andl $1, %esi ; CHECK-FMA-NEXT: movl $2, %eax ; CHECK-FMA-NEXT: subl %esi, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_select: ; VI: ; 
%bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 1, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_select: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_select: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; 9.0 * uitofp(%c ? 1 << %cnt : 2 << %cnt) as float, with i32 shifts. %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 %conv = uitofp i32 %shl to float %mul = fmul float 9.000000e+00, %conv ret float %mul } define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rdi, %rcx ; CHECK-SSE-NEXT: movl $8, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000 ; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000 ; CHECK-SSE-NEXT: cmovbq %rax, %rcx ; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0 ; CHECK-SSE-NEXT: mulss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rdi, %rcx ; CHECK-AVX2-NEXT: movl $8, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000 ; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000 ; CHECK-AVX2-NEXT: cmovbq %rax, %rcx ; CHECK-AVX2-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: cmpq $8192, %rax # imm = 0x2000 ; CHECK-NO-FASTFMA-NEXT: movl $8192, %ecx # imm = 0x2000 ; CHECK-NO-FASTFMA-NEXT: cmovbq %rax, %rcx ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: cmpq $8192, %rax # imm = 0x2000 ; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000 ; CHECK-FMA-NEXT: cmovbq %rax, %rcx ; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_fly_pow_mul_min_pow2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; VI-NEXT: s_mov_b64 s[4:5], 0x2000 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 0x2000 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-NEXT: v_ffbh_u32_e32 v2, v1 ; VI-NEXT: v_min_u32_e32 v2, 32, 
v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_fly_pow_mul_min_pow2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_fly_pow_mul_min_pow2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; 9.0 * uitofp(umin(8 << %cnt, 8192)) as float, with an i64 shift. %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float %mul = fmul float 9.000000e+00, %conv ret float %mul } define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $2, %eax ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: movl $1, %edx ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %edx ; CHECK-SSE-NEXT: cmpw %ax, %dx ; CHECK-SSE-NEXT: cmovbel %eax, %edx ; CHECK-SSE-NEXT: movzwl %dx, %eax ; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_mul_max_pow2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $2, %eax ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movl $1, %edx ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %edx ; CHECK-AVX2-NEXT: cmpw %ax, %dx ; CHECK-AVX2-NEXT: cmovbel %eax, %edx ; CHECK-AVX2-NEXT: movzwl %dx, %eax ; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_mul_max_pow2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $2, %eax ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movl $1, %edx ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %edx ; CHECK-NO-FASTFMA-NEXT: cmpw %ax, %dx ; CHECK-NO-FASTFMA-NEXT: cmovbel %eax, %edx ; CHECK-NO-FASTFMA-NEXT: movzwl %dx, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_mul_max_pow2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $2, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movl $1, %ecx ; CHECK-FMA-NEXT: shlxl %edi, %ecx, %ecx ; CHECK-FMA-NEXT: cmpw %ax, %cx ; CHECK-FMA-NEXT: cmoval %ecx, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax ; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_mul_max_pow2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, 0x40080000 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_mul_max_pow2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_mul_max_pow2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2) %conv = uitofp i16 %shl to double %mul = fmul double 3.000000e+00, %conv ret double %mul } define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rsi, %rcx ; 
CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rdi ; CHECK-SSE-NEXT: movq %rdi, %xmm1 ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rsi, %rcx ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rdi ; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0 ; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax ; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; 
VI-NEXT: s_mov_b32 s5, 0x40220000 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i64 %v, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv ret double %mul } define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [2,2] ; CHECK-SSE-NEXT: movdqa %xmm3, %xmm1 ; CHECK-SSE-NEXT: psllq %xmm2, %xmm1 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm3 ; CHECK-SSE-NEXT: movq %xmm3, %rax ; CHECK-SSE-NEXT: testq %rax, %rax ; CHECK-SSE-NEXT: js .LBB6_1 ; CHECK-SSE-NEXT: # %bb.2: ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; 
CHECK-SSE-NEXT: jmp .LBB6_3 ; CHECK-SSE-NEXT: .LBB6_1: ; CHECK-SSE-NEXT: movq %rax, %rcx ; CHECK-SSE-NEXT: shrq %rcx ; CHECK-SSE-NEXT: andl $1, %eax ; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; CHECK-SSE-NEXT: addss %xmm0, %xmm0 ; CHECK-SSE-NEXT: .LBB6_3: ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-SSE-NEXT: movq %xmm1, %rax ; CHECK-SSE-NEXT: testq %rax, %rax ; CHECK-SSE-NEXT: js .LBB6_4 ; CHECK-SSE-NEXT: # %bb.5: ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: jmp .LBB6_6 ; CHECK-SSE-NEXT: .LBB6_4: ; CHECK-SSE-NEXT: movq %rax, %rcx ; CHECK-SSE-NEXT: shrq %rcx ; CHECK-SSE-NEXT: andl $1, %eax ; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 ; CHECK-SSE-NEXT: .LBB6_6: ; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vmovq %xmm1, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 ; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; 
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2 ; VI-NEXT: v_ffbh_u32_e32 v3, v2 ; VI-NEXT: v_min_u32_e32 v5, 32, v3 ; VI-NEXT: v_lshlrev_b64 v[1:2], v5, v[1:2] ; VI-NEXT: v_lshlrev_b64 v[3:4], v0, 2 ; VI-NEXT: v_min_u32_e32 v0, 1, v1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v4 ; VI-NEXT: v_min_u32_e32 v6, 32, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], v6, v[3:4] ; VI-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 ; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_ldexp_f32 v1, v2, v3 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; VI-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-NEXT: v_mul_f32_e32 v0, 0x41700000, v0 ; VI-NEXT: v_mul_f32_e32 v1, 0x41700000, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41700000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x41700000, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v4 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v3 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x float> %mul = fmul <2 x float> , %conv ret <2 x float> %mul } define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2] ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 ; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: psrlq $32, %xmm1 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 ; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: 
vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_vec: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2 ; VI-NEXT: s_mov_b32 s5, 0x402e0000 ; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv ret <2 x double> %mul } define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind { ; 
CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2] ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm0 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] ; CHECK-SSE-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-SSE-NEXT: psrld $16, %xmm0 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: addps %xmm2, %xmm0 ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] ; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928] ; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] ; CHECK-AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] ; CHECK-AVX2-NEXT: vmulps 
%xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0 ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] ; CHECK-NO-FASTFMA-NEXT: vmulps %xmm2, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vzeroupper ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 ; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 2 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 2 ; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 2 ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2 ; VI-NEXT: v_cvt_f32_u32_e32 v3, v3 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3 ; VI-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2 ; VI-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1 ; VI-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-NEXT: v_add_f32_e32 v3, v3, v7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 2 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 2 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 2 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 2 ; GFX10-NEXT: 
v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 2 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 2 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 2 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1 ; GFX11-NEXT: v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <4 x i32> , %cnt %conv = uitofp <4 x i32> %shl to <4 x float> %mul = fmul <4 x float> , %conv %res = fadd <4 x float> %mul, %add ret <4 x float> %res } define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-SSE: # %bb.0: ; 
CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2] ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 ; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: psrlq $32, %xmm1 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 ; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; 
CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1 ; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2 ; VI-NEXT: s_mov_b32 s5, 0x402e0000 ; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2 ; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1] ; VI-NEXT: v_mul_f64 v[0:1], v[2:3], s[4:5] ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, 0x402c0000 ; VI-NEXT: v_mul_f64 v[2:3], v[4:5], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv ret <2 x double> %mul } define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,1] ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 ; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: psrlq $32, %xmm1 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; 
CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 ; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: 
retq ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 1 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2 ; VI-NEXT: s_mov_b32 s5, 0x402e0000 ; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv ret <2 x double> %mul } define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: subq $40, %rsp ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pextrw $1, %xmm0, %eax ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: addq $40, %rsp ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $40, %rsp ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-AVX2-NEXT: addq $40, %rsp ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm1 ; CHECK-NO-FASTFMA-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrw $1, %xmm1, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm1, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NO-FASTFMA-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0] ; CHECK-NO-FASTFMA-NEXT: xorl %eax, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax ; CHECK-NO-FASTFMA-NEXT: vmovd %eax, %xmm2 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastw %xmm2, %xmm2 ; CHECK-NO-FASTFMA-NEXT: vpermt2ps %zmm0, %zmm1, %zmm2 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm2, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vzeroupper ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastw 
{{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vpextrw $7, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; CHECK-FMA-NEXT: vmovd %xmm1, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vpextrw $6, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; CHECK-FMA-NEXT: vmovd %xmm2, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 ; CHECK-FMA-NEXT: vpextrw $5, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; CHECK-FMA-NEXT: vmovd %xmm3, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 ; CHECK-FMA-NEXT: vpextrw $4, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4 ; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm2 ; CHECK-FMA-NEXT: vmovd %xmm2, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 ; CHECK-FMA-NEXT: vpextrw $3, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; CHECK-FMA-NEXT: vmovd %xmm4, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 ; CHECK-FMA-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; CHECK-FMA-NEXT: vmovd %xmm2, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 ; CHECK-FMA-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4 ; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm3 ; CHECK-FMA-NEXT: vmovd %xmm3, %eax ; CHECK-FMA-NEXT: vpinsrw $0, 
%eax, %xmm0, %xmm3 ; CHECK-FMA-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-FMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 ; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; CHECK-FMA-NEXT: vzeroupper ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 2 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 ; VI-NEXT: v_cvt_f16_u16_e32 v1, v1 ; VI-NEXT: v_mov_b32_e32 v2, 0x4b80 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v0, 0x4b80, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] ; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
  ; Body of the <2 x half> test: a splat constant is shifted left by the
  ; variable %cnt, converted with uitofp and multiplied by a splat constant.
  ; Both splats are spelled out explicitly; their values agree with the
  ; assertions for this function (shift of the immediate 2, and a multiply
  ; by 0x4b80, which is the half encoding of 15.0).
  %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
  %conv = uitofp <2 x i16> %shl to <2 x half>
  %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
  ret <2 x half> %mul
}

define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rdi, %rcx
; CHECK-SSE-NEXT: movl $1, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rax
; CHECK-SSE-NEXT: movq %rax, %xmm1
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rdi, %rcx
; CHECK-AVX2-NEXT: movl $1, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rax
; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
; CHECK-NO-FASTFMA-NEXT: movl $1,
%eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; VI-NEXT: s_mov_b32 s4, 0xff5f3992 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; VI-NEXT: s_mov_b32 s5, 0x7befffff ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4] ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992 ; GFX10-NEXT: s_mov_b32 s5, 0x7befffff ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 ; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992 ; GFX11-NEXT: s_mov_b32 s1, 0x7befffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.745314e+288, %conv ret double %mul } define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: movzwl %ax, %eax ; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %eax ; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax ; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 
; CHECK-FMA-NEXT: retq ; VI-LABEL: fmul_pow_shl_cnt_safe: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_mov_b32 s4, 0xff5f3992 ; VI-NEXT: s_mov_b32 s5, 0x7befffff ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_pow_shl_cnt_safe: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 ; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992 ; GFX10-NEXT: s_mov_b32 s5, 0x7befffff ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fmul_pow_shl_cnt_safe: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 ; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992 ; GFX11-NEXT: s_mov_b32 s1, 0x7befffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double %mul = fmul double 9.745314e+288, %conv ret double %mul } define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 ; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: psrlq $32, %xmm1 ; 
CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] ; CHECK-SSE-NEXT: divpd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] ; CHECK-AVX2-NEXT: # xmm1 = mem[0,0] ; CHECK-AVX2-NEXT: vdivpd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] ; CHECK-NO-FASTFMA-NEXT: # xmm1 = mem[0,0] ; CHECK-NO-FASTFMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; 
CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
; CHECK-FMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; CHECK-FMA-NEXT: # xmm1 = mem[0,0]
; CHECK-FMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_vec:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3ff00000
; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v4, 20, v2
; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 20, v2
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 20, v2
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
  ; Body of the <2 x double> fdiv test: a splat of 1 is shifted left by the
  ; variable %cnt, converted with uitofp and used as the divisor of a splat
  ; of 1.0. The splat operands are spelled out explicitly; they agree with
  ; the assertions above, which shift %cnt left by 20 and subtract it from
  ; 0x3ff00000 instead of emitting a divide.
  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x double>
  %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
  ret <2 x double> %mul
}

define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT:
pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,1] ; CHECK-SSE-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm1, %xmm2 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm3 ; CHECK-SSE-NEXT: movq %xmm3, %rax ; CHECK-SSE-NEXT: testq %rax, %rax ; CHECK-SSE-NEXT: js .LBB15_1 ; CHECK-SSE-NEXT: # %bb.2: ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: jmp .LBB15_3 ; CHECK-SSE-NEXT: .LBB15_1: ; CHECK-SSE-NEXT: movq %rax, %rcx ; CHECK-SSE-NEXT: shrq %rcx ; CHECK-SSE-NEXT: andl $1, %eax ; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 ; CHECK-SSE-NEXT: .LBB15_3: ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; CHECK-SSE-NEXT: movq %xmm0, %rax ; CHECK-SSE-NEXT: testq %rax, %rax ; CHECK-SSE-NEXT: js .LBB15_4 ; CHECK-SSE-NEXT: # %bb.5: ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; CHECK-SSE-NEXT: jmp .LBB15_6 ; CHECK-SSE-NEXT: .LBB15_4: ; CHECK-SSE-NEXT: movq %rax, %rcx ; CHECK-SSE-NEXT: shrq %rcx ; CHECK-SSE-NEXT: andl $1, %eax ; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; CHECK-SSE-NEXT: addss %xmm0, %xmm0 ; CHECK-SSE-NEXT: .LBB15_6: ; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u> ; CHECK-SSE-NEXT: divps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; CHECK-AVX2-NEXT: vmovq 
%xmm1, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 ; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; CHECK-NO-FASTFMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v2 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 1.0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 1.0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  ; Body of the i64 -> float fdiv test: a splat of 1 is shifted left by the
  ; variable %cnt, converted to <2 x float> with uitofp and used as the
  ; divisor of a splat of 1.0. The splat operands are spelled out explicitly;
  ; they agree with the assertions above, which shift %cnt left by 23 and
  ; integer-subtract it from the 1.0 operand instead of emitting a divide.
  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x float>
  %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
  ret <2 x float> %mul
}

define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rdi, %rcx
; CHECK-SSE-NEXT: movl $8, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rax
; CHECK-SSE-NEXT: testq %rax, %rax
; CHECK-SSE-NEXT: js .LBB16_1
; CHECK-SSE-NEXT: # %bb.2:
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: jmp .LBB16_3
; CHECK-SSE-NEXT: .LBB16_1:
; CHECK-SSE-NEXT: shrq %rax
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: addss %xmm1, %xmm1
; CHECK-SSE-NEXT: .LBB16_3:
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: divss %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rdi, %rcx
; CHECK-AVX2-NEXT: movl $8, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rax
; CHECK-AVX2-NEXT: testq %rax, %rax
; CHECK-AVX2-NEXT: js .LBB16_1
; CHECK-AVX2-NEXT: # %bb.2:
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
; CHECK-AVX2-NEXT: jmp .LBB16_3
; CHECK-AVX2-NEXT: .LBB16_1:
; CHECK-AVX2-NEXT: shrq %rax
; CHECK-AVX2-NEXT:
vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: .LBB16_3: ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; VI-NEXT: s_mov_b32 s6, 0xc1100000 ; VI-NEXT: v_ffbh_u32_e32 v2, v1 ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 8, %cnt %conv = uitofp i64 %shl to float %mul = fdiv float -9.000000e+00, %conv ret float %mul } define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rdi, %rcx ; CHECK-SSE-NEXT: movl $8, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rdi, %rcx ; CHECK-AVX2-NEXT: movl $8, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq 
%cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; VI-NEXT: s_mov_b32 s6, 0xc1100000 ; VI-NEXT: v_xor_b32_e32 v2, v0, v1 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; VI-NEXT: v_ffbh_i32_e32 v3, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 ; VI-NEXT: v_min_u32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 
32, v2 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX11-NEXT: v_cls_i32_e32 v3, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_div_scale_f32 v1, null, v0, v0, 0xc1100000 ; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float %mul = fdiv float -9.000000e+00, %conv ret float %mul } define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movq %rdi, %rcx ; CHECK-SSE-NEXT: andb $31, %cl ; CHECK-SSE-NEXT: movl $8, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movq %rdi, %rcx ; CHECK-AVX2-NEXT: andb $31, %cl ; CHECK-AVX2-NEXT: movl $8, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx ; 
CHECK-NO-FASTFMA-NEXT: andb $31, %cl ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: andb $31, %dil ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; VI-NEXT: v_ffbh_i32_e32 v3, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 ; VI-NEXT: v_min_u32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -0.5 ; VI-NEXT: v_div_scale_f32 v2, vcc, -0.5, v0, -0.5 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX10-NEXT: 
v_ashrrev_i32_e32 v2, 31, v0 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -0.5 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 31, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cls_i32_e32 v3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, -0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float %mul = fdiv float -0.500000e+00, %conv ret float %mul } define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pushq %rax ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm0, %xmm1 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: pushq %rax ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; 
CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_movk_i32 s4, 0x7000 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; VI-NEXT: v_rcp_f32_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX10-NEXT: s_mov_b32 s4, 0x46000000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX11-NEXT: s_mov_b32 s0, 0x46000000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to half %mul = fdiv half 0xH7000, %conv ret half %mul } define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pushq %rax ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed 
$ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: movzwl %ax, %eax ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm0, %xmm1 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: pushq %rax ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-FMA: # %bb.0: ; 
CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 ; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0 ; GFX10-NEXT: v_sub_nc_u16 v0, 0x7000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u16 v0, 0x7000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH7000, %conv ret half %mul } define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pushq %rax ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: movzwl %ax, %eax ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm0, %xmm1 
; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: pushq %rax ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; 
CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 ; VI-NEXT: v_sub_u16_e32 v0, 0x4800, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0 ; GFX10-NEXT: v_sub_nc_u16 v0, 0x4800, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u16 v0, 0x4800, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4800, %conv ret half %mul } define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pushq %rax ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: movzwl %ax, %eax ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm0, %xmm1 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: popq %rax ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: pushq %rax ; CHECK-AVX2-NEXT: movl 
%edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: popq %rax ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax ; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax ; 
CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; VI-NEXT: v_rcp_f32_e32 v1, v1 ; VI-NEXT: v_add_f32_e32 v1, v1, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 ; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4000, %conv ret half %mul } define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, 
%eax ; CHECK-SSE-NEXT: cvtsi2sd %rax, %xmm1 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-SSE-NEXT: divsd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-AVX2-NEXT: vdivsd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NO-FASTFMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-FMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000 ; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0 ; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 
0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0 ; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double %mul = fdiv double 0x36A0000000000000, %conv ret double %mul } define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movl %edi, %ecx ; CHECK-SSE-NEXT: movl $1, %eax ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-SSE-NEXT: shll %cl, %eax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: movl %edi, %ecx ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; CHECK-FMA: # %bb.0: ; 
CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq ; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s6, 0x10fffff8 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_div_scale_f32 v1, 
null, v0, v0, 0x10fffff8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float %mul = fdiv float 0x3a1fffff00000000, %conv ret float %mul }

; Divides a power-of-two constant by (float)(1 << cnt).
;
; The dividend 0x3a20000000000000 is 2^-93, whose single-precision bit
; pattern is 0x11000000 (biased exponent 34, zero mantissa). Dividing it by
; 2^cnt only lowers the exponent, so the assertions below expect the whole
; shl/uitofp/fdiv chain to be folded into integer arithmetic on the float's
; bits: cnt is shifted into the exponent field (bit 23) and subtracted from
; 0x11000000, with no int-to-float conversion and no division expansion.
;
; NOTE(review): for every in-range shift amount (cnt <= 31) the quotient is
; at least 2^-124, i.e. still a normal float; presumably that is what makes
; the fold legal here ("okay") - confirm against the combine's bounds check.
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_okay:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x11000000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl nuw i32 1, %cnt
  %conv = uitofp i32 %shl to float
  %mul = fdiv float 0x3a20000000000000, %conv
  ret float %mul
}