bolt/deps/llvm-18.1.8/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ

; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.
; test1: calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.512 on %a and %b with a
; zeroinitializer addend and an all-ones mask, then adds the result to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfcmaddcph accumulating
; into %zmm0; otherwise vxorps + vfcmaddcph into %zmm3 followed by vaddph.
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test1:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test1:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <16 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <32 x half> %a to <16 x float>
  %rhs = bitcast <32 x half> %b to <16 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %lhs, <16 x float> %rhs, <16 x float> zeroinitializer, i16 -1, i32 4)
  %fma.ph = bitcast <16 x float> %fma to <32 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <32 x half> %fma.ph, %acc
  ret <32 x half> %sum
}

; test2: same shape as the 512-bit vfcmadd case, but using
; @llvm.x86.avx512fp16.mask.vfmadd.cph.512. The addend is zeroinitializer and
; the result is added to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfmaddcph accumulating
; into %zmm0; otherwise vxorps + vfmaddcph into %zmm3 followed by vaddph.
define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test2:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test2:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <16 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <32 x half> %a to <16 x float>
  %rhs = bitcast <32 x half> %b to <16 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %lhs, <16 x float> %rhs, <16 x float> zeroinitializer, i16 -1, i32 4)
  %fma.ph = bitcast <16 x float> %fma to <32 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <32 x half> %fma.ph, %acc
  ret <32 x half> %sum
}

; test3: 256-bit variant. Calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.256 with a
; zeroinitializer addend and an all-ones i8 mask, then adds the result to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfcmaddcph accumulating
; into %ymm0; otherwise vxorps + vfcmaddcph into %ymm3 followed by vaddph.
define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test3:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test3:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <8 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <16 x half> %a to <8 x float>
  %rhs = bitcast <16 x half> %b to <8 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %lhs, <8 x float> %rhs, <8 x float> zeroinitializer, i8 -1)
  %fma.ph = bitcast <8 x float> %fma to <16 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <16 x half> %fma.ph, %acc
  ret <16 x half> %sum
}

; test4: 256-bit variant. Calls @llvm.x86.avx512fp16.mask.vfmadd.cph.256 with a
; zeroinitializer addend and an all-ones i8 mask, then adds the result to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfmaddcph accumulating
; into %ymm0; otherwise vxorps + vfmaddcph into %ymm3 followed by vaddph.
define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test4:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test4:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <8 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <16 x half> %a to <8 x float>
  %rhs = bitcast <16 x half> %b to <8 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %lhs, <8 x float> %rhs, <8 x float> zeroinitializer, i8 -1)
  %fma.ph = bitcast <8 x float> %fma to <16 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <16 x half> %fma.ph, %acc
  ret <16 x half> %sum
}

; test5: 128-bit variant. Calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.128 with a
; zeroinitializer addend and an all-ones i8 mask, then adds the result to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfcmaddcph accumulating
; into %xmm0; otherwise vxorps + vfcmaddcph into %xmm3 followed by vaddph.
define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test5:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test5:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <4 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <8 x half> %a to <4 x float>
  %rhs = bitcast <8 x half> %b to <4 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %lhs, <4 x float> %rhs, <4 x float> zeroinitializer, i8 -1)
  %fma.ph = bitcast <4 x float> %fma to <8 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <8 x half> %fma.ph, %acc
  ret <8 x half> %sum
}

; test6: 128-bit variant. Calls @llvm.x86.avx512fp16.mask.vfmadd.cph.128 with a
; zeroinitializer addend and an all-ones i8 mask, then adds the result to %acc.
; Expected: with --enable-no-signed-zeros-fp-math, one vfmaddcph accumulating
; into %xmm0; otherwise vxorps + vfmaddcph into %xmm3 followed by vaddph.
define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test6:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test6:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  ; The intrinsic takes <4 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <8 x half> %a to <4 x float>
  %rhs = bitcast <8 x half> %b to <4 x float>
  ; Addend is all zero bits (+0.0 in every half lane); the mask is all ones.
  %fma = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %lhs, <4 x float> %rhs, <4 x float> zeroinitializer, i8 -1)
  %fma.ph = bitcast <4 x float> %fma to <8 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <8 x half> %fma.ph, %acc
  ret <8 x half> %sum
}

; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.
; test13: calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.512 with an addend that is
; a splat of float 0xB790000000000000 (bit pattern 0x80008000, i.e. -0.0 in both
; half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfcmaddcph accumulating into %zmm0.
define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <16 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <32 x half> %a to <16 x float>
  %rhs = bitcast <32 x half> %b to <16 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %lhs, <16 x float> %rhs, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %fma.ph = bitcast <16 x float> %fma to <32 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <32 x half> %fma.ph, %acc
  ret <32 x half> %sum
}

; test14: calls @llvm.x86.avx512fp16.mask.vfmadd.cph.512 with an addend that is
; a splat of float 0xB790000000000000 (bit pattern 0x80008000, i.e. -0.0 in both
; half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfmaddcph accumulating into %zmm0.
define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <16 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <32 x half> %a to <16 x float>
  %rhs = bitcast <32 x half> %b to <16 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %lhs, <16 x float> %rhs, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %fma.ph = bitcast <16 x float> %fma to <32 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <32 x half> %fma.ph, %acc
  ret <32 x half> %sum
}

; test15: 256-bit variant. Calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.256 with
; an addend that is a splat of float 0xB790000000000000 (bit pattern 0x80008000,
; i.e. -0.0 in both half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfcmaddcph accumulating into %ymm0.
define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <8 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <16 x half> %a to <8 x float>
  %rhs = bitcast <16 x half> %b to <8 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %lhs, <8 x float> %rhs, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %fma.ph = bitcast <8 x float> %fma to <16 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <16 x half> %fma.ph, %acc
  ret <16 x half> %sum
}

; test16: 256-bit variant. Calls @llvm.x86.avx512fp16.mask.vfmadd.cph.256 with
; an addend that is a splat of float 0xB790000000000000 (bit pattern 0x80008000,
; i.e. -0.0 in both half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfmaddcph accumulating into %ymm0.
define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <8 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <16 x half> %a to <8 x float>
  %rhs = bitcast <16 x half> %b to <8 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %lhs, <8 x float> %rhs, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %fma.ph = bitcast <8 x float> %fma to <16 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <16 x half> %fma.ph, %acc
  ret <16 x half> %sum
}

; test17: 128-bit variant. Calls @llvm.x86.avx512fp16.mask.vfcmadd.cph.128 with
; an addend that is a splat of float 0xB790000000000000 (bit pattern 0x80008000,
; i.e. -0.0 in both half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfcmaddcph accumulating into %xmm0.
define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test17:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <4 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <8 x half> %a to <4 x float>
  %rhs = bitcast <8 x half> %b to <4 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %lhs, <4 x float> %rhs, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %fma.ph = bitcast <4 x float> %fma to <8 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <8 x half> %fma.ph, %acc
  ret <8 x half> %sum
}

; test18: 128-bit variant. Calls @llvm.x86.avx512fp16.mask.vfmadd.cph.128 with
; an addend that is a splat of float 0xB790000000000000 (bit pattern 0x80008000,
; i.e. -0.0 in both half lanes of every float), then adds the result to %acc.
; Expected under both llc invocations: one vfmaddcph accumulating into %xmm0.
define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test18:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  ; The intrinsic takes <4 x float> operands, so reinterpret the half vectors.
  %lhs = bitcast <8 x half> %a to <4 x float>
  %rhs = bitcast <8 x half> %b to <4 x float>
  ; Addend is -0.0 in every half lane; the mask is all ones.
  %fma = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %lhs, <4 x float> %rhs, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %fma.ph = bitcast <4 x float> %fma to <8 x half>
  ; Separate fadd of the accumulator; this is the add the combine folds away.
  %sum = fadd <8 x half> %fma.ph, %acc
  ret <8 x half> %sum
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
Embed LLVM 18.1.8 2025-02-14 19:21:04 +01:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 \| FileCheck %s --check-prefixes=CHECK,NO-SZ`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 \| FileCheck %s --check-prefixes=CHECK,HAS-SZ`

			`; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.`
			`define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {`
			`; NO-SZ-LABEL: test1:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test1:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm3`
			`; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <32 x half> %a to <16 x float>`
			`%1 = bitcast <32 x half> %b to <16 x float>`
			`%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)`
			`%3 = bitcast <16 x float> %2 to <32 x half>`
			`%add.i = fadd <32 x half> %3, %acc`
			`ret <32 x half> %add.i`
			`}`

			`define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {`
			`; NO-SZ-LABEL: test2:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test2:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfmaddcph %zmm2, %zmm1, %zmm3`
			`; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <32 x half> %a to <16 x float>`
			`%1 = bitcast <32 x half> %b to <16 x float>`
			`%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)`
			`%3 = bitcast <16 x float> %2 to <32 x half>`
			`%add.i = fadd <32 x half> %3, %acc`
			`ret <32 x half> %add.i`
			`}`

			`define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {`
			`; NO-SZ-LABEL: test3:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test3:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm3`
			`; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <16 x half> %a to <8 x float>`
			`%1 = bitcast <16 x half> %b to <8 x float>`
			`%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)`
			`%3 = bitcast <8 x float> %2 to <16 x half>`
			`%add.i = fadd <16 x half> %3, %acc`
			`ret <16 x half> %add.i`
			`}`

			`define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {`
			`; NO-SZ-LABEL: test4:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test4:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfmaddcph %ymm2, %ymm1, %ymm3`
			`; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <16 x half> %a to <8 x float>`
			`%1 = bitcast <16 x half> %b to <8 x float>`
			`%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)`
			`%3 = bitcast <8 x float> %2 to <16 x half>`
			`%add.i = fadd <16 x half> %3, %acc`
			`ret <16 x half> %add.i`
			`}`

			`define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {`
			`; NO-SZ-LABEL: test5:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test5:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm3`
			`; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <8 x half> %a to <4 x float>`
			`%1 = bitcast <8 x half> %b to <4 x float>`
			`%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)`
			`%3 = bitcast <4 x float> %2 to <8 x half>`
			`%add.i = fadd <8 x half> %3, %acc`
			`ret <8 x half> %add.i`
			`}`

			`define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {`
			`; NO-SZ-LABEL: test6:`
			`; NO-SZ: # %bb.0: # %entry`
			`; NO-SZ-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0`
			`; NO-SZ-NEXT: retq`
			`;`
			`; HAS-SZ-LABEL: test6:`
			`; HAS-SZ: # %bb.0: # %entry`
			`; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3`
			`; HAS-SZ-NEXT: vfmaddcph %xmm2, %xmm1, %xmm3`
			`; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0`
			`; HAS-SZ-NEXT: retq`
			`entry:`
			`%0 = bitcast <8 x half> %a to <4 x float>`
			`%1 = bitcast <8 x half> %b to <4 x float>`
			`%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)`
			`%3 = bitcast <4 x float> %2 to <8 x half>`
			`%add.i = fadd <8 x half> %3, %acc`
			`ret <8 x half> %add.i`
			`}`

			`; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.`
			`define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {`
			`; CHECK-LABEL: test13:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <32 x half> %a to <16 x float>`
			`%1 = bitcast <32 x half> %b to <16 x float>`
			`%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)`
			`%3 = bitcast <16 x float> %2 to <32 x half>`
			`%add.i = fadd <32 x half> %3, %acc`
			`ret <32 x half> %add.i`
			`}`

			`define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {`
			`; CHECK-LABEL: test14:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <32 x half> %a to <16 x float>`
			`%1 = bitcast <32 x half> %b to <16 x float>`
			`%2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)`
			`%3 = bitcast <16 x float> %2 to <32 x half>`
			`%add.i = fadd <32 x half> %3, %acc`
			`ret <32 x half> %add.i`
			`}`

			`define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {`
			`; CHECK-LABEL: test15:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <16 x half> %a to <8 x float>`
			`%1 = bitcast <16 x half> %b to <8 x float>`
			`%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)`
			`%3 = bitcast <8 x float> %2 to <16 x half>`
			`%add.i = fadd <16 x half> %3, %acc`
			`ret <16 x half> %add.i`
			`}`

			`define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {`
			`; CHECK-LABEL: test16:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <16 x half> %a to <8 x float>`
			`%1 = bitcast <16 x half> %b to <8 x float>`
			`%2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)`
			`%3 = bitcast <8 x float> %2 to <16 x half>`
			`%add.i = fadd <16 x half> %3, %acc`
			`ret <16 x half> %add.i`
			`}`

			`define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {`
			`; CHECK-LABEL: test17:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <8 x half> %a to <4 x float>`
			`%1 = bitcast <8 x half> %b to <4 x float>`
			`%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)`
			`%3 = bitcast <4 x float> %2 to <8 x half>`
			`%add.i = fadd <8 x half> %3, %acc`
			`ret <8 x half> %add.i`
			`}`

			`define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {`
			`; CHECK-LABEL: test18:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0`
			`; CHECK-NEXT: retq`
			`entry:`
			`%0 = bitcast <8 x half> %a to <4 x float>`
			`%1 = bitcast <8 x half> %b to <4 x float>`
			`%2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)`
			`%3 = bitcast <4 x float> %2 to <8 x half>`
			`%add.i = fadd <8 x half> %3, %acc`
			`ret <8 x half> %add.i`
			`}`

			`declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)`
			`declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)`
			`declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)`
			`declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)`
			`declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)`
			`declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)`