; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve -o - | FileCheck %s

target triple = "aarch64"

%"class.std::complex" = type { { double, double } }

; Zero initialized reduction
;
; complex<double> x = 0.0 + 0.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
;
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    neg x9, x9
; CHECK-NEXT:    mov w10, #100 // =0x64
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x9, x10
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x12, x0, x8
; CHECK-NEXT:    add x13, x1, x8
; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT:    adds x10, x10, x9
; CHECK-NEXT:    add x8, x8, x11
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}
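
; Note on the expected lowering (explanatory comment, not part of the
; autogenerated assertions): with a[i] = ar + ai*i and b[i] = br + bi*i,
; the accumulation above computes
;   x.re += ar*br - ai*bi
;   x.im += ar*bi + ai*br
; The deinterleaved fmul/fadd/fsub chain in the IR matches this pattern, and
; the backend folds it into a pair of fcmla instructions per input vector:
; the #0 rotation accumulates the ar*br and ar*bi products, and the #90
; rotation the -ai*bi and ai*br products.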
; Fixed value initialized reduction
;
; complex<double> x = 2.0 + 1.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
;
define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    fmov d0, #1.00000000
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    fmov d2, #2.00000000
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    mov w10, #100 // =0x64
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    neg x9, x9
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x9, x10
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT:    mov z1.d, p0/m, z2.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    zip2 z0.d, z1.d, z3.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x12, x0, x8
; CHECK-NEXT:    add x13, x1, x8
; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT:    adds x10, x10, x9
; CHECK-NEXT:    add x8, x8, x11
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 1.000000e+00, i32 0), %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 2.000000e+00, i32 0), %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Loop unrolled with factor 2
;
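; A plausible scalar source for this test (hedged: reconstructed from the IR,
; whose trip count of 1000 and paired accumulators are read off the code
; below, not taken from an original source file):
;
; complex<double> x = 0.0 + 0.0i;
; for (int i = 0; i < 1000; ++i)
;   x += a[i] * b[i];
;
; with the vectorized loop interleaved (unrolled) by a factor of 2, giving
; two independent accumulator pairs that are summed in the exit block.
;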
define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    cntw x9
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    neg x9, x9
; CHECK-NEXT:    mov w10, #1000 // =0x3e8
; CHECK-NEXT:    rdvl x12, #2
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x9, x10
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:    add x11, x1, x12
; CHECK-NEXT:    add x12, x0, x12
; CHECK-NEXT:    rdvl x13, #4
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    mov z3.d, z0.d
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x14, x0, x8
; CHECK-NEXT:    add x15, x12, x8
; CHECK-NEXT:    add x16, x1, x8
; CHECK-NEXT:    add x17, x11, x8
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x12, x8]
; CHECK-NEXT:    ld1b { z7.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z16.d }, p0/z, [x16, #1, mul vl]
; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT:    adds x10, x10, x9
; CHECK-NEXT:    add x8, x8, x13
; CHECK-NEXT:    fcmla z1.d, p0/m, z7.d, z4.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #0
; CHECK-NEXT:    fcmla z3.d, p0/m, z19.d, z17.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z7.d, z4.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z16.d, z5.d, #90
; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #90
; CHECK-NEXT:    fcmla z3.d, p0/m, z19.d, z17.d, #90
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
; CHECK-NEXT:    uzp1 z5.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
; CHECK-NEXT:    uzp2 z0.d, z1.d, z0.d
; CHECK-NEXT:    fadd z1.d, z4.d, z5.d
; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z1.d
; CHECK-NEXT:    faddv d1, p0, z2.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 2
  %n.mod.vf = urem i64 1000, %1
  %n.vec = sub i64 1000, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 6
  %3 = shl nuw nsw i64 %0, 5
  %scevgep61 = getelementptr i8, ptr %b, i64 %3
  %scevgep63 = getelementptr i8, ptr %a, i64 %3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv38 = phi i64 [ %lsr.iv.next39, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv34 = phi i64 [ %lsr.iv.next35, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %30, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %31, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %26, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %27, %vector.body ]
  %scevgep57 = getelementptr i8, ptr %a, i64 %lsr.iv34
  %scevgep64 = getelementptr i8, ptr %scevgep63, i64 %lsr.iv34
  %scevgep58 = getelementptr i8, ptr %b, i64 %lsr.iv34
  %scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
  %wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
  %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
  %wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
  %wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
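  ; The fmul/fadd/fsub chains below are two copies of the scalarized complex
  ; multiply-accumulate (real: ac - bd, imag: ad + bc), one per unrolled
  ; iteration; each copy is folded by the backend into a #0/#90 fcmla pair,
  ; as checked above.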
  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
  %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
  %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
  %16 = fmul fast <vscale x 2 x double> %14, %6
  %17 = fmul fast <vscale x 2 x double> %15, %7
  %18 = fmul fast <vscale x 2 x double> %12, %8
  %19 = fmul fast <vscale x 2 x double> %13, %9
  %20 = fmul fast <vscale x 2 x double> %12, %6
  %21 = fmul fast <vscale x 2 x double> %13, %7
  %22 = fadd fast <vscale x 2 x double> %20, %vec.phi13
  %23 = fadd fast <vscale x 2 x double> %21, %vec.phi14
  %24 = fmul fast <vscale x 2 x double> %14, %8
  %25 = fmul fast <vscale x 2 x double> %15, %9
  %26 = fsub fast <vscale x 2 x double> %22, %24
  %27 = fsub fast <vscale x 2 x double> %23, %25
  %28 = fadd fast <vscale x 2 x double> %18, %vec.phi
  %29 = fadd fast <vscale x 2 x double> %19, %vec.phi12
  %30 = fadd fast <vscale x 2 x double> %28, %16
  %31 = fadd fast <vscale x 2 x double> %29, %17
  %lsr.iv.next35 = add i64 %lsr.iv34, %2
  %lsr.iv.next39 = sub i64 %lsr.iv38, %1
  %32 = icmp eq i64 %lsr.iv.next39, 0
  br i1 %32, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %bin.rdx15 = fadd fast <vscale x 2 x double> %27, %26
  %33 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx15)
  %bin.rdx = fadd fast <vscale x 2 x double> %31, %30
  %34 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %33, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %34, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Integer and floating point complex number reduction in the same loop:
;
; complex<double> *s = ...;
; int *a = ...;
;
; for (int i = 0; i < N; ++i) {
;   sum += s[i];
;   int_sum += a[i];
; }
;
define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 {
; CHECK-LABEL: reduction_mix:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.d, #0 // =0x0
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    mov w11, #100 // =0x64
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    add x0, x0, x11
; CHECK-NEXT:    ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT:    add x8, x8, x9
; CHECK-NEXT:    cmp x10, x8
; CHECK-NEXT:    fadd z0.d, z4.d, z0.d
; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
; CHECK-NEXT:    add z2.d, z5.d, z2.d
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    uaddv d2, p0, z2.d
; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
; CHECK-NEXT:    fmov x8, d2
; CHECK-NEXT:    faddv d0, p0, z3.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    str w8, [x4]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = tail call i64 @llvm.vscale.i64()
  %3 = shl nuw nsw i64 %2, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <vscale x 2 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %9, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %10, %vector.body ]
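  ; Integer leg of the mixed reduction: each iteration loads <vscale x 2 x i32>
  ; elements of the int array and accumulates them into %vec.phi, alongside
  ; the two fp accumulators holding the deinterleaved complex values.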
  %4 = getelementptr inbounds i32, ptr %s, i64 %index
  %wide.load = load <vscale x 2 x i32>, ptr %4, align 4
  %5 = add <vscale x 2 x i32> %wide.load, %vec.phi
  %6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
  %wide.vec = load <vscale x 4 x double>, ptr %6, align 8
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
  %10 = fadd fast <vscale x 2 x double> %8, %vec.phi14
  %index.next = add nuw i64 %index, %3
  %11 = icmp eq i64 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %12 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %10)
  %13 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %9)
  %14 = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %5)
  store i32 %14, ptr %outs, align 4
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %12, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %13, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

declare i64 @llvm.vscale.i64()
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
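
; The CHECK lines in this file are autogenerated (see the NOTE at the top).
; After editing the IR or the RUN line, regenerate them with
; utils/update_llc_test_checks.py rather than updating the assertions by hand.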