; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

; Tests for AArch64 selection of structured loads/stores (ld2/ld3/ld4,
; st2/st3/st4) and of trn/zip/uzp sequences from shufflevector patterns.
;
; NOTE(review): in this copy every constant-vector operand written as
; `<iN ...>` is absent: each shufflevector ends right after its mask type, and
; the second source vector of the shuffles in @debuginfo is empty. Those
; operands must be restored from the original test before this file will parse.

; Loop that loads <8 x float>, squares it twice and adds two <4 x float>
; shuffles of the products. Expected code uses a post-indexed ld2.
define void @vld2(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2:
; CHECK:       .Lfunc_begin0:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32>
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32>
  %5 = fadd fast <4 x float> %4, %2
  store <4 x float> %5, ptr %next.gep19, align 4
  %index.next = add i64 %index, 4
  %6 = icmp eq i64 %index.next, 1024
  br i1 %6, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; Three-way variant on a <12 x float> load. Expected code uses a post-indexed
; ld3 followed by fmul + two fmla.
define void @vld3(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3:
; CHECK:       .Lfunc_begin1:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32>
  %3 = fmul fast <12 x float> %wide.vec, %wide.vec
  %4 = shufflevector <12 x float> %3, <12 x float> undef, <4 x i32>
  %5 = fadd fast <4 x float> %4, %2
  %6 = fmul fast <12 x float> %wide.vec, %wide.vec
  %7 = shufflevector <12 x float> %6, <12 x float> undef, <4 x i32>
  %8 = fadd fast <4 x float> %5, %7
  store <4 x float> %8, ptr %next.gep23, align 4
  %index.next = add i64 %index, 4
  %9 = icmp eq i64 %index.next, 1024
  br i1 %9, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; Four-way variant on a <16 x float> load whose two partial sums are shuffled
; together into one <8 x float> store. Expected code uses ld4 and st2.
define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4:
; CHECK:       .Lfunc_begin2:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32>
  %4 = fmul fast <16 x float> %wide.vec, %wide.vec
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32>
  %6 = fadd fast <4 x float> %5, %3
  %7 = fmul fast <16 x float> %wide.vec, %wide.vec
  %8 = shufflevector <16 x float> %7, <16 x float> undef, <4 x i32>
  %9 = fmul fast <16 x float> %wide.vec, %wide.vec
  %10 = shufflevector <16 x float> %9, <16 x float> undef, <4 x i32>
  %11 = fadd fast <4 x float> %10, %8
  %12 = getelementptr inbounds float, ptr %pDst, i64 %1
  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %11, <8 x i32>
  store <8 x float> %interleaved.vec, ptr %12, align 4
  %index.next = add i64 %index, 4
  %13 = icmp eq i64 %index.next, 1024
  br i1 %13, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; The fmul takes its operands from two different <8 x float> loads. Expected
; code uses one ld2 per source.
define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: twosrc:
; CHECK:       .Lfunc_begin3:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
; CHECK-NEXT:    str q4, [x2], #16
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %next.gep23 = getelementptr float, ptr %pSrc2, i64 %1
  %next.gep24 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %wide.vec26 = load <8 x float>, ptr %next.gep23, align 4
  %2 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32>
  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32>
  %6 = fadd fast <4 x float> %5, %3
  store <4 x float> %6, ptr %next.gep24, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; As @vld2, but both shuffles read the same fmul result (%1) instead of two
; separate fmuls. The expected code matches @vld2.
define void @vld2_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2_multiuse:
; CHECK:       .Lfunc_begin4:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB4_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB4_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32>
  %3 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32>
  %4 = fadd fast <4 x float> %3, %2
  store <4 x float> %4, ptr %next.gep19, align 4
  %index.next = add i64 %index, 4
  %5 = icmp eq i64 %index.next, 1024
  br i1 %5, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; As @vld3, but all three shuffles read the same fmul result (%1).
define void @vld3_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3_multiuse:
; CHECK:       .Lfunc_begin5:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB5_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB5_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32>
  %3 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32>
  %4 = fadd fast <4 x float> %3, %2
  %5 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32>
  %6 = fadd fast <4 x float> %4, %5
  store <4 x float> %6, ptr %next.gep23, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; As @vld4, but all four shuffles read the same fmul result (%2).
define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4_multiuse:
; CHECK:       .Lfunc_begin6:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB6_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32>
  %4 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32>
  %5 = fadd fast <4 x float> %4, %3
  %6 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32>
  %7 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32>
  %8 = fadd fast <4 x float> %7, %6
  %9 = getelementptr inbounds float, ptr %pDst, i64 %1
  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %8, <8 x i32>
  store <8 x float> %interleaved.vec, ptr %9, align 4
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 1024
  br i1 %10, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; This example has store(shuffle(shuffle(... that would be better to be treated
; as a single store. This avoids the vld2 for data that is already shuffled.
define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler:
; CHECK:       .Lfunc_begin7:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #64]
; CHECK-NEXT:    ldp q4, q5, [x0, #32]
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn1 v2.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v3.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    str q2, [x0, #64]
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32>
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32>
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}

; Same as above with some different shuffles
define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler2:
; CHECK:       .Lfunc_begin8:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q2, [x0]
; CHECK-NEXT:    ldp q3, q4, [x0, #64]
; CHECK-NEXT:    ldp q5, q6, [x0, #32]
; CHECK-NEXT:    ldp q7, q16, [x0, #96]
; CHECK-NEXT:    mov v0.h[5], v2.h[4]
; CHECK-NEXT:    zip1 v2.8h, v3.8h, v4.8h
; CHECK-NEXT:    zip1 v3.8h, v5.8h, v6.8h
; CHECK-NEXT:    mov v7.h[5], v16.h[4]
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    str q2, [x0, #64]
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32>
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32>
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}

; Eight <8 x i16> rows are loaded through eight separate pointer arguments,
; run through three levels of shuffles, and stored back through the same
; pointers.
define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
; CHECK-LABEL: transpose_s16_8x8:
; CHECK:       .Lfunc_begin9:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ldr q3, [x4]
; CHECK-NEXT:    ldr q4, [x5]
; CHECK-NEXT:    ldr q2, [x2]
; CHECK-NEXT:    ldr q5, [x3]
; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ldr q6, [x6]
; CHECK-NEXT:    ldr q7, [x7]
; CHECK-NEXT:    trn1 v17.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn2 v1.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn1 v18.8h, v2.8h, v5.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v5.8h
; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v4.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v6.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v18.4s, v19.4s
; CHECK-NEXT:    trn1 v7.4s, v2.4s, v3.4s
; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
; CHECK-NEXT:    trn2 v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
; CHECK-NEXT:    zip2 v3.4s, v6.4s, v7.4s
; CHECK-NEXT:    zip2 v4.4s, v16.4s, v17.4s
; CHECK-NEXT:    st2 { v6.2s, v7.2s }, [x1]
; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x2]
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    str q2, [x4]
; CHECK-NEXT:    str q3, [x5]
; CHECK-NEXT:    str q4, [x6]
; CHECK-NEXT:    str q0, [x7]
; CHECK-NEXT:    ret
  %9 = load <8 x i16>, ptr %0, align 16
  %10 = load <8 x i16>, ptr %1, align 16
  %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32>
  %12 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32>
  %13 = load <8 x i16>, ptr %2, align 16
  %14 = load <8 x i16>, ptr %3, align 16
  %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32>
  %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32>
  %17 = load <8 x i16>, ptr %4, align 16
  %18 = load <8 x i16>, ptr %5, align 16
  %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32>
  %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32>
  %21 = load <8 x i16>, ptr %6, align 16
  %22 = load <8 x i16>, ptr %7, align 16
  %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32>
  %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32>
  %25 = bitcast <8 x i16> %11 to <4 x i32>
  %26 = bitcast <8 x i16> %19 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
  %29 = bitcast <8 x i16> %12 to <4 x i32>
  %30 = bitcast <8 x i16> %20 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
  %33 = bitcast <8 x i16> %15 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
  %37 = bitcast <8 x i16> %16 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %1, align 16
  store <4 x i32> %45, ptr %2, align 16
  store <4 x i32> %47, ptr %3, align 16
  store <4 x i32> %42, ptr %4, align 16
  store <4 x i32> %44, ptr %5, align 16
  store <4 x i32> %46, ptr %6, align 16
  store <4 x i32> %48, ptr %7, align 16
  ret void
}

; Same instruction structure as @transpose_s16_8x8, but the eight rows sit at
; consecutive <8 x i16> offsets from a single pointer, so the expected code
; uses paired ldp/stp accesses.
define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK-LABEL: transpose_s16_8x8_:
; CHECK:       .Lfunc_begin10:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #32]
; CHECK-NEXT:    ldp q4, q5, [x0, #64]
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v18.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v4.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v7.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v6.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn1 v19.4s, v2.4s, v4.4s
; CHECK-NEXT:    trn2 v1.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v2.4s, v2.4s, v4.4s
; CHECK-NEXT:    zip1 v3.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip1 v4.4s, v7.4s, v19.4s
; CHECK-NEXT:    zip1 v17.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip1 v18.4s, v0.4s, v2.4s
; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip2 v1.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    stp q3, q4, [x0]
; CHECK-NEXT:    zip2 v3.4s, v7.4s, v19.4s
; CHECK-NEXT:    stp q17, q18, [x0, #32]
; CHECK-NEXT:    stp q1, q0, [x0, #96]
; CHECK-NEXT:    stp q5, q3, [x0, #64]
; CHECK-NEXT:    ret
  %2 = load <8 x i16>, ptr %0, align 16
  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
  %4 = load <8 x i16>, ptr %3, align 16
  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32>
  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32>
  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
  %8 = load <8 x i16>, ptr %7, align 16
  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
  %10 = load <8 x i16>, ptr %9, align 16
  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32>
  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32>
  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
  %14 = load <8 x i16>, ptr %13, align 16
  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
  %16 = load <8 x i16>, ptr %15, align 16
  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32>
  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32>
  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
  %20 = load <8 x i16>, ptr %19, align 16
  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
  %22 = load <8 x i16>, ptr %21, align 16
  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32>
  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32>
  %25 = bitcast <8 x i16> %5 to <4 x i32>
  %26 = bitcast <8 x i16> %17 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
  %29 = bitcast <8 x i16> %6 to <4 x i32>
  %30 = bitcast <8 x i16> %18 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
  %33 = bitcast <8 x i16> %11 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
  %37 = bitcast <8 x i16> %12 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %3, align 16
  store <4 x i32> %45, ptr %7, align 16
  store <4 x i32> %47, ptr %9, align 16
  store <4 x i32> %42, ptr %13, align 16
  store <4 x i32> %44, ptr %15, align 16
  store <4 x i32> %46, ptr %19, align 16
  store <4 x i32> %48, ptr %21, align 16
  ret void
}

; Two shuffles of (%a0, %a1) combined into a single <8 x i32> store.
; Expected code uses trn1 x2 + st2.
define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2:
; CHECK:       .Lfunc_begin11:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32>
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; The combined shuffle is split into two <4 x i32> results stored through two
; unrelated pointers. Expected code uses plain str, not st2.
define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high:
; CHECK:       .Lfunc_begin12:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip2 v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32>
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32>
  %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

; As @store_factor2_high, but the two stored shuffles read %a0/%a1 directly.
define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high2:
; CHECK:       .Lfunc_begin13:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v2.4s, v2.4s, v1.4s
; CHECK-NEXT:    str q2, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

; Three shuffled inputs combined into one <12 x i32> store. Expected code
; ends in st3.
define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: store_factor3:
; CHECK:       .Lfunc_begin14:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
; CHECK-NEXT:    mov v3.s[0], v0.s[0]
; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
; CHECK-NEXT:    mov v4.s[0], v1.s[0]
; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
; CHECK-NEXT:    mov v5.s[0], v2.s[0]
; CHECK-NEXT:    st3 { v3.4s, v4.4s, v5.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32>
  store <12 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; Four shuffled inputs combined into one <16 x i32> store. Expected code uses
; trn1 x4 + st4.
define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: store_factor4:
; CHECK:       .Lfunc_begin15:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v4.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v1.4s, v2.4s
; CHECK-NEXT:    trn1 v6.4s, v2.4s, v3.4s
; CHECK-NEXT:    trn1 v7.4s, v3.4s, v0.4s
; CHECK-NEXT:    st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32>
  %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32>
  store <16 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; Two shuffles of %a stored to adjacent 16-byte slots (%buf and %buf + 4 x i32)
; with an llvm.dbg.value call between the two stores. Expected code is
; zip1/zip2 against a zero vector followed by a single stp.
define void @debuginfo(ptr nocapture noundef writeonly %buf, <8 x i16> noundef %a) {
; CHECK-LABEL: debuginfo:
; CHECK:       .Lfunc_begin16:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    zip2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    stp q2, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> , <8 x i32>
  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> , <8 x i32>
  store <8 x i16> %vzip.i, ptr %buf, align 4
  call void @llvm.dbg.value(metadata <8 x i16> %vzip1.i, metadata !21, metadata !DIExpression()), !dbg !23
  %add.ptr = getelementptr inbounds i32, ptr %buf, i64 4
  store <8 x i16> %vzip1.i, ptr %add.ptr, align 4
  ret void
}

declare void @llvm.dbg.value(metadata, metadata, metadata)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8, !9, !10, !11}

!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "a64.c", directory: "", checksumkind: CSK_MD5, checksum: "a1a236fb20d703d1ea5963e75545b91a")
!2 = !{!15}
!3 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!4 = !{!5}
!5 = !DISubrange(count: 8)
!6 = !{i32 7, !"Dwarf Version", i32 5}
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !{i32 1, !"wchar_size", i32 4}
!9 = !{i32 7, !"uwtable", i32 2}
!10 = !{i32 7, !"frame-pointer", i32 1}
!11 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
!12 = !DISubroutineType(types: !13)
!13 = !{null, !14, !15}
!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3, size: 64)
!15 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !16)
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int16x8_t", file: !1, line: 57, baseType: !17)
!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 128, flags: DIFlagVector, elements: !4)
!18 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
!19 = distinct !DISubprogram(name: "store_s16q_to_tran_low_", scope: !1, file: !1, line: 13, type: !12, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20)
!20 = !{!21}
!21 = !DILocalVariable(name: "__s1", scope: !22, file: !1, line: 16, type: !16)
!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 16, column: 3)
!23 = !DILocation(line: 0, scope: !22)