; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
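; A note on the lowering pattern, as inferred from the checks below: the first
; 128-bit chunk of a reduction is summed with VADDV, which starts a fresh
; scalar accumulator, and each subsequent chunk is folded in with VADDVA,
; which adds onto that accumulator. A well-lowered unrolled reduction should
; therefore cost roughly one vector load plus one accumulate per chunk.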
define i32 @addv2i32i32(ptr %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %x, align 4
  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
  %1 = load i32, ptr %arrayidx.1, align 4
  %add.1 = add nsw i32 %1, %0
  ret i32 %add.1
}

define i32 @addv4i32i32(ptr %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  ret i32 %1
}

define i32 @addv8i32i32(ptr %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  ret i32 %1
}

define i32 @addv16i32i32(ptr %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  ret i32 %1
}

define i32 @addv24i32i32(ptr %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
  %1 = load <16 x i32>, ptr %arrayidx.8, align 4
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %op.rdx = add nsw i32 %2, %3
  ret i32 %op.rdx
}

define i32 @addv32i32i32(ptr %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0)
  ret i32 %1
}

define i32 @addv64i32i32(ptr %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
  ret i32 %1
}

define i32 @addv128i32i32(ptr %x) {
; CHECK-LABEL: addv128i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #256]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #272]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #288]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #304]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #320]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #336]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #352]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #368]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #384]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #400]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #416]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #432]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #448]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #464]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #480]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #496]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %1 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
  %3 = add i32 %2, %0
  %4 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %4, align 4
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
  %6 = add i32 %5, %3
  %7 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %7, align 4
  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
  %9 = add i32 %8, %6
  %10 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %10, align 4
  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
  %12 = add i32 %11, %9
  %13 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %13, align 4
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
  %15 = add i32 %14, %12
  %16 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %16, align 4
  %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
  %18 = add i32 %17, %15
  %19 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %19, align 4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
  %21 = add i32 %20, %18
  %22 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %22, align 4
  %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
  %24 = add i32 %23, %21
  %25 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %25, align 4
  %26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
  %27 = add i32 %26, %24
  %28 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %28, align 4
  %29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
  %30 = add i32 %29, %27
  %31 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %31, align 4
  %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
  %33 = add i32 %32, %30
  %34 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %34, align 4
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
  %36 = add i32 %35, %33
  %37 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %37, align 4
  %38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
  %39 = add i32 %38, %36
  %40 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %40, align 4
  %41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
  %42 = add i32 %41, %39
  %43 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %43, align 4
  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
  %45 = add i32 %44, %42
  %46 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %46, align 4
  %47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
  %48 = add i32 %47, %45
  %49 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %49, align 4
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
  %51 = add i32 %50, %48
  %52 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %52, align 4
  %53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
  %54 = add i32 %53, %51
  %55 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %55, align 4
  %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
  %57 = add i32 %56, %54
  %58 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %58, align 4
  %59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
  %60 = add i32 %59, %57
  %61 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %61, align 4
  %62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
  %63 = add i32 %62, %60
  %64 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %64, align 4
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
  %66 = add i32 %65, %63
  %67 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %67, align 4
  %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
  %69 = add i32 %68, %66
  %70 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %70, align 4
  %71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
  %72 = add i32 %71, %69
  %73 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %73, align 4
  %74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
  %75 = add i32 %74, %72
  %76 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %76, align 4
  %77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
  %78 = add i32 %77, %75
  %79 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %79, align 4
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
  %81 = add i32 %80, %78
  %82 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %82, align 4
  %83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
  %84 = add i32 %83, %81
  %85 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %85, align 4
  %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
  %87 = add i32 %86, %84
  %88 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %88, align 4
  %89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
  %90 = add i32 %89, %87
  %91 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %91, align 4
  %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
  %93 = add i32 %92, %90
  ret i32 %93
}
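; Reductions of i16 data sign-extended to i32. Narrow cases are expected to
; use widening vldrh.s32 loads; full 8-lane blocks can instead be summed with
; vaddv.s16 / vaddva.s16, which sign-extend internally.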
define i32 @addv2i32i16(ptr %x) {
; CHECK-LABEL: addv2i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrsh.w r1, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %1 to i32
  %add.1 = add nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i16(ptr %x) {
; CHECK-LABEL: addv4i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i16(ptr %x) {
; CHECK-LABEL: addv8i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i16(ptr %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i16(ptr %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16
  %2 = load <8 x i16>, ptr %arrayidx.16, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i16(ptr %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

define i32 @addv64i32i16(ptr %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #88]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #112]
; CHECK-NEXT:    ldrsh.w r0, [r0, #126]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32
  %2 = load <16 x i16>, ptr %arrayidx.32, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48
  %4 = load <8 x i16>, ptr %arrayidx.48, align 2
  %5 = sext <8 x i16> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56
  %6 = load <4 x i16>, ptr %arrayidx.56, align 2
  %7 = sext <4 x i16> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60
  %8 = load i16, ptr %arrayidx.60, align 2
  %conv.60 = sext i16 %8 to i32
  %arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61
  %9 = load i16, ptr %arrayidx.61, align 2
  %conv.61 = sext i16 %9 to i32
  %arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62
  %10 = load i16, ptr %arrayidx.62, align 2
  %conv.62 = sext i16 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nsw i32 %op.rdx8, %14
  %15 = add nsw i32 %op.rdx9, %conv.60
  %16 = add nsw i32 %15, %conv.61
  %17 = add nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63
  %18 = load i16, ptr %arrayidx.63, align 2
  %conv.63 = sext i16 %18 to i32
  %add.63 = add nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i16(ptr %x) {
; CHECK-LABEL: addv128i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.s16 r2, q1
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %10, align 2
  %11 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %14, align 2
  %15 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %22, align 2
  %23 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %26, align 2
  %27 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27)
  %29 = add i32 %28, %25
  %30 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %30, align 2
  %31 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31)
  %33 = add i32 %32, %29
  %34 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
  %37 = add i32 %36, %33
  %38 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %38, align 2
  %39 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
  %41 = add i32 %40, %37
  %42 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %42, align 2
  %43 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %41
  %46 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
  %49 = add i32 %48, %45
  %50 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %50, align 2
  %51 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51)
  %53 = add i32 %52, %49
  %54 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %54, align 2
  %55 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
  %57 = add i32 %56, %53
  %58 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %58, align 2
  %59 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59)
  %61 = add i32 %60, %57
  ret i32 %61
}
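; Reductions of i8 data zero-extended to i32, using the widening vldrb.u32
; and vldrb.u16 loads or the internally-extending vaddv.u8 / vaddva.u16
; forms.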
define i32 @addv2i32i8(ptr %x) {
; CHECK-LABEL: addv2i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %1 to i32
  %add.1 = add nuw nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i8(ptr %x) {
; CHECK-LABEL: addv4i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i8(ptr %x) {
; CHECK-LABEL: addv8i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i8(ptr %x) {
; CHECK-LABEL: addv16i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i8(ptr %x) {
; CHECK-LABEL: addv24i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16
  %2 = load <8 x i8>, ptr %arrayidx.16, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nuw nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i8(ptr %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

define i32 @addv64i32i8(ptr %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    ldrb.w r1, [r0, #60]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrb.w r3, [r0, #61]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    ldrb.w r12, [r0, #62]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #56]
; CHECK-NEXT:    ldrb.w r0, [r0, #63]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32
  %2 = load <16 x i8>, ptr %arrayidx.32, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48
  %4 = load <8 x i8>, ptr %arrayidx.48, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56
  %6 = load <4 x i8>, ptr %arrayidx.56, align 1
  %7 = zext <4 x i8> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60
  %8 = load i8, ptr %arrayidx.60, align 1
  %conv.60 = zext i8 %8 to i32
  %arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61
  %9 = load i8, ptr %arrayidx.61, align 1
  %conv.61 = zext i8 %9 to i32
  %arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62
  %10 = load i8, ptr %arrayidx.62, align 1
  %conv.62 = zext i8 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nuw nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nuw nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nuw nsw i32 %op.rdx8, %14
  %15 = add nuw nsw i32 %op.rdx9, %conv.60
  %16 = add nuw nsw i32 %15, %conv.61
  %17 = add nuw nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63
  %18 = load i8, ptr %arrayidx.63, align 1
  %conv.63 = zext i8 %18 to i32
  %add.63 = add nuw nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i8(ptr %x) {
; CHECK-LABEL: addv128i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %10, align 1
  %11 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %14, align 1
  %15 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %22, align 1
  %23 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %26, align 1
  %27 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27)
  %29 = add i32 %28, %25
  ret i32 %29
}
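; i16 reductions of i16 data. The sum can be accumulated in a 32-bit
; register, with a single final sxth producing the returned i16.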
define signext i16 @addv2i16i16(ptr %x) {
; CHECK-LABEL: addv2i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r1, [r0]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %add.1 = add i16 %1, %0
  ret i16 %add.1
}

define signext i16 @addv4i16i16(ptr %x) {
; CHECK-LABEL: addv4i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0)
  ret i16 %1
}

define signext i16 @addv8i16i16(ptr %x) {
; CHECK-LABEL: addv8i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  ret i16 %1
}

define signext i16 @addv16i16i16(ptr %x) {
; CHECK-LABEL: addv16i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0)
  ret i16 %1
}

define signext i16 @addv24i16i16(ptr %x) {
; CHECK-LABEL: addv24i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %1 = load <16 x i16>, ptr %arrayidx.8, align 2
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %op.rdx = add i16 %2, %3
  ret i16 %op.rdx
}

define signext i16 @addv32i16i16(ptr %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0)
  ret i16 %1
}

define signext i16 @addv64i16i16(ptr %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0)
  ret i16 %1
}

define signext i16 @addv128i16i16(ptr %x) {
; CHECK-LABEL: addv128i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
  %1 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %1, align 2
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
  %3 = add i16 %2, %0
  %4 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %4, align 2
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
  %6 = add i16 %5, %3
  %7 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %7, align 2
  %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
  %9 = add i16 %8, %6
  %10 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %10, align 2
  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
  %12 = add i16 %11, %9
  %13 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %13, align 2
  %14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
  %15 = add i16 %14, %12
  %16 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %16, align 2
  %17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
  %18 = add i16 %17, %15
  %19 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %19, align 2
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
  %21 = add i16 %20, %18
  %22 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %22, align 2
  %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
  %24 = add i16 %23, %21
  %25 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %25, align 2
  %26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
  %27 = add i16 %26, %24
  %28 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %28, align 2
  %29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
  %30 = add i16 %29, %27
  %31 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %31, align 2
  %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
  %33 = add i16 %32, %30
  %34 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %34, align 2
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
  %36 = add i16 %35, %33
  %37 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %37, align 2
  %38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
  %39 = add i16 %38, %36
  %40 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %40, align 2
  %41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
  %42 = add i16 %41, %39
  %43 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %43, align 2
  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
  %45 = add i16 %44, %42
  ret i16 %45
}
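; i8 reductions of i8 data, as above but with a final uxtb for the zeroext
; return.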
define zeroext i8 @addv2i8i8(ptr %x) {
; CHECK-LABEL: addv2i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %add.1 = add i8 %1, %0
  ret i8 %add.1
}

define zeroext i8 @addv4i8i8(ptr %x) {
; CHECK-LABEL: addv4i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv8i8i8(ptr %x) {
; CHECK-LABEL: addv8i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv16i8i8(ptr %x) {
; CHECK-LABEL: addv16i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv24i8i8(ptr %x) {
; CHECK-LABEL: addv24i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %1 = load <16 x i8>, ptr %arrayidx.8, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
  %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  %op.rdx = add i8 %2, %3
  ret i8 %op.rdx
}

define zeroext i8 @addv32i8i8(ptr %x) {
; CHECK-LABEL: addv32i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv64i8i8(ptr %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv128i8i8(ptr %x) {
; CHECK-LABEL: addv128i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
  %1 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %1, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
  %3 = add i8 %2, %0
  %4 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %4, align 1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
  %6 = add i8 %5, %3
  %7 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %7, align 1
  %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
  %9 = add i8 %8, %6
  %10 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %10, align 1
  %11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
  %12 = add i8 %11, %9
  %13 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %13, align 1
  %14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
  %15 = add i8 %14, %12
  %16 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %16, align 1
  %17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
  %18 = add i8 %17, %15
  %19 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %19, align 1
  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
  %21 = add i8 %20, %18
  ret i8 %21
}
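; Multiply-accumulate (dot product) reductions of i32 data, expected to use
; VMLAV for the first chunk and VMLAVA to accumulate the remaining chunks.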
define i32 @mlav2i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r2, [r0]
; CHECK-NEXT:    ldrd r1, r3, [r1]
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    mla r0, r3, r2, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %x, align 4
  %1 = load i32, ptr %y, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
  %2 = load i32, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1
  %3 = load i32, ptr %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %3, %2
  %add.1 = add nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %x, align 4
  %1 = load <4 x i32>, ptr %y, align 4
  %2 = mul nsw <4 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  ret i32 %3
}

define i32 @mlav8i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = load <8 x i32>, ptr %y, align 4
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  ret i32 %3
}

define i32 @mlav16i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i32>, ptr %x, align 4
  %1 = load <16 x i32>, ptr %y, align 4
  %2 = mul nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  ret i32 %3
}

define i32 @mlav24i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = load <8 x i32>, ptr %y, align 4
  %2 = mul nsw <8 x i32> %1, %0
  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8
  %3 = load <16 x i32>, ptr %arrayidx.8, align 4
  %4 = load <16 x i32>, ptr %arrayidx1.8, align 4
  %5 = mul nsw <16 x i32> %4, %3
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %op.rdx = add nsw i32 %6, %7
  ret i32 %op.rdx
}

define i32 @mlav32i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = load <32 x i32>, ptr %y, align 4
  %2 = mul nsw <32 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
  ret i32 %3
}

define i32 @mlav64i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  ret i32 %76
}

define i32 @mlav128i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #256]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #256]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #272]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #272]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #288]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #288]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #304]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #304]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #320]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #320]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #336]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #336]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #352]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #352]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #368]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #368]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #384]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #384]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #400]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #400]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #416]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #416]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #432]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #432]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #448]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #448]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #464]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #464]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #480]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #480]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #496]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #496]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  %77 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %77, align 4
  %78 = getelementptr inbounds i32, ptr %y, i32 64
  %wide.load10.16 = load <4 x i32>, ptr %78, align 4
  %79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79)
  %81 = add i32 %80, %76
  %82 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %82, align 4
  %83 = getelementptr inbounds i32, ptr %y, i32 68
  %wide.load10.17 = load <4 x i32>, ptr %83, align 4
  %84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
  %85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84)
  %86 = add i32 %85, %81
  %87 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %87, align 4
  %88 = getelementptr inbounds i32, ptr %y, i32 72
  %wide.load10.18 = load <4 x i32>, ptr %88, align 4
  %89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
  %90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89)
  %91 = add i32 %90, %86
  %92 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %92, align 4
  %93 = getelementptr inbounds i32, ptr %y, i32 76
  %wide.load10.19 = load <4 x i32>, ptr %93, align 4
  %94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
  %95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94)
  %96 = add i32 %95, %91
  %97 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %97, align 4
  %98 = getelementptr inbounds i32, ptr %y, i32 80
  %wide.load10.20 = load <4 x i32>, ptr %98, align 4
  %99 = mul nsw <4 x i32> %wide.load10.20,
%wide.load.20 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99) %101 = add i32 %100, %96 %102 = getelementptr inbounds i32, ptr %x, i32 84 %wide.load.21 = load <4 x i32>, ptr %102, align 4 %103 = getelementptr inbounds i32, ptr %y, i32 84 %wide.load10.21 = load <4 x i32>, ptr %103, align 4 %104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21 %105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104) %106 = add i32 %105, %101 %107 = getelementptr inbounds i32, ptr %x, i32 88 %wide.load.22 = load <4 x i32>, ptr %107, align 4 %108 = getelementptr inbounds i32, ptr %y, i32 88 %wide.load10.22 = load <4 x i32>, ptr %108, align 4 %109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22 %110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109) %111 = add i32 %110, %106 %112 = getelementptr inbounds i32, ptr %x, i32 92 %wide.load.23 = load <4 x i32>, ptr %112, align 4 %113 = getelementptr inbounds i32, ptr %y, i32 92 %wide.load10.23 = load <4 x i32>, ptr %113, align 4 %114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23 %115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114) %116 = add i32 %115, %111 %117 = getelementptr inbounds i32, ptr %x, i32 96 %wide.load.24 = load <4 x i32>, ptr %117, align 4 %118 = getelementptr inbounds i32, ptr %y, i32 96 %wide.load10.24 = load <4 x i32>, ptr %118, align 4 %119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24 %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119) %121 = add i32 %120, %116 %122 = getelementptr inbounds i32, ptr %x, i32 100 %wide.load.25 = load <4 x i32>, ptr %122, align 4 %123 = getelementptr inbounds i32, ptr %y, i32 100 %wide.load10.25 = load <4 x i32>, ptr %123, align 4 %124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25 %125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124) %126 = add i32 %125, %121 %127 = getelementptr inbounds i32, ptr %x, i32 104 %wide.load.26 = load <4 x i32>, ptr %127, align 4 %128 = getelementptr inbounds i32, ptr %y, i32 104 %wide.load10.26 = load <4 x i32>, ptr %128, align 4 %129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26 %130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129) %131 = add i32 %130, %126 %132 = getelementptr inbounds i32, ptr %x, i32 108 %wide.load.27 = load <4 x i32>, ptr %132, align 4 %133 = getelementptr inbounds i32, ptr %y, i32 108 %wide.load10.27 = load <4 x i32>, ptr %133, align 4 %134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27 %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134) %136 = add i32 %135, %131 %137 = getelementptr inbounds i32, ptr %x, i32 112 %wide.load.28 = load <4 x i32>, ptr %137, align 4 %138 = getelementptr inbounds i32, ptr %y, i32 112 %wide.load10.28 = load <4 x i32>, ptr %138, align 4 %139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28 %140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139) %141 = add i32 %140, %136 %142 = getelementptr inbounds i32, ptr %x, i32 116 %wide.load.29 = load <4 x i32>, ptr %142, align 4 %143 = getelementptr inbounds i32, ptr %y, i32 116 %wide.load10.29 = load <4 x i32>, ptr %143, align 4 %144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29 %145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144) %146 = add i32 %145, %141 %147 = getelementptr inbounds i32, ptr %x, i32 120 %wide.load.30 = load <4 x i32>, ptr %147, align 4 %148 = getelementptr inbounds i32, ptr %y, i32 120 %wide.load10.30 = load <4 x i32>, ptr %148, align 4 %149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30 %150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149) %151 = 
add i32 %150, %146 %152 = getelementptr inbounds i32, ptr %x, i32 124 %wide.load.31 = load <4 x i32>, ptr %152, align 4 %153 = getelementptr inbounds i32, ptr %y, i32 124 %wide.load10.31 = load <4 x i32>, ptr %153, align 4 %154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31 %155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154) %156 = add i32 %155, %151 ret i32 %156 } define i32 @mlav2i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav2i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrsh.w r2, [r0] ; CHECK-NEXT: ldrsh.w r3, [r1] ; CHECK-NEXT: ldrsh.w r0, [r0, #2] ; CHECK-NEXT: ldrsh.w r1, [r1, #2] ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: smlabb r0, r3, r2, r0 ; CHECK-NEXT: bx lr entry: %0 = load i16, ptr %x, align 2 %conv = sext i16 %0 to i32 %1 = load i16, ptr %y, align 2 %conv2 = sext i16 %1 to i32 %mul = mul nsw i32 %conv2, %conv %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 %2 = load i16, ptr %arrayidx.1, align 2 %conv.1 = sext i16 %2 to i32 %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1 %3 = load i16, ptr %arrayidx1.1, align 2 %conv2.1 = sext i16 %3 to i32 %mul.1 = mul nsw i32 %conv2.1, %conv.1 %add.1 = add nsw i32 %mul.1, %mul ret i32 %add.1 } define i32 @mlav4i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav4i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <4 x i16>, ptr %x, align 2 %1 = sext <4 x i16> %0 to <4 x i32> %2 = load <4 x i16>, ptr %y, align 2 %3 = sext <4 x i16> %2 to <4 x i32> %4 = mul nsw <4 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) ret i32 %5 } define i32 @mlav8i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav8i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.s16 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, ptr %x, align 2 %1 = sext <8 x i16> %0 to <8 x i32> %2 = load <8 x i16>, ptr %y, align 2 %3 = sext <8 x i16> %2 to <8 x i32> %4 = mul nsw <8 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) ret i32 %5 } define i32 @mlav16i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav16i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmlav.u32 r2, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vmlava.u32 r2, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] ; CHECK-NEXT: vmlava.u32 r2, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] ; CHECK-NEXT: vmlava.u32 r2, q1, q0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i16>, ptr %x, align 2 %1 = sext <16 x i16> %0 to <16 x i32> %2 = load <16 x i16>, ptr %y, align 2 %3 = sext <16 x i16> %2 to <16 x i32> %4 = mul nsw <16 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) ret i32 %5 } define i32 @mlav24i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav24i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #24] ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #32] ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; 
CHECK-NEXT: vldrh.s32 q0, [r2, #40] ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, ptr %x, align 2 %1 = sext <8 x i16> %0 to <8 x i32> %2 = load <8 x i16>, ptr %y, align 2 %3 = sext <8 x i16> %2 to <8 x i32> %4 = mul nsw <8 x i32> %3, %1 %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8 %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8 %5 = load <16 x i16>, ptr %arrayidx.8, align 2 %6 = sext <16 x i16> %5 to <16 x i32> %7 = load <16 x i16>, ptr %arrayidx1.8, align 2 %8 = sext <16 x i16> %7 to <16 x i32> %9 = mul nsw <16 x i32> %8, %6 %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9) %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) %op.rdx = add nsw i32 %10, %11 ret i32 %op.rdx } define i32 @mlav32i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav32i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #8] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #24] ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #32] ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #40] ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #48] ; CHECK-NEXT: vldrh.s32 q1, [r1, #48] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r2, #56] ; CHECK-NEXT: vldrh.s32 q1, [r1, #56] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i16>, ptr %x, align 2 %1 = sext <32 x i16> %0 to <32 x i32> %2 = load <32 x i16>, ptr %y, align 2 %3 = sext <32 x i16> %2 to <32 x i32> %4 = mul nsw <32 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) ret i32 %5 } define i32 @mlav64i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav64i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #64] ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #80] ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #96] ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #112] ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x i16>, ptr %x, align 2 %0 = sext <8 x i16> %wide.load to <8 x i32> %wide.load11 = load <8 x i16>, ptr %y, align 2 %1 = sext <8 x i16> %wide.load11 to <8 x i32> %2 = mul nsw <8 x i32> %1, %0 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) %4 = getelementptr inbounds i16, ptr %x, i32 8 %wide.load.1 = load <8 x i16>, ptr %4, align 2 %5 = sext <8 x i16> %wide.load.1 to <8 x i32> %6 = getelementptr 
inbounds i16, ptr %y, i32 8 %wide.load11.1 = load <8 x i16>, ptr %6, align 2 %7 = sext <8 x i16> %wide.load11.1 to <8 x i32> %8 = mul nsw <8 x i32> %7, %5 %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) %10 = add i32 %9, %3 %11 = getelementptr inbounds i16, ptr %x, i32 16 %wide.load.2 = load <8 x i16>, ptr %11, align 2 %12 = sext <8 x i16> %wide.load.2 to <8 x i32> %13 = getelementptr inbounds i16, ptr %y, i32 16 %wide.load11.2 = load <8 x i16>, ptr %13, align 2 %14 = sext <8 x i16> %wide.load11.2 to <8 x i32> %15 = mul nsw <8 x i32> %14, %12 %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) %17 = add i32 %16, %10 %18 = getelementptr inbounds i16, ptr %x, i32 24 %wide.load.3 = load <8 x i16>, ptr %18, align 2 %19 = sext <8 x i16> %wide.load.3 to <8 x i32> %20 = getelementptr inbounds i16, ptr %y, i32 24 %wide.load11.3 = load <8 x i16>, ptr %20, align 2 %21 = sext <8 x i16> %wide.load11.3 to <8 x i32> %22 = mul nsw <8 x i32> %21, %19 %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22) %24 = add i32 %23, %17 %25 = getelementptr inbounds i16, ptr %x, i32 32 %wide.load.4 = load <8 x i16>, ptr %25, align 2 %26 = sext <8 x i16> %wide.load.4 to <8 x i32> %27 = getelementptr inbounds i16, ptr %y, i32 32 %wide.load11.4 = load <8 x i16>, ptr %27, align 2 %28 = sext <8 x i16> %wide.load11.4 to <8 x i32> %29 = mul nsw <8 x i32> %28, %26 %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29) %31 = add i32 %30, %24 %32 = getelementptr inbounds i16, ptr %x, i32 40 %wide.load.5 = load <8 x i16>, ptr %32, align 2 %33 = sext <8 x i16> %wide.load.5 to <8 x i32> %34 = getelementptr inbounds i16, ptr %y, i32 40 %wide.load11.5 = load <8 x i16>, ptr %34, align 2 %35 = sext <8 x i16> %wide.load11.5 to <8 x i32> %36 = mul nsw <8 x i32> %35, %33 %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36) %38 = add i32 %37, %31 %39 = getelementptr inbounds i16, ptr %x, i32 48 %wide.load.6 = load <8 x i16>, ptr %39, align 2 %40 = sext <8 x i16> %wide.load.6 to <8 x i32> %41 = getelementptr inbounds i16, ptr %y, i32 48 %wide.load11.6 = load <8 x i16>, ptr %41, align 2 %42 = sext <8 x i16> %wide.load11.6 to <8 x i32> %43 = mul nsw <8 x i32> %42, %40 %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43) %45 = add i32 %44, %38 %46 = getelementptr inbounds i16, ptr %x, i32 56 %wide.load.7 = load <8 x i16>, ptr %46, align 2 %47 = sext <8 x i16> %wide.load.7 to <8 x i32> %48 = getelementptr inbounds i16, ptr %y, i32 56 %wide.load11.7 = load <8 x i16>, ptr %48, align 2 %49 = sext <8 x i16> %wide.load11.7 to <8 x i32> %50 = mul nsw <8 x i32> %49, %47 %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50) %52 = add i32 %51, %45 ret i32 %52 } define i32 @mlav128i32i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav128i32i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #64] ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #80] ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #96] ; CHECK-NEXT: vldrh.u16 q1, [r1, 
#96] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #112] ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #128] ; CHECK-NEXT: vldrh.u16 q1, [r1, #128] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #144] ; CHECK-NEXT: vldrh.u16 q1, [r1, #144] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #160] ; CHECK-NEXT: vldrh.u16 q1, [r1, #160] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #176] ; CHECK-NEXT: vldrh.u16 q1, [r1, #176] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #192] ; CHECK-NEXT: vldrh.u16 q1, [r1, #192] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #208] ; CHECK-NEXT: vldrh.u16 q1, [r1, #208] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #224] ; CHECK-NEXT: vldrh.u16 q1, [r1, #224] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r2, #240] ; CHECK-NEXT: vldrh.u16 q1, [r1, #240] ; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x i16>, ptr %x, align 2 %0 = sext <8 x i16> %wide.load to <8 x i32> %wide.load11 = load <8 x i16>, ptr %y, align 2 %1 = sext <8 x i16> %wide.load11 to <8 x i32> %2 = mul nsw <8 x i32> %1, %0 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) %4 = getelementptr inbounds i16, ptr %x, i32 8 %wide.load.1 = load <8 x i16>, ptr %4, align 2 %5 = sext <8 x i16> %wide.load.1 to <8 x i32> %6 = getelementptr inbounds i16, ptr %y, i32 8 %wide.load11.1 = load <8 x i16>, ptr %6, align 2 %7 = sext <8 x i16> %wide.load11.1 to <8 x i32> %8 = mul nsw <8 x i32> %7, %5 %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) %10 = add i32 %9, %3 %11 = getelementptr inbounds i16, ptr %x, i32 16 %wide.load.2 = load <8 x i16>, ptr %11, align 2 %12 = sext <8 x i16> %wide.load.2 to <8 x i32> %13 = getelementptr inbounds i16, ptr %y, i32 16 %wide.load11.2 = load <8 x i16>, ptr %13, align 2 %14 = sext <8 x i16> %wide.load11.2 to <8 x i32> %15 = mul nsw <8 x i32> %14, %12 %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) %17 = add i32 %16, %10 %18 = getelementptr inbounds i16, ptr %x, i32 24 %wide.load.3 = load <8 x i16>, ptr %18, align 2 %19 = sext <8 x i16> %wide.load.3 to <8 x i32> %20 = getelementptr inbounds i16, ptr %y, i32 24 %wide.load11.3 = load <8 x i16>, ptr %20, align 2 %21 = sext <8 x i16> %wide.load11.3 to <8 x i32> %22 = mul nsw <8 x i32> %21, %19 %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22) %24 = add i32 %23, %17 %25 = getelementptr inbounds i16, ptr %x, i32 32 %wide.load.4 = load <8 x i16>, ptr %25, align 2 %26 = sext <8 x i16> %wide.load.4 to <8 x i32> %27 = getelementptr inbounds i16, ptr %y, i32 32 %wide.load11.4 = load <8 x i16>, ptr %27, align 2 %28 = sext <8 x i16> %wide.load11.4 to <8 x i32> %29 = mul nsw <8 x i32> %28, %26 %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29) %31 = add i32 %30, %24 %32 = getelementptr inbounds i16, ptr %x, i32 40 %wide.load.5 = load <8 x i16>, ptr %32, align 2 %33 = sext <8 x i16> %wide.load.5 to <8 x i32> %34 = getelementptr inbounds i16, ptr %y, i32 40 %wide.load11.5 = load <8 x i16>, ptr %34, align 2 %35 = sext <8 x i16> %wide.load11.5 to <8 x i32> %36 = mul nsw <8 x i32> %35, %33 %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36) %38 = add i32 %37, %31 %39 = getelementptr inbounds i16, ptr %x, i32 48 %wide.load.6 = load <8 x i16>, ptr %39, align 2 %40 = sext <8 x i16> %wide.load.6 to <8 x i32> %41 = getelementptr 
inbounds i16, ptr %y, i32 48 %wide.load11.6 = load <8 x i16>, ptr %41, align 2 %42 = sext <8 x i16> %wide.load11.6 to <8 x i32> %43 = mul nsw <8 x i32> %42, %40 %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43) %45 = add i32 %44, %38 %46 = getelementptr inbounds i16, ptr %x, i32 56 %wide.load.7 = load <8 x i16>, ptr %46, align 2 %47 = sext <8 x i16> %wide.load.7 to <8 x i32> %48 = getelementptr inbounds i16, ptr %y, i32 56 %wide.load11.7 = load <8 x i16>, ptr %48, align 2 %49 = sext <8 x i16> %wide.load11.7 to <8 x i32> %50 = mul nsw <8 x i32> %49, %47 %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50) %52 = add i32 %51, %45 %53 = getelementptr inbounds i16, ptr %x, i32 64 %wide.load.8 = load <8 x i16>, ptr %53, align 2 %54 = sext <8 x i16> %wide.load.8 to <8 x i32> %55 = getelementptr inbounds i16, ptr %y, i32 64 %wide.load11.8 = load <8 x i16>, ptr %55, align 2 %56 = sext <8 x i16> %wide.load11.8 to <8 x i32> %57 = mul nsw <8 x i32> %56, %54 %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57) %59 = add i32 %58, %52 %60 = getelementptr inbounds i16, ptr %x, i32 72 %wide.load.9 = load <8 x i16>, ptr %60, align 2 %61 = sext <8 x i16> %wide.load.9 to <8 x i32> %62 = getelementptr inbounds i16, ptr %y, i32 72 %wide.load11.9 = load <8 x i16>, ptr %62, align 2 %63 = sext <8 x i16> %wide.load11.9 to <8 x i32> %64 = mul nsw <8 x i32> %63, %61 %65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64) %66 = add i32 %65, %59 %67 = getelementptr inbounds i16, ptr %x, i32 80 %wide.load.10 = load <8 x i16>, ptr %67, align 2 %68 = sext <8 x i16> %wide.load.10 to <8 x i32> %69 = getelementptr inbounds i16, ptr %y, i32 80 %wide.load11.10 = load <8 x i16>, ptr %69, align 2 %70 = sext <8 x i16> %wide.load11.10 to <8 x i32> %71 = mul nsw <8 x i32> %70, %68 %72 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %71) %73 = add i32 %72, %66 %74 = getelementptr inbounds i16, ptr %x, i32 88 %wide.load.11 = load <8 x i16>, ptr %74, align 2 %75 = sext <8 x i16> %wide.load.11 to <8 x i32> %76 = getelementptr inbounds i16, ptr %y, i32 88 %wide.load11.11 = load <8 x i16>, ptr %76, align 2 %77 = sext <8 x i16> %wide.load11.11 to <8 x i32> %78 = mul nsw <8 x i32> %77, %75 %79 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %78) %80 = add i32 %79, %73 %81 = getelementptr inbounds i16, ptr %x, i32 96 %wide.load.12 = load <8 x i16>, ptr %81, align 2 %82 = sext <8 x i16> %wide.load.12 to <8 x i32> %83 = getelementptr inbounds i16, ptr %y, i32 96 %wide.load11.12 = load <8 x i16>, ptr %83, align 2 %84 = sext <8 x i16> %wide.load11.12 to <8 x i32> %85 = mul nsw <8 x i32> %84, %82 %86 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %85) %87 = add i32 %86, %80 %88 = getelementptr inbounds i16, ptr %x, i32 104 %wide.load.13 = load <8 x i16>, ptr %88, align 2 %89 = sext <8 x i16> %wide.load.13 to <8 x i32> %90 = getelementptr inbounds i16, ptr %y, i32 104 %wide.load11.13 = load <8 x i16>, ptr %90, align 2 %91 = sext <8 x i16> %wide.load11.13 to <8 x i32> %92 = mul nsw <8 x i32> %91, %89 %93 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %92) %94 = add i32 %93, %87 %95 = getelementptr inbounds i16, ptr %x, i32 112 %wide.load.14 = load <8 x i16>, ptr %95, align 2 %96 = sext <8 x i16> %wide.load.14 to <8 x i32> %97 = getelementptr inbounds i16, ptr %y, i32 112 %wide.load11.14 = load <8 x i16>, ptr %97, align 2 %98 = sext <8 x i16> %wide.load11.14 to <8 x i32> %99 = mul nsw <8 x i32> %98, %96 %100 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %99) %101 = add i32 %100, %94 %102 = 
getelementptr inbounds i16, ptr %x, i32 120 %wide.load.15 = load <8 x i16>, ptr %102, align 2 %103 = sext <8 x i16> %wide.load.15 to <8 x i32> %104 = getelementptr inbounds i16, ptr %y, i32 120 %wide.load11.15 = load <8 x i16>, ptr %104, align 2 %105 = sext <8 x i16> %wide.load11.15 to <8 x i32> %106 = mul nsw <8 x i32> %105, %103 %107 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %106) %108 = add i32 %107, %101 ret i32 %108 } define i32 @mlav2i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav2i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r0] ; CHECK-NEXT: ldrb r3, [r1] ; CHECK-NEXT: ldrb r0, [r0, #1] ; CHECK-NEXT: ldrb r1, [r1, #1] ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: smlabb r0, r3, r2, r0 ; CHECK-NEXT: bx lr entry: %0 = load i8, ptr %x, align 1 %conv = zext i8 %0 to i32 %1 = load i8, ptr %y, align 1 %conv2 = zext i8 %1 to i32 %mul = mul nuw nsw i32 %conv2, %conv %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1 %2 = load i8, ptr %arrayidx.1, align 1 %conv.1 = zext i8 %2 to i32 %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1 %3 = load i8, ptr %arrayidx1.1, align 1 %conv2.1 = zext i8 %3 to i32 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1 %add.1 = add nuw nsw i32 %mul.1, %mul ret i32 %add.1 } define i32 @mlav4i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav4i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vldrb.u32 q1, [r1] ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <4 x i8>, ptr %x, align 1 %1 = zext <4 x i8> %0 to <4 x i32> %2 = load <4 x i8>, ptr %y, align 1 %3 = zext <4 x i8> %2 to <4 x i32> %4 = mul nuw nsw <4 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) ret i32 %5 } define i32 @mlav8i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav8i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vldrb.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 %1 = zext <8 x i8> %0 to <8 x i32> %2 = load <8 x i8>, ptr %y, align 1 %3 = zext <8 x i8> %2 to <8 x i32> %4 = mul nuw nsw <8 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) ret i32 %5 } define i32 @mlav16i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav16i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i8>, ptr %x, align 1 %1 = zext <16 x i8> %0 to <16 x i32> %2 = load <16 x i8>, ptr %y, align 1 %3 = zext <16 x i8> %2 to <16 x i32> %4 = mul nuw nsw <16 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) ret i32 %5 } define i32 @mlav24i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav24i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vldrb.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #8] ; CHECK-NEXT: vldrb.u8 q1, [r1, #8] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 %1 = zext <8 x i8> %0 to <8 x i32> %2 = load <8 x i8>, ptr %y, align 1 %3 = zext <8 x i8> %2 to <8 x i32> %4 = mul nuw nsw <8 x i32> %3, %1 %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8 %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8 %5 = load <16 x i8>, ptr %arrayidx.8, align 1 %6 = zext <16 x i8> %5 to <16 x i32> %7 = load <16 x i8>, ptr %arrayidx1.8, align 1 %8 = zext <16 x i8> %7 to <16 x i32> %9 = mul nuw nsw <16 x i32> %8, %6 %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x 
i32> %9) %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) %op.rdx = add nuw nsw i32 %10, %11 ret i32 %op.rdx } define i32 @mlav32i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav32i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vldrb.u32 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #4] ; CHECK-NEXT: vldrb.u32 q1, [r1, #4] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #8] ; CHECK-NEXT: vldrb.u32 q1, [r1, #8] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #12] ; CHECK-NEXT: vldrb.u32 q1, [r1, #12] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #16] ; CHECK-NEXT: vldrb.u32 q1, [r1, #16] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #20] ; CHECK-NEXT: vldrb.u32 q1, [r1, #20] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #24] ; CHECK-NEXT: vldrb.u32 q1, [r1, #24] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r2, #28] ; CHECK-NEXT: vldrb.u32 q1, [r1, #28] ; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i8>, ptr %x, align 1 %1 = zext <32 x i8> %0 to <32 x i32> %2 = load <32 x i8>, ptr %y, align 1 %3 = zext <32 x i8> %2 to <32 x i32> %4 = mul nuw nsw <32 x i32> %3, %1 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) ret i32 %5 } define i32 @mlav64i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav64i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] ; CHECK-NEXT: vldrb.u8 q1, [r1, #16] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q1, [r1, #32] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] ; CHECK-NEXT: vldrb.u8 q1, [r1, #48] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 %0 = zext <16 x i8> %wide.load to <16 x i32> %wide.load11 = load <16 x i8>, ptr %y, align 1 %1 = zext <16 x i8> %wide.load11 to <16 x i32> %2 = mul nuw nsw <16 x i32> %1, %0 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) %4 = getelementptr inbounds i8, ptr %x, i32 16 %wide.load.1 = load <16 x i8>, ptr %4, align 1 %5 = zext <16 x i8> %wide.load.1 to <16 x i32> %6 = getelementptr inbounds i8, ptr %y, i32 16 %wide.load11.1 = load <16 x i8>, ptr %6, align 1 %7 = zext <16 x i8> %wide.load11.1 to <16 x i32> %8 = mul nuw nsw <16 x i32> %7, %5 %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8) %10 = add i32 %9, %3 %11 = getelementptr inbounds i8, ptr %x, i32 32 %wide.load.2 = load <16 x i8>, ptr %11, align 1 %12 = zext <16 x i8> %wide.load.2 to <16 x i32> %13 = getelementptr inbounds i8, ptr %y, i32 32 %wide.load11.2 = load <16 x i8>, ptr %13, align 1 %14 = zext <16 x i8> %wide.load11.2 to <16 x i32> %15 = mul nuw nsw <16 x i32> %14, %12 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15) %17 = add i32 %16, %10 %18 = getelementptr inbounds i8, ptr %x, i32 48 %wide.load.3 = load <16 x i8>, ptr %18, align 1 %19 = zext <16 x i8> %wide.load.3 to <16 x i32> %20 = getelementptr inbounds i8, ptr %y, i32 48 %wide.load11.3 = load <16 x i8>, ptr %20, align 1 %21 = zext <16 x i8> %wide.load11.3 to <16 x i32> %22 = mul nuw nsw <16 x i32> %21, %19 %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22) %24 = add i32 %23, %17 ret i32 %24 } define i32 @mlav128i32i8(ptr %x, ptr %y) { ; 
CHECK-LABEL: mlav128i32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: vmlav.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #16] ; CHECK-NEXT: vldrb.u8 q1, [r1, #16] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #32] ; CHECK-NEXT: vldrb.u8 q1, [r1, #32] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #48] ; CHECK-NEXT: vldrb.u8 q1, [r1, #48] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #64] ; CHECK-NEXT: vldrb.u8 q1, [r1, #64] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #80] ; CHECK-NEXT: vldrb.u8 q1, [r1, #80] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #96] ; CHECK-NEXT: vldrb.u8 q1, [r1, #96] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r2, #112] ; CHECK-NEXT: vldrb.u8 q1, [r1, #112] ; CHECK-NEXT: vmlava.u8 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 %0 = zext <16 x i8> %wide.load to <16 x i32> %wide.load11 = load <16 x i8>, ptr %y, align 1 %1 = zext <16 x i8> %wide.load11 to <16 x i32> %2 = mul nuw nsw <16 x i32> %1, %0 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) %4 = getelementptr inbounds i8, ptr %x, i32 16 %wide.load.1 = load <16 x i8>, ptr %4, align 1 %5 = zext <16 x i8> %wide.load.1 to <16 x i32> %6 = getelementptr inbounds i8, ptr %y, i32 16 %wide.load11.1 = load <16 x i8>, ptr %6, align 1 %7 = zext <16 x i8> %wide.load11.1 to <16 x i32> %8 = mul nuw nsw <16 x i32> %7, %5 %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8) %10 = add i32 %9, %3 %11 = getelementptr inbounds i8, ptr %x, i32 32 %wide.load.2 = load <16 x i8>, ptr %11, align 1 %12 = zext <16 x i8> %wide.load.2 to <16 x i32> %13 = getelementptr inbounds i8, ptr %y, i32 32 %wide.load11.2 = load <16 x i8>, ptr %13, align 1 %14 = zext <16 x i8> %wide.load11.2 to <16 x i32> %15 = mul nuw nsw <16 x i32> %14, %12 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15) %17 = add i32 %16, %10 %18 = getelementptr inbounds i8, ptr %x, i32 48 %wide.load.3 = load <16 x i8>, ptr %18, align 1 %19 = zext <16 x i8> %wide.load.3 to <16 x i32> %20 = getelementptr inbounds i8, ptr %y, i32 48 %wide.load11.3 = load <16 x i8>, ptr %20, align 1 %21 = zext <16 x i8> %wide.load11.3 to <16 x i32> %22 = mul nuw nsw <16 x i32> %21, %19 %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22) %24 = add i32 %23, %17 %25 = getelementptr inbounds i8, ptr %x, i32 64 %wide.load.4 = load <16 x i8>, ptr %25, align 1 %26 = zext <16 x i8> %wide.load.4 to <16 x i32> %27 = getelementptr inbounds i8, ptr %y, i32 64 %wide.load11.4 = load <16 x i8>, ptr %27, align 1 %28 = zext <16 x i8> %wide.load11.4 to <16 x i32> %29 = mul nuw nsw <16 x i32> %28, %26 %30 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %29) %31 = add i32 %30, %24 %32 = getelementptr inbounds i8, ptr %x, i32 80 %wide.load.5 = load <16 x i8>, ptr %32, align 1 %33 = zext <16 x i8> %wide.load.5 to <16 x i32> %34 = getelementptr inbounds i8, ptr %y, i32 80 %wide.load11.5 = load <16 x i8>, ptr %34, align 1 %35 = zext <16 x i8> %wide.load11.5 to <16 x i32> %36 = mul nuw nsw <16 x i32> %35, %33 %37 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %36) %38 = add i32 %37, %31 %39 = getelementptr inbounds i8, ptr %x, i32 96 %wide.load.6 = load <16 x i8>, ptr %39, align 1 %40 = zext <16 x i8> %wide.load.6 to <16 x i32> %41 = getelementptr inbounds i8, ptr %y, i32 96 %wide.load11.6 = load <16 x i8>, ptr %41, align 1 
%42 = zext <16 x i8> %wide.load11.6 to <16 x i32> %43 = mul nuw nsw <16 x i32> %42, %40 %44 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %43) %45 = add i32 %44, %38 %46 = getelementptr inbounds i8, ptr %x, i32 112 %wide.load.7 = load <16 x i8>, ptr %46, align 1 %47 = zext <16 x i8> %wide.load.7 to <16 x i32> %48 = getelementptr inbounds i8, ptr %y, i32 112 %wide.load11.7 = load <16 x i8>, ptr %48, align 1 %49 = zext <16 x i8> %wide.load11.7 to <16 x i32> %50 = mul nuw nsw <16 x i32> %49, %47 %51 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %50) %52 = add i32 %51, %45 ret i32 %52 } define signext i16 @mlav2i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav2i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrh r2, [r0] ; CHECK-NEXT: ldrh r3, [r1] ; CHECK-NEXT: ldrh r0, [r0, #2] ; CHECK-NEXT: ldrh r1, [r1, #2] ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: mla r0, r1, r0, r2 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load i16, ptr %x, align 2 %1 = load i16, ptr %y, align 2 %mul = mul i16 %1, %0 %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 %2 = load i16, ptr %arrayidx.1, align 2 %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1 %3 = load i16, ptr %arrayidx1.1, align 2 %mul.1 = mul i16 %3, %2 %add.1 = add i16 %mul.1, %mul ret i16 %add.1 } define signext i16 @mlav4i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav4i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q1, [r1] ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load <4 x i16>, ptr %x, align 2 %1 = load <4 x i16>, ptr %y, align 2 %2 = mul <4 x i16> %1, %0 %3 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %2) ret i16 %3 } define signext i16 @mlav8i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav8i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r0, q1, q0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, ptr %x, align 2 %1 = load <8 x i16>, ptr %y, align 2 %2 = mul <8 x i16> %1, %0 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) ret i16 %3 } define signext i16 @mlav16i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav16i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i16>, ptr %x, align 2 %1 = load <16 x i16>, ptr %y, align 2 %2 = mul <16 x i16> %1, %0 %3 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %2) ret i16 %3 } define signext i16 @mlav24i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav24i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i16>, ptr %x, align 2 %1 = load <8 x i16>, ptr %y, align 2 %2 = mul <8 x i16> %1, %0 %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8 %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8 %3 = load <16 x i16>, ptr %arrayidx.8, align 2 %4 = load <16 x i16>, ptr %arrayidx1.8, align 2 %5 = mul <16 x i16> %4, %3 %6 = call i16 
@llvm.vector.reduce.add.v16i16(<16 x i16> %5) %7 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) %op.rdx = add i16 %6, %7 ret i16 %op.rdx } define signext i16 @mlav32i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav32i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i16>, ptr %x, align 2 %1 = load <32 x i16>, ptr %y, align 2 %2 = mul <32 x i16> %1, %0 %3 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %2) ret i16 %3 } define signext i16 @mlav64i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav64i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64] ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80] ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112] ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x i16>, ptr %x, align 2 %wide.load13 = load <8 x i16>, ptr %y, align 2 %0 = mul <8 x i16> %wide.load13, %wide.load %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0) %2 = getelementptr inbounds i16, ptr %x, i32 8 %wide.load.1 = load <8 x i16>, ptr %2, align 2 %3 = getelementptr inbounds i16, ptr %y, i32 8 %wide.load13.1 = load <8 x i16>, ptr %3, align 2 %4 = mul <8 x i16> %wide.load13.1, %wide.load.1 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4) %6 = add i16 %5, %1 %7 = getelementptr inbounds i16, ptr %x, i32 16 %wide.load.2 = load <8 x i16>, ptr %7, align 2 %8 = getelementptr inbounds i16, ptr %y, i32 16 %wide.load13.2 = load <8 x i16>, ptr %8, align 2 %9 = mul <8 x i16> %wide.load13.2, %wide.load.2 %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9) %11 = add i16 %10, %6 %12 = getelementptr inbounds i16, ptr %x, i32 24 %wide.load.3 = load <8 x i16>, ptr %12, align 2 %13 = getelementptr inbounds i16, ptr %y, i32 24 %wide.load13.3 = load <8 x i16>, ptr %13, align 2 %14 = mul <8 x i16> %wide.load13.3, %wide.load.3 %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14) %16 = add i16 %15, %11 %17 = getelementptr inbounds i16, ptr %x, i32 32 %wide.load.4 = load <8 x i16>, ptr %17, align 2 %18 = getelementptr inbounds i16, ptr %y, i32 32 %wide.load13.4 = load <8 x i16>, ptr %18, align 2 %19 = mul <8 x i16> %wide.load13.4, %wide.load.4 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) %21 = add i16 %20, %16 %22 = getelementptr inbounds i16, ptr %x, i32 40 %wide.load.5 = load <8 x i16>, ptr %22, align 2 %23 
= getelementptr inbounds i16, ptr %y, i32 40 %wide.load13.5 = load <8 x i16>, ptr %23, align 2 %24 = mul <8 x i16> %wide.load13.5, %wide.load.5 %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24) %26 = add i16 %25, %21 %27 = getelementptr inbounds i16, ptr %x, i32 48 %wide.load.6 = load <8 x i16>, ptr %27, align 2 %28 = getelementptr inbounds i16, ptr %y, i32 48 %wide.load13.6 = load <8 x i16>, ptr %28, align 2 %29 = mul <8 x i16> %wide.load13.6, %wide.load.6 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29) %31 = add i16 %30, %26 %32 = getelementptr inbounds i16, ptr %x, i32 56 %wide.load.7 = load <8 x i16>, ptr %32, align 2 %33 = getelementptr inbounds i16, ptr %y, i32 56 %wide.load13.7 = load <8 x i16>, ptr %33, align 2 %34 = mul <8 x i16> %wide.load13.7, %wide.load.7 %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34) %36 = add i16 %35, %31 ret i16 %36 } define signext i16 @mlav128i16i16(ptr %x, ptr %y) { ; CHECK-LABEL: mlav128i16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64] ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80] ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112] ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #128] ; CHECK-NEXT: vldrh.u16 q1, [r1, #128] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #144] ; CHECK-NEXT: vldrh.u16 q1, [r1, #144] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #160] ; CHECK-NEXT: vldrh.u16 q1, [r1, #160] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #176] ; CHECK-NEXT: vldrh.u16 q1, [r1, #176] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #192] ; CHECK-NEXT: vldrh.u16 q1, [r1, #192] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #208] ; CHECK-NEXT: vldrh.u16 q1, [r1, #208] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #224] ; CHECK-NEXT: vldrh.u16 q1, [r1, #224] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #240] ; CHECK-NEXT: vldrh.u16 q1, [r1, #240] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x i16>, ptr %x, align 2 %wide.load13 = load <8 x i16>, ptr %y, align 2 %0 = mul <8 x i16> %wide.load13, %wide.load %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0) %2 = getelementptr inbounds i16, ptr %x, i32 8 %wide.load.1 = load <8 x i16>, ptr %2, align 2 %3 = getelementptr inbounds i16, ptr %y, i32 8 %wide.load13.1 = load <8 x i16>, ptr %3, align 2 %4 = mul <8 x i16> %wide.load13.1, %wide.load.1 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4) %6 = add i16 %5, %1 %7 = getelementptr inbounds i16, ptr %x, i32 16 %wide.load.2 = load <8 x i16>, ptr %7, align 2 %8 = getelementptr inbounds i16, ptr %y, i32 16 %wide.load13.2 = load <8 x i16>, ptr %8, 
align 2 %9 = mul <8 x i16> %wide.load13.2, %wide.load.2 %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9) %11 = add i16 %10, %6 %12 = getelementptr inbounds i16, ptr %x, i32 24 %wide.load.3 = load <8 x i16>, ptr %12, align 2 %13 = getelementptr inbounds i16, ptr %y, i32 24 %wide.load13.3 = load <8 x i16>, ptr %13, align 2 %14 = mul <8 x i16> %wide.load13.3, %wide.load.3 %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14) %16 = add i16 %15, %11 %17 = getelementptr inbounds i16, ptr %x, i32 32 %wide.load.4 = load <8 x i16>, ptr %17, align 2 %18 = getelementptr inbounds i16, ptr %y, i32 32 %wide.load13.4 = load <8 x i16>, ptr %18, align 2 %19 = mul <8 x i16> %wide.load13.4, %wide.load.4 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) %21 = add i16 %20, %16 %22 = getelementptr inbounds i16, ptr %x, i32 40 %wide.load.5 = load <8 x i16>, ptr %22, align 2 %23 = getelementptr inbounds i16, ptr %y, i32 40 %wide.load13.5 = load <8 x i16>, ptr %23, align 2 %24 = mul <8 x i16> %wide.load13.5, %wide.load.5 %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24) %26 = add i16 %25, %21 %27 = getelementptr inbounds i16, ptr %x, i32 48 %wide.load.6 = load <8 x i16>, ptr %27, align 2 %28 = getelementptr inbounds i16, ptr %y, i32 48 %wide.load13.6 = load <8 x i16>, ptr %28, align 2 %29 = mul <8 x i16> %wide.load13.6, %wide.load.6 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29) %31 = add i16 %30, %26 %32 = getelementptr inbounds i16, ptr %x, i32 56 %wide.load.7 = load <8 x i16>, ptr %32, align 2 %33 = getelementptr inbounds i16, ptr %y, i32 56 %wide.load13.7 = load <8 x i16>, ptr %33, align 2 %34 = mul <8 x i16> %wide.load13.7, %wide.load.7 %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34) %36 = add i16 %35, %31 %37 = getelementptr inbounds i16, ptr %x, i32 64 %wide.load.8 = load <8 x i16>, ptr %37, align 2 %38 = getelementptr inbounds i16, ptr %y, i32 64 %wide.load13.8 = load <8 x i16>, ptr %38, align 2 %39 = mul <8 x i16> %wide.load13.8, %wide.load.8 %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %39) %41 = add i16 %40, %36 %42 = getelementptr inbounds i16, ptr %x, i32 72 %wide.load.9 = load <8 x i16>, ptr %42, align 2 %43 = getelementptr inbounds i16, ptr %y, i32 72 %wide.load13.9 = load <8 x i16>, ptr %43, align 2 %44 = mul <8 x i16> %wide.load13.9, %wide.load.9 %45 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %44) %46 = add i16 %45, %41 %47 = getelementptr inbounds i16, ptr %x, i32 80 %wide.load.10 = load <8 x i16>, ptr %47, align 2 %48 = getelementptr inbounds i16, ptr %y, i32 80 %wide.load13.10 = load <8 x i16>, ptr %48, align 2 %49 = mul <8 x i16> %wide.load13.10, %wide.load.10 %50 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %49) %51 = add i16 %50, %46 %52 = getelementptr inbounds i16, ptr %x, i32 88 %wide.load.11 = load <8 x i16>, ptr %52, align 2 %53 = getelementptr inbounds i16, ptr %y, i32 88 %wide.load13.11 = load <8 x i16>, ptr %53, align 2 %54 = mul <8 x i16> %wide.load13.11, %wide.load.11 %55 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %54) %56 = add i16 %55, %51 %57 = getelementptr inbounds i16, ptr %x, i32 96 %wide.load.12 = load <8 x i16>, ptr %57, align 2 %58 = getelementptr inbounds i16, ptr %y, i32 96 %wide.load13.12 = load <8 x i16>, ptr %58, align 2 %59 = mul <8 x i16> %wide.load13.12, %wide.load.12 %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %59) %61 = add i16 %60, %56 %62 = getelementptr inbounds i16, ptr %x, i32 104 %wide.load.13 = load <8 x i16>, ptr %62, align 2 %63 = getelementptr 
inbounds i16, ptr %y, i32 104 %wide.load13.13 = load <8 x i16>, ptr %63, align 2 %64 = mul <8 x i16> %wide.load13.13, %wide.load.13 %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64) %66 = add i16 %65, %61 %67 = getelementptr inbounds i16, ptr %x, i32 112 %wide.load.14 = load <8 x i16>, ptr %67, align 2 %68 = getelementptr inbounds i16, ptr %y, i32 112 %wide.load13.14 = load <8 x i16>, ptr %68, align 2 %69 = mul <8 x i16> %wide.load13.14, %wide.load.14 %70 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %69) %71 = add i16 %70, %66 %72 = getelementptr inbounds i16, ptr %x, i32 120 %wide.load.15 = load <8 x i16>, ptr %72, align 2 %73 = getelementptr inbounds i16, ptr %y, i32 120 %wide.load13.15 = load <8 x i16>, ptr %73, align 2 %74 = mul <8 x i16> %wide.load13.15, %wide.load.15 %75 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %74) %76 = add i16 %75, %71 ret i16 %76 } define zeroext i8 @mlav2i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav2i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r0] ; CHECK-NEXT: ldrb r3, [r1] ; CHECK-NEXT: ldrb r0, [r0, #1] ; CHECK-NEXT: ldrb r1, [r1, #1] ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: mla r0, r1, r0, r2 ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load i8, ptr %x, align 1 %1 = load i8, ptr %y, align 1 %mul = mul i8 %1, %0 %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1 %2 = load i8, ptr %arrayidx.1, align 1 %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1 %3 = load i8, ptr %arrayidx1.1, align 1 %mul.1 = mul i8 %3, %2 %add.1 = add i8 %mul.1, %mul ret i8 %add.1 } define zeroext i8 @mlav4i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav4i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r0] ; CHECK-NEXT: vldrb.u32 q1, [r1] ; CHECK-NEXT: vmlav.u32 r0, q1, q0 ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load <4 x i8>, ptr %x, align 1 %1 = load <4 x i8>, ptr %y, align 1 %2 = mul <4 x i8> %1, %0 %3 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %2) ret i8 %3 } define zeroext i8 @mlav8i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav8i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vldrb.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r0, q1, q0 ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 %1 = load <8 x i8>, ptr %y, align 1 %2 = mul <8 x i8> %1, %0 %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2) ret i8 %3 } define zeroext i8 @mlav16i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav16i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r0, q1, q0 ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i8>, ptr %x, align 1 %1 = load <16 x i8>, ptr %y, align 1 %2 = mul <16 x i8> %1, %0 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) ret i8 %3 } define zeroext i8 @mlav24i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav24i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vldrb.u16 q1, [r1] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #8] ; CHECK-NEXT: vldrb.u8 q1, [r1, #8] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: uxtb r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 %1 = load <8 x i8>, ptr %y, align 1 %2 = mul <8 x i8> %1, %0 %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8 %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8 %3 = load <16 x i8>, ptr %arrayidx.8, align 1 %4 = load <16 x i8>, ptr %arrayidx1.8, align 1 %5 = mul <16 x i8> %4, %3 %6 = call i8 
@llvm.vector.reduce.add.v16i8(<16 x i8> %5) %7 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2) %op.rdx = add i8 %6, %7 ret i8 %op.rdx } define zeroext i8 @mlav32i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav32i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] ; CHECK-NEXT: vldrb.u8 q1, [r1, #16] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: uxtb r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <32 x i8>, ptr %x, align 1 %1 = load <32 x i8>, ptr %y, align 1 %2 = mul <32 x i8> %1, %0 %3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2) ret i8 %3 } define zeroext i8 @mlav64i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav64i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] ; CHECK-NEXT: vldrb.u8 q1, [r1, #16] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q1, [r1, #32] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] ; CHECK-NEXT: vldrb.u8 q1, [r1, #48] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: uxtb r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 %wide.load12 = load <16 x i8>, ptr %y, align 1 %0 = mul <16 x i8> %wide.load12, %wide.load %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0) %2 = getelementptr inbounds i8, ptr %x, i32 16 %wide.load.1 = load <16 x i8>, ptr %2, align 1 %3 = getelementptr inbounds i8, ptr %y, i32 16 %wide.load12.1 = load <16 x i8>, ptr %3, align 1 %4 = mul <16 x i8> %wide.load12.1, %wide.load.1 %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4) %6 = add i8 %5, %1 %7 = getelementptr inbounds i8, ptr %x, i32 32 %wide.load.2 = load <16 x i8>, ptr %7, align 1 %8 = getelementptr inbounds i8, ptr %y, i32 32 %wide.load12.2 = load <16 x i8>, ptr %8, align 1 %9 = mul <16 x i8> %wide.load12.2, %wide.load.2 %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9) %11 = add i8 %10, %6 %12 = getelementptr inbounds i8, ptr %x, i32 48 %wide.load.3 = load <16 x i8>, ptr %12, align 1 %13 = getelementptr inbounds i8, ptr %y, i32 48 %wide.load12.3 = load <16 x i8>, ptr %13, align 1 %14 = mul <16 x i8> %wide.load12.3, %wide.load.3 %15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14) %16 = add i8 %15, %11 ret i8 %16 } define zeroext i8 @mlav128i8i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav128i8i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vldrb.u8 q1, [r1] ; CHECK-NEXT: vmlav.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] ; CHECK-NEXT: vldrb.u8 q1, [r1, #16] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q1, [r1, #32] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] ; CHECK-NEXT: vldrb.u8 q1, [r1, #48] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #64] ; CHECK-NEXT: vldrb.u8 q1, [r1, #64] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #80] ; CHECK-NEXT: vldrb.u8 q1, [r1, #80] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #96] ; CHECK-NEXT: vldrb.u8 q1, [r1, #96] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #112] ; CHECK-NEXT: vldrb.u8 q1, [r1, #112] ; CHECK-NEXT: vmlava.u8 r2, q1, q0 ; CHECK-NEXT: uxtb r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 %wide.load12 = load <16 x i8>, ptr %y, align 1 %0 = mul <16 x i8> 
define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, %b
  %d = add i32 %c, 10
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %c, %b
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: vaddva.u32 r0, q1
; CHECK-NEXT: adds r0, #20
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %b, 10
  %e = add i32 %c, %d
  ret i32 %e
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)