; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s

; Three variants of the same pattern: rows of byte differences between two
; strided inputs are widened, combined through add/sub butterfly stages
; expressed as shufflevectors, folded to absolute values, and reduced to a
; single scalar sum. The variants differ in how their shuffle sequences are
; structured.

define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sxtw x8, w1
; CHECK-NEXT:    sxtw x9, w3
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x2]
; CHECK-NEXT:    add x10, x0, x8
; CHECK-NEXT:    add x11, x2, x9
; CHECK-NEXT:    ldr d2, [x10]
; CHECK-NEXT:    add x10, x10, x8
; CHECK-NEXT:    ldr d3, [x11]
; CHECK-NEXT:    add x11, x11, x9
; CHECK-NEXT:    ldr d4, [x10]
; CHECK-NEXT:    ldr d6, [x10, x8]
; CHECK-NEXT:    ldr d5, [x11]
; CHECK-NEXT:    ldr d7, [x11, x9]
; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT:    shll2 v4.4s, v2.8h, #16
; CHECK-NEXT:    shll2 v5.4s, v0.8h, #16
; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
; CHECK-NEXT:    saddw v0.4s, v5.4s, v0.4h
; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
; CHECK-NEXT:    mov v7.16b, v2.16b
; CHECK-NEXT:    zip1 v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    zip2 v6.4s, v2.4s, v0.4s
; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v1.4s
; CHECK-NEXT:    mov v17.16b, v1.16b
; CHECK-NEXT:    zip2 v16.4s, v1.4s, v3.4s
; CHECK-NEXT:    mov v7.s[3], v0.s[2]
; CHECK-NEXT:    ext v18.16b, v3.16b, v3.16b, #12
; CHECK-NEXT:    ext v2.16b, v2.16b, v4.16b, #8
; CHECK-NEXT:    mov v17.s[1], v3.s[0]
; CHECK-NEXT:    uzp2 v0.4s, v5.4s, v3.4s
; CHECK-NEXT:    zip2 v5.4s, v3.4s, v1.4s
; CHECK-NEXT:    mov v3.s[0], v1.s[1]
; CHECK-NEXT:    ext v1.16b, v1.16b, v18.16b, #12
; CHECK-NEXT:    mov v16.d[1], v7.d[1]
; CHECK-NEXT:    mov v17.d[1], v2.d[1]
; CHECK-NEXT:    mov v0.d[1], v6.d[1]
; CHECK-NEXT:    mov v5.d[1], v7.d[1]
; CHECK-NEXT:    mov v3.d[1], v4.d[1]
; CHECK-NEXT:    mov v1.d[1], v6.d[1]
; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
; CHECK-NEXT:    add v2.4s, v3.4s, v17.4s
; CHECK-NEXT:    sub v3.4s, v17.4s, v3.4s
; CHECK-NEXT:    sub v1.4s, v16.4s, v1.4s
; CHECK-NEXT:    rev64 v4.4s, v0.4s
; CHECK-NEXT:    rev64 v5.4s, v2.4s
; CHECK-NEXT:    add v6.4s, v1.4s, v3.4s
; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    mov v4.d[1], v0.d[1]
; CHECK-NEXT:    mov v5.d[1], v2.d[1]
; CHECK-NEXT:    rev64 v3.4s, v1.4s
; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
; CHECK-NEXT:    rev64 v4.4s, v6.4s
; CHECK-NEXT:    rev64 v5.4s, v2.4s
; CHECK-NEXT:    rev64 v7.4s, v0.4s
; CHECK-NEXT:    addp v16.4s, v0.4s, v6.4s
; CHECK-NEXT:    addp v17.4s, v2.4s, v1.4s
; CHECK-NEXT:    sub v4.4s, v6.4s, v4.4s
; CHECK-NEXT:    sub v1.4s, v1.4s, v3.4s
; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT:    zip1 v21.4s, v16.4s, v16.4s
; CHECK-NEXT:    ext v5.16b, v17.16b, v1.16b, #4
; CHECK-NEXT:    ext v6.16b, v16.16b, v4.16b, #4
; CHECK-NEXT:    mov v18.16b, v1.16b
; CHECK-NEXT:    mov v19.16b, v4.16b
; CHECK-NEXT:    ext v3.16b, v2.16b, v17.16b, #8
; CHECK-NEXT:    ext v7.16b, v0.16b, v16.16b, #4
; CHECK-NEXT:    mov v18.s[2], v17.s[3]
; CHECK-NEXT:    zip2 v5.4s, v5.4s, v17.4s
; CHECK-NEXT:    zip2 v6.4s, v6.4s, v16.4s
; CHECK-NEXT:    mov v19.s[2], v16.s[3]
; CHECK-NEXT:    trn2 v0.4s, v21.4s, v0.4s
; CHECK-NEXT:    ext v20.16b, v3.16b, v2.16b, #4
; CHECK-NEXT:    ext v7.16b, v7.16b, v7.16b, #4
; CHECK-NEXT:    mov v2.s[2], v17.s[1]
; CHECK-NEXT:    ext v1.16b, v1.16b, v5.16b, #12
; CHECK-NEXT:    ext v4.16b, v4.16b, v6.16b, #12
; CHECK-NEXT:    mov v5.16b, v18.16b
; CHECK-NEXT:    uzp2 v3.4s, v3.4s, v20.4s
; CHECK-NEXT:    mov v6.16b, v7.16b
; CHECK-NEXT:    mov v20.16b, v19.16b
; CHECK-NEXT:    mov v21.16b, v2.16b
; CHECK-NEXT:    mov v5.s[1], v17.s[2]
; CHECK-NEXT:    sub v7.4s, v0.4s, v7.4s
; CHECK-NEXT:    mov v6.s[0], v16.s[1]
; CHECK-NEXT:    mov v20.s[1], v16.s[2]
; CHECK-NEXT:    sub v16.4s, v19.4s, v4.4s
; CHECK-NEXT:    mov v21.s[1], v17.s[0]
; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    sub v17.4s, v18.4s, v1.4s
; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v6.4s
; CHECK-NEXT:    add v4.4s, v20.4s, v4.4s
; CHECK-NEXT:    add v3.4s, v21.4s, v3.4s
; CHECK-NEXT:    mov v1.d[1], v17.d[1]
; CHECK-NEXT:    mov v0.d[1], v7.d[1]
; CHECK-NEXT:    mov v4.d[1], v16.d[1]
; CHECK-NEXT:    mov v3.d[1], v2.d[1]
; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
; CHECK-NEXT:    cmlt v2.8h, v0.8h, #0
; CHECK-NEXT:    cmlt v6.8h, v4.8h, #0
; CHECK-NEXT:    cmlt v5.8h, v3.8h, #0
; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    add v4.4s, v6.4s, v4.4s
; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
; CHECK-NEXT:    eor v2.16b, v3.16b, v5.16b
; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    lsr w9, w8, #16
; CHECK-NEXT:    add w8, w9, w8, uxth
; CHECK-NEXT:    lsr w0, w8, #1
; CHECK-NEXT:    ret
entry:
  %idx.ext = sext i32 %i1 to i64
  %idx.ext63 = sext i32 %i2 to i64
  %arrayidx3 = getelementptr inbounds i8, ptr %p1, i64 4
  %arrayidx5 = getelementptr inbounds i8, ptr %p2, i64 4
  %0 = load <4 x i8>, ptr %p1, align 1
  %1 = load <4 x i8>, ptr %p2, align 1
  %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
  %add.ptr64 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext63
  %arrayidx3.1 = getelementptr inbounds i8, ptr %add.ptr, i64 4
  %arrayidx5.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 4
  %2 = load <4 x i8>, ptr %add.ptr, align 1
  %3 = load <4 x i8>, ptr %add.ptr64, align 1
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
  %arrayidx3.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 4
  %arrayidx5.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 4
  %4 = load <4 x i8>, ptr %add.ptr.1, align 1
  %5 = load <4 x i8>, ptr %add.ptr64.1, align 1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
  %arrayidx3.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 4
  %arrayidx5.3 = getelementptr inbounds i8, ptr %add.ptr64.2, i64 4
  %6 = load <4 x i8>, ptr %add.ptr.2, align 1
  %7 = load <4 x i8>, ptr %add.ptr64.2, align 1
  %8 = load <4 x i8>, ptr %arrayidx3, align 1
  %9 = load <4 x i8>, ptr %arrayidx3.1, align 1
  %10 = load <4 x i8>, ptr %arrayidx3.2, align 1
  %11 = load <4 x i8>, ptr %arrayidx3.3, align 1
  %12 = shufflevector <4 x i8> %11, <4 x i8> %10, <16 x i32>
  %13 = shufflevector <4 x i8> %9, <4 x i8> poison, <16 x i32>
  %14 = shufflevector <16 x i8> %12, <16 x i8> %13, <16 x i32>
  %15 = shufflevector <4 x i8> %8, <4 x i8> poison, <16 x i32>
  %16 = shufflevector <16 x i8> %14, <16 x i8> %15, <16 x i32>
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = load <4 x i8>, ptr %arrayidx5, align 1
  %19 = load <4 x i8>, ptr %arrayidx5.1, align 1
  %20 = load <4 x i8>, ptr %arrayidx5.2, align 1
  %21 = load <4 x i8>, ptr %arrayidx5.3, align 1
  %22 = shufflevector <4 x i8> %21, <4 x i8> %20, <16 x i32>
  %23 = shufflevector <4 x i8> %19, <4 x i8> poison, <16 x i32>
  %24 = shufflevector <16 x i8> %22, <16 x i8> %23, <16 x i32>
  %25 = shufflevector <4 x i8> %18, <4 x i8> poison, <16 x i32>
  %26 = shufflevector <16 x i8> %24, <16 x i8> %25, <16 x i32>
  %27 = zext <16 x i8> %26 to <16 x i32>
  %28 = shufflevector <4 x i8> %6, <4 x i8> %4, <16 x i32>
  %29 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32>
  %30 = shufflevector <16 x i8> %28, <16 x i8> %29, <16 x i32>
  %31 = shufflevector <4 x i8> %0, <4 x i8> poison, <16 x i32>
  %32 = shufflevector <16 x i8> %30, <16 x i8> %31, <16 x i32>
  %33 = zext <16 x i8> %32 to <16 x i32>
  %34 = shufflevector <4 x i8> %7, <4 x i8> %5, <16 x i32>
  %35 = shufflevector <4 x i8> %3, <4 x i8> poison, <16 x i32>
  %36 = shufflevector <16 x i8> %34, <16 x i8> %35, <16 x i32>
  %37 = shufflevector <4 x i8> %1, <4 x i8> poison, <16 x i32>
  %38 = shufflevector <16 x i8> %36, <16 x i8> %37, <16 x i32>
  %39 = zext <16 x i8> %38 to <16 x i32>
  %40 = sub nsw <16 x i32> %33, %39
  %41 = sub nsw <16 x i32> %17, %27
  %42 = shl nsw <16 x i32> %41,
  %43 = add nsw <16 x i32> %42, %40
  %44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32>
  %45 = shufflevector <16 x i32> %43, <16 x i32> undef, <16 x i32>
  %46 = add nsw <16 x i32> %44, %45
  %47 = sub nsw <16 x i32> %44, %45
  %48 = shufflevector <16 x i32> %46, <16 x i32> %47, <16 x i32>
  %49 = shufflevector <16 x i32> %46, <16 x i32> %47, <16 x i32>
  %50 = add nsw <16 x i32> %48, %49
  %51 = sub nsw <16 x i32> %48, %49
  %52 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %53 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %54 = add nsw <16 x i32> %52, %53
  %55 = sub nsw <16 x i32> %52, %53
  %56 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %57 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %58 = add nsw <16 x i32> %56, %57
  %59 = sub nsw <16 x i32> %56, %57
  %60 = shufflevector <16 x i32> %58, <16 x i32> %59, <16 x i32>
  %61 = lshr <16 x i32> %60,
  %62 = and <16 x i32> %61,
  %63 = mul nuw <16 x i32> %62,
  %64 = add <16 x i32> %63, %60
  %65 = xor <16 x i32> %64, %63
  %66 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %65)
  %conv118 = and i32 %66, 65535
  %shr = lshr i32 %66, 16
  %add119 = add nuw nsw i32 %conv118, %shr
  %shr120 = lshr i32 %add119, 1
  ret i32 %shr120
}

; Same computation as @v1, with the butterfly stages built from explicitly
; reordered shuffles (%reorder, %reorder191) and four shuffles per later stage.
define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sxtw x8, w1
; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT:    sxtw x9, w3
; CHECK-NEXT:    ldr d4, [x0]
; CHECK-NEXT:    ldr d5, [x2]
; CHECK-NEXT:    add x10, x0, x8
; CHECK-NEXT:    add x11, x2, x9
; CHECK-NEXT:    add x12, x10, x8
; CHECK-NEXT:    ldr d6, [x10]
; CHECK-NEXT:    ldr d7, [x11]
; CHECK-NEXT:    ldr d0, [x12, x8]
; CHECK-NEXT:    add x8, x11, x9
; CHECK-NEXT:    ldr d1, [x12]
; CHECK-NEXT:    ldr d2, [x8, x9]
; CHECK-NEXT:    ldr d3, [x8]
; CHECK-NEXT:    usubl v1.8h, v1.8b, v3.8b
; CHECK-NEXT:    usubl v0.8h, v0.8b, v2.8b
; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT:    usubl v2.8h, v4.8b, v5.8b
; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
; CHECK-NEXT:    shll2 v5.4s, v1.8h, #16
; CHECK-NEXT:    shll2 v7.4s, v3.8h, #16
; CHECK-NEXT:    shll2 v6.4s, v2.8h, #16
; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
; CHECK-NEXT:    saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT:    saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT:    uzp2 v4.4s, v0.4s, v1.4s
; CHECK-NEXT:    mov v7.16b, v3.16b
; CHECK-NEXT:    mov v17.16b, v1.16b
; CHECK-NEXT:    zip1 v5.4s, v3.4s, v2.4s
; CHECK-NEXT:    zip2 v6.4s, v3.4s, v2.4s
; CHECK-NEXT:    zip2 v16.4s, v0.4s, v1.4s
; CHECK-NEXT:    ext v18.16b, v0.16b, v0.16b, #12
; CHECK-NEXT:    mov v7.s[3], v2.s[2]
; CHECK-NEXT:    mov v17.s[1], v0.s[0]
; CHECK-NEXT:    uzp2 v2.4s, v4.4s, v0.4s
; CHECK-NEXT:    mov v4.16b, v0.16b
; CHECK-NEXT:    zip2 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #8
; CHECK-NEXT:    mov v4.s[0], v1.s[1]
; CHECK-NEXT:    mov v16.d[1], v7.d[1]
; CHECK-NEXT:    ext v1.16b, v1.16b, v18.16b, #12
; CHECK-NEXT:    mov v2.d[1], v6.d[1]
; CHECK-NEXT:    mov v0.d[1], v7.d[1]
; CHECK-NEXT:    mov v17.d[1], v3.d[1]
; CHECK-NEXT:    mov v4.d[1], v5.d[1]
; CHECK-NEXT:    mov v1.d[1], v6.d[1]
; CHECK-NEXT:    add v2.4s, v2.4s, v16.4s
; CHECK-NEXT:    add v3.4s, v4.4s, v17.4s
; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    sub v1.4s, v17.4s, v4.4s
; CHECK-NEXT:    rev64 v5.4s, v2.4s
; CHECK-NEXT:    rev64 v6.4s, v3.4s
; CHECK-NEXT:    sub v4.4s, v1.4s, v0.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    mov v5.d[1], v2.d[1]
; CHECK-NEXT:    mov v6.d[1], v3.d[1]
; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
; CHECK-NEXT:    add v1.4s, v2.4s, v6.4s
; CHECK-NEXT:    zip1 v2.4s, v3.4s, v4.4s
; CHECK-NEXT:    zip2 v7.4s, v3.4s, v4.4s
; CHECK-NEXT:    zip1 v5.4s, v1.4s, v0.4s
; CHECK-NEXT:    uzp2 v6.4s, v1.4s, v0.4s
; CHECK-NEXT:    mov v18.16b, v1.16b
; CHECK-NEXT:    ext v16.16b, v3.16b, v2.16b, #8
; CHECK-NEXT:    zip2 v17.4s, v1.4s, v0.4s
; CHECK-NEXT:    mov v3.s[3], v4.s[2]
; CHECK-NEXT:    mov v18.s[1], v0.s[1]
; CHECK-NEXT:    trn2 v4.4s, v1.4s, v5.4s
; CHECK-NEXT:    uzp2 v1.4s, v6.4s, v1.4s
; CHECK-NEXT:    mov v17.d[1], v3.d[1]
; CHECK-NEXT:    mov v18.d[1], v2.d[1]
; CHECK-NEXT:    mov v4.d[1], v16.d[1]
; CHECK-NEXT:    mov v1.d[1], v7.d[1]
; CHECK-NEXT:    add v0.4s, v17.4s, v1.4s
; CHECK-NEXT:    sub v1.4s, v1.4s, v17.4s
; CHECK-NEXT:    add v2.4s, v18.4s, v4.4s
; CHECK-NEXT:    sub v3.4s, v4.4s, v18.4s
; CHECK-NEXT:    zip2 v4.4s, v0.4s, v1.4s
; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #4
; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #4
; CHECK-NEXT:    zip2 v7.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip2 v16.4s, v3.4s, v2.4s
; CHECK-NEXT:    zip2 v17.4s, v2.4s, v3.4s
; CHECK-NEXT:    zip1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip1 v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    ext v1.16b, v5.16b, v1.16b, #8
; CHECK-NEXT:    ext v18.16b, v6.16b, v3.16b, #8
; CHECK-NEXT:    add v3.4s, v16.4s, v7.4s
; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    ext v1.16b, v1.16b, v5.16b, #4
; CHECK-NEXT:    ext v5.16b, v18.16b, v6.16b, #4
; CHECK-NEXT:    cmlt v2.8h, v4.8h, #0
; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
; CHECK-NEXT:    add v4.4s, v2.4s, v4.4s
; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
; CHECK-NEXT:    cmlt v4.8h, v1.8h, #0
; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    eor v1.16b, v1.16b, v4.16b
; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    lsr w9, w8, #16
; CHECK-NEXT:    add w8, w9, w8, uxth
; CHECK-NEXT:    lsr w0, w8, #1
; CHECK-NEXT:    ret
entry:
  %idx.ext = sext i32 %i1 to i64
  %idx.ext63 = sext i32 %i2 to i64
  %arrayidx3 = getelementptr inbounds i8, ptr %p1, i64 4
  %arrayidx5 = getelementptr inbounds i8, ptr %p2, i64 4
  %0 = load <4 x i8>, ptr %p1, align 1
  %1 = load <4 x i8>, ptr %p2, align 1
  %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
  %add.ptr64 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext63
  %arrayidx3.1 = getelementptr inbounds i8, ptr %add.ptr, i64 4
  %arrayidx5.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 4
  %2 = load <4 x i8>, ptr %add.ptr, align 1
  %3 = load <4 x i8>, ptr %add.ptr64, align 1
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
  %arrayidx3.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 4
  %arrayidx5.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 4
  %4 = load <4 x i8>, ptr %add.ptr.1, align 1
  %5 = load <4 x i8>, ptr %add.ptr64.1, align 1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
  %arrayidx3.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 4
  %arrayidx5.3 = getelementptr inbounds i8, ptr %add.ptr64.2, i64 4
  %6 = load <4 x i8>, ptr %add.ptr.2, align 1
  %7 = load <4 x i8>, ptr %add.ptr64.2, align 1
  %8 = load <4 x i8>, ptr %arrayidx3, align 1
  %9 = load <4 x i8>, ptr %arrayidx3.1, align 1
  %10 = load <4 x i8>, ptr %arrayidx3.2, align 1
  %11 = load <4 x i8>, ptr %arrayidx3.3, align 1
  %12 = shufflevector <4 x i8> %11, <4 x i8> %10, <16 x i32>
  %13 = shufflevector <4 x i8> %9, <4 x i8> poison, <16 x i32>
  %14 = shufflevector <16 x i8> %12, <16 x i8> %13, <16 x i32>
  %15 = shufflevector <4 x i8> %8, <4 x i8> poison, <16 x i32>
  %16 = shufflevector <16 x i8> %14, <16 x i8> %15, <16 x i32>
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = load <4 x i8>, ptr %arrayidx5, align 1
  %19 = load <4 x i8>, ptr %arrayidx5.1, align 1
  %20 = load <4 x i8>, ptr %arrayidx5.2, align 1
  %21 = load <4 x i8>, ptr %arrayidx5.3, align 1
  %22 = shufflevector <4 x i8> %21, <4 x i8> %20, <16 x i32>
  %23 = shufflevector <4 x i8> %19, <4 x i8> poison, <16 x i32>
  %24 = shufflevector <16 x i8> %22, <16 x i8> %23, <16 x i32>
  %25 = shufflevector <4 x i8> %18, <4 x i8> poison, <16 x i32>
  %26 = shufflevector <16 x i8> %24, <16 x i8> %25, <16 x i32>
  %27 = zext <16 x i8> %26 to <16 x i32>
  %28 = shufflevector <4 x i8> %6, <4 x i8> %4, <16 x i32>
  %29 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32>
  %30 = shufflevector <16 x i8> %28, <16 x i8> %29, <16 x i32>
  %31 = shufflevector <4 x i8> %0, <4 x i8> poison, <16 x i32>
  %32 = shufflevector <16 x i8> %30, <16 x i8> %31, <16 x i32>
  %33 = zext <16 x i8> %32 to <16 x i32>
  %34 = shufflevector <4 x i8> %7, <4 x i8> %5, <16 x i32>
  %35 = shufflevector <4 x i8> %3, <4 x i8> poison, <16 x i32>
  %36 = shufflevector <16 x i8> %34, <16 x i8> %35, <16 x i32>
  %37 = shufflevector <4 x i8> %1, <4 x i8> poison, <16 x i32>
  %38 = shufflevector <16 x i8> %36, <16 x i8> %37, <16 x i32>
  %39 = zext <16 x i8> %38 to <16 x i32>
  %40 = sub nsw <16 x i32> %33, %39
  %41 = sub nsw <16 x i32> %17, %27
  %42 = shl nsw <16 x i32> %41,
  %43 = add nsw <16 x i32> %42, %40
  %44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32>
  %reorder = shufflevector <16 x i32> %44, <16 x i32> poison, <16 x i32>
  %45 = add nsw <16 x i32> %44, %reorder
  %46 = sub nsw <16 x i32> %44, %reorder
  %47 = shufflevector <16 x i32> %45, <16 x i32> %46, <16 x i32>
  %reorder191 = shufflevector <16 x i32> %45, <16 x i32> %46, <16 x i32>
  %48 = add nsw <16 x i32> %47, %reorder191
  %49 = sub nsw <16 x i32> %47, %reorder191
  %50 = shufflevector <16 x i32> %48, <16 x i32> %49, <16 x i32>
  %51 = shufflevector <16 x i32> %48, <16 x i32> %49, <16 x i32>
  %52 = shufflevector <16 x i32> %48, <16 x i32> %49, <16 x i32>
  %53 = shufflevector <16 x i32> %48, <16 x i32> %49, <16 x i32>
  %54 = add nsw <16 x i32> %50, %52
  %55 = sub nsw <16 x i32> %51, %53
  %56 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %57 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %58 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %59 = shufflevector <16 x i32> %54, <16 x i32> %55, <16 x i32>
  %60 = add nsw <16 x i32> %56, %58
  %61 = sub nsw <16 x i32> %57, %59
  %62 = shufflevector <16 x i32> %60, <16 x i32> %61, <16 x i32>
  %63 = lshr <16 x i32> %62,
  %64 = and <16 x i32> %63,
  %65 = mul nuw <16 x i32> %64,
  %66 = add <16 x i32> %65, %62
  %67 = xor <16 x i32> %66, %65
  %68 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %67)
  %conv118 = and i32 %68, 65535
  %shr = lshr i32 %68, 16
  %add119 = add nuw nsw i32 %conv118, %shr
  %shr120 = lshr i32 %add119, 1
  ret i32 %shr120
}

; Same computation again; here all the address computations are hoisted ahead
; of the loads, and the row loads are interleaved with the shuffle tree.
define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sxtw x8, w1
; CHECK-NEXT:    sxtw x9, w3
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x2]
; CHECK-NEXT:    add x10, x0, x8
; CHECK-NEXT:    add x11, x2, x9
; CHECK-NEXT:    ldr d2, [x10]
; CHECK-NEXT:    ldr d3, [x11]
; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    add x10, x10, x8
; CHECK-NEXT:    add x11, x11, x9
; CHECK-NEXT:    usubl v1.8h, v2.8b, v3.8b
; CHECK-NEXT:    ldr d2, [x10, x8]
; CHECK-NEXT:    ldr d3, [x11, x9]
; CHECK-NEXT:    ldr d4, [x10]
; CHECK-NEXT:    ldr d5, [x11]
; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    usubl v3.8h, v4.8b, v5.8b
; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
; CHECK-NEXT:    shll2 v5.4s, v1.8h, #16
; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
; CHECK-NEXT:    shll2 v4.4s, v2.8h, #16
; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
; CHECK-NEXT:    shll2 v5.4s, v3.8h, #16
; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
; CHECK-NEXT:    saddw v3.4s, v5.4s, v3.4h
; CHECK-NEXT:    rev64 v4.4s, v0.4s
; CHECK-NEXT:    rev64 v5.4s, v1.4s
; CHECK-NEXT:    rev64 v6.4s, v2.4s
; CHECK-NEXT:    rev64 v7.4s, v3.4s
; CHECK-NEXT:    sub v4.4s, v0.4s, v4.4s
; CHECK-NEXT:    addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    sub v5.4s, v1.4s, v5.4s
; CHECK-NEXT:    sub v6.4s, v2.4s, v6.4s
; CHECK-NEXT:    addp v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    sub v1.4s, v3.4s, v7.4s
; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    ext v7.16b, v5.16b, v4.16b, #4
; CHECK-NEXT:    mov v4.s[3], v5.s[2]
; CHECK-NEXT:    zip2 v16.4s, v6.4s, v1.4s
; CHECK-NEXT:    zip1 v1.4s, v6.4s, v1.4s
; CHECK-NEXT:    uzp2 v6.4s, v2.4s, v0.4s
; CHECK-NEXT:    ext v5.16b, v7.16b, v5.16b, #4
; CHECK-NEXT:    uzp1 v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    uzp1 v7.4s, v2.4s, v3.4s
; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    mov v16.d[1], v4.d[1]
; CHECK-NEXT:    rev64 v3.4s, v6.4s
; CHECK-NEXT:    mov v1.d[1], v5.d[1]
; CHECK-NEXT:    rev64 v0.4s, v0.4s
; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
; CHECK-NEXT:    sub v4.4s, v1.4s, v16.4s
; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
; CHECK-NEXT:    add v1.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip1 v3.4s, v2.4s, v4.4s
; CHECK-NEXT:    zip1 v5.4s, v0.4s, v1.4s
; CHECK-NEXT:    uzp2 v6.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip2 v7.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip2 v17.4s, v2.4s, v4.4s
; CHECK-NEXT:    ext v16.16b, v2.16b, v3.16b, #8
; CHECK-NEXT:    trn2 v5.4s, v0.4s, v5.4s
; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v0.4s
; CHECK-NEXT:    mov v2.s[3], v4.s[2]
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    mov v5.d[1], v16.d[1]
; CHECK-NEXT:    mov v6.d[1], v17.d[1]
; CHECK-NEXT:    mov v7.d[1], v2.d[1]
; CHECK-NEXT:    mov v0.d[1], v3.d[1]
; CHECK-NEXT:    add v1.4s, v6.4s, v7.4s
; CHECK-NEXT:    sub v2.4s, v7.4s, v6.4s
; CHECK-NEXT:    add v3.4s, v5.4s, v0.4s
; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
; CHECK-NEXT:    zip2 v4.4s, v1.4s, v2.4s
; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #4
; CHECK-NEXT:    ext v6.16b, v3.16b, v3.16b, #4
; CHECK-NEXT:    zip2 v7.4s, v2.4s, v1.4s
; CHECK-NEXT:    zip2 v16.4s, v0.4s, v3.4s
; CHECK-NEXT:    zip2 v17.4s, v3.4s, v0.4s
; CHECK-NEXT:    zip1 v1.4s, v1.4s, v2.4s
; CHECK-NEXT:    ext v2.16b, v5.16b, v2.16b, #8
; CHECK-NEXT:    ext v18.16b, v6.16b, v0.16b, #8
; CHECK-NEXT:    zip1 v0.4s, v3.4s, v0.4s
; CHECK-NEXT:    add v3.4s, v16.4s, v7.4s
; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
; CHECK-NEXT:    ext v2.16b, v2.16b, v5.16b, #4
; CHECK-NEXT:    ext v5.16b, v18.16b, v6.16b, #4
; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    cmlt v1.8h, v4.8h, #0
; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
; CHECK-NEXT:    add v4.4s, v1.4s, v4.4s
; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    lsr w9, w8, #16
; CHECK-NEXT:    add w8, w9, w8, uxth
; CHECK-NEXT:    lsr w0, w8, #1
; CHECK-NEXT:    ret
entry:
  %idx.ext = sext i32 %i1 to i64
  %idx.ext63 = sext i32 %i2 to i64
  %arrayidx3 = getelementptr inbounds i8, ptr %p1, i64 4
  %arrayidx5 = getelementptr inbounds i8, ptr %p2, i64 4
  %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
  %add.ptr64 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext63
  %arrayidx3.1 = getelementptr inbounds i8, ptr %add.ptr, i64 4
  %arrayidx5.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 4
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
  %arrayidx3.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 4
  %arrayidx5.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 4
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
  %arrayidx3.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 4
  %arrayidx5.3 = getelementptr inbounds i8, ptr %add.ptr64.2, i64 4
  %0 = load <4 x i8>, ptr %p1, align 1
  %1 = load <4 x i8>, ptr %p2, align 1
  %2 = load <4 x i8>, ptr %arrayidx3, align 1
  %3 = load <4 x i8>, ptr %arrayidx5, align 1
  %4 = load <4 x i8>, ptr %add.ptr, align 1
  %5 = load <4 x i8>, ptr %add.ptr64, align 1
  %6 = load <4 x i8>, ptr %arrayidx3.1, align 1
  %7 = load <4 x i8>, ptr %arrayidx5.1, align 1
  %8 = load <4 x i8>, ptr %add.ptr.1, align 1
  %9 = load <4 x i8>, ptr %add.ptr64.1, align 1
  %10 = load <4 x i8>, ptr %arrayidx3.2, align 1
  %11 = load <4 x i8>, ptr %arrayidx5.2, align 1
  %12 = load <4 x i8>, ptr %add.ptr.2, align 1
  %13 = shufflevector <4 x i8> %12, <4 x i8> %8, <16 x i32>
  %14 = shufflevector <4 x i8> %4, <4 x i8> poison, <16 x i32>
  %15 = shufflevector <16 x i8> %13, <16 x i8> %14, <16 x i32>
  %16 = shufflevector <4 x i8> %0, <4 x i8> poison, <16 x i32>
  %17 = shufflevector <16 x i8> %15, <16 x i8> %16, <16 x i32>
  %18 = zext <16 x i8> %17 to <16 x i32>
  %19 = load <4 x i8>, ptr %add.ptr64.2, align 1
  %20 = shufflevector <4 x i8> %19, <4 x i8> %9, <16 x i32>
  %21 = shufflevector <4 x i8> %5, <4 x i8> poison, <16 x i32>
  %22 = shufflevector <16 x i8> %20, <16 x i8> %21, <16 x i32>
  %23 = shufflevector <4 x i8> %1, <4 x i8> poison, <16 x i32>
  %24 = shufflevector <16 x i8> %22, <16 x i8> %23, <16 x i32>
  %25 = zext <16 x i8> %24 to <16 x i32>
  %26 = sub nsw <16 x i32> %18, %25
  %27 = load <4 x i8>, ptr %arrayidx3.3, align 1
  %28 = shufflevector <4 x i8> %27, <4 x i8> %10, <16 x i32>
  %29 = shufflevector <4 x i8> %6, <4 x i8> poison, <16 x i32>
  %30 = shufflevector <16 x i8> %28, <16 x i8> %29, <16 x i32>
  %31 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32>
  %32 = shufflevector <16 x i8> %30, <16 x i8> %31, <16 x i32>
  %33 = zext <16 x i8> %32 to <16 x i32>
  %34 = load <4 x i8>, ptr %arrayidx5.3, align 1
  %35 = shufflevector <4 x i8> %34, <4 x i8> %11, <16 x i32>
  %36 = shufflevector <4 x i8> %7, <4 x i8> poison, <16 x i32>
  %37 = shufflevector <16 x i8> %35, <16 x i8> %36, <16 x i32>
  %38 = shufflevector <4 x i8> %3, <4 x i8> poison, <16 x i32>
  %39 = shufflevector <16 x i8> %37, <16 x i8> %38, <16 x i32>
  %40 = zext <16 x i8> %39 to <16 x i32>
  %41 = sub nsw <16 x i32> %33, %40
  %42 = shl nsw <16 x i32> %41,
  %43 = add nsw <16 x i32> %42, %26
  %reorder = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32>
  %44 = add nsw <16 x i32> %43, %reorder
  %45 = sub nsw <16 x i32> %43, %reorder
  %46 = shufflevector <16 x i32> %44, <16 x i32> %45, <16 x i32>
  %47 = shufflevector <16 x i32> %44, <16 x i32> %45, <16 x i32>
  %48 = shufflevector <16 x i32> %44, <16 x i32> %45, <16 x i32>
  %49 = shufflevector <16 x i32> %44, <16 x i32> %45, <16 x i32>
  %50 = add nsw <16 x i32> %46, %48
  %51 = sub nsw <16 x i32> %47, %49
  %52 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %53 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %54 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %55 = shufflevector <16 x i32> %50, <16 x i32> %51, <16 x i32>
  %56 = add nsw <16 x i32> %52, %54
  %57 = sub nsw <16 x i32> %53, %55
  %58 = shufflevector <16 x i32> %56, <16 x i32> %57, <16 x i32>
  %59 = shufflevector <16 x i32> %56, <16 x i32> %57, <16 x i32>
  %60 = shufflevector <16 x i32> %56, <16 x i32> %57, <16 x i32>
  %61 = shufflevector <16 x i32> %56, <16 x i32> %57, <16 x i32>
  %62 = add nsw <16 x i32> %58, %60
  %63 = sub nsw <16 x i32> %59, %61
  %64 = shufflevector <16 x i32> %62, <16 x i32> %63, <16 x i32>
  %65 = lshr <16 x i32> %64,
  %66 = and <16 x i32> %65,
  %67 = mul nuw <16 x i32> %66,
  %68 = add <16 x i32> %67, %64
  %69 = xor <16 x i32> %68, %67
  %70 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %69)
  %conv118 = and i32 %70, 65535
  %shr = lshr i32 %70, 16
  %add119 = add nuw nsw i32 %conv118, %shr
  %shr120 = lshr i32 %add119, 1
  ret i32 %shr120
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)