; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

; One dimensional loop with a load that can be hoisted out of the loop:
;   for (int i = 0; i < N; ++i)
;     if (!memcmp(a[i], b, 4))
;       sum += 1;
;
; The IR calls @bcmp(a[i], b, 4) and counts the iterations where it returns 0.
; The expected code has no call in the loop: each side is read with a 4-byte
; load and the two values are compared. The load of `b` (ldr w9, [x1]) is
; emitted once in %entry, before the %for.body loop header; the loads through
; a[i] stay inside the loop.
;
define i64 @one_dimensional(ptr %a, ptr %b, i64 %N) {
; CHECK-LABEL: one_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x10, [x0], #8
; CHECK-NEXT:    ldr w10, [x10]
; CHECK-NEXT:    cmp w10, w9
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x2, x2, #1
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
  %0 = load ptr, ptr %arrayidx, align 8
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.05, %add
  %inc = add nuw i64 %i.06, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                         ; preds = %for.body
  ret i64 %spec.select
}

; Same but loop is two dimensional.
; Load is hoisted outside of both loops
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       if (!memcmp(a[i][j], b, 4))
;         sum += 1;
;
; In the expected code the 4-byte load of `b` (ldr w10, [x1]) is in %entry,
; ahead of both the %for.cond1.preheader (depth 1) and %for.body4 (depth 2)
; loop headers.
;
define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M) {
; CHECK-LABEL: two_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB1_2 Depth 2
; CHECK-NEXT:    ldr x11, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x12, x3
; CHECK-NEXT:  .LBB1_2: // %for.body4
; CHECK-NEXT:    // Parent Loop BB1_1 Depth=1
; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
; CHECK-NEXT:    ldr x13, [x11], #8
; CHECK-NEXT:    ldr w13, [x13]
; CHECK-NEXT:    cmp w13, w10
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x12, x12, #1
; CHECK-NEXT:    b.ne .LBB1_2
; CHECK-NEXT:  // %bb.3: // %for.cond1.for.exit3_crit_edge
; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.4: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %entry, %for.cond1.for.exit3_crit_edge
  %i.019 = phi i64 [ %inc7, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
  %sum.018 = phi i64 [ %spec.select, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.body4

for.body4:                                        ; preds = %for.cond1.preheader, %for.body4
  %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
  %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
  %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
  %1 = load ptr, ptr %arrayidx5, align 8
  %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.115, %add
  %inc = add nuw i64 %j.016, 1
  %exitcond = icmp eq i64 %inc, %M
  br i1 %exitcond, label %for.cond1.for.exit3_crit_edge, label %for.body4

for.cond1.for.exit3_crit_edge:                    ; preds = %for.body4
  %inc7 = add nuw i64 %i.019, 1
  %exitcond22 = icmp eq i64 %inc7, %N
  br i1 %exitcond22, label %for.exit, label %for.cond1.preheader

for.exit:                                         ; preds = %for.cond1.for.exit3_crit_edge
  ret i64 %spec.select
}

; Same but loop is three dimensional. Load is hoisted outside of all three loops
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       for (int k = 0; k < K; ++k)
;         if (!memcmp(a[i][j][k], b, 4))
;           sum += 1;
;
; In the expected code the 4-byte load of `b` (ldr w10, [x1]) is in %entry,
; ahead of the depth-1, depth-2 and depth-3 loop headers.
;
define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: three_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB2_2 Depth 2
; CHECK-NEXT:    // Child Loop BB2_3 Depth 3
; CHECK-NEXT:    ldr x11, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x12, xzr
; CHECK-NEXT:  .LBB2_2: // %for.cond5.preheader
; CHECK-NEXT:    // Parent Loop BB2_1 Depth=1
; CHECK-NEXT:    // => This Loop Header: Depth=2
; CHECK-NEXT:    // Child Loop BB2_3 Depth 3
; CHECK-NEXT:    ldr x13, [x11, x12, lsl #3]
; CHECK-NEXT:    mov x14, x4
; CHECK-NEXT:  .LBB2_3: // %for.body8
; CHECK-NEXT:    // Parent Loop BB2_1 Depth=1
; CHECK-NEXT:    // Parent Loop BB2_2 Depth=2
; CHECK-NEXT:    // => This Inner Loop Header: Depth=3
; CHECK-NEXT:    ldr x15, [x13], #8
; CHECK-NEXT:    ldr w15, [x15]
; CHECK-NEXT:    cmp w15, w10
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x14, x14, #1
; CHECK-NEXT:    b.ne .LBB2_3
; CHECK-NEXT:  // %bb.4: // %for.cond5.for.cond
; CHECK-NEXT:    // in Loop: Header=BB2_2 Depth=2
; CHECK-NEXT:    add x12, x12, #1
; CHECK-NEXT:    cmp x12, x3
; CHECK-NEXT:    b.ne .LBB2_2
; CHECK-NEXT:  // %bb.5: // %for.cond1.for.cond
; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.6: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %entry, %for.cond1.for.cond
  %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
  %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.cond5.preheader

for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
  %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
  %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
  %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
  %1 = load ptr, ptr %arrayidx9, align 8
  br label %for.body8

for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
  %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
  %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
  %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
  %2 = load ptr, ptr %arrayidx10, align 8
  %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.225, %add
  %inc = add nuw i64 %k.026, 1
  %exitcond = icmp eq i64 %inc, %K
  br i1 %exitcond, label %for.cond5.for.cond, label %for.body8

for.cond5.for.cond:                               ; preds = %for.body8
  %inc12 = add nuw i64 %j.029, 1
  %exitcond44 = icmp eq i64 %inc12, %M
  br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader

for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
  %inc15 = add nuw i64 %i.033, 1
  %exitcond45 = icmp eq i64 %inc15, %N
  br i1 %exitcond45, label %for.exit, label %for.cond1.preheader

for.exit:                                         ; preds = %for.cond1.for.cond
  ret i64 %spec.select
}

; Three dimensional loop but `b` is invariant only with respect to the innermost loop.
; Make sure that the load is hoisted only out of the innermost loop
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       for (int k = 0; k < K; ++k)
;         if (!memcmp(a[i][j][k], b[j], 4))
;           sum += 1;
;
; Here the compared pointer is b[j] (%2, loaded in %for.cond5.preheader), so
; in the expected code its 4-byte load (ldr w13, [x13]) sits in the depth-2
; block %for.cond5.preheader, just before the depth-3 header %for.body8, and
; not in %entry.
;
define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: three_dimensional_middle:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB3_2 Depth 2
; CHECK-NEXT:    // Child Loop BB3_3 Depth 3
; CHECK-NEXT:    ldr x10, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x11, xzr
; CHECK-NEXT:  .LBB3_2: // %for.cond5.preheader
; CHECK-NEXT:    // Parent Loop BB3_1 Depth=1
; CHECK-NEXT:    // => This Loop Header: Depth=2
; CHECK-NEXT:    // Child Loop BB3_3 Depth 3
; CHECK-NEXT:    lsl x12, x11, #3
; CHECK-NEXT:    mov x14, x4
; CHECK-NEXT:    ldr x13, [x1, x12]
; CHECK-NEXT:    ldr x12, [x10, x12]
; CHECK-NEXT:    ldr w13, [x13]
; CHECK-NEXT:  .LBB3_3: // %for.body8
; CHECK-NEXT:    // Parent Loop BB3_1 Depth=1
; CHECK-NEXT:    // Parent Loop BB3_2 Depth=2
; CHECK-NEXT:    // => This Inner Loop Header: Depth=3
; CHECK-NEXT:    ldr x15, [x12], #8
; CHECK-NEXT:    ldr w15, [x15]
; CHECK-NEXT:    cmp w15, w13
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x14, x14, #1
; CHECK-NEXT:    b.ne .LBB3_3
; CHECK-NEXT:  // %bb.4: // %for.cond5.for.cond
; CHECK-NEXT:    // in Loop: Header=BB3_2 Depth=2
; CHECK-NEXT:    add x11, x11, #1
; CHECK-NEXT:    cmp x11, x3
; CHECK-NEXT:    b.ne .LBB3_2
; CHECK-NEXT:  // %bb.5: // %for.cond1.for.cond
; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.6: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %entry, %for.cond1.for.cond
  %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
  %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.cond5.preheader

for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
  %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
  %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
  %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
  %1 = load ptr, ptr %arrayidx9, align 8
  %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031
  %2 = load ptr, ptr %arrayidx11, align 8
  br label %for.body8

for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
  %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
  %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
  %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
  %3 = load ptr, ptr %arrayidx10, align 8
  %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.227, %add
  %inc = add nuw i64 %k.028, 1
  %exitcond = icmp eq i64 %inc, %K
  br i1 %exitcond, label %for.cond5.for.cond, label %for.body8

for.cond5.for.cond:                               ; preds = %for.body8
  %inc13 = add nuw i64 %j.031, 1
  %exitcond46 = icmp eq i64 %inc13, %M
  br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader

for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
  %inc16 = add nuw i64 %i.035, 1
  %exitcond47 = icmp eq i64 %inc16, %N
  br i1 %exitcond47, label %for.exit, label %for.cond1.preheader

for.exit:                                         ; preds = %for.cond1.for.cond
  ret i64 %spec.select
}

; Make sure that store inside loop prevents hoisting invariant loads
;   for (int i = 0; i < N; ++i)
;     c[i] = memcmp(a[i], b, 4);
;
; The loop stores the truncated @memcmp result to c[i]. In the expected code
; the load of `b` (ldr w10, [x1]) remains inside %for.body, next to the load
; through a[i], so it is repeated on every iteration.
;
define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) {
; CHECK-LABEL: one_dimensional_with_store:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w8, w3
; CHECK-NEXT:  .LBB4_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x9, [x0], #8
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    ldr w9, [x9]
; CHECK-NEXT:    rev w10, w10
; CHECK-NEXT:    rev w9, w9
; CHECK-NEXT:    cmp w9, w10
; CHECK-NEXT:    cset w9, hi
; CHECK-NEXT:    cset w10, lo
; CHECK-NEXT:    subs x8, x8, #1
; CHECK-NEXT:    sub w9, w9, w10
; CHECK-NEXT:    strb w9, [x2], #1
; CHECK-NEXT:    b.ne .LBB4_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    ret
entry:
  br label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
  %0 = load ptr, ptr %arrayidx, align 8
  %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4)
  %conv = trunc i32 %call to i8
  %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
  store i8 %conv, ptr %arrayidx2, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.exit, label %for.body

for.exit:                                         ; preds = %for.body
  ret void
}

; Make sure that call inside loop prevents hoisting invariant loads
;   for (int i = 0; i < N; ++i) {
;     if (!memcmp(a[i], b, 4))
;       sum += 1;
;     func();
;   }
;
; In the expected code the load of `b` (ldr w9, [x19]) remains inside
; %for.body, in the same loop body as the `bl func` call.
;
define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N) {
; CHECK-LABEL: one_dimensional_with_call:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w19, -8
; CHECK-NEXT:    .cfi_offset w20, -16
; CHECK-NEXT:    .cfi_offset w21, -24
; CHECK-NEXT:    .cfi_offset w22, -32
; CHECK-NEXT:    .cfi_offset w30, -48
; CHECK-NEXT:    mov x19, x1
; CHECK-NEXT:    mov x21, x0
; CHECK-NEXT:    mov w20, wzr
; CHECK-NEXT:    mov w22, w2
; CHECK-NEXT:  .LBB5_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x8, [x21], #8
; CHECK-NEXT:    ldr w9, [x19]
; CHECK-NEXT:    ldr w8, [x8]
; CHECK-NEXT:    cmp w8, w9
; CHECK-NEXT:    cinc w20, w20, eq
; CHECK-NEXT:    bl func
; CHECK-NEXT:    subs x22, x22, #1
; CHECK-NEXT:    b.ne .LBB5_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov w0, w20
; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  br label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
  %0 = load ptr, ptr %arrayidx, align 8
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
  %tobool.not = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool.not to i32
  %spec.select = add nuw nsw i32 %sum.05, %add
  tail call void @func()
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.exit, label %for.body

for.exit:                                         ; preds = %for.body
  ret i32 %spec.select
}

; One dimensional loop with memcmp size equal six.
; The test shows that several loads can be hoisted at the same time.
;   for (int i = 0; i < N; ++i)
;     if (!memcmp(a[i], b, 6))
;       sum += 1;
;
; In the expected code each side of the 6-byte compare is read with a 4-byte
; load plus a 2-byte load at offset 4. Both loads of `b` (ldr w9, [x1] and
; ldrh w10, [x1, #4]) are in %entry, before the %for.body loop header.
;
define i64 @one_dimensional_two_loads(ptr %a, ptr %b, i64 %N) {
; CHECK-LABEL: one_dimensional_two_loads:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    ldrh w10, [x1, #4]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB6_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x11, [x0], #8
; CHECK-NEXT:    ldr w12, [x11]
; CHECK-NEXT:    ldrh w11, [x11, #4]
; CHECK-NEXT:    cmp w12, w9
; CHECK-NEXT:    ccmp w11, w10, #0, eq
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x2, x2, #1
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
  %0 = load ptr, ptr %arrayidx, align 8
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 6)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.05, %add
  %inc = add nuw i64 %i.06, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                         ; preds = %for.body
  ret i64 %spec.select
}

; See issue https://github.com/llvm/llvm-project/issues/72855
;
; When hoisting an instruction out of the loop, ensure that loads are not
; eliminated as common subexpressions.
; In this example pointer %c may alias pointer %b,
; so when hoisting the `%y = load i64, ptr %b` instruction we can't replace it
; with `%b.val = load i64, ptr %b`
;
; %entry loads %b, adds 1 and stores the result through %c before the loop is
; entered. The expected code therefore keeps two separate loads from x1 in
; %entry: `ldr x8, [x1]` before the `str x8, [x2]`, and `ldr x9, [x1]` (the
; hoisted %y) after it.
;
define i64 @hoisting_no_cse(ptr %a, ptr %b, ptr %c, i64 %N) {
; CHECK-LABEL: hoisting_no_cse:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr x8, [x1]
; CHECK-NEXT:    add x8, x8, #1
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    ldr x9, [x1]
; CHECK-NEXT:  .LBB7_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x10, [x0], #8
; CHECK-NEXT:    ldr x10, [x10]
; CHECK-NEXT:    cmp x10, x9
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x3, x3, #1
; CHECK-NEXT:    b.ne .LBB7_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
entry:
  %b.val = load i64, ptr %b
  %b.val.changed = add i64 %b.val, 1
  store i64 %b.val.changed, ptr %c
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %idx = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %idx
  %0 = load ptr, ptr %arrayidx, align 8
  %x = load i64, ptr %0
  %y = load i64, ptr %b
  %cmp = icmp eq i64 %x, %y
  %add = zext i1 %cmp to i64
  %spec.select = add i64 %sum, %add
  %inc = add nuw i64 %idx, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                         ; preds = %for.body
  ret i64 %spec.select
}

declare i32 @bcmp(ptr, ptr, i64)
declare i32 @memcmp(ptr, ptr, i64)
declare void @func()