; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9

; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
; no expensive address computation in the preheader
; no complex address modes
;
; IR shape: %iv starts at %a, and each iteration loads four i32 values, from
; %iv and from three further addresses that are each %x elements past the
; previous one. The four values are added to the running sum %s, and the loop
; exits once the pointer advanced a fourth time (%iv4) equals %b. The final
; sum is returned.
define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: simple:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, lr}
; A9-NEXT: push {r4, r5, r6, lr}
; A9-NEXT: mov r3, r0
; A9-NEXT: lsls r2, r2, #2
; A9-NEXT: movs r0, #0
; A9-NEXT: .LBB0_1: @ %loop
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: add.w lr, r3, r2
; A9-NEXT: ldr.w r12, [r3, r2]
; A9-NEXT: ldr r3, [r3]
; A9-NEXT: add.w r4, lr, r2
; A9-NEXT: ldr.w r6, [lr, r2]
; A9-NEXT: add r0, r3
; A9-NEXT: adds r3, r4, r2
; A9-NEXT: add r0, r12
; A9-NEXT: ldr r5, [r4, r2]
; A9-NEXT: add r0, r6
; A9-NEXT: add r3, r2
; A9-NEXT: add r0, r5
; A9-NEXT: cmp r3, r1
; A9-NEXT: bne .LBB0_1
; A9-NEXT: @ %bb.2: @ %exit
; A9-NEXT: pop {r4, r5, r6, pc}
entry:
  br label %loop
loop:
  ; Loop-carried state: the current pointer and the running sum.
  %iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  ; Four loads; each address is derived from the previous one by a GEP of %x
  ; i32 elements.
  %v = load i32, ptr %iv
  %iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
  %v1 = load i32, ptr %iv1
  %iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
  %v2 = load i32, ptr %iv2
  %iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
  %v3 = load i32, ptr %iv3
  ; Accumulate the four loaded values into the sum.
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  ; One more step of %x elements gives the next iteration's pointer, which is
  ; also the value compared against %b for the exit test.
  %iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
  %cmp = icmp eq ptr %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @user is not currently chained because the IV is live across memory ops.
;
; stride multiples computed in the preheader
; complex address modes
;
; IR shape: the loop body matches @simple with one addition: the new sum %s4
; is stored back through %iv after %iv4 has been computed.
define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: user:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, r7, lr}
; A9-NEXT: push {r4, r5, r6, r7, lr}
; A9-NEXT: add.w r3, r2, r2, lsl #1
; A9-NEXT: lsl.w r12, r2, #4
; A9-NEXT: lsl.w lr, r3, #2
; A9-NEXT: movs r3, #0
; A9-NEXT: .LBB1_1: @ %loop
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: ldr r4, [r0]
; A9-NEXT: ldr.w r5, [r0, r2, lsl #3]
; A9-NEXT: ldr.w r6, [r0, r2, lsl #2]
; A9-NEXT: add r3, r4
; A9-NEXT: ldr.w r7, [r0, lr]
; A9-NEXT: add r3, r6
; A9-NEXT: add r3, r5
; A9-NEXT: add r3, r7
; A9-NEXT: str r3, [r0]
; A9-NEXT: add r0, r12
; A9-NEXT: cmp r0, r1
; A9-NEXT: bne .LBB1_1
; A9-NEXT: @ %bb.2: @ %exit
; A9-NEXT: mov r0, r3
; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  br label %loop
loop:
  ; Loop-carried state: the current pointer and the running sum.
  %iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  ; Four loads; each address is derived from the previous one by a GEP of %x
  ; i32 elements.
  %v = load i32, ptr %iv
  %iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
  %v1 = load i32, ptr %iv1
  %iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
  %v2 = load i32, ptr %iv2
  %iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
  %v3 = load i32, ptr %iv3
  ; Accumulate the four loaded values into the sum.
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
  ; This store uses the original %iv at the end of the iteration, so %iv is
  ; still needed after all four loads.
  store i32 %s4, ptr %iv
  %cmp = icmp eq ptr %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
; no spills
; only one stride multiple in the preheader
; no complex address modes or reloads
;
; IR shape: the preheader computes s*2, s*3, s*4 and s*5+x (s = %main_stride).
; Each iteration loads five i32 values at byte offsets 0, s, s*2, s*3 and s*4
; from the current %main pointer and stores their sum through the current
; %res pointer. It then advances %main by s*5+x bytes and %res by %y i32
; elements. The loop runs until the counter reaches %z and is skipped
; entirely when %z is 0.
define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; A9-LABEL: extrastride:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, r7, lr}
; A9-NEXT: push {r4, r5, r6, r7, lr}
; A9-NEXT: ldr.w r12, [sp, #24]
; A9-NEXT: cmp.w r12, #0
; A9-NEXT: beq .LBB2_3
; A9-NEXT: @ %bb.1: @ %for.body.lr.ph
; A9-NEXT: ldr r4, [sp, #20]
; A9-NEXT: add.w lr, r3, r1
; A9-NEXT: lsls r3, r4, #2
; A9-NEXT: .LBB2_2: @ %for.body
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: adds r5, r0, r1
; A9-NEXT: ldr r4, [r0, r1]
; A9-NEXT: ldr r0, [r0]
; A9-NEXT: subs.w r12, r12, #1
; A9-NEXT: ldr r6, [r5, r1]
; A9-NEXT: add r5, r1
; A9-NEXT: add r0, r4
; A9-NEXT: ldr r7, [r5, r1]
; A9-NEXT: add r5, r1
; A9-NEXT: add r0, r6
; A9-NEXT: ldr r4, [r5, r1]
; A9-NEXT: add r0, r7
; A9-NEXT: add r0, r4
; A9-NEXT: str r0, [r2]
; A9-NEXT: add.w r0, r5, r1
; A9-NEXT: add r2, r3
; A9-NEXT: add r0, lr
; A9-NEXT: bne .LBB2_2
; A9-NEXT: .LBB2_3: @ %for.end
; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  ; Zero trip count: bypass the preheader and the loop.
  %cmp8 = icmp eq i32 %z, 0
  br i1 %cmp8, label %for.end, label %for.body.lr.ph

for.body.lr.ph: ; preds = %entry
  ; Loop-invariant multiples of the stride, each applied to the same base
  ; pointer inside the loop.
  %add.ptr.sum = shl i32 %main_stride, 1 ; s*2
  %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
  %add.ptr2.sum = add i32 %x, %main_stride ; s + x
  %add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
  %add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
  br label %for.body

for.body: ; preds = %for.body.lr.ph, %for.body
  ; Loop-carried state: source pointer, iteration counter, destination pointer.
  %main.addr.011 = phi ptr [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %res.addr.09 = phi ptr [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
  ; Five loads, all addressed as byte offsets (i8 GEPs) from %main.addr.011.
  %0 = load i32, ptr %main.addr.011, align 4
  %add.ptr = getelementptr inbounds i8, ptr %main.addr.011, i32 %main_stride
  %1 = load i32, ptr %add.ptr, align 4
  %add.ptr1 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr.sum
  %2 = load i32, ptr %add.ptr1, align 4
  %add.ptr2 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr1.sum
  %3 = load i32, ptr %add.ptr2, align 4
  %add.ptr3 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr4.sum
  %4 = load i32, ptr %add.ptr3, align 4
  ; Sum the five values and store the result through the destination pointer.
  %add = add i32 %1, %0
  %add4 = add i32 %add, %2
  %add5 = add i32 %add4, %3
  %add6 = add i32 %add5, %4
  store i32 %add6, ptr %res.addr.09, align 4
  ; Advance the source by s*5+x bytes and the destination by %y i32 elements.
  %add.ptr6 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr3.sum
  %add.ptr7 = getelementptr inbounds i32, ptr %res.addr.09, i32 %y
  %inc = add i32 %i.010, 1
  %cmp = icmp eq i32 %inc, %z
  br i1 %cmp, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

; @foldedidx is an unrolled variant of this loop:
;  for (unsigned long i = 0; i < len; i += s) {
;    c[i] = a[i] + b[i];
;  }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
;
; IR shape: %i.07 counts from 0 in steps of 4 until it reaches 400. Each
; iteration handles four consecutive bytes: for k = 0..3 it loads a[i+k] and
; b[i+k], zero-extends both to i32, adds them, truncates the sum to i8 and
; stores it to c[i+k]. The indices for k = 1..3 are formed with `or disjoint`.
define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nounwind ssp {
; A9-LABEL: foldedidx:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, lr}
; A9-NEXT: push {r4, r5, r6, lr}
; A9-NEXT: mov.w lr, #0
; A9-NEXT: .LBB3_1: @ %for.body
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: ldrb.w r12, [r0, lr]
; A9-NEXT: add.w r4, r1, lr
; A9-NEXT: ldrb.w r3, [r1, lr]
; A9-NEXT: add r3, r12
; A9-NEXT: strb.w r3, [r2, lr]
; A9-NEXT: add.w r3, r0, lr
; A9-NEXT: ldrb.w r12, [r3, #1]
; A9-NEXT: ldrb r5, [r4, #1]
; A9-NEXT: add r12, r5
; A9-NEXT: add.w r5, r2, lr
; A9-NEXT: strb.w r12, [r5, #1]
; A9-NEXT: add.w lr, lr, #4
; A9-NEXT: cmp.w lr, #400
; A9-NEXT: ldrb.w r12, [r3, #2]
; A9-NEXT: ldrb r6, [r4, #2]
; A9-NEXT: add r6, r12
; A9-NEXT: strb r6, [r5, #2]
; A9-NEXT: ldrb r3, [r3, #3]
; A9-NEXT: ldrb r6, [r4, #3]
; A9-NEXT: add r3, r6
; A9-NEXT: strb r3, [r5, #3]
; A9-NEXT: bne .LBB3_1
; A9-NEXT: @ %bb.2: @ %for.end
; A9-NEXT: pop {r4, r5, r6, pc}
entry:
  br label %for.body

for.body: ; preds = %for.body, %entry
  %i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
  ; k = 0: c[i] = a[i] + b[i]
  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.07
  %0 = load i8, ptr %arrayidx, align 1
  %conv5 = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.07
  %1 = load i8, ptr %arrayidx1, align 1
  %conv26 = zext i8 %1 to i32
  %add = add nsw i32 %conv26, %conv5
  %conv3 = trunc i32 %add to i8
  %arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.07
  store i8 %conv3, ptr %arrayidx4, align 1
  ; k = 1: same computation at index i | 1
  %inc1 = or disjoint i32 %i.07, 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i32 %inc1
  %2 = load i8, ptr %arrayidx.1, align 1
  %conv5.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %b, i32 %inc1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %conv26.1 = zext i8 %3 to i32
  %add.1 = add nsw i32 %conv26.1, %conv5.1
  %conv3.1 = trunc i32 %add.1 to i8
  %arrayidx4.1 = getelementptr inbounds i8, ptr %c, i32 %inc1
  store i8 %conv3.1, ptr %arrayidx4.1, align 1
  ; k = 2: same computation at index i | 2
  %inc.12 = or disjoint i32 %i.07, 2
  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i32 %inc.12
  %4 = load i8, ptr %arrayidx.2, align 1
  %conv5.2 = zext i8 %4 to i32
  %arrayidx1.2 = getelementptr inbounds i8, ptr %b, i32 %inc.12
  %5 = load i8, ptr %arrayidx1.2, align 1
  %conv26.2 = zext i8 %5 to i32
  %add.2 = add nsw i32 %conv26.2, %conv5.2
  %conv3.2 = trunc i32 %add.2 to i8
  %arrayidx4.2 = getelementptr inbounds i8, ptr %c, i32 %inc.12
  store i8 %conv3.2, ptr %arrayidx4.2, align 1
  ; k = 3: same computation at index i | 3
  %inc.23 = or disjoint i32 %i.07, 3
  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i32 %inc.23
  %6 = load i8, ptr %arrayidx.3, align 1
  %conv5.3 = zext i8 %6 to i32
  %arrayidx1.3 = getelementptr inbounds i8, ptr %b, i32 %inc.23
  %7 = load i8, ptr %arrayidx1.3, align 1
  %conv26.3 = zext i8 %7 to i32
  %add.3 = add nsw i32 %conv26.3, %conv5.3
  %conv3.3 = trunc i32 %add.3 to i8
  %arrayidx4.3 = getelementptr inbounds i8, ptr %c, i32 %inc.23
  store i8 %conv3.3, ptr %arrayidx4.3, align 1
  ; Step the index by 4 and exit once it reaches 400.
  %inc.3 = add nsw i32 %i.07, 4
  %exitcond.3 = icmp eq i32 %inc.3, 400
  br i1 %exitcond.3, label %for.end, label %for.body

for.end: ; preds = %for.body
  ret void
}

; @testNeon is an important example of the need for ivchains.
;
; Loads and stores should use post-increment addressing, no add's or add.w's.
; Most importantly, there should be no spills or reloads!
define hidden void @testNeon(ptr %ref_data, i32 %ref_stride, i32 %limit, ptr nocapture %data) nounwind optsize {
; A9-LABEL: testNeon:
; A9: @ %bb.0:
; A9-NEXT: .save {r4, r5, r7, lr}
; A9-NEXT: push {r4, r5, r7, lr}
; A9-NEXT: vmov.i32 q8, #0x0
; A9-NEXT: cmp r2, #1
; A9-NEXT: blt .LBB4_4
; A9-NEXT: @ %bb.1: @ %.lr.ph
; A9-NEXT: movs r5, #0
; A9-NEXT: movw r4, #64464
; A9-NEXT: sub.w r12, r5, r2, lsl #6
; A9-NEXT: sub.w lr, r1, r1, lsl #4
; A9-NEXT: movt r4, #65535
; A9-NEXT: mov r5, r3
; A9-NEXT: .LBB4_2: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: vld1.64 {d18}, [r0], r1
; A9-NEXT: subs r2, #1
; A9-NEXT: vld1.64 {d19}, [r0], r1
; A9-NEXT: vst1.8 {d18, d19}, [r5]!
; A9-NEXT: vld1.64 {d20}, [r0], r1
; A9-NEXT: vld1.64 {d21}, [r0], r1
; A9-NEXT: vst1.8 {d20, d21}, [r5]!
; A9-NEXT: vld1.64 {d22}, [r0], r1
; A9-NEXT: vadd.i8 q9, q9, q10
; A9-NEXT: vld1.64 {d23}, [r0], r1
; A9-NEXT: vst1.8 {d22, d23}, [r5]!
; A9-NEXT: vld1.64 {d20}, [r0], r1
; A9-NEXT: vadd.i8 q9, q9, q11
; A9-NEXT: vld1.64 {d21}, [r0], lr
; A9-NEXT: vadd.i8 q9, q9, q10
; A9-NEXT: vadd.i8 q8, q8, q9
; A9-NEXT: vst1.8 {d20, d21}, [r5], r4
; A9-NEXT: bne .LBB4_2
; A9-NEXT: @ %bb.3: @ %._crit_edge
; A9-NEXT: add.w r3, r3, r12, lsl #4
; A9-NEXT: .LBB4_4:
; A9-NEXT: vst1.32 {d16, d17}, [r3]
; A9-NEXT: pop {r4, r5, r7, pc}
  %1 = icmp sgt i32 %limit, 0
  br i1 %1, label %.lr.ph, label %45

.lr.ph: ; preds = %0
  %2 = shl nsw i32 %ref_stride, 1
  %3 = mul nsw i32 %ref_stride, 3
  %4 = shl nsw i32 %ref_stride, 2
  %5 = mul nsw i32 %ref_stride, 5
  %6 = mul nsw i32 %ref_stride, 6
  %7 = mul nsw i32 %ref_stride, 7
  %8 = shl nsw i32 %ref_stride, 3
  %9 = sub i32 0, %8
  %10 = mul i32 %limit, -64
  br label %11

;