; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s

; Each function below is an already-vectorized loop that clamps floats read
; from %data to the range [0.0, 255.0] (values below 0.0 become 0.0, values
; above 255.0 become 255.0), converts them to i8 with fptoui, and stores the
; bytes to %dst. loopN processes N floats per scalar iteration; %width is the
; scalar trip count and the loop is skipped entirely when %width <= 0.
;
; NOTE(review): the <4 x float> 255.0 splat operands and the shufflevector
; masks are written out to match the scalar remainder loops and the
; ld2/ld3/ld4, trn1/tbl and dup-of-0x437f0000 patterns in the assertions.
; Confirm against upstream and re-run update_llc_test_checks.py if they change.

; loop1: 1 float -> 1 byte per scalar iteration. The vector body handles
; 8 elements per iteration as two contiguous <4 x float> loads and two
; <4 x i8> stores; for.body handles the remainder (or everything when
; %width - 1 < 7).
define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %data, i32 noundef %width) {
; CHECK-LABEL: loop1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subs w8, w2, #1
; CHECK-NEXT:    b.lt .LBB0_8
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    cmp w8, #6
; CHECK-NEXT:    b.hi .LBB0_3
; CHECK-NEXT:  // %bb.2:
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    mov x8, x1
; CHECK-NEXT:    mov x9, x0
; CHECK-NEXT:    b .LBB0_6
; CHECK-NEXT:  .LBB0_3: // %vector.ph
; CHECK-NEXT:    add x11, x8, #1
; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT:    add x12, x0, #4
; CHECK-NEXT:    and x10, x11, #0x1fffffff8
; CHECK-NEXT:    dup v0.4s, w8
; CHECK-NEXT:    add x13, x1, #16
; CHECK-NEXT:    add x8, x1, x10, lsl #2
; CHECK-NEXT:    add x9, x0, x10
; CHECK-NEXT:    mov x14, x10
; CHECK-NEXT:  .LBB0_4: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp q1, q2, [x13, #-16]
; CHECK-NEXT:    subs x14, x14, #8
; CHECK-NEXT:    add x13, x13, #32
; CHECK-NEXT:    fcmgt v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    fcmgt v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
; CHECK-NEXT:    fcmlt v6.4s, v2.4s, #0.0
; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
; CHECK-NEXT:    bit v2.16b, v0.16b, v4.16b
; CHECK-NEXT:    bic v1.16b, v1.16b, v5.16b
; CHECK-NEXT:    bic v2.16b, v2.16b, v6.16b
; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
; CHECK-NEXT:    xtn v1.4h, v1.4s
; CHECK-NEXT:    xtn v2.4h, v2.4s
; CHECK-NEXT:    xtn v1.8b, v1.8h
; CHECK-NEXT:    xtn v2.8b, v2.8h
; CHECK-NEXT:    mov v1.s[1], v2.s[0]
; CHECK-NEXT:    stur d1, [x12, #-4]
; CHECK-NEXT:    add x12, x12, #8
; CHECK-NEXT:    b.ne .LBB0_4
; CHECK-NEXT:  // %bb.5: // %middle.block
; CHECK-NEXT:    cmp x11, x10
; CHECK-NEXT:    b.eq .LBB0_8
; CHECK-NEXT:  .LBB0_6: // %for.body.preheader1
; CHECK-NEXT:    movi d0, #0000000000000000
; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
; CHECK-NEXT:    sub w10, w2, w10
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:  .LBB0_7: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr s2, [x8], #4
; CHECK-NEXT:    fcmp s2, s1
; CHECK-NEXT:    fcsel s3, s1, s2, gt
; CHECK-NEXT:    fcmp s2, #0.0
; CHECK-NEXT:    fcsel s2, s0, s3, mi
; CHECK-NEXT:    subs w10, w10, #1
; CHECK-NEXT:    fcvtzs w11, s2
; CHECK-NEXT:    strb w11, [x9], #1
; CHECK-NEXT:    b.ne .LBB0_7
; CHECK-NEXT:  .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  %cmp9 = icmp sgt i32 %width, 0
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = add i32 %width, -1
  %1 = zext i32 %0 to i64
  %2 = add nuw nsw i64 %1, 1
  ; Fewer than 8 iterations: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %0, 7
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = trip count rounded down to a multiple of 8.
  %n.vec = and i64 %2, 8589934584
  %ind.end = trunc i64 %n.vec to i32
  %ind.end14 = getelementptr float, ptr %data, i64 %n.vec
  %ind.end16 = getelementptr i8, ptr %dst, i64 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %data, i64 %index
  %next.gep18 = getelementptr i8, ptr %dst, i64 %index
  %wide.load = load <4 x float>, ptr %next.gep, align 4
  %3 = getelementptr float, ptr %next.gep, i64 4
  %wide.load20 = load <4 x float>, ptr %3, align 4
  %4 = fcmp olt <4 x float> %wide.load, zeroinitializer
  %5 = fcmp olt <4 x float> %wide.load20, zeroinitializer
  %6 = fcmp ogt <4 x float> %wide.load, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %7 = fcmp ogt <4 x float> %wide.load20, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %8 = select <4 x i1> %6, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %wide.load
  %9 = select <4 x i1> %7, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %wide.load20
  %10 = select <4 x i1> %4, <4 x float> zeroinitializer, <4 x float> %8
  %11 = select <4 x i1> %5, <4 x float> zeroinitializer, <4 x float> %9
  %12 = fptoui <4 x float> %10 to <4 x i8>
  %13 = fptoui <4 x float> %11 to <4 x i8>
  store <4 x i8> %12, ptr %next.gep18, align 1
  %14 = getelementptr i8, ptr %next.gep18, i64 4
  store <4 x i8> %13, ptr %14, align 1
  %index.next = add nuw i64 %index, 8
  %15 = icmp eq i64 %index.next, %n.vec
  br i1 %15, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %2, %n.vec
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.body.preheader21:                             ; preds = %for.body.preheader, %middle.block
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %ind.end, %middle.block ]
  %src.011.ph = phi ptr [ %data, %for.body.preheader ], [ %ind.end14, %middle.block ]
  %dst.addr.010.ph = phi ptr [ %dst, %for.body.preheader ], [ %ind.end16, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader21, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %src.011 = phi ptr [ %add.ptr, %for.body ], [ %src.011.ph, %for.body.preheader21 ]
  %dst.addr.010 = phi ptr [ %add.ptr2, %for.body ], [ %dst.addr.010.ph, %for.body.preheader21 ]
  %16 = load float, ptr %src.011, align 4
  %cmp.i = fcmp olt float %16, 0.000000e+00
  %cmp1.i = fcmp ogt float %16, 2.550000e+02
  %.x.i = select i1 %cmp1.i, float 2.550000e+02, float %16
  %retval.0.i = select i1 %cmp.i, float 0.000000e+00, float %.x.i
  %conv = fptoui float %retval.0.i to i8
  store i8 %conv, ptr %dst.addr.010, align 1
  %add.ptr = getelementptr inbounds float, ptr %src.011, i64 1
  %add.ptr2 = getelementptr inbounds i8, ptr %dst.addr.010, i64 1
  %inc = add nuw nsw i32 %i.012, 1
  %exitcond.not = icmp eq i32 %inc, %width
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; loop2: 2 floats -> 2 bytes per scalar iteration. The vector body loads
; <8 x float>, splits it into the even and odd lanes (stride 2), clamps and
; converts each half, then re-interleaves the two <4 x i8> results into one
; <8 x i8> store. A runtime overlap check (vector.memcheck) guards the
; vector path.
define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %data, i32 noundef %width) {
; CHECK-LABEL: loop2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subs w8, w2, #1
; CHECK-NEXT:    b.lt .LBB1_7
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    cmp w8, #2
; CHECK-NEXT:    b.ls .LBB1_4
; CHECK-NEXT:  // %bb.2: // %vector.memcheck
; CHECK-NEXT:    ubfiz x9, x8, #1, #32
; CHECK-NEXT:    add x9, x9, #2
; CHECK-NEXT:    add x10, x1, x9, lsl #2
; CHECK-NEXT:    cmp x10, x0
; CHECK-NEXT:    b.ls .LBB1_8
; CHECK-NEXT:  // %bb.3: // %vector.memcheck
; CHECK-NEXT:    add x9, x0, x9
; CHECK-NEXT:    cmp x9, x1
; CHECK-NEXT:    b.ls .LBB1_8
; CHECK-NEXT:  .LBB1_4:
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    mov x8, x1
; CHECK-NEXT:    mov x9, x0
; CHECK-NEXT:  .LBB1_5: // %for.body.preheader1
; CHECK-NEXT:    movi d0, #0000000000000000
; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
; CHECK-NEXT:    sub w10, w2, w10
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:  .LBB1_6: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp s2, s3, [x8], #8
; CHECK-NEXT:    fcmp s2, s1
; CHECK-NEXT:    fcsel s4, s1, s2, gt
; CHECK-NEXT:    fcmp s2, #0.0
; CHECK-NEXT:    fcsel s2, s0, s4, mi
; CHECK-NEXT:    fcmp s3, s1
; CHECK-NEXT:    fcsel s4, s1, s3, gt
; CHECK-NEXT:    fcmp s3, #0.0
; CHECK-NEXT:    fcvtzs w11, s2
; CHECK-NEXT:    fcsel s3, s0, s4, mi
; CHECK-NEXT:    subs w10, w10, #1
; CHECK-NEXT:    strb w11, [x9]
; CHECK-NEXT:    fcvtzs w12, s3
; CHECK-NEXT:    strb w12, [x9, #1]
; CHECK-NEXT:    add x9, x9, #2
; CHECK-NEXT:    b.ne .LBB1_6
; CHECK-NEXT:  .LBB1_7: // %for.cond.cleanup
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB1_8: // %vector.ph
; CHECK-NEXT:    add x11, x8, #1
; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT:    and x10, x11, #0x1fffffffc
; CHECK-NEXT:    dup v0.4s, w8
; CHECK-NEXT:    add x8, x1, x10, lsl #3
; CHECK-NEXT:    add x9, x0, x10, lsl #1
; CHECK-NEXT:    mov x12, x10
; CHECK-NEXT:  .LBB1_9: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v1.4s, v2.4s }, [x1], #32
; CHECK-NEXT:    subs x12, x12, #4
; CHECK-NEXT:    fcmgt v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    fcmgt v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
; CHECK-NEXT:    bsl v3.16b, v0.16b, v1.16b
; CHECK-NEXT:    bsl v4.16b, v0.16b, v2.16b
; CHECK-NEXT:    fcmlt v1.4s, v2.4s, #0.0
; CHECK-NEXT:    bic v2.16b, v3.16b, v5.16b
; CHECK-NEXT:    bic v1.16b, v4.16b, v1.16b
; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
; CHECK-NEXT:    xtn v2.4h, v2.4s
; CHECK-NEXT:    xtn v1.4h, v1.4s
; CHECK-NEXT:    trn1 v1.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d1, [x0], #8
; CHECK-NEXT:    b.ne .LBB1_9
; CHECK-NEXT:  // %bb.10: // %middle.block
; CHECK-NEXT:    cmp x11, x10
; CHECK-NEXT:    b.ne .LBB1_5
; CHECK-NEXT:    b .LBB1_7
entry:
  %cmp19 = icmp sgt i32 %width, 0
  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = add i32 %width, -1
  %1 = zext i32 %0 to i64
  %2 = add nuw nsw i64 %1, 1
  ; Fewer than 4 iterations: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %0, 3
  br i1 %min.iters.check, label %for.body.preheader35, label %vector.memcheck

vector.memcheck:                                  ; preds = %for.body.preheader
  ; %6 = 2 * %width elements; compare the end of each range with the start
  ; of the other and fall back to the scalar loop when both tests hold.
  %3 = add i32 %width, -1
  %4 = zext i32 %3 to i64
  %5 = shl nuw nsw i64 %4, 1
  %6 = add nuw nsw i64 %5, 2
  %scevgep = getelementptr i8, ptr %dst, i64 %6
  %scevgep24 = getelementptr float, ptr %data, i64 %6
  %bound0 = icmp ugt ptr %scevgep24, %dst
  %bound1 = icmp ugt ptr %scevgep, %data
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %for.body.preheader35, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec = trip count rounded down to a multiple of 4.
  %n.vec = and i64 %2, 8589934588
  %ind.end = trunc i64 %n.vec to i32
  %7 = shl nuw nsw i64 %n.vec, 1
  %ind.end27 = getelementptr float, ptr %data, i64 %7
  %8 = shl nuw nsw i64 %n.vec, 1
  %ind.end29 = getelementptr i8, ptr %dst, i64 %8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %9 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %data, i64 %9
  %10 = shl i64 %index, 1
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; De-interleave: even lanes, then odd lanes.
  %strided.vec = shufflevector <8 x float> %wide.vec, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec34 = shufflevector <8 x float> %wide.vec, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %11 = fcmp olt <4 x float> %strided.vec, zeroinitializer
  %12 = fcmp ogt <4 x float> %strided.vec, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %13 = select <4 x i1> %12, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec
  %14 = select <4 x i1> %11, <4 x float> zeroinitializer, <4 x float> %13
  %15 = fptoui <4 x float> %14 to <4 x i8>
  %16 = fcmp olt <4 x float> %strided.vec34, zeroinitializer
  %17 = fcmp ogt <4 x float> %strided.vec34, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %18 = select <4 x i1> %17, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec34
  %19 = select <4 x i1> %16, <4 x float> zeroinitializer, <4 x float> %18
  %20 = fptoui <4 x float> %19 to <4 x i8>
  %21 = getelementptr inbounds i8, ptr %dst, i64 %10
  ; Re-interleave the two byte vectors: a0 b0 a1 b1 a2 b2 a3 b3.
  %interleaved.vec = shufflevector <4 x i8> %15, <4 x i8> %20, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i8> %interleaved.vec, ptr %21, align 1
  %index.next = add nuw i64 %index, 4
  %22 = icmp eq i64 %index.next, %n.vec
  br i1 %22, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %2, %n.vec
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader35

for.body.preheader35:                             ; preds = %vector.memcheck, %for.body.preheader, %middle.block
  %i.022.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %ind.end, %middle.block ]
  %src.021.ph = phi ptr [ %data, %vector.memcheck ], [ %data, %for.body.preheader ], [ %ind.end27, %middle.block ]
  %dst.addr.020.ph = phi ptr [ %dst, %vector.memcheck ], [ %dst, %for.body.preheader ], [ %ind.end29, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader35, %for.body
  %i.022 = phi i32 [ %inc, %for.body ], [ %i.022.ph, %for.body.preheader35 ]
  %src.021 = phi ptr [ %add.ptr, %for.body ], [ %src.021.ph, %for.body.preheader35 ]
  %dst.addr.020 = phi ptr [ %add.ptr6, %for.body ], [ %dst.addr.020.ph, %for.body.preheader35 ]
  %23 = load float, ptr %src.021, align 4
  %cmp.i = fcmp olt float %23, 0.000000e+00
  %cmp1.i = fcmp ogt float %23, 2.550000e+02
  %.x.i = select i1 %cmp1.i, float 2.550000e+02, float %23
  %retval.0.i = select i1 %cmp.i, float 0.000000e+00, float %.x.i
  %conv = fptoui float %retval.0.i to i8
  store i8 %conv, ptr %dst.addr.020, align 1
  %arrayidx2 = getelementptr inbounds float, ptr %src.021, i64 1
  %24 = load float, ptr %arrayidx2, align 4
  %cmp.i15 = fcmp olt float %24, 0.000000e+00
  %cmp1.i16 = fcmp ogt float %24, 2.550000e+02
  %.x.i17 = select i1 %cmp1.i16, float 2.550000e+02, float %24
  %retval.0.i18 = select i1 %cmp.i15, float 0.000000e+00, float %.x.i17
  %conv4 = fptoui float %retval.0.i18 to i8
  %arrayidx5 = getelementptr inbounds i8, ptr %dst.addr.020, i64 1
  store i8 %conv4, ptr %arrayidx5, align 1
  %add.ptr = getelementptr inbounds float, ptr %src.021, i64 2
  %add.ptr6 = getelementptr inbounds i8, ptr %dst.addr.020, i64 2
  %inc = add nuw nsw i32 %i.022, 1
  %exitcond.not = icmp eq i32 %inc, %width
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; loop3: 3 floats -> 3 bytes per scalar iteration. The vector body loads
; <12 x float>, splits it into three stride-3 lanes, clamps and converts
; each, then re-interleaves the three <4 x i8> results into one <12 x i8>
; store. A runtime overlap check (vector.memcheck) guards the vector path.
define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %data, i32 noundef %width) {
; CHECK-LABEL: loop3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subs w8, w2, #1
; CHECK-NEXT:    b.lt .LBB2_9
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    cmp w8, #2
; CHECK-NEXT:    b.ls .LBB2_6
; CHECK-NEXT:  // %bb.2: // %vector.memcheck
; CHECK-NEXT:    add x9, x8, w8, uxtw #1
; CHECK-NEXT:    add x9, x9, #3
; CHECK-NEXT:    add x10, x1, x9, lsl #2
; CHECK-NEXT:    add x9, x0, x9
; CHECK-NEXT:    cmp x10, x0
; CHECK-NEXT:    ccmp x9, x1, #0, hi
; CHECK-NEXT:    b.hi .LBB2_6
; CHECK-NEXT:  // %bb.3: // %vector.ph
; CHECK-NEXT:    add x11, x8, #1
; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT:    adrp x12, .LCPI2_0
; CHECK-NEXT:    and x10, x11, #0x1fffffffc
; CHECK-NEXT:    dup v0.4s, w8
; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI2_0]
; CHECK-NEXT:    add x9, x10, x10, lsl #1
; CHECK-NEXT:    mov x12, x10
; CHECK-NEXT:    add x8, x1, x9, lsl #2
; CHECK-NEXT:    add x9, x0, x9
; CHECK-NEXT:  .LBB2_4: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
; CHECK-NEXT:    add x13, x0, #8
; CHECK-NEXT:    subs x12, x12, #4
; CHECK-NEXT:    fcmgt v5.4s, v2.4s, v0.4s
; CHECK-NEXT:    fcmgt v6.4s, v3.4s, v0.4s
; CHECK-NEXT:    fcmgt v7.4s, v4.4s, v0.4s
; CHECK-NEXT:    fcmlt v16.4s, v2.4s, #0.0
; CHECK-NEXT:    fcmlt v17.4s, v3.4s, #0.0
; CHECK-NEXT:    bsl v5.16b, v0.16b, v2.16b
; CHECK-NEXT:    bsl v6.16b, v0.16b, v3.16b
; CHECK-NEXT:    bsl v7.16b, v0.16b, v4.16b
; CHECK-NEXT:    fcmlt v2.4s, v4.4s, #0.0
; CHECK-NEXT:    bic v3.16b, v5.16b, v16.16b
; CHECK-NEXT:    bic v4.16b, v6.16b, v17.16b
; CHECK-NEXT:    bic v2.16b, v7.16b, v2.16b
; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
; CHECK-NEXT:    fcvtzs v4.4s, v4.4s
; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
; CHECK-NEXT:    xtn v5.4h, v3.4s
; CHECK-NEXT:    xtn v6.4h, v4.4s
; CHECK-NEXT:    xtn v7.4h, v2.4s
; CHECK-NEXT:    tbl v2.16b, { v5.16b, v6.16b, v7.16b }, v1.16b
; CHECK-NEXT:    st1 { v2.s }[2], [x13]
; CHECK-NEXT:    str d2, [x0], #12
; CHECK-NEXT:    b.ne .LBB2_4
; CHECK-NEXT:  // %bb.5: // %middle.block
; CHECK-NEXT:    cmp x11, x10
; CHECK-NEXT:    b.ne .LBB2_7
; CHECK-NEXT:    b .LBB2_9
; CHECK-NEXT:  .LBB2_6:
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    mov x8, x1
; CHECK-NEXT:    mov x9, x0
; CHECK-NEXT:  .LBB2_7: // %for.body.preheader1
; CHECK-NEXT:    movi d0, #0000000000000000
; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
; CHECK-NEXT:    sub w10, w2, w10
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:  .LBB2_8: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp s2, s3, [x8]
; CHECK-NEXT:    fcmp s2, s1
; CHECK-NEXT:    fcsel s4, s1, s2, gt
; CHECK-NEXT:    fcmp s2, #0.0
; CHECK-NEXT:    fcsel s2, s0, s4, mi
; CHECK-NEXT:    fcmp s3, s1
; CHECK-NEXT:    fcsel s4, s1, s3, gt
; CHECK-NEXT:    fcmp s3, #0.0
; CHECK-NEXT:    ldr s3, [x8, #8]
; CHECK-NEXT:    fcvtzs w11, s2
; CHECK-NEXT:    add x8, x8, #12
; CHECK-NEXT:    fcsel s4, s0, s4, mi
; CHECK-NEXT:    fcmp s3, s1
; CHECK-NEXT:    strb w11, [x9]
; CHECK-NEXT:    fcsel s5, s1, s3, gt
; CHECK-NEXT:    fcmp s3, #0.0
; CHECK-NEXT:    fcvtzs w12, s4
; CHECK-NEXT:    fcsel s3, s0, s5, mi
; CHECK-NEXT:    subs w10, w10, #1
; CHECK-NEXT:    strb w12, [x9, #1]
; CHECK-NEXT:    fcvtzs w13, s3
; CHECK-NEXT:    strb w13, [x9, #2]
; CHECK-NEXT:    add x9, x9, #3
; CHECK-NEXT:    b.ne .LBB2_8
; CHECK-NEXT:  .LBB2_9: // %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  %cmp29 = icmp sgt i32 %width, 0
  br i1 %cmp29, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = add i32 %width, -1
  %1 = zext i32 %0 to i64
  %2 = add nuw nsw i64 %1, 1
  ; Fewer than 4 iterations: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %0, 3
  br i1 %min.iters.check, label %for.body.preheader46, label %vector.memcheck

vector.memcheck:                                  ; preds = %for.body.preheader
  ; %6 = 3 * %width elements; compare the end of each range with the start
  ; of the other and fall back to the scalar loop when both tests hold.
  %3 = add i32 %width, -1
  %4 = zext i32 %3 to i64
  %5 = mul nuw nsw i64 %4, 3
  %6 = add nuw nsw i64 %5, 3
  %scevgep = getelementptr i8, ptr %dst, i64 %6
  %scevgep34 = getelementptr float, ptr %data, i64 %6
  %bound0 = icmp ugt ptr %scevgep34, %dst
  %bound1 = icmp ugt ptr %scevgep, %data
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %for.body.preheader46, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec = trip count rounded down to a multiple of 4.
  %n.vec = and i64 %2, 8589934588
  %ind.end = trunc i64 %n.vec to i32
  %7 = mul nuw nsw i64 %n.vec, 3
  %ind.end37 = getelementptr float, ptr %data, i64 %7
  %8 = mul nuw nsw i64 %n.vec, 3
  %ind.end39 = getelementptr i8, ptr %dst, i64 %8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %9 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %data, i64 %9
  %10 = mul i64 %index, 3
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  ; De-interleave the three stride-3 lanes.
  %strided.vec = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec44 = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %strided.vec45 = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %11 = fcmp olt <4 x float> %strided.vec, zeroinitializer
  %12 = fcmp ogt <4 x float> %strided.vec, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %13 = select <4 x i1> %12, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec
  %14 = select <4 x i1> %11, <4 x float> zeroinitializer, <4 x float> %13
  %15 = fptoui <4 x float> %14 to <4 x i8>
  %16 = fcmp olt <4 x float> %strided.vec44, zeroinitializer
  %17 = fcmp ogt <4 x float> %strided.vec44, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %18 = select <4 x i1> %17, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec44
  %19 = select <4 x i1> %16, <4 x float> zeroinitializer, <4 x float> %18
  %20 = fptoui <4 x float> %19 to <4 x i8>
  %21 = fcmp olt <4 x float> %strided.vec45, zeroinitializer
  %22 = fcmp ogt <4 x float> %strided.vec45, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %23 = select <4 x i1> %22, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec45
  %24 = select <4 x i1> %21, <4 x float> zeroinitializer, <4 x float> %23
  %25 = fptoui <4 x float> %24 to <4 x i8>
  %26 = getelementptr inbounds i8, ptr %dst, i64 %10
  ; Concatenate a|b, widen c to 8 lanes, then interleave as a0 b0 c0 a1 ...
  %27 = shufflevector <4 x i8> %15, <4 x i8> %20, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %28 = shufflevector <4 x i8> %25, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i8> %27, <8 x i8> %28, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %interleaved.vec, ptr %26, align 1
  %index.next = add nuw i64 %index, 4
  %29 = icmp eq i64 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %2, %n.vec
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader46

for.body.preheader46:                             ; preds = %vector.memcheck, %for.body.preheader, %middle.block
  %i.032.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %ind.end, %middle.block ]
  %src.031.ph = phi ptr [ %data, %vector.memcheck ], [ %data, %for.body.preheader ], [ %ind.end37, %middle.block ]
  %dst.addr.030.ph = phi ptr [ %dst, %vector.memcheck ], [ %dst, %for.body.preheader ], [ %ind.end39, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader46, %for.body
  %i.032 = phi i32 [ %inc, %for.body ], [ %i.032.ph, %for.body.preheader46 ]
  %src.031 = phi ptr [ %add.ptr, %for.body ], [ %src.031.ph, %for.body.preheader46 ]
  %dst.addr.030 = phi ptr [ %add.ptr10, %for.body ], [ %dst.addr.030.ph, %for.body.preheader46 ]
  %30 = load float, ptr %src.031, align 4
  %cmp.i = fcmp olt float %30, 0.000000e+00
  %cmp1.i = fcmp ogt float %30, 2.550000e+02
  %.x.i = select i1 %cmp1.i, float 2.550000e+02, float %30
  %retval.0.i = select i1 %cmp.i, float 0.000000e+00, float %.x.i
  %conv = fptoui float %retval.0.i to i8
  store i8 %conv, ptr %dst.addr.030, align 1
  %arrayidx2 = getelementptr inbounds float, ptr %src.031, i64 1
  %31 = load float, ptr %arrayidx2, align 4
  %cmp.i21 = fcmp olt float %31, 0.000000e+00
  %cmp1.i22 = fcmp ogt float %31, 2.550000e+02
  %.x.i23 = select i1 %cmp1.i22, float 2.550000e+02, float %31
  %retval.0.i24 = select i1 %cmp.i21, float 0.000000e+00, float %.x.i23
  %conv4 = fptoui float %retval.0.i24 to i8
  %arrayidx5 = getelementptr inbounds i8, ptr %dst.addr.030, i64 1
  store i8 %conv4, ptr %arrayidx5, align 1
  %arrayidx6 = getelementptr inbounds float, ptr %src.031, i64 2
  %32 = load float, ptr %arrayidx6, align 4
  %cmp.i25 = fcmp olt float %32, 0.000000e+00
  %cmp1.i26 = fcmp ogt float %32, 2.550000e+02
  %.x.i27 = select i1 %cmp1.i26, float 2.550000e+02, float %32
  %retval.0.i28 = select i1 %cmp.i25, float 0.000000e+00, float %.x.i27
  %conv8 = fptoui float %retval.0.i28 to i8
  %arrayidx9 = getelementptr inbounds i8, ptr %dst.addr.030, i64 2
  store i8 %conv8, ptr %arrayidx9, align 1
  %add.ptr = getelementptr inbounds float, ptr %src.031, i64 3
  %add.ptr10 = getelementptr inbounds i8, ptr %dst.addr.030, i64 3
  %inc = add nuw nsw i32 %i.032, 1
  %exitcond.not = icmp eq i32 %inc, %width
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; loop4: 4 floats -> 4 bytes per scalar iteration. The vector body loads
; <16 x float>, splits it into four stride-4 lanes, clamps and converts
; each, then re-interleaves the four <4 x i8> results into one <16 x i8>
; store. A runtime overlap check (vector.memcheck) guards the vector path.
define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %data, i32 noundef %width) {
; CHECK-LABEL: loop4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subs w8, w2, #1
; CHECK-NEXT:    b.lt .LBB3_7
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    cmp w8, #2
; CHECK-NEXT:    b.ls .LBB3_4
; CHECK-NEXT:  // %bb.2: // %vector.memcheck
; CHECK-NEXT:    ubfiz x9, x8, #2, #32
; CHECK-NEXT:    add x9, x9, #4
; CHECK-NEXT:    add x10, x1, x9, lsl #2
; CHECK-NEXT:    cmp x10, x0
; CHECK-NEXT:    b.ls .LBB3_8
; CHECK-NEXT:  // %bb.3: // %vector.memcheck
; CHECK-NEXT:    add x9, x0, x9
; CHECK-NEXT:    cmp x9, x1
; CHECK-NEXT:    b.ls .LBB3_8
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    mov w10, wzr
; CHECK-NEXT:    mov x8, x1
; CHECK-NEXT:    mov x9, x0
; CHECK-NEXT:  .LBB3_5: // %for.body.preheader1
; CHECK-NEXT:    movi d0, #0000000000000000
; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
; CHECK-NEXT:    sub w10, w2, w10
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:  .LBB3_6: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp s2, s3, [x8]
; CHECK-NEXT:    fcmp s2, s1
; CHECK-NEXT:    fcsel s4, s1, s2, gt
; CHECK-NEXT:    fcmp s2, #0.0
; CHECK-NEXT:    fcsel s2, s0, s4, mi
; CHECK-NEXT:    fcmp s3, s1
; CHECK-NEXT:    fcsel s4, s1, s3, gt
; CHECK-NEXT:    fcmp s3, #0.0
; CHECK-NEXT:    ldp s3, s5, [x8, #8]
; CHECK-NEXT:    fcvtzs w11, s2
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    fcsel s4, s0, s4, mi
; CHECK-NEXT:    fcmp s3, s1
; CHECK-NEXT:    strb w11, [x9]
; CHECK-NEXT:    fcsel s6, s1, s3, gt
; CHECK-NEXT:    fcmp s3, #0.0
; CHECK-NEXT:    fcvtzs w12, s4
; CHECK-NEXT:    fcsel s3, s0, s6, mi
; CHECK-NEXT:    fcmp s5, s1
; CHECK-NEXT:    strb w12, [x9, #1]
; CHECK-NEXT:    fcsel s6, s1, s5, gt
; CHECK-NEXT:    fcmp s5, #0.0
; CHECK-NEXT:    fcvtzs w13, s3
; CHECK-NEXT:    fcsel s5, s0, s6, mi
; CHECK-NEXT:    subs w10, w10, #1
; CHECK-NEXT:    strb w13, [x9, #2]
; CHECK-NEXT:    fcvtzs w14, s5
; CHECK-NEXT:    strb w14, [x9, #3]
; CHECK-NEXT:    add x9, x9, #4
; CHECK-NEXT:    b.ne .LBB3_6
; CHECK-NEXT:  .LBB3_7: // %for.cond.cleanup
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB3_8: // %vector.ph
; CHECK-NEXT:    add x11, x8, #1
; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT:    adrp x12, .LCPI3_0
; CHECK-NEXT:    and x10, x11, #0x1fffffffc
; CHECK-NEXT:    dup v0.4s, w8
; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI3_0]
; CHECK-NEXT:    add x8, x1, x10, lsl #4
; CHECK-NEXT:    add x9, x0, x10, lsl #2
; CHECK-NEXT:    mov x12, x10
; CHECK-NEXT:  .LBB3_9: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
; CHECK-NEXT:    subs x12, x12, #4
; CHECK-NEXT:    fcmgt v6.4s, v2.4s, v0.4s
; CHECK-NEXT:    fcmgt v7.4s, v3.4s, v0.4s
; CHECK-NEXT:    fcmgt v16.4s, v4.4s, v0.4s
; CHECK-NEXT:    fcmgt v17.4s, v5.4s, v0.4s
; CHECK-NEXT:    fcmlt v18.4s, v2.4s, #0.0
; CHECK-NEXT:    fcmlt v19.4s, v3.4s, #0.0
; CHECK-NEXT:    fcmlt v20.4s, v4.4s, #0.0
; CHECK-NEXT:    bsl v6.16b, v0.16b, v2.16b
; CHECK-NEXT:    bsl v7.16b, v0.16b, v3.16b
; CHECK-NEXT:    bsl v16.16b, v0.16b, v4.16b
; CHECK-NEXT:    bsl v17.16b, v0.16b, v5.16b
; CHECK-NEXT:    fcmlt v2.4s, v5.4s, #0.0
; CHECK-NEXT:    bic v3.16b, v6.16b, v18.16b
; CHECK-NEXT:    bic v4.16b, v7.16b, v19.16b
; CHECK-NEXT:    bic v5.16b, v16.16b, v20.16b
; CHECK-NEXT:    bic v2.16b, v17.16b, v2.16b
; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
; CHECK-NEXT:    fcvtzs v4.4s, v4.4s
; CHECK-NEXT:    fcvtzs v5.4s, v5.4s
; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
; CHECK-NEXT:    xtn v16.4h, v3.4s
; CHECK-NEXT:    xtn v17.4h, v4.4s
; CHECK-NEXT:    xtn v18.4h, v5.4s
; CHECK-NEXT:    xtn v19.4h, v2.4s
; CHECK-NEXT:    tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
; CHECK-NEXT:    str q2, [x0], #16
; CHECK-NEXT:    b.ne .LBB3_9
; CHECK-NEXT:  // %bb.10: // %middle.block
; CHECK-NEXT:    cmp x11, x10
; CHECK-NEXT:    b.ne .LBB3_5
; CHECK-NEXT:    b .LBB3_7
entry:
  %cmp39 = icmp sgt i32 %width, 0
  br i1 %cmp39, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = add i32 %width, -1
  %1 = zext i32 %0 to i64
  %2 = add nuw nsw i64 %1, 1
  ; Fewer than 4 iterations: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %0, 3
  br i1 %min.iters.check, label %for.body.preheader57, label %vector.memcheck

vector.memcheck:                                  ; preds = %for.body.preheader
  ; %6 = 4 * %width elements; compare the end of each range with the start
  ; of the other and fall back to the scalar loop when both tests hold.
  %3 = add i32 %width, -1
  %4 = zext i32 %3 to i64
  %5 = shl nuw nsw i64 %4, 2
  %6 = add nuw nsw i64 %5, 4
  %scevgep = getelementptr i8, ptr %dst, i64 %6
  %scevgep44 = getelementptr float, ptr %data, i64 %6
  %bound0 = icmp ugt ptr %scevgep44, %dst
  %bound1 = icmp ugt ptr %scevgep, %data
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %for.body.preheader57, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec = trip count rounded down to a multiple of 4.
  %n.vec = and i64 %2, 8589934588
  %ind.end = trunc i64 %n.vec to i32
  %7 = shl nuw nsw i64 %n.vec, 2
  %ind.end47 = getelementptr float, ptr %data, i64 %7
  %8 = shl nuw nsw i64 %n.vec, 2
  %ind.end49 = getelementptr i8, ptr %dst, i64 %8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %9 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %data, i64 %9
  %10 = shl i64 %index, 2
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  ; De-interleave the four stride-4 lanes.
  %strided.vec = shufflevector <16 x float> %wide.vec, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.vec54 = shufflevector <16 x float> %wide.vec, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %strided.vec55 = shufflevector <16 x float> %wide.vec, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %strided.vec56 = shufflevector <16 x float> %wide.vec, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %11 = fcmp olt <4 x float> %strided.vec, zeroinitializer
  %12 = fcmp ogt <4 x float> %strided.vec, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %13 = select <4 x i1> %12, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec
  %14 = select <4 x i1> %11, <4 x float> zeroinitializer, <4 x float> %13
  %15 = fptoui <4 x float> %14 to <4 x i8>
  %16 = fcmp olt <4 x float> %strided.vec54, zeroinitializer
  %17 = fcmp ogt <4 x float> %strided.vec54, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %18 = select <4 x i1> %17, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec54
  %19 = select <4 x i1> %16, <4 x float> zeroinitializer, <4 x float> %18
  %20 = fptoui <4 x float> %19 to <4 x i8>
  %21 = fcmp olt <4 x float> %strided.vec55, zeroinitializer
  %22 = fcmp ogt <4 x float> %strided.vec55, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %23 = select <4 x i1> %22, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec55
  %24 = select <4 x i1> %21, <4 x float> zeroinitializer, <4 x float> %23
  %25 = fptoui <4 x float> %24 to <4 x i8>
  %26 = fcmp olt <4 x float> %strided.vec56, zeroinitializer
  %27 = fcmp ogt <4 x float> %strided.vec56, <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>
  %28 = select <4 x i1> %27, <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, <4 x float> %strided.vec56
  %29 = select <4 x i1> %26, <4 x float> zeroinitializer, <4 x float> %28
  %30 = fptoui <4 x float> %29 to <4 x i8>
  %31 = getelementptr inbounds i8, ptr %dst, i64 %10
  ; Concatenate a|b and c|d, then interleave as a0 b0 c0 d0 a1 b1 c1 d1 ...
  %32 = shufflevector <4 x i8> %15, <4 x i8> %20, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %33 = shufflevector <4 x i8> %25, <4 x i8> %30, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i8> %32, <8 x i8> %33, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i8> %interleaved.vec, ptr %31, align 1
  %index.next = add nuw i64 %index, 4
  %34 = icmp eq i64 %index.next, %n.vec
  br i1 %34, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %2, %n.vec
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader57

for.body.preheader57:                             ; preds = %vector.memcheck, %for.body.preheader, %middle.block
  %i.042.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %ind.end, %middle.block ]
  %src.041.ph = phi ptr [ %data, %vector.memcheck ], [ %data, %for.body.preheader ], [ %ind.end47, %middle.block ]
  %dst.addr.040.ph = phi ptr [ %dst, %vector.memcheck ], [ %dst, %for.body.preheader ], [ %ind.end49, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader57, %for.body
  %i.042 = phi i32 [ %inc, %for.body ], [ %i.042.ph, %for.body.preheader57 ]
  %src.041 = phi ptr [ %add.ptr, %for.body ], [ %src.041.ph, %for.body.preheader57 ]
  %dst.addr.040 = phi ptr [ %add.ptr14, %for.body ], [ %dst.addr.040.ph, %for.body.preheader57 ]
  %35 = load float, ptr %src.041, align 4
  %cmp.i = fcmp olt float %35, 0.000000e+00
  %cmp1.i = fcmp ogt float %35, 2.550000e+02
  %.x.i = select i1 %cmp1.i, float 2.550000e+02, float %35
  %retval.0.i = select i1 %cmp.i, float 0.000000e+00, float %.x.i
  %conv = fptoui float %retval.0.i to i8
  store i8 %conv, ptr %dst.addr.040, align 1
  %arrayidx2 = getelementptr inbounds float, ptr %src.041, i64 1
  %36 = load float, ptr %arrayidx2, align 4
  %cmp.i27 = fcmp olt float %36, 0.000000e+00
  %cmp1.i28 = fcmp ogt float %36, 2.550000e+02
  %.x.i29 = select i1 %cmp1.i28, float 2.550000e+02, float %36
  %retval.0.i30 = select i1 %cmp.i27, float 0.000000e+00, float %.x.i29
  %conv4 = fptoui float %retval.0.i30 to i8
  %arrayidx5 = getelementptr inbounds i8, ptr %dst.addr.040, i64 1
  store i8 %conv4, ptr %arrayidx5, align 1
  %arrayidx6 = getelementptr inbounds float, ptr %src.041, i64 2
  %37 = load float, ptr %arrayidx6, align 4
  %cmp.i31 = fcmp olt float %37, 0.000000e+00
  %cmp1.i32 = fcmp ogt float %37, 2.550000e+02
  %.x.i33 = select i1 %cmp1.i32, float 2.550000e+02, float %37
  %retval.0.i34 = select i1 %cmp.i31, float 0.000000e+00, float %.x.i33
  %conv8 = fptoui float %retval.0.i34 to i8
  %arrayidx9 = getelementptr inbounds i8, ptr %dst.addr.040, i64 2
  store i8 %conv8, ptr %arrayidx9, align 1
  %arrayidx10 = getelementptr inbounds float, ptr %src.041, i64 3
  %38 = load float, ptr %arrayidx10, align 4
  %cmp.i35 = fcmp olt float %38, 0.000000e+00
  %cmp1.i36 = fcmp ogt float %38, 2.550000e+02
  %.x.i37 = select i1 %cmp1.i36, float 2.550000e+02, float %38
  %retval.0.i38 = select i1 %cmp.i35, float 0.000000e+00, float %.x.i37
  %conv12 = fptoui float %retval.0.i38 to i8
  %arrayidx13 = getelementptr inbounds i8, ptr %dst.addr.040, i64 3
  store i8 %conv12, ptr %arrayidx13, align 1
  %add.ptr = getelementptr inbounds float, ptr %src.041, i64 4
  %add.ptr14 = getelementptr inbounds i8, ptr %dst.addr.040, i64 4
  %inc = add nuw nsw i32 %i.042, 1
  %exitcond.not = icmp eq i32 %inc, %width
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}