109 lines
6.8 KiB
LLVM
109 lines
6.8 KiB
LLVM
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
|
|
|
|
; Negative test for MVE tail-predication on a vector loop that has already
; been unrolled by two: the main loop body holds two copies of the masked
; load/mul/store sequence and a single-iteration epilogue handles the
; remainder.
; TODO: The unrolled pattern is preventing the transform
;
; The negative directive below matches the common prefix of the
; @llvm.arm.mve.vctp* intrinsics. The earlier pattern
; ("call i32 @llvm.arm.vcpt") did not spell the VCTP intrinsic, so it could
; never fire, even if the loop were to be tail-predicated.
; CHECK-LABEL: mul_v16i8_unroll
; CHECK-NOT: @llvm.arm.mve.vctp
define void @mul_v16i8_unroll(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  ; N == 0 branches straight to the return block.
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count for 16 lanes per iteration: %tmp9 = (N + 15) >> 4.
  ; Provided N + 15 does not wrap and %tmp9 is non-zero, %tmp12 is that count
  ; minus one and %tmp13 is the count itself.
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  ; Splat N - 1 across 16 lanes; every lane predicate compares against it.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  ; %xtraiter is the iteration count modulo the unroll factor of two.
  %xtraiter = and i32 %tmp13, 1
  ; When %tmp12 is zero the unrolled loop is bypassed entirely.
  %0 = icmp ult i32 %tmp12, 1
  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new

vector.ph.new: ; preds = %vector.ph
  ; The result of the start.loop.iterations call has no users in this
  ; function; the loop counter is seeded from %unroll_iter instead.
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  %unroll_iter = sub i32 %tmp13, %xtraiter
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph.new
  %index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
  %niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]
  ; First copy of the body: lanes %index .. %index + 15.
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  ; A lane is active while its element index is <= N - 1.
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %tmp1)
  %index.next = add nuw nsw i32 %index, 16
  ; The first decrement is a plain sub; only the second one (further down)
  ; goes through the loop.decrement.reg intrinsic.
  %niter.nsub = sub i32 %niter, 1
  ; Second copy of the body: lanes %index.next .. %index.next + 15.
  %broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
  %broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.1 = getelementptr inbounds i8, ptr %a, i32 %index.next
  %tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
  %wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %tmp3.1 = getelementptr inbounds i8, ptr %b, i32 %index.next
  %wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
  %tmp6.1 = getelementptr inbounds i8, ptr %c, i32 %index.next
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.1, ptr %tmp6.1, i32 4, <16 x i1> %tmp1.1)
  %index.next.1 = add i32 %index.next, 16
  %niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
  ; Loop back while the remaining count is non-zero.
  %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
  br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit

for.cond.cleanup.loopexit.unr-lcssa.loopexit: ; preds = %vector.body
  ; Single-entry phis carrying the loop's exit values out of the loop.
  %index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
  %tmp14.unr.ph = phi i32 [ -2, %vector.body ]
  br label %for.cond.cleanup.loopexit.unr-lcssa

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
  ; Merge the "unrolled loop skipped" and "unrolled loop finished" paths.
  %index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  ; A non-zero remainder means one more vector iteration runs in the epilogue.
  %lcmp.mod = icmp ne i32 %xtraiter, 0
  br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit

vector.body.epil.preheader: ; preds = %for.cond.cleanup.loopexit.unr-lcssa
  br label %vector.body.epil

vector.body.epil: ; preds = %vector.body.epil.preheader
  ; Epilogue: one straight-line copy of the body (no back edge).
  %index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
  %tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
  %broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
  %broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.epil = getelementptr inbounds i8, ptr %a, i32 %index.epil
  %tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
  %wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %tmp3.epil = getelementptr inbounds i8, ptr %b, i32 %index.epil
  %wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
  %tmp6.epil = getelementptr inbounds i8, ptr %c, i32 %index.epil
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.epil, ptr %tmp6.epil, i32 4, <16 x i1> %tmp1.epil)
  ; The three values below have no users; the block exits unconditionally.
  %index.next.epil = add i32 %index.epil, 16
  %tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
  %tmp16.epil = icmp ne i32 %tmp15.epil, 0
  br label %for.cond.cleanup.loopexit.epilog-lcssa

for.cond.cleanup.loopexit.epilog-lcssa: ; preds = %vector.body.epil
  br label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  ret void
}
|
|
|
|
; Intrinsics referenced by the test function. The attribute groups #1, #2 and
; #3 are referenced here but are not defined in the visible part of this file.

; Hardware-loop intrinsics: loop set-up and counter decrement.
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3

; Masked memory intrinsics on <16 x i8>; the i32 immarg operand is the
; alignment and the <16 x i1> operand is the per-lane mask.
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #1
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) #2
|
|
|