; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s

; Test that we can unfold constant pool loads when we're using avx512's
; ability to fold a broadcast load into an operation.

; In-place loop over 1024 i32 elements, one <16 x i32> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastd ahead of the loop and folds the array load into vpaddd.
define void @bcast_unfold_add_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <8 x i32> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastd (ymm) ahead of the loop and folds the array load into vpaddd.
define void @bcast_unfold_add_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB1_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <4 x i32> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastd (xmm) ahead of the loop and folds the array load into vpaddd.
define void @bcast_unfold_add_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB2_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <8 x i64> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastq ahead of the loop and folds the array load into vpaddq.
define void @bcast_unfold_add_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB3_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <4 x i64> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastq (ymm) ahead of the loop and folds the array load into vpaddq.
define void @bcast_unfold_add_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB4_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <2 x i64> vector per iteration,
; adding a splat of 2. The expected assembly materializes the splat once with
; vpbroadcastq (xmm) ahead of the loop and folds the array load into vpaddq.
define void @bcast_unfold_add_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB5_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB5_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <16 x i32> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddd.
define void @bcast_unfold_mul_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB6_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB6_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <8 x i32> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddd.
define void @bcast_unfold_mul_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB7_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB7_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <4 x i32> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddd.
define void @bcast_unfold_mul_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB8_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <8 x i64> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddq.
define void @bcast_unfold_mul_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB9_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB9_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <4 x i64> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddq.
define void @bcast_unfold_mul_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB10_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB10_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <2 x i64> vector per iteration,
; multiplying by a splat of 3. The expected assembly uses no constant at all:
; each vector is loaded once and x*3 is formed as (x + x) + x with two vpaddq.
define void @bcast_unfold_mul_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB11_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB11_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <16 x i32> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vpbroadcastd ahead of the loop and folds the array load into vpord.
define void @bcast_unfold_or_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB12_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB12_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <8 x i32> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vbroadcastss ahead of the loop and folds the array load into vorps.
define void @bcast_unfold_or_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB13_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB13_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i32 elements, one <4 x i32> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vbroadcastss (xmm) ahead of the loop and folds the load into vorps.
define void @bcast_unfold_or_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB14_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB14_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <8 x i64> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vpbroadcastq ahead of the loop and folds the array load into vporq.
define void @bcast_unfold_or_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB15_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB15_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <4 x i64> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vbroadcastsd ahead of the loop and folds the array load into vorps.
define void @bcast_unfold_or_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB16_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB16_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 i64 elements, one <2 x i64> vector per iteration,
; OR-ing in a splat of 3. The expected assembly materializes the splat once
; with vmovddup ahead of the loop and folds the array load into vorps.
define void @bcast_unfold_or_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB17_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB17_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; In-place loop over 1024 floats, one <16 x float> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vpbroadcastd ahead of the loop and folds the array load
; into vpxord.
define void @bcast_unfold_fneg_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB18_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB18_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <16 x float> %tmp4
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 floats, one <8 x float> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vbroadcastss ahead of the loop and folds the array load
; into vxorps.
define void @bcast_unfold_fneg_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB19_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB19_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <8 x float> %tmp4
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 floats, one <4 x float> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vbroadcastss (xmm) ahead of the loop and folds the array
; load into vxorps.
define void @bcast_unfold_fneg_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB20_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB20_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <4 x float> %tmp4
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 doubles, one <8 x double> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vpbroadcastq ahead of the loop and folds the array load
; into vpxorq.
define void @bcast_unfold_fneg_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB21_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB21_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <8 x double> %tmp4
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 doubles, one <4 x double> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vbroadcastsd ahead of the loop and folds the array load
; into vxorps.
define void @bcast_unfold_fneg_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB22_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB22_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <4 x double> %tmp4
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 doubles, one <2 x double> vector per iteration,
; negating each vector with fneg. The expected assembly materializes a splat
; of -0.0 once with vmovddup ahead of the loop and folds the array load into
; vxorps.
define void @bcast_unfold_fneg_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB23_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB23_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <2 x double> %tmp4
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; In-place loop over 1024 floats, one <16 x float> vector per iteration,
; applying llvm.fabs. The expected assembly materializes a splat mask once
; with vpbroadcastd ahead of the loop (printed as NaN in the asm comment;
; presumably the sign-bit-clearing mask) and folds the load into vpandd.
define void @bcast_unfold_fabs_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB24_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB24_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Intrinsic declaration for the call in @bcast_unfold_fabs_v16f32.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0

; In-place loop over 1024 floats, one <8 x float> vector per iteration,
; applying llvm.fabs. The expected assembly materializes a splat mask once
; with vbroadcastss ahead of the loop (printed as NaN in the asm comment;
; presumably the sign-bit-clearing mask) and folds the load into vandps.
define void @bcast_unfold_fabs_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB25_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB25_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Intrinsic declaration for the call in @bcast_unfold_fabs_v8f32.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0

; In-place loop over 1024 floats, one <4 x float> vector per iteration,
; applying llvm.fabs. The expected assembly materializes a splat mask once
; with vbroadcastss (xmm) ahead of the loop (printed as NaN in the asm
; comment; presumably the sign-bit-clearing mask) and folds the load into
; vandps.
define void @bcast_unfold_fabs_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB26_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB26_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Intrinsic declaration for the call in @bcast_unfold_fabs_v4f32.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0

; In-place loop over 1024 doubles, one <8 x double> vector per iteration,
; applying llvm.fabs. The expected assembly materializes a splat mask once
; with vpbroadcastq ahead of the loop (printed as NaN in the asm comment;
; presumably the sign-bit-clearing mask) and folds the load into vpandq.
define void @bcast_unfold_fabs_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB27_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB27_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0
|
|
|
|
; fabs over 1024 doubles, 4 lanes per iteration. The expected code broadcasts
; the constant (printed as NaN lanes) into ymm0 once before the loop and ANDs
; it against the memory operand inside the loop.
define void @bcast_unfold_fabs_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB28_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB28_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <4 x double>, ptr %addr, align 8
  %abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %vec)
  store <4 x double> %abs, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0
|
|
|
|
; fabs over 1024 doubles, 2 lanes per iteration. The expected code loads the
; constant (printed as NaN lanes) into xmm0 with vmovddup once before the loop
; and ANDs it against the memory operand inside the loop.
define void @bcast_unfold_fabs_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [NaN,NaN]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB29_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB29_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <2 x double>, ptr %addr, align 8
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec)
  store <2 x double> %abs, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
|
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
|
|
|
|
; Adds 2.0 to 1024 floats in place, 16 lanes per iteration. The expected code
; broadcasts the 2.0 splat into zmm0 once before the loop; vaddps then reads
; the data straight from memory.
define void @bcast_unfold_fadd_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB30_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB30_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <16 x float>, ptr %addr, align 4
  %sum = fadd <16 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %sum, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Adds 2.0 to 1024 floats in place, 8 lanes per iteration. The expected code
; broadcasts the 2.0 splat into ymm0 once before the loop; vaddps then reads
; the data straight from memory.
define void @bcast_unfold_fadd_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB31_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB31_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <8 x float>, ptr %addr, align 4
  %sum = fadd <8 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %sum, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Adds 2.0 to 1024 floats in place, 4 lanes per iteration. The expected code
; broadcasts the 2.0 splat into xmm0 once before the loop; vaddps then reads
; the data straight from memory.
define void @bcast_unfold_fadd_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB32_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB32_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <4 x float>, ptr %addr, align 4
  %sum = fadd <4 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %sum, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Adds 2.0 to 1024 doubles in place, 8 lanes per iteration. The expected code
; broadcasts the 2.0 splat into zmm0 once before the loop; vaddpd then reads
; the data straight from memory.
define void @bcast_unfold_fadd_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB33_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB33_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <8 x double>, ptr %addr, align 8
  %sum = fadd <8 x double> %vec, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %sum, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Adds 2.0 to 1024 doubles in place, 4 lanes per iteration. The expected code
; broadcasts the 2.0 splat into ymm0 once before the loop; vaddpd then reads
; the data straight from memory.
define void @bcast_unfold_fadd_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB34_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB34_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <4 x double>, ptr %addr, align 8
  %sum = fadd <4 x double> %vec, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %sum, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Adds 2.0 to 1024 doubles in place, 2 lanes per iteration. The expected code
; loads the 2.0 splat into xmm0 with vmovddup once before the loop; vaddpd
; then reads the data straight from memory.
define void @bcast_unfold_fadd_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB35_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB35_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <2 x double>, ptr %addr, align 8
  %sum = fadd <2 x double> %vec, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %sum, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 floats by 3.0 in place, 16 lanes per iteration. The expected
; code broadcasts the 3.0 splat into zmm0 once before the loop; vmulps then
; reads the data straight from memory.
define void @bcast_unfold_fmul_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB36_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB36_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <16 x float>, ptr %addr, align 4
  %prod = fmul <16 x float> %vec, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %prod, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 floats by 3.0 in place, 8 lanes per iteration. The expected
; code broadcasts the 3.0 splat into ymm0 once before the loop; vmulps then
; reads the data straight from memory.
define void @bcast_unfold_fmul_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB37_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB37_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <8 x float>, ptr %addr, align 4
  %prod = fmul <8 x float> %vec, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %prod, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 floats by 3.0 in place, 4 lanes per iteration. The expected
; code broadcasts the 3.0 splat into xmm0 once before the loop; vmulps then
; reads the data straight from memory.
define void @bcast_unfold_fmul_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB38_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB38_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <4 x float>, ptr %addr, align 4
  %prod = fmul <4 x float> %vec, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %prod, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 doubles by 3.0 in place, 8 lanes per iteration. The expected
; code broadcasts the 3.0 splat into zmm0 once before the loop; vmulpd then
; reads the data straight from memory.
define void @bcast_unfold_fmul_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB39_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB39_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <8 x double>, ptr %addr, align 8
  %prod = fmul <8 x double> %vec, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %prod, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 doubles by 3.0 in place, 4 lanes per iteration. The expected
; code broadcasts the 3.0 splat into ymm0 once before the loop; vmulpd then
; reads the data straight from memory.
define void @bcast_unfold_fmul_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB40_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB40_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <4 x double>, ptr %addr, align 8
  %prod = fmul <4 x double> %vec, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %prod, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Multiplies 1024 doubles by 3.0 in place, 2 lanes per iteration. The expected
; code loads the 3.0 splat into xmm0 with vmovddup once before the loop;
; vmulpd then reads the data straight from memory.
define void @bcast_unfold_fmul_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB41_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB41_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <2 x double>, ptr %addr, align 8
  %prod = fmul <2 x double> %vec, <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %prod, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 floats by 2.0 in place, 16 lanes per iteration. The expected
; code broadcasts the 2.0 divisor into zmm0 once before the loop; the data is
; loaded with a separate vmovups and divided register-by-register.
define void @bcast_unfold_fdiv_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB42_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB42_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <16 x float>, ptr %addr, align 4
  %quot = fdiv <16 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %quot, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 floats by 2.0 in place, 8 lanes per iteration. The expected
; code broadcasts the 2.0 divisor into ymm0 once before the loop; the data is
; loaded with a separate vmovups and divided register-by-register.
define void @bcast_unfold_fdiv_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB43_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB43_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <8 x float>, ptr %addr, align 4
  %quot = fdiv <8 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %quot, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 floats by 2.0 in place, 4 lanes per iteration. The expected
; code broadcasts the 2.0 divisor into xmm0 once before the loop; the data is
; loaded with a separate vmovups and divided register-by-register.
define void @bcast_unfold_fdiv_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB44_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB44_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %vec = load <4 x float>, ptr %addr, align 4
  %quot = fdiv <4 x float> %vec, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %quot, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 doubles by 2.0 in place, 8 lanes per iteration. The expected
; code broadcasts the 2.0 divisor into zmm0 once before the loop; the data is
; loaded with a separate vmovupd and divided register-by-register.
define void @bcast_unfold_fdiv_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB45_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB45_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <8 x double>, ptr %addr, align 8
  %quot = fdiv <8 x double> %vec, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %quot, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 doubles by 2.0 in place, 4 lanes per iteration. The expected
; code broadcasts the 2.0 divisor into ymm0 once before the loop; the data is
; loaded with a separate vmovupd and divided register-by-register.
define void @bcast_unfold_fdiv_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB46_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB46_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <4 x double>, ptr %addr, align 8
  %quot = fdiv <4 x double> %vec, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %quot, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Divides 1024 doubles by 2.0 in place, 2 lanes per iteration. The expected
; code loads the 2.0 divisor into xmm0 with vmovddup once before the loop; the
; data is loaded with a separate vmovupd and divided register-by-register.
define void @bcast_unfold_fdiv_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB47_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vdivpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB47_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %vec = load <2 x double>, ptr %addr, align 8
  %quot = fdiv <2 x double> %vec, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %quot, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb9, label %bb1

bb9: ; preds = %bb1
  ret void
}
|
|
|
|
; Computes x*x + 2.0 over 1024 floats in place, 4 lanes per iteration, with
; both operations marked `contract`. The expected code broadcasts 2.0 into
; xmm0 once before the loop and uses it as the addend of a vfmadd213ps.
define void @bcast_unfold_fma213_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB48_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB48_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb2 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <4 x float>, ptr %addr, align 4
  %sq = fmul contract <4 x float> %x, %x
  %res = fadd contract <4 x float> %sq, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}
|
|
|
|
; Computes x + x*2.0 over 1024 floats in place, 4 lanes per iteration, with
; both operations marked `contract`. The expected code broadcasts 2.0 into
; xmm0 once before the loop and uses it as a multiplicand of a vfmadd231ps.
define void @bcast_unfold_fma231_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB49_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB49_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <4 x float>, ptr %addr, align 4
  %scaled = fmul contract <4 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = fadd contract <4 x float> %x, %scaled
  store <4 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
|
|
|
|
; Computes x*x + 2.0 over 1024 floats in place, 8 lanes per iteration, with
; both operations marked `contract`. The expected code broadcasts 2.0 into
; ymm0 once before the loop and uses it as the addend of a vfmadd213ps.
define void @bcast_unfold_fma213_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB50_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB50_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb2 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <8 x float>, ptr %addr, align 4
  %sq = fmul contract <8 x float> %x, %x
  %res = fadd contract <8 x float> %sq, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}
|
|
|
|
; Computes x + x*2.0 over 1024 floats in place, 8 lanes per iteration, with
; both operations marked `contract`. The expected code broadcasts 2.0 into
; ymm0 once before the loop and uses it as a multiplicand of a vfmadd231ps.
define void @bcast_unfold_fma231_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB51_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB51_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <8 x float>, ptr %addr, align 4
  %scaled = fmul contract <8 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = fadd contract <8 x float> %x, %scaled
  store <8 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
|
|
|
|
; Computes x*x + 2.0 over 1024 floats in place, 16 lanes per iteration, with
; both operations marked `contract`. The expected code broadcasts 2.0 into
; zmm0 once before the loop and uses it as the addend of a vfmadd213ps.
define void @bcast_unfold_fma213_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB52_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB52_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb2 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <16 x float>, ptr %addr, align 4
  %sq = fmul contract <16 x float> %x, %x
  %res = fadd contract <16 x float> %sq, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 16 lanes per iteration: x = x + x * 2.0
; (both ops carry 'contract'). The expected code keeps the 2.0 splat in zmm0
; outside the loop and uses the register form of vfmadd231ps.
define void @bcast_unfold_fma231_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB53_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB53_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 16-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <16 x float>, ptr %addr, align 4
  ; Scale by the splat constant, then accumulate onto the loaded value.
  %scaled = fmul contract <16 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = fadd contract <16 x float> %x, %scaled
  store <16 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 2 lanes per iteration: x = x * x + 2.0
; (both ops carry 'contract'). The expected code materializes the 2.0 splat
; once in xmm0 (via vmovddup) outside the loop and uses the register form of
; vfmadd213pd.
;
; The load is annotated 'align 8' so that it agrees with the store to the same
; address; the access is still below 16-byte alignment, so the unaligned
; vmovupd forms are still expected.
define void @bcast_unfold_fma213_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB54_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB54_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  ; Element index of the current 2-double slice.
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <2 x double>, ptr %tmp3, align 8
  ; Square the input, then add the splat constant.
  %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 2
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 2 lanes per iteration: x = x + x * 2.0
; (both ops carry 'contract'). The expected code materializes the 2.0 splat
; once in xmm0 (via vmovddup) outside the loop and uses the register form of
; vfmadd231pd.
define void @bcast_unfold_fma231_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB55_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB55_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 2-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <2 x double>, ptr %addr, align 8
  ; Scale by the splat constant, then accumulate onto the loaded value.
  %scaled = fmul contract <2 x double> %x, <double 2.000000e+00, double 2.000000e+00>
  %res = fadd contract <2 x double> %x, %scaled
  store <2 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 4 lanes per iteration: x = x * x + 2.0
; (both ops carry 'contract'). The expected code keeps the 2.0 splat in ymm0
; outside the loop and uses the register form of vfmadd213pd.
define void @bcast_unfold_fma213_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB56_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB56_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  ; Element index of the current 4-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb2 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <4 x double>, ptr %addr, align 8
  ; Square the input, then add the splat constant.
  %sq = fmul contract <4 x double> %x, %x
  %res = fadd contract <4 x double> %sq, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 4 lanes per iteration: x = x + x * 2.0
; (both ops carry 'contract'). The expected code keeps the 2.0 splat in ymm0
; outside the loop and uses the register form of vfmadd231pd.
define void @bcast_unfold_fma231_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB57_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB57_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <4 x double>, ptr %addr, align 8
  ; Scale by the splat constant, then accumulate onto the loaded value.
  %scaled = fmul contract <4 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = fadd contract <4 x double> %x, %scaled
  store <4 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 8 lanes per iteration: x = x * x + 2.0
; (both ops carry 'contract'). The expected code keeps the 2.0 splat in zmm0
; outside the loop and uses the register form of vfmadd213pd.
define void @bcast_unfold_fma213_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB58_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB58_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  ; Element index of the current 8-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb2 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <8 x double>, ptr %addr, align 8
  ; Square the input, then add the splat constant.
  %sq = fmul contract <8 x double> %x, %x
  %res = fadd contract <8 x double> %sq, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 8 lanes per iteration: x = x + x * 2.0
; (both ops carry 'contract'). The expected code keeps the 2.0 splat in zmm0
; outside the loop and uses the register form of vfmadd231pd.
define void @bcast_unfold_fma231_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB59_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB59_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <8 x double>, ptr %addr, align 8
  ; Scale by the splat constant, then accumulate onto the loaded value.
  %scaled = fmul contract <8 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = fadd contract <8 x double> %x, %scaled
  store <8 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 4 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in xmm0
; outside the loop and emits a register-register vmaxps.
define void @bcast_unfold_fmax_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB60_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB60_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <4 x float>, ptr %addr, align 4
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <4 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <4 x i1> %is.gt, <4 x float> %x, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 8 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in ymm0
; outside the loop and emits a register-register vmaxps.
define void @bcast_unfold_fmax_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB61_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB61_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <8 x float>, ptr %addr, align 4
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <8 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <8 x i1> %is.gt, <8 x float> %x, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 16 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in zmm0
; outside the loop and emits a register-register vmaxps.
define void @bcast_unfold_fmax_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB62_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB62_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 16-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <16 x float>, ptr %addr, align 4
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <16 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <16 x i1> %is.gt, <16 x float> %x, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 2 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code materializes the 2.0 splat once
; in xmm0 (via vmovddup) outside the loop and emits a register-register vmaxpd.
define void @bcast_unfold_fmax_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB63_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB63_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 2-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <2 x double>, ptr %addr, align 8
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <2 x double> %x, <double 2.000000e+00, double 2.000000e+00>
  %res = select <2 x i1> %is.gt, <2 x double> %x, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 4 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in ymm0
; outside the loop and emits a register-register vmaxpd.
define void @bcast_unfold_fmax_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB64_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB64_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <4 x double>, ptr %addr, align 8
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <4 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = select <4 x i1> %is.gt, <4 x double> %x, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 8 lanes per iteration:
; x = (x ogt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in zmm0
; outside the loop and emits a register-register vmaxpd.
define void @bcast_unfold_fmax_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB65_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB65_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <8 x double>, ptr %addr, align 8
  ; Ordered greater-than compare feeding a select: the max pattern.
  %is.gt = fcmp ogt <8 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = select <8 x i1> %is.gt, <8 x double> %x, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 4 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in xmm0
; outside the loop and emits a register-register vminps.
define void @bcast_unfold_fmin_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB66_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB66_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <4 x float>, ptr %addr, align 4
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <4 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <4 x i1> %is.lt, <4 x float> %x, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 8 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in ymm0
; outside the loop and emits a register-register vminps.
define void @bcast_unfold_fmin_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB67_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB67_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <8 x float>, ptr %addr, align 4
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <8 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <8 x i1> %is.lt, <8 x float> %x, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 floats, 16 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in zmm0
; outside the loop and emits a register-register vminps.
define void @bcast_unfold_fmin_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB68_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB68_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 16-float slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds float, ptr %arg, i64 %idx
  %x = load <16 x float>, ptr %addr, align 4
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <16 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %res = select <16 x i1> %is.lt, <16 x float> %x, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 2 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code materializes the 2.0 splat once
; in xmm0 (via vmovddup) outside the loop and emits a register-register vminpd.
define void @bcast_unfold_fmin_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB69_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB69_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 2-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <2 x double>, ptr %addr, align 8
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <2 x double> %x, <double 2.000000e+00, double 2.000000e+00>
  %res = select <2 x i1> %is.lt, <2 x double> %x, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 4 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in ymm0
; outside the loop and emits a register-register vminpd.
define void @bcast_unfold_fmin_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB70_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB70_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <4 x double>, ptr %addr, align 8
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <4 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = select <4 x i1> %is.lt, <4 x double> %x, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 doubles, 8 lanes per iteration:
; x = (x olt 2.0) ? x : 2.0. The expected code keeps the 2.0 splat in zmm0
; outside the loop and emits a register-register vminpd.
define void @bcast_unfold_fmin_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB71_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB71_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-double slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds double, ptr %arg, i64 %idx
  %x = load <8 x double>, ptr %addr, align 8
  ; Ordered less-than compare feeding a select: the min pattern.
  %is.lt = fcmp olt <8 x double> %x, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %res = select <8 x i1> %is.lt, <8 x double> %x, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 i32 values, 4 lanes per iteration:
; x = (x slt 2) ? x : 2. The expected code keeps the splat of 2 in xmm0 outside
; the loop and emits vpminsd with the array element as its memory operand.
define void @bcast_unfold_smin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB72_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB72_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 4-element slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %x = load <4 x i32>, ptr %addr, align 4
  ; Signed less-than compare feeding a select: the signed-min pattern.
  %is.lt = icmp slt <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %res = select <4 x i1> %is.lt, <4 x i32> %x, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 i32 values, 8 lanes per iteration:
; x = (x slt 2) ? x : 2. The expected code keeps the splat of 2 in ymm0 outside
; the loop and emits vpminsd with the array element as its memory operand.
define void @bcast_unfold_smin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB73_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB73_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 8-element slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %x = load <8 x i32>, ptr %addr, align 4
  ; Signed less-than compare feeding a select: the signed-min pattern.
  %is.lt = icmp slt <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %res = select <8 x i1> %is.lt, <8 x i32> %x, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 i32 values, 16 lanes per iteration:
; x = (x slt 2) ? x : 2. The expected code keeps the splat of 2 in zmm0 outside
; the loop and emits vpminsd with the array element as its memory operand.
define void @bcast_unfold_smin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB74_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB74_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 16-element slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %x = load <16 x i32>, ptr %addr, align 4
  ; Signed less-than compare feeding a select: the signed-min pattern.
  %is.lt = icmp slt <16 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %res = select <16 x i1> %is.lt, <16 x i32> %x, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %res, ptr %addr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; In-place loop over 1024 i64 values, 2 lanes per iteration:
; x = (x slt 2) ? x : 2. The expected code keeps the splat of 2 in xmm0 outside
; the loop and emits vpminsq with the array element as its memory operand.
define void @bcast_unfold_smin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB75_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB75_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index of the current 2-element slice.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %addr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %x = load <2 x i64>, ptr %addr, align 8
  ; Signed less-than compare feeding a select: the signed-min pattern.
  %is.lt = icmp slt <2 x i64> %x, <i64 2, i64 2>
  %res = select <2 x i1> %is.lt, <2 x i64> %x, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %res, ptr %addr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed minimum of a loaded <4 x i64> chunk against splat(2), written as
; icmp slt + select and stored back in place; the loop walks 1024 i64 elements
; 4 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpminsq take the array chunk as its memory operand.
define void @bcast_unfold_smin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB76_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB76_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <4 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp slt <4 x i64> %vals, <i64 2, i64 2, i64 2, i64 2>
  %clamped = select <4 x i1> %keep, <4 x i64> %vals, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed minimum of a loaded <8 x i64> chunk against splat(2), written as
; icmp slt + select and stored back in place; the loop walks 1024 i64 elements
; 8 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpminsq take the array chunk as its memory operand.
define void @bcast_unfold_smin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB77_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB77_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <8 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp slt <8 x i64> %vals, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %clamped = select <8 x i1> %keep, <8 x i64> %vals, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <4 x i32> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i32 elements
; 4 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpmaxsd take the array chunk as its memory operand.
define void @bcast_unfold_smax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB78_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB78_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <4 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <4 x i32> %vals, <i32 2, i32 2, i32 2, i32 2>
  %clamped = select <4 x i1> %keep, <4 x i32> %vals, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <8 x i32> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i32 elements
; 8 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpmaxsd take the array chunk as its memory operand.
define void @bcast_unfold_smax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB79_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB79_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <8 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <8 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <8 x i1> %keep, <8 x i32> %vals, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <16 x i32> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i32 elements
; 16 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpmaxsd take the array chunk as its memory operand.
define void @bcast_unfold_smax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB80_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB80_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <16 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <16 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <16 x i1> %keep, <16 x i32> %vals, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <2 x i64> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i64 elements
; 2 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpmaxsq take the array chunk as its memory operand.
define void @bcast_unfold_smax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB81_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB81_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <2 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <2 x i64> %vals, <i64 2, i64 2>
  %clamped = select <2 x i1> %keep, <2 x i64> %vals, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <4 x i64> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i64 elements
; 4 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpmaxsq take the array chunk as its memory operand.
define void @bcast_unfold_smax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB82_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB82_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <4 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <4 x i64> %vals, <i64 2, i64 2, i64 2, i64 2>
  %clamped = select <4 x i1> %keep, <4 x i64> %vals, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Signed maximum of a loaded <8 x i64> chunk against splat(2), written as
; icmp sgt + select and stored back in place; the loop walks 1024 i64 elements
; 8 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpmaxsq take the array chunk as its memory operand.
define void @bcast_unfold_smax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB83_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB83_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <8 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp sgt <8 x i64> %vals, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %clamped = select <8 x i1> %keep, <8 x i64> %vals, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <4 x i32> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i32 elements
; 4 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpminud take the array chunk as its memory operand.
define void @bcast_unfold_umin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB84_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB84_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <4 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <4 x i32> %vals, <i32 2, i32 2, i32 2, i32 2>
  %clamped = select <4 x i1> %keep, <4 x i32> %vals, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <8 x i32> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i32 elements
; 8 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpminud take the array chunk as its memory operand.
define void @bcast_unfold_umin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB85_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB85_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <8 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <8 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <8 x i1> %keep, <8 x i32> %vals, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <16 x i32> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i32 elements
; 16 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpminud take the array chunk as its memory operand.
define void @bcast_unfold_umin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB86_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB86_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <16 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <16 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <16 x i1> %keep, <16 x i32> %vals, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <2 x i64> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i64 elements
; 2 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpminuq take the array chunk as its memory operand.
define void @bcast_unfold_umin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB87_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB87_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <2 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <2 x i64> %vals, <i64 2, i64 2>
  %clamped = select <2 x i1> %keep, <2 x i64> %vals, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <4 x i64> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i64 elements
; 4 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpminuq take the array chunk as its memory operand.
define void @bcast_unfold_umin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB88_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB88_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <4 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <4 x i64> %vals, <i64 2, i64 2, i64 2, i64 2>
  %clamped = select <4 x i1> %keep, <4 x i64> %vals, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned minimum of a loaded <8 x i64> chunk against splat(2), written as
; icmp ult + select and stored back in place; the loop walks 1024 i64 elements
; 8 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpminuq take the array chunk as its memory operand.
define void @bcast_unfold_umin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB89_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB89_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <8 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ult <8 x i64> %vals, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %clamped = select <8 x i1> %keep, <8 x i64> %vals, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <4 x i32> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i32 elements
; 4 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpmaxud take the array chunk as its memory operand.
define void @bcast_unfold_umax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB90_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB90_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <4 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <4 x i32> %vals, <i32 2, i32 2, i32 2, i32 2>
  %clamped = select <4 x i1> %keep, <4 x i32> %vals, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <8 x i32> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i32 elements
; 8 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpmaxud take the array chunk as its memory operand.
define void @bcast_unfold_umax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB91_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB91_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <8 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <8 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <8 x i1> %keep, <8 x i32> %vals, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <16 x i32> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i32 elements
; 16 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpmaxud take the array chunk as its memory operand.
define void @bcast_unfold_umax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB92_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB92_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <16 x i32>, ptr %chunk.ptr, align 4
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <16 x i32> %vals, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %clamped = select <16 x i1> %keep, <16 x i32> %vals, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %clamped, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <2 x i64> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i64 elements
; 2 at a time. The expected code hoists the splat into xmm0 ahead of the loop
; and lets vpmaxuq take the array chunk as its memory operand.
define void @bcast_unfold_umax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB93_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB93_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <2 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <2 x i64> %vals, <i64 2, i64 2>
  %clamped = select <2 x i1> %keep, <2 x i64> %vals, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <4 x i64> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i64 elements
; 4 at a time. The expected code hoists the splat into ymm0 ahead of the loop
; and lets vpmaxuq take the array chunk as its memory operand.
define void @bcast_unfold_umax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB94_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB94_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <4 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <4 x i64> %vals, <i64 2, i64 2, i64 2, i64 2>
  %clamped = select <4 x i1> %keep, <4 x i64> %vals, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Unsigned maximum of a loaded <8 x i64> chunk against splat(2), written as
; icmp ugt + select and stored back in place; the loop walks 1024 i64 elements
; 8 at a time. The expected code hoists the splat into zmm0 ahead of the loop
; and lets vpmaxuq take the array chunk as its memory operand.
define void @bcast_unfold_umax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB95_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB95_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <8 x i64>, ptr %chunk.ptr, align 8
  ; True lanes keep the loaded value; false lanes take the constant.
  %keep = icmp ugt <8 x i64> %vals, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %clamped = select <8 x i1> %keep, <8 x i64> %vals, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %clamped, ptr %chunk.ptr, align 8
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <4 x i32> chunk that are signed-greater than 1 are replaced
; by 3 and stored back in place; the loop walks 1024 i32 elements 4 at a time.
; The expected code hoists splat(1) into xmm0 for vpcmpgtd and applies the
; replacement as a masked vpbroadcastd from a RIP-relative constant.
define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB96_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB96_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <4 x i32>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <4 x i32> %vals, <i32 1, i32 1, i32 1, i32 1>
  %merged = select <4 x i1> %gt, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %vals
  store <4 x i32> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <8 x i32> chunk that are signed-greater than 1 are replaced
; by 3 and stored back in place; the loop walks 1024 i32 elements 8 at a time.
; The expected code hoists splat(1) into ymm0 for vpcmpgtd and applies the
; replacement as a masked vpbroadcastd from a RIP-relative constant.
define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB97_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB97_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <8 x i32>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <8 x i32> %vals, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %merged = select <8 x i1> %gt, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %vals
  store <8 x i32> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <16 x i32> chunk that are signed-greater than 1 are
; replaced by 3 and stored back in place; the loop walks 1024 i32 elements 16 at
; a time. The expected code hoists splat(1) into zmm0 for vpcmpgtd and applies
; the replacement as a masked vpbroadcastd from a RIP-relative constant.
define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB98_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB98_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i32, ptr %arg, i64 %idx
  %vals = load <16 x i32>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <16 x i32> %vals, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %merged = select <16 x i1> %gt, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %vals
  store <16 x i32> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 16
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <2 x i64> chunk that are signed-greater than 1 are replaced
; by 3 and stored back in place; the loop walks 1024 i64 elements 2 at a time.
; The expected code hoists splat(1) into xmm0 for vpcmpgtq and applies the
; replacement as a masked vpbroadcastq from a RIP-relative constant.
; Note: the vector load and store here are only 4-byte aligned.
define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB99_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB99_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <2 x i64>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <2 x i64> %vals, <i64 1, i64 1>
  %merged = select <2 x i1> %gt, <2 x i64> <i64 3, i64 3>, <2 x i64> %vals
  store <2 x i64> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 2
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <4 x i64> chunk that are signed-greater than 1 are replaced
; by 3 and stored back in place; the loop walks 1024 i64 elements 4 at a time.
; The expected code hoists splat(1) into ymm0 for vpcmpgtq and applies the
; replacement as a masked vpbroadcastq from a RIP-relative constant.
; Note: the vector load and store here are only 4-byte aligned.
define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB100_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB100_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <4 x i64>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <4 x i64> %vals, <i64 1, i64 1, i64 1, i64 1>
  %merged = select <4 x i1> %gt, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %vals
  store <4 x i64> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 4
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Lanes of a loaded <8 x i64> chunk that are signed-greater than 1 are replaced
; by 3 and stored back in place; the loop walks 1024 i64 elements 8 at a time.
; The expected code hoists splat(1) into zmm0 for vpcmpgtq and applies the
; replacement as a masked vpbroadcastq from a RIP-relative constant.
; Note: the vector load and store here are only 4-byte aligned.
define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB101_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB101_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; Element index into the array, advanced by one vector per iteration.
  %idx = phi i64 [ 0, %bb ], [ %idx.next, %bb1 ]
  %chunk.ptr = getelementptr inbounds i64, ptr %arg, i64 %idx
  %vals = load <8 x i64>, ptr %chunk.ptr, align 4
  ; True lanes take the constant 3; false lanes keep the loaded value.
  %gt = icmp sgt <8 x i64> %vals, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %merged = select <8 x i1> %gt, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %vals
  store <8 x i64> %merged, ptr %chunk.ptr, align 4
  %idx.next = add i64 %idx, 8
  %done = icmp eq i64 %idx.next, 1024
  br i1 %done, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
|
|
|
|
; Equality compare of <4 x i32> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 4 elements per iteration until the index equals 1024.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB102_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
|
|
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB102_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
|
|
store <4 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Equality compare of <8 x i32> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 8 elements per iteration until the index equals 1024.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB103_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
|
|
; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB103_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
|
|
store <8 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Equality compare of <16 x i32> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 16 elements per iteration until the index equals
; 1024.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB104_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
|
|
; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $64, %rax
|
|
; CHECK-NEXT: jne .LBB104_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <16 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
|
|
store <16 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 16
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Equality compare of <2 x i64> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 2 elements per iteration until the index equals 1024.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB105_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
|
|
; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB105_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <2 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
|
|
store <2 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 2
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Equality compare of <4 x i64> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 4 elements per iteration until the index equals 1024.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB106_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
|
|
; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB106_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
|
|
store <4 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Equality compare of <8 x i64> against splat(1): matching lanes are replaced
; by 3, the others keep the loaded value, and the vector is stored back in
; place. The loop advances 8 elements per iteration until the index equals 1024.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop; the select constant is a masked broadcast load from the constant pool
; inside the loop.
define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB107_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
|
|
; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $64, %rax
|
|
; CHECK-NEXT: jne .LBB107_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
|
|
store <8 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <4 x i32> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 4 elements per iteration.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v4i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB108_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
|
|
; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $4, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB108_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
|
|
store <4 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <8 x i32> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 8 elements per iteration.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v8i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB109_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
|
|
; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $8, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB109_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
|
|
store <8 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <16 x i32> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 16 elements per iteration.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v16i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB110_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
|
|
; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB110_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <16 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
|
|
store <16 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 16
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <2 x i64> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 2 elements per iteration.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v2i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB111_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
|
|
; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $2, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB111_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <2 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
|
|
store <2 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 2
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <4 x i64> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 4 elements per iteration.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v4i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB112_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
|
|
; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $4, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB112_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
|
|
store <4 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Signed less-than compare of <8 x i64> against splat(1): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 8 elements per iteration.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / jg on an index that counts up
; from zero.
define void @bcast_unfold_pcmp_v8i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB113_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
|
|
; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $8, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: jg .LBB113_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
|
|
store <8 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
; The exit test is 'index < 1024' (signed), so the backedge is taken only when
; the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp slt i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <4 x i32> against splat(2): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 4 elements per iteration.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v4i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB114_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
|
|
; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $4, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB114_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
|
|
store <4 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <8 x i32> against splat(2): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 8 elements per iteration.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v8i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB115_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
|
|
; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $8, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB115_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
|
|
store <8 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <16 x i32> against splat(2): lanes that pass
; are replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 16 elements per iteration.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v16i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB116_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
|
|
; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB116_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <16 x i32>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
|
|
store <16 x i32> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 16
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <2 x i64> against splat(2): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 2 elements per iteration.
; Expected assembly: the compare constant is broadcast into xmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v2i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB117_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
|
|
; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $2, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB117_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <2 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
|
|
store <2 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 2
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <4 x i64> against splat(2): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 4 elements per iteration.
; Expected assembly: the compare constant is broadcast into ymm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v4i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB118_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
|
|
; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $4, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB118_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
|
|
store <4 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Unsigned less-than compare of <8 x i64> against splat(2): lanes that pass are
; replaced by 3, the others keep the loaded value, and the vector is stored
; back in place. The index advances by 8 elements per iteration.
; Expected assembly: the compare constant is broadcast into zmm0 ahead of the
; loop, the select constant is a masked broadcast load from the constant pool,
; and the loop ends with an explicit cmpq $1023 / ja on an index that counts up
; from zero.
define void @bcast_unfold_pcmpu_v8i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB119_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
|
|
; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
|
|
; CHECK-NEXT: addq $8, %rax
|
|
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
|
|
; CHECK-NEXT: ja .LBB119_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x i64>, ptr %tmp2, align 4
|
|
%tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
|
|
; Lanes where the compare is true take the constant 3; the rest keep %tmp4.
%tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
|
|
store <8 x i64> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
; The exit test is 'index < 1024' (unsigned), so the backedge is taken only
; when the incremented index is >= 1024.
; NOTE(review): presumably deliberate, to keep an explicit compare in the loop;
; confirm before changing.
%tmp9 = icmp ult i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Ordered less-than compare of <4 x float> against splat(2.0): lanes that pass
; keep the loaded value, the others become 3.0, and the vector is stored back
; in place. The loop advances 4 elements per iteration until the index equals
; 1024.
; Expected assembly: both constants are broadcast into registers (xmm0, xmm1)
; ahead of the loop, and the select is a masked vblendmps inside the loop.
define void @bcast_unfold_cmp_v4f32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v4f32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB120_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2
|
|
; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1
|
|
; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1}
|
|
; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB120_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x float>, ptr %tmp2, align 4
|
|
%tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
|
|
; Lanes where the compare is true keep %tmp4; the rest take the constant 3.0.
%tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
|
|
store <4 x float> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Ordered less-than compare of <8 x float> against splat(2.0): lanes that pass
; keep the loaded value, the others become 3.0, and the vector is stored back
; in place. The loop advances 8 elements per iteration until the index equals
; 1024.
; Expected assembly: both constants are broadcast into registers (ymm0, ymm1)
; ahead of the loop, and the select is a masked vblendmps inside the loop.
define void @bcast_unfold_cmp_v8f32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v8f32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB121_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2
|
|
; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1
|
|
; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1}
|
|
; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB121_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x float>, ptr %tmp2, align 4
|
|
%tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
|
|
; Lanes where the compare is true keep %tmp4; the rest take the constant 3.0.
%tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
|
|
store <8 x float> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 8
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Ordered less-than compare of <16 x float> against splat(2.0): lanes that pass
; keep the loaded value, the others become 3.0, and the vector is stored back
; in place. The loop advances 16 elements per iteration until the index equals
; 1024.
; Expected assembly: both constants are broadcast into registers (zmm0, zmm1)
; ahead of the loop, and the select is a masked vblendmps inside the loop.
define void @bcast_unfold_cmp_v16f32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v16f32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB122_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2
|
|
; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1
|
|
; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1}
|
|
; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $64, %rax
|
|
; CHECK-NEXT: jne .LBB122_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
|
|
%tmp4 = load <16 x float>, ptr %tmp2, align 4
|
|
%tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
|
|
; Lanes where the compare is true keep %tmp4; the rest take the constant 3.0.
%tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
|
|
store <16 x float> %tmp6, ptr %tmp2, align 4
|
|
%tmp8 = add i64 %tmp, 16
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Ordered less-than compare of <2 x double> against splat(2.0): lanes that pass
; keep the loaded value, the others become 3.0, and the vector is stored back
; in place. The loop advances 2 elements per iteration until the index equals
; 1024.
; Expected assembly: both constants are materialized ahead of the loop with
; vmovddup (into xmm0 and xmm1), and the select is a masked vblendmpd inside the
; loop.
define void @bcast_unfold_cmp_v2f64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v2f64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: # xmm0 = mem[0,0]
|
|
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: # xmm1 = mem[0,0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB123_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2
|
|
; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1
|
|
; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
|
|
; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB123_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
|
|
%tmp4 = load <2 x double>, ptr %tmp2, align 8
|
|
%tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
|
|
; Lanes where the compare is true keep %tmp4; the rest take the constant 3.0.
%tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
|
|
store <2 x double> %tmp6, ptr %tmp2, align 8
|
|
%tmp8 = add i64 %tmp, 2
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Compare-and-select loop on <4 x double>.
; Each iteration loads four doubles from %arg, keeps the lanes that compare
; ordered-less-than 2.0, replaces every other lane with 3.0, and stores the
; vector back to the same address. The element index advances by 4 and the
; loop exits when it reaches 1024.
; The expected assembly broadcasts both splat constants (2.0 and 3.0) into
; ymm0/ymm1 with vbroadcastsd ahead of the loop, so the vcmpltpd and
; vblendmpd inside the loop use register operands only.
define void @bcast_unfold_cmp_v4f64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v4f64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB124_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2
|
|
; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1
|
|
; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
|
|
; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB124_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x double>, ptr %tmp2, align 8
|
|
%tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
|
|
; Lanes below 2.0 keep the loaded value; all other lanes become 3.0.
%tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
|
|
store <4 x double> %tmp6, ptr %tmp2, align 8
|
|
; Advance by one vector (4 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Compare-and-select loop on <8 x double>.
; Each iteration loads eight doubles from %arg, keeps the lanes that compare
; ordered-less-than 2.0, replaces every other lane with 3.0, and stores the
; vector back to the same address. The element index advances by 8 and the
; loop exits when it reaches 1024.
; The expected assembly broadcasts both splat constants (2.0 and 3.0) into
; zmm0/zmm1 with vbroadcastsd ahead of the loop, so the vcmpltpd and
; vblendmpd inside the loop use register operands only.
define void @bcast_unfold_cmp_v8f64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v8f64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB125_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2
|
|
; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1
|
|
; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
|
|
; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $64, %rax
|
|
; CHECK-NEXT: jne .LBB125_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
|
|
%tmp4 = load <8 x double>, ptr %tmp2, align 8
|
|
%tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
|
|
; Lanes below 2.0 keep the loaded value; all other lanes become 3.0.
%tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
|
|
store <8 x double> %tmp6, ptr %tmp2, align 8
|
|
; Advance by one vector (8 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 8
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Compare-and-select loop on <8 x float> where both select arms are constants.
; Each iteration loads eight floats from %0, compares them ordered-less-than
; 2.0, and stores 4.0 in the lanes where the compare holds and 3.0 in the
; others. The element index advances by 8 and the loop exits at 1024.
; In the expected assembly the 2.0 and 3.0 splats are broadcast into
; ymm0/ymm1 ahead of the loop, and the vector load is folded into vcmpgtps
; as its memory operand. The remaining constant (the 4.0 splat in the IR)
; stays a {1to8} embedded-broadcast memory operand of vblendmps.
; The function uses unnamed values, so the entry block is the implicit %1.
define void @bcast_unfold_cmp_v8f32_refold(ptr nocapture %0) {
|
|
; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
|
|
; CHECK: # %bb.0:
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
|
|
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
|
|
; CHECK-NEXT: vblendmps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
|
|
; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB126_1
|
|
; CHECK-NEXT: # %bb.2:
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
br label %2
|
|
|
|
2: ; preds = %2, %1
|
|
; %3 is the element index into %0, starting at 0.
%3 = phi i64 [ 0, %1 ], [ %8, %2 ]
|
|
%4 = getelementptr inbounds float, ptr %0, i64 %3
|
|
%5 = load <8 x float>, ptr %4, align 4
|
|
%6 = fcmp olt <8 x float> %5, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
|
|
; The loaded value is only used by the compare: lanes below 2.0 become 4.0,
; all other lanes become 3.0.
%7 = select <8 x i1> %6, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
|
|
store <8 x float> %7, ptr %4, align 4
|
|
; Advance by one vector (8 elements); leave the loop at index 1024.
%8 = add i64 %3, 8
|
|
%9 = icmp eq i64 %8, 1024
|
|
br i1 %9, label %10, label %2
|
|
|
|
10: ; preds = %2
|
|
ret void
|
|
}
|
|
|
|
; Bit-test-and-select loop on <4 x i32>, "and != 0" form.
; Each iteration loads four i32 values from %arg, masks them with 2, and
; replaces with 3 every lane whose masked value is non-zero; the other lanes
; keep the loaded value. The result is stored back in place. The element
; index advances by 4 and the loop exits at 1024.
; The expected assembly broadcasts the mask splat (2) into xmm0 ahead of the
; loop and tests it with a register-register vptestmd. The replacement
; constant is merged in by a masked vpbroadcastd from the constant pool.
define void @bcast_unfold_ptestm_v4i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB127_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
|
|
; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB127_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i32>, ptr %tmp2, align 4
|
|
%tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
|
|
%tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
|
|
; Lanes with the tested bit set become 3; the rest keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
|
|
store <4 x i32> %tmp6, ptr %tmp2, align 4
|
|
; Advance by one vector (4 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Bit-test-and-select loop on <4 x i32>, "and == 0" form.
; Each iteration loads four i32 values from %arg, masks them with 2, and
; replaces with 3 every lane whose masked value is zero; the other lanes keep
; the loaded value. The result is stored back in place. The element index
; advances by 4 and the loop exits at 1024.
; The expected assembly broadcasts the mask splat (2) into xmm0 ahead of the
; loop and tests it with a register-register vptestnmd. The replacement
; constant is merged in by a masked vpbroadcastd from the constant pool.
define void @bcast_unfold_ptestnm_v4i32(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB128_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
|
|
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
|
|
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $16, %rax
|
|
; CHECK-NEXT: jne .LBB128_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i32>, ptr %tmp2, align 4
|
|
%tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
|
|
%tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
|
|
; Lanes with the tested bit clear become 3; the rest keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
|
|
store <4 x i32> %tmp6, ptr %tmp2, align 4
|
|
; Advance by one vector (4 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Bit-test-and-select loop on <4 x i64>, "and != 0" form.
; Each iteration loads four i64 values from %arg, masks them with 2, and
; replaces with 3 every lane whose masked value is non-zero; the other lanes
; keep the loaded value. The result is stored back in place. The element
; index advances by 4 and the loop exits at 1024.
; The expected assembly broadcasts the mask splat (2) into ymm0 ahead of the
; loop and tests it with a register-register vptestmq. The replacement
; constant is merged in by a masked vpbroadcastq from the constant pool.
define void @bcast_unfold_ptestm_v4i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB129_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
|
|
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB129_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i64>, ptr %tmp2, align 8
|
|
%tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
|
|
%tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
|
|
; Lanes with the tested bit set become 3; the rest keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
|
|
store <4 x i64> %tmp6, ptr %tmp2, align 8
|
|
; Advance by one vector (4 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Bit-test-and-select loop on <4 x i64>, "and == 0" form.
; Each iteration loads four i64 values from %arg, masks them with 2, and
; replaces with 3 every lane whose masked value is zero; the other lanes keep
; the loaded value. The result is stored back in place. The element index
; advances by 4 and the loop exits at 1024.
; The expected assembly broadcasts the mask splat (2) into ymm0 ahead of the
; loop and tests it with a register-register vptestnmq. The replacement
; constant is merged in by a masked vpbroadcastq from the constant pool.
define void @bcast_unfold_ptestnm_v4i64(ptr %arg) {
|
|
; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
|
|
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB130_1: # %bb1
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
|
|
; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
|
|
; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
|
|
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
|
|
; CHECK-NEXT: addq $32, %rax
|
|
; CHECK-NEXT: jne .LBB130_1
|
|
; CHECK-NEXT: # %bb.2: # %bb10
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %bb1, %bb
|
|
; %tmp is the element index into %arg, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
|
|
%tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
|
|
%tmp4 = load <4 x i64>, ptr %tmp2, align 8
|
|
%tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
|
|
%tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
|
|
; Lanes with the tested bit clear become 3; the rest keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
|
|
store <4 x i64> %tmp6, ptr %tmp2, align 8
|
|
; Advance by one vector (4 elements); leave the loop at index 1024.
%tmp8 = add i64 %tmp, 4
|
|
%tmp9 = icmp eq i64 %tmp8, 1024
|
|
br i1 %tmp9, label %bb10, label %bb1
|
|
|
|
bb10: ; preds = %bb1
|
|
ret void
|
|
}
|
|
|
|
; Bitwise-select loop on <16 x i32> that is expected to lower to vpternlogd.
; Each iteration loads sixteen i32 values from %arg and from %arg1, combines
; them as (a & 32767) | (b & -32768), multiplies that value by both loaded
; vectors, and stores the product back to %arg. The element index advances
; by 16 and the loop exits at 1024.
; The two masks are bitwise complements of each other. The expected assembly
; broadcasts only the 32767 splat into zmm0 ahead of the loop and uses it as
; a register operand of a single vpternlogd (immediate 216).
define void @bcast_unfold_vpternlog_v16i32(ptr %arg, ptr %arg1) {
|
|
; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
|
|
; CHECK: # %bb.0: # %bb
|
|
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
|
|
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
|
|
; CHECK-NEXT: .p2align 4, 0x90
|
|
; CHECK-NEXT: .LBB131_1: # %bb2
|
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
|
|
; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm2
|
|
; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm3
|
|
; CHECK-NEXT: vpternlogd $216, %zmm0, %zmm1, %zmm2
|
|
; CHECK-NEXT: vpmulld %zmm3, %zmm2, %zmm1
|
|
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
|
|
; CHECK-NEXT: addq $64, %rax
|
|
; CHECK-NEXT: jne .LBB131_1
|
|
; CHECK-NEXT: # %bb.2: # %bb20
|
|
; CHECK-NEXT: vzeroupper
|
|
; CHECK-NEXT: retq
|
|
bb:
|
|
br label %bb2
|
|
|
|
bb2: ; preds = %bb2, %bb
|
|
; %tmp is the element index shared by both input arrays, starting at 0.
%tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
|
|
%tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
|
|
%tmp5 = load <16 x i32>, ptr %tmp3, align 4
|
|
%tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp
|
|
%tmp11 = load <16 x i32>, ptr %tmp6, align 4
|
|
; Take the bits selected by 32767 from %tmp5 and the complementary bits
; (mask -32768) from %tmp11.
%tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
|
|
%tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
|
|
%tmp14 = or <16 x i32> %tmp12, %tmp13
|
|
; The two multiplies give both loaded vectors a second use besides the
; bitwise select.
%tmp15 = mul <16 x i32> %tmp14, %tmp5
|
|
%tmp16 = mul <16 x i32> %tmp15, %tmp11
|
|
store <16 x i32> %tmp16, ptr %tmp3, align 4
|
|
; Advance by one vector (16 elements); leave the loop at index 1024.
%tmp18 = add i64 %tmp, 16
|
|
%tmp19 = icmp eq i64 %tmp18, 1024
|
|
br i1 %tmp19, label %bb20, label %bb2
|
|
|
|
bb20: ; preds = %bb2
|
|
ret void
|
|
}
|
|
|
|
; Attribute group #0. None of the functions defined just above reference it.
; NOTE(review): presumably attached to intrinsic declarations elsewhere in the
; file - confirm before removing or editing.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|