; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG
; RUN: opt < %s -loop-reduce -mcpu=bdver2 -S | FileCheck %s --check-prefix=BUL
; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW
; RUN: llc < %s | FileCheck %s --check-prefix=BASE
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
; FIXME: If a CPU can macro-fuse a compare and branch, then we discount that
; cost in LSR and avoid generating large offsets in each memory access.
; This reduces code size and may improve decode throughput.

; What the expectations below record for the single loop in @maxArray:
;
; * opt with -loop-reduce and -mcpu=btver2 (the JAG prefix): the loop is
;   rewritten around a byte-offset induction variable that starts at -524288,
;   steps by 16 and is compared against 0, and each address gets a separate
;   +524288 byte offset.
; * opt with -loop-reduce and -mcpu=bdver2 or -mcpu=haswell (the BUL and HSW
;   prefixes): the original element index is kept, still stepping by 2 and
;   compared against 65536, and is shifted left by 3 to form byte offsets.
; * llc with no extra attributes (the BASE prefix): the loop ends in addq/jne
;   with no cmpq, and every memory operand carries a 524288 displacement.
; * llc with -mattr=macrofusion or -mattr=branchfusion (both use the FUSE
;   prefix): the loop ends in addq/cmpq/jne and the memory operands have no
;   displacement.
;
; 524288 is the trip-count bound in bytes: 65536 doubles * 8 bytes each.

define void @maxArray(ptr noalias nocapture %x, ptr noalias nocapture readonly %y) {
; JAG-LABEL: @maxArray(
; JAG-NEXT: entry:
; JAG-NEXT: br label [[VECTOR_BODY:%.*]]
; JAG: vector.body:
; JAG-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
; JAG-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[LSR_IV]]
; JAG-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP2]], i64 524288
; JAG-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Y:%.*]], i64 [[LSR_IV]]
; JAG-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 524288
; JAG-NEXT: [[XVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP3]], align 8
; JAG-NEXT: [[YVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP1]], align 8
; JAG-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; JAG-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; JAG-NEXT: store <2 x double> [[MAX]], ptr [[SCEVGEP3]], align 8
; JAG-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
; JAG-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; JAG-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; JAG: exit:
; JAG-NEXT: ret void
;
; BUL-LABEL: @maxArray(
; BUL-NEXT: entry:
; BUL-NEXT: br label [[VECTOR_BODY:%.*]]
; BUL: vector.body:
; BUL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; BUL-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDEX]], 3
; BUL-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[TMP0]]
; BUL-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDEX]], 3
; BUL-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Y:%.*]], i64 [[TMP1]]
; BUL-NEXT: [[XVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP1]], align 8
; BUL-NEXT: [[YVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP]], align 8
; BUL-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; BUL-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; BUL-NEXT: store <2 x double> [[MAX]], ptr [[SCEVGEP1]], align 8
; BUL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; BUL-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; BUL-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; BUL: exit:
; BUL-NEXT: ret void
;
; HSW-LABEL: @maxArray(
; HSW-NEXT: entry:
; HSW-NEXT: br label [[VECTOR_BODY:%.*]]
; HSW: vector.body:
; HSW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; HSW-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDEX]], 3
; HSW-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[TMP0]]
; HSW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDEX]], 3
; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Y:%.*]], i64 [[TMP1]]
; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP1]], align 8
; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, ptr [[SCEVGEP]], align 8
; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; HSW-NEXT: store <2 x double> [[MAX]], ptr [[SCEVGEP1]], align 8
; HSW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; HSW: exit:
; HSW-NEXT: ret void
;
; BASE-LABEL: maxArray:
; BASE: # %bb.0: # %entry
; BASE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
; BASE-NEXT: .p2align 4, 0x90
; BASE-NEXT: .LBB0_1: # %vector.body
; BASE-NEXT: # =>This Inner Loop Header: Depth=1
; BASE-NEXT: movupd 524288(%rdi,%rax), %xmm0
; BASE-NEXT: movupd 524288(%rsi,%rax), %xmm1
; BASE-NEXT: maxpd %xmm0, %xmm1
; BASE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
; BASE-NEXT: addq $16, %rax
; BASE-NEXT: jne .LBB0_1
; BASE-NEXT: # %bb.2: # %exit
; BASE-NEXT: retq
; FUSE-LABEL: maxArray:
; FUSE: # %bb.0: # %entry
; FUSE-NEXT: xorl %eax, %eax
; FUSE-NEXT: .p2align 4, 0x90
; FUSE-NEXT: .LBB0_1: # %vector.body
; FUSE-NEXT: # =>This Inner Loop Header: Depth=1
; FUSE-NEXT: movupd (%rdi,%rax,8), %xmm0
; FUSE-NEXT: movupd (%rsi,%rax,8), %xmm1
; FUSE-NEXT: maxpd %xmm0, %xmm1
; FUSE-NEXT: movupd %xmm1, (%rdi,%rax,8)
; FUSE-NEXT: addq $2, %rax
; FUSE-NEXT: cmpq $65536, %rax # imm = 0x10000
; FUSE-NEXT: jne .LBB0_1
; FUSE-NEXT: # %bb.2: # %exit
; FUSE-NEXT: retq
entry:
  br label %vector.body

; One iteration handles two adjacent doubles: it loads a <2 x double> from
; each of %x and %y at element %index, keeps the %y lane wherever it compares
; ordered-greater-than the %x lane (otherwise the %x lane), and stores the
; result back into %x.
vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; %index counts doubles, not bytes; both pointers share the same index.
  %gepx = getelementptr inbounds double, ptr %x, i64 %index
  %gepy = getelementptr inbounds double, ptr %y, i64 %index
  ; The vector accesses are only 8-byte aligned.
  %xval = load <2 x double>, ptr %gepx, align 8
  %yval = load <2 x double>, ptr %gepy, align 8
  %cmp = fcmp ogt <2 x double> %yval, %xval
  %max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval
  store <2 x double> %max, ptr %gepx, align 8
  ; Advance by the two elements just processed and leave the loop once the
  ; index reaches exactly 65536.
  %index.next = add i64 %index, 2
  %done = icmp eq i64 %index.next, 65536
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}