; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -o - | FileCheck %s
; RUN: llc -mattr=+alu-lsl-fast --aarch64-enable-sink-fold=false < %s -o - | FileCheck %s -check-prefix=LSLFAST

target triple = "aarch64-linux"

declare void @g(...)

; Check that ADDWrs/ADDXrs with shift > 4 is considered relatively
; slow, thus CSE-d.
define void @f0(i1 %c0, i1 %c1, ptr %a, i64 %i) {
; CHECK-LABEL: f0:
; CHECK:       // %bb.0: // %E
; CHECK-NEXT:    tbz w0, #0, .LBB0_5
; CHECK-NEXT:  // %bb.1: // %A
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    add x0, x2, x3, lsl #5
; CHECK-NEXT:    tbz w1, #0, .LBB0_3
; CHECK-NEXT:  // %bb.2: // %B
; CHECK-NEXT:    bl g
; CHECK-NEXT:    b .LBB0_4
; CHECK-NEXT:  .LBB0_3: // %C
; CHECK-NEXT:    mov x1, x0
; CHECK-NEXT:    bl g
; CHECK-NEXT:  .LBB0_4:
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:  .LBB0_5: // %X
; CHECK-NEXT:    ret
;
; LSLFAST-LABEL: f0:
; LSLFAST:       // %bb.0: // %E
; LSLFAST-NEXT:    tbz w0, #0, .LBB0_5
; LSLFAST-NEXT:  // %bb.1: // %A
; LSLFAST-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; LSLFAST-NEXT:    .cfi_def_cfa_offset 16
; LSLFAST-NEXT:    .cfi_offset w30, -16
; LSLFAST-NEXT:    add x0, x2, x3, lsl #5
; LSLFAST-NEXT:    tbz w1, #0, .LBB0_3
; LSLFAST-NEXT:  // %bb.2: // %B
; LSLFAST-NEXT:    bl g
; LSLFAST-NEXT:    b .LBB0_4
; LSLFAST-NEXT:  .LBB0_3: // %C
; LSLFAST-NEXT:    mov x1, x0
; LSLFAST-NEXT:    bl g
; LSLFAST-NEXT:  .LBB0_4:
; LSLFAST-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; LSLFAST-NEXT:  .LBB0_5: // %X
; LSLFAST-NEXT:    ret
E:
  %p0 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
  br i1 %c0, label %A, label %X

A:
  br i1 %c1, label %B, label %C

B:
  call void @g(ptr %p0)
  br label %X

C:
  %p1 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
  call void @g(ptr %p1, ptr %p0)
  br label %X

X:
  ret void
}

; Check that ADDWrs/ADDXrs with shift <= 4 is considered relatively fast on sub-targets
; with feature +alu-lsl-fast, thus *not* CSE-d.
define void @f1(i1 %c0, i1 %c1, ptr %a, i64 %i) {
; CHECK-LABEL: f1:
; CHECK:       // %bb.0: // %E
; CHECK-NEXT:    tbz w0, #0, .LBB1_5
; CHECK-NEXT:  // %bb.1: // %A
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    add x0, x2, x3, lsl #4
; CHECK-NEXT:    tbz w1, #0, .LBB1_3
; CHECK-NEXT:  // %bb.2: // %B
; CHECK-NEXT:    bl g
; CHECK-NEXT:    b .LBB1_4
; CHECK-NEXT:  .LBB1_3: // %C
; CHECK-NEXT:    mov x1, x0
; CHECK-NEXT:    bl g
; CHECK-NEXT:  .LBB1_4:
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:  .LBB1_5: // %X
; CHECK-NEXT:    ret
;
; LSLFAST-LABEL: f1:
; LSLFAST:       // %bb.0: // %E
; LSLFAST-NEXT:    tbz w0, #0, .LBB1_5
; LSLFAST-NEXT:  // %bb.1: // %A
; LSLFAST-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; LSLFAST-NEXT:    .cfi_def_cfa_offset 16
; LSLFAST-NEXT:    .cfi_offset w30, -16
; LSLFAST-NEXT:    add x8, x2, x3, lsl #4
; LSLFAST-NEXT:    tbz w1, #0, .LBB1_3
; LSLFAST-NEXT:  // %bb.2: // %B
; LSLFAST-NEXT:    mov x0, x8
; LSLFAST-NEXT:    bl g
; LSLFAST-NEXT:    b .LBB1_4
; LSLFAST-NEXT:  .LBB1_3: // %C
; LSLFAST-NEXT:    add x0, x2, x3, lsl #4
; LSLFAST-NEXT:    mov x1, x8
; LSLFAST-NEXT:    bl g
; LSLFAST-NEXT:  .LBB1_4:
; LSLFAST-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; LSLFAST-NEXT:  .LBB1_5: // %X
; LSLFAST-NEXT:    ret
E:
  %p0 = getelementptr {i64, i64}, ptr %a, i64 %i
  br i1 %c0, label %A, label %X

A:
  br i1 %c1, label %B, label %C

B:
  call void @g(ptr %p0)
  br label %X

C:
  %p1 = getelementptr {i64, i64}, ptr %a, i64 %i
  call void @g(ptr %p1, ptr %p0)
  br label %X

X:
  ret void
}