bolt/deps/llvm-18.1.8/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v6, ptr %out6"
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
; REQUIRES: asserts

; Target configuration: the module is compiled for the x86_64 Linux triple
; with the data layout string given here.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; @A: 1024 x i8 source array; @test loads one byte from it per loop iteration.
@A = global [1024 x i8] zeroinitializer, align 128
; @B: 1024 x i32 destination array; @test writes seven adjacent elements of it
; per loop iteration.
@B = global [1024 x i32] zeroinitializer, align 128

; Loop under test: each iteration loads one i8 from @A, zero-extends it to
; i32, and stores seven values (%v + 0 .. %v + 6) to seven adjacent i32
; elements of @B starting at index %iv, then advances %iv by 7.
; Presumably the seven stores are meant to form one interleaved store group
; with factor 7 (per the test's file name) -- confirm against the vectorizer.
;
; The assertions inside the function record, per check group and per
; vectorization factor (VF), the "estimated cost" debug line printed for the
; last of the seven stores (the store of %v6 through %out6).
define void @test() {
; SSE2-LABEL: 'test'
; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
; SSE2:  LV: Found an estimated cost of 51 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
; SSE2:  LV: Found an estimated cost of 108 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
; SSE2:  LV: Found an estimated cost of 216 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
;
; AVX1-LABEL: 'test'
; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
; AVX1:  LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
; AVX1:  LV: Found an estimated cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
; AVX1:  LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
; AVX1:  LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
;
; AVX2-LABEL: 'test'
; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
; AVX2:  LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
; AVX2:  LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
; AVX2:  LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
;
; AVX512-LABEL: 'test'
; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
; AVX512:  LV: Found an estimated cost of 20 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
; AVX512:  LV: Found an estimated cost of 40 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
; AVX512:  LV: Found an estimated cost of 140 for VF 32 For instruction: store i32 %v6, ptr %out6, align 4
;
entry:
  br label %for.body

for.body:
  ; Induction variable: starts at 0 and is advanced by 7 at the bottom of the
  ; loop body.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

  ; Element indices %iv + 0 .. %iv + 6, one per store below.
  %iv.0 = add nuw nsw i64 %iv, 0
  %iv.1 = add nuw nsw i64 %iv, 1
  %iv.2 = add nuw nsw i64 %iv, 2
  %iv.3 = add nuw nsw i64 %iv, 3
  %iv.4 = add nuw nsw i64 %iv, 4
  %iv.5 = add nuw nsw i64 %iv, 5
  %iv.6 = add nuw nsw i64 %iv, 6

  ; Load a single byte from @A at index %iv.0.
  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
  %v.narrow = load i8, ptr %in

  ; Widen the loaded byte to the i32 element type that is stored.
  %v = zext i8 %v.narrow to i32

  ; Seven values derived from %v (offsets 0 .. 6), one per store.
  %v0 = add i32 %v, 0
  %v1 = add i32 %v, 1
  %v2 = add i32 %v, 2
  %v3 = add i32 %v, 3
  %v4 = add i32 %v, 4
  %v5 = add i32 %v, 5
  %v6 = add i32 %v, 6

  ; Addresses of seven adjacent i32 elements of @B, at indices %iv.0 .. %iv.6.
  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
  %out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4
  %out5 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.5
  %out6 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.6

  ; The stores carry no explicit alignment here; the expected debug lines at
  ; the top of the function show the last one printed with "align 4". Only
  ; that last store (of %v6) is matched by the assertions.
  store i32 %v0, ptr %out0
  store i32 %v1, ptr %out1
  store i32 %v2, ptr %out2
  store i32 %v3, ptr %out3
  store i32 %v4, ptr %out4
  store i32 %v5, ptr %out5
  store i32 %v6, ptr %out6

  ; Step by 7 elements and loop while the next index is (unsigned) below 1024.
  ; NOTE(review): the body still runs for %iv = 1022 (146 * 7), where
  ; %out2 .. %out6 index elements 1024 .. 1028 of the 1024-element @B. The
  ; test appears to only inspect the vectorizer's debug output, but confirm
  ; the bound is intentional before reusing this IR elsewhere.
  %iv.next = add nuw nsw i64 %iv.0, 7
  %cmp = icmp ult i64 %iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}
Embed LLVM 18.1.8 2025-02-14 19:21:04 +01:00			`; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v6, ptr %out6"`
			`; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 \| FileCheck %s --check-prefix=SSE2`
			`; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 \| FileCheck %s --check-prefix=AVX1`
			`; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 \| FileCheck %s --check-prefix=AVX2`
			`; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 \| FileCheck %s --check-prefix=AVX512`
			`; REQUIRES: asserts`

			`target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"`
			`target triple = "x86_64-unknown-linux-gnu"`

			`@A = global [1024 x i8] zeroinitializer, align 128`
			`@B = global [1024 x i32] zeroinitializer, align 128`

			`define void @test() {`
			`; SSE2-LABEL: 'test'`
			`; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4`
			`; SSE2: LV: Found an estimated cost of 51 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4`
			`; SSE2: LV: Found an estimated cost of 108 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4`
			`; SSE2: LV: Found an estimated cost of 216 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4`
			`;`
			`; AVX1-LABEL: 'test'`
			`; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX1: LV: Found an estimated cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX1: LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX1: LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4`
			`;`
			`; AVX2-LABEL: 'test'`
			`; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX2: LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX2: LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX2: LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4`
			`;`
			`; AVX512-LABEL: 'test'`
			`; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX512: LV: Found an estimated cost of 20 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4`
			`; AVX512: LV: Found an estimated cost of 140 for VF 32 For instruction: store i32 %v6, ptr %out6, align 4`
			`;`
			`entry:`
			`br label %for.body`

			`for.body:`
			`%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]`

			`%iv.0 = add nuw nsw i64 %iv, 0`
			`%iv.1 = add nuw nsw i64 %iv, 1`
			`%iv.2 = add nuw nsw i64 %iv, 2`
			`%iv.3 = add nuw nsw i64 %iv, 3`
			`%iv.4 = add nuw nsw i64 %iv, 4`
			`%iv.5 = add nuw nsw i64 %iv, 5`
			`%iv.6 = add nuw nsw i64 %iv, 6`

			`%in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0`
			`%v.narrow = load i8, ptr %in`

			`%v = zext i8 %v.narrow to i32`

			`%v0 = add i32 %v, 0`
			`%v1 = add i32 %v, 1`
			`%v2 = add i32 %v, 2`
			`%v3 = add i32 %v, 3`
			`%v4 = add i32 %v, 4`
			`%v5 = add i32 %v, 5`
			`%v6 = add i32 %v, 6`

			`%out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0`
			`%out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1`
			`%out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2`
			`%out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3`
			`%out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4`
			`%out5 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.5`
			`%out6 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.6`

			`store i32 %v0, ptr %out0`
			`store i32 %v1, ptr %out1`
			`store i32 %v2, ptr %out2`
			`store i32 %v3, ptr %out3`
			`store i32 %v4, ptr %out4`
			`store i32 %v5, ptr %out5`
			`store i32 %v6, ptr %out6`

			`%iv.next = add nuw nsw i64 %iv.0, 7`
			`%cmp = icmp ult i64 %iv.next, 1024`
			`br i1 %cmp, label %for.body, label %for.cond.cleanup`

			`for.cond.cleanup:`
			`ret void`
			`}`