; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in store i32 %load, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 ; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 store i32 %load, ptr addrspace(1) %out store i32 %load1, ptr addrspace(1) %out1 ret void } define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 ; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 ; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 ; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 ; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 ; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 ; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 ; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 ; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 %load2 = load i32, ptr addrspace(1) %in2 %load3 = load i32, ptr addrspace(1) %in3 store i32 %load, ptr addrspace(1) %out store i32 %load1, ptr addrspace(1) %out1 store i32 %load2, ptr addrspace(1) %out2 store i32 %load3, ptr addrspace(1) %out3 ret void } ; Preload args with inreg in the NO-PRELOAD case. define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 store i32 %load, ptr addrspace(1) %out store i32 %load1, ptr addrspace(1) %out1 ret void } ; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 store i32 %load, ptr addrspace(1) %out store i32 %load1, ptr addrspace(1) %out1 ret void } define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 ; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned ; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned ; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 ; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 ; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !0 ; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned ; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 ; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 %ext = zext i16 %arg0 to i32 %add = add i32 %load, %ext store i32 %add, ptr addrspace(1) %out store i32 %load1, ptr addrspace(1) %out1 ret void } ; In this case both i16 args with be preloaded into the first SGPR. define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 ; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 ; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 ; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 ; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] ; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 ; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 ; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 ; PRELOAD-1-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 ; PRELOAD-1-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 ; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32 ; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] ; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 ; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 ; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] ; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 ; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 ; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] ; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: ret void ; %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 %add = add i32 %ext, %ext1 store i32 %add, ptr addrspace(1) %out ret void } attributes #0 = { nounwind }