; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; 2-lane contiguous load/stores

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, ptr %base, i64 %offset
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_i64, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, ptr %base_i64, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_half, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, ptr %base_half, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_float, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, ptr %base_float, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, ptr %base, i64 %offset
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_double, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, ptr %base_double, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.
define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.
define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_i32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, ptr %base_i32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_f16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, ptr %base_f16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_f32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, ptr %base_f32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.
define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.
define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_i16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, ptr %base_i16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, ptr %base, i64 %offset
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_i8, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, ptr %base_i8, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8(<vscale x 4 x i8>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8(<vscale x 8 x i8>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }