; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK ; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK define ptr @ldrwu32_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrwu32_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adds r3, r0, #3 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrwu32_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adds r3, r0, #2 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x 
} define ptr @ldrwu32_508(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_508: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 508 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrwu32_512(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_512: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #512 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 512 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrwu32_m508(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_m508: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -508 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrwu32_m512(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrwu32_m512: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #512 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, 
ptr %x, i32 -512 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef) store <4 x i32> %0, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adds r3, r0, #3 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr 
@ldrhu32_254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r0, #254] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 254 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #256 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 256 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_m254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_m254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, [r0, #-254] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -254 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu32_m256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu32_m256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #256 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.u32 q0, 
[r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -256 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = zext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adds r3, r0, #3 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, 
i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r0, #254] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 254 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #256 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 256 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_m254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_m254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r0, #-254] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -254 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhs32_m256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhs32_m256: ; CHECK: @ %bb.0: @ 
%entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #256 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrht.s32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -256 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef) %1 = sext <4 x i16> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrhu16_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r0, #4] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: adds r3, r0, #3 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r0, #2] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, 
zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r0, #254] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 254 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #256 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 256 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_m254(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_m254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -254 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrhu16_m256(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrhu16_m256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #256 ; 
CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrht.u16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -256 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef) store <8 x i16> %0, ptr %y, align 2 ret ptr %x } define ptr @ldrbu32_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r0, #3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x 
i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r0, #127] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 127 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #128 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 128 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_m127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -127 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu32_m128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu32_m128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: 
vldrw.u32 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #128 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -128 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r0, #4] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r0, #3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r0, #2] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> 
%mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r0, #127] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 127 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #128 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 128 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_m127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs32_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -127 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbs32_m128(ptr %x, ptr %y, 
ptr %m) { ; CHECK-LABEL: ldrbs32_m128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #128 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrbt.s32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -128 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef) %1 = sext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, ptr %y, align 4 ret ptr %x } define ptr @ldrbu16_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r0, #4] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r0, #3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r0, #2] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, 
ptr %x, i32 2 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r0, #127] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 127 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #128 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 128 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_m127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -127 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x 
i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu16_m128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu16_m128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #128 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.u16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -128 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = zext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r0, #4] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r0, #3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r0, #2] ; 
CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r0, #127] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 127 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #128 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 128 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_m127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -127 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr 
%z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbs16_m128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbs16_m128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: sub.w r3, r0, #128 ; CHECK-NEXT: vpt.i16 ne, q0, zr ; CHECK-NEXT: vldrbt.s16 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -128 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef) %1 = sext <8 x i8> %0 to <8 x i16> store <8 x i16> %1, ptr %y, align 2 ret ptr %x } define ptr @ldrbu8_4(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0, #4] ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 4 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr @ldrbu8_3(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0, #3] ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 3 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr @ldrbu8_2(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0, #2] ; 
CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 2 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr @ldrbu8_127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0, #127] ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 127 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr @ldrbu8_128(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: add.w r3, r0, #128 ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 128 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr @ldrbu8_m127(ptr %x, ptr %y, ptr %m) { ; CHECK-LABEL: ldrbu8_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r2] ; CHECK-NEXT: vpt.i8 ne, q0, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127] ; CHECK-NEXT: vstrb.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %x, i32 -127 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef) store <16 x i8> %0, ptr %y, align 1 ret ptr %x } define ptr 
@ldrbu8_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: sub.w r3, r0, #128
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -128
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %z, i32 1, <16 x i1> %c, <16 x i8> undef)
  store <16 x i8> %0, ptr %y, align 1
  ret ptr %x
}

; <4 x float> masked load at byte offset 4: expected to fold into the
; predicated load as [r0, #4].
define ptr @ldrwf32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset 3: expected not to fold; the
; address is formed with a separate adds first.
define ptr @ldrwf32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: adds r3, r0, #3
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset 2: expected not to fold; the
; address is formed with a separate adds first.
define ptr @ldrwf32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: adds r3, r0, #2
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr
%x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset 508: expected to fold into the
; predicated load as [r0, #508].
define ptr @ldrwf32_508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset 512: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @ldrwf32_512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: add.w r3, r0, #512
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset -508: expected to fold into the
; predicated load as [r0, #-508].
define ptr @ldrwf32_m508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_m508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <4 x float> masked load at byte offset -512: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @ldrwf32_m512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_m512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: sub.w r3, r0, #512
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x float> undef)
  store <4 x float> %0, ptr %y, align 4
  ret ptr %x
}

; <8 x half> masked load at byte offset 4: expected to fold into the
; predicated load as [r0, #4].
define ptr @ldrhf16_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset 3: expected not to fold; the
; address is formed with a separate adds first.
define ptr @ldrhf16_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: adds r3, r0, #3
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r3]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset 2: expected to fold into the
; predicated load as [r0, #2].
define ptr @ldrhf16_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16>
%mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset 254: expected to fold into the
; predicated load as [r0, #254].
define ptr @ldrhf16_254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 254
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset 256: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @ldrhf16_256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: add.w r3, r0, #256
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r3]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset -254: expected to fold into the
; predicated load as [r0, #-254].
define ptr @ldrhf16_m254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -254
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <8 x half> masked load at byte offset -256: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @ldrhf16_m256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: sub.w r3, r0, #256
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r3]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x half> undef)
  store <8 x half> %0, ptr %y, align 2
  ret ptr %x
}

; <4 x i32> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strw32_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset 3: expected not to fold; the
; address is formed with a separate adds first.
define ptr @strw32_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset 2: expected not to fold; the
; address is formed with a separate adds first.
define ptr @strw32_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adds r1, r0, #2
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x
i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset 508: expected to fold into the
; predicated store as [r0, #508].
define ptr @strw32_508(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset 512: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strw32_512(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #512
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset -508: expected to fold into the
; predicated store as [r0, #-508].
define ptr @strw32_m508(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_m508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i32> masked store at byte offset -512: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @strw32_m512(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_m512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: sub.w r1, r0, #512
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr
inbounds i8, ptr %y, i32 -512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %0, ptr %z, i32 4, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strh32_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset 3: expected not to fold; the
; address is formed with a separate adds first.
define ptr @strh32_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset 2: expected to fold into the
; predicated store as [r0, #2].
define ptr @strh32_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0, #2]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset 254: expected to fold into the
; predicated store as [r0, #254].
define ptr @strh32_254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0, #254]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset 256: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strh32_256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #256
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset -254: expected to fold into the
; predicated store as [r0, #-254].
define ptr @strh32_m254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0, #-254]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0, ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <4 x i16> masked store at byte offset -256: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @strh32_m256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: sub.w r1, r0, #256
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %0,
ptr %z, i32 2, <4 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strh16_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset 3: expected not to fold; the
; address is formed with a separate adds first.
define ptr @strh16_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset 2: expected to fold into the
; predicated store as [r0, #2].
define ptr @strh16_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0, #2]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset 254: expected to fold into the
; predicated store as [r0, #254].
define ptr @strh16_254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0, #254]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 254
  %mask = load <8 x i16>, ptr %m, align 2
  %c =
icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset 256: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strh16_256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #256
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset -254: expected to fold into the
; predicated store as [r0, #-254].
define ptr @strh16_m254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -254
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <8 x i16> masked store at byte offset -256: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @strh16_m256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: sub.w r1, r0, #256
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strb32_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset 3: expected to fold into the
; predicated store as [r0, #3].
define ptr @strb32_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0, #3]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset 2: expected to fold into the
; predicated store as [r0, #2].
define ptr @strb32_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0, #2]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset 127: expected to fold into the
; predicated store as [r0, #127].
define ptr @strb32_127(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0, #127]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset 128: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strb32_128(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #128
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset -127: expected to fold into the
; predicated store as [r0, #-127].
define ptr @strb32_m127(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <4 x i8> masked store at byte offset -128: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @strb32_m128(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb32_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: sub.w r1, r0, #128
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = load <4 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strb16_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>,
ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset 3: expected to fold into the
; predicated store as [r0, #3].
define ptr @strb16_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0, #3]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset 2: expected to fold into the
; predicated store as [r0, #2].
define ptr @strb16_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0, #2]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset 127: expected to fold into the
; predicated store as [r0, #127].
define ptr @strb16_127(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0, #127]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 127
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset 128: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strb16_128(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #128
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr
inbounds i8, ptr %y, i32 128
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset -127: expected to fold into the
; predicated store as [r0, #-127].
define ptr @strb16_m127(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -127
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <8 x i8> masked store at byte offset -128: expected not to fold; the
; address is formed with a separate sub.w first.
define ptr @strb16_m128(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb16_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: sub.w r1, r0, #128
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 -128
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = load <8 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset 4: expected to fold into the
; predicated store as [r0, #4].
define ptr @strb8_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb8_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0, #4]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 4
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = load <16 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset 3: expected to fold into the
; predicated store as [r0, #3].
define ptr @strb8_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb8_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0, #3]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 3
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = load <16 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset 2: expected to fold into the
; predicated store as [r0, #2].
define ptr @strb8_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb8_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0, #2]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 2
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = load <16 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset 127: expected to fold into the
; predicated store as [r0, #127].
define ptr @strb8_127(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb8_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0, #127]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 127
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = load <16 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset 128: expected not to fold; the
; address is formed with a separate add.w first.
define ptr @strb8_128(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strb8_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: add.w r1, r0, #128
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, ptr %y, i32 128
  %mask = load <16 x i8>, ptr %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %0 = load <16 x i8>, ptr %x, align 1
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
  ret ptr %y
}

; <16 x i8> masked store at byte offset -127: expected to fold into the
; predicated store as [r0, #-127].
define ptr
@strb8_m127(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strb8_m127: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: vpt.i8 ne, q1, zr ; CHECK-NEXT: vstrbt.8 q0, [r0, #-127] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -127 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = load <16 x i8>, ptr %x, align 1 call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c) ret ptr %y } define ptr @strb8_m128(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strb8_m128: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q1, [r2] ; CHECK-NEXT: vldrb.u8 q0, [r1] ; CHECK-NEXT: sub.w r1, r0, #128 ; CHECK-NEXT: vpt.i8 ne, q1, zr ; CHECK-NEXT: vstrbt.8 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -128 %mask = load <16 x i8>, ptr %m, align 1 %c = icmp ne <16 x i8> %mask, zeroinitializer %0 = load <16 x i8>, ptr %x, align 1 call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c) ret ptr %y } define ptr @strwf32_4(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r0, #4] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 4 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_3(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 3 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 
x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_2(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adds r1, r0, #2 ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 2 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_508(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_508: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r0, #508] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 508 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_512(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_512: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: add.w r1, r0, #512 ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 512 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_m508(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_m508: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; 
CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -508 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strwf32_m512(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strwf32_m512: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: sub.w r1, r0, #512 ; CHECK-NEXT: vpt.i32 ne, q1, zr ; CHECK-NEXT: vstrwt.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -512 %mask = load <4 x i32>, ptr %m, align 4 %c = icmp ne <4 x i32> %mask, zeroinitializer %0 = load <4 x float>, ptr %x, align 4 call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c) ret ptr %y } define ptr @strhf16_4(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r0, #4] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 4 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } define ptr @strhf16_3(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 3 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> 
%c) ret ptr %y } define ptr @strhf16_2(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r0, #2] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 2 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } define ptr @strhf16_254(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r0, #254] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 254 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } define ptr @strhf16_256(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: add.w r1, r0, #256 ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 256 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } define ptr @strhf16_m254(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_m254: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r0, #-254] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -254 %mask = load <8 x i16>, ptr %m, 
align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } define ptr @strhf16_m256(ptr %y, ptr %x, ptr %m) { ; CHECK-LABEL: strhf16_m256: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: vldrh.u16 q0, [r1] ; CHECK-NEXT: sub.w r1, r0, #256 ; CHECK-NEXT: vpt.i16 ne, q1, zr ; CHECK-NEXT: vstrht.16 q0, [r1] ; CHECK-NEXT: bx lr entry: %z = getelementptr inbounds i8, ptr %y, i32 -256 %mask = load <8 x i16>, ptr %m, align 2 %c = icmp ne <8 x i16> %mask, zeroinitializer %0 = load <8 x half>, ptr %x, align 2 call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c) ret ptr %y } declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) declare <4 x i8> @llvm.masked.load.v4i8.p0(ptr, i32, <4 x i1>, <4 x i8>) declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>) declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>) declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) declare void @llvm.masked.store.v4i8.p0(<4 x i8>, ptr, i32, <4 x i1>) declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)