; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+fast-unaligned-access -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v12, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle16.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse16.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a2, a0, 6
; CHECK-NEXT:    vle16.v v12, (a2)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v14, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v14, 12
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    vle16.v v10, (a0)
; RV32-NEXT:    add a0, a0, a4
; RV32-NEXT:    vle16.v v12, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    vle16.v v14, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; RV32-NEXT:    vslideup.vi v8, v10, 4
; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; RV32-NEXT:    vslideup.vi v8, v12, 8
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v14, 12
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    vle16.v v10, (a0)
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    vle16.v v12, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    vle16.v v14, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; RV64-NEXT:    vslideup.vi v8, v10, 4
; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; RV64-NEXT:    vslideup.vi v8, v12, 8
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v14, 12
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    vle16.v v10, (a0)
; ZVE64F-NEXT:    add a0, a0, a3
; ZVE64F-NEXT:    vle16.v v12, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    vle16.v v14, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}