; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECKLE
; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefixes=CHECKBE

; Each test builds two shuffles of the same pair of loaded vectors: one that
; interleaves the even lanes of both inputs and one that interleaves the odd
; lanes. The checks expect these to be selected as TRN1/TRN2 (ZIP1/ZIP2 for
; two-element vectors) for both the little-endian and big-endian triples.

; 64-bit vector of bytes.
define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni8:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8b }, [x1]
; CHECKBE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; 64-bit vector of halfwords.
define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni16:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    trn2 v0.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni16:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4h }, [x1]
; CHECKBE-NEXT:    trn1 v2.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    trn2 v0.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; The shuffle is expressed on bytes, but it moves whole byte pairs, so the
; checks expect a single halfword TRN1.
define <8 x i8> @vtrni16_viabitcast(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni16_viabitcast:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni16_viabitcast:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4h }, [x1]
; CHECKBE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
; CHECKBE-NEXT:    ret
  %l1 = load <4 x i16>, ptr %A
  %l2 = load <4 x i16>, ptr %B
  %b1 = bitcast <4 x i16> %l1 to <8 x i8>
  %b2 = bitcast <4 x i16> %l2 to <8 x i8>
  %tmp3 = shufflevector <8 x i8> %b1, <8 x i8> %b2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i8> %tmp3
}

; 2xi32 TRN is redundant with ZIP
define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni32:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    add v0.2s, v2.2s, v0.2s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni32:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.2s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.2s }, [x1]
; CHECKBE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    add v0.2s, v2.2s, v0.2s
; CHECKBE-NEXT:    rev64 v0.2s, v0.2s
; CHECKBE-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
  %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

; Two-element float vector; the checks expect ZIP1/ZIP2 as in vtrni32.
define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnf:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    fadd v0.2s, v2.2s, v0.2s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnf:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.2s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.2s }, [x1]
; CHECKBE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    fadd v0.2s, v2.2s, v0.2s
; CHECKBE-NEXT:    rev64 v0.2s, v0.2s
; CHECKBE-NEXT:    ret
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = load <2 x float>, ptr %B
  %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
  %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

; 128-bit vector of bytes.
define <16 x i8> @vtrnQi8(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi8:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.16b, v0.16b, v1.16b
; CHECKLE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
; CHECKLE-NEXT:    add v0.16b, v2.16b, v0.16b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi8:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.16b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.16b }, [x1]
; CHECKBE-NEXT:    trn1 v2.16b, v0.16b, v1.16b
; CHECKBE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
; CHECKBE-NEXT:    add v0.16b, v2.16b, v0.16b
; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

; 128-bit vector of halfwords.
define <8 x i16> @vtrnQi16(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi16:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi16:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8h }, [x1]
; CHECKBE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; 128-bit vector of words.
define <4 x i32> @vtrnQi32(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi32:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi32:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4s }, [x1]
; CHECKBE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; 128-bit vector of floats.
define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQf:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQf:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4s }, [x1]
; CHECKBE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

; Undef shuffle indices should not prevent matching to VTRN:
; The defined lanes below agree with the TRN1/TRN2 pattern only; they match
; neither the ZIP nor the UZP lane order.

define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8_undef:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni8_undef:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8b }, [x1]
; CHECKBE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <8 x i16> @vtrnQi16_undef(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi16_undef:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi16_undef:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8h }, [x1]
; CHECKBE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}