; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; CHECK-GI: warning: Instruction selection used fallback path for smull_zext_v4i16_v4i32 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v4i32_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v4i32_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v8i16_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v4i32_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v4i32_uzp1 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for do_stuff define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = sext <8 x i8> %tmp1 to <8 x i16> %tmp4 = sext <8 x i8> %tmp2 to <8 x i16> %tmp5 = mul <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 } define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = sext <4 x i16> %tmp1 to <4 x i32> %tmp4 = sext <4 x i16> %tmp2 to <4 x i32> %tmp5 = mul <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = sext <2 x i32> %tmp1 to <2 x i64> %tmp4 = sext <2 x i32> %tmp2 to <2 x i64> %tmp5 = mul <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d0, [x0] ; CHECK-NEON-NEXT: ldr q2, [x1] ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEON-NEXT: smull2 v1.4s, v0.8h, v2.8h ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v2.4h ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d0, [x0] ; CHECK-SVE-NEXT: ldr q2, [x1] ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SVE-NEXT: smull2 v1.4s, v0.8h, v2.8h ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v2.4h ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d0, 
[x0] ; CHECK-GI-NEXT: ldr q1, [x1] ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s ; CHECK-GI-NEXT: ret %load.A = load <8 x i8>, ptr %A %load.B = load <8 x i16>, ptr %B %zext.A = zext <8 x i8> %load.A to <8 x i32> %sext.B = sext <8 x i16> %load.B to <8 x i32> %res = mul <8 x i32> %zext.A, %sext.B ret <8 x i32> %res } define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d0, [x1] ; CHECK-NEON-NEXT: ldr q2, [x0] ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEON-NEXT: smull2 v1.4s, v2.8h, v0.8h ; CHECK-NEON-NEXT: smull v0.4s, v2.4h, v0.4h ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d0, [x1] ; CHECK-SVE-NEXT: ldr q2, [x0] ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SVE-NEXT: smull2 v1.4s, v2.8h, v0.8h ; CHECK-SVE-NEXT: smull v0.4s, v2.4h, v0.4h ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d0, [x1] ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: mul v1.4s, v1.4s, v4.4s ; CHECK-GI-NEXT: ret %load.A = load <8 x i16>, ptr %A %load.B = load <8 x i8>, ptr %B %sext.A = sext <8 x i16> %load.A to <8 x i32> %zext.B = zext <8 x i8> %load.B to <8 x i32> %res = mul <8 x i32> %sext.A, %zext.B ret <8 x i32> %res } define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr q1, [x1] ; CHECK-NEON-NEXT: orr v0.8h, #128, lsl #8 ; CHECK-NEON-NEXT: sshll v3.4s, v1.4h, #0 ; CHECK-NEON-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEON-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-NEON-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEON-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-NEON-NEXT: mul v0.4s, v2.4s, v3.4s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr q1, [x1] ; CHECK-SVE-NEXT: orr v0.8h, #128, lsl #8 ; CHECK-SVE-NEXT: sshll v3.4s, v1.4h, #0 ; CHECK-SVE-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-SVE-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-SVE-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-SVE-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-SVE-NEXT: mul v0.4s, v2.4s, v3.4s ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v0.8h, #128, lsl #8 ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: ldr q1, [x1] ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s ; CHECK-GI-NEXT: ret %load.A = load <8 x i16>, ptr %A %or.A = or <8 x i16> %load.A, %load.B = load <8 x i16>, ptr %B %zext.A = zext <8 x i16> %or.A to <8 x i32> %sext.B = 
sext <8 x i16> %load.B to <8 x i32> %res = mul <8 x i32> %zext.A, %sext.B ret <8 x i32> %res } define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_zext_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %load.A = load <4 x i8>, ptr %A %load.B = load <4 x i16>, ptr %B %zext.A = zext <4 x i8> %load.A to <4 x i32> %sext.B = sext <4 x i16> %load.B to <4 x i32> %res = mul <4 x i32> %zext.A, %sext.B ret <4 x i32> %res } define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d0, [x1] ; CHECK-NEON-NEXT: ldrh w9, [x0] ; CHECK-NEON-NEXT: ldrh w10, [x0, #2] ; CHECK-NEON-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEON-NEXT: fmov x11, d0 ; CHECK-NEON-NEXT: mov x8, v0.d[1] ; CHECK-NEON-NEXT: smull x9, w9, w11 ; CHECK-NEON-NEXT: smull x8, w10, w8 ; CHECK-NEON-NEXT: fmov d0, x9 ; CHECK-NEON-NEXT: mov v0.d[1], x8 ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldrh w8, [x0] ; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldrh w9, [x0, #2] ; CHECK-SVE-NEXT: ldr d0, [x1] ; CHECK-SVE-NEXT: fmov d1, x8 ; CHECK-SVE-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SVE-NEXT: mov v1.d[1], x9 ; CHECK-SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h1, [x0] ; CHECK-GI-NEXT: ldr h2, [x0, #2] ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: ldr d0, [x1] ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: fmov d1, x8 ; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d0 ; CHECK-GI-NEXT: fmov x10, d3 ; CHECK-GI-NEXT: mov d2, v1.d[1] ; CHECK-GI-NEXT: fmov x8, d1 ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d2 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i16>, ptr %A %load.B = load <2 x i32>, ptr %B %zext.A = zext <2 x i16> %load.A to <2 x i64> %sext.B = sext <2 x i32> %load.B to <2 x i64> %res = mul <2 x i64> %zext.A, %sext.B ret <2 x i64> %res } define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: bic v0.2s, #128, lsl #24 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: bic v0.2s, #128, lsl #24 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mvni v0.2s, #128, lsl #24 ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: 
fmov x10, d3 ; CHECK-GI-NEXT: fmov x9, d2 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i32>, ptr %A %and.A = and <2 x i32> %load.A, %load.B = load <2 x i32>, ptr %B %zext.A = zext <2 x i32> %and.A to <2 x i64> %sext.B = sext <2 x i32> %load.B to <2 x i64> %res = mul <2 x i64> %zext.A, %sext.B ret <2 x i64> %res } define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: umull_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = zext <8 x i8> %tmp1 to <8 x i16> %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = mul <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 } define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: umull_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = zext <4 x i16> %tmp1 to <4 x i32> %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = mul <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: umull_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = zext <2 x i32> %tmp1 to <2 x i64> %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = mul <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: amull_v8i8_v8i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull_v8i8_v8i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull_v8i8_v8i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: ldr d2, [x1] ; CHECK-GI-NEXT: movi v0.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: umull v1.8h, v1.8b, v2.8b ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = zext <8 x i8> %tmp1 to <8 x i16> %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = mul <8 x i16> %tmp3, %tmp4 %and = and <8 x i16> %tmp5, ret <8 x i16> %and } define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: amull_v4i16_v4i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d1, [x0] ; CHECK-NEON-NEXT: ldr d2, [x1] ; CHECK-NEON-NEXT: movi v0.2d, #0x00ffff0000ffff ; CHECK-NEON-NEXT: smull v1.4s, v1.4h, v2.4h ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull_v4i16_v4i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d1, [x0] ; CHECK-SVE-NEXT: ldr d2, [x1] ; CHECK-SVE-NEXT: movi v0.2d, #0x00ffff0000ffff ; CHECK-SVE-NEXT: smull v1.4s, v1.4h, v2.4h ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull_v4i16_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: ldr d2, [x1] ; CHECK-GI-NEXT: movi v0.2d, #0x00ffff0000ffff ; CHECK-GI-NEXT: umull 
v1.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = zext <4 x i16> %tmp1 to <4 x i32> %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = mul <4 x i32> %tmp3, %tmp4 %and = and <4 x i32> %tmp5, ret <4 x i32> %and } define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: amull_v2i32_v2i64: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr d1, [x0] ; CHECK-NEON-NEXT: ldr d2, [x1] ; CHECK-NEON-NEXT: movi v0.2d, #0x000000ffffffff ; CHECK-NEON-NEXT: smull v1.2d, v1.2s, v2.2s ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr d1, [x0] ; CHECK-SVE-NEXT: ldr d2, [x1] ; CHECK-SVE-NEXT: movi v0.2d, #0x000000ffffffff ; CHECK-SVE-NEXT: smull v1.2d, v1.2s, v2.2s ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: ldr d2, [x1] ; CHECK-GI-NEXT: movi v0.2d, #0x000000ffffffff ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = zext <2 x i32> %tmp1 to <2 x i64> %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = mul <2 x i64> %tmp3, %tmp4 %and = and <2 x i64> %tmp5, ret <2 x i64> %and } define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlal_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = sext <8 x i8> %tmp2 to <8 x i16> %tmp5 = sext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = add <8 x i16> %tmp1, %tmp6 ret <8 x i16> %tmp7 } define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlal_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = sext <4 x i16> %tmp2 to <4 x i32> %tmp5 = sext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = add <4 x i32> %tmp1, %tmp6 ret <4 x i32> %tmp7 } define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlal_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = sext <2 x i32> %tmp2 to <2 x i64> %tmp5 = sext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = add <2 x i64> %tmp1, %tmp6 ret <2 x i64> %tmp7 } define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: umlal_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = zext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = add <8 x i16> %tmp1, %tmp6 
ret <8 x i16> %tmp7 } define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: umlal_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = zext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = add <4 x i32> %tmp1, %tmp6 ret <4 x i32> %tmp7 } define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: umlal_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = zext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = add <2 x i64> %tmp1, %tmp6 ret <2 x i64> %tmp7 } define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEON-LABEL: amlal_v8i8_v8i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlal_v8i8_v8i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlal_v8i8_v8i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlal v0.8h, v1.8b, v2.8b ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = zext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = add <8 x i16> %tmp1, %tmp6 %and = and <8 x i16> %tmp7, ret <8 x i16> %and } define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEON-LABEL: amlal_v4i16_v4i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlal_v4i16_v4i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlal_v4i16_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = zext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = add <4 x i32> %tmp1, %tmp6 %and = 
and <4 x i32> %tmp7, ret <4 x i32> %and } define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEON-LABEL: amlal_v2i32_v2i64: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlal_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlal_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = zext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = add <2 x i64> %tmp1, %tmp6 %and = and <2 x i64> %tmp7, ret <2 x i64> %and } define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlsl_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = sext <8 x i8> %tmp2 to <8 x i16> %tmp5 = sext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = sub <8 x i16> %tmp1, %tmp6 ret <8 x i16> %tmp7 } define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlsl_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = sext <4 x i16> %tmp2 to <4 x i32> %tmp5 = sext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = sub <4 x i32> %tmp1, %tmp6 ret <4 x i32> %tmp7 } define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: smlsl_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = sext <2 x i32> %tmp2 to <2 x i64> %tmp5 = sext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = sub <2 x i64> %tmp1, %tmp6 ret <2 x i64> %tmp7 } define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: umlsl_v8i8_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = zext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = sub <8 x i16> %tmp1, %tmp6 ret <8 x i16> %tmp7 } define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: 
umlsl_v4i16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = zext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = sub <4 x i32> %tmp1, %tmp6 ret <4 x i32> %tmp7 } define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: umlsl_v2i32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ldr d2, [x2] ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = zext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = sub <2 x i64> %tmp1, %tmp6 ret <2 x i64> %tmp7 } define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEON-LABEL: amlsl_v8i8_v8i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlsl v0.8h, v1.8b, v2.8b ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlsl_v8i8_v8i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlsl v0.8h, v1.8b, v2.8b ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlsl_v8i8_v8i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlsl v0.8h, v1.8b, v2.8b ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> %tmp5 = zext <8 x i8> %tmp3 to <8 x i16> %tmp6 = mul <8 x i16> %tmp4, %tmp5 %tmp7 = sub <8 x i16> %tmp1, %tmp6 %and = and <8 x i16> %tmp7, ret <8 x i16> %and } define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-NEON-LABEL: amlsl_v4i16_v4i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlsl_v4i16_v4i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlsl_v4i16_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> %tmp5 = zext <4 x i16> %tmp3 to <4 x i32> %tmp6 = mul <4 x i32> %tmp4, %tmp5 %tmp7 = sub <4 x i32> %tmp1, %tmp6 %and = and <4 x i32> %tmp7, ret <4 x i32> %and } define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) 
nounwind { ; CHECK-NEON-LABEL: amlsl_v2i32_v2i64: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: ldr q0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] ; CHECK-NEON-NEXT: ldr d2, [x2] ; CHECK-NEON-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amlsl_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldr q0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] ; CHECK-SVE-NEXT: ldr d2, [x2] ; CHECK-SVE-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amlsl_v2i32_v2i64: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-GI-NEXT: ldr d2, [x2] ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> %tmp5 = zext <2 x i32> %tmp3 to <2 x i64> %tmp6 = mul <2 x i64> %tmp4, %tmp5 %tmp7 = sub <2 x i64> %tmp1, %tmp6 %and = and <2 x i64> %tmp7, ret <2 x i64> %and } ; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements. define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { ; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: movi v1.8b, #244 ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: movi v1.8b, #244 ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mvni v1.8h, #11 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-GI-NEXT: ret %tmp3 = sext <8 x i8> %arg to <8 x i16> %tmp4 = mul <8 x i16> %tmp3, ret <8 x i16> %tmp4 } define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { ; Do not use SMULL if the BUILD_VECTOR element values are too big. 
; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19
; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEON-NEXT: dup v1.8h, w8
; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19
; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SVE-NEXT: dup v1.8h, w8
; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI34_0
; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mvni v1.4h, #11
; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mvni v1.4h, #11
; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvni v1.4s, #11
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e
; CHECK-NEON-NEXT: dup v1.2s, w8
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e
; CHECK-SVE-NEXT: dup v1.2s, w8
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI36_0
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0]
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: fmov x10, d3
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: movi v1.8b, #12
; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v1.8b, #12
; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8h, #12
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7
; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEON-NEXT: dup v1.8h, w8
; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7
; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SVE-NEXT: dup v1.8h, w8
; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI38_0
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0]
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
; CHECK-NEON-NEXT: dup v1.4h, w8
; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
; CHECK-SVE-NEXT: dup v1.4h, w8
; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI39_0
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
; CHECK-NEON-NEXT: dup v1.2s, w8
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
; CHECK-SVE-NEXT: dup v1.2s, w8
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI40_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: fmov x10, d3
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: movi v1.8b, #12
; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v1.8b, #12
; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8h, #12
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
; CHECK-NEON-NEXT: dup v1.4h, w8
; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
; CHECK-SVE-NEXT: dup v1.4h, w8
; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI42_0
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0]
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
; CHECK-NEON-NEXT: dup v1.2s, w8
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
; CHECK-SVE-NEXT: dup v1.2s, w8
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI43_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: fmov x10, d3
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used.
; CHECK-LABEL: smullWithInconsistentExtensions: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %s = sext <8 x i8> %x to <8 x i16> %z = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %s, %z %r = extractelement <8 x i16> %m, i32 0 ret i16 %r } define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) { ; CHECK-LABEL: smull_extended_vector_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.4s, #139, lsl #8 ; CHECK-NEXT: sshll v2.4s, v0.4h, #0 ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: mul v2.4s, v2.4s, v1.4s ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-NEXT: shrn v0.4h, v2.4s, #1 ; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 ; CHECK-NEXT: ret entry: %0 = sext <8 x i16> %v to <8 x i32> %1 = mul <8 x i32> %0, %2 = lshr <8 x i32> %1, %3 = trunc <8 x i32> %2 to <8 x i16> ret <8 x i16> %3 } define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind { ; CHECK-NEON-LABEL: distribute: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: ldr q0, [x1] ; CHECK-NEON-NEXT: dup v1.8b, w2 ; CHECK-NEON-NEXT: mov d2, v0.d[1] ; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b ; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: str q2, [x0] ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: distribute: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: ldr q0, [x1] ; CHECK-SVE-NEXT: dup v1.8b, w2 ; CHECK-SVE-NEXT: mov d2, v0.d[1] ; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b ; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: str q2, [x0] ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: distribute: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr q0, [x1] ; CHECK-GI-NEXT: dup v1.8b, w2 ; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: uaddw2 v0.8h, v2.8h, v0.16b ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-GI-NEXT: str q0, [x0] ; CHECK-GI-NEXT: ret entry: %0 = trunc i32 %mul to i8 %1 = insertelement <8 x i8> undef, i8 %0, i32 0 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer %3 = load <16 x i8>, ptr %src, align 1 %4 = bitcast <16 x i8> %3 to <2 x double> %5 = extractelement <2 x double> %4, i32 1 %6 = bitcast double %5 to <8 x i8> %7 = zext <8 x i8> %6 to <8 x i16> %8 = zext <8 x i8> %2 to <8 x i16> %9 = extractelement <2 x double> %4, i32 0 %10 = bitcast double %9 to <8 x i8> %11 = zext <8 x i8> %10 to <8 x i16> %12 = add <8 x i16> %7, %11 %13 = mul <8 x i16> %12, %8 store <8 x i16> %13, ptr %dst, align 2 ret void } define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { ; CHECK-NEON-LABEL: umull2_i8: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull2_i8: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull2_i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> %mul = mul <16 x i16> %arg1_ext, %arg2_ext ret <16 x i16> %mul } define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { ; CHECK-NEON-LABEL: smull2_i8: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: smull2 
v2.8h, v0.16b, v1.16b ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull2_i8: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull2_i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = sext <16 x i8> %arg1 to <16 x i16> %arg2_ext = sext <16 x i8> %arg2 to <16 x i16> %mul = mul <16 x i16> %arg1_ext, %arg2_ext ret <16 x i16> %mul } define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { ; CHECK-NEON-LABEL: umull2_i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull2_i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull2_i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> %mul = mul <8 x i32> %arg1_ext, %arg2_ext ret <8 x i32> %mul } define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { ; CHECK-NEON-LABEL: smull2_i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull2_i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull2_i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = sext <8 x i16> %arg1 to <8 x i32> %arg2_ext = sext <8 x i16> %arg2 to <8 x i32> %mul = mul <8 x i32> %arg1_ext, %arg2_ext ret <8 x i32> %mul } define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { ; CHECK-NEON-LABEL: umull2_i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull2_i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull2_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> %mul = mul <4 x i64> %arg1_ext, %arg2_ext ret <4 x i64> %mul } define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { ; CHECK-NEON-LABEL: smull2_i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: mov v1.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull2_i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s 
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: mov v1.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: smull2_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = sext <4 x i32> %arg1 to <4 x i64> %arg2_ext = sext <4 x i32> %arg2 to <4 x i64> %mul = mul <4 x i64> %arg1_ext, %arg2_ext ret <4 x i64> %mul } define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { ; CHECK-NEON-LABEL: amull2_i8: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b ; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEON-NEXT: mov v0.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull2_i8: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b ; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-SVE-NEXT: mov v0.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull2_i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: umull v3.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> %mul = mul <16 x i16> %arg1_ext, %arg2_ext %and = and <16 x i16> %mul, ret <16 x i16> %and } define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { ; CHECK-NEON-LABEL: amull2_i16: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h ; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull2_i16: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h ; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull2_i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> %mul = mul <8 x i32> %arg1_ext, %arg2_ext %and = and <8 x i32> %mul, ret <8 x i32> %and } define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { ; CHECK-NEON-LABEL: amull2_i32: ; CHECK-NEON: // %bb.0: ; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: amull2_i32: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: amull2_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-GI-NEXT: umull 
v3.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: ret %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> %mul = mul <4 x i64> %arg1_ext, %arg2_ext %and = and <4 x i64> %mul, ret <4 x i64> %and } define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-NEON-LABEL: umull_and_v8i16: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_and_v8i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_and_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> %in2 = and <8 x i16> %src2, %out = mul nsw <8 x i16> %in1, %in2 ret <8 x i16> %out } define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-NEON-LABEL: umull_and_v8i16_c: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h ; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_and_v8i16_c: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h ; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_and_v8i16_c: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> %in2 = and <8 x i16> %src2, %out = mul nsw <8 x i16> %in2, %in1 ret <8 x i16> %out } define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-LABEL: umull_and256_v8i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v2.8h, #1, lsl #8 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> %in2 = and <8 x i16> %src2, %out = mul nsw <8 x i16> %in1, %in2 ret <8 x i16> %out } define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) { ; CHECK-NEON-LABEL: umull_andconst_v8i16: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_andconst_v8i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_andconst_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> %out = mul nsw <8 x i16> %in1, ret <8 x i16> %out } define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) { ; CHECK-NEON-LABEL: umull_smaller_v8i16: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: movi v2.8b, 
#15 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h ; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_smaller_v8i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: movi v2.8b, #15 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h ; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_smaller_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff ; CHECK-GI-NEXT: movi v3.8h, #15 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i4> %src1 to <8 x i16> %in2 = and <8 x i16> %src2, %out = mul nsw <8 x i16> %in1, %in2 ret <8 x i16> %out } define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) { ; CHECK-NEON-LABEL: umull_and_v4i32: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEON-NEXT: xtn v1.4h, v1.4s ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_and_v4i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-SVE-NEXT: xtn v1.4h, v1.4s ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_and_v4i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %in1 = zext <4 x i16> %src1 to <4 x i32> %in2 = and <4 x i32> %src2, %out = mul nsw <4 x i32> %in1, %in2 ret <4 x i32> %out } define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) { ; CHECK-NEON-LABEL: umull_and_v8i32: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff ; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: umull_and_v8i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff ; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h ; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h ; CHECK-SVE-NEXT: ret ; ; CHECK-GI-LABEL: umull_and_v8i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 ; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s ; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i16> %src1 to <8 x i32> %in2 = and <8 x i32> %src2, %out = mul nsw <8 x i32> %in1, %in2 ret <8 x i32> %out } define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) { ; CHECK-NEON-LABEL: umull_and_v8i32_dup: ; CHECK-NEON: // %bb.0: // %entry ; CHECK-NEON-NEXT: and w8, w0, #0xff ; CHECK-NEON-NEXT: dup v2.8h, w8 ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h ; 
; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: umull_and_v8i32_dup:
; CHECK-SVE:       // %bb.0: // %entry
; CHECK-SVE-NEXT:    and w8, w0, #0xff
; CHECK-SVE-NEXT:    dup v2.8h, w8
; CHECK-SVE-NEXT:    umull2 v1.4s, v0.8h, v2.8h
; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v2.4h
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: umull_and_v8i32_dup:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w0, #0xff
; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-GI-NEXT:    dup v3.4s, w8
; CHECK-GI-NEXT:    mul v0.4s, v1.4s, v3.4s
; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    ret
entry:
  %in1 = zext <8 x i16> %src1 to <8 x i32>
  %in2 = and i32 %src2, 255
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %in2, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %out = mul nsw <8 x i32> %in1, %broadcast.splat
  ret <8 x i32> %out
}

define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
; CHECK-NEON-LABEL: umull_and_v2i64:
; CHECK-NEON:       // %bb.0: // %entry
; CHECK-NEON-NEXT:    movi v2.2d, #0x000000000000ff
; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: umull_and_v2i64:
; CHECK-SVE:       // %bb.0: // %entry
; CHECK-SVE-NEXT:    movi v2.2d, #0x000000000000ff
; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-SVE-NEXT:    xtn v1.2s, v1.2d
; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: umull_and_v2i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    fmov x9, d2
; CHECK-GI-NEXT:    fmov x10, d3
; CHECK-GI-NEXT:    mul x9, x9, x10
; CHECK-GI-NEXT:    fmov d0, x8
; CHECK-GI-NEXT:    mov v0.d[1], x9
; CHECK-GI-NEXT:    ret
entry:
  %in1 = zext <2 x i32> %src1 to <2 x i64>
  %in2 = and <2 x i64> %src2, <i64 255, i64 255>
  %out = mul nsw <2 x i64> %in1, %in2
  ret <2 x i64> %out
}

define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
; CHECK-NEON-LABEL: umull_and_v4i64:
; CHECK-NEON:       // %bb.0: // %entry
; CHECK-NEON-NEXT:    movi v3.2d, #0x000000000000ff
; CHECK-NEON-NEXT:    and v2.16b, v2.16b, v3.16b
; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v3.16b
; CHECK-NEON-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: umull_and_v4i64:
; CHECK-SVE:       // %bb.0: // %entry
; CHECK-SVE-NEXT:    movi v3.2d, #0x000000000000ff
; CHECK-SVE-NEXT:    and v2.16b, v2.16b, v3.16b
; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v3.16b
; CHECK-SVE-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: umull_and_v4i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v3.2d, #0x000000000000ff
; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
; CHECK-GI-NEXT:    fmov x8, d4
; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
; CHECK-GI-NEXT:    mov d3, v4.d[1]
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    mov d4, v1.d[1]
; CHECK-GI-NEXT:    fmov x10, d2
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    fmov x9, d0
; CHECK-GI-NEXT:    mov d0, v2.d[1]
; CHECK-GI-NEXT:    fmov x11, d4
; CHECK-GI-NEXT:    mul x9, x9, x10
; CHECK-GI-NEXT:    fmov x10, d3
; CHECK-GI-NEXT:    fmov x12, d0
; CHECK-GI-NEXT:    fmov d0, x8
; CHECK-GI-NEXT:    mul x10, x10, x11
; CHECK-GI-NEXT:    fmov x11, d1
; CHECK-GI-NEXT:    fmov d1, x9
; CHECK-GI-NEXT:    mul x11, x11, x12
; CHECK-GI-NEXT:    mov v0.d[1], x10
; CHECK-GI-NEXT:    mov v1.d[1], x11
; CHECK-GI-NEXT:    ret
entry:
  %in1 = zext <4 x i32> %src1 to <4 x i64>
  %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
  %out = mul nsw <4 x i64> %in1, %in2
  ret <4 x i64> %out
}

define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
; CHECK-NEON-LABEL: umull_and_v4i64_dup:
; CHECK-NEON:       // %bb.0: // %entry
; CHECK-NEON-NEXT:    and w8, w0, #0xff
; CHECK-NEON-NEXT:    dup v2.4s, w8
; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: umull_and_v4i64_dup:
; CHECK-SVE:       // %bb.0: // %entry
; CHECK-SVE-NEXT:    and w8, w0, #0xff
; CHECK-SVE-NEXT:    dup v2.4s, w8
; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: umull_and_v4i64_dup:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and x8, x0, #0xff
; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
; CHECK-GI-NEXT:    dup v2.2d, x8
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmov x8, d1
; CHECK-GI-NEXT:    fmov x10, d0
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    fmov x9, d2
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    fmov x11, d1
; CHECK-GI-NEXT:    fmov x12, d2
; CHECK-GI-NEXT:    mul x9, x10, x9
; CHECK-GI-NEXT:    fmov x10, d3
; CHECK-GI-NEXT:    mul x10, x10, x11
; CHECK-GI-NEXT:    fmov d0, x8
; CHECK-GI-NEXT:    mul x11, x12, x11
; CHECK-GI-NEXT:    fmov d1, x9
; CHECK-GI-NEXT:    mov v0.d[1], x10
; CHECK-GI-NEXT:    mov v1.d[1], x11
; CHECK-GI-NEXT:    ret
entry:
  %in1 = zext <4 x i32> %src1 to <4 x i64>
  %in2 = and i64 %src2, 255
  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %in2, i64 0
  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
  %out = mul nsw <4 x i64> %in1, %broadcast.splat
  ret <4 x i64> %out
}

define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: pmlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}

define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: smlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}
define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: umlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}

define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
; CHECK-LABEL: smlsl2_v4i32_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <4 x i32>, ptr %5, align 4
  %7 = trunc <4 x i32> %6 to <4 x i16>
  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7)
  %10 = sub <4 x i32> %1, %9
  store <4 x i32> %10, ptr %2, align 16
  ret void
}

define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
; CHECK-LABEL: umlsl2_v4i32_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <4 x i32>, ptr %5, align 4
  %7 = trunc <4 x i32> %6 to <4 x i16>
  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7)
  %10 = sub <4 x i32> %1, %9
  store <4 x i32> %10, ptr %2, align 16
  ret void
}

define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    pmull v3.8h, v0.8b, v2.8b
; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT:    add v0.8h, v3.8h, v0.8h
; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: smlsl_smlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    smlsl v1.8h, v0.8b, v2.8b
; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: umlsl_umlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    umlsl v1.8h, v0.8b, v2.8b
; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: smlsl_smlsl2_v4i32_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    smlsl v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <4 x i32>, ptr %3, align 4
  %6 = trunc <4 x i32> %5 to <4 x i16>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <4 x i32>, ptr %7, align 4
  %9 = trunc <4 x i32> %8 to <4 x i16>
  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6)
  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9)
  %14 = add <4 x i32> %11, %13
  %15 = sub <4 x i32> %1, %14
  store <4 x i32> %15, ptr %2, align 16
  ret void
}

define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: umlsl_umlsl2_v4i32_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    umlsl v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <4 x i32>, ptr %3, align 4
  %6 = trunc <4 x i32> %5 to <4 x i16>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <4 x i32>, ptr %7, align 4
  %9 = trunc <4 x i32> %8 to <4 x i16>
  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6)
  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9)
  %14 = add <4 x i32> %11, %13
  %15 = sub <4 x i32> %1, %14
  store <4 x i32> %15, ptr %2, align 16
  ret void
}

define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
; CHECK-LABEL: do_stuff:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    smull2 v0.2d, v1.4s, v0.4s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %bc.1 = bitcast <2 x i64> %1 to <4 x i32>
  %trunc.0 = trunc <2 x i64> %0 to <2 x i32>
  %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3>
  %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0)
  %trunc.smull = trunc <2 x i64> %smull to <2 x i32>
  %final = add <2 x i32> %trunc.smull, %shuff.lo
  ret <2 x i32> %final
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)