; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s

; NOTE(review): this copy of the test had lost its line breaks, and every
; vector constant operand (shl amounts, shufflevector masks) was missing.
; Shift amounts were restored from the immediates in the expected assembly
; (#3, or #16 in the double*_bv tests); shuffle masks were restored as the
; in-order concatenations implied by the expected loads, with don't-care lanes
; written as poison/undef. Confirm against the upstream test and re-run
; utils/update_llc_test_checks.py before relying on it.

; Baseline without a shift: zext of two adjacent <4 x i8> loads, added.
define <4 x i16> @normal_load_v4i8(ptr %p) {
; CHECK-LABEL: normal_load_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l1 = load <4 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 4
  %l2 = load <4 x i8>, ptr %q
  %e1 = zext <4 x i8> %l1 to <4 x i16>
  %e2 = zext <4 x i8> %l2 to <4 x i16>
  %a = add <4 x i16> %e1, %e2
  ret <4 x i16> %a
}

; Baseline without a shift: zext of two adjacent <4 x i16> loads, added.
define <4 x i32> @normal_load_v4i16_v4i32(ptr %p) {
; CHECK-LABEL: normal_load_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp d0, d1, [x0]
; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %l1 = load <4 x i16>, ptr %p
  %q = getelementptr i8, ptr %p, i32 8
  %l2 = load <4 x i16>, ptr %q
  %e1 = zext <4 x i16> %l1 to <4 x i32>
  %e2 = zext <4 x i16> %l2 to <4 x i32>
  %a = add <4 x i32> %e1, %e2
  ret <4 x i32> %a
}

; zext(load p) + (zext(load p+4) << 3), <4 x i8> -> <4 x i16>.
define <4 x i16> @load_v4i8(ptr %p) {
; CHECK-LABEL: load_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s1, s0, [x0]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    shl v0.4h, v0.4h, #3
; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l1 = load <4 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 4
  %l2 = load <4 x i8>, ptr %q
  %e1 = zext <4 x i8> %l1 to <4 x i16>
  %e2 = zext <4 x i8> %l2 to <4 x i16>
  %e3 = shl <4 x i16> %e2, <i16 3, i16 3, i16 3, i16 3>
  %a = add <4 x i16> %e1, %e3
  ret <4 x i16> %a
}

; Same pattern, <4 x i16> -> <4 x i32>; expected to use a single q load.
define <4 x i32> @load_v4i16_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    ret
  %l1 = load <4 x i16>, ptr %p
  %q = getelementptr i8, ptr %p, i32 8
  %l2 = load <4 x i16>, ptr %q
  %e1 = zext <4 x i16> %l1 to <4 x i32>
  %e2 = zext <4 x i16> %l2 to <4 x i32>
  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
  %a = add <4 x i32> %e1, %e3
  ret <4 x i32> %a
}

; Same pattern, <4 x i32> -> <4 x i64> (result spans two q registers).
define <4 x i64> @load_v4i32_v4i64(ptr %p) {
; CHECK-LABEL: load_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q2, q0, [x0]
; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #3
; CHECK-NEXT:    ushll v0.2d, v0.2s, #3
; CHECK-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT:    ret
  %l1 = load <4 x i32>, ptr %p
  %q = getelementptr i8, ptr %p, i32 16
  %l2 = load <4 x i32>, ptr %q
  %e1 = zext <4 x i32> %l1 to <4 x i64>
  %e2 = zext <4 x i32> %l2 to <4 x i64>
  %e3 = shl <4 x i64> %e2, <i64 3, i64 3, i64 3, i64 3>
  %a = add <4 x i64> %e1, %e3
  ret <4 x i64> %a
}

; Same pattern with a two-step widening, <4 x i8> -> <4 x i32>.
define <4 x i32> @load_v4i8_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i8_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    ret
  %l1 = load <4 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 4
  %l2 = load <4 x i8>, ptr %q
  %e1 = zext <4 x i8> %l1 to <4 x i32>
  %e2 = zext <4 x i8> %l2 to <4 x i32>
  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
  %a = add <4 x i32> %e1, %e3
  ret <4 x i32> %a
}

; Same pattern with a non-power-of-two element type (<4 x i12>); the expected
; code extracts the lanes with scalar bitfield operations.
define <4 x i32> @load_v4i12_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i12_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr x8, [x0]
; CHECK-NEXT:    ldr w9, [x0, #8]
; CHECK-NEXT:    lsr x10, x8, #60
; CHECK-NEXT:    ubfx x11, x8, #48, #12
; CHECK-NEXT:    ubfx w12, w9, #8, #12
; CHECK-NEXT:    orr w10, w10, w9, lsl #4
; CHECK-NEXT:    fmov s0, w11
; CHECK-NEXT:    and w11, w8, #0xfff
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:    lsr x9, x9, #20
; CHECK-NEXT:    and w10, w10, #0xfff
; CHECK-NEXT:    mov v0.h[1], w10
; CHECK-NEXT:    ubfx w10, w8, #12, #12
; CHECK-NEXT:    mov v1.h[1], w10
; CHECK-NEXT:    ubfx x10, x8, #24, #12
; CHECK-NEXT:    ubfx x8, x8, #36, #12
; CHECK-NEXT:    mov v0.h[2], w12
; CHECK-NEXT:    mov v1.h[2], w10
; CHECK-NEXT:    mov v0.h[3], w9
; CHECK-NEXT:    mov v1.h[3], w8
; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
  %l1 = load <4 x i12>, ptr %p
  %q = getelementptr i8, ptr %p, i32 6
  %l2 = load <4 x i12>, ptr %q
  %e1 = zext <4 x i12> %l1 to <4 x i32>
  %e2 = zext <4 x i12> %l2 to <4 x i32>
  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
  %a = add <4 x i32> %e1, %e3
  ret <4 x i32> %a
}

; Same pattern, <8 x i8> -> <8 x i16>.
define <8 x i16> @load_v8i8(ptr %p) {
; CHECK-LABEL: load_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT:    uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT:    ret
  %l1 = load <8 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 8
  %l2 = load <8 x i8>, ptr %q
  %e1 = zext <8 x i8> %l1 to <8 x i16>
  %e2 = zext <8 x i8> %l2 to <8 x i16>
  %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %a = add <8 x i16> %e1, %e3
  ret <8 x i16> %a
}

; Loads from two pointers are added in i8 before the extend/shift/add.
define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadadd_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT:    uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT:    ret
  %l11 = load <8 x i8>, ptr %p1
  %q1 = getelementptr i8, ptr %p1, i32 8
  %l12 = load <8 x i8>, ptr %q1
  %l21 = load <8 x i8>, ptr %p2
  %q2 = getelementptr i8, ptr %p2, i32 8
  %l22 = load <8 x i8>, ptr %q2
  %l1 = add <8 x i8> %l11, %l21
  %l2 = add <8 x i8> %l12, %l22
  %e1 = zext <8 x i8> %l1 to <8 x i16>
  %e2 = zext <8 x i8> %l2 to <8 x i16>
  %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %a = add <8 x i16> %e1, %e3
  ret <8 x i16> %a
}

; Loads are extended to i16, added, then extended to i32 for the shift/add.
define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadaddext_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #3
; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT:    ret
  %l11 = load <8 x i8>, ptr %p1
  %q1 = getelementptr i8, ptr %p1, i32 8
  %l12 = load <8 x i8>, ptr %q1
  %l21 = load <8 x i8>, ptr %p2
  %q2 = getelementptr i8, ptr %p2, i32 8
  %l22 = load <8 x i8>, ptr %q2
  %le11 = zext <8 x i8> %l11 to <8 x i16>
  %le12 = zext <8 x i8> %l12 to <8 x i16>
  %le21 = zext <8 x i8> %l21 to <8 x i16>
  %le22 = zext <8 x i8> %l22 to <8 x i16>
  %l1 = add <8 x i16> %le11, %le21
  %l2 = add <8 x i16> %le12, %le22
  %e1 = zext <8 x i16> %l1 to <8 x i32>
  %e2 = zext <8 x i16> %l2 to <8 x i32>
  %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <8 x i32> %e1, %e3
  ret <8 x i32> %a
}

; <4 x i8> variant of loadaddext_v8i8.
define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadaddext_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    ret
  %l11 = load <4 x i8>, ptr %p1
  %q1 = getelementptr i8, ptr %p1, i32 4
  %l12 = load <4 x i8>, ptr %q1
  %l21 = load <4 x i8>, ptr %p2
  %q2 = getelementptr i8, ptr %p2, i32 4
  %l22 = load <4 x i8>, ptr %q2
  %le11 = zext <4 x i8> %l11 to <4 x i16>
  %le12 = zext <4 x i8> %l12 to <4 x i16>
  %le21 = zext <4 x i8> %l21 to <4 x i16>
  %le22 = zext <4 x i8> %l22 to <4 x i16>
  %l1 = add <4 x i16> %le11, %le21
  %l2 = add <4 x i16> %le12, %le22
  %e1 = zext <4 x i16> %l1 to <4 x i32>
  %e2 = zext <4 x i16> %l2 to <4 x i32>
  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
  %a = add <4 x i32> %e1, %e3
  ret <4 x i32> %a
}

; Same pattern on full-width <16 x i8> loads.
define <16 x i16> @load_v16i8(ptr %p) {
; CHECK-LABEL: load_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q2, q0, [x0]
; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT:    ushll v0.8h, v0.8b, #3
; CHECK-NEXT:    uaddw2 v1.8h, v1.8h, v2.16b
; CHECK-NEXT:    uaddw v0.8h, v0.8h, v2.8b
; CHECK-NEXT:    ret
  %l1 = load <16 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 16
  %l2 = load <16 x i8>, ptr %q
  %e1 = zext <16 x i8> %l1 to <16 x i16>
  %e2 = zext <16 x i8> %l2 to <16 x i16>
  %e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %a = add <16 x i16> %e1, %e3
  ret <16 x i16> %a
}

; <2 x i8> inputs; the expected code uses per-lane byte loads.
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0, #2]
; CHECK-NEXT:    ldrb w9, [x0, #3]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    ldrb w8, [x0]
; CHECK-NEXT:    fmov s1, w8
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    ldrb w9, [x0, #1]
; CHECK-NEXT:    mov v1.s[1], w9
; CHECK-NEXT:    shl v0.2s, v0.2s, #3
; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NEXT:    ret
  %l1 = load <2 x i8>, ptr %p
  %q = getelementptr i8, ptr %p, i32 2
  %l2 = load <2 x i8>, ptr %q
  %e1 = zext <2 x i8> %l1 to <2 x i16>
  %e2 = zext <2 x i8> %l2 to <2 x i16>
  %se2 = shl <2 x i16> %e2, <i16 3, i16 3>
  %a = add <2 x i16> %e1, %se2
  ret <2 x i16> %a
}

; Operands are <8 x i8> vectors built by concatenating <4 x i8> loads from two
; pointers.
define <8 x i16> @load_bv_v4i8(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ushll v1.8h, v1.8b, #3
; CHECK-NEXT:    uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e1 = zext <8 x i8> %l1 to <8 x i16>
  %e2 = zext <8 x i8> %l2 to <8 x i16>
  %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %a = add <8 x i16> %e1, %e3
  ret <8 x i16> %a
}

; As load_bv_v4i8, but extending straight to i32.
define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #3
; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT:    uaddw v1.4s, v3.4s, v1.4h
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e1 = zext <8 x i8> %l1 to <8 x i32>
  %e2 = zext <8 x i8> %l2 to <8 x i32>
  %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <8 x i32> %e1, %e3
  ret <8 x i32> %a
}

; As load_bv_v4i8_i32, with <4 x i16> source vectors.
define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i16_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #3
; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT:    uaddw v1.4s, v3.4s, v1.4h
; CHECK-NEXT:    ret
  %j1 = load <4 x i16>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 8
  %j2 = load <4 x i16>, ptr %p1
  %k1 = load <4 x i16>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 8
  %k2 = load <4 x i16>, ptr %q1
  %l1 = shufflevector <4 x i16> %j1, <4 x i16> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %l2 = shufflevector <4 x i16> %j2, <4 x i16> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e1 = zext <8 x i16> %l1 to <8 x i32>
  %e2 = zext <8 x i16> %l2 to <8 x i32>
  %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <8 x i32> %e1, %e3
  ret <8 x i32> %a
}

; Three <4 x i8> sources concatenated into <12 x i8> operands.
define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: load_bv_3xv4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ldp s3, s2, [x2]
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-NEXT:    ushll2 v4.4s, v1.8h, #3
; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
; CHECK-NEXT:    uaddw v2.4s, v2.4s, v3.4h
; CHECK-NEXT:    uaddw2 v3.4s, v4.4s, v0.8h
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    stp q3, q2, [x8, #16]
; CHECK-NEXT:    str q0, [x8]
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %m1 = load <4 x i8>, ptr %r
  %r1 = getelementptr i8, ptr %r, i32 4
  %m2 = load <4 x i8>, ptr %r1
  %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn1 = shufflevector <4 x i8> %m1, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %mn2 = shufflevector <4 x i8> %m2, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %e1 = zext <12 x i8> %l1 to <12 x i32>
  %e2 = zext <12 x i8> %l2 to <12 x i32>
  %e3 = shl <12 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <12 x i32> %e1, %e3
  ret <12 x i32> %a
}

; Four <4 x i8> sources concatenated into <16 x i8> operands.
; NOTE(review): %e3 is never used; the add consumes %e2, and the expected
; assembly is an unshifted uaddl. This may have been meant to add %e3. It is
; left unchanged because switching it requires regenerating the assertions.
; The shift amount of the dead shl cannot be inferred from the assembly and is
; assumed to be 3 like the neighbouring tests.
define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK-LABEL: load_bv_4xv4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ldp s2, s3, [x2]
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
; CHECK-NEXT:    uaddl v1.8h, v2.8b, v3.8b
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %m1 = load <4 x i8>, ptr %r
  %r1 = getelementptr i8, ptr %r, i32 4
  %m2 = load <4 x i8>, ptr %r1
  %n1 = load <4 x i8>, ptr %s
  %s1 = getelementptr i8, ptr %s, i32 4
  %n2 = load <4 x i8>, ptr %s1
  %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %e1 = zext <16 x i8> %l1 to <16 x i16>
  %e2 = zext <16 x i8> %l2 to <16 x i16>
  %e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %a = add <16 x i16> %e1, %e2
  ret <16 x i16> %a
}

; Two i16 differences of concatenated <8 x i8> vectors are combined as
; sext(first) + (zext(second) << 16).
define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK-LABEL: double_bv_2xv4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ldp s2, s3, [x2]
; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    shll v3.4s, v2.4h, #16
; CHECK-NEXT:    shll2 v1.4s, v2.8h, #16
; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %m1 = load <4 x i8>, ptr %r
  %r1 = getelementptr i8, ptr %r, i32 4
  %m2 = load <4 x i8>, ptr %r1
  %n1 = load <4 x i8>, ptr %s
  %s1 = getelementptr i8, ptr %s, i32 4
  %n2 = load <4 x i8>, ptr %s1
  %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ejk1 = zext <8 x i8> %jk1 to <8 x i16>
  %ejk2 = zext <8 x i8> %jk2 to <8 x i16>
  %ajk = sub <8 x i16> %ejk1, %ejk2
  %enm1 = zext <8 x i8> %mn1 to <8 x i16>
  %enm2 = zext <8 x i8> %mn2 to <8 x i16>
  %anm = sub <8 x i16> %enm1, %enm2
  %x = sext <8 x i16> %ajk to <8 x i32>
  %y = zext <8 x i16> %anm to <8 x i32>
  %ys = shl <8 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %a = add <8 x i32> %x, %ys
  ret <8 x i32> %a
}

; 16-lane variant of double_bv_2xv4i8_i32, fed from eight pointers.
define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
; CHECK-LABEL: double_bv_4xv4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ldp s2, s3, [x2]
; CHECK-NEXT:    usubl v1.8h, v0.8b, v1.8b
; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
; CHECK-NEXT:    ldp s4, s5, [x4]
; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
; CHECK-NEXT:    ld1 { v5.s }[1], [x5]
; CHECK-NEXT:    ldp s6, s7, [x6]
; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
; CHECK-NEXT:    ld1 { v7.s }[1], [x7]
; CHECK-NEXT:    usubl v5.8h, v6.8b, v7.8b
; CHECK-NEXT:    shll v0.4s, v4.4h, #16
; CHECK-NEXT:    shll2 v4.4s, v4.8h, #16
; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    saddw2 v1.4s, v4.4s, v1.8h
; CHECK-NEXT:    shll v6.4s, v5.4h, #16
; CHECK-NEXT:    shll2 v3.4s, v5.8h, #16
; CHECK-NEXT:    saddw2 v3.4s, v3.4s, v2.8h
; CHECK-NEXT:    saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %m1 = load <4 x i8>, ptr %r
  %r1 = getelementptr i8, ptr %r, i32 4
  %m2 = load <4 x i8>, ptr %r1
  %n1 = load <4 x i8>, ptr %s
  %s1 = getelementptr i8, ptr %s, i32 4
  %n2 = load <4 x i8>, ptr %s1
  %j3 = load <4 x i8>, ptr %t
  %t3 = getelementptr i8, ptr %t, i32 4
  %j4 = load <4 x i8>, ptr %t3
  %k3 = load <4 x i8>, ptr %u
  %u3 = getelementptr i8, ptr %u, i32 4
  %k4 = load <4 x i8>, ptr %u3
  %m3 = load <4 x i8>, ptr %v
  %v3 = getelementptr i8, ptr %v, i32 4
  %m4 = load <4 x i8>, ptr %v3
  %n3 = load <4 x i8>, ptr %w
  %w3 = getelementptr i8, ptr %w, i32 4
  %n4 = load <4 x i8>, ptr %w3
  %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn3 = shufflevector <4 x i8> %m3, <4 x i8> %n3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mn4 = shufflevector <4 x i8> %m4, <4 x i8> %n4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %l3 = shufflevector <8 x i8> %jk3, <8 x i8> %mn3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %l4 = shufflevector <8 x i8> %jk4, <8 x i8> %mn4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ejk1 = zext <16 x i8> %l1 to <16 x i16>
  %ejk2 = zext <16 x i8> %l2 to <16 x i16>
  %ajk = sub <16 x i16> %ejk1, %ejk2
  %enm1 = zext <16 x i8> %l3 to <16 x i16>
  %enm2 = zext <16 x i8> %l4 to <16 x i16>
  %anm = sub <16 x i16> %enm1, %enm2
  %x = sext <16 x i16> %ajk to <16 x i32>
  %y = zext <16 x i16> %anm to <16 x i32>
  %ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %a = add <16 x i32> %x, %ys
  ret <16 x i32> %a
}

; Like double_bv_4xv4i8_i32, but each <16 x i8> operand is assembled through a
; chain of widening shuffles, and the subtraction pairs are %l1-%l3 / %l2-%l4.
define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
; CHECK-LABEL: double2_bv_4xv4i8_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d3, [x3]
; CHECK-NEXT:    ldr d4, [x4]
; CHECK-NEXT:    ldr d5, [x5]
; CHECK-NEXT:    ldr d6, [x6]
; CHECK-NEXT:    ldr d7, [x7]
; CHECK-NEXT:    usubl v1.8h, v1.8b, v4.8b
; CHECK-NEXT:    usubl v2.8h, v2.8b, v5.8b
; CHECK-NEXT:    usubl v3.8h, v3.8b, v7.8b
; CHECK-NEXT:    usubl v4.8h, v0.8b, v6.8b
; CHECK-NEXT:    shll2 v0.4s, v1.8h, #16
; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
; CHECK-NEXT:    shll2 v6.4s, v4.8h, #16
; CHECK-NEXT:    shll2 v7.4s, v3.8h, #16
; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    saddw v1.4s, v5.4s, v2.4h
; CHECK-NEXT:    saddw v2.4s, v6.4s, v4.4h
; CHECK-NEXT:    saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT:    ret
  %j1 = load <4 x i8>, ptr %p
  %p1 = getelementptr i8, ptr %p, i32 4
  %j2 = load <4 x i8>, ptr %p1
  %k1 = load <4 x i8>, ptr %q
  %q1 = getelementptr i8, ptr %q, i32 4
  %k2 = load <4 x i8>, ptr %q1
  %m1 = load <4 x i8>, ptr %r
  %r1 = getelementptr i8, ptr %r, i32 4
  %m2 = load <4 x i8>, ptr %r1
  %n1 = load <4 x i8>, ptr %s
  %s1 = getelementptr i8, ptr %s, i32 4
  %n2 = load <4 x i8>, ptr %s1
  %j3 = load <4 x i8>, ptr %t
  %t3 = getelementptr i8, ptr %t, i32 4
  %j4 = load <4 x i8>, ptr %t3
  %k3 = load <4 x i8>, ptr %u
  %u3 = getelementptr i8, ptr %u, i32 4
  %k4 = load <4 x i8>, ptr %u3
  %m3 = load <4 x i8>, ptr %v
  %v3 = getelementptr i8, ptr %v, i32 4
  %m4 = load <4 x i8>, ptr %v3
  %n3 = load <4 x i8>, ptr %w
  %w3 = getelementptr i8, ptr %w, i32 4
  %n4 = load <4 x i8>, ptr %w3
  %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m1l = shufflevector <4 x i8> %m1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n1l = shufflevector <4 x i8> %n1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m2l = shufflevector <4 x i8> %m2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n2l = shufflevector <4 x i8> %n2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m3l = shufflevector <4 x i8> %m3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n3l = shufflevector <4 x i8> %n3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m4l = shufflevector <4 x i8> %m4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n4l = shufflevector <4 x i8> %n4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %ejk1 = zext <16 x i8> %l1 to <16 x i16>
  %ejk2 = zext <16 x i8> %l3 to <16 x i16>
  %ajk = sub <16 x i16> %ejk1, %ejk2
  %enm1 = zext <16 x i8> %l2 to <16 x i16>
  %enm2 = zext <16 x i8> %l4 to <16 x i16>
  %anm = sub <16 x i16> %enm1, %enm2
  %x = sext <16 x i16> %ajk to <16 x i32>
  %y = zext <16 x i16> %anm to <16 x i32>
  %ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %a = add <16 x i32> %x, %ys
  ret <16 x i32> %a
}

; Extra use of one of the loads: %lp1 is also stored to %z.
define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_load:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    add x8, x3, #8
; CHECK-NEXT:    add x11, x3, #12
; CHECK-NEXT:    str s1, [x4]
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    ldp s0, s5, [x2]
; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
; CHECK-NEXT:    umov w9, v2.h[0]
; CHECK-NEXT:    umov w10, v2.h[1]
; CHECK-NEXT:    mov v0.b[8], w9
; CHECK-NEXT:    umov w9, v2.h[2]
; CHECK-NEXT:    mov v0.b[9], w10
; CHECK-NEXT:    umov w10, v2.h[3]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    mov v0.b[10], w9
; CHECK-NEXT:    add x9, x1, #4
; CHECK-NEXT:    uzp1 v1.8b, v1.8b, v2.8b
; CHECK-NEXT:    mov v0.b[11], w10
; CHECK-NEXT:    add x10, x1, #12
; CHECK-NEXT:    ld1 { v0.s }[3], [x3], #4
; CHECK-NEXT:    ldr s4, [x0, #12]
; CHECK-NEXT:    ldp s3, s16, [x0, #4]
; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
; CHECK-NEXT:    ldp s6, s7, [x2, #8]
; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
; CHECK-NEXT:    add x8, x1, #8
; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
; CHECK-NEXT:    uaddl v2.8h, v3.8b, v4.8b
; CHECK-NEXT:    ushll v3.8h, v6.8b, #0
; CHECK-NEXT:    uaddl v4.8h, v5.8b, v7.8b
; CHECK-NEXT:    uaddl v1.8h, v1.8b, v16.8b
; CHECK-NEXT:    uaddw2 v5.8h, v3.8h, v0.16b
; CHECK-NEXT:    ushll v0.4s, v2.4h, #3
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT:    ushll v6.4s, v4.4h, #3
; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #3
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v5.8h
; CHECK-NEXT:    uaddw v2.4s, v6.4s, v5.4h
; CHECK-NEXT:    ret
  %lp1 = load <4 x i8>, ptr %p
  store <4 x i8> %lp1, ptr %z
  %p2 = getelementptr i8, ptr %p, i32 4
  %lp2 = load <4 x i8>, ptr %p2
  %p3 = getelementptr i8, ptr %p, i32 8
  %lp3 = load <4 x i8>, ptr %p3
  %p4 = getelementptr i8, ptr %p, i32 12
  %lp4 = load <4 x i8>, ptr %p4
  %lq1 = load <4 x i8>, ptr %q
  %q2 = getelementptr i8, ptr %q, i32 4
  %lq2 = load <4 x i8>, ptr %q2
  %q3 = getelementptr i8, ptr %q, i32 8
  %lq3 = load <4 x i8>, ptr %q3
  %q4 = getelementptr i8, ptr %q, i32 12
  %lq4 = load <4 x i8>, ptr %q4
  %lr1 = load <4 x i8>, ptr %r
  %r2 = getelementptr i8, ptr %r, i32 4
  %lr2 = load <4 x i8>, ptr %r2
  %r3 = getelementptr i8, ptr %r, i32 8
  %lr3 = load <4 x i8>, ptr %r3
  %r4 = getelementptr i8, ptr %r, i32 12
  %lr4 = load <4 x i8>, ptr %r4
  %ls1 = load <4 x i8>, ptr %s
  %s2 = getelementptr i8, ptr %s, i32 4
  %ls2 = load <4 x i8>, ptr %s2
  %s3 = getelementptr i8, ptr %s, i32 8
  %ls3 = load <4 x i8>, ptr %s3
  %s4 = getelementptr i8, ptr %s, i32 12
  %ls4 = load <4 x i8>, ptr %s4
  %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %le11 = zext <16 x i8> %l1 to <16 x i16>
  %le12 = zext <16 x i8> %l3 to <16 x i16>
  %le21 = zext <16 x i8> %l2 to <16 x i16>
  %le22 = zext <16 x i8> %l4 to <16 x i16>
  %la1 = add <16 x i16> %le11, %le12
  %la2 = add <16 x i16> %le21, %le22
  %e1 = zext <16 x i16> %la1 to <16 x i32>
  %e2 = zext <16 x i16> %la2 to <16 x i32>
  %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <16 x i32> %e1, %se2
  ret <16 x i32> %a
}

; Extra use of a shuffle result: %l4 is also stored to %z.
define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_shuffle:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0, #8]
; CHECK-NEXT:    add x8, x1, #8
; CHECK-NEXT:    ldr s6, [x1, #12]
; CHECK-NEXT:    ldp s17, s18, [x2, #8]
; CHECK-NEXT:    ldp s2, s3, [x2]
; CHECK-NEXT:    add x9, x3, #8
; CHECK-NEXT:    mov v4.16b, v1.16b
; CHECK-NEXT:    ldp s7, s16, [x0]
; CHECK-NEXT:    ldr s5, [x3, #12]
; CHECK-NEXT:    mov v1.s[1], v6.s[0]
; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT:    mov v4.s[1], v6.s[0]
; CHECK-NEXT:    ld1 { v7.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v16.s }[1], [x1]
; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
; CHECK-NEXT:    mov v4.s[2], v18.s[0]
; CHECK-NEXT:    mov v18.s[1], v5.s[0]
; CHECK-NEXT:    uaddl v1.8h, v16.8b, v1.8b
; CHECK-NEXT:    uaddl v6.8h, v7.8b, v0.8b
; CHECK-NEXT:    uaddl v2.8h, v2.8b, v17.8b
; CHECK-NEXT:    uaddl v3.8h, v3.8b, v18.8b
; CHECK-NEXT:    ushll v0.4s, v1.4h, #3
; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #3
; CHECK-NEXT:    mov v4.s[3], v5.s[0]
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v6.4h
; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v6.8h
; CHECK-NEXT:    ushll v7.4s, v3.4h, #3
; CHECK-NEXT:    ushll2 v3.4s, v3.8h, #3
; CHECK-NEXT:    str q4, [x4]
; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
; CHECK-NEXT:    uaddw v2.4s, v7.4s, v2.4h
; CHECK-NEXT:    ret
  %lp1 = load <4 x i8>, ptr %p
  %p2 = getelementptr i8, ptr %p, i32 4
  %lp2 = load <4 x i8>, ptr %p2
  %p3 = getelementptr i8, ptr %p, i32 8
  %lp3 = load <4 x i8>, ptr %p3
  %p4 = getelementptr i8, ptr %p, i32 12
  %lp4 = load <4 x i8>, ptr %p4
  %lq1 = load <4 x i8>, ptr %q
  %q2 = getelementptr i8, ptr %q, i32 4
  %lq2 = load <4 x i8>, ptr %q2
  %q3 = getelementptr i8, ptr %q, i32 8
  %lq3 = load <4 x i8>, ptr %q3
  %q4 = getelementptr i8, ptr %q, i32 12
  %lq4 = load <4 x i8>, ptr %q4
  %lr1 = load <4 x i8>, ptr %r
  %r2 = getelementptr i8, ptr %r, i32 4
  %lr2 = load <4 x i8>, ptr %r2
  %r3 = getelementptr i8, ptr %r, i32 8
  %lr3 = load <4 x i8>, ptr %r3
  %r4 = getelementptr i8, ptr %r, i32 12
  %lr4 = load <4 x i8>, ptr %r4
  %ls1 = load <4 x i8>, ptr %s
  %s2 = getelementptr i8, ptr %s, i32 4
  %ls2 = load <4 x i8>, ptr %s2
  %s3 = getelementptr i8, ptr %s, i32 8
  %ls3 = load <4 x i8>, ptr %s3
  %s4 = getelementptr i8, ptr %s, i32 12
  %ls4 = load <4 x i8>, ptr %s4
  %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  store <16 x i8> %l4, ptr %z
  %le11 = zext <16 x i8> %l1 to <16 x i16>
  %le12 = zext <16 x i8> %l3 to <16 x i16>
  %le21 = zext <16 x i8> %l2 to <16 x i16>
  %le22 = zext <16 x i8> %l4 to <16 x i16>
  %la1 = add <16 x i16> %le11, %le12
  %la2 = add <16 x i16> %le21, %le22
  %e1 = zext <16 x i16> %la1 to <16 x i32>
  %e2 = zext <16 x i16> %la2 to <16 x i32>
  %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <16 x i32> %e1, %se2
  ret <16 x i32> %a
}

; Extra use of an extend: %le22 is also stored to %z.
define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_ext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s1, s2, [x2]
; CHECK-NEXT:    add x10, x3, #12
; CHECK-NEXT:    ldp s3, s5, [x0]
; CHECK-NEXT:    add x11, x1, #12
; CHECK-NEXT:    ldp s6, s0, [x2, #8]
; CHECK-NEXT:    add x8, x3, #8
; CHECK-NEXT:    ldp s7, s4, [x0, #8]
; CHECK-NEXT:    add x9, x1, #8
; CHECK-NEXT:    ld1 { v1.s }[1], [x3], #4
; CHECK-NEXT:    ld1 { v3.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v5.s }[1], [x1]
; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
; CHECK-NEXT:    ld1 { v2.s }[1], [x3]
; CHECK-NEXT:    ld1 { v0.s }[1], [x10]
; CHECK-NEXT:    ld1 { v7.s }[1], [x9]
; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
; CHECK-NEXT:    uaddl v5.8h, v5.8b, v4.8b
; CHECK-NEXT:    uaddl v2.8h, v2.8b, v0.8b
; CHECK-NEXT:    ushll v16.8h, v0.8b, #0
; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
; CHECK-NEXT:    uaddl v6.8h, v1.8b, v6.8b
; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
; CHECK-NEXT:    ushll v1.4s, v5.4h, #3
; CHECK-NEXT:    ushll v7.4s, v2.4h, #3
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT:    ushll2 v5.4s, v5.8h, #3
; CHECK-NEXT:    stp q4, q16, [x4]
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v3.4h
; CHECK-NEXT:    uaddw2 v1.4s, v5.4s, v3.8h
; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v6.8h
; CHECK-NEXT:    uaddw v2.4s, v7.4s, v6.4h
; CHECK-NEXT:    ret
  %lp1 = load <4 x i8>, ptr %p
  %p2 = getelementptr i8, ptr %p, i32 4
  %lp2 = load <4 x i8>, ptr %p2
  %p3 = getelementptr i8, ptr %p, i32 8
  %lp3 = load <4 x i8>, ptr %p3
  %p4 = getelementptr i8, ptr %p, i32 12
  %lp4 = load <4 x i8>, ptr %p4
  %lq1 = load <4 x i8>, ptr %q
  %q2 = getelementptr i8, ptr %q, i32 4
  %lq2 = load <4 x i8>, ptr %q2
  %q3 = getelementptr i8, ptr %q, i32 8
  %lq3 = load <4 x i8>, ptr %q3
  %q4 = getelementptr i8, ptr %q, i32 12
  %lq4 = load <4 x i8>, ptr %q4
  %lr1 = load <4 x i8>, ptr %r
  %r2 = getelementptr i8, ptr %r, i32 4
  %lr2 = load <4 x i8>, ptr %r2
  %r3 = getelementptr i8, ptr %r, i32 8
  %lr3 = load <4 x i8>, ptr %r3
  %r4 = getelementptr i8, ptr %r, i32 12
  %lr4 = load <4 x i8>, ptr %r4
  %ls1 = load <4 x i8>, ptr %s
  %s2 = getelementptr i8, ptr %s, i32 4
  %ls2 = load <4 x i8>, ptr %s2
  %s3 = getelementptr i8, ptr %s, i32 8
  %ls3 = load <4 x i8>, ptr %s3
  %s4 = getelementptr i8, ptr %s, i32 12
  %ls4 = load <4 x i8>, ptr %s4
  %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  %le11 = zext <16 x i8> %l1 to <16 x i16>
  %le12 = zext <16 x i8> %l3 to <16 x i16>
  %le21 = zext <16 x i8> %l2 to <16 x i16>
  %le22 = zext <16 x i8> %l4 to <16 x i16>
  store <16 x i16> %le22, ptr %z
  %la1 = add <16 x i16> %le11, %le12
  %la2 = add <16 x i16> %le21, %le22
  %e1 = zext <16 x i16> %la1 to <16 x i32>
  %e2 = zext <16 x i16> %la2 to <16 x i32>
  %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %a = add <16 x i32> %e1, %se2
  ret <16 x i32> %a
}

; NOTE(review): the body of this function continues past the end of the
; reviewed excerpt; only the visible part is reproduced below.
define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_add:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    add x10, x3, #12
; CHECK-NEXT:    ldp s2, s3, [x2]
; CHECK-NEXT:    add x11, x1, #12
; CHECK-NEXT:    ldp s4, s5, [x0, #8]
; CHECK-NEXT:    add x8, x3, #8
; CHECK-NEXT:    ldp s6, s7, [x2, #8]
; CHECK-NEXT:    add x9, x1, #8
; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
; CHECK-NEXT:    uaddl v1.8h, v0.8b, v4.8b
; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
; CHECK-NEXT:    ushll v0.4s, v5.4h, #3
; CHECK-NEXT:    ushll v4.4s, v7.4h, #3
; CHECK-NEXT:    ushll2 v3.4s, v7.8h, #3
; CHECK-NEXT:    ushll2 v6.4s, v5.8h, #3
; CHECK-NEXT:    stp q5, q7, [x4]
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
; CHECK-NEXT:    uaddw v2.4s, v4.4s, v2.4h
; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v1.8h
; CHECK-NEXT:    ret
  %lp1 = load <4 x i8>, ptr %p
  %p2 = getelementptr i8, ptr %p, i32 4
  %lp2 = load <4 x i8>, ptr %p2
  %p3 = getelementptr i8, ptr %p, i32 8
  %lp3 = load <4 x i8>, ptr %p3
  %p4 = getelementptr i8, ptr %p, i32 12
  %lp4 = load <4 x i8>, ptr %p4
  %lq1 = load <4 x i8>, ptr %q
  %q2 = getelementptr i8, ptr %q, i32 4
  %lq2 = load <4 x i8>, ptr %q2
  %q3 = getelementptr i8, ptr %q, i32 8
  %lq3 = load <4 x i8>, ptr %q3
  %q4 = getelementptr i8, ptr %q, i32 12
  %lq4 = load <4 x i8>, ptr %q4
  %lr1 = load <4 x i8>, ptr %r
  %r2 = getelementptr i8, ptr %r, i32 4
  %lr2 = load <4 x i8>, ptr %r2
  %r3 = getelementptr i8, ptr %r, i32 8
  %lr3 = load <4 x i8>, ptr %r3
  %r4 = getelementptr i8, ptr %r, i32 12
  %lr4 = load <4 x i8>, ptr %r4
  %ls1 = load <4 x i8>, ptr %s
  %s2 = getelementptr i8, ptr %s, i32 4
  %ls2 = load <4 x i8>, ptr %s2
  %s3 = getelementptr i8, ptr %s, i32 8
  %ls3 = load <4 x i8>, ptr %s3
  %s4 = getelementptr i8, ptr %s, i32 12
  %ls4 = load <4 x i8>, ptr %s4
  %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %n1l =
shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> %le11 = zext <16 x i8> %l1 to <16 x i16> %le12 = zext <16 x i8> %l3 to <16 x i16> %le21 = zext <16 x i8> %l2 to <16 x i16> %le22 = zext <16 x i8> %l4 to <16 x i16> %la1 = add <16 x i16> %le11, %le12 %la2 = add <16 x i16> %le21, %le22 store <16 x i16> %la2, ptr %z %e1 = zext <16 x i16> %la1 to <16 x i32> %e2 = zext <16 x i16> %la2 to <16 x i32> %se2 = shl <16 x i32> %e2, %a = add <16 x i32> %e1, %se2 ret <16 x i32> %a } define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_ext2: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp s0, s1, [x2] ; CHECK-NEXT: add x10, x3, #12 ; CHECK-NEXT: ldp s2, s3, [x0] ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: ldp s4, s5, [x2, #8] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: ldp s6, s7, [x0, #8] ; CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v3.s }[1], [x1] ; 
CHECK-NEXT: ld1 { v7.s }[1], [x11] ; CHECK-NEXT: ld1 { v1.s }[1], [x3] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] ; CHECK-NEXT: ld1 { v6.s }[1], [x9] ; CHECK-NEXT: ld1 { v4.s }[1], [x8] ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b ; CHECK-NEXT: uaddl v3.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b ; CHECK-NEXT: ushll v0.4s, v7.4h, #3 ; CHECK-NEXT: ushll2 v1.4s, v7.8h, #3 ; CHECK-NEXT: ushll v5.4s, v3.4h, #3 ; CHECK-NEXT: ushll2 v6.4s, v3.8h, #3 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 ; CHECK-NEXT: ushll v17.4s, v3.4h, #0 ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h ; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h ; CHECK-NEXT: uaddw2 v3.4s, v6.4s, v4.8h ; CHECK-NEXT: ushll2 v4.4s, v7.8h, #0 ; CHECK-NEXT: ushll v5.4s, v7.4h, #0 ; CHECK-NEXT: stp q17, q16, [x4, #32] ; CHECK-NEXT: stp q5, q4, [x4] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 %lp2 = load <4 x i8>, ptr %p2 %p3 = getelementptr i8, ptr %p, i32 8 %lp3 = load <4 x i8>, ptr %p3 %p4 = getelementptr i8, ptr %p, i32 12 %lp4 = load <4 x i8>, ptr %p4 %lq1 = load <4 x i8>, ptr %q %q2 = getelementptr i8, ptr %q, i32 4 %lq2 = load <4 x i8>, ptr %q2 %q3 = getelementptr i8, ptr %q, i32 8 %lq3 = load <4 x i8>, ptr %q3 %q4 = getelementptr i8, ptr %q, i32 12 %lq4 = load <4 x i8>, ptr %q4 %lr1 = load <4 x i8>, ptr %r %r2 = getelementptr i8, ptr %r, i32 4 %lr2 = load <4 x i8>, ptr %r2 %r3 = getelementptr i8, ptr %r, i32 8 %lr3 = load <4 x i8>, ptr %r3 %r4 = getelementptr i8, ptr %r, i32 12 %lr4 = load <4 x i8>, ptr %r4 %ls1 = load <4 x i8>, ptr %s %s2 = getelementptr i8, ptr %s, i32 4 %ls2 = load <4 x i8>, ptr %s2 %s3 = getelementptr i8, ptr %s, i32 8 %ls3 = load <4 x i8>, ptr %s3 %s4 = getelementptr i8, ptr %s, i32 12 %ls4 = load <4 x i8>, ptr %s4 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> %jkm1 = shufflevector <16 x i8> %jk1, <16 x 
i8> %m1l, <16 x i32> %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> %le11 = zext <16 x i8> %l1 to <16 x i16> %le12 = zext <16 x i8> %l3 to <16 x i16> %le21 = zext <16 x i8> %l2 to <16 x i16> %le22 = zext <16 x i8> %l4 to <16 x i16> %la1 = add <16 x i16> %le11, %le12 %la2 = add <16 x i16> %le21, %le22 %e1 = zext <16 x i16> %la1 to <16 x i32> %e2 = zext <16 x i16> %la2 to <16 x i32> store <16 x i32> %e2, ptr %z %se2 = shl <16 x i32> %e2, %a = add <16 x i32> %e1, %se2 ret <16 x i32> %a } define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shl: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: add x10, x3, #12 ; CHECK-NEXT: ldp s2, s3, [x2] ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: ldp s4, s5, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: ldp s6, s7, [x2, #8] ; CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 ; CHECK-NEXT: ld1 
{ v1.s }[1], [x1] ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] ; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b ; CHECK-NEXT: ushll v5.4s, v1.4h, #3 ; CHECK-NEXT: ushll v6.4s, v3.4h, #3 ; CHECK-NEXT: ushll2 v7.4s, v1.8h, #3 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: uaddw2 v1.4s, v7.4s, v4.8h ; CHECK-NEXT: stp q5, q7, [x4] ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h ; CHECK-NEXT: stp q6, q16, [x4, #32] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 %lp2 = load <4 x i8>, ptr %p2 %p3 = getelementptr i8, ptr %p, i32 8 %lp3 = load <4 x i8>, ptr %p3 %p4 = getelementptr i8, ptr %p, i32 12 %lp4 = load <4 x i8>, ptr %p4 %lq1 = load <4 x i8>, ptr %q %q2 = getelementptr i8, ptr %q, i32 4 %lq2 = load <4 x i8>, ptr %q2 %q3 = getelementptr i8, ptr %q, i32 8 %lq3 = load <4 x i8>, ptr %q3 %q4 = getelementptr i8, ptr %q, i32 12 %lq4 = load <4 x i8>, ptr %q4 %lr1 = load <4 x i8>, ptr %r %r2 = getelementptr i8, ptr %r, i32 4 %lr2 = load <4 x i8>, ptr %r2 %r3 = getelementptr i8, ptr %r, i32 8 %lr3 = load <4 x i8>, ptr %r3 %r4 = getelementptr i8, ptr %r, i32 12 %lr4 = load <4 x i8>, ptr %r4 %ls1 = load <4 x i8>, ptr %s %s2 = getelementptr i8, ptr %s, i32 4 %ls2 = load <4 x i8>, ptr %s2 %s3 = getelementptr i8, ptr %s, i32 8 %ls3 = load <4 x i8>, ptr %s3 %s4 = getelementptr i8, ptr %s, i32 12 %ls4 = load <4 x i8>, ptr %s4 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> 
%n1l, <16 x i32> %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> %le11 = zext <16 x i8> %l1 to <16 x i16> %le12 = zext <16 x i8> %l3 to <16 x i16> %le21 = zext <16 x i8> %l2 to <16 x i16> %le22 = zext <16 x i8> %l4 to <16 x i16> %la1 = add <16 x i16> %le11, %le12 %la2 = add <16 x i16> %le21, %le22 %e1 = zext <16 x i16> %la1 to <16 x i32> %e2 = zext <16 x i16> %la2 to <16 x i32> %se2 = shl <16 x i32> %e2, store <16 x i32> %se2, ptr %z %a = add <16 x i32> %e1, %se2 ret <16 x i32> %a } define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) { ; CHECK-LABEL: commuted_loads: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: add v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 ; CHECK-NEXT: ushll v3.4s, v1.4h, #3 ; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h ; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 %l12 = load <8 x i8>, ptr %q1 %l21 = load <8 x i8>, ptr %p2 %q2 = getelementptr i8, 
ptr %p2, i32 8 %l22 = load <8 x i8>, ptr %q2 %l1 = add <8 x i8> %l21, %l11 %l2 = add <8 x i8> %l22, %l12 %e1 = zext <8 x i8> %l1 to <8 x i32> %e2 = zext <8 x i8> %l2 to <8 x i32> %se2 = shl <8 x i32> %e2, %a = add <8 x i32> %e1, %se2 ret <8 x i32> %a } define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) { ; CHECK-LABEL: commuted_loads2: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp d0, d3, [x1] ; CHECK-NEXT: ldp d1, d2, [x0] ; CHECK-NEXT: add v0.8b, v1.8b, v0.8b ; CHECK-NEXT: add v1.8b, v2.8b, v3.8b ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v2.8h, v1.8b, #0 ; CHECK-NEXT: ushll v3.4s, v0.4h, #3 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3 ; CHECK-NEXT: uaddw2 v1.4s, v0.4s, v2.8h ; CHECK-NEXT: uaddw v0.4s, v3.4s, v2.4h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 %l12 = load <8 x i8>, ptr %q1 %l21 = load <8 x i8>, ptr %p2 %q2 = getelementptr i8, ptr %p2, i32 8 %l22 = load <8 x i8>, ptr %q2 %l1 = add <8 x i8> %l11, %l21 %l2 = add <8 x i8> %l12, %l22 %e1 = zext <8 x i8> %l2 to <8 x i32> %e2 = zext <8 x i8> %l1 to <8 x i32> %se2 = shl <8 x i32> %e2, %a = add <8 x i32> %e1, %se2 ret <8 x i32> %a } define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) { ; CHECK-LABEL: commuted_sub: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp d2, d1, [x1] ; CHECK-NEXT: ldr d0, [x0, #8] ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: add v1.8b, v1.8b, v2.8b ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v2.8h, v1.8b, #0 ; CHECK-NEXT: ushll v3.4s, v0.4h, #3 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3 ; CHECK-NEXT: usubw2 v1.4s, v0.4s, v2.8h ; CHECK-NEXT: usubw v0.4s, v3.4s, v2.4h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 %l12 = load <8 x i8>, ptr %q1 %l21 = load <8 x i8>, ptr %p2 %q2 = getelementptr i8, ptr %p2, i32 8 %l22 = load <8 x i8>, ptr %q2 %l1 = add <8 x i8> %l11, %l21 %l2 = add <8 x i8> %l12, %l22 %e1 = zext <8 x i8> %l1 to <8 x i32> %e2 = zext <8 x i8> %l2 to <8 x i32> %se2 
= shl <8 x i32> %e2, %a = sub <8 x i32> %se2, %e1 ret <8 x i32> %a } define <4 x i32> @bitcast(ptr %p) { ; CHECK-LABEL: bitcast: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l1b = load float, ptr %p %l1 = bitcast float %l1b to <4 x i8> %q = getelementptr i8, ptr %p, i32 4 %l2b = load float, ptr %q %l2 = bitcast float %l2b to <4 x i8> %e1 = zext <4 x i8> %l1 to <4 x i32> %e2 = zext <4 x i8> %l2 to <4 x i32> %e3 = shl <4 x i32> %e2, %a = add <4 x i32> %e1, %e3 ret <4 x i32> %a } define <4 x i32> @atomic(ptr %p) { ; CHECK-LABEL: atomic: ; CHECK: // %bb.0: ; CHECK-NEXT: ldar w8, [x0] ; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff ; CHECK-NEXT: ldr s1, [x0, #4] ; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: zip1 v2.8b, v2.8b, v0.8b ; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l1b = load atomic float, ptr %p acquire, align 4 %l1 = bitcast float %l1b to <4 x i8> %q = getelementptr i8, ptr %p, i32 4 %l2b = load float, ptr %q %l2 = bitcast float %l2b to <4 x i8> %e1 = zext <4 x i8> %l1 to <4 x i32> %e2 = zext <4 x i8> %l2 to <4 x i32> %e3 = shl <4 x i32> %e2, %a = add <4 x i32> %e1, %e3 ret <4 x i32> %a } define <4 x i32> @volatile(ptr %p) { ; CHECK-LABEL: volatile: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ldr s1, [x0, #4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p %l1 = bitcast float %l1b to <4 x i8> %q = getelementptr i8, ptr %p, i32 4 %l2b = load float, ptr %q %l2 = bitcast float %l2b to <4 x i8> %e1 = zext <4 x i8> 
%l1 to <4 x i32> %e2 = zext <4 x i8> %l2 to <4 x i32> %e3 = shl <4 x i32> %e2, %a = add <4 x i32> %e1, %e3 ret <4 x i32> %a }