; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=ALL,AVX512VBMI define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) { ; ALL-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; ALL: # %bb.0: ; ALL-NEXT: vpsrld $16, %xmm0, %xmm0 ; ALL-NEXT: retq %b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> ret <64 x i8> %b } define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpslldq {{.*#+}} zmm0 = 
zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: 
shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 
x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> ret <64 x i8> %shuffle } ; PR44379 define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_18_19_20_21_22_23_16_17_26_27_28_29_30_31_24_25_34_35_36_37_38_39_32_33_42_43_44_45_46_47_40_41_50_51_52_53_54_55_48_49_58_59_60_61_62_63_56_57(<64 x i8> %a) { ; ALL-LABEL: shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_18_19_20_21_22_23_16_17_26_27_28_29_30_31_24_25_34_35_36_37_38_39_32_33_42_43_44_45_46_47_40_41_50_51_52_53_54_55_48_49_58_59_60_61_62_63_56_57: ; ALL: # %bb.0: ; ALL-NEXT: vprolq $48, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> ret <64 x i8> %shuffle } ; PR54658 define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u,33,35,34,37,39,38,41,43,42,45,47,46,u,u,u,u,49,51,50,53,55,54,57,59,58,61,63,62,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; 
; AVX512DQ-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,3,2,5,7,6,9,11,10,13,15,14,17,19,18,21,23,22,25,27,26,29,31,30,33,35,34,37,39,38,41,43,42,45,47,46,49,51,50,53,55,54,57,59,58,61,63,62,1,3,2,5,1,3,2,5,1,3,2,5,1,3,2,5] ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @insert_dup_mem_v64i8_i32(ptr %ptr) { ; AVX512F-LABEL: insert_dup_mem_v64i8_i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_mem_v64i8_i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0 ; AVX512VBMI-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> zeroinitializer ret <64 x i8> %tmp3 } define <64 x i8> @insert_dup_mem_v64i8_sext_i8(ptr %ptr) { ; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0 ; AVX512VBMI-NEXT: retq %tmp = load i8, ptr %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> 
undef, <64 x i32> zeroinitializer ret <64 x i8> %tmp4 } define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(ptr %ptr) { ; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb 1(%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpbroadcastb 1(%rdi), %zmm0 ; AVX512VBMI-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> ret <64 x i8> %tmp3 } define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(ptr %ptr) { ; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb 3(%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpbroadcastb 3(%rdi), %zmm0 ; AVX512VBMI-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> ret <64 x i8> %tmp3 } define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(ptr %ptr) { ; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_sext_i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movsbl (%rdi), %eax ; AVX512F-NEXT: shrl $8, %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movsbl (%rdi), %eax ; AVX512BW-NEXT: shrl $8, %eax ; AVX512BW-NEXT: vpbroadcastb %eax, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movsbl (%rdi), %eax ; AVX512DQ-NEXT: shrl $8, %eax ; AVX512DQ-NEXT: vmovd %eax, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: movsbl (%rdi), %eax ; AVX512VBMI-NEXT: shrl $8, %eax ; AVX512VBMI-NEXT: vpbroadcastb %eax, %zmm0 ; AVX512VBMI-NEXT: retq %tmp = load i8, ptr %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> ret <64 x i8> %tmp4 } define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) { 
; ALL-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; ALL-NEXT: retq %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) { ; ALL-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; ALL-NEXT: retq %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vshufi64x2 
{{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> ret <64 x i8> %shuffle } define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} 
xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: 
retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm4 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm4 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm4 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: 
shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: 
shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_65_69_71_75_77_81_83_87_89_93_95_99_101_105_107_111_113_117_119_123_125(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_65_69_71_75_77_81_83_87_89_93_95_99_101_105_107_111_113_117_119_123_125: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,5,7,11,13] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_65_69_71_75_77_81_83_87_89_93_95_99_101_105_107_111_113_117_119_123_125: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,5,7,11,13] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_65_69_71_75_77_81_83_87_89_93_95_99_101_105_107_111_113_117_119_123_125: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,5,7,11,13] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,3,5,9,11,15],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_65_69_71_75_77_81_83_87_89_93_95_99_101_105_107_111_113_117_119_123_125: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret 
<64 x i8> %r } define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[0,2,6,8,12,14] ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[0,2,6,8,12,14] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[0,2,6,8,12,14] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,64,66,70,72,76,78,82,84,88,90,94,96,100,102,106,108,112,114,118,120,124,126] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_66_68_72_74_78_80_84_86_90_92_96_98_102_104_108_110_114_116_120_122_126(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_66_68_72_74_78_80_84_86_90_92_96_98_102_104_108_110_114_116_120_122_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} 
ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_66_68_72_74_78_80_84_86_90_92_96_98_102_104_108_110_114_116_120_122_126: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_66_68_72_74_78_80_84_86_90_92_96_98_102_104_108_110_114_116_120_122_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_66_68_72_74_78_80_84_86_90_92_96_98_102_104_108_110_114_116_120_122_126: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,66,68,72,74,78,80,84,86,90,92,96,98,102,104,108,110,114,116,120,122,126] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> ret <64 x i8> %r } define <64 x i8> 
@shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = ashr <16 x i32> %a0, %2 = ashr <16 x i32> %a1, %3 = bitcast <16 x i32> %1 to <64 x i8> %4 = bitcast <16 x i32> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpacksswb %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpacksswb %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpacksswb %zmm0, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = ashr <16 x i32> %a0, %2 = ashr <16 x i32> %a1, %3 = bitcast <16 x i32> %1 to <64 x i8> %4 = bitcast <16 x i32> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, 
%ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <16 x i32> %a0, %2 = lshr <16 x i32> %a1, %3 = bitcast <16 x i32> %1 to <64 x i8> %4 = bitcast <16 x i32> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpackuswb %zmm0, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <16 x i32> %a0, %2 = lshr <16 x 
i32> %a1, %3 = bitcast <16 x i32> %1 to <64 x i8> %4 = bitcast <16 x i32> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7] ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7] ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <32 x i16> %a0, %2 = lshr <32 x i16> %a1, %3 = bitcast <32 x i16> %1 to <64 x i8> %4 = bitcast <32 x i16> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; 
AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <32 x i16> %a0, %2 = lshr <32 x i16> %a1, %3 = bitcast <32 x i16> %1 to <64 x i8> %4 = bitcast <32 x i16> %2 to <64 x i8> %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> ret <64 x i8> %5 } define <64 x i8> @PR54562_ref(<64 x i8> %a0) { ; AVX512F-LABEL: PR54562_ref: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: PR54562_ref: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: PR54562_ref: ; 
AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: PR54562_ref: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46] ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> %shuffle2 = shufflevector <64 x i8> %shuffle1, <64 x i8> poison, <64 x i32> ret <64 x i8> %shuffle2 } define void @PR54562_mem(ptr %src, ptr %dst) { ; AVX512F-LABEL: PR54562_mem: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2] ; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: PR54562_mem: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: PR54562_mem: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rsi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: PR54562_mem: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46] ; AVX512VBMI-NEXT: vpermb (%rdi), %zmm0, %zmm0 ; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %load = load <64 x i8>, ptr %src, align 512 %shuffle1 = shufflevector <64 x i8> %load, <64 x i8> poison, <64 x i32> %shuffle2 = shufflevector <64 x i8> %shuffle1, <64 x i8> poison, <64 x i32> store <64 x i8> %shuffle2, ptr %dst, align 
512
  ret void
}

define <64 x i8> @shuffle_v32i16_zextinreg_to_v16i32(<64 x i8> %a) {
; ALL-LABEL: shuffle_v32i16_zextinreg_to_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT:    retq
  %b = shufflevector <64 x i8> %a, <64 x i8> , <64 x i32>
  ret <64 x i8> %b
}
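; Note on the final test: as its name and the ALL check lines indicate, the
; shuffle in @shuffle_v32i16_zextinreg_to_v16i32 interleaves the low 16 i16
; lanes of %a with zeros (an in-register zero extension to v16i32), and every
; AVX512 configuration lowers it to a single vpmovzxwd of the low 32 bytes.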