// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: powerpc-registered-target // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64-unknown-unknown -emit-llvm %s \ // RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=BE-PWR8 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64le-unknown-unknown -emit-llvm %s \ // RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=LE-PWR8 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64-unknown-unknown -emit-llvm %s \ // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE-PWR9 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64le-unknown-unknown -emit-llvm %s \ // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=LE-PWR9 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc-unknown-unknown -emit-llvm %s \ // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE32-PWR9 #include // BE-PWR8-LABEL: @test_ldrmb1( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 // BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]]) // BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) // BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> // BE-PWR8-NEXT: ret <16 x i8> [[TMP2]] // // LE-PWR8-LABEL: @test_ldrmb1( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: 
[[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 // LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP0]]) // LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]]) // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8> // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> // LE-PWR8-NEXT: ret <16 x i8> [[TMP3]] // // BE-PWR9-LABEL: @test_ldrmb1( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 1, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] 
= sub i64 16, [[TMP5]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // LE-PWR9-LABEL: @test_ldrmb1( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 1, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: 
[[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // BE32-PWR9-LABEL: @test_ldrmb1( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 // BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]]) // BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) // BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> // BE32-PWR9-NEXT: ret <16 x i8> 
[[TMP2]] // vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); } // BE-PWR8-LABEL: @test_strmb1( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15 // BE-PWR8-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb1( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 // LE-PWR8-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb1( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr 
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 1, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb1(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 1, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb1(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15
// BE32-PWR9-NEXT:    store i8 [[TMP3]], ptr [[TMP2]], align 1
// BE32-PWR9-NEXT:    ret void
//
// Length-1 "store rightmost bytes": a single i8 store (element 15 on BE,
// element 0 on LE) on PWR8/BE32, stxvll on 64-bit PWR9.
void test_strmb1(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 1, data); }

// BE-PWR8-LABEL: @test_strmb2(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE-PWR8-NEXT:    store i16 [[TMP5]], ptr [[TMP3]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb2(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
// LE-PWR8-NEXT:    store i16 [[TMP6]], ptr [[TMP3]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb2(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 2, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb2(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 2, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb2(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE32-PWR9-NEXT:    store i16 [[TMP5]], ptr [[TMP3]], align 1
// BE32-PWR9-NEXT:    ret void
//
// Length-2 "store rightmost bytes": one i16 store on PWR8/BE32 (byte-swapped
// on LE via llvm.bswap.i16); stxvll on 64-bit PWR9.
void test_strmb2(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 2, data); }

// BE-PWR8-LABEL: @test_strmb3(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE-PWR8-NEXT:    store i16 [[TMP5]], ptr [[TMP3]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
// BE-PWR8-NEXT:    store i8 [[TMP7]], ptr [[TMP6]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb3(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
// LE-PWR8-NEXT:    store i16 [[TMP6]], ptr [[TMP3]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 2
// LE-PWR8-NEXT:    store i8 [[TMP8]], ptr [[TMP7]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb3(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 3, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb3(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 3, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb3(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE32-PWR9-NEXT:    store i16 [[TMP5]], ptr [[TMP3]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
// BE32-PWR9-NEXT:    store i8 [[TMP7]], ptr [[TMP6]], align 1
// BE32-PWR9-NEXT:    ret void
//
// Length-3 "store rightmost bytes": decomposed into an i16 + an i8 store on
// PWR8/BE32 (i16 byte-swapped on LE); stxvll on 64-bit PWR9.
void test_strmb3(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 3, data); }

// BE-PWR8-LABEL: @test_strmb4(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE-PWR8-NEXT:    store i32 [[TMP5]], ptr [[TMP3]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb4(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb4( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 4, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x 
i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb4( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 4, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr 
[[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb4( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb4(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 4, data); } // BE-PWR8-LABEL: @test_strmb5( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11 // BE-PWR8-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb5( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> 
[[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 4 // LE-PWR8-NEXT: store i8 [[TMP8]], ptr [[TMP7]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb5( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 5, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: 
[[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb5( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 5, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = 
trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb5( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 
// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11 // BE32-PWR9-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb5(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 5, data); } // BE-PWR8-LABEL: @test_strmb6( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb6( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast 
<16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb6( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 6, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // 
BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb6( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], 
ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 6, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb6( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr 
[[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb6(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 6, data); } // BE-PWR8-LABEL: @test_strmb7( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> 
[[TMP1]], i64 9 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb7( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 6 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb7( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store 
ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 7, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // 
LE-PWR9-LABEL: @test_strmb7( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 7, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr 
[[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb7( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb7(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 7, data); } // BE-PWR8-LABEL: @test_strmb8( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: 
[[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb8( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb8( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr 
[[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 8, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: 
@test_strmb8( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 8, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], 
align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb8( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb8(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 8, data); } // BE-PWR8-LABEL: @test_ldrmb9( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 // BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]]) // BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> 
@llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) // BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> // BE-PWR8-NEXT: ret <16 x i8> [[TMP2]] // // LE-PWR8-LABEL: @test_ldrmb9( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 // LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP0]]) // LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]]) // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8> // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> // LE-PWR8-NEXT: ret <16 x i8> [[TMP3]] // // BE-PWR9-LABEL: @test_ldrmb9( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 9, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr 
[[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // LE-PWR9-LABEL: @test_ldrmb9( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 9, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP1:%.*]] = load 
ptr, ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // BE32-PWR9-LABEL: @test_ldrmb9( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 // BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]]) // BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]]) // BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> 
@llvm.ppc.altivec.lvsl(ptr [[TMP0]]) // BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) // BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> // BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]] // vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); } // BE-PWR8-LABEL: @test_strmb9( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7 // BE-PWR8-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb9( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 8 // LE-PWR8-NEXT: store i8 [[TMP8]], ptr [[TMP7]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb9( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 9, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: 
[[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb9( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 9, ptr [[__C_ADDR_I]], align 8 // 
LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb9( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], 
align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7 // BE32-PWR9-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb9(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 9, data); } // BE-PWR8-LABEL: @test_strmb10( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb10( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb10( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 10, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb10( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: 
[[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 10, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb10( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: 
[[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb10(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 10, data); } // BE-PWR8-LABEL: @test_strmb11( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 // BE-PWR8-NEXT: store 
i16 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb11( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 10 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb11( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = 
alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 11, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: 
[[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb11( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 11, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> 
[[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb11( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb11(char *ptr, vector unsigned 
char data) { __vec_strmb(ptr, 11, data); } // BE-PWR8-LABEL: @test_strmb12( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb12( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP8:%.*]] = 
getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb12( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 12, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr 
[[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb12( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 12, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // 
LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb12( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store 
i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb12(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 12, data); } // BE-PWR8-LABEL: @test_strmb13( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb13( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 
// LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 12 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb13( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: 
store i64 13, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb13( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // 
LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 13, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 
x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb13( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb13(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 13, data); } // BE-PWR8-LABEL: @test_strmb14( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: 
[[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 // BE-PWR8-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb14( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // 
LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6 // LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) // LE-PWR8-NEXT: store i16 [[TMP16]], ptr [[TMP13]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb14( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 14, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: 
[[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb14( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 14, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = 
getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb14( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x 
i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 // BE32-PWR9-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1 // BE32-PWR9-NEXT: ret void // void test_strmb14(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 14, data); } // BE-PWR8-LABEL: @test_strmb15( // BE-PWR8-NEXT: entry: // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1 // BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // BE-PWR8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 
// BE-PWR8-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1 // BE-PWR8-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // BE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1 // BE-PWR8-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 // BE-PWR8-NEXT: ret void // // LE-PWR8-LABEL: @test_strmb15( // LE-PWR8-NEXT: entry: // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1 // LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> // LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 // LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6 // LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) // LE-PWR8-NEXT: store i16 [[TMP16]], ptr [[TMP13]], align 1 // LE-PWR8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 // LE-PWR8-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP1]], i64 14 // LE-PWR8-NEXT: store i8 [[TMP18]], ptr 
[[TMP17]], align 1 // LE-PWR8-NEXT: ret void // // BE-PWR9-LABEL: @test_strmb15( // BE-PWR9-NEXT: entry: // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 15, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x 
i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb15( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 15, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 
16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // LE-PWR9-NEXT: ret void // // BE32-PWR9-LABEL: @test_strmb15( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3 // BE32-PWR9-NEXT: [[TMP9:%.*]] = 
extractelement <4 x i32> [[TMP6]], i64 1
// BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
// BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
// BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
// BE32-PWR9-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1
// BE32-PWR9-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
// BE32-PWR9-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1
// BE32-PWR9-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
// BE32-PWR9-NEXT: ret void
//
// Stores the 15 rightmost bytes of `data` to the (possibly unaligned) `ptr`;
// per the checks, pre-P9 targets use 8-/4-/2-/1-byte stores, P9 uses stxvll.
// NOTE(review): CHECK lines are autogenerated by update_cc_test_checks.py —
// regenerate them rather than hand-editing.
void test_strmb15(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 15, data); }
// BE-PWR8-LABEL: @test_ldrmb16(
// BE-PWR8-NEXT: entry:
// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
// BE-PWR8-NEXT: ret <16 x i8> [[TMP2]]
//
// LE-PWR8-LABEL: @test_ldrmb16(
// LE-PWR8-NEXT: entry:
// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> [[TMP2]], <16 x i32>
// LE-PWR8-NEXT: ret <16 x i8> [[TMP3]]
//
// BE-PWR9-LABEL: @test_ldrmb16(
// BE-PWR9-NEXT: entry:
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr,
ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 16, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // LE-PWR9-LABEL: @test_ldrmb16( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: store ptr 
[[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 16, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]]) // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]] // // BE32-PWR9-LABEL: @test_ldrmb16( // BE32-PWR9-NEXT: entry: // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 // BE32-PWR9-NEXT: [[TMP2:%.*]] = load <16 x 
i8>, ptr [[TMP0]], align 1
// BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]]
//
// Loads 16 bytes from the (possibly unaligned) `ptr` and returns them as a
// vector; per the checks this is a single unaligned <16 x i8> load (plus a
// byte-reversing shuffle on little-endian PWR8).
// NOTE(review): CHECK lines are autogenerated by update_cc_test_checks.py —
// regenerate them rather than hand-editing.
vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// BE-PWR8-LABEL: @test_strmb16(
// BE-PWR8-NEXT: entry:
// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE-PWR8-NEXT: store <16 x i8> [[TMP1]], ptr [[TMP0]], align 1
// BE-PWR8-NEXT: ret void
//
// LE-PWR8-LABEL: @test_strmb16(
// LE-PWR8-NEXT: entry:
// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32>
// LE-PWR8-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP0]], align 1
// LE-PWR8-NEXT: ret void
//
// BE-PWR9-LABEL: @test_strmb16(
// BE-PWR9-NEXT: entry:
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:
[[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: store i64 16, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]]) // BE-PWR9-NEXT: ret void // // LE-PWR9-LABEL: @test_strmb16( // LE-PWR9-NEXT: entry: // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: 
[[__B_ADDR_I:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8 // LE-PWR9-NEXT: store i64 16, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]] // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]]) // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16 // LE-PWR9-NEXT: [[TMP13:%.*]] = 
bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb16(
// BE32-PWR9-NEXT: entry:
// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT: store <16 x i8> [[TMP1]], ptr [[TMP0]], align 1
// BE32-PWR9-NEXT: ret void
//
// Stores all 16 bytes of `data` to the (possibly unaligned) `ptr`; per the
// checks this is a full unaligned vector store (stxvll with length 16 on P9).
// NOTE(review): CHECK lines are autogenerated by update_cc_test_checks.py —
// regenerate them rather than hand-editing.
void test_strmb16(char *ptr, vector unsigned char data) { __vec_strmb(ptr, 16, data); }