; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s

; OpenCL C source that the IR below corresponds to:
;
; kernel void combine_vloads(global char8* src, global char8* result) {
;   for (int i = 0; i < 1024; ++i)
;     result[i] = src[0] + src[1] + src[2] + src[3];
; }
;
; The four char8 operands (4 x 8 bytes = 32 bytes) are fetched by a single
; <8 x i32> load and then split apart with shufflevector, so the backend
; should emit
; 128-bit loads instead of many 8-bit

; EG-LABEL: {{^}}combine_vloads:
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @combine_vloads(ptr addrspace(1) nocapture %src, ptr addrspace(1) nocapture %result) nounwind {
entry:
  br label %for.body

for.exit:                                         ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; Loop counter: starts at 0 and is incremented once per iteration.
  %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]

  ; One 32-byte load covering src[0] through src[3], reinterpreted as bytes.
  %vecload2 = load <8 x i32>, ptr addrspace(1) %src, align 32
  %0 = bitcast <8 x i32> %vecload2 to <32 x i8>

  ; Extract each 8-byte slice (one char8 element of src) and sum them.
  ; Slice k selects bytes [8*k, 8*k + 7] of the loaded vector.
  %tmp5 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp8 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
  %tmp12 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
  %tmp16 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tmp17 = add nsw <8 x i8> %tmp13, %tmp16

  ; Store the 8-byte sum to result[i]; the GEP strides by sizeof(<8 x i8>).
  %scevgep = getelementptr <8 x i8>, ptr addrspace(1) %result, i32 %i.01
  %1 = bitcast <8 x i8> %tmp17 to <2 x i32>
  store <2 x i32> %1, ptr addrspace(1) %scevgep, align 8

  ; Advance the counter and leave the loop after 1024 iterations.
  %tmp19 = add nsw i32 %i.01, 1
  %exitcond = icmp eq i32 %tmp19, 1024
  br i1 %exitcond, label %for.exit, label %for.body
}