; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s -verify-machineinstrs | FileCheck %s target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" ; s8 -> f16 ; No widening define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s8f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r6 = #64 ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r7) ; CHECK-NEXT: r3:2 = combine(#31,#5) ; CHECK-NEXT: v3.h = vabs(v0.h) ; CHECK-NEXT: v4.h = vabs(v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r6) ; CHECK-NEXT: v7.h = vsplat(r3) ; CHECK-NEXT: v9 = vxor(v9,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = ##32768 ; CHECK-NEXT: v5.uh = vcl0(v3.uh) ; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v6.uh = vcl0(v4.uh) ; CHECK-NEXT: v5.h = vadd(v5.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vmux(q0,v10,v9) ; CHECK-NEXT: v6.h = vadd(v6.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vasl(v3.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vasl(v4.h,v6.h) ; CHECK-NEXT: v13 = vand(v3,v8) ; CHECK-NEXT: v11.h = vadd(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.h = vadd(v4.h,v7.h) ; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h) ; CHECK-NEXT: v8 = vand(v4,v8) ; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2) ; CHECK-NEXT: v13 = vmux(q2,v9,v2) ; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h) ; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2) ; CHECK-NEXT: v22 = vmux(q2,v9,v2) ; CHECK-NEXT: v21 = vmux(q1,v2,v9) ; CHECK-NEXT: v2 = vmux(q3,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2) ; CHECK-NEXT: v13.h = vadd(v11.h,v13.h) ; CHECK-NEXT: v24.h = vadd(v20.h,v22.h) ; CHECK-NEXT: v2.h = vadd(v2.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2) ; CHECK-NEXT: v23.h = vadd(v21.h,v7.h) ; CHECK-NEXT: v2.h = vsub(v2.h,v6.h) ; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7) ; CHECK-NEXT: v3.h = vsub(v23.h,v5.h) ; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h) ; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7) ; CHECK-NEXT: v28 = vmux(q3,v10,v9) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v9.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7) ; CHECK-NEXT: v5 = vmux(q2,v25,v11) ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7) ; CHECK-NEXT: v5 = vor(v27,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vasl(v3.h,r4) ; CHECK-NEXT: v4 = vmux(q1,v26,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vasl(v2.h,r4) ; CHECK-NEXT: v4 = vor(v28,v4) ; CHECK-NEXT: v29 = vor(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vor(v4,v2) ; CHECK-NEXT: v31 = vmux(q3,v9,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v9,v2) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = sitofp <128 x i8> %v0 to <128 x half> store <128 x half> %v1, ptr %a1, align 128 ret void } ; Widen input define void @s8f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s8f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r3:2 = combine(#64,#31) ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vsplat(r6) ; CHECK-NEXT: v4.h = vsplat(r2) ; CHECK-NEXT: v2.h = vabs(v0.h) ; CHECK-NEXT: v1 = vxor(v1,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r3) ; CHECK-NEXT: r5:4 = combine(##32768,#5) ; CHECK-NEXT: r2 = #10 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r5) ; CHECK-NEXT: v5.uh = vcl0(v2.uh) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v2.h,v4.h) ; CHECK-NEXT: v6 = vand(v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4) ; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h) ; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) ; CHECK-NEXT: v26 = vmux(q0,v1,v3) ; CHECK-NEXT: v3 = vmux(q1,v3,v1) ; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) ; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) ; CHECK-NEXT: q2 = vcmp.eq(v2.h,v25.h) ; CHECK-NEXT: v30 = vmux(q1,v8,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) ; CHECK-NEXT: v28.h = vsub(v3.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vasl(v28.h,r2) ; CHECK-NEXT: v3 = vmux(q2,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v30,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v1,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = sitofp <64 x i8> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; s8 -> f32 ; No widening define void @s8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s8f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r0) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r7 = #512 ; CHECK-NEXT: v9:8.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r7) ; CHECK-NEXT: r6 = ##-2147483648 ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v7:6.h = vunpack(v1.b) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: v1:0.w = vunpack(v8.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7:6.w = vunpack(v6.h) ; CHECK-NEXT: v5.w = vabs(v0.w) ; CHECK-NEXT: v10.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.w = vabs(v6.w) ; CHECK-NEXT: v13.w = vabs(v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vcl0(v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vcl0(v26.uw) ; CHECK-NEXT: v9.w = vadd(v9.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vcl0(v13.uw) ; CHECK-NEXT: v15.w = vadd(v12.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uw = vcl0(v10.uw) ; CHECK-NEXT: v12.w = vadd(v14.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.w = vasl(v26.w,v15.w) ; CHECK-NEXT: v11.w = vadd(v11.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13.w = vasl(v13.w,v12.w) ; CHECK-NEXT: v20 = vand(v27,v4) ; CHECK-NEXT: v19.w = vadd(v27.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v16.w = vasl(v5.w,v9.w) ; CHECK-NEXT: v5 = vxor(v5,v5) ; CHECK-NEXT: v23.w = vadd(v13.w,v3.w) ; CHECK-NEXT: v28 = vand(v13,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v17.w = vasl(v10.w,v11.w) ; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w) ; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw) ; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2) ; CHECK-NEXT: v30 = vmux(q3,v5,v2) ; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w) ; CHECK-NEXT: v22 = vand(v17,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2) ; CHECK-NEXT: v27 = vmux(q3,v5,v2) ; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w) ; CHECK-NEXT: v24 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) ; CHECK-NEXT: v22.w = vadd(v14.w,v30.w) ; CHECK-NEXT: v30.w = vadd(v17.w,v3.w) ; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2) ; CHECK-NEXT: v28.w = vadd(v31.w,v27.w) ; CHECK-NEXT: v3.w = vadd(v16.w,v3.w) ; CHECK-NEXT: v4 = vand(v16,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) ; CHECK-NEXT: v18 = vmux(q0,v8,v5) ; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) ; CHECK-NEXT: v26 = vmux(q1,v5,v2) ; CHECK-NEXT: v31 = vmux(q0,v8,v5) ; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r5) ; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) ; CHECK-NEXT: v15.w = vsub(v24.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: v14 = vmux(q2,v29,v14) ; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw) ; CHECK-NEXT: v15.w = vadd(v15.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) ; CHECK-NEXT: v19 = vmux(q3,v20,v19) ; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w) ; CHECK-NEXT: v27 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) ; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw) ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) ; CHECK-NEXT: v29 = vmux(q3,v5,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2) ; CHECK-NEXT: v19 = vor(v31,v19) ; CHECK-NEXT: v31 = vmux(q2,v2,v5) ; CHECK-NEXT: v2 = vmux(q0,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2) ; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) ; CHECK-NEXT: v2.w = vsub(v2.w,v9.w) ; CHECK-NEXT: v11.w = vsub(v31.w,v11.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w) ; CHECK-NEXT: v4.w = vsub(v27.w,v12.w) ; CHECK-NEXT: v2.w = vadd(v2.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0) ; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) ; CHECK-NEXT: v21.w = vadd(v11.w,v10.w) ; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0) ; CHECK-NEXT: v23 = vmux(q3,v16,v13) ; CHECK-NEXT: q3 = vcmp.gt(v5.w,v0.w) ; CHECK-NEXT: v24 = vmux(q2,v8,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) ; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v8 = vmux(q3,v8,v5) ; CHECK-NEXT: v10 = vor(v24,v23) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v21.w,r4) ; CHECK-NEXT: v3 = vmux(q0,v22,v3) ; CHECK-NEXT: v14 = vor(v18,v14) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r4) ; CHECK-NEXT: v3 = vor(v8,v3) ; CHECK-NEXT: v25 = vor(v10,v9) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v15.w = vasl(v15.w,r4) ; CHECK-NEXT: v2 = vor(v3,v2) ; CHECK-NEXT: v27 = vmux(q2,v5,v25) ; CHECK-NEXT: vmem(r1+#1) = v27.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.w = vasl(v4.w,r4) ; CHECK-NEXT: v29 = vmux(q3,v5,v2) ; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w) ; CHECK-NEXT: vmem(r1+#0) = v29.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28 = vor(v19,v26) ; CHECK-NEXT: v30 = vor(v14,v15) ; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v5,v28) ; CHECK-NEXT: v31 = vmux(q3,v5,v30) ; CHECK-NEXT: vmem(r1+#3) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#2) = v31 ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = sitofp <128 x i8> %v0 to <128 x float> store <128 x float> %v1, ptr %a1, align 128 ret void } ; Widen input #1 define void @s8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s8f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: v3:2.h = vunpack(v0.b) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r0) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r3) ; CHECK-NEXT: v3:2.w = vunpack(v2.h) ; CHECK-NEXT: v22 = vxor(v22,v22) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: r7 = ##-2147483648 ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r7) ; CHECK-NEXT: v4.w = vabs(v2.w) ; CHECK-NEXT: v5.w = vabs(v3.w) ; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12 = vsplat(r5) ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v11 = vmux(q0,v9,v22) ; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vcl0(v4.uw) ; CHECK-NEXT: v30 = vmux(q0,v9,v22) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vcl0(v5.uw) ; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.w = vadd(v8.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v5.w,v8.w) ; CHECK-NEXT: v13 = vand(v4,v10) ; CHECK-NEXT: v14.w = vadd(v4.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vand(v5,v10) ; CHECK-NEXT: v7.w = vadd(v5.w,v7.w) ; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw) ; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2) ; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w) ; CHECK-NEXT: v25 = vmux(q2,v1,v22) ; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v26 = vmux(q1,v22,v1) ; CHECK-NEXT: v27 = vmux(q3,v22,v1) ; CHECK-NEXT: v1 = vmux(q2,v1,v22) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v5.w = vadd(v14.w,v26.w) ; CHECK-NEXT: v29.w = vadd(v7.w,v27.w) ; CHECK-NEXT: v6.w = vsub(v25.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v1.w = vsub(v1.w,v8.w) ; CHECK-NEXT: v6.w = vadd(v6.w,v12.w) ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0) ; CHECK-NEXT: v1.w = vadd(v1.w,v12.w) ; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w) ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0) ; CHECK-NEXT: v5 = vmux(q1,v5,v28) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0) ; CHECK-NEXT: v5 = vor(v11,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) ; CHECK-NEXT: v4 = vmux(q3,v4,v7) ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r4) ; CHECK-NEXT: v4 = vor(v30,v4) ; CHECK-NEXT: v31 = vor(v5,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v4,v1) ; CHECK-NEXT: v0 = vmux(q3,v22,v31) ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vmux(q2,v22,v1) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v1.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = sitofp <64 x i8> %v0 to <64 x float> store <64 x float> %v1, ptr %a1, align 128 ret void } ; Widen input #2 define void @s8f32_2(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s8f32_2: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r0) ; CHECK-NEXT: v4 = vsplat(r3) ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r2) ; CHECK-NEXT: v8 = vsplat(r4) ; CHECK-NEXT: v5.w = vabs(v0.w) ; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r7) ; CHECK-NEXT: r2 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vcl0(v5.uw) ; CHECK-NEXT: v30 = vmux(q2,v7,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vadd(v5.w,v1.w) ; CHECK-NEXT: v4 = vand(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) ; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w) ; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) ; CHECK-NEXT: v4 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) ; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) ; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0) ; CHECK-NEXT: v2.w = vadd(v2.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r2) ; CHECK-NEXT: v1 = vmux(q3,v29,v28) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v30,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v3,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i8>, ptr %a0, align 128 %v1 = sitofp <32 x i8> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; s16 -> f16 ; No widening define void @s16f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s16f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r3:2 = combine(#64,#31) ; CHECK-NEXT: v1.h = vabs(v0.h) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vsplat(r6) ; CHECK-NEXT: v5.h = vsplat(r2) ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r3) ; CHECK-NEXT: r5:4 = combine(##32768,#5) ; CHECK-NEXT: v4.uh = vcl0(v1.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r5) ; CHECK-NEXT: r2 = #10 ; CHECK-NEXT: v4.h = vadd(v4.h,v3.h) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) ; CHECK-NEXT: v6 = vand(v1,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4) ; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h) ; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) ; CHECK-NEXT: v26 = vmux(q0,v2,v3) ; CHECK-NEXT: v3 = vmux(q1,v3,v2) ; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) ; CHECK-NEXT: v3.h = vadd(v3.h,v5.h) ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h) ; CHECK-NEXT: v30 = vmux(q1,v8,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) ; CHECK-NEXT: v28.h = vsub(v3.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v28.h,r2) ; CHECK-NEXT: v3 = vmux(q2,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v30,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v2,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i16>, ptr %a0, align 128 %v1 = sitofp <64 x i16> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; Widen input and result define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s16f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#31,#1) ; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: v1.h = vabs(v0.h) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r2) ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r7) ; CHECK-NEXT: r4 = ##32768 ; CHECK-NEXT: v4.uh = vcl0(v1.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r4) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) ; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v8,v3) ; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) ; CHECK-NEXT: v6 = vand(v1,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6) ; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h) ; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6) ; CHECK-NEXT: v26 = vmux(q1,v3,v2) ; CHECK-NEXT: v2 = vmux(q0,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) ; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) ; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2) ; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v28.h,r4) ; CHECK-NEXT: q3 = vsetq(r7) ; CHECK-NEXT: v2 = vmux(q3,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vor(v30,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v3,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <32 x i16>, ptr %a0, align 128 %v1 = sitofp <32 x i16> %v0 to <32 x half> store <32 x half> %v1, ptr %a1, align 128 ret void } ; s16 -> f32 ; No widening define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s16f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: r7 = #512 ; CHECK-NEXT: v4.w = vabs(v0.w) ; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: v9 = vsplat(r7) ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r6 = ##-2147483648 ; CHECK-NEXT: v7.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: v8.uw = vcl0(v6.uw) ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) ; CHECK-NEXT: v7.w = vadd(v7.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v8.w = vadd(v8.w,v3.w) ; CHECK-NEXT: v27 = vmux(q0,v10,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,v8.w) ; CHECK-NEXT: v11.w = vadd(v4.w,v5.w) ; CHECK-NEXT: v12 = vand(v4,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v6.w,v5.w) ; CHECK-NEXT: v9 = vand(v6,v9) ; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w) ; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2) ; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w) ; CHECK-NEXT: v23 = vmux(q1,v2,v3) ; CHECK-NEXT: v14 = vmux(q2,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v11.w = vadd(v22.w,v23.w) ; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw) ; CHECK-NEXT: v25 = vmux(q3,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v5.w = vadd(v24.w,v25.w) ; CHECK-NEXT: v3 = vmux(q2,v3,v2) ; CHECK-NEXT: v7.w = vsub(v14.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: v3.w = vsub(v3.w,v8.w) ; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w) ; CHECK-NEXT: v7.w = vadd(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0) ; CHECK-NEXT: v3.w = vadd(v3.w,v13.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uw = vlsr(v11.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) ; CHECK-NEXT: v4 = vmux(q3,v11,v4) ; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0) ; CHECK-NEXT: v28 = vmux(q3,v10,v2) ; CHECK-NEXT: v4 = vor(v27,v4) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v7.w,r4) ; CHECK-NEXT: v5 = vmux(q2,v5,v26) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v5 = vor(v28,v5) ; CHECK-NEXT: v29 = vor(v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v5,v3) ; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <64 x i16>, ptr %a0, align 128 %v1 = sitofp <64 x i16> %v0 to <64 x float> store <64 x float> %v1, ptr %a1, align 128 ret void } ; Widen input define void @s16f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s16f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: v4 = vsplat(r2) ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v2.w = vabs(v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) ; CHECK-NEXT: v1 = vxor(v1,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v5.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v29 = vsplat(r7) ; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w) ; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v2.w,v4.w) ; CHECK-NEXT: v6 = vand(v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6) ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w) ; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6) ; CHECK-NEXT: v6 = vmux(q0,v1,v3) ; CHECK-NEXT: v3 = vmux(q1,v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v4.w,v6.w) ; CHECK-NEXT: v27.w = vsub(v3.w,v5.w) ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0) ; CHECK-NEXT: v2.w = vadd(v27.w,v7.w) ; CHECK-NEXT: v4 = vmux(q2,v29,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r2) ; CHECK-NEXT: v3 = vmux(q3,v30,v28) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v4,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v1,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i16>, ptr %a0, align 128 %v1 = sitofp <32 x i16> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; s32 -> f16 ; No widening define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s32f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 ; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 ; CHECK-NEXT: v5.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: v3.uw = vcl0(v6.uw) ; CHECK-NEXT: v20 = vxor(v20,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v4.uw = vcl0(v5.uw) ; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vsplat(r4) ; CHECK-NEXT: r5 = ##-2147483648 ; CHECK-NEXT: v7.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: v6.w = vasl(v6.w,v3.w) ; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v5.w,v7.w) ; CHECK-NEXT: v26 = vmux(q0,v13,v20) ; CHECK-NEXT: v10.w = vadd(v6.w,v8.w) ; CHECK-NEXT: v11 = vand(v6,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vand(v5,v9) ; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w) ; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3) ; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w) ; CHECK-NEXT: v22 = vmux(q3,v20,v2) ; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) ; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) ; CHECK-NEXT: v24 = vmux(q2,v20,v2) ; CHECK-NEXT: v23 = vmux(q1,v2,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3) ; CHECK-NEXT: v2 = vmux(q3,v2,v20) ; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) ; CHECK-NEXT: v3.w = vsub(v23.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) ; CHECK-NEXT: v2.w = vsub(v2.w,v7.w) ; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w) ; CHECK-NEXT: v2.w = vadd(v2.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2) ; CHECK-NEXT: v6 = vmux(q3,v9,v6) ; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2) ; CHECK-NEXT: v30 = vmux(q3,v13,v20) ; CHECK-NEXT: v6 = vor(v26,v6) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v3.w,r3) ; CHECK-NEXT: v5 = vmux(q2,v28,v29) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r3) ; CHECK-NEXT: v31 = vor(v30,v5) ; CHECK-NEXT: v3 = vor(v6,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) ; CHECK-NEXT: v3 = vmux(q2,v20,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v20,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vdeal(v0.h) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i32>, ptr %a0, align 128 %v1 = sitofp <64 x i32> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; Widen result define void @s32f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s32f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: v1.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r6) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: r4 = #512 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: v6 = vsplat(r4) ; CHECK-NEXT: v4.uw = vcl0(v1.uw) ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28 = vsplat(r5) ; CHECK-NEXT: v29 = vsplat(r4) ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) ; CHECK-NEXT: v31 = vmux(q3,v29,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) ; CHECK-NEXT: v6 = vand(v1,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2) ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 ; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v27 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: q3 = vsetq(r2) ; CHECK-NEXT: v5.w = vadd(v1.w,v27.w) ; CHECK-NEXT: v2.w = vsub(v2.w,v4.w) ; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) ; CHECK-NEXT: v2.w = vadd(v2.w,v28.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r3) ; CHECK-NEXT: v1 = vmux(q2,v30,v1) ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf) ; CHECK-NEXT: v0 = vor(v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v3,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v1:0.qf32 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vdeal(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <32 x i32>, ptr %a0, align 128 %v1 = sitofp <32 x i32> %v0 to <32 x half> store <32 x half> %v1, ptr %a1, align 128 ret void } ; s32 -> f32 ; No widening define void @s32f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s32f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: v5 = vsplat(r2) ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) ; CHECK-NEXT: v4.uw = vcl0(v1.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v29 = vsplat(r7) ; CHECK-NEXT: r2 = #23 ; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) ; CHECK-NEXT: v6 = vand(v1,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w) ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) ; CHECK-NEXT: v6 = vmux(q0,v2,v3) ; CHECK-NEXT: v3 = vmux(q1,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v27.w = vsub(v3.w,v4.w) ; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w) ; CHECK-NEXT: v4 = vmux(q2,v29,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) ; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r2) ; CHECK-NEXT: v3 = vmux(q3,v30,v28) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v4,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v2,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i32>, ptr %a0, align 128 %v1 = sitofp <32 x i32> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; Widen input and result define void @s32f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: s32f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r0) ; CHECK-NEXT: v5 = vsplat(r2) ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) ; CHECK-NEXT: v4.uw = vcl0(v1.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v29 = vsplat(r7) ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) ; CHECK-NEXT: v6 = vand(v1,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) ; CHECK-NEXT: v6 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v27.w = vsub(v2.w,v4.w) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) ; CHECK-NEXT: v4 = vmux(q3,v29,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) ; CHECK-NEXT: q3 = vsetq(r2) ; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r3) ; CHECK-NEXT: v2 = vmux(q2,v30,v28) ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vor(v4,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v3,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <16 x i32>, ptr %a0, align 128 %v1 = sitofp <16 x i32> %v0 to <16 x float> store <16 x float> %v1, ptr %a1, align 128 ret void } ; u8 -> f16 ; No widening define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u8f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r3:2 = combine(#31,#5) ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vsplat(r6) ; CHECK-NEXT: v4.h = vsplat(r3) ; CHECK-NEXT: r5 = #64 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v5.uh = vcl0(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vcl0(v1.uh) ; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v7.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vasl(v0.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.h = vasl(v1.h,v7.h) ; CHECK-NEXT: v10 = vand(v8,v6) ; CHECK-NEXT: v9.h = vadd(v8.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.h = vadd(v11.h,v4.h) ; CHECK-NEXT: v6 = vand(v11,v6) ; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh) ; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2) ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h) ; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh) ; CHECK-NEXT: v12 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2) ; CHECK-NEXT: v13 = vmux(q2,v2,v3) ; CHECK-NEXT: v25 = vmux(q0,v3,v2) ; CHECK-NEXT: v3 = vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2) ; CHECK-NEXT: v24.h = vadd(v9.h,v12.h) ; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) ; CHECK-NEXT: v12.h = vadd(v25.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2) ; CHECK-NEXT: v13.h = vadd(v8.h,v13.h) ; CHECK-NEXT: v5.h = vsub(v12.h,v5.h) ; CHECK-NEXT: v3.h = vsub(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6) ; CHECK-NEXT: q2 = vcmp.eq(v21.h,v9.h) ; CHECK-NEXT: q3 = vcmp.eq(v23.h,v8.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6) ; CHECK-NEXT: v4 = vmux(q2,v26,v14) ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.h = vasl(v5.h,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vasl(v3.h,r4) ; CHECK-NEXT: v29 = vor(v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v6,v3) ; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = uitofp <128 x i8> %v0 to <128 x half> store <128 x half> %v1, ptr %a1, align 128 ret void } ; Widen input define void @u8f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u8f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r3:2 = combine(#64,#31) ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vsplat(r6) ; CHECK-NEXT: v4.h = vsplat(r2) ; CHECK-NEXT: r5 = #5 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v3.uh = vcl0(v0.uh) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vadd(v3.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vasl(v0.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) ; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) ; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5) ; CHECK-NEXT: v27 = vmux(q1,v2,v1) ; CHECK-NEXT: v1 = vmux(q0,v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vadd(v1.h,v4.h) ; CHECK-NEXT: v28.h = vadd(v26.h,v27.h) ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6) ; CHECK-NEXT: v1.h = vsub(v1.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v1.h,r4) ; CHECK-NEXT: v3 = vmux(q2,v30,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v2,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = uitofp <64 x i8> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; u8 -> f32 ; No widening define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u8f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: r0 = #1 ; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r0) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v15 = vsplat(r6) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1:0.uw = vunpack(v30.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vcl0(v0.uw) ; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vcl0(v3.uw) ; CHECK-NEXT: v11.w = vadd(v7.w,v4.w) ; CHECK-NEXT: v7 = vxor(v7,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vcl0(v1.uw) ; CHECK-NEXT: v10.w = vadd(v8.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r5) ; CHECK-NEXT: v14.w = vasl(v0.w,v11.w) ; CHECK-NEXT: v8.w = vadd(v9.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.w = vasl(v2.w,v5.w) ; CHECK-NEXT: v24 = vand(v14,v15) ; CHECK-NEXT: v20.w = vadd(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13.w = vasl(v3.w,v10.w) ; CHECK-NEXT: v19 = vand(v12,v15) ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) ; CHECK-NEXT: v18.w = vadd(v12.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v16.w = vasl(v1.w,v8.w) ; CHECK-NEXT: v23 = vand(v13,v15) ; CHECK-NEXT: v22.w = vadd(v13.w,v6.w) ; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v16.w,v6.w) ; CHECK-NEXT: v15 = vand(v16,v15) ; CHECK-NEXT: v30 = vmux(q3,v7,v4) ; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2) ; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w) ; CHECK-NEXT: v28 = vmux(q0,v4,v7) ; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2) ; CHECK-NEXT: v26 = vmux(q3,v7,v4) ; CHECK-NEXT: v11.w = vsub(v28.w,v11.w) ; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: v20.w = vadd(v14.w,v30.w) ; CHECK-NEXT: v30 = vmux(q1,v7,v4) ; CHECK-NEXT: v31 = vmux(q2,v7,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) ; CHECK-NEXT: v29.w = vadd(v15.w,v26.w) ; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw) ; CHECK-NEXT: v11.w = vadd(v11.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) ; CHECK-NEXT: v23.w = vadd(v19.w,v31.w) ; CHECK-NEXT: v22 = vmux(q3,v4,v7) ; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0) ; CHECK-NEXT: v31.w = vadd(v28.w,v30.w) ; CHECK-NEXT: v30 = vmux(q1,v4,v7) ; CHECK-NEXT: v4 = vmux(q3,v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2) ; CHECK-NEXT: v5.w = vsub(v30.w,v5.w) ; CHECK-NEXT: v29.w = vsub(v22.w,v10.w) ; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2) ; CHECK-NEXT: v6.w = vadd(v29.w,v9.w) ; CHECK-NEXT: v5.w = vadd(v5.w,v9.w) ; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) ; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w) ; CHECK-NEXT: v4.w = vadd(v4.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) ; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0) ; CHECK-NEXT: v23 = vmux(q2,v21,v23) ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0) ; CHECK-NEXT: v8 = vmux(q3,v31,v16) ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) ; CHECK-NEXT: v22 = vmux(q1,v24,v26) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v5.w,r4) ; CHECK-NEXT: v6 = vor(v8,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0) ; CHECK-NEXT: v25 = vor(v23,v5) ; CHECK-NEXT: v26 = vmux(q2,v7,v6) ; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) ; CHECK-NEXT: v28 = vmux(q3,v7,v25) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) ; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.w = vasl(v11.w,r4) ; CHECK-NEXT: v20 = vmux(q0,v20,v27) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.w = vasl(v4.w,r4) ; CHECK-NEXT: v29 = vor(v20,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vor(v22,v24) ; CHECK-NEXT: v31 = vmux(q3,v7,v29) ; CHECK-NEXT: vmem(r1+#2) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v7,v27) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = uitofp <128 x i8> %v0 to <128 x float> store <128 x float> %v1, ptr %a1, align 128 ret void } ; Widen input #1 define void @u8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u8f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r7) ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) ; CHECK-NEXT: v21 = vxor(v21,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vcl0(v3.uw) ; CHECK-NEXT: v4.w = vadd(v4.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v3.w,v5.w) ; CHECK-NEXT: v11 = vand(v7,v8) ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) ; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w) ; CHECK-NEXT: v8 = vand(v9,v8) ; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2) ; CHECK-NEXT: v24 = vmux(q1,v21,v1) ; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w) ; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: v25 = vmux(q0,v1,v21) ; CHECK-NEXT: v27 = vmux(q3,v21,v1) ; CHECK-NEXT: v1 = vmux(q1,v1,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) ; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) ; CHECK-NEXT: v10.w = vadd(v22.w,v24.w) ; CHECK-NEXT: v28.w = vadd(v23.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) ; CHECK-NEXT: v1.w = vadd(v1.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7) ; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7) ; CHECK-NEXT: v5 = vmux(q2,v30,v11) ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) ; CHECK-NEXT: v6 = vmux(q3,v6,v29) ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r4) ; CHECK-NEXT: v31 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v6,v1) ; CHECK-NEXT: v0 = vmux(q3,v21,v31) ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vmux(q2,v21,v1) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v1.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = uitofp <64 x i8> %v0 to <64 x float> store <64 x float> %v1, ptr %a1, align 128 ret void } ; Widen input #2 define void @u8f32_2(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u8f32_2: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r6) ; CHECK-NEXT: v4 = vsplat(r3) ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5:4 = combine(##159,#8) ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r2) ; CHECK-NEXT: v7 = vsplat(r5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vcl0(v0.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v0.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vadd(v6.w,v1.w) ; CHECK-NEXT: v4 = vand(v6,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) ; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v1.uw) ; CHECK-NEXT: q1 = vcmp.eq(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r4) ; CHECK-NEXT: v4 = vmux(q1,v3,v2) ; CHECK-NEXT: v2 = vmux(q0,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) ; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uw = vlsr(v1.uw,r6) ; CHECK-NEXT: v2.w = vadd(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r4) ; CHECK-NEXT: v1 = vmux(q2,v30,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v3,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i8>, ptr %a0, align 128 %v1 = uitofp <32 x i8> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; u16 -> f16 ; No widening define void @u16f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u16f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#64,#1) ; CHECK-NEXT: r5 = #31 ; CHECK-NEXT: v1.uh = vcl0(v0.uh) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r2) ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r4 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vsplat(r5) ; CHECK-NEXT: r3 = #10 ; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4) ; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h) ; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4) ; CHECK-NEXT: v27 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) ; CHECK-NEXT: v28.h = vadd(v26.h,v27.h) ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2) ; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v1.h,r3) ; CHECK-NEXT: v2 = vmux(q2,v30,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v3,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i16>, ptr %a0, align 128 %v1 = uitofp <64 x i16> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; Widen input and result define void @u16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u16f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#31,#1) ; CHECK-NEXT: r6 = #64 ; CHECK-NEXT: v1.uh = vcl0(v0.uh) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r2) ; CHECK-NEXT: v4.h = vsplat(r3) ; CHECK-NEXT: r5 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.h = vsplat(r6) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) ; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: q3 = vsetq(r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) ; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h) ; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5) ; CHECK-NEXT: v5 = vmux(q1,v3,v2) ; CHECK-NEXT: v2 = vmux(q0,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) ; CHECK-NEXT: v28.h = vadd(v7.h,v5.h) ; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) ; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.h = vasl(v1.h,r4) ; CHECK-NEXT: v2 = vmux(q1,v30,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v3,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <32 x i16>, ptr %a0, align 128 %v1 = uitofp <32 x i16> %v0 to <32 x half> store <32 x half> %v1, ptr %a1, align 128 ret void } ; u16 -> f32 ; No widening define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u16f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r7) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v4.uw = vcl0(v0.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) ; CHECK-NEXT: v5.uw = vcl0(v1.uw) ; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) ; CHECK-NEXT: v11 = vand(v7,v8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) ; CHECK-NEXT: v8 = vand(v9,v8) ; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w) ; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w) ; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw) ; CHECK-NEXT: v20 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: v22 = vmux(q2,v2,v3) ; CHECK-NEXT: v25 = vmux(q0,v3,v2) ; CHECK-NEXT: v3 = vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) ; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) ; CHECK-NEXT: v23.w = vadd(v19.w,v20.w) ; CHECK-NEXT: v10.w = vadd(v21.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v14.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13.uw = vlsr(v19.uw,r7) ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v23.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: v5 = vmux(q2,v26,v13) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vor(v6,v3) ; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } %v0 = load <64 x i16>, ptr %a0, align 128 %v1 = uitofp <64 x i16> %v0 to <64 x float> store <64 x float> %v1, ptr %a1, align 128 ret void } ; Widen input define void @u16f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u16f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r6 = #1 ; CHECK-NEXT: r2 = #255 ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r6) ; CHECK-NEXT: v4 = vsplat(r2) ; CHECK-NEXT: r3 = #512 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: r5:4 = combine(##159,#8) ; CHECK-NEXT: v3.uw = vcl0(v0.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) ; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw) ; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4) ; CHECK-NEXT: v5 = vmux(q1,v2,v1) ; CHECK-NEXT: v1 = vmux(q0,v1,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vsub(v1.w,v3.w) ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r4) ; CHECK-NEXT: v3 = vmux(q2,v3,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v2,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i16>, ptr %a0, align 128 %v1 = uitofp <32 x i16> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; u32 -> f16 ; No widening define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u32f16_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 ; CHECK-NEXT: v3.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 ; CHECK-NEXT: v4.uw = vcl0(v1.uw) ; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v6 = vsplat(r6) ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v9 = vxor(v9,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r4) ; CHECK-NEXT: v5.w = vasl(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.w = vasl(v0.w,v3.w) ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v13 = vand(v5,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) ; CHECK-NEXT: v7 = vand(v8,v7) ; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw) ; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) ; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) ; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w) ; CHECK-NEXT: v28 = vmux(q2,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) ; CHECK-NEXT: v29 = vmux(q1,v2,v9) ; CHECK-NEXT: v30 = vmux(q3,v2,v9) ; CHECK-NEXT: v2 = vmux(q0,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v29.w,v4.w) ; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) ; CHECK-NEXT: v3.w = vsub(v30.w,v3.w) ; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) ; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) ; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) ; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) ; CHECK-NEXT: v5 = vmux(q3,v7,v5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r3) ; CHECK-NEXT: v31 = vmux(q1,v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v3.w,r3) ; CHECK-NEXT: v4 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) ; CHECK-NEXT: v3 = vmux(q2,v9,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v9,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v9.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vdeal(v0.h) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <64 x i32>, ptr %a0, align 128 %v1 = uitofp <64 x i32> %v0 to <64 x half> store <64 x half> %v1, ptr %a1, align 128 ret void } ; Widen result define void @u32f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u32f16_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##512,#1) ; CHECK-NEXT: v1.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r2) ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: r6 = #255 ; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r6) ; CHECK-NEXT: r5 = #8 ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v1.w = vadd(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w) ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) ; CHECK-NEXT: v5 = vmux(q0,v2,v3) ; CHECK-NEXT: v3 = vmux(q1,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vsub(v3.w,v1.w) ; CHECK-NEXT: v30.w = vadd(v4.w,v5.w) ; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 ; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r3) ; CHECK-NEXT: q3 = vsetq(r2) ; CHECK-NEXT: v3 = vmux(q1,v3,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf) ; CHECK-NEXT: v0 = vor(v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v2,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v1:0.qf32 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vdeal(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <32 x i32>, ptr %a0, align 128 %v1 = uitofp <32 x i32> %v0 to <32 x half> store <32 x half> %v1, ptr %a1, align 128 ret void } ; u32 -> f32 ; No widening define void @u32f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u32f32_0: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##512,#1) ; CHECK-NEXT: v1.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: r6 = #255 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r6) ; CHECK-NEXT: r5 = #8 ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) ; CHECK-NEXT: v5 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r3) ; CHECK-NEXT: v2 = vmux(q2,v2,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v3,v31) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } %v0 = load <32 x i32>, ptr %a0, align 128 %v1 = uitofp <32 x i32> %v0 to <32 x float> store <32 x float> %v1, ptr %a1, align 128 ret void } ; Widen input and result define void @u32f32_1(ptr %a0, ptr %a1) #0 { ; CHECK-LABEL: u32f32_1: ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##512,#1) ; CHECK-NEXT: v1.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: v5 = vsplat(r3) ; CHECK-NEXT: r6 = #255 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r6) ; CHECK-NEXT: r5 = #8 ; CHECK-NEXT: r4 = #159 ; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) ; CHECK-NEXT: v5 = vand(v6,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) ; CHECK-NEXT: v5 = vmux(q0,v3,v2) ; CHECK-NEXT: v2 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) ; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 ; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vasl(v1.w,r3) ; CHECK-NEXT: q3 = vsetq(r2) ; CHECK-NEXT: v2 = vmux(q1,v2,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31 = vor(v2,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q2,v3,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 ; CHECK-NEXT: } %v0 = load <16 x i32>, ptr %a0, align 128 %v1 = uitofp <16 x i32> %v0 to <16 x float> store <16 x float> %v1, ptr %a1, align 128 ret void } attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }