// RUN: mlir-opt %s --transform-interpreter | FileCheck %s
// RUN: mlir-opt %s --gpu-eliminate-barriers | FileCheck %s

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %0 {
      transform.apply_patterns.gpu.eliminate_barriers
    } : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: @read_read_write
func.func @read_read_write(%arg0: memref<?xf32>, %arg1: index)
    attributes {__parallel_region_boundary_for_test} {
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // The barrier between loads can be removed.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  // The barrier between load and store cannot be removed (unless we reason
  // about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: store
  memref.store %2, %arg0[%arg1] : memref<?xf32>
  return
}

// CHECK-LABEL: @write_read_read
func.func @write_read_read(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
    attributes {__parallel_region_boundary_for_test} {
  // CHECK: store
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  // The barrier between load and store cannot be removed (unless we reason
  // about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  return %2 : f32
}

// CHECK-LABEL: @write_in_a_loop
func.func @write_in_a_loop(%arg0: memref<?xf32>, %arg1: f32)
    attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    memref.store %arg1, %arg0[%i] : memref<?xf32>
    // Cannot remove this barrier because it guards the write-after-write
    // between different iterations.
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @read_read_write_loop
func.func @read_read_write_loop(%arg0: memref<?xf32>, %arg1: f32)
    attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // (Note that if the subscript were different, this would have been a race
    // with the store at the end of the loop.)
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // Guards read-after-write where the write happens on the previous iteration.
    // CHECK: barrier
    gpu.barrier
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
    // Guards write-after-read.
    // CHECK: barrier
    gpu.barrier
    memref.store %2, %arg0[%i] : memref<?xf32>
  }
  return
}
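
// A hedged sketch (added for illustration, not part of the upstream test, and
// deliberately carrying no CHECK lines): it spells out the race alluded to in
// the parenthetical note above. When a load reads a subscript that another
// iteration stores to, the trailing barrier is the only thing ordering the two
// accesses across threads, so it is expected to be preserved. The function
// name is hypothetical.
func.func @loop_cross_iteration_race_sketch(%arg0: memref<?xf32>)
    attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c41 = arith.constant 41 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c41 step %c1 {
    %next = arith.addi %i, %c1 : index
    // Reads the element that the store below writes on a later iteration.
    %0 = memref.load %arg0[%next] : memref<?xf32>
    memref.store %0, %arg0[%i] : memref<?xf32>
    // Guards the next iteration's load against this iteration's store; a
    // barrier like this one cannot be removed.
    gpu.barrier
  }
  return
}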

// CHECK-LABEL: @read_read_write_loop_trailing_sync
func.func @read_read_write_loop_trailing_sync(%arg0: memref<?xf32>, %arg1: f32)
    attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // CHECK: load
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // This can be removed because it only guards a read-after-read.
    // CHECK-NOT: barrier
    gpu.barrier
    // CHECK: load
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
    // CHECK: barrier
    gpu.barrier
    // CHECK: store
    memref.store %2, %arg0[%i] : memref<?xf32>
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @write_write_noalias
func.func @write_write_noalias(%arg0: index, %arg1: f32) -> (memref<42xf32>, memref<10xf32>)
    attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  %1 = memref.alloc() : memref<10xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove the two allocations don't alias.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %1[%arg0] : memref<10xf32>
  return %0, %1 : memref<42xf32>, memref<10xf32>
}

// CHECK-LABEL: @write_write_alloc_arg_noalias
func.func @write_write_alloc_arg_noalias(%arg0: index, %arg1: f32, %arg2: memref<?xf32>) -> memref<42xf32>
    attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove the local allocation doesn't
  // alias with a function argument.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %arg2[%arg0] : memref<?xf32>
  return %0 : memref<42xf32>
}

// CHECK-LABEL: @repeated_barrier
func.func @repeated_barrier(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
    attributes {__parallel_region_boundary_for_test} {
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK-NOT: gpu.barrier
  gpu.barrier
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  return %0 : f32
}

// CHECK-LABEL: @symmetric_stop
func.func @symmetric_stop(%val: f32) -> (f32, f32, f32, f32, f32)
    attributes {__parallel_region_boundary_for_test} {
  // CHECK: %[[A:.+]] = memref.alloc
  // CHECK: %[[B:.+]] = memref.alloc
  // CHECK: %[[C:.+]] = memref.alloc
  %A = memref.alloc() : memref<f32>
  %B = memref.alloc() : memref<f32>
  %C = memref.alloc() : memref<f32>

  // CHECK: memref.store %{{.*}}, %[[A]]
  memref.store %val, %A[] : memref<f32>
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  %0 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[B]]
  memref.store %val, %B[] : memref<f32>

  // This barrier is eliminated because the surrounding barriers are sufficient
  // to guard write/read on all memrefs.
  // CHECK-NOT: gpu.barrier
  gpu.barrier

  // CHECK: memref.load %[[A]]
  %1 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[C]]
  memref.store %val, %C[] : memref<f32>

  // CHECK: gpu.barrier
  gpu.barrier

  // CHECK: memref.load %[[A]]
  // CHECK: memref.load %[[B]]
  // CHECK: memref.load %[[C]]
  %2 = memref.load %A[] : memref<f32>
  %3 = memref.load %B[] : memref<f32>
  %4 = memref.load %C[] : memref<f32>
  return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
}
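
// A hedged sketch (added for illustration, not part of the upstream test, and
// deliberately carrying no CHECK lines): the converse of
// @write_write_alloc_arg_noalias. Two memref arguments may refer to
// overlapping memory, so nothing lets the analysis discharge the
// write-after-write conflict, and the barrier between the stores is expected
// to be preserved. The function name is hypothetical.
func.func @write_write_args_may_alias_sketch(%arg0: index, %arg1: f32, %arg2: memref<?xf32>, %arg3: memref<?xf32>)
    attributes {__parallel_region_boundary_for_test} {
  memref.store %arg1, %arg2[%arg0] : memref<?xf32>
  // %arg2 and %arg3 may alias; this barrier guards a potential
  // write-after-write and should remain.
  gpu.barrier
  memref.store %arg1, %arg3[%arg0] : memref<?xf32>
  return
}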