// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s

// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null

// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null

// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>,
// CHECK-SAME: %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
    %A0 : tensor<?xf32> {bufferization.writable = false},
    %A1 : tensor<?xf32> {bufferization.writable = true},
    %t0 : tensor<4xf32> {bufferization.writable = false},
    %t1 : tensor<4xf32> {bufferization.writable = true})
  -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
{
  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
  // CHECK: %[[REALLOC3:.*]] = memref.alloc
  // CHECK: memref.copy %[[A0]], %[[REALLOC3]]
  // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC3]]
  // CHECK: memref.copy %[[t0]], %[[SV_A0]]
  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
  // CHECK: %[[REALLOC2:.*]] = memref.alloc
  // CHECK: memref.copy %[[A0]]
  // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC2]]
  // CHECK: memref.copy %[[t1]], %[[SV_A0_2]]
  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
  // CHECK: %[[REALLOC1:.*]] = memref.alloc
  // CHECK: memref.copy %[[A1]]
  // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC1]]
  // CHECK: memref.copy %[[t0]], %[[SV_A1]]
  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // Do not realloc the large tensor. Copy the tensor.extract_slice.
  // CHECK-NOT: alloc
  // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
  // CHECK: memref.copy %[[t1]], %[[SV_A1_2]]
  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // CHECK: return %[[REALLOC3]], %[[REALLOC2]], %[[REALLOC1]] :
  // CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
    %A : tensor<?xf32> {bufferization.writable = true},
    %t : tensor<4xf32> {bufferization.writable = false})
  -> tensor<?xf32>
{
  %f0 = arith.constant 0.0 : f32

  // CHECK-NOT: alloc
  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
  // CHECK: memref.copy %[[t]], %[[SV_A]]
  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>

  /// Overwrite A inplace.
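  /// The fill below lands directly in %A's buffer (note outs(%[[A]]) in the
  /// CHECK), so no extra allocation is created for it.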
  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
  %r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor<?xf32>) -> tensor<?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r1: tensor<?xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_fun
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun(
    %A : tensor<?xf32> {bufferization.writable = true},
    %t : tensor<4xf32> {bufferization.writable = false})
  -> tensor<?xf32>
{
  %f0 = arith.constant 0.0 : f32

  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
  %r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>

  // CHECK-NOT: alloc
  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
  /// Overwrite A inplace by copying into the subview.
  // CHECK: memref.copy %[[t]], %[[SV_A]]
  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r1: tensor<?xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_fun_not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, strided<[?], offset: ?>>
func.func @insert_slice_fun_not_inplace(
    %A : tensor<?xf32> {bufferization.writable = false},
    %t : tensor<4xf32> {bufferization.writable = false})
  -> tensor<?xf32>
{
  // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 64 : i64} : memref<?xf32>
  // CHECK: memref.copy %[[A]], %[[ALLOC]] : memref<?xf32{{.*}} to memref<?xf32>
  // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32, strided<[1]>>
  // CHECK: memref.copy %[[t]], %[[SV]] : memref<4xf32, strided{{.*}}> to memref<4xf32, strided<[1]>>
  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // CHECK: return %{{.*}} : memref<?xf32>
  return %r0: tensor<?xf32>
}

// -----

// This test case could bufferize in-place with a better analysis. However, it
// is simpler to let the canonicalizer fold away the tensor.insert_slice.

// CHECK-LABEL: func @tensor_cast_not_in_place(
// CHECK-SAME: %[[A:.*]]: memref<?xf32{{.*}}>, %[[B:.*]]: memref<?xf32{{.*}}>
// CHECK: %[[casted:.*]] = memref.cast %[[A]] : memref<?xf32, strided<[?], offset: ?>> to memref<4xf32, strided<[?], offset: ?>>
// CHECK: %[[alloc:.*]] = memref.alloc
// CHECK: memref.copy %[[casted]], %[[alloc]]
// CHECK: %[[subview:.*]] = memref.subview %[[A]][{{.*}}] [4] [1] : {{.*}} to memref<4xf32
// CHECK: memref.copy %[[alloc]], %[[subview]]
func.func @tensor_cast_not_in_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = false}, %idx: index)
  -> (tensor<?xf32>)
{
  %r0 = tensor.cast %A : tensor<?xf32> to tensor<4xf32>
  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
  return %r1 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @insert_op
// CHECK-SAME: %[[t1:.*]]: memref<?xf32, strided{{.*}}>, %[[s:.*]]: f32, %[[i:.*]]: index
func.func @insert_op(%t1 : tensor<?xf32> {bufferization.writable = true},
                     %s : f32, %i : index) -> tensor<?xf32> {
  // CHECK: memref.store %[[s]], %[[t1]][%[[i]]]
  %0 = tensor.insert %s into %t1[%i] : tensor<?xf32>
  // CHECK: return
  return %0 : tensor<?xf32>
}

// -----

// A regression test to make sure that we handle rank-reducing extract_slice
// correctly.
// CHECK-LABEL: func @rank_reducing
func.func @rank_reducing(
    %i: index, %j: index,
    %arg0: tensor<8x18x32xf32>) -> tensor<?x1x6x8xf32> {
  %c1 = arith.constant 1 : index
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  %c32 = arith.constant 32 : index
  %c0 = arith.constant 0 : index
  %0 = bufferization.alloc_tensor() : tensor<4x1x6x8xf32>
  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
  %2 = bufferization.alloc_tensor() : tensor<1x6x8xf32>
  %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
    %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
    %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
    %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
      %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
      %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
      scf.yield %12 : tensor<1x6x8xf32>
    }
    %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
    scf.yield %10 : tensor<?x1x6x8xf32>
  }
  return %5: tensor<?x1x6x8xf32>
}

// -----

// CHECK-LABEL: func.func @rank_reducing_parallel_insert_slice
func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tensor<200x100xf32>) {
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK: scf.forall {{.*}} {
  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.forall.in_parallel {
      // CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
      // CHECK: memref.subview %{{.*}}[1, %{{.*}}] [1, 1] [1, 1] : memref<200x100xf32, strided<[?, ?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
      tensor.parallel_insert_slice %1 into %o[1, %thread_idx][1, 1][1, 1] : tensor<1xf32> into tensor<200x100xf32>
    }
  }
  // CHECK: }
  return
}

// -----

// This test case could bufferize in-place with a better analysis. However, it
// is simpler to let the canonicalizer fold away the tensor.insert_slice.
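// As currently bufferized, the CHECKs below show an allocation for the fill
// result followed by a copy back into the insert_slice destination buffer.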
// CHECK-LABEL: func @insert_equivalent_tensor
func.func @insert_equivalent_tensor(%t: tensor<10xf32>) -> tensor<10xf32> {
  // CHECK: memref.alloc
  %cst = arith.constant 4.200000e+01 : f32
  // CHECK: linalg.fill
  %0 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>
  // CHECK: memref.copy
  %1 = tensor.insert_slice %0 into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
  return %1 : tensor<10xf32>
}

// -----

// CHECK-LABEL: func @pad_memory_space(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, strided<[?], offset: ?>>
func.func @pad_memory_space(%t: tensor<?xf32>, %h1: index, %f: f32, %pos: index) -> f32 {
  // CHECK: %[[alloc_tensor:.*]] = memref.alloc{{.*}} : memref<?xf32, 3>
  // CHECK: memref.copy %[[t]], %[[alloc_tensor]]
  %0 = bufferization.alloc_tensor() copy(%t) {memory_space = 3 : i64} : tensor<?xf32>
  // CHECK: %[[padded_alloc:.*]] = memref.alloc() {{.*}} : memref<15xf32, 3>
  // CHECK: linalg.map
  // CHECK: outs(%[[padded_alloc]] : memref<15xf32, 3>)
  // CHECK: linalg.yield %{{.*}}
  // CHECK: }
  // CHECK: %[[subview:.*]] = memref.subview {{.*}} : memref<15xf32, 3> to memref<?xf32, strided<[1], offset: 2>, 3>
  // CHECK: memref.copy %[[alloc_tensor]], %[[subview]]
  %1 = tensor.pad %0 low[2] high[%h1] {
  ^bb0(%arg0: index):
    tensor.yield %f : f32
  } : tensor<?xf32> to tensor<15xf32>
  // CHECK: memref.load {{.*}} : memref<15xf32, 3>
  %2 = tensor.extract %1[%pos] : tensor<15xf32>
  return %2 : f32
}

// -----

// CHECK-LABEL: func @insert_slice_regression(
// CHECK-SAME: %[[t:.*]]: memref<10xf32,{{.*}}>, %[[b:.*]]: memref<5xf32
func.func @insert_slice_regression(%t: tensor<10xf32>, %b: tensor<5xf32>) -> tensor<10xf32> {
  %cst = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
  // CHECK: linalg.fill {{.*}} outs(%[[alloc]] : memref<10xf32>)
  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>

  // Read %1 so that it does not DCE away.
  %vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
  vector.print %vec : vector<10xf32>

  // Write back a different value (not %1).
  // CHECK: %[[subview:.*]] = memref.subview %[[t]][0] [5] [1]
  // CHECK: memref.copy %[[b]], %[[subview]]
  %2 = tensor.insert_slice %b into %t[0][5][1] : tensor<5xf32> into tensor<10xf32>
  return %2 : tensor<10xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_full_overwrite(
// CHECK-SAME: %[[t:.*]]: memref<10xf32,{{.*}}>, %[[b:.*]]: memref<10xf32,{{.*}}>
func.func @insert_slice_full_overwrite(%t: tensor<10xf32>, %b: tensor<10xf32>) -> tensor<10xf32> {
  %cst = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  // CHECK: linalg.fill {{.*}} outs(%[[t]] : memref<10xf32,{{.*}}>)
  %1 = linalg.fill ins(%cst : f32) outs(%t : tensor<10xf32>) -> tensor<10xf32>

  // Read %1 so that it does not DCE away.
  %vec = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<10xf32>
  vector.print %vec : vector<10xf32>

  // Write back a different value (not %1).
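  // Unlike @insert_slice_regression above, the fill can bufferize in place on
  // %t because the insert_slice below overwrites the entire tensor, so the
  // original contents of %t are never read.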
  // CHECK: %[[subview:.*]] = memref.subview %[[t]][0] [10] [1]
  // CHECK: memref.copy %[[b]], %[[subview]]
  %2 = tensor.insert_slice %b into %t[0][10][1] : tensor<10xf32> into tensor<10xf32>
  return %2 : tensor<10xf32>
}

// -----

// CHECK-LABEL: func @dim_not_reading(
// CHECK-SAME: %[[t:.*]]: memref<?xf32
func.func @dim_not_reading(%t: tensor<?xf32>, %f: f32, %pos: index) -> (tensor<?xf32>, index) {
  %c0 = arith.constant 0 : index
  // CHECK-NOT: memref.alloc
  // CHECK-NOT: memref.copy
  // CHECK: memref.store %{{.*}}, %[[t]]
  %0 = tensor.insert %f into %t[%pos] : tensor<?xf32>
  // CHECK: memref.dim %[[t]]
  %1 = tensor.dim %t, %c0 : tensor<?xf32>
  return %0, %1 : tensor<?xf32>, index
}

// -----

// CHECK: #[[$map:.*]] = affine_map<(d0) -> (d0 + 5)>
// CHECK-LABEL: func.func @cast_retains_buffer_layout(
// CHECK-SAME: %[[t:.*]]: memref<?xf32, #[[$map]]>, %[[sz:.*]]: index) -> memref<?xf32, strided<[1], offset: 7>> {
// CHECK: %[[casted:.*]] = memref.cast %[[t]] : memref<?xf32, #[[$map]]> to memref<10xf32, #[[$map]]>
// CHECK: %[[slice:.*]] = memref.subview %[[casted]][2] [%[[sz]]] [1] : memref<10xf32, #[[$map]]> to memref<?xf32, strided<[1], offset: 7>>
// CHECK: return %[[slice]]
func.func @cast_retains_buffer_layout(
    %t: tensor<?xf32> {bufferization.buffer_layout = affine_map<(d0) -> (d0 + 5)>},
    %sz: index)
  -> (tensor<10xf32>, tensor<?xf32>)
{
  %casted = tensor.cast %t : tensor<?xf32> to tensor<10xf32>
  %slice = tensor.extract_slice %casted[2][%sz][1] : tensor<10xf32> to tensor<?xf32>

  // Note: The %casted return type is folded away because both buffers are
  // equivalent. Therefore, we currently lose some static type information
  // in the caller.
  return %casted, %slice : tensor<10xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func.func @parallel_insert_slice_source_out_of_place
func.func @parallel_insert_slice_source_out_of_place(%in: tensor<1xf32>, %out: tensor<100xf32>, %f: f32) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 50 : index
  // CHECK: scf.forall {{.*}} {
  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<100xf32> {
    // The tensor.insert must bufferize out-of-place.
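    // (%in is read again by the tensor.extract below, so the inserted value
    // must go into a new allocation.)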
    // CHECK: memref.alloc
    // CHECK: memref.store
    %insert = tensor.insert %f into %in[%c0] : tensor<1xf32>
    %r = tensor.extract %in[%c0] : tensor<1xf32>
    vector.print %r : f32
    // CHECK: memref.copy
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %insert into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32>
    }
  }
  // CHECK: }
  return
}

// -----

// CHECK-LABEL: func @tensor.reshape(
func.func @tensor.reshape() -> tensor<2x2x5xf32> {
  // CHECK-DAG: %[[M1:.*]] = memref.cast %{{.*}} : memref<2x10xf32> to memref<?x10xf32>
  %t1_static = arith.constant dense<0.> : tensor<2x10xf32>
  %t1 = tensor.cast %t1_static : tensor<2x10xf32> to tensor<?x10xf32>

  // CHECK: %[[SHAPE:.*]] = memref.get_global @{{.*}} : memref<3xi64>
  %shape = arith.constant dense<[2, 2, 5]> : tensor<3xi64>

  // CHECK: %[[RESHAPED:.*]] = memref.reshape %[[M1]](%[[SHAPE]]) : (memref<?x10xf32>, memref<3xi64>) -> memref<2x2x5xf32>
  %reshaped = tensor.reshape %t1(%shape) : (tensor<?x10xf32>, tensor<3xi64>) -> tensor<2x2x5xf32>

  // CHECK: return %[[RESHAPED]]
  return %reshaped : tensor<2x2x5xf32>
}

// -----

// CHECK-LABEL: @reshape_with_non_identity_layout(
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>,
// CHECK-SAME: %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>,
func.func @reshape_with_non_identity_layout(%arg0: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>, %arg1: tensor<2xi32>, %idx: index) -> f32 {
  %t = bufferization.to_tensor %arg0 restrict : memref<2x2xf32, strided<[?, ?], offset: ?>, 3>

  // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to memref<2xf32, strided<[?], offset: ?>, 3>
  %extracted_slice = tensor.extract_slice %t[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>

  // To satisfy the constraints of memref.reshape, the subview must be
  // reallocated into a buffer with an identity layout.
  // CHECK: %[[ALLOC:.+]] = memref.alloc() {{.*}} : memref<2xf32, 3>
  // CHECK: memref.copy %[[SUBVIEW]], %[[ALLOC]]
  // CHECK: %[[RESHAPED:.+]] = memref.reshape %[[ALLOC]](%[[LAYOUT]]) : (memref<2xf32, 3>, memref<2xi32, strided<[?], offset: ?>>) -> memref<1x2xf32, 3>
  %reshape = tensor.reshape %extracted_slice(%arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x2xf32>

  %r = tensor.extract %reshape[%idx, %idx] : tensor<1x2xf32>
  return %r : f32
}

// -----

// CHECK-LABEL: func @collapse_shape_regression(
// CHECK-SAME: %[[t:.*]]: memref<10x20xf32,
func.func @collapse_shape_regression(
    %t: tensor<10x20xf32>, %f: f32, %idx: index) {
  // CHECK: %[[subview:.*]] = memref.subview %[[t]]
  %0 = tensor.extract_slice %t [2, 3] [5, 6] [1, 1] : tensor<10x20xf32> to tensor<5x6xf32>

  // Insert a copy because the original %0 is read later.
  // CHECK: %[[alloc1:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
  // CHECK: memref.copy %[[subview]], %[[alloc1]]
  // CHECK: memref.store {{.*}}, %[[alloc1]]
  tensor.insert %f into %0[%idx, %idx] : tensor<5x6xf32>

  // Insert a copy because the shape cannot be collapsed.
  // CHECK: %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
  // CHECK: memref.copy %[[subview]], %[[alloc2]]
  // CHECK: memref.collapse_shape %[[alloc2]]
  tensor.collapse_shape %0[[0, 1]] : tensor<5x6xf32> into tensor<30xf32>
  return
}