// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -canonicalize -buffer-loop-hoisting -drop-equivalent-buffer-results -split-input-file | FileCheck %s

// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null

// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP

// TODO: Some test cases from this file should be moved to other dialects.

// CHECK-LABEL: func @fill_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-NO-LAYOUT-MAP-LABEL: func @fill_inplace(%{{.*}}: memref<?xf32>) {
func.func @fill_inplace(
    %A : tensor<?xf32> {bufferization.writable = true}) -> tensor<?xf32> {
  // CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
  %f0 = arith.constant 0.0 : f32

  /// Inplaceable, no alloc
  // CHECK-NOT: alloc
  // CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[A]] : memref<?xf32, strided<[?], offset: ?>>)
  %r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r: tensor<?xf32>
}

// -----

/// No bufferization.writable flag, must allocate.
// CHECK-LABEL: func @not_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32> {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?xf32>) -> memref<?xf32>
func.func @not_inplace(
    %A : tensor<?xf32> {bufferization.writable = false}) -> tensor<?xf32> {
  // CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
  %f0 = arith.constant 0.0 : f32

  // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, strided<[?], offset: ?>>
  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 64 : i64} : memref<?xf32>
  // CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[ALLOC]] : memref<?xf32>)
  %r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>

  // CHECK-NOT: dealloc
  // CHECK: return %[[ALLOC]] : memref<?xf32>
  return %r: tensor<?xf32>
}

// -----

// CHECK-LABEL: func @not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, strided<[?, ?], offset: ?>>) {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?x?xf32>) {
func.func @not_inplace(
    %A : tensor<?x?xf32> {bufferization.writable = true}) -> tensor<?x?xf32> {
  %f0 = arith.constant 0.0 : f32

  /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc.
  // CHECK: %[[ALLOC:.*]] = memref.alloc
  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[ALLOC]]
  %f = linalg.fill ins(%f0 : f32) outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>

  /// The second op has no interfering reads and can reuse.
  // CHECK-NOT: alloc
  // CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
  %r = linalg.matmul ins(%f, %f : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r: tensor<?x?xf32>
}

// -----

// CHECK-LABEL: func @not_inplace
func.func @not_inplace(
    %A : tensor<?x?xf32> {bufferization.writable = true}) -> tensor<?x?xf32> {
  /// Within op multiple uses of %A, must alloc.
  // CHECK: alloc
  %r = linalg.matmul ins(%A, %A : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>
  // CHECK-NOT: dealloc
  return %r: tensor<?x?xf32>
}

// -----

// CHECK-LABEL: func @vec_inplace
func.func @vec_inplace(
    %A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>) -> tensor<?xf32> {
  %c0 = arith.constant 0 : index

  // CHECK-NOT: alloc
  %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r: tensor<?xf32>
}

// -----

// CHECK-LABEL: func @vec_not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
func.func @vec_not_inplace(
    %A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc.
  // CHECK: %[[ALLOC:.*]] = memref.alloc
  // CHECK: memref.copy {{.*}}, %[[ALLOC]]
  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
  %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>

  /// The second vector.transfer has no interfering reads and can reuse the buffer.
  // CHECK-NOT: alloc
  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
  %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>

  // CHECK: return
  // CHECK-NOT: tensor
  return %r0, %r1 : tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK: func @matmul(
// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32>
// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32>
// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref<128x192xf32>
func.func @matmul(
    %A: tensor<128x256xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %B: tensor<256x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %C: tensor<128x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true}) -> tensor<128x192xf32> {
  %c0 = arith.constant 0 : index
  %c256 = arith.constant 256 : index
  %c32 = arith.constant 32 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c128 = arith.constant 128 : index
  %c192 = arith.constant 192 : index
  %c8 = arith.constant 8 : index
  %c16 = arith.constant 16 : index

  // Hoisted alloc.
  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>

  // CHECK: scf.for %[[I:.*]] =
  %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
    %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
      tensor<128x256xf32> to tensor<8x256xf32>

    // CHECK: scf.for %[[J:.*]] =
    %2 = scf.for %arg5 = %c0 to %c192 step %c16 iter_args(%arg6 = %arg4) -> (tensor<128x192xf32>) {
      %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
        tensor<256x192xf32> to tensor<256x16xf32>

      // Insert an artificial out-of-place buffer by extracting from %C instead
      // of %arg6.
      %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
        tensor<128x192xf32> to tensor<8x16xf32>

      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
      %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>

      // CHECK: scf.for %[[K:.*]] =
      %6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) {
        %8 = tensor.extract_slice %1[0, %arg7] [8, 32] [1, 1] :
          tensor<8x256xf32> to tensor<8x32xf32>
        %9 = tensor.extract_slice %3[%arg7, 0] [32, 16] [1, 1] :
          tensor<256x16xf32> to tensor<32x16xf32>

        // linalg.matmul is inplace as well as the enclosing scf.for.
        // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
        %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                            outs(%arg8 : tensor<8x16xf32>) -> tensor<8x16xf32>
        scf.yield %10 : tensor<8x16xf32>
      }

      // insert_slice is inplace but its source comes from an equivalent buffer
      // that is not in place. So we must insert a copy of the small buffer into
      // the bigger buffer.
      // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
      // CHECK: memref.copy %[[ALLOC]], %[[T]]
      %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
        tensor<8x16xf32> into tensor<128x192xf32>
      scf.yield %7 : tensor<128x192xf32>
    }
    scf.yield %2 : tensor<128x192xf32>
  }
  return %0 : tensor<128x192xf32>
}

// -----

/// This test just checks the produced IR is valid and does not have dominance
/// errors in the def-use chains.
// CHECK-LABEL: func @dominance_violation_bug_1
func.func @dominance_violation_bug_1(
    %A : tensor<?x?xf32> {bufferization.writable = false}, %idx : index) -> tensor<?x?xf32> {
  %f0 = arith.constant 0.0 : f32
  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
  %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32>
  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
  return %rA : tensor<?x?xf32>
}

// -----

func.func @gather_like(
    %arg0 : tensor<?x?xf32> {bufferization.writable = false},
    %arg1 : tensor<?xi32> {bufferization.writable = false},
    %arg2 : tensor<?x?xf32> {bufferization.writable = true}) -> tensor<?x?xf32> {
  %0 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%arg1 : tensor<?xi32>) outs(%arg2 : tensor<?x?xf32>) {
    ^bb0(%arg3: i32, %arg4 : f32):
      %iv1 = linalg.index 1 : index
      %1 = arith.index_cast %arg3 : i32 to index
      %2 = tensor.extract %arg0[%1, %iv1] : tensor<?x?xf32>
      linalg.yield %2 : f32
  } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}
// CHECK-LABEL: func @gather_like(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32

// -----

// CHECK-LABEL: func @linalg_op_bufferizes_inplace_with_input
// CHECK-SAME: %[[t1:.*]]: memref<?x?xf32, strided{{.*}}>, %[[t2:.*]]: memref<?xf32, strided{{.*}}>, %[[t3:.*]]: memref<?x?xf32, strided{{.*}}>
func.func @linalg_op_bufferizes_inplace_with_input(
    %t1: tensor<?x?xf32> {bufferization.writable = true},
    %t2: tensor<?xf32> {bufferization.writable = true},
    %t3: tensor<?x?xf32> {bufferization.writable = true},
    %s1: index, %s2: index, %cst: f32) -> tensor<?x?xf32> {
  // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t3]] : {{.*}})
  %r = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d1)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%t1, %t2 : tensor<?x?xf32>, tensor<?xf32>) outs(%t3 : tensor<?x?xf32>) {
    ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) :
      %add = arith.addf %arg0, %arg1 : f32
      linalg.yield %add : f32
  } -> tensor<?x?xf32>
  return %r : tensor<?x?xf32>
}

// -----

#accesses = [
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @op_is_reading_but_following_ops_are_not
// CHECK-SAME: %[[t0:.*]]: memref<?xf32
func.func @op_is_reading_but_following_ops_are_not(
    %t0 : tensor<?xf32> {bufferization.writable = false}, %cst : f32) -> tensor<?xf32> {
  // Make sure that a copy is inserted here.
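  // %t0 is marked non-writable and the first linalg.generic also reads its
  // "outs" operand, so it cannot bufferize in place: a new buffer is allocated
  // and the contents of %t0 are copied into it first.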
  // CHECK: %[[ALLOC:.*]] = memref.alloc
  // CHECK: memref.copy %[[t0]], %[[ALLOC]]
  // CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref<?xf32>
  %r0 = linalg.generic #trait outs(%t0 : tensor<?xf32>) {
    ^bb(%0: f32) :
      %a = arith.addf %cst, %0 : f32
      linalg.yield %a : f32
  } -> (tensor<?xf32>)

  // CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref<?xf32>
  %r1 = linalg.generic #trait outs(%r0 : tensor<?xf32>) {
    ^bb(%0: f32) :
      linalg.yield %cst : f32
  } -> (tensor<?xf32>)

  // CHECK: return %[[ALLOC]]
  return %r1 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @map_binary
// CHECK-SAME: %[[LHS:[0-9a-zA-Z]*]]: memref<64xf32
// CHECK-SAME: %[[RHS:[0-9a-zA-Z]*]]: memref<64xf32
func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
                      %init: tensor<64xf32>) -> tensor<64xf32> {
  // CHECK: linalg.map { arith.addf } ins(%[[LHS]], %[[RHS]] : memref<64xf32
  %add = linalg.map
         ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>)
         outs(%init : tensor<64xf32>)
         (%lhs_elem: f32, %rhs_elem: f32) {
           %0 = arith.addf %lhs_elem, %rhs_elem : f32
           linalg.yield %0 : f32
         }
  func.return %add : tensor<64xf32>
}

// -----

// CHECK-LABEL: func @reduce
// CHECK-SAME: %[[INPUT:.*]]: memref<16x32x64xf32
func.func @reduce(%input: tensor<16x32x64xf32>,
                  %init: tensor<16x64xf32>) -> tensor<16x64xf32> {
  // CHECK: linalg.reduce { arith.addf } ins(%[[INPUT]] : memref<16x32x64xf32
  %reduce = linalg.reduce
            ins(%input : tensor<16x32x64xf32>)
            outs(%init : tensor<16x64xf32>)
            dimensions = [1]
            (%in: f32, %out: f32) {
              %0 = arith.addf %out, %in : f32
              linalg.yield %0 : f32
            }
  func.return %reduce : tensor<16x64xf32>
}

// -----

// CHECK-LABEL: func @transpose
// CHECK-SAME: %[[ARG0:.*]]: memref<16x32x64xf32
func.func @transpose(%input: tensor<16x32x64xf32>,
                     %init: tensor<32x64x16xf32>) -> tensor<32x64x16xf32> {
  // CHECK: linalg.transpose ins(%[[ARG0]] : memref<16x32x64xf32
  %transpose = linalg.transpose
               ins(%input : tensor<16x32x64xf32>)
               outs(%init : tensor<32x64x16xf32>)
               permutation = [1, 2, 0]
  func.return %transpose : tensor<32x64x16xf32>
}

// -----

// CHECK-LABEL: func @broadcast
// CHECK-SAME: %[[ARG0:.*]]: memref<8x32xf32
func.func @broadcast(%input: tensor<8x32xf32>,
                     %init: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
  %bcast = linalg.broadcast
           ins(%input : tensor<8x32xf32>)
           outs(%init : tensor<8x16x32xf32>)
           dimensions = [1]
  func.return %bcast : tensor<8x16x32xf32>
}

// -----

//===----------------------------------------------------------------------===//
// AllocTensorOp elimination would produce SSA violations for the example below.
//===----------------------------------------------------------------------===//

func.func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index,
                                    %arg2: tensor<8x18x32xf32>) -> tensor<?x1x6x8xf32> {
  %c0 = arith.constant 0 : index
  %c32 = arith.constant 32 : index
  %c8 = arith.constant 8 : index
  %0 = bufferization.alloc_tensor() : tensor<4x1x6x8xf32>
  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
  %2 = bufferization.alloc_tensor() : tensor<1x6x8xf32>
  %3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
    %4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
    %5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
      tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
    scf.yield %5 : tensor<?x1x6x8xf32>
  }
  return %3 : tensor<?x1x6x8xf32>
}

// -----

// CHECK-LABEL: func @do_not_copy_alloc_tensors(
func.func @do_not_copy_alloc_tensors(%f1: f32, %f2: f32, %idx: index)
    -> (tensor<5xf32>, tensor<5xf32>) {
  // CHECK: memref.alloc
  // CHECK: memref.alloc
  // CHECK-NOT: copy
  // CHECK: memref.store
  // CHECK: memref.store
  %0 = bufferization.alloc_tensor() : tensor<5xf32>
  %1 = tensor.insert %f1 into %0[%idx] : tensor<5xf32>
  %2 = tensor.insert %f2 into %0[%idx] : tensor<5xf32>
  return %1, %2 : tensor<5xf32>, tensor<5xf32>
}