// RUN: mlir-opt --split-input-file --transform-interpreter %s | FileCheck %s
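// Pad a tiled linalg.matmul and place the padded operands in GPU shared
// memory (memory space 3): the matmul is tiled to an scf.forall mapped to
// GPU blocks, the fill is fused in, the reduction dimension is tiled, the
// operands are padded, the pad/copy ops are mapped to threads and vectorized
// with masked transfers, and the padding is bufferized to shared-memory
// allocations.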
// CHECK-LABEL: func @matmul_divisible
// CHECK: scf.forall
// CHECK-NOT: memref.copy
// CHECK: linalg.fill
// CHECK: scf.for
// CHECK: memref.alloc() : memref<128x16xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<16x128xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<128x128xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: linalg.matmul
// CHECK: scf.forall
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
func.func @matmul_divisible(%A: tensor<1024x1024xf32>,
                            %B: tensor<1024x1024xf32>,
                            %C: tensor<1024x1024xf32>)
    -> tensor<1024x1024xf32>
{
  %cst = arith.constant 0.000000e+00 : f32
  %0 = linalg.fill ins(%cst : f32)
                   outs(%C : tensor<1024x1024xf32>)
      -> tensor<1024x1024xf32>
  %1 = linalg.matmul ins(%A, %B : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
                     outs(%0 : tensor<1024x1024xf32>)
      -> tensor<1024x1024xf32>
  return %1 : tensor<1024x1024xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
    // Fuse linalg.fill into linalg.matmul and tile.
    %matmul_op = transform.structured.match ops{["linalg.matmul"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    %tiled_matmul_op, %forall_op = transform.structured.tile_using_forall %matmul_op num_threads [] tile_sizes [128, 128] (mapping = [#gpu.block<y>, #gpu.block<x>])
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill_op into %forall_op
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Tile linalg.matmul a second time.
    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Pad linalg.matmul.
    %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op
        {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
         padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1],
         copy_back_op = "linalg.copy"}
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // Map and tile tensor.pad.
    %pad_forall_op, %tiled_pad_op = transform.structured.gpu.map_copy_to_threads
        %pad total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.foreach %pad_forall_op : !transform.any_op {
    ^bb2(%arg2 : !transform.any_op):
      %if_op = transform.structured.match ops{["scf.if"]} in %arg2
          : (!transform.any_op) -> !transform.any_op
      // TODO: The scf.if can be avoided with 0x... tensors.
      transform.scf.take_assumed_branch %if_op take_else_branch
          : (!transform.any_op) -> ()
    }

    // Map and tile copy back.
    %copy_forall_op, %tiled_copy_op = transform.structured.gpu.map_copy_to_threads
        %copy_back total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Apply masked vectorization to padding ops.
    transform.structured.vectorize %tiled_pad_op vector_sizes [128, 4]
        : !transform.any_op

    // Assign shared memory buffer to padding.
    %buffer, %new_ops = transform.structured.bufferize_to_allocation
        %pad_forall_op {memory_space = 3, bufferize_destination_only, emit_dealloc}
        : !transform.any_op

    // Bufferize.
    %func_op_1 = transform.structured.match ops{["func.func"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    transform.bufferization.eliminate_empty_tensors %func_op_1 : !transform.any_op
    transform.apply_dce to %func_op_1 : !transform.any_op
    transform.apply_cse to %func_op_1 : !transform.any_op
    %bufferized = transform.bufferization.one_shot_bufferize
        layout{IdentityLayoutMap} %arg1 {bufferize_function_boundaries=true}
        : (!transform.any_op) -> !transform.any_op

    // Apply vectorization to copy back from shared memory.
    // TODO: Find a way to retain the handle to linalg.copy throughout
    // bufferization.
    %func_op_2 = transform.structured.match ops{["func.func"]} in %bufferized
        : (!transform.any_op) -> !transform.any_op
    %bufferized_copy_back = transform.structured.match ops{["linalg.copy"]} in %func_op_2
        : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize
        %bufferized_copy_back vector_sizes [128, 4] : !transform.any_op

    // Canonicalize, cleanup and vector lowering. This step also removes buffer
    // self-copies.
    transform.apply_patterns to %func_op_2 {
      transform.apply_patterns.canonicalization
      transform.apply_patterns.vector.lower_masked_transfers
    } {apply_cse} : !transform.any_op
    transform.yield
  }
}
// -----
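// Same transform strategy as above, but with 1023x1023 shapes that are not
// divisible by the 128x128 and 16 tile sizes. The copy back from shared
// memory is not mapped to threads here, so no trailing scf.forall is
// expected before the final transfers.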
// CHECK-LABEL: func @matmul_not_divisible
// CHECK: scf.forall
// CHECK-NOT: memref.copy
// CHECK: linalg.fill
// CHECK: scf.for
// CHECK: memref.alloc() : memref<128x16xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<16x128xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<128x128xf32, 3>
// CHECK: scf.forall
// CHECK: vector.create_mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: linalg.matmul
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
func.func @matmul_not_divisible(%A: tensor<1023x1023xf32>,
                                %B: tensor<1023x1023xf32>,
                                %C: tensor<1023x1023xf32>)
    -> tensor<1023x1023xf32>
{
  %cst = arith.constant 0.000000e+00 : f32
  %0 = linalg.fill ins(%cst : f32)
                   outs(%C : tensor<1023x1023xf32>)
      -> tensor<1023x1023xf32>
  %1 = linalg.matmul ins(%A, %B : tensor<1023x1023xf32>, tensor<1023x1023xf32>)
                     outs(%0 : tensor<1023x1023xf32>)
      -> tensor<1023x1023xf32>
  return %1 : tensor<1023x1023xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
    // Fuse linalg.fill into linalg.matmul and tile.
    %matmul_op = transform.structured.match ops{["linalg.matmul"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    %tiled_matmul_op, %forall_op = transform.structured.tile_using_forall %matmul_op num_threads [] tile_sizes [128, 128] (mapping = [#gpu.block<y>, #gpu.block<x>])
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill_op into %forall_op
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Tile linalg.matmul a second time.
    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Pad linalg.matmul.
    %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op
        {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
         padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1],
         copy_back_op = "linalg.copy"}
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // Map and tile tensor.pad.
    %pad_forall_op, %tiled_pad_op = transform.structured.gpu.map_copy_to_threads
        %pad total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.foreach %pad_forall_op : !transform.any_op {
    ^bb2(%arg2 : !transform.any_op):
      %if_op = transform.structured.match ops{["scf.if"]} in %arg2
          : (!transform.any_op) -> !transform.any_op
      // TODO: The scf.if can be avoided with 0x... tensors.
      transform.scf.take_assumed_branch %if_op take_else_branch
          : (!transform.any_op) -> ()
    }

    // Apply masked vectorization to padding ops.
    transform.structured.vectorize %tiled_pad_op vector_sizes [128, 4]
        : !transform.any_op

    // Assign shared memory buffer to padding.
    %buffer, %new_ops = transform.structured.bufferize_to_allocation
        %pad_forall_op {memory_space = 3, bufferize_destination_only, emit_dealloc}
        : !transform.any_op

    // Bufferize.
    %func_op_1 = transform.structured.match ops{["func.func"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    transform.bufferization.eliminate_empty_tensors %func_op_1 : !transform.any_op
    transform.apply_dce to %func_op_1 : !transform.any_op
    transform.apply_cse to %func_op_1 : !transform.any_op
    %bufferized = transform.bufferization.one_shot_bufferize
        layout{IdentityLayoutMap} %arg1 {bufferize_function_boundaries=true}
        : (!transform.any_op) -> !transform.any_op

    // Apply vectorization to copy back from shared memory.
    // TODO: Find a way to retain the handle to linalg.copy throughout
    // bufferization.
    %func_op_2 = transform.structured.match ops{["func.func"]} in %bufferized
        : (!transform.any_op) -> !transform.any_op
    %bufferized_copy_back = transform.structured.match ops{["linalg.copy"]} in %func_op_2
        : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize
        %bufferized_copy_back vector_sizes [128, 4] : !transform.any_op

    // Canonicalize, cleanup and vector lowering. This step also removes buffer
    // self-copies.
    transform.apply_patterns to %func_op_2 {
      transform.apply_patterns.canonicalization
      transform.apply_patterns.vector.lower_masked_transfers
    } {apply_cse} : !transform.any_op
    transform.yield
  }
}