//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Support/MathExtras.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include #include using namespace mlir; #define DEBUG_TYPE "linalg-transforms" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") static Attribute linearId0(MLIRContext *ctx) { return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0); } static Attribute linearId1(MLIRContext *ctx) { return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1); } static Attribute linearId2(MLIRContext *ctx) { return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2); } transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t desiredBitAlignment, ArrayRef copySizes, bool favorPredication, int64_t elementalBitwidth) { assert(!copySizes.empty() && copySizes.size() <= 3 && "only 1,2,3-D copies are supported for now"); LDBG("START CopyMappingInfo, favorPredication: " << favorPredication); LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: "); llvm::dbgs() << "\n";); // Greedily find the largest vector size that can be used to copy the most // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth // contiguous memory transactions with as few threads as possible. int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer( desiredBitAlignment, copySizes.back(), elementalBitwidth); LDBG("--greedily determined vectorSize: " << desiredVectorSize << " elements of " << elementalBitwidth << "b each -> " << (desiredVectorSize * elementalBitwidth) << "b total out of a max of " << kMaxVectorLoadBitWidth << "b"); status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize, favorPredication); if (status == Status::Invalid) return; LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: "); llvm::dbgs() << "\n"; llvm::interleaveComma( this->numThreads, DBGS() << "--numThreads: "); llvm::dbgs() << "\n";); LDBG("--vectorSize: " << this->vectorSize); assert(this->numThreads.size() == copySizes.size() && "compute copy mapping expected same number of threads and copy sizes"); // Compute the smallest bounding box. this->smallestBoundingTileSizes = llvm::to_vector( llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) { int64_t size, numThreads; std::tie(size, numThreads) = pair; return mlir::ceilDiv(size, numThreads); })); SmallVector allThreadMappings{linearId2(ctx), linearId1(ctx), linearId0(ctx)}; // Set the thread mapping. this->threadMapping = llvm::to_vector(ArrayRef(allThreadMappings) .take_back(this->smallestBoundingTileSizes.size())); LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n"); } int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer( int64_t desiredBitAlignment, int64_t numContiguousElements, int64_t elementalBitwidth) { assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 && "elemental bitwidth does not divide kMaxVectorLoadBitWidth"); assert(desiredBitAlignment % elementalBitwidth == 0 && "elemental bitwidth does not divide desired bit alignment"); return std::gcd( std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements), kMaxVectorLoadBitWidth / elementalBitwidth); } /// Get the list of all factors that divide `val`, not just the prime factors. static SmallVector getFactors(int64_t val) { SmallVector factors; factors.reserve(val); for (int64_t factor = 1; factor <= val; ++factor) { if (val % factor != 0) continue; factors.push_back(factor); } factors.push_back(val); return factors; } static int64_t product(ArrayRef vals) { int64_t res = 1; for (auto val : vals) res *= val; return res; } /// Extract `result` from `sizes` with the following constraints: /// 1. sizes[i] % result[i] for all i /// 2. product_of_threadsPerDim <= maxNumThreads /// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex] /// must be sizes[currentIndex]. /// This is used to greedily extract the maximum number of threads usable for /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and /// ensuring coalesced access along the most minor dimension. /// Return the number of threads used in the range: /// threadsPerDim[currentIndex .. sizes.end()] // The implementation uses a dynamic programming approach to greedily extract // the best combination under the constraints. // TODO: Implementation details can be improved but putting effort there is a // tradeoffs: `sizes` is expected to be of small rank and contain small values. static SmallVector maximizeNumThreads(ArrayRef sizes, int64_t currentIndex, int64_t maxNumThreads) { assert(static_cast(currentIndex) < sizes.size() && "currentIndex out of bounds"); std::string indent(2 * currentIndex, '-'); if (static_cast(currentIndex) == sizes.size() - 1) { LDBG(indent << "mandated globalBest: " << sizes[currentIndex]); return SmallVector{sizes[currentIndex]}; } int64_t best = 0; int64_t s = sizes[currentIndex]; SmallVector factors = getFactors(s); SmallVector localThreadsPerDim; localThreadsPerDim.reserve(sizes.size()); LDBG(indent << "maximizeNumThreads in " << s << " with limit: " << maxNumThreads); for (auto factor : factors) { auto nestedThreadsPerDim = maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor); int64_t localBest = factor * product(nestedThreadsPerDim); if (localBest > best && localBest <= maxNumThreads) { LDBG(indent << "new localBest: " << localBest); LLVM_DEBUG( llvm::interleaveComma(nestedThreadsPerDim, DBGS() << indent << "nestedThreadsPerDim: "); llvm::dbgs() << "\n";); localThreadsPerDim.clear(); localThreadsPerDim.push_back(factor); llvm::append_range(localThreadsPerDim, nestedThreadsPerDim); best = localBest; } } LDBG(indent << "found globalBest: " << best); LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim, DBGS() << indent << "numThreads: "); llvm::dbgs() << "\n";); return localThreadsPerDim; } transform::gpu::CopyMappingInfo::Status transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads, ArrayRef sizes, int64_t desiredVectorSize, bool favorPredication) { if (!favorPredication) { int64_t localVectorSize = desiredVectorSize; for (; localVectorSize >= 1; localVectorSize /= 2) { // Attempt to map the copy with predication and current fixed vector size: // 1. if the status is Success, we are done. // 2. if the status is Invalid, we fail immediately, no amount of // vector size reduction can offset the bad tile size selection from the // higher-level. // 3. if the status is RequiresPredication, we try again with a smaller // vector size. Status status = inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize); if (status == Status::Success || status == Status::Invalid) return status; LDBG("requires predication, try reducing vector size to " << (localVectorSize / 2)); } } // If we have not yet returned, it means that we have tried all vector sizes // and we still require predication. Restart from the original vector size and // do not attempt to return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize); } transform::gpu::CopyMappingInfo::Status transform::gpu::CopyMappingInfo::inferNumThreadsImpl( int64_t totalNumThreads, ArrayRef sizes, int64_t desiredVectorSize) { assert(sizes.back() % desiredVectorSize == 0 && "most-minor size not divisible by actualVectorSize"); LDBG("inferNumThreadsImpl with totalNumThreads: " << totalNumThreads << " and vectorSize: " << desiredVectorSize); // Scale the most minor size to account for the chosen vector size and // maximize the number of threads without exceeding the total number of // threads. SmallVector scaledSizes{sizes}; scaledSizes.back() /= desiredVectorSize; if (scaledSizes.back() > totalNumThreads) { LDBG("--Too few threads given the required vector size -> FAIL"); return Status::Invalid; } SmallVector inferredNumThreads = maximizeNumThreads(scaledSizes, 0, totalNumThreads); LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads, DBGS() << "inferred numThreads: "); llvm::dbgs() << "\n"; LDBG("computed actualVectorSize: " << desiredVectorSize);); // Corner case: we cannot use more threads than available. If the dimension of // the copy is so bad it is because higher-level tiling did not do its job, we // do not try to recover from it here. int64_t totalNumThreadsUsed = product(inferredNumThreads); LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed); if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) { LDBG("--Too few threads given the required vector size -> FAIL"); return Status::Invalid; } this->vectorSize = desiredVectorSize; this->numThreads = inferredNumThreads; if (totalNumThreadsUsed == totalNumThreads) return Status::Success; return Status::RequiresPredication; } void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const { os << "MappingInfo{"; os << "CopyMappingInfo: "; os << "valid: " << (status != Status::Invalid) << ", "; os << "vectorSize: " << vectorSize << ", "; llvm::interleaveComma(numThreads, os << ", numThreads: {"); llvm::interleaveComma(smallestBoundingTileSizes, os << "}, smallestBoundingTileSizes: {"); llvm::interleaveComma(threadMapping, os << "}, threadMapping: {"); os << "}}"; }