bolt/deps/llvm-18.1.8/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp

//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cmath>
#include <numeric>

using namespace mlir;

#define DEBUG_TYPE "linalg-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

/// Returns the GPU thread mapping attribute for `MappingId::LinearDim0`,
/// created in `ctx`.
static Attribute linearId0(MLIRContext *ctx) {
  const auto mappingId = gpu::MappingId::LinearDim0;
  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
}
/// Returns the GPU thread mapping attribute for `MappingId::LinearDim1`,
/// created in `ctx`.
static Attribute linearId1(MLIRContext *ctx) {
  const auto mappingId = gpu::MappingId::LinearDim1;
  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
}
/// Returns the GPU thread mapping attribute for `MappingId::LinearDim2`,
/// created in `ctx`.
static Attribute linearId2(MLIRContext *ctx) {
  const auto mappingId = gpu::MappingId::LinearDim2;
  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
}

/// Computes how a copy of shape `copySizes` (1-D to 3-D) is distributed over
/// `totalNumThreads` threads.
///
/// The steps are:
///   1. pick the vector size for the most minor dimension from
///      `desiredBitAlignment`, `copySizes.back()` and `elementalBitwidth`;
///   2. infer the number of threads per dimension (`inferNumThreads`), which
///      also sets `status`; on `Status::Invalid` nothing else is computed;
///   3. derive `smallestBoundingTileSizes` as ceilDiv(copy size, threads) per
///      dimension;
///   4. assign one linear thread mapping attribute per dimension, the most
///      minor dimension getting `LinearDim0`.
transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
                                                 int totalNumThreads,
                                                 int64_t desiredBitAlignment,
                                                 ArrayRef<int64_t> copySizes,
                                                 bool favorPredication,
                                                 int64_t elementalBitwidth) {
  assert(!copySizes.empty() && copySizes.size() <= 3 &&
         "only 1,2,3-D copies are supported for now");

  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
  LLVM_DEBUG({
    llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
    llvm::dbgs() << "\n";
  });

  // The widest vector usable along the most minor dimension: the goal is to
  // fill kMaxVectorLoadBitWidth contiguous memory transactions with as few
  // threads as possible.
  const int64_t greedyVectorSize = maxContiguousElementsToTransfer(
      desiredBitAlignment, copySizes.back(), elementalBitwidth);

  LDBG("--greedily determined vectorSize: "
       << greedyVectorSize << " elements of " << elementalBitwidth
       << "b each -> " << (greedyVectorSize * elementalBitwidth)
       << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");

  // On failure, leave the remaining members untouched and bail out.
  this->status = inferNumThreads(totalNumThreads, copySizes, greedyVectorSize,
                                 favorPredication);
  if (this->status == Status::Invalid)
    return;

  LLVM_DEBUG({
    llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
    llvm::dbgs() << "\n";
    llvm::interleaveComma(this->numThreads, DBGS() << "--numThreads: ");
    llvm::dbgs() << "\n";
  });
  LDBG("--vectorSize: " << this->vectorSize);
  assert(this->numThreads.size() == copySizes.size() &&
         "compute copy mapping expected same number of threads and copy sizes");

  // Smallest bounding box: per dimension, the number of elements each thread
  // covers, rounded up.
  auto elementsPerThread = [](auto &&sizeAndThreads) -> int64_t {
    return mlir::ceilDiv(std::get<0>(sizeAndThreads),
                         std::get<1>(sizeAndThreads));
  };
  this->smallestBoundingTileSizes = llvm::to_vector(llvm::map_range(
      llvm::zip(copySizes, this->numThreads), elementsPerThread));

  // Thread mapping: keep the trailing attributes so that the most minor
  // dimension is always mapped to LinearDim0.
  SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
                                           linearId0(ctx)};
  const size_t rank = this->smallestBoundingTileSizes.size();
  ArrayRef<Attribute> candidateMappings(allThreadMappings);
  this->threadMapping = llvm::to_vector(candidateMappings.take_back(rank));

  LLVM_DEBUG({
    this->print(DBGS());
    llvm::dbgs() << "\n";
  });
}

/// Returns the largest element count that simultaneously divides:
///   - the number of elements covered by `desiredBitAlignment`,
///   - `numContiguousElements`, and
///   - the number of elements that fit in `kMaxVectorLoadBitWidth` bits.
/// `elementalBitwidth` must divide both `kMaxVectorLoadBitWidth` and
/// `desiredBitAlignment` (asserted).
int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
    int64_t desiredBitAlignment, int64_t numContiguousElements,
    int64_t elementalBitwidth) {
  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
  assert(desiredBitAlignment % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide desired bit alignment");

  // Express both bit budgets as element counts.
  const int64_t alignedElements = desiredBitAlignment / elementalBitwidth;
  const auto maxLoadElements = kMaxVectorLoadBitWidth / elementalBitwidth;

  // Common divisor of the alignment and the contiguous extent, then capped
  // by what a single maximal vector load can carry.
  const int64_t alignedAndContiguous =
      std::gcd(alignedElements, numContiguousElements);
  return std::gcd(alignedAndContiguous, maxLoadElements);
}

/// Get the list of all positive factors that divide `val`, not just the prime
/// factors. The factors are returned in strictly increasing order, each one
/// exactly once: the first is 1 and the last is `val` itself. The result is
/// empty when `val` is not strictly positive.
static SmallVector<int64_t> getFactors(int64_t val) {
  SmallVector<int64_t> factors;
  // No up-front `reserve(val)`: the number of divisors is far smaller than
  // `val`, so reserving `val` slots only wastes memory.
  for (int64_t factor = 1; factor <= val; ++factor) {
    // The loop bound is inclusive, so `val` itself is appended here and must
    // not be appended a second time after the loop.
    if (val % factor == 0)
      factors.push_back(factor);
  }
  return factors;
}

/// Returns the product of all entries of `vals`; 1 when `vals` is empty.
static int64_t product(ArrayRef<int64_t> vals) {
  return std::accumulate(
      vals.begin(), vals.end(), int64_t{1},
      [](int64_t running, int64_t next) { return running * next; });
}

/// Extract `threadsPerDim` for the dimensions `sizes[currentIndex ..]` with the
/// following constraints:
///   1. sizes[i] % threadsPerDim[i] == 0 for all i
///   2. product_of_threadsPerDim <= maxNumThreads
///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
///      must be sizes[currentIndex].
/// This is used to greedily extract the maximum number of threads usable for
/// mapping a copy of size `sizes`, while being bounded by `maxNumThreads` and
/// ensuring coalesced access along the most minor dimension.
///
/// Returns the number of threads used in the range:
///   threadsPerDim[currentIndex .. sizes.end()]
/// i.e. exactly `sizes.size() - currentIndex` entries, or an empty vector when
/// no assignment satisfies constraint 2. For the most minor dimension the
/// mandated value is returned unconditionally (constraint 3), so it is the
/// caller that compares the resulting product against its own limit.
///
/// All entries of `sizes` before the last one are expected to be strictly
/// positive.
// The implementation is a recursive exhaustive search over the divisors of
// each dimension, keeping the combination with the largest product.
// TODO: Implementation details can be improved but putting effort there is a
// tradeoff: `sizes` is expected to be of small rank and contain small values.
static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
                                               int64_t currentIndex,
                                               int64_t maxNumThreads) {
  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
         "currentIndex out of bounds");
  std::string indent(2 * currentIndex, '-');
  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
    LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
    return SmallVector<int64_t>{sizes[currentIndex]};
  }

  int64_t best = 0;
  int64_t s = sizes[currentIndex];
  // A non-positive size has no usable divisor and would lead to a division by
  // zero when computing the nested limit below.
  assert(s > 0 && "expected strictly positive copy sizes");
  SmallVector<int64_t> factors = getFactors(s);
  SmallVector<int64_t> localThreadsPerDim;
  localThreadsPerDim.reserve(sizes.size());
  LDBG(indent << "maximizeNumThreads in " << s
              << " with limit: " << maxNumThreads);
  for (auto factor : factors) {
    auto nestedThreadsPerDim =
        maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
    // An empty nested result means the inner dimensions cannot be mapped
    // within the remaining budget. It must be skipped explicitly: the product
    // of an empty list is 1, which would otherwise make `factor` alone look
    // like a valid candidate and produce a result with missing dimensions.
    if (nestedThreadsPerDim.empty()) {
      LDBG(indent << "no feasible nested mapping for factor: " << factor);
      continue;
    }
    int64_t localBest = factor * product(nestedThreadsPerDim);
    if (localBest > best && localBest <= maxNumThreads) {
      LDBG(indent << "new localBest: " << localBest);
      LLVM_DEBUG(
          llvm::interleaveComma(nestedThreadsPerDim,
                                DBGS() << indent << "nestedThreadsPerDim: ");
          llvm::dbgs() << "\n";);
      localThreadsPerDim.clear();
      localThreadsPerDim.push_back(factor);
      llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
      best = localBest;
    }
  }

  LDBG(indent << "found globalBest: " << best);
  LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
                                   DBGS() << indent << "numThreads: ");
             llvm::dbgs() << "\n";);

  return localThreadsPerDim;
}

/// Infers the number of threads per dimension for a copy of shape `sizes`.
///
/// When `favorPredication` is set, the mapping is computed once with
/// `desiredVectorSize`. Otherwise the vector size is repeatedly halved,
/// starting from `desiredVectorSize`, until an attempt no longer requires
/// predication; if every attempt requires predication, the mapping is
/// recomputed with the original `desiredVectorSize`.
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
                                                 ArrayRef<int64_t> sizes,
                                                 int64_t desiredVectorSize,
                                                 bool favorPredication) {
  // Predication is acceptable: use the desired vector size as is.
  if (favorPredication)
    return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);

  int64_t candidateVectorSize = desiredVectorSize;
  while (candidateVectorSize >= 1) {
    // Try to map the copy with the current vector size:
    //   - Success: done.
    //   - Invalid: fail immediately, no amount of vector size reduction can
    //     offset a bad tile size selection made at a higher level.
    //   - RequiresPredication: retry with half the vector size.
    const Status attempt =
        inferNumThreadsImpl(totalNumThreads, sizes, candidateVectorSize);
    if (attempt == Status::Invalid || attempt == Status::Success)
      return attempt;

    candidateVectorSize /= 2;
    LDBG("requires predication, try reducing vector size to "
         << candidateVectorSize);
  }

  // Every vector size down to 1 still required predication: restart from the
  // original vector size and accept whatever status that mapping yields.
  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
}

/// Attempts to map a copy of shape `sizes` onto at most `totalNumThreads`
/// threads, each thread moving `desiredVectorSize` contiguous elements along
/// the most minor dimension (which must be divisible by `desiredVectorSize`).
///
/// Returns:
///   - `Status::Invalid` when a size is not strictly positive, when the most
///     minor dimension alone needs more threads than available, or when no
///     complete per-dimension mapping fits; `vectorSize` and `numThreads` are
///     left untouched in that case.
///   - `Status::Success` when exactly `totalNumThreads` threads are used.
///   - `Status::RequiresPredication` when fewer threads are used.
/// In the last two cases `vectorSize` and `numThreads` are updated.
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
    int64_t desiredVectorSize) {
  assert(sizes.back() % desiredVectorSize == 0 &&
         "most-minor size not divisible by actualVectorSize");

  LDBG("inferNumThreadsImpl with totalNumThreads: "
       << totalNumThreads << " and vectorSize: " << desiredVectorSize);

  // Degenerate shapes cannot be mapped: a zero-sized dimension would make the
  // divisor search below divide by zero, and no thread count divides it
  // meaningfully anyway.
  if (llvm::any_of(sizes, [](int64_t size) { return size <= 0; })) {
    LDBG("--Non-positive copy size -> FAIL");
    return Status::Invalid;
  }

  // Scale the most minor size to account for the chosen vector size and
  // maximize the number of threads without exceeding the total number of
  // threads.
  SmallVector<int64_t> scaledSizes{sizes};
  scaledSizes.back() /= desiredVectorSize;
  if (scaledSizes.back() > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }
  SmallVector<int64_t> inferredNumThreads =
      maximizeNumThreads(scaledSizes, 0, totalNumThreads);

  LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
                                   DBGS() << "inferred numThreads: ");
             llvm::dbgs() << "\n";
             LDBG("computed actualVectorSize: " << desiredVectorSize););

  // The search must produce one thread count per dimension. A shorter result
  // means no feasible mapping was found; it has to be rejected here because
  // the product of an empty (or truncated) list would pass the checks below.
  if (inferredNumThreads.size() != scaledSizes.size()) {
    LDBG("--Could not infer a thread count for every dimension -> FAIL");
    return Status::Invalid;
  }

  // Corner case: we cannot use more threads than available. If the dimension of
  // the copy is so bad it is because higher-level tiling did not do its job, we
  // do not try to recover from it here.
  int64_t totalNumThreadsUsed = product(inferredNumThreads);
  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }

  this->vectorSize = desiredVectorSize;
  this->numThreads = inferredNumThreads;
  if (totalNumThreadsUsed == totalNumThreads)
    return Status::Success;

  return Status::RequiresPredication;
}

/// Writes a one-line, human-readable summary of this mapping to `os`, of the
/// form:
///   MappingInfo{CopyMappingInfo: valid: V, vectorSize: N, numThreads: {...},
///   smallestBoundingTileSizes: {...}, threadMapping: {...}}
/// where `valid` is whether `status` differs from `Status::Invalid`.
void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
  os << "MappingInfo{";
  os << "CopyMappingInfo: ";
  os << "valid: " << (status != Status::Invalid) << ", ";
  os << "vectorSize: " << vectorSize << ", ";
  // The previous field already emitted its trailing ", "; do not emit a second
  // separator before `numThreads`.
  llvm::interleaveComma(numThreads, os << "numThreads: {");
  llvm::interleaveComma(smallestBoundingTileSizes,
                        os << "}, smallestBoundingTileSizes: {");
  llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
  os << "}}";
}
Embed LLVM 18.1.8 2025-02-14 19:21:04 +01:00			`//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"`

			`#include "mlir/Dialect/GPU/IR/GPUDialect.h"`
			`#include "mlir/Support/MathExtras.h"`
			`#include "llvm/ADT/ArrayRef.h"`
			`#include "llvm/ADT/STLExtras.h"`
			`#include "llvm/Support/CommandLine.h"`
			`#include "llvm/Support/Debug.h"`
			`#include "llvm/Support/raw_ostream.h"`
			`#include <cmath>`
			`#include <numeric>`

			`using namespace mlir;`

			`#define DEBUG_TYPE "linalg-transforms"`
			`#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")`
			`#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")`

			`static Attribute linearId0(MLIRContext *ctx) {`
			`return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);`
			`}`
			`static Attribute linearId1(MLIRContext *ctx) {`
			`return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);`
			`}`
			`static Attribute linearId2(MLIRContext *ctx) {`
			`return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);`
			`}`

			`transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,`
			`int totalNumThreads,`
			`int64_t desiredBitAlignment,`
			`ArrayRef<int64_t> copySizes,`
			`bool favorPredication,`
			`int64_t elementalBitwidth) {`
			`assert(!copySizes.empty() && copySizes.size() <= 3 &&`
			`"only 1,2,3-D copies are supported for now");`

			`LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);`
			`LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");`
			`llvm::dbgs() << "\n";);`

			`// Greedily find the largest vector size that can be used to copy the most`
			`// minor dimension: we are in the business of filling kMaxVectorLoadBitWidth`
			`// contiguous memory transactions with as few threads as possible.`
			`int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(`
			`desiredBitAlignment, copySizes.back(), elementalBitwidth);`

			`LDBG("--greedily determined vectorSize: "`
			`<< desiredVectorSize << " elements of " << elementalBitwidth`
			`<< "b each -> " << (desiredVectorSize * elementalBitwidth)`
			`<< "b total out of a max of " << kMaxVectorLoadBitWidth << "b");`

			`status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,`
			`favorPredication);`
			`if (status == Status::Invalid)`
			`return;`

			`LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");`
			`llvm::dbgs() << "\n"; llvm::interleaveComma(`
			`this->numThreads, DBGS() << "--numThreads: ");`
			`llvm::dbgs() << "\n";);`
			`LDBG("--vectorSize: " << this->vectorSize);`
			`assert(this->numThreads.size() == copySizes.size() &&`
			`"compute copy mapping expected same number of threads and copy sizes");`

			`// Compute the smallest bounding box.`
			`this->smallestBoundingTileSizes = llvm::to_vector(`
			`llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {`
			`int64_t size, numThreads;`
			`std::tie(size, numThreads) = pair;`
			`return mlir::ceilDiv(size, numThreads);`
			`}));`
			`SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),`
			`linearId0(ctx)};`

			`// Set the thread mapping.`
			`this->threadMapping =`
			`llvm::to_vector(ArrayRef(allThreadMappings)`
			`.take_back(this->smallestBoundingTileSizes.size()));`
			`LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");`
			`}`

			`int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(`
			`int64_t desiredBitAlignment, int64_t numContiguousElements,`
			`int64_t elementalBitwidth) {`
			`assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&`
			`"elemental bitwidth does not divide kMaxVectorLoadBitWidth");`
			`assert(desiredBitAlignment % elementalBitwidth == 0 &&`
			`"elemental bitwidth does not divide desired bit alignment");`
			`return std::gcd(`
			`std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),`
			`kMaxVectorLoadBitWidth / elementalBitwidth);`
			`}`

			/// Get the list of all factors that divide `val`, not just the prime factors.
			`static SmallVector<int64_t> getFactors(int64_t val) {`
			`SmallVector<int64_t> factors;`
			`factors.reserve(val);`
			`for (int64_t factor = 1; factor <= val; ++factor) {`
			`if (val % factor != 0)`
			`continue;`
			`factors.push_back(factor);`
			`}`
			`factors.push_back(val);`
			`return factors;`
			`}`

			`static int64_t product(ArrayRef<int64_t> vals) {`
			`int64_t res = 1;`
			`for (auto val : vals)`
			`res *= val;`
			`return res;`
			`}`

			/// Extract `result` from `sizes` with the following constraints:
			`/// 1. sizes[i] % result[i] for all i`
			`/// 2. product_of_threadsPerDim <= maxNumThreads`
			/// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
			`/// must be sizes[currentIndex].`
			`/// This is used to greedily extract the maximum number of threads usable for`
			/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
			`/// ensuring coalesced access along the most minor dimension.`
			`/// Return the number of threads used in the range:`
			`/// threadsPerDim[currentIndex .. sizes.end()]`
			`// The implementation uses a dynamic programming approach to greedily extract`
			`// the best combination under the constraints.`
			`// TODO: Implementation details can be improved but putting effort there is a`
			// tradeoffs: `sizes` is expected to be of small rank and contain small values.
			`static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,`
			`int64_t currentIndex,`
			`int64_t maxNumThreads) {`
			`assert(static_cast<size_t>(currentIndex) < sizes.size() &&`
			`"currentIndex out of bounds");`
			`std::string indent(2 * currentIndex, '-');`
			`if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {`
			`LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);`
			`return SmallVector<int64_t>{sizes[currentIndex]};`
			`}`

			`int64_t best = 0;`
			`int64_t s = sizes[currentIndex];`
			`SmallVector<int64_t> factors = getFactors(s);`
			`SmallVector<int64_t> localThreadsPerDim;`
			`localThreadsPerDim.reserve(sizes.size());`
			`LDBG(indent << "maximizeNumThreads in " << s`
			`<< " with limit: " << maxNumThreads);`
			`for (auto factor : factors) {`
			`auto nestedThreadsPerDim =`
			`maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);`
			`int64_t localBest = factor * product(nestedThreadsPerDim);`
			`if (localBest > best && localBest <= maxNumThreads) {`
			`LDBG(indent << "new localBest: " << localBest);`
			`LLVM_DEBUG(`
			`llvm::interleaveComma(nestedThreadsPerDim,`
			`DBGS() << indent << "nestedThreadsPerDim: ");`
			`llvm::dbgs() << "\n";);`
			`localThreadsPerDim.clear();`
			`localThreadsPerDim.push_back(factor);`
			`llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);`
			`best = localBest;`
			`}`
			`}`

			`LDBG(indent << "found globalBest: " << best);`
			`LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,`
			`DBGS() << indent << "numThreads: ");`
			`llvm::dbgs() << "\n";);`

			`return localThreadsPerDim;`
			`}`

			`transform::gpu::CopyMappingInfo::Status`
			`transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,`
			`ArrayRef<int64_t> sizes,`
			`int64_t desiredVectorSize,`
			`bool favorPredication) {`

			`if (!favorPredication) {`
			`int64_t localVectorSize = desiredVectorSize;`
			`for (; localVectorSize >= 1; localVectorSize /= 2) {`
			`// Attempt to map the copy with predication and current fixed vector size:`
			`// 1. if the status is Success, we are done.`
			`// 2. if the status is Invalid, we fail immediately, no amount of`
			`// vector size reduction can offset the bad tile size selection from the`
			`// higher-level.`
			`// 3. if the status is RequiresPredication, we try again with a smaller`
			`// vector size.`
			`Status status =`
			`inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);`
			`if (status == Status::Success \|\| status == Status::Invalid)`
			`return status;`

			`LDBG("requires predication, try reducing vector size to "`
			`<< (localVectorSize / 2));`
			`}`
			`}`

			`// If we have not yet returned, it means that we have tried all vector sizes`
			`// and we still require predication. Restart from the original vector size and`
			`// do not attempt to`
			`return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);`
			`}`

			`transform::gpu::CopyMappingInfo::Status`
			`transform::gpu::CopyMappingInfo::inferNumThreadsImpl(`
			`int64_t totalNumThreads, ArrayRef<int64_t> sizes,`
			`int64_t desiredVectorSize) {`
			`assert(sizes.back() % desiredVectorSize == 0 &&`
			`"most-minor size not divisible by actualVectorSize");`

			`LDBG("inferNumThreadsImpl with totalNumThreads: "`
			`<< totalNumThreads << " and vectorSize: " << desiredVectorSize);`

			`// Scale the most minor size to account for the chosen vector size and`
			`// maximize the number of threads without exceeding the total number of`
			`// threads.`
			`SmallVector<int64_t> scaledSizes{sizes};`
			`scaledSizes.back() /= desiredVectorSize;`
			`if (scaledSizes.back() > totalNumThreads) {`
			`LDBG("--Too few threads given the required vector size -> FAIL");`
			`return Status::Invalid;`
			`}`
			`SmallVector<int64_t> inferredNumThreads =`
			`maximizeNumThreads(scaledSizes, 0, totalNumThreads);`

			`LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,`
			`DBGS() << "inferred numThreads: ");`
			`llvm::dbgs() << "\n";`
			`LDBG("computed actualVectorSize: " << desiredVectorSize););`

			`// Corner case: we cannot use more threads than available. If the dimension of`
			`// the copy is so bad it is because higher-level tiling did not do its job, we`
			`// do not try to recover from it here.`
			`int64_t totalNumThreadsUsed = product(inferredNumThreads);`
			`LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);`
			`if (totalNumThreadsUsed == 0 \|\| totalNumThreadsUsed > totalNumThreads) {`
			`LDBG("--Too few threads given the required vector size -> FAIL");`
			`return Status::Invalid;`
			`}`

			`this->vectorSize = desiredVectorSize;`
			`this->numThreads = inferredNumThreads;`
			`if (totalNumThreadsUsed == totalNumThreads)`
			`return Status::Success;`

			`return Status::RequiresPredication;`
			`}`

			`void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {`
			`os << "MappingInfo{";`
			`os << "CopyMappingInfo: ";`
			`os << "valid: " << (status != Status::Invalid) << ", ";`
			`os << "vectorSize: " << vectorSize << ", ";`
			`llvm::interleaveComma(numThreads, os << ", numThreads: {");`
			`llvm::interleaveComma(smallestBoundingTileSizes,`
			`os << "}, smallestBoundingTileSizes: {");`
			`llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");`
			`os << "}}";`
			`}`