# This script generates all variants of wmma builtins, verifies that clang calls
# correct LLVM intrinsics, and checks that availability of specific builtins is
# constrained by the correct PTX version and the target GPU variant.

# Dummy test run to avoid lit warnings.
# RUN: echo "This is not a real test. It's a generator for builtins-nvpts-mma.cu" >/dev/null

from __future__ import print_function

import argparse
from collections import defaultdict
from itertools import product
from string import Template


class MMAFrag:
    def __init__(self, geom, frag, ptx_elt_type):
        self.geom = geom
        self.frag = frag
        self.ptx_type = ptx_elt_type

    def __repr__(self):
        return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)


class MMAOp:
    def __init__(self, a, b, c, d, b1op=""):
        self.a = a
        self.b = b
        self.c = c
        self.d = d
        self.b1op = b1op

    def __repr__(self):
        return "{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d)


def make_mma_ops(geoms, types_a, types_b, types_c, types_d, b1ops=None):
    ops = []
    if b1ops is None:
        b1ops = [""]
    for geom, type_a, type_c in product(geoms, types_a, types_c):
        for type_b, type_d in product(
            types_b if types_b else [type_a], types_d if types_d else [type_c]
        ):
            ops += [
                MMAOp(
                    MMAFrag(geom, "a", type_a),
                    MMAFrag(geom, "b", type_b),
                    MMAFrag(geom, "c", type_c),
                    MMAFrag(geom, "d", type_d),
                    b1op,
                )
                for b1op in b1ops
            ]
    return ops


def make_ldst_ops(geoms, frags, types):
    return [
        MMAFrag(geom, frag, ptx_type)
        for (geom, frag, ptx_type) in product(geoms, frags, types)
    ]


def get_mma_ops():
    return (
        make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], [])
        + make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"], ["bf16"], [], ["f32"], [])
        + make_mma_ops(["m8n8k4"], ["f64"], [], ["f64"], [])
        + make_mma_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["f16"],
            [],
            ["f16", "f32"],
            ["f16", "f32"],
        )
        + make_mma_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"], ["s8", "u8"], [], ["s32"], []
        )
        + make_mma_ops(["m8n8k32"], ["s4", "u4"], [], ["s32"], [])
        + make_mma_ops(
            ["m8n8k128"], ["b1"], [], ["s32"], [], [".xor.popc", ".and.popc"]
        )
    )


def get_ldst_ops():
    # NOTE: fragemts are from the point of view of PTX.
    # fragment `d` is only for store ops, others for both loads and stores.
    return (
        make_ldst_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["a", "b"],
            ["f16", "u8", "s8", "bf16"],
        )
        + make_ldst_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"], ["c", "d"], ["f16", "f32", "s32"]
        )
        + make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"])
        + make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"])
        + make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"])
        + make_ldst_ops(["m8n8k4"], ["a", "b", "c", "d"], ["f64"])
        +
        # TF32 m16n16k8 is odd.
        # For fragment 'C' it uses __mma_*tf32*_m16n16k8_ld_c
        # but 'D' calls __mma_m16n16k8_st_c_*f32*.
        make_ldst_ops(["m16n16k8"], ["a", "b", "c"], ["tf32"])
        + make_ldst_ops(["m16n16k8"], ["d"], ["f32"])
    )


def is_geom_supported(geom):
    # geometries for FP and ints.
    if geom in ["m8n32k16", "m32n8k16"]:
        return ptx_version >= 61
    # geometries for sub-ints.
    if geom in ["m8n8k32", "m8n8k128"]:
        return ptx_version >= 63 and gpu_arch >= 75
    if geom == "m16n16k16":
        return ptx_version >= 60
    if geom in ["m16n16k8", "m8n8k4"]:
        return ptx_version >= 70 and gpu_arch >= 80
    assert False  # Unexpected geometry.


def is_type_supported(ptx_type):
    if ptx_type in ["s8", "u8", "s32"]:
        return ptx_version >= 63 and gpu_arch >= 72
    if ptx_type in ["s4", "u4", "b1"]:
        return ptx_version >= 63 and gpu_arch >= 75
    if ptx_type in ["bf16", "tf32", "f64"]:
        return ptx_version >= 70 and gpu_arch >= 80
    return ptx_version >= 60 and gpu_arch >= 70


def is_rnd_supported(op):
    # rnd is only supported for FP64 WMMA
    return op.a.ptx_type == "f64"


def is_mma_variant_supported(op, layout_a, layout_b, satf):
    if not (is_type_supported(op.a.ptx_type) and is_geom_supported(op.a.geom)):
        return False

    if satf and not op.a.ptx_type in ["f16", "s8", "u8", "s4", "u4"]:
        return False

    # sub-integer types require row/col layout.
    if op.a.ptx_type in ["s4", "u4", "b1"]:
        return layout_a == "row" and layout_b == "col"
    return True


def is_ldst_variant_supported(frag, layout):
    if not (is_type_supported(frag.ptx_type) and is_geom_supported(frag.geom)):
        return False
    if frag.ptx_type in ["s4", "u4", "b1"]:
        # sub-integer types require sm_75 and ptx63, row/col layout for a/b.
        return (
            (frag.frag == "a" and layout == "row")
            or (frag.frag == "b" and layout == "col")
            or frag.frag in ["c", "d"]
        )
    return True


def get_builtin_prefix(frag):
    prefix = None
    if frag.geom in ["m16n16k16", "m32n8k16", "m8n32k16"]:
        if frag.ptx_type in ["f16", "f32"]:
            prefix = "__hmma"
        elif frag.ptx_type == "bf16":
            prefix = "__mma_bf16"
        else:
            prefix = "__imma"
    elif frag.geom == "m8n8k32":
        prefix = "__imma"  # sub-integers
    elif frag.geom == "m8n8k128":
        prefix = "__bmma"
    elif frag.geom == "m8n8k4":
        prefix = "__dmma"
    elif frag.geom == "m16n16k8":
        if frag.ptx_type == "f32":
            prefix = "__mma"
        else:
            prefix = "__mma_tf32"
    assert prefix
    return prefix


def get_ldst_builtin_name(frag):
    prefix = get_builtin_prefix(frag)

    if prefix == "__hmma":
        suffix = "" if frag.frag in ["a", "b"] else frag.ptx_type
    elif prefix in ["__dmma", "__mma_bf16", "__mma_tf32"]:
        suffix = "" if frag.frag in ["a", "b", "c"] else frag.ptx_type
    else:
        suffix = "" if frag.frag == "c" else frag.ptx_type
        if suffix == "s32":
            suffix = "i32"

    if frag.frag == "d":
        ifrag = "c"
        op = "st"
    else:
        ifrag = frag.frag
        op = "ld"

    name = "%s_%s_%s_%s%s" % (
        prefix,
        frag.geom,
        op,
        ifrag,
        "_" + suffix if suffix else "",
    )
    return name


def get_mma_builtin_name(op):
    prefix = get_builtin_prefix(op.a)

    if prefix == "__hmma":
        suffix = op.d.ptx_type + op.c.ptx_type
    elif prefix in ["__mma_bf16", "__mma_tf32"]:
        suffix = op.d.ptx_type
    else:
        suffix = op.a.ptx_type

    name = "{prefix}_{geom}_mma{b1op}_{suffix}".format(
        prefix=prefix, geom=op.a.geom, b1op=op.b1op.replace(".", "_"), suffix=suffix
    )
    return name


def get_required_sm(frag, b1op=""):
    if frag.ptx_type in ["f64", "bf16", "tf32"]:
        return 80
    if frag.ptx_type in ["u4", "s4", "b1"]:
        if b1op == ".and.popc":
            return 80
        return 75
    if frag.ptx_type in ["s8", "u8"]:
        return 72
    if frag.ptx_type == "s32":
        if frag.geom in ["m8n8k32", "m8n8k128"]:  # s4/u4/b1
            return 75
        else:  # s8/u8
            return 72
    if frag.ptx_type in ["f16", "f32"]:
        if frag.geom == "m16n16k8":
            return 80
        else:
            return 70
    assert False


def get_required_ptx(frag, b1op=""):
    if frag.ptx_type == "b1" and b1op == ".and.popc":
        return 71
    if frag.ptx_type in ["f64", "bf16", "tf32"]:
        return 70
    if frag.ptx_type in ["f16", "f32"]:
        if frag.geom == "m16n16k16":
            return 60
        if frag.geom == "m16n16k8":
            return 70
        return 61
    return 63


def get_src_dst_prefix(frag):
    if frag.ptx_type == "f32":
        return "f"
    if frag.ptx_type == "f64":
        return "d"
    if frag.ptx_type == "tf32" and frag.frag in ["c", "d"]:
        return "f"
    return ""


def gen_wmma_ldst_tests(results):
    load_template = """
  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
  // expected-error-re@+1 {{'${builtin}' needs target feature (sm_${min_sm}{{.*}},(ptx${min_ptx}{{.*}}}}
  ${builtin}(${dst}, ${src}, ldm, ${blayout});
""".rstrip()
    intrinsic_template = (
        "llvm.nvvm.wmma.${geom}.${op}.${frag}.${ilayout}.stride.${itype}"
    )

    for frag, layout in sorted(product(get_ldst_ops(), ["row", "col"]), key=str):

        if not is_ldst_variant_supported(frag, layout):
            continue

        src_dst_prefix = get_src_dst_prefix(frag)

        min_sm = get_required_sm(frag)
        min_ptx = get_required_ptx(frag)
        # TF32 uses f32 for accumulator loads.
        if frag.geom == "m16n16k8" and frag.frag == "c":
            assert frag.ptx_type == "tf32"
            itype = "f32"
        else:
            itype = frag.ptx_type

        params = {
            "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm),
            "builtin": get_ldst_builtin_name(frag),
            "min_ptx": min_ptx,
            "min_sm": min_sm,
            "dst": src_dst_prefix + "dst",
            "src": src_dst_prefix + "src",
            "blayout": 0 if layout == "row" else 1,
            "intrinsic": Template(intrinsic_template).substitute(
                {
                    "frag": frag.frag,
                    "geom": frag.geom,
                    "ilayout": layout,
                    "itype": itype,
                    "op": "store" if frag.frag == "d" else "load",
                }
            ),
        }
        results[(min_ptx, min_sm)] += Template(load_template).substitute(params)

    return results


def mma_signature(op):
    if op.a.ptx_type == "f16":
        # FP16 ops identified by accumulator & result type.
        return "%s.%s" % (op.d.ptx_type, op.c.ptx_type)
    else:
        # other ops are identified by input type.
        return op.a.ptx_type


# Get numeric value for rowcol parameter of the builtin
# AFAICT it uses the encoding accepted by NVVM intrinsics:
# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-matrix-mma
def get_ilayout(a, b):
    return {"row.row": 0, "row.col": 1, "col.row": 2, "col.col": 3}[a + "." + b]


def gen_wmma_mma_tests(results):
    mma_template = """
  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
  // expected-error-re@+1 {{'${builtin}' needs target feature (sm_${min_sm}{{.*}},(ptx${min_ptx}{{.*}}}}
  ${builtin}(${dst}, ${asrc}, ${asrc}, ${csrc}, ${ilayout}${maybe_satf});
""".rstrip()
    intrinsic_template = "llvm.nvvm.wmma.${geom}.mma${b1op}.${alayout}.${blayout}.${intrinsic_signature}${satf}"

    for op, alayout, blayout, satf in sorted(
        product(get_mma_ops(), ["row", "col"], ["row", "col"], [".satfinite", ""]),
        key=str,
    ):

        if not is_mma_variant_supported(op, alayout, blayout, satf):
            continue

        asrc_prefix = get_src_dst_prefix(op.a)
        csrc_prefix = get_src_dst_prefix(op.c)
        ddst_prefix = get_src_dst_prefix(op.d)
        if op.a.ptx_type == "b1":  # .b1 MMA has no satf argument.
            isatf_arg = ""
        else:
            isatf_arg = ", 1" if satf else ", 0"
        min_sm = get_required_sm(op.a, op.b1op)
        min_ptx = get_required_ptx(op.a, op.b1op)
        params = {
            "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm),
            "builtin": get_mma_builtin_name(op),
            "min_ptx": min_ptx,
            "min_sm": min_sm,
            "dst": ddst_prefix + "dst",
            "asrc": asrc_prefix + "src",
            "csrc": csrc_prefix + "src",
            "ilayout": get_ilayout(alayout, blayout),
            "maybe_satf": isatf_arg,
            "intrinsic": Template(intrinsic_template).substitute(
                {
                    "geom": op.a.geom,
                    "alayout": alayout,
                    "blayout": blayout,
                    "intrinsic_signature": mma_signature(op),
                    "satf": satf,
                    "b1op": op.b1op,
                }
            ),
        }
        results[(min_ptx, min_sm)] += Template(mma_template).substitute(params)

    return results


def gen_tests():
    results = gen_wmma_ldst_tests(defaultdict(str))
    results = gen_wmma_mma_tests(results)

    run_template = r"""
//
// *** DO NOT EDIT ***
//
//  This test has been automatically generated by
//  builtins-nvtx-mma.py --ptx=${ptx} --gpu-arch=${sm}
//
// Make sure we can handle all builtins available on sm_${sm} with PTX${ptx}
// ${run}: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_${sm} \
// ${run}:            -fcuda-is-device -target-feature +ptx${ptx} \
// ${run}:            -DPTX=${ptx} -DSM=${sm} \
// ${run}:            -S -emit-llvm -o - -x cuda %s \
// ${run}:   | FileCheck -check-prefixes=${check_labels} %s
// Verify that all builtins have correct constraints.
// ${run}: %clang_cc1 -triple nvptx-unknown-unknown \
// ${run}:   -target-cpu sm_60 -target-feature +ptx42 \
// ${run}:   -DPTX=${ptx} -DSM=${sm} -fcuda-is-device -S -o /dev/null -x cuda \
// ${run}:   -verify %s
"""

    def supported_variants(ptx, sm, results):
        return [(ptx_, sm_) for ptx_, sm_ in results if ptx_ <= ptx and sm_ <= sm]

    print(
        Template(run_template).substitute(
            {
                "run": "RUN",  # To avoid lit misinterpreting the template
                "ptx": ptx_version,
                "sm": gpu_arch,
                "check_labels": ",".join(
                    [
                        "CHECK_PTX%d_SM%d" % (ptx_, sm_)
                        for ptx_, sm_ in supported_variants(
                            ptx_version, gpu_arch, results
                        )
                    ]
                ),
            }
        )
    )

    print(
        """
#if !defined(CUDA_VERSION)
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

typedef unsigned long long uint64_t;
#endif

// CHECK-LABEL: test_wmma_buitins
__device__ void test_wmma_buitins(int *src, int *dst,
                                  float *fsrc, float *fdst,
                                  double *dsrc, double *ddst, int ldm) {
"""
    )

    for (ptx, sm), tests in sorted(results.items()):
        print()
        print("#if (PTX >= %d) && (SM >= %d)" % (ptx, sm))
        print(tests)
        print("#endif // (PTX >= %d) && (SM >= %d)" % (ptx, sm))

    print("}")


parser = argparse.ArgumentParser()
parser.add_argument("--ptx", type=int, default=60)
parser.add_argument("--gpu-arch", type=int, default=70)
args = parser.parse_args()
ptx_version = args.ptx
gpu_arch = args.gpu_arch

gen_tests()