//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//
#include "AMDGPUCustomBehaviour.h"
|
||
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||
|
#include "Utils/AMDGPUBaseInfo.h"
|
||
|
#include "TargetInfo/AMDGPUTargetInfo.h"
|
||
|
#include "llvm/MC/TargetRegistry.h"
|
||
|
#include "llvm/Support/WithColor.h"
|
||
|
|
||
|

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
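// For example, an `s_waitcnt vmcnt(0) lgkmcnt(0)` carries all of its counter
// thresholds in a single packed immediate; if that operand were dropped,
// computeWaitCnt() below would have nothing to decode. We therefore re-attach
// every register and immediate operand here so the analysis can recover them.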
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model it.
  // Set the counters to their max values to begin with.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;
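  // Note: the hard-coded values above are simply upper bounds for each counter
  // field. A per-target sketch (assuming the AMDGPU::get*cntBitMask helpers
  // from AMDGPUBaseInfo.h) could derive them from the subtarget instead:
  //   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  //   unsigned Vmcnt = AMDGPU::getVmcntBitMask(IV);
  //   unsigned Expcnt = AMDGPU::getExpcntBitMask(IV);
  //   unsigned Lgkmcnt = AMDGPU::getLgkmcntBitMask(IV);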

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;
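  // E.g., for an `s_waitcnt vmcnt(1)` (Vmcnt = 1) with three vmcnt-tagged
  // instructions still in flight (CurrVmcnt = 3), we stall at least until the
  // soonest of them retires; checkCustomHazard() is then re-run and
  // re-evaluates the remaining count.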

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // We should probably be checking for nullptr here, but I'm not sure how
    // to handle the case where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // The instruction is using a real register. Since we can't know what
      // value this register will have, we can't compute what the value of
      // this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    switch (Opcode) {
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
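    // The decoded values overwrite the max defaults set by handleWaitCnt().
    // Counters not named in the assembly are encoded at their max value,
    // which means "no wait"; e.g. `s_waitcnt vmcnt(0)` yields Vmcnt = 0 with
    // Expcnt and Lgkmcnt left at their maxima.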
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effect of this is
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
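  // For example, a flat load will be tagged with both LgkmCnt and VmCnt below
  // even if, with full memory-operand information, it could be proven to only
  // access VMEM.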
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
|
||
|
TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
|
||
|
createAMDGPUCustomBehaviour);
|
||
|
TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
|
||
|
createAMDGPUInstrPostProcess);
|
||
|
|
||
|
TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
|
||
|
createAMDGPUCustomBehaviour);
|
||
|
TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
|
||
|
createAMDGPUInstrPostProcess);
|
||
|
}
|
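
// A quick way to exercise this behaviour (assuming a build with the AMDGPU
// target enabled) is:
//   llvm-mca -mtriple=amdgcn -mcpu=gfx1010 input.s
// llvm-mca's -disable-cb flag falls back to the generic CustomBehaviour,
// which is handy for comparing timelines with and without the s_waitcnt
// modelling.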