357 lines
13 KiB
C++
357 lines
13 KiB
C++
//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "ParallelSnippetGenerator.h"
|
|
|
|
#include "BenchmarkRunner.h"
|
|
#include "MCInstrDescView.h"
|
|
#include "Target.h"
|
|
|
|
// FIXME: Load constants into registers (e.g. with fld1) to not break
|
|
// instructions like x87.
|
|
|
|
// Ideally we would like the only limitation on executing instructions to be the
|
|
// availability of the CPU resources (e.g. execution ports) needed to execute
|
|
// them, instead of the availability of their data dependencies.
|
|
|
|
// To achieve that, one approach is to generate instructions that do not have
|
|
// data dependencies between them.
|
|
//
|
|
// For some instructions, this is trivial:
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// For the above snippet, haswell just renames rax four times and executes the
|
|
// four instructions two at a time on P23 and P0126.
|
|
//
|
|
// For some instructions, we just need to make sure that the source is
|
|
// different from the destination. For example, IDIV8r reads from GPR and
|
|
// writes to AX. We just need to ensure that the Var is assigned a
|
|
// register which is different from AX:
|
|
// idiv bx
|
|
// idiv bx
|
|
// idiv bx
|
|
// idiv bx
|
|
// The above snippet will be able to fully saturate the ports, while the same
|
|
// with ax would issue one uop every `latency(IDIV8r)` cycles.
|
|
//
|
|
// Some instructions make this harder because they both read and write from
|
|
// the same register:
|
|
// inc rax
|
|
// inc rax
|
|
// inc rax
|
|
// inc rax
|
|
// This has a data dependency from each instruction to the next, limiting the
|
|
// number of instructions that can be issued in parallel.
|
|
// It turns out that this is not a big issue on recent Intel CPUs because they
|
|
// have heuristics to balance port pressure. In the snippet above, subsequent
|
|
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
|
|
// might end up executing them all on P0 (just because they can), or try
|
|
// avoiding P5 because it's usually under high pressure from vector
|
|
// instructions.
|
|
// This issue is even more important for high-latency instructions because
|
|
// they increase the idle time of the CPU, e.g. :
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
//
|
|
// To avoid that, we do the renaming statically by generating as many
|
|
// independent exclusive assignments as possible (until all possible registers
|
|
// are exhausted) e.g.:
|
|
// imul rax, rbx
|
|
// imul rcx, rbx
|
|
// imul rdx, rbx
|
|
// imul r8, rbx
|
|
//
|
|
// Some instructions even make the above static renaming impossible because
|
|
// they implicitly read and write from the same operand, e.g. ADC16rr reads
|
|
// and writes from EFLAGS.
|
|
// In that case we just use a greedy register assignment and hope for the
|
|
// best.
|
|
|
|
namespace llvm {
|
|
namespace exegesis {
|
|
|
|
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
|
|
SmallVector<const Variable *, 8> Result;
|
|
for (const auto &Var : Instr.Variables)
|
|
if (Var.hasTiedOperands())
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Defaulted out-of-line (class definition lives in the header; presumably
// this anchors the vtable in this TU — confirm against the header).
ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
|
|
|
|
void ParallelSnippetGenerator::instantiateMemoryOperands(
|
|
const unsigned ScratchSpacePointerInReg,
|
|
std::vector<InstructionTemplate> &Instructions) const {
|
|
if (ScratchSpacePointerInReg == 0)
|
|
return; // no memory operands.
|
|
const auto &ET = State.getExegesisTarget();
|
|
const unsigned MemStep = ET.getMaxMemoryAccessSize();
|
|
const size_t OriginalInstructionsSize = Instructions.size();
|
|
size_t I = 0;
|
|
for (InstructionTemplate &IT : Instructions) {
|
|
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
|
|
++I;
|
|
}
|
|
|
|
while (Instructions.size() < kMinNumDifferentAddresses) {
|
|
InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
|
|
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
|
|
++I;
|
|
Instructions.push_back(std::move(IT));
|
|
}
|
|
assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
|
|
"not enough scratch space");
|
|
}
|
|
|
|
// How registers are assigned to the explicit register operands of the
// instructions in a generated snippet.
enum class RegRandomizationStrategy : uint8_t {
  // Pick a random admissible register for every operand of every instruction.
  PickRandomRegs,
  // Pick one register per operand position and reuse it for that position in
  // every generated instruction.
  SingleStaticRegPerOperand,
  // Reuse a single register for all use positions of all instructions.
  SingleStaticReg,

  // Markers so the enum can be iterated (see enum_iteration_traits below).
  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};
|
|
|
|
} // namespace exegesis
|
|
|
|
// Mark RegRandomizationStrategy as an iterable enum (bounded by its FIRST and
// LAST markers) so generic enum-range helpers can iterate over all strategies.
// This trait must be specialized in namespace llvm.
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};
|
|
|
|
namespace exegesis {
|
|
|
|
// Returns a human-readable name for the strategy; used verbatim when building
// the CodeTemplate::Info string in generateCodeTemplates().
const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
}
|
|
|
|
// Picks a register for the explicit register operand \p Op of \p IT while
// avoiding overlap between defs and uses already handed out (tracked in
// \p Defs / \p Uses), so the generated instructions stay data-independent.
// Returns one of:
//  - std::nullopt_t : no admissible register is left;
//  - MCOperand      : reuse the value this operand received in a previously
//                     generated instruction (static strategies);
//  - Register       : a freshly chosen register.
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  // Callers only pass explicit, non-memory register operands that have not
  // been assigned a value yet.
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break; // Fall through to the random selection below.
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      // Static strategies: reuse whatever this operand position received in
      // the first generated instruction.
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      // SingleStaticReg on the first instruction: prefer a register aliasing
      // one we already used, so all use positions can share one register.
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  // Start from every register this operand can encode, minus the registers we
  // must never touch.
  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    // A def must not alias an implicit use, nor any explicit use picked so
    // far, or we would create a serializing dependency.
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using same reg for multiple Use's is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      // ... but this strategy wants a distinct register per operand position,
      // so also exclude everything already used.
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, important bit: if we have used some register for def,
    // then we can not use that same register for *any* use,
    // be it either an untied use, or an use tied to a def.
    // But def-ing same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}
|
|
|
|
// Builds one instruction from \p IT by assigning a register to every explicit
// register operand that does not already have a value, using strategy \p S.
// \p Uses and \p Defs record registers handed out so far and are updated with
// every freshly picked register. Returns std::nullopt as soon as one operand
// cannot be assigned (register pool exhausted).
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    // Skip anything that is not an explicit register operand, and operands
    // that already carry a value.
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    // No register left for this operand: give up on the whole instruction.
    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      // Reuse of a previously assigned operand value.
      MCOp = std::get<MCOperand>(R);
    else {
      // Fresh register: record it so later operands/instructions avoid it.
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}
|
|
|
|
// Generates as many data-independent copies of \p IT as the register file
// allows under strategy \p S: defs never alias uses (explicit or implicit),
// so the instructions can execute in parallel. Returns an empty vector when
// even a single instruction could not be instantiated.
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  const auto &RegInfo = State.getRegInfo();
  const unsigned NumRegs = RegInfo.getNumRegs();

  // We don't want to accidentally serialize the instruction, so we must be
  // sure that we don't pick a def that is an implicit use, or a use that is
  // an implicit def; record the implicit registers up front.
  BitVector ImplicitUses(NumRegs);
  BitVector ImplicitDefs(NumRegs);
  for (const Operand &Op : IT.getInstr().Operands) {
    if (!Op.isReg() || !Op.isImplicit() || Op.isMemory())
      continue;
    assert(Op.isImplicitReg() && "Not an implicit register operand?");
    if (Op.isUse())
      ImplicitUses.set(Op.getImplicitReg());
    else {
      assert(Op.isDef() && "Not a use and not a def?");
      ImplicitDefs.set(Op.getImplicitReg());
    }
  }
  const BitVector ImplicitUseAliases = getAliasedBits(RegInfo, ImplicitUses);
  const BitVector ImplicitDefAliases = getAliasedBits(RegInfo, ImplicitDefs);

  // Registers handed out so far; updated by the callee on every iteration.
  BitVector Defs(NumRegs);
  BitVector Uses(NumRegs);
  std::vector<InstructionTemplate> Snippet;

  for (;;) {
    std::optional<InstructionTemplate> NextIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Snippet, S);
    // Register pool exhausted: return whatever we produced (maybe nothing).
    if (!NextIT)
      return Snippet;
    Snippet.push_back(std::move(*NextIT));
    // Without tied variables a single instruction suffices; otherwise keep
    // emitting statically-renamed copies until registers run out.
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Snippet;
    assert(Snippet.size() <= 128 && "Stuck in endless loop?");
  }
}
|
|
|
|
// Produces one CodeTemplate per applicable register-randomization strategy for
// \p Variant, such that the snippet's instructions are as data-independent as
// possible. Returns an error if a strategy fails to produce any instruction.
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  // Instructions with memory operands access disjoint offsets of the scratch
  // space through this register; 0 means no memory operands.
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  // If the instruction never aliases itself (already parallel) or always
  // aliases itself through implicit operands (inherently serial, nothing we
  // can do about it), a single randomly-instantiated copy is the best we can
  // produce. The two cases previously had duplicated bodies; they differ only
  // in the Info message.
  if (SelfAliasing.empty() || SelfAliasing.hasImplicitAliasing()) {
    CT.Info = SelfAliasing.empty()
                  ? "instruction is parallel, repeating a random one."
                  : "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  // Choose which strategies are worth trying for this instruction; the
  // fully-static strategy is always applicable.
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    // A strategy that yields nothing is an error: callers expect at least one
    // instruction per template.
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}
|
|
|
|
// Out-of-line definition of the static constexpr member, required for
// ODR-uses of the constant in pre-C++17 inline-variable semantics.
constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
|
|
|
|
} // namespace exegesis
|
|
} // namespace llvm
|