//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
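///
/// For example (an illustrative MIR sketch; register names are hypothetical):
///
///   %d:_(s32) = G_FCONSTANT float 1.0
///   G_STORE %d(s32), %p(p0)
/// becomes
///   %d:_(s32) = G_CONSTANT i32 1065353216
///   G_STORE %d(s32), %p(p0)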
bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
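///
/// A sketch of the rewrite (register names are illustrative):
///
///   %t:_(s32) = G_TRUNC %x:_(s64)
///   %c:_(s1) = G_ICMP intpred(eq), %t(s32), %zero32
/// becomes
///   %zero64:_(s64) = G_CONSTANT i64 0
///   %c:_(s1) = G_ICMP intpred(eq), %x(s64), %zero64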
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

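  // The truncation is only redundant for an equality compare with zero when
  // every bit dropped by the G_TRUNC is a copy of the sign bit, i.e. the wide
  // value has strictly more known sign bits than the number of dropped bits.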
  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
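  // Emit the compensating -min_cst G_PTR_ADD immediately after the (rewritten)
  // G_GLOBAL_VALUE instruction.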
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
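//
// For example (an illustrative sketch with <16 x s8> sources):
//   %e1 = G_ZEXT %x:_(<16 x s8>) to <16 x s32>
//   %e2 = G_ZEXT %y:_(<16 x s8>) to <16 x s32>
//   %m  = G_MUL %e1, %e2
//   %r  = G_VECREDUCE_ADD %m
// becomes a G_UDOT of %x and %y into a <4 x s32> accumulator of zeroes,
// followed by a G_VECREDUCE_ADD of that accumulator.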
bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  LLT SrcTy;
  auto I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    // If the result of the G_MUL has more than one use, there is no point in
    // creating a udot instruction.
    if (!MRI.hasOneNonDBGUse(MidReg))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    I1Opc = ExtMI1->getOpcode();
    SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
  } else {
    SrcTy = MRI.getType(I1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = 0;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &Builder,
                            GISelChangeObserver &Observer,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, create a vector of 1s as the second
  // source register: the dot product of x with an all-ones vector is simply
  // the sum of the elements of x.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the last v8i8 vector with 0s up to a v16i8.
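    // Padding with zeroes is safe here: the extra lanes contribute 0 * 0 = 0
    // to the dot product, so they do not change the final sum. For example, a
    // <24 x s8> source is split into one <16 x s8> part plus an <8 x s8>
    // leftover, and the leftover is widened to <16 x s8> with zeroes.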
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the corresponding size.
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
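//
// For example (an illustrative sketch):
//   %e = G_ZEXT %x:_(<8 x s16>) to <8 x s32>
//   %r:_(s32) = G_VECREDUCE_ADD %e
// becomes a G_UADDLV of %x whose sum lands in lane 0 of a <4 x s32> result,
// followed by an extract of that lane.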
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
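  // (e.g. a <32 x s8> source is split into two <16 x s8> halves, each summed
  // with its own {U/S}ADDLV, and the partial sums are added together below.)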
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and
    // collect the values in a small vector
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
      WorkingRegisters.push_back(LeftoverRegs[I]);
    }
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose scalar output size is always
    // double the source's scalar size
    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register addlvReg =
        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    //   i16 and i32 results use v4i32 registers
    //   i64 results use v2i64 registers
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {addlvReg, zeroReg})
                                .getReg(0);
    } else {
      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
    }
  }

  Register outReg;
  if (WorkingRegisters.size() > 1) {
    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    outReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the source's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {outReg});
  } else {
    B.buildCopy(DstReg, outReg);
  }

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        CombinerHelper &Helper, GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high-bits of the operands are 0. If
  // there's an overflow, then the 9th or 17th bit must be set, which can be
  // checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDO with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZEXts users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTree>();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm