//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to DAG nodes.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"

#define DEBUG_TYPE "x86-isel"

using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
/// the return registers.
static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
  switch (CC) {
  default:
    return false;
  case CallingConv::X86_RegCall:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
    return true;
  }
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
/// the parameters.
static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
  return CC == CallingConv::X86_RegCall;
}

static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
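
// Example (not exhaustive, added annotation): with AVX512 but a non-RegCall
// convention, a v16i1 argument is passed as a single v16i8 in an XMM
// register, while a v64i1 argument on an AVX512BW target that prefers
// 256-bit registers is split into two v32i8 values.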

MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    return getRegisterTypeForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  if (VT == MVT::bf16)
    return MVT::f16;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
  // x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    return getNumRegistersForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    VT = VT.changeVectorElementType(MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
    }
  }

  return VT.changeVectorElementTypeToInteger();
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    Align TyAlign = DL.getABITypeAlign(Ty);
    if (TyAlign > 8)
      return TyAlign.value();
    return 8;
  }

  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Alignment);
  return Alignment.value();
}
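
// For example (added annotation): on 32-bit x86 with SSE, a byval struct that
// contains a 128-bit vector member is placed on a 16-byte boundary, whereas a
// byval struct of plain integers stays at the default 4-byte boundary.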

/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
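
// For example (added annotation): a 32-byte zero memset on an AVX target
// where 256-bit ops are cheap is lowered with v32i8 stores; with only SSE2 it
// uses v16i8, and a 32-bit SSE2 target with slow unaligned 16-byte accesses
// falls back to f64 (or plain i32 without SSE2).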

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return Subtarget.hasSSE1();
  if (VT == MVT::f64)
    return Subtarget.hasSSE2();
  return true;
}

static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
  return (8 * Alignment.value()) % SizeInBits == 0;
}

bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
    // TODO: What about AVX-512 (512-bit) accesses?
  }
}

bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if its less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large)
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      IRB.getPtrTy(AddressSpace));
}

Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // Specially, some users may customize the base reg and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If we don't set -stack-protector-guard-offset value:
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use symbol guard if user specify.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
      if (!GV) {
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
                                       : Type::getInt32Ty(M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB);
}
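
// For example (added annotation): with default settings on x86-64 Linux
// (glibc) the guard returned here is the TLS slot %fs:0x28, expressed in IR
// as an inttoptr of 0x28 in address space 257; on i386 it is %gs:0x14 in
// address space 256.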

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        PointerType::getUnqual(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        PointerType::getUnqual(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(0, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // %gs:0x24 on i386
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
  // tests at the moment, which is not what we expected.
  static const MCPhysReg RCRegs[] = {X86::MXCSR};
  return RCRegs;
}

/// Lowers masks values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast: v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8 -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
}

/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
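
// For example (added annotation): on a 32-bit AVX512BW target a v64i1 mask
// passed in registers is bitcast to i64 here and split into two i32 halves
// that occupy the two consecutive register locations described by VA and
// NextVA.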

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used as return registers (preserve_* and X86's
  // regcall) or for argument passing (X86's regcall).
  bool ShouldDisableCalleeSavedRegister =
      shouldDisableRetRegFromCSR(CallConv) ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2 is
      // not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  SDValue Glue;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    Register RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
    // this however for preserve_most/preserve_all to minimize the number of
    // callee-saved registers for these CCs.
    if (ShouldDisableCalleeSavedRegister &&
        CallConv != CallingConv::PreserveAll &&
        CallConv != CallingConv::PreserveMost)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (const SDNode *U : Copy->uses()) {
    if (U->getOpcode() != X86ISD::RET_GLUE)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    if (U->getNumOperands() > 4)
      return false;
    if (U->getNumOperands() == 4 &&
        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    // (PR26665).
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that need to be assigned.
/// \param NextVA The next 32 bit value that need to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
///                        glue purposes. In the case the DAG is already using
///                        physical register instead of virtual, we should glue
///                        our new SDValue to InGlue SDvalue.
/// \return a new SDvalue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &DL, const X86Subtarget &Subtarget,
                                SDValue *InGlue = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InGlue) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node contains the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &DL,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // In 32 bit machine, this case is handled by getv64i1Argument
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // In 64 bit machine, There is no need to truncate the value only bitcast
  } else {
    MVT MaskLenVT;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      MaskLenVT = MVT::i8;
      break;
    case MVT::v16i1:
      MaskLenVT = MVT::i16;
      break;
    case MVT::v32i1:
      MaskLenVT = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
                  .getValue(1);
      Val = Chain.getValue(0);
      InGlue = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// StdCall calling convention seems to be standard for many Windows' API
// routines and around. It differs from C calling convention just a little:
// callee should clean up the stack, not caller. Symbols should be also
// decorated in some fancy way :) It doesn't support any vector arguments.
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.

/// Determines whether Args, either a set of outgoing arguments to a call, or a
/// set of incoming args of a call, contains an sret pointer that the callee
/// pops
template <typename T>
static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
                             const X86Subtarget &Subtarget) {
  // Not C++20 (yet), so no concepts available.
  static_assert(std::is_same_v<T, ISD::OutputArg> ||
                    std::is_same_v<T, ISD::InputArg>,
                "requires ISD::OutputArg or ISD::InputArg");

  // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
  // for most compilations.
  if (!Subtarget.is32Bit())
    return false;

  if (Args.empty())
    return false;

  // Most calls do not have an sret argument, check the arg next.
  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
  if (!Flags.isSRet() || Flags.isInReg())
    return false;

  // The MSVCabi does not pop the sret.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return false;

  // MCUs don't pop the sret
  if (Subtarget.isTargetMCU())
    return false;

  // Callee pops argument
  return true;
}
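
// For example (added annotation): a 32-bit Linux (non-MSVC, non-MCU) call
// that returns a struct through a hidden sret pointer has the callee pop that
// pointer, so the caller must not also adjust the stack for it; on any 64-bit
// target this returns false.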
|
||
|
|
||
|
/// Make a copy of an aggregate at address specified by "Src" to address
|
||
|
/// "Dst" with size and alignment information specified by the specific
|
||
|
/// parameter attribute. The copy will be passed as a byval function parameter.
|
||
|
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
|
||
|
SDValue Chain, ISD::ArgFlagsTy Flags,
|
||
|
SelectionDAG &DAG, const SDLoc &dl) {
|
||
|
SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
|
||
|
|
||
|
return DAG.getMemcpy(
|
||
|
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
|
||
|
/*isVolatile*/ false, /*AlwaysInline=*/true,
|
||
|
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
|
||
|
}
|
||
|
|
||
|
/// Return true if the calling convention is one that we can guarantee TCO for.
|
||
|
static bool canGuaranteeTCO(CallingConv::ID CC) {
|
||
|
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
|
||
|
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
|
||
|
CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
|
||
|
}
|
||
|
|
||
|
/// Return true if we might ever do TCO for calls with this calling convention.
|
||
|
static bool mayTailCallThisCC(CallingConv::ID CC) {
|
||
|
switch (CC) {
|
||
|
// C calling conventions:
|
||
|
case CallingConv::C:
|
||
|
case CallingConv::Win64:
|
||
|
case CallingConv::X86_64_SysV:
|
||
|
// Callee pop conventions:
|
||
|
case CallingConv::X86_ThisCall:
|
||
|
case CallingConv::X86_StdCall:
|
||
|
case CallingConv::X86_VectorCall:
|
||
|
case CallingConv::X86_FastCall:
|
||
|
// Swift:
|
||
|
case CallingConv::Swift:
|
||
|
return true;
|
||
|
default:
|
||
|
return canGuaranteeTCO(CC);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Return true if the function is being made into a tailcall target by
|
||
|
/// changing its ABI.
|
||
|
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
|
||
|
return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
|
||
|
CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
|
||
|
}
|
||
|
|
||
|
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
|
||
|
if (!CI->isTailCall())
|
||
|
return false;
|
||
|
|
||
|
CallingConv::ID CalleeCC = CI->getCallingConv();
|
||
|
if (!mayTailCallThisCC(CalleeCC))
|
||
|
return false;
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
SDValue
|
||
|
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
|
||
|
const SmallVectorImpl<ISD::InputArg> &Ins,
|
||
|
const SDLoc &dl, SelectionDAG &DAG,
|
||
|
const CCValAssign &VA,
|
||
|
MachineFrameInfo &MFI, unsigned i) const {
|
||
|
// Create the nodes corresponding to a load from this parameter slot.
|
||
|
ISD::ArgFlagsTy Flags = Ins[i].Flags;
|
||
|
bool AlwaysUseMutable = shouldGuaranteeTCO(
|
||
|
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
|
||
|
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
|
||
|
EVT ValVT;
|
||
|
MVT PtrVT = getPointerTy(DAG.getDataLayout());
|
||
|
|
||
|
// If value is passed by pointer we have address passed instead of the value
|
||
|
// itself. No need to extend if the mask value and location share the same
|
||
|
// absolute size.
|
||
|
bool ExtendedInMem =
|
||
|
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
|
||
|
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
|
||
|
|
||
|
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
|
||
|
ValVT = VA.getLocVT();
|
||
|
else
|
||
|
ValVT = VA.getValVT();
|
||
|
|
||
|
// FIXME: For now, all byval parameter objects are marked mutable. This can be
|
||
|
// changed with more analysis.
|
||
|
// In case of tail call optimization mark all arguments mutable. Since they
|
||
|
// could be overwritten by lowering of arguments in case of a tail call.
|
||
|
if (Flags.isByVal()) {
|
||
|
unsigned Bytes = Flags.getByValSize();
|
||
|
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
|
||
|
|
||
|
// FIXME: For now, all byval parameter objects are marked as aliasing. This
|
||
|
// can be improved with deeper analysis.
|
||
|
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
|
||
|
/*isAliased=*/true);
|
||
|
return DAG.getFrameIndex(FI, PtrVT);
|
||
|
}
|
||
|
|
||
|
EVT ArgVT = Ins[i].ArgVT;
|
||
|
|
||
|
// If this is a vector that has been split into multiple parts, don't elide
|
||
|
// the copy. The layout on the stack may not match the packed in-memory
|
||
|
// layout.
|
||
|
bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
|
||
|
|
||
|
// This is an argument in memory. We might be able to perform copy elision.
|
||
|
// If the argument is passed directly in memory without any extension, then we
|
||
|
// can perform copy elision. Large vector types, for example, may be passed
|
||
|
// indirectly by pointer.
|
||
|
if (Flags.isCopyElisionCandidate() &&
|
||
|
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
|
||
|
!ScalarizedVector) {
|
||
|
SDValue PartAddr;
|
||
|
if (Ins[i].PartOffset == 0) {
|
||
|
// If this is a one-part value or the first part of a multi-part value,
|
||
|
// create a stack object for the entire argument value type and return a
|
||
|
// load from our portion of it. This assumes that if the first part of an
|
||
|
// argument is in memory, the rest will also be in memory.
|
||
|
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
|
||
|
/*IsImmutable=*/false);
|
||
|
PartAddr = DAG.getFrameIndex(FI, PtrVT);
|
||
|
return DAG.getLoad(
|
||
|
ValVT, dl, Chain, PartAddr,
|
||
|
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
|
||
|
}
|
||
|
|
||
|
// This is not the first piece of an argument in memory. See if there is
|
||
|
// already a fixed stack object including this offset. If so, assume it
|
||
|
// was created by the PartOffset == 0 branch above and create a load from
|
||
|
// the appropriate offset into it.
|
||
|
int64_t PartBegin = VA.getLocMemOffset();
|
||
|
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
|
||
|
int FI = MFI.getObjectIndexBegin();
|
||
|
for (; MFI.isFixedObjectIndex(FI); ++FI) {
|
||
|
int64_t ObjBegin = MFI.getObjectOffset(FI);
|
||
|
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
|
||
|
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
|
||
|
break;
|
||
|
}
|
||
|
if (MFI.isFixedObjectIndex(FI)) {
|
||
|
SDValue Addr =
|
||
|
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
|
||
|
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
|
||
|
return DAG.getLoad(ValVT, dl, Chain, Addr,
|
||
|
MachinePointerInfo::getFixedStack(
|
||
|
DAG.getMachineFunction(), FI, Ins[i].PartOffset));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
|
||
|
VA.getLocMemOffset(), isImmutable);
|
||
|
|
||
|
// Set SExt or ZExt flag.
|
||
|
if (VA.getLocInfo() == CCValAssign::ZExt) {
|
||
|
MFI.setObjectZExt(FI, true);
|
||
|
} else if (VA.getLocInfo() == CCValAssign::SExt) {
|
||
|
MFI.setObjectSExt(FI, true);
|
||
|
}
|
||
|
|
||
|
MaybeAlign Alignment;
|
||
|
if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
|
||
|
ValVT != MVT::f80)
|
||
|
Alignment = MaybeAlign(4);
|
||
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
||
|
SDValue Val = DAG.getLoad(
|
||
|
ValVT, dl, Chain, FIN,
|
||
|
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
|
||
|
Alignment);
|
||
|
return ExtendedInMem
|
||
|
? (VA.getValVT().isVector()
|
||
|
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
|
||
|
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
|
||
|
: Val;
|
||
|
}
|
||
|
|
||
|
// FIXME: Get this from tablegen.
|
||
|
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
|
||
|
const X86Subtarget &Subtarget) {
|
||
|
assert(Subtarget.is64Bit());
|
||
|
|
||
|
if (Subtarget.isCallingConvWin64(CallConv)) {
|
||
|
static const MCPhysReg GPR64ArgRegsWin64[] = {
|
||
|
X86::RCX, X86::RDX, X86::R8, X86::R9
|
||
|
};
|
||
|
return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
|
||
|
}
|
||
|
|
||
|
static const MCPhysReg GPR64ArgRegs64Bit[] = {
|
||
|
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
|
||
|
};
|
||
|
return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
|
||
|
}
|
||
|
|
||
|
// FIXME: Get this from tablegen.
|
||
|
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
|
||
|
CallingConv::ID CallConv,
|
||
|
const X86Subtarget &Subtarget) {
|
||
|
assert(Subtarget.is64Bit());
|
||
|
if (Subtarget.isCallingConvWin64(CallConv)) {
|
||
|
    // The XMM registers which might contain var arg parameters are shadowed
    // by their paired GPRs. So we only need to save the GPRs to their home
    // slots.
|
||
|
// TODO: __vectorcall will change this.
|
||
|
return std::nullopt;
|
||
|
}
|
||
|
|
||
|
bool isSoftFloat = Subtarget.useSoftFloat();
|
||
|
if (isSoftFloat || !Subtarget.hasSSE1())
|
||
|
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
|
||
|
// registers.
|
||
|
return std::nullopt;
|
||
|
|
||
|
static const MCPhysReg XMMArgRegs64Bit[] = {
|
||
|
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
|
||
|
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
|
||
|
};
|
||
|
return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
|
||
|
}
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
|
||
|
return llvm::is_sorted(
|
||
|
ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
|
||
|
return A.getValNo() < B.getValNo();
|
||
|
});
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
namespace {
|
||
|
/// This is a helper class for lowering varargs parameters.
|
||
|
class VarArgsLoweringHelper {
|
||
|
public:
|
||
|
VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
|
||
|
SelectionDAG &DAG, const X86Subtarget &Subtarget,
|
||
|
CallingConv::ID CallConv, CCState &CCInfo)
|
||
|
: FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
|
||
|
TheMachineFunction(DAG.getMachineFunction()),
|
||
|
TheFunction(TheMachineFunction.getFunction()),
|
||
|
FrameInfo(TheMachineFunction.getFrameInfo()),
|
||
|
FrameLowering(*Subtarget.getFrameLowering()),
|
||
|
TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
|
||
|
CCInfo(CCInfo) {}
|
||
|
|
||
|
// Lower variable arguments parameters.
|
||
|
void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
|
||
|
|
||
|
private:
|
||
|
void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
|
||
|
|
||
|
void forwardMustTailParameters(SDValue &Chain);
|
||
|
|
||
|
bool is64Bit() const { return Subtarget.is64Bit(); }
|
||
|
bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
|
||
|
|
||
|
X86MachineFunctionInfo *FuncInfo;
|
||
|
const SDLoc &DL;
|
||
|
SelectionDAG &DAG;
|
||
|
const X86Subtarget &Subtarget;
|
||
|
MachineFunction &TheMachineFunction;
|
||
|
const Function &TheFunction;
|
||
|
MachineFrameInfo &FrameInfo;
|
||
|
const TargetFrameLowering &FrameLowering;
|
||
|
const TargetLowering &TargLowering;
|
||
|
CallingConv::ID CallConv;
|
||
|
CCState &CCInfo;
|
||
|
};
|
||
|
} // namespace
|
||
|
|
||
|
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
|
||
|
SDValue &Chain, unsigned StackSize) {
|
||
|
// If the function takes variable number of arguments, make a frame index for
|
||
|
// the start of the first vararg value... for expansion of llvm.va_start. We
|
||
|
// can skip this if there are no va_start calls.
|
||
|
if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
|
||
|
CallConv != CallingConv::X86_ThisCall)) {
|
||
|
FuncInfo->setVarArgsFrameIndex(
|
||
|
FrameInfo.CreateFixedObject(1, StackSize, true));
|
||
|
}
|
||
|
|
||
|
// 64-bit calling conventions support varargs and register parameters, so we
|
||
|
// have to do extra work to spill them in the prologue.
|
||
|
if (is64Bit()) {
|
||
|
// Find the first unallocated argument registers.
|
||
|
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
|
||
|
ArrayRef<MCPhysReg> ArgXMMs =
|
||
|
get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
|
||
|
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
|
||
|
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
|
||
|
|
||
|
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
|
||
|
"SSE register cannot be used when SSE is disabled!");
|
||
|
|
||
|
if (isWin64()) {
|
||
|
// Get to the caller-allocated home save location. Add 8 to account
|
||
|
// for the return address.
|
||
|
int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
|
||
|
FuncInfo->setRegSaveFrameIndex(
|
||
|
FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
|
||
|
// Fixup to set vararg frame on shadow area (4 x i64).
|
||
|
if (NumIntRegs < 4)
|
||
|
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
|
||
|
} else {
|
||
|
// For X86-64, if there are vararg parameters that are passed via
|
||
|
// registers, then we must store them to their spots on the stack so
|
||
|
// they may be loaded by dereferencing the result of va_next.
|
||
|
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
|
||
|
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
|
||
|
FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
|
||
|
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
|
||
|
}
|
||
|
|
||
|
    SmallVector<SDValue, 6> LiveGPRs; // SDValues of GPR registers keeping
                                      // live input values
    SmallVector<SDValue, 8> LiveXMMRegs; // SDValues of XMM registers keeping
                                         // live input values
    SDValue ALVal; // if applicable, keeps the SDValue for the %al register
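    // %al holds an upper bound on the number of XMM registers the variadic
    // caller used; the VASTART_SAVE_XMM_REGS node emitted below tests it so
    // the XMM spills can be skipped when no vector arguments were passed.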
|
||
|
|
||
|
// Gather all the live in physical registers.
|
||
|
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
|
||
|
Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
|
||
|
LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
|
||
|
}
|
||
|
const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
|
||
|
if (!AvailableXmms.empty()) {
|
||
|
Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
|
||
|
ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
|
||
|
for (MCPhysReg Reg : AvailableXmms) {
|
||
|
        // FastRegisterAllocator spills virtual registers at basic block
        // boundaries. That leads to uses of xmm registers outside of the
        // check for %al. Pass physical registers to VASTART_SAVE_XMM_REGS
        // to avoid unnecessary spilling.
|
||
|
TheMachineFunction.getRegInfo().addLiveIn(Reg);
|
||
|
LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Store the integer parameter registers.
|
||
|
SmallVector<SDValue, 8> MemOps;
|
||
|
SDValue RSFIN =
|
||
|
DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
|
||
|
TargLowering.getPointerTy(DAG.getDataLayout()));
|
||
|
unsigned Offset = FuncInfo->getVarArgsGPOffset();
|
||
|
for (SDValue Val : LiveGPRs) {
|
||
|
SDValue FIN = DAG.getNode(ISD::ADD, DL,
|
||
|
TargLowering.getPointerTy(DAG.getDataLayout()),
|
||
|
RSFIN, DAG.getIntPtrConstant(Offset, DL));
|
||
|
SDValue Store =
|
||
|
DAG.getStore(Val.getValue(1), DL, Val, FIN,
|
||
|
MachinePointerInfo::getFixedStack(
|
||
|
DAG.getMachineFunction(),
|
||
|
FuncInfo->getRegSaveFrameIndex(), Offset));
|
||
|
MemOps.push_back(Store);
|
||
|
Offset += 8;
|
||
|
}
|
||
|
|
||
|
// Now store the XMM (fp + vector) parameter registers.
|
||
|
if (!LiveXMMRegs.empty()) {
|
||
|
SmallVector<SDValue, 12> SaveXMMOps;
|
||
|
SaveXMMOps.push_back(Chain);
|
||
|
SaveXMMOps.push_back(ALVal);
|
||
|
SaveXMMOps.push_back(RSFIN);
|
||
|
SaveXMMOps.push_back(
|
||
|
DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
|
||
|
llvm::append_range(SaveXMMOps, LiveXMMRegs);
|
||
|
MachineMemOperand *StoreMMO =
|
||
|
DAG.getMachineFunction().getMachineMemOperand(
|
||
|
MachinePointerInfo::getFixedStack(
|
||
|
DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
|
||
|
Offset),
|
||
|
MachineMemOperand::MOStore, 128, Align(16));
|
||
|
MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
|
||
|
DL, DAG.getVTList(MVT::Other),
|
||
|
SaveXMMOps, MVT::i8, StoreMMO));
|
||
|
}
|
||
|
|
||
|
if (!MemOps.empty())
|
||
|
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
|
||
|
// Find the largest legal vector type.
|
||
|
MVT VecVT = MVT::Other;
|
||
|
// FIXME: Only some x86_32 calling conventions support AVX512.
|
||
|
if (Subtarget.useAVX512Regs() &&
|
||
|
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
|
||
|
CallConv == CallingConv::Intel_OCL_BI)))
|
||
|
VecVT = MVT::v16f32;
|
||
|
else if (Subtarget.hasAVX())
|
||
|
VecVT = MVT::v8f32;
|
||
|
else if (Subtarget.hasSSE2())
|
||
|
VecVT = MVT::v4f32;
|
||
|
|
||
|
// We forward some GPRs and some vector types.
|
||
|
SmallVector<MVT, 2> RegParmTypes;
|
||
|
MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
|
||
|
RegParmTypes.push_back(IntVT);
|
||
|
if (VecVT != MVT::Other)
|
||
|
RegParmTypes.push_back(VecVT);
|
||
|
|
||
|
// Compute the set of forwarded registers. The rest are scratch.
|
||
|
SmallVectorImpl<ForwardedRegister> &Forwards =
|
||
|
FuncInfo->getForwardedMustTailRegParms();
|
||
|
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
|
||
|
|
||
|
// Forward AL for SysV x86_64 targets, since it is used for varargs.
|
||
|
if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
|
||
|
Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
|
||
|
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
|
||
|
}
|
||
|
|
||
|
// Copy all forwards from physical to virtual registers.
|
||
|
for (ForwardedRegister &FR : Forwards) {
|
||
|
// FIXME: Can we use a less constrained schedule?
|
||
|
SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
|
||
|
FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
|
||
|
TargLowering.getRegClassFor(FR.VT));
|
||
|
Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
|
||
|
unsigned StackSize) {
|
||
|
  // Set FrameIndex to the 0xAAAAAAA sentinel value to mark the unset state.
  // If necessary, it will be set to the correct value later.
|
||
|
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
|
||
|
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
|
||
|
|
||
|
if (FrameInfo.hasVAStart())
|
||
|
createVarArgAreaAndStoreRegisters(Chain, StackSize);
|
||
|
|
||
|
if (FrameInfo.hasMustTailInVarArgFunc())
|
||
|
forwardMustTailParameters(Chain);
|
||
|
}
|
||
|
|
||
|
SDValue X86TargetLowering::LowerFormalArguments(
|
||
|
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
|
||
|
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
|
||
|
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
|
||
|
MachineFunction &MF = DAG.getMachineFunction();
|
||
|
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
|
||
|
|
||
|
const Function &F = MF.getFunction();
|
||
|
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
|
||
|
F.getName() == "main")
|
||
|
FuncInfo->setForceFramePointer(true);
|
||
|
|
||
|
MachineFrameInfo &MFI = MF.getFrameInfo();
|
||
|
bool Is64Bit = Subtarget.is64Bit();
|
||
|
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
|
||
|
|
||
|
assert(
|
||
|
!(IsVarArg && canGuaranteeTCO(CallConv)) &&
|
||
|
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
|
||
|
|
||
|
// Assign locations to all of the incoming arguments.
|
||
|
SmallVector<CCValAssign, 16> ArgLocs;
|
||
|
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
|
||
|
|
||
|
// Allocate shadow area for Win64.
|
||
|
if (IsWin64)
|
||
|
CCInfo.AllocateStack(32, Align(8));
|
||
|
|
||
|
CCInfo.AnalyzeArguments(Ins, CC_X86);
|
||
|
|
||
|
// In vectorcall calling convention a second pass is required for the HVA
|
||
|
// types.
|
||
|
if (CallingConv::X86_VectorCall == CallConv) {
|
||
|
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
|
||
|
}
|
||
|
|
||
|
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
|
||
|
assert(isSortedByValueNo(ArgLocs) &&
|
||
|
"Argument Location list must be sorted before lowering");
|
||
|
|
||
|
SDValue ArgValue;
|
||
|
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
|
||
|
++I, ++InsIndex) {
|
||
|
assert(InsIndex < Ins.size() && "Invalid Ins index");
|
||
|
CCValAssign &VA = ArgLocs[I];
|
||
|
|
||
|
if (VA.isRegLoc()) {
|
||
|
EVT RegVT = VA.getLocVT();
|
||
|
if (VA.needsCustom()) {
|
||
|
assert(
|
||
|
VA.getValVT() == MVT::v64i1 &&
|
||
|
"Currently the only custom case is when we split v64i1 to 2 regs");
|
||
|
|
||
|
        // In the regcall calling convention, v64i1 values compiled for a
        // 32-bit arch are split up into two registers.
|
||
|
ArgValue =
|
||
|
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
|
||
|
} else {
|
||
|
const TargetRegisterClass *RC;
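        // Select the register class matching RegVT. On AVX-512 capable
        // targets the extended (X) classes are used so that XMM/YMM
        // registers 16-31 are available to the register allocator.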
if (RegVT == MVT::i8)
|
||
|
RC = &X86::GR8RegClass;
|
||
|
else if (RegVT == MVT::i16)
|
||
|
RC = &X86::GR16RegClass;
|
||
|
else if (RegVT == MVT::i32)
|
||
|
RC = &X86::GR32RegClass;
|
||
|
else if (Is64Bit && RegVT == MVT::i64)
|
||
|
RC = &X86::GR64RegClass;
|
||
|
else if (RegVT == MVT::f16)
|
||
|
RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
|
||
|
else if (RegVT == MVT::f32)
|
||
|
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
|
||
|
else if (RegVT == MVT::f64)
|
||
|
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
|
||
|
else if (RegVT == MVT::f80)
|
||
|
RC = &X86::RFP80RegClass;
|
||
|
else if (RegVT == MVT::f128)
|
||
|
RC = &X86::VR128RegClass;
|
||
|
else if (RegVT.is512BitVector())
|
||
|
RC = &X86::VR512RegClass;
|
||
|
else if (RegVT.is256BitVector())
|
||
|
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
|
||
|
else if (RegVT.is128BitVector())
|
||
|
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
|
||
|
else if (RegVT == MVT::x86mmx)
|
||
|
RC = &X86::VR64RegClass;
|
||
|
else if (RegVT == MVT::v1i1)
|
||
|
RC = &X86::VK1RegClass;
|
||
|
else if (RegVT == MVT::v8i1)
|
||
|
RC = &X86::VK8RegClass;
|
||
|
else if (RegVT == MVT::v16i1)
|
||
|
RC = &X86::VK16RegClass;
|
||
|
else if (RegVT == MVT::v32i1)
|
||
|
RC = &X86::VK32RegClass;
|
||
|
else if (RegVT == MVT::v64i1)
|
||
|
RC = &X86::VK64RegClass;
|
||
|
else
|
||
|
llvm_unreachable("Unknown argument type!");
|
||
|
|
||
|
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
|
||
|
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
|
||
|
}
|
||
|
|
||
|
// If this is an 8 or 16-bit value, it is really passed promoted to 32
|
||
|
// bits. Insert an assert[sz]ext to capture this, then truncate to the
|
||
|
// right size.
|
||
|
if (VA.getLocInfo() == CCValAssign::SExt)
|
||
|
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
|
||
|
DAG.getValueType(VA.getValVT()));
|
||
|
else if (VA.getLocInfo() == CCValAssign::ZExt)
|
||
|
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
|
||
|
DAG.getValueType(VA.getValVT()));
|
||
|
else if (VA.getLocInfo() == CCValAssign::BCvt)
|
||
|
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
|
||
|
|
||
|
if (VA.isExtInLoc()) {
|
||
|
// Handle MMX values passed in XMM regs.
|
||
|
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
|
||
|
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
|
||
|
else if (VA.getValVT().isVector() &&
|
||
|
VA.getValVT().getScalarType() == MVT::i1 &&
|
||
|
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
|
||
|
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
|
||
|
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
|
||
|
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
|
||
|
} else
|
||
|
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
|
||
|
}
|
||
|
} else {
|
||
|
assert(VA.isMemLoc());
|
||
|
ArgValue =
|
||
|
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
|
||
|
}
|
||
|
|
||
|
// If value is passed via pointer - do a load.
|
||
|
if (VA.getLocInfo() == CCValAssign::Indirect &&
|
||
|
!(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
|
||
|
ArgValue =
|
||
|
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
|
||
|
}
|
||
|
|
||
|
InVals.push_back(ArgValue);
|
||
|
}
|
||
|
|
||
|
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
|
||
|
if (Ins[I].Flags.isSwiftAsync()) {
|
||
|
auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
|
||
|
if (Subtarget.is64Bit())
|
||
|
X86FI->setHasSwiftAsyncContext(true);
|
||
|
else {
|
||
|
int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
|
||
|
X86FI->setSwiftAsyncContextFrameIdx(FI);
|
||
|
SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
|
||
|
DAG.getFrameIndex(FI, MVT::i32),
|
||
|
MachinePointerInfo::getFixedStack(MF, FI));
|
||
|
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Swift calling convention does not require we copy the sret argument
|
||
|
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
|
||
|
if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
|
||
|
continue;
|
||
|
|
||
|
// All x86 ABIs require that for returning structs by value we copy the
|
||
|
// sret argument into %rax/%eax (depending on ABI) for the return. Save
|
||
|
// the argument into a virtual register so that we can access it from the
|
||
|
// return points.
|
||
|
if (Ins[I].Flags.isSRet()) {
|
||
|
assert(!FuncInfo->getSRetReturnReg() &&
|
||
|
"SRet return has already been set");
|
||
|
MVT PtrTy = getPointerTy(DAG.getDataLayout());
|
||
|
Register Reg =
|
||
|
MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
|
||
|
FuncInfo->setSRetReturnReg(Reg);
|
||
|
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
|
||
|
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
unsigned StackSize = CCInfo.getStackSize();
|
||
|
// Align stack specially for tail calls.
|
||
|
if (shouldGuaranteeTCO(CallConv,
|
||
|
MF.getTarget().Options.GuaranteedTailCallOpt))
|
||
|
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
|
||
|
|
||
|
if (IsVarArg)
|
||
|
VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
|
||
|
.lowerVarArgsParameters(Chain, StackSize);
|
||
|
|
||
|
// Some CCs need callee pop.
|
||
|
if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
|
||
|
MF.getTarget().Options.GuaranteedTailCallOpt)) {
|
||
|
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
|
||
|
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
|
||
|
// X86 interrupts must pop the error code (and the alignment padding) if
|
||
|
// present.
|
||
|
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
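    // On x86-64 this is the 8-byte error code plus 8 bytes of alignment
    // padding; on x86-32 it is just the 4-byte error code.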
|
||
|
} else {
|
||
|
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
|
||
|
// If this is an sret function, the return should pop the hidden pointer.
|
||
|
if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
|
||
|
FuncInfo->setBytesToPopOnReturn(4);
|
||
|
}
|
||
|
|
||
|
if (!Is64Bit) {
|
||
|
// RegSaveFrameIndex is X86-64 only.
|
||
|
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
|
||
|
}
|
||
|
|
||
|
FuncInfo->setArgumentStackSize(StackSize);
|
||
|
|
||
|
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
|
||
|
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
|
||
|
if (Personality == EHPersonality::CoreCLR) {
|
||
|
assert(Is64Bit);
|
||
|
// TODO: Add a mechanism to frame lowering that will allow us to indicate
|
||
|
// that we'd prefer this slot be allocated towards the bottom of the frame
|
||
|
// (i.e. near the stack pointer after allocating the frame). Every
|
||
|
// funclet needs a copy of this slot in its (mostly empty) frame, and the
|
||
|
// offset from the bottom of this and each funclet's frame must be the
|
||
|
// same, so the size of funclets' (mostly empty) frames is dictated by
|
||
|
// how far this slot is from the bottom (since they allocate just enough
|
||
|
// space to accommodate holding this slot at the correct offset).
|
||
|
int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
|
||
|
EHInfo->PSPSymFrameIdx = PSPSymFI;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (shouldDisableArgRegFromCSR(CallConv) ||
|
||
|
F.hasFnAttribute("no_caller_saved_registers")) {
|
||
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||
|
for (std::pair<Register, Register> Pair : MRI.liveins())
|
||
|
MRI.disableCalleeSavedRegister(Pair.first);
|
||
|
}
|
||
|
|
||
|
return Chain;
|
||
|
}
|
||
|
|
||
|
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
|
||
|
SDValue Arg, const SDLoc &dl,
|
||
|
SelectionDAG &DAG,
|
||
|
const CCValAssign &VA,
|
||
|
ISD::ArgFlagsTy Flags,
|
||
|
bool isByVal) const {
|
||
|
unsigned LocMemOffset = VA.getLocMemOffset();
|
||
|
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
|
||
|
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
|
||
|
StackPtr, PtrOff);
|
||
|
if (isByVal)
|
||
|
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
|
||
|
|
||
|
MaybeAlign Alignment;
|
||
|
if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
|
||
|
Arg.getSimpleValueType() != MVT::f80)
|
||
|
Alignment = MaybeAlign(4);
|
||
|
return DAG.getStore(
|
||
|
Chain, dl, Arg, PtrOff,
|
||
|
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
|
||
|
Alignment);
|
||
|
}
|
||
|
|
||
|
/// Emit a load of return address if tail call
|
||
|
/// optimization is performed and it is required.
|
||
|
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
|
||
|
SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
|
||
|
bool Is64Bit, int FPDiff, const SDLoc &dl) const {
|
||
|
// Adjust the Return address stack slot.
|
||
|
EVT VT = getPointerTy(DAG.getDataLayout());
|
||
|
OutRetAddr = getReturnAddressFrameIndex(DAG);
|
||
|
|
||
|
// Load the "old" Return address.
|
||
|
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
|
||
|
return SDValue(OutRetAddr.getNode(), 1);
|
||
|
}
|
||
|
|
||
|
/// Emit a store of the return address if tail call
|
||
|
/// optimization is performed and it is required (FPDiff!=0).
|
||
|
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
|
||
|
SDValue Chain, SDValue RetAddrFrIdx,
|
||
|
EVT PtrVT, unsigned SlotSize,
|
||
|
int FPDiff, const SDLoc &dl) {
|
||
|
// Store the return address to the appropriate stack slot.
|
||
|
if (!FPDiff) return Chain;
|
||
|
// Calculate the new stack slot for the return address.
|
||
|
int NewReturnAddrFI =
|
||
|
MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
|
||
|
false);
|
||
|
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
|
||
|
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
|
||
|
MachinePointerInfo::getFixedStack(
|
||
|
DAG.getMachineFunction(), NewReturnAddrFI));
|
||
|
return Chain;
|
||
|
}
|
||
|
|
||
|
/// Returns a vector_shuffle mask for a movs{s|d}, movd
/// operation of the specified width.
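/// For example, for a width of 4 the returned mask is <4, 1, 2, 3>: element 0
/// is taken from V2 and the remaining elements from V1.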
SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
|
||
|
SDValue V1, SDValue V2) const {
|
||
|
unsigned NumElems = VT.getVectorNumElements();
|
||
|
SmallVector<int, 8> Mask;
|
||
|
Mask.push_back(NumElems);
|
||
|
for (unsigned i = 1; i != NumElems; ++i)
|
||
|
Mask.push_back(i);
|
||
|
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
|
||
|
}
|
||
|
|
||
|
SDValue
|
||
|
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||
|
SmallVectorImpl<SDValue> &InVals) const {
|
||
|
SelectionDAG &DAG = CLI.DAG;
|
||
|
SDLoc &dl = CLI.DL;
|
||
|
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
|
||
|
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
|
||
|
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
|
||
|
SDValue Chain = CLI.Chain;
|
||
|
SDValue Callee = CLI.Callee;
|
||
|
CallingConv::ID CallConv = CLI.CallConv;
|
||
|
bool &isTailCall = CLI.IsTailCall;
|
||
|
bool isVarArg = CLI.IsVarArg;
|
||
|
const auto *CB = CLI.CB;
|
||
|
|
||
|
MachineFunction &MF = DAG.getMachineFunction();
|
||
|
bool Is64Bit = Subtarget.is64Bit();
|
||
|
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
|
||
|
bool IsSibcall = false;
|
||
|
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
|
||
|
CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
|
||
|
bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
|
||
|
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
|
||
|
bool HasNCSR = (CB && isa<CallInst>(CB) &&
|
||
|
CB->hasFnAttr("no_caller_saved_registers"));
|
||
|
bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
|
||
|
bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
|
||
|
bool IsCFICall = IsIndirectCall && CLI.CFIType;
|
||
|
const Module *M = MF.getMMI().getModule();
|
||
|
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
|
||
|
|
||
|
MachineFunction::CallSiteInfo CSInfo;
|
||
|
if (CallConv == CallingConv::X86_INTR)
|
||
|
report_fatal_error("X86 interrupts may not be called directly");
|
||
|
|
||
|
bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
|
||
|
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
|
||
|
// If we are using a GOT, disable tail calls to external symbols with
|
||
|
// default visibility. Tail calling such a symbol requires using a GOT
|
||
|
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
|
||
|
// GuaranteedTailCallOpt will override this.
|
||
|
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
|
||
|
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
|
||
|
G->getGlobal()->hasDefaultVisibility()))
|
||
|
isTailCall = false;
|
||
|
}
|
||
|
|
||
|
if (isTailCall && !IsMustTail) {
|
||
|
// Check if it's really possible to do a tail call.
|
||
|
isTailCall = IsEligibleForTailCallOptimization(
|
||
|
Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
|
||
|
Ins, DAG);
|
||
|
|
||
|
// Sibcalls are automatically detected tailcalls which do not require
|
||
|
// ABI changes.
|
||
|
if (!IsGuaranteeTCO && isTailCall)
|
||
|
IsSibcall = true;
|
||
|
|
||
|
if (isTailCall)
|
||
|
++NumTailCalls;
|
||
|
}
|
||
|
|
||
|
if (IsMustTail && !isTailCall)
|
||
|
report_fatal_error("failed to perform tail call elimination on a call "
|
||
|
"site marked musttail");
|
||
|
|
||
|
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
|
||
|
"Var args not supported with calling convention fastcc, ghc or hipe");
|
||
|
|
||
|
// Analyze operands of the call, assigning locations to each operand.
|
||
|
SmallVector<CCValAssign, 16> ArgLocs;
|
||
|
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
|
||
|
|
||
|
// Allocate shadow area for Win64.
|
||
|
if (IsWin64)
|
||
|
CCInfo.AllocateStack(32, Align(8));
|
||
|
|
||
|
CCInfo.AnalyzeArguments(Outs, CC_X86);
|
||
|
|
||
|
// In vectorcall calling convention a second pass is required for the HVA
|
||
|
// types.
|
||
|
if (CallingConv::X86_VectorCall == CallConv) {
|
||
|
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
|
||
|
}
|
||
|
|
||
|
// Get a count of how many bytes are to be pushed on the stack.
|
||
|
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
|
||
|
if (IsSibcall)
|
||
|
    // This is a sibcall. The memory operands are already available in the
    // caller's own caller's stack.
|
||
|
NumBytes = 0;
|
||
|
else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
|
||
|
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
|
||
|
|
||
|
int FPDiff = 0;
|
||
|
if (isTailCall &&
|
||
|
shouldGuaranteeTCO(CallConv,
|
||
|
MF.getTarget().Options.GuaranteedTailCallOpt)) {
|
||
|
// Lower arguments at fp - stackoffset + fpdiff.
|
||
|
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
|
||
|
|
||
|
FPDiff = NumBytesCallerPushed - NumBytes;
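    // FPDiff = (bytes the caller pops on return) - (bytes this call needs).
    // A negative value means the callee needs more argument space than the
    // caller's frame provides, and the return address slot has to be moved
    // accordingly (see EmitTailCallStoreRetAddr).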
// Set the delta of movement of the returnaddr stackslot.
|
||
|
// But only set if delta is greater than previous delta.
|
||
|
if (FPDiff < X86Info->getTCReturnAddrDelta())
|
||
|
X86Info->setTCReturnAddrDelta(FPDiff);
|
||
|
}
|
||
|
|
||
|
unsigned NumBytesToPush = NumBytes;
|
||
|
unsigned NumBytesToPop = NumBytes;
|
||
|
|
||
|
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple arguments passed in memory when using inalloca.
|
||
|
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
|
||
|
NumBytesToPush = 0;
|
||
|
if (!ArgLocs.back().isMemLoc())
|
||
|
report_fatal_error("cannot use inalloca attribute on a register "
|
||
|
"parameter");
|
||
|
if (ArgLocs.back().getLocMemOffset() != 0)
|
||
|
report_fatal_error("any parameter with the inalloca attribute must be "
|
||
|
"the only memory argument");
|
||
|
} else if (CLI.IsPreallocated) {
|
||
|
assert(ArgLocs.back().isMemLoc() &&
|
||
|
"cannot use preallocated attribute on a register "
|
||
|
"parameter");
|
||
|
SmallVector<size_t, 4> PreallocatedOffsets;
|
||
|
for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
|
||
|
if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
|
||
|
PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
|
||
|
}
|
||
|
}
|
||
|
auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
|
||
|
size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
|
||
|
MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
|
||
|
MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
|
||
|
NumBytesToPush = 0;
|
||
|
}
|
||
|
|
||
|
if (!IsSibcall && !IsMustTail)
|
||
|
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
|
||
|
NumBytes - NumBytesToPush, dl);
|
||
|
|
||
|
SDValue RetAddrFrIdx;
|
||
|
// Load return address for tail calls.
|
||
|
if (isTailCall && FPDiff)
|
||
|
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
|
||
|
Is64Bit, FPDiff, dl);
|
||
|
|
||
|
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
|
||
|
SmallVector<SDValue, 8> MemOpChains;
|
||
|
SDValue StackPtr;
|
||
|
|
||
|
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
|
||
|
assert(isSortedByValueNo(ArgLocs) &&
|
||
|
"Argument Location list must be sorted before lowering");
|
||
|
|
||
|
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
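  // (For non-sibcall tail calls the in-memory arguments are stored to their
  // final stack slots in the second loop further down, after the incoming
  // stack arguments have been forced to be loaded first.)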
|
||
|
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
|
||
|
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
|
||
|
++I, ++OutIndex) {
|
||
|
assert(OutIndex < Outs.size() && "Invalid Out index");
|
||
|
// Skip inalloca/preallocated arguments, they have already been written.
|
||
|
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
|
||
|
if (Flags.isInAlloca() || Flags.isPreallocated())
|
||
|
continue;
|
||
|
|
||
|
CCValAssign &VA = ArgLocs[I];
|
||
|
EVT RegVT = VA.getLocVT();
|
||
|
SDValue Arg = OutVals[OutIndex];
|
||
|
bool isByVal = Flags.isByVal();
|
||
|
|
||
|
// Promote the value if needed.
|
||
|
switch (VA.getLocInfo()) {
|
||
|
default: llvm_unreachable("Unknown loc info!");
|
||
|
case CCValAssign::Full: break;
|
||
|
case CCValAssign::SExt:
|
||
|
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
|
||
|
break;
|
||
|
case CCValAssign::ZExt:
|
||
|
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
|
||
|
break;
|
||
|
case CCValAssign::AExt:
|
||
|
if (Arg.getValueType().isVector() &&
|
||
|
Arg.getValueType().getVectorElementType() == MVT::i1)
|
||
|
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
|
||
|
else if (RegVT.is128BitVector()) {
|
||
|
// Special case: passing MMX values in XMM registers.
|
||
|
Arg = DAG.getBitcast(MVT::i64, Arg);
|
||
|
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
|
||
|
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
|
||
|
} else
|
||
|
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
|
||
|
break;
|
||
|
case CCValAssign::BCvt:
|
||
|
Arg = DAG.getBitcast(RegVT, Arg);
|
||
|
break;
|
||
|
case CCValAssign::Indirect: {
|
||
|
if (isByVal) {
|
||
|
// Memcpy the argument to a temporary stack slot to prevent
|
||
|
// the caller from seeing any modifications the callee may make
|
||
|
// as guaranteed by the `byval` attribute.
|
||
|
int FrameIdx = MF.getFrameInfo().CreateStackObject(
|
||
|
Flags.getByValSize(),
|
||
|
std::max(Align(16), Flags.getNonZeroByValAlign()), false);
|
||
|
SDValue StackSlot =
|
||
|
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
|
||
|
Chain =
|
||
|
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
|
||
|
// From now on treat this as a regular pointer
|
||
|
Arg = StackSlot;
|
||
|
isByVal = false;
|
||
|
} else {
|
||
|
// Store the argument.
|
||
|
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
|
||
|
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
|
||
|
Chain = DAG.getStore(
|
||
|
Chain, dl, Arg, SpillSlot,
|
||
|
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
|
||
|
Arg = SpillSlot;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (VA.needsCustom()) {
|
||
|
assert(VA.getValVT() == MVT::v64i1 &&
|
||
|
"Currently the only custom case is when we split v64i1 to 2 regs");
|
||
|
// Split v64i1 value into two registers
|
||
|
Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
|
||
|
} else if (VA.isRegLoc()) {
|
||
|
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
|
||
|
const TargetOptions &Options = DAG.getTarget().Options;
|
||
|
if (Options.EmitCallSiteInfo)
|
||
|
CSInfo.emplace_back(VA.getLocReg(), I);
|
||
|
if (isVarArg && IsWin64) {
|
||
|
// Win64 ABI requires argument XMM reg to be copied to the corresponding
|
||
|
// shadow reg if callee is a varargs function.
|
||
|
Register ShadowReg;
|
||
|
switch (VA.getLocReg()) {
|
||
|
case X86::XMM0: ShadowReg = X86::RCX; break;
|
||
|
case X86::XMM1: ShadowReg = X86::RDX; break;
|
||
|
case X86::XMM2: ShadowReg = X86::R8; break;
|
||
|
case X86::XMM3: ShadowReg = X86::R9; break;
|
||
|
}
|
||
|
if (ShadowReg)
|
||
|
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
|
||
|
}
|
||
|
} else if (!IsSibcall && (!isTailCall || isByVal)) {
|
||
|
assert(VA.isMemLoc());
|
||
|
if (!StackPtr.getNode())
|
||
|
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
|
||
|
getPointerTy(DAG.getDataLayout()));
|
||
|
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
|
||
|
dl, DAG, VA, Flags, isByVal));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!MemOpChains.empty())
|
||
|
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
|
||
|
|
||
|
if (Subtarget.isPICStyleGOT()) {
|
||
|
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls via the PLT (except for regcall).
    if (!isTailCall) {
      // An indirect call with the RegCall calling convention may use up all
      // the general registers, so it is not suitable to bind the EBX register
      // for the GOT address; just let the register allocator handle it.
|
||
|
if (CallConv != CallingConv::X86_RegCall)
|
||
|
RegsToPass.push_back(std::make_pair(
|
||
|
Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
|
||
|
getPointerTy(DAG.getDataLayout()))));
|
||
|
} else {
|
||
|
// If we are tail calling and generating PIC/GOT style code load the
|
||
|
// address of the callee into ECX. The value in ecx is used as target of
|
||
|
// the tail jump. This is done to circumvent the ebx/callee-saved problem
|
||
|
// for tail calls on PIC/GOT architectures. Normally we would just put the
|
||
|
// address of GOT into ebx and then call target@PLT. But for tail calls
|
||
|
// ebx would be restored (since ebx is callee saved) before jumping to the
|
||
|
// target@PLT.
|
||
|
|
||
|
// Note: The actual moving to ECX is done further down.
|
||
|
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
|
||
|
if (G && !G->getGlobal()->hasLocalLinkage() &&
|
||
|
G->getGlobal()->hasDefaultVisibility())
|
||
|
Callee = LowerGlobalAddress(Callee, DAG);
|
||
|
else if (isa<ExternalSymbolSDNode>(Callee))
|
||
|
Callee = LowerExternalSymbol(Callee, DAG);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
|
||
|
(Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
|
||
|
// From AMD64 ABI document:
|
||
|
// For calls that may call functions that use varargs or stdargs
|
||
|
// (prototype-less calls or calls to functions containing ellipsis (...) in
|
||
|
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and must be in the range 0 - 8 inclusive.
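    // Below, %al is set to the exact number of XMM argument registers this
    // call uses (NumXMMRegs), which satisfies that requirement.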
|
||
|
|
||
|
// Count the number of XMM registers allocated.
|
||
|
static const MCPhysReg XMMArgRegs[] = {
|
||
|
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
|
||
|
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
|
||
|
};
|
||
|
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
|
||
|
assert((Subtarget.hasSSE1() || !NumXMMRegs)
|
||
|
&& "SSE registers cannot be used when SSE is disabled");
|
||
|
RegsToPass.push_back(std::make_pair(Register(X86::AL),
|
||
|
DAG.getConstant(NumXMMRegs, dl,
|
||
|
MVT::i8)));
|
||
|
}
|
||
|
|
||
|
if (isVarArg && IsMustTail) {
|
||
|
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
|
||
|
for (const auto &F : Forwards) {
|
||
|
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
|
||
|
RegsToPass.push_back(std::make_pair(F.PReg, Val));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
|
||
|
// don't need this because the eligibility check rejects calls that require
|
||
|
// shuffling arguments passed in memory.
|
||
|
if (!IsSibcall && isTailCall) {
|
||
|
// Force all the incoming stack arguments to be loaded from the stack
|
||
|
// before any new outgoing arguments are stored to the stack, because the
|
||
|
// outgoing stack slots may alias the incoming argument stack slots, and
|
||
|
// the alias isn't otherwise explicit. This is slightly more conservative
|
||
|
// than necessary, because it means that each store effectively depends
|
||
|
// on every argument instead of just those arguments it would clobber.
|
||
|
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
|
||
|
|
||
|
SmallVector<SDValue, 8> MemOpChains2;
|
||
|
SDValue FIN;
|
||
|
int FI = 0;
|
||
|
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
|
||
|
++I, ++OutsIndex) {
|
||
|
CCValAssign &VA = ArgLocs[I];
|
||
|
|
||
|
if (VA.isRegLoc()) {
|
||
|
if (VA.needsCustom()) {
|
||
|
assert((CallConv == CallingConv::X86_RegCall) &&
|
||
|
"Expecting custom case only in regcall calling convention");
|
||
|
// This means that we are in special case where one argument was
|
||
|
// passed through two register locations - Skip the next location
|
||
|
++I;
|
||
|
}
|
||
|
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
assert(VA.isMemLoc());
|
||
|
SDValue Arg = OutVals[OutsIndex];
|
||
|
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
|
||
|
// Skip inalloca/preallocated arguments. They don't require any work.
|
||
|
if (Flags.isInAlloca() || Flags.isPreallocated())
|
||
|
continue;
|
||
|
// Create frame index.
|
||
|
int32_t Offset = VA.getLocMemOffset()+FPDiff;
|
||
|
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
|
||
|
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
|
||
|
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
|
||
|
|
||
|
if (Flags.isByVal()) {
|
||
|
// Copy relative to framepointer.
|
||
|
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
|
||
|
if (!StackPtr.getNode())
|
||
|
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
|
||
|
getPointerTy(DAG.getDataLayout()));
|
||
|
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
|
||
|
StackPtr, Source);
|
||
|
|
||
|
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
|
||
|
ArgChain,
|
||
|
Flags, DAG, dl));
|
||
|
} else {
|
||
|
// Store relative to framepointer.
|
||
|
MemOpChains2.push_back(DAG.getStore(
|
||
|
ArgChain, dl, Arg, FIN,
|
||
|
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!MemOpChains2.empty())
|
||
|
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
|
||
|
|
||
|
// Store the return address to the appropriate stack slot.
|
||
|
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
|
||
|
getPointerTy(DAG.getDataLayout()),
|
||
|
RegInfo->getSlotSize(), FPDiff, dl);
|
||
|
}
|
||
|
|
||
|
// Build a sequence of copy-to-reg nodes chained together with token chain
|
||
|
// and glue operands which copy the outgoing args into registers.
|
||
|
SDValue InGlue;
|
||
|
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
|
||
|
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
|
||
|
RegsToPass[i].second, InGlue);
|
||
|
InGlue = Chain.getValue(1);
|
||
|
}
|
||
|
|
||
|
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
|
||
|
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
|
||
|
// In the 64-bit large code model, we have to make all calls
|
||
|
// through a register, since the call instruction's 32-bit
|
||
|
// pc-relative offset may not be large enough to hold the whole
|
||
|
// address.
|
||
|
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
|
||
|
Callee->getOpcode() == ISD::ExternalSymbol) {
|
||
|
// Lower direct calls to global addresses and external symbols. Setting
|
||
|
// ForCall to true here has the effect of removing WrapperRIP when possible
|
||
|
// to allow direct calls to be selected without first materializing the
|
||
|
// address into a register.
|
||
|
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
|
||
|
} else if (Subtarget.isTarget64BitILP32() &&
|
||
|
Callee.getValueType() == MVT::i32) {
|
||
|
    // Zero-extend the 32-bit Callee address into a 64-bit one according to the
    // x32 ABI.
|
||
|
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
|
||
|
}
|
||
|
|
||
|
// Returns a chain & a glue for retval copy to use.
|
||
|
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
|
||
|
SmallVector<SDValue, 8> Ops;
|
||
|
|
||
|
if (!IsSibcall && isTailCall && !IsMustTail) {
|
||
|
Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
|
||
|
InGlue = Chain.getValue(1);
|
||
|
}
|
||
|
|
||
|
Ops.push_back(Chain);
|
||
|
Ops.push_back(Callee);
|
||
|
|
||
|
if (isTailCall)
|
||
|
Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
|
||
|
|
||
|
// Add argument registers to the end of the list so that they are known live
|
||
|
// into the call.
|
||
|
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
|
||
|
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
|
||
|
RegsToPass[i].second.getValueType()));
|
||
|
|
||
|
// Add a register mask operand representing the call-preserved registers.
|
||
|
const uint32_t *Mask = [&]() {
|
||
|
auto AdaptedCC = CallConv;
|
||
|
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
|
||
|
// use X86_INTR calling convention because it has the same CSR mask
|
||
|
// (same preserved registers).
|
||
|
if (HasNCSR)
|
||
|
AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
|
||
|
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
    // to use the CSR_NoRegs_RegMask.
|
||
|
if (CB && CB->hasFnAttr("no_callee_saved_registers"))
|
||
|
AdaptedCC = (CallingConv::ID)CallingConv::GHC;
|
||
|
return RegInfo->getCallPreservedMask(MF, AdaptedCC);
|
||
|
}();
|
||
|
assert(Mask && "Missing call preserved mask for calling convention");
|
||
|
|
||
|
// If this is an invoke in a 32-bit function using a funclet-based
|
||
|
// personality, assume the function clobbers all registers. If an exception
|
||
|
// is thrown, the runtime will not restore CSRs.
|
||
|
// FIXME: Model this more precisely so that we can register allocate across
|
||
|
// the normal edge and spill and fill across the exceptional edge.
|
||
|
if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
|
||
|
const Function &CallerFn = MF.getFunction();
|
||
|
EHPersonality Pers =
|
||
|
CallerFn.hasPersonalityFn()
|
||
|
? classifyEHPersonality(CallerFn.getPersonalityFn())
|
||
|
: EHPersonality::Unknown;
|
||
|
if (isFuncletEHPersonality(Pers))
|
||
|
Mask = RegInfo->getNoPreservedMask();
|
||
|
}
|
||
|
|
||
|
// Define a new register mask from the existing mask.
|
||
|
uint32_t *RegMask = nullptr;
|
||
|
|
||
|
// In some calling conventions we need to remove the used physical registers
|
||
|
// from the reg mask. Create a new RegMask for such calling conventions.
|
||
|
// RegMask for calling conventions that disable only return registers (e.g.
|
||
|
// preserve_most) will be modified later in LowerCallResult.
|
||
|
bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
|
||
|
if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
|
||
|
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
|
||
|
|
||
|
// Allocate a new Reg Mask and copy Mask.
|
||
|
RegMask = MF.allocateRegMask();
|
||
|
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
|
||
|
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
|
||
|
|
||
|
// Make sure all sub registers of the argument registers are reset
|
||
|
// in the RegMask.
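    // A cleared bit in the register mask means the corresponding register is
    // not preserved across the call, i.e. it is treated as clobbered.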
if (ShouldDisableArgRegs) {
|
||
|
for (auto const &RegPair : RegsToPass)
|
||
|
for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
|
||
|
RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
|
||
|
}
|
||
|
|
||
|
// Create the RegMask Operand according to our updated mask.
|
||
|
Ops.push_back(DAG.getRegisterMask(RegMask));
|
||
|
} else {
|
||
|
// Create the RegMask Operand according to the static mask.
|
||
|
Ops.push_back(DAG.getRegisterMask(Mask));
|
||
|
}
|
||
|
|
||
|
if (InGlue.getNode())
|
||
|
Ops.push_back(InGlue);
|
||
|
|
||
|
if (isTailCall) {
|
||
|
// We used to do:
|
||
|
//// If this is the first return lowered for this function, add the regs
|
||
|
//// to the liveout set for the function.
|
||
|
// This isn't right, although it's probably harmless on x86; liveouts
|
||
|
// should be computed from returns not tail calls. Consider a void
|
||
|
// function making a tail call to a function returning int.
|
||
|
MF.getFrameInfo().setHasTailCall();
|
||
|
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
|
||
|
|
||
|
if (IsCFICall)
|
||
|
Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
|
||
|
|
||
|
DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
|
||
|
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
|
||
|
return Ret;
|
||
|
}
|
||
|
|
||
|
if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
|
||
|
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
|
||
|
} else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
|
||
|
// Calls with a "clang.arc.attachedcall" bundle are special. They should be
|
||
|
// expanded to the call, directly followed by a special marker sequence and
|
||
|
// a call to a ObjC library function. Use the CALL_RVMARKER to do that.
|
||
|
assert(!isTailCall &&
|
||
|
"tail calls cannot be marked with clang.arc.attachedcall");
|
||
|
assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
|
||
|
|
||
|
// Add a target global address for the retainRV/claimRV runtime function
|
||
|
// just before the call target.
|
||
|
Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
|
||
|
auto PtrVT = getPointerTy(DAG.getDataLayout());
|
||
|
auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
|
||
|
Ops.insert(Ops.begin() + 1, GA);
|
||
|
Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
|
||
|
} else {
|
||
|
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
|
||
|
}
|
||
|
|
||
|
if (IsCFICall)
|
||
|
Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
|
||
|
|
||
|
InGlue = Chain.getValue(1);
|
||
|
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
|
||
|
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
|
||
|
|
||
|
// Save heapallocsite metadata.
|
||
|
if (CLI.CB)
|
||
|
if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
|
||
|
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
|
||
|
|
||
|
// Create the CALLSEQ_END node.
|
||
|
unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
|
||
|
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
|
||
|
DAG.getTarget().Options.GuaranteedTailCallOpt))
|
||
|
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
|
||
|
else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
|
||
|
// If this call passes a struct-return pointer, the callee
|
||
|
// pops that struct pointer.
|
||
|
NumBytesForCalleeToPop = 4;
|
||
|
|
||
|
// Returns a glue for retval copy to use.
|
||
|
if (!IsSibcall) {
|
||
|
Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
|
||
|
InGlue, dl);
|
||
|
InGlue = Chain.getValue(1);
|
||
|
}
|
||
|
|
||
|
// Handle result values, copying them out of physregs into vregs that we
|
||
|
// return.
|
||
|
return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
|
||
|
InVals, RegMask);
|
||
|
}
|
||
|
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
// Fast Calling Convention (tail call) implementation
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
|
||
|
// * tailcallopt is enabled
|
||
|
// * caller/callee are fastcc
|
||
|
// On X86_64 architecture with GOT-style position independent code only local
|
||
|
// (within module) calls are supported at the moment.
|
||
|
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
// for example.)
|
||
|
// If a tail called function callee has more arguments than the caller the
|
||
|
// caller needs to make sure that there is room to move the RETADDR to. This is
|
||
|
// achieved by reserving an area the size of the argument delta right after the
|
||
|
// original RETADDR, but before the saved framepointer or the spilled registers
|
||
|
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
|
||
|
// stack layout:
|
||
|
// arg1
|
||
|
// arg2
|
||
|
// RETADDR
|
||
|
// [ new RETADDR
|
||
|
// move area ]
|
||
|
// (possible EBP)
|
||
|
// ESI
|
||
|
// EDI
|
||
|
// local1 ..
|
||
|
|
||
|
/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
/// requirement.
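/// With a 16-byte stack alignment the result is of the form 16n + 12 on
/// 32-bit targets (4-byte slots) and 16n + 8 on 64-bit targets (8-byte
/// slots), so the stack is fully aligned again once the return address slot
/// is accounted for.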
unsigned
|
||
|
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
|
||
|
SelectionDAG &DAG) const {
|
||
|
const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
|
||
|
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
|
||
|
assert(StackSize % SlotSize == 0 &&
|
||
|
"StackSize must be a multiple of SlotSize");
|
||
|
return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
|
||
|
}
|
||
|
|
||
|
/// Return true if the given stack call argument is already available in the
|
||
|
/// same position (relatively) of the caller's incoming argument stack.
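/// Such an argument does not need to be stored again when the caller's stack
/// frame is reused for a tail call.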
static
|
||
|
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
|
||
|
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
|
||
|
const X86InstrInfo *TII, const CCValAssign &VA) {
|
||
|
unsigned Bytes = Arg.getValueSizeInBits() / 8;
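  // Bytes is the size of the value actually being passed; for byval arguments
  // it is replaced below with the byval size.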
|
||
|
|
||
|
for (;;) {
|
||
|
// Look through nodes that don't alter the bits of the incoming value.
|
||
|
unsigned Op = Arg.getOpcode();
|
||
|
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
|
||
|
Op == ISD::AssertZext) {
|
||
|
Arg = Arg.getOperand(0);
|
||
|
continue;
|
||
|
}
|
||
|
if (Op == ISD::TRUNCATE) {
|
||
|
const SDValue &TruncInput = Arg.getOperand(0);
|
||
|
if (TruncInput.getOpcode() == ISD::AssertZext &&
|
||
|
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
|
||
|
Arg.getValueType()) {
|
||
|
Arg = TruncInput.getOperand(0);
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass
  // the mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getFixedSizeInBits() >
      Arg.getValueSizeInBits().getFixedValue()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

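  // Finally, the argument must cover the fixed object exactly; a size mismatch
  // means the incoming slot cannot simply be reused in place.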
  return Bytes == MFI.getObjectSize(FI);
}

/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
    bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
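  //
  // For example (illustrative IR; @caller and @callee are hypothetical):
  //   define x86_fp80 @caller() {
  //     %r = tail call double @callee()
  //     %e = fpext double %r to x86_fp80
  //     ret x86_fp80 %e
  //   }
  // The fpext of the call result is a real instruction that must run after the
  // call, so the call cannot be lowered as a sibcall.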
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
                        CalleeCC == CallingConv::Tail ||
                        CalleeCC == CallingConv::SwiftTail;

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  if (IsGuaranteeTCO) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.
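  //
  // The remaining checks, in order: no dynamic stack realignment, sret
  // compatibility, vararg arguments all in registers, unused x87 results,
  // compatible result passing, callee-saved-register preservation, argument
  // stack layout reuse, enough free registers for the call address, and
  // matching callee-pop behaviour.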

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->hasStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if we're an sret return fn and the callee
  // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
  // insufficient.
  if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
    // For a compatible tail call the callee must return our sret pointer. So
    // it needs to be (a) an sret function itself and (b) we pass our sret as
    // its sret. Condition #b is harder to determine.
    return false;
  } else if (IsCalleePopSRet)
    // The callee pops an sret, so we cannot tail-call, as our caller doesn't
    // expect that.
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (const auto &VA : ArgLocs)
      if (!VA.isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (const auto &In : Ins) {
    if (!In.Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (const auto &VA : RVLocs) {
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate the shadow area for Win64.
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, Align(8));

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getStackSize();

    if (CCInfo.getStackSize()) {
      // Check if the arguments are already laid out in the same way as the
      // caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
        const CCValAssign &VA = ArgLocs[I];
        SDValue Arg = OutVals[I];
        ISD::ArgFlagsTy Flags = Outs[I].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
                                   TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
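      // Without PIC, only the call target needs a free register, so up to two
      // of EAX/ECX/EDX may carry arguments; with PIC, at most one may, since a
      // second register is needed for the address computation.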

      for (const auto &VA : ArgLocs) {
        if (!VA.isRegLoc())
          continue;
        Register Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

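  // getBytesToPopOnReturn() is what the caller itself will pop on return; a
  // sibcall callee must pop exactly the same number of bytes, and must not pop
  // anything if the caller pops nothing.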
  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}

/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
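/// On 32-bit x86 the callee-cleanup conventions (stdcall, fastcall, thiscall,
/// vectorcall) pop their own stack arguments; 64-bit conventions are
/// caller-cleanup, except that non-vararg calls may be forced to callee-pop
/// when tail calls must be guaranteed.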
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}