# --- T2-COPYRIGHT-BEGIN --- # t2/package/*/llvm/ia64-target.patch.ia64 # Copyright (C) 2026 The T2 SDE Project # SPDX-License-Identifier: GPL-2.0 or patched project license # --- T2-COPYRIGHT-END --- diff --git llvm/CMakeLists.txt llvm/CMakeLists.txt index fcbfed495383..5a95427b08ea 100644 --- llvm/CMakeLists.txt +++ llvm/CMakeLists.txt @@ -542,6 +542,7 @@ set(LLVM_ALL_EXPERIMENTAL_TARGETS ARC CSKY DirectX + IA64 M68k Xtensa ) diff --git llvm/cmake/config-ix.cmake llvm/cmake/config-ix.cmake index ed2bfa6df68f..387ba4bbf168 100644 --- llvm/cmake/config-ix.cmake +++ llvm/cmake/config-ix.cmake @@ -567,6 +567,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "msp430") set(LLVM_NATIVE_ARCH MSP430) elseif (LLVM_NATIVE_ARCH MATCHES "hexagon") set(LLVM_NATIVE_ARCH Hexagon) +elseif (LLVM_NATIVE_ARCH MATCHES "ia64") + set(LLVM_NATIVE_ARCH IA64) elseif (LLVM_NATIVE_ARCH MATCHES "s390x") set(LLVM_NATIVE_ARCH SystemZ) elseif (LLVM_NATIVE_ARCH MATCHES "wasm32") diff --git llvm/include/llvm/IR/RuntimeLibcalls.td llvm/include/llvm/IR/RuntimeLibcalls.td index e7d636841c4b..984887bb554c 100644 --- llvm/include/llvm/IR/RuntimeLibcalls.td +++ llvm/include/llvm/IR/RuntimeLibcalls.td @@ -3533,6 +3533,30 @@ def LegacyDefaultSystemLibrary DefaultStackProtector )>; +//===----------------------------------------------------------------------===// +// IA-64 Runtime Libcalls +//===----------------------------------------------------------------------===// + +def isIA64 : RuntimeLibcallPredicate<"TT.getArch() == Triple::ia64">; + +// The legacy default set, except that IA-64's 'long double' is 80-bit double +// extended, so the F80 libcalls stripped from the default set are live here: +// libgcc's ia64 lib1funcs provides __divxf3/__divdf3/__divsf3 (FP divide has +// no instruction; add/sub/mul are native, so there is no __addxf3 etc., and +// nothing expands to one) and libm provides the l-suffixed math functions +// (fmodl, sinl, sqrtl, ...). 
+def IA64SystemLibrary + : SystemRuntimeLibrary; + //===----------------------------------------------------------------------===// // Vector math libraries //===----------------------------------------------------------------------===// diff --git llvm/include/llvm/MC/MCAsmInfo.h llvm/include/llvm/MC/MCAsmInfo.h index ea8ac6dbe6e3..6cdb6f998619 100644 --- llvm/include/llvm/MC/MCAsmInfo.h +++ llvm/include/llvm/MC/MCAsmInfo.h @@ -206,6 +206,9 @@ protected: /// quotes. bool SupportsQuotedNames = true; + /// If true, append '#' to every non-temporary symbol reference. + bool UseSymbolHashSuffix = false; + /// This is true if data region markers should be printed as /// ".data_region/.end_data_region" directives. If false, use "$d/$a" labels /// instead. @@ -571,6 +574,8 @@ public: } bool supportsNameQuoting() const { return SupportsQuotedNames; } + bool useSymbolHashSuffix() const { return UseSymbolHashSuffix; } + bool doesSupportDataRegionDirectives() const { return UseDataRegionDirectives; } diff --git llvm/include/llvm/TargetParser/Triple.h llvm/include/llvm/TargetParser/Triple.h index 9c83abeeb3b1..7f9b3cae974a 100644 --- llvm/include/llvm/TargetParser/Triple.h +++ llvm/include/llvm/TargetParser/Triple.h @@ -61,6 +61,7 @@ public: csky, // CSKY: csky dxil, // DXIL 32-bit DirectX bytecode hexagon, // Hexagon: hexagon + ia64, // IA-64 (Itanium): ia64 loongarch32, // LoongArch (32-bit): loongarch32 loongarch64, // LoongArch (64-bit): loongarch64 m68k, // M68k: Motorola 680x0 family diff --git llvm/lib/MC/MCSymbol.cpp llvm/lib/MC/MCSymbol.cpp index cf44005139ab..971b4dedbd74 100644 --- llvm/lib/MC/MCSymbol.cpp +++ llvm/lib/MC/MCSymbol.cpp @@ -61,8 +61,17 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const { // some targets support quoting names with funny characters. If the name // contains a funny character, then print it quoted. 
StringRef Name = getName(); + + // Some assemblers (IA-64 GNU as) parse a bare identifier matching a register + // name alias as that register even in symbol position, so a non-temporary + // symbol reference is decorated with a trailing '#' that the assembler strips. + // See MCAsmInfo::UseSymbolHashSuffix. + bool HashSuffix = MAI && MAI->useSymbolHashSuffix() && !isTemporary(); + if (!MAI || MAI->isValidUnquotedName(Name)) { OS << Name; + if (HashSuffix) + OS << '#'; return; } @@ -81,6 +90,8 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const { OS << C; } OS << '"'; + if (HashSuffix) + OS << '#'; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git llvm/lib/Target/IA64/CMakeLists.txt llvm/lib/Target/IA64/CMakeLists.txt new file mode 100644 index 000000000000..7fa1c25e8e84 --- /dev/null +++ llvm/lib/Target/IA64/CMakeLists.txt @@ -0,0 +1,51 @@ +add_llvm_component_group(IA64) + +# The IA64ISD SDNodes are declared by hand in IA64ISelLowering.h (as the +# pre-removal backend did and many in-tree targets still do), so +# -gen-sd-node-info is not used. AsmParser, Disassembler and object emission +# (-gen-asm-matcher / -gen-disassembler / MCCodeEmitter / AsmBackend) are out of +# Stage-1 scope (asm-output path only). 
+ +set(LLVM_TARGET_DEFINITIONS IA64.td) + +tablegen(LLVM IA64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM IA64GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM IA64GenCallingConv.inc -gen-callingconv) +tablegen(LLVM IA64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM IA64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM IA64GenSubtargetInfo.inc -gen-subtarget) + +add_public_tablegen_target(IA64CommonTableGen) + +add_llvm_target(IA64CodeGen + IA64AsmPrinter.cpp + IA64Bundling.cpp + IA64FrameLowering.cpp + IA64ISelDAGToDAG.cpp + IA64ISelLowering.cpp + IA64InstrInfo.cpp + IA64MCInstLower.cpp + IA64MachineFunctionInfo.cpp + IA64RegisterInfo.cpp + IA64Subtarget.cpp + IA64TargetMachine.cpp + + LINK_COMPONENTS + AsmPrinter + CodeGen + CodeGenTypes + Core + MC + SelectionDAG + Support + Target + TargetParser + IA64Desc + IA64Info + + ADD_TO_COMPONENT + IA64 + ) + +add_subdirectory(MCTargetDesc) +add_subdirectory(TargetInfo) diff --git llvm/lib/Target/IA64/IA64.h llvm/lib/Target/IA64/IA64.h new file mode 100644 index 000000000000..de72d6fdcb02 --- /dev/null +++ llvm/lib/Target/IA64/IA64.h @@ -0,0 +1,29 @@ +//===-- IA64.h - Top-level interface for IA64 representation ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// IA64 back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_IA64_IA64_H +#define LLVM_LIB_TARGET_IA64_IA64_H + +namespace llvm { + +class FunctionPass; +class PassRegistry; +class TargetMachine; + +FunctionPass *createIA64ISelDag(TargetMachine &TM); +FunctionPass *createIA64BundlingPass(); +void initializeIA64DAGToDAGISelLegacyPass(PassRegistry &); + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_IA64_IA64_H diff --git llvm/lib/Target/IA64/IA64.td llvm/lib/Target/IA64/IA64.td new file mode 100644 index 000000000000..7da4cb0fc3e8 --- /dev/null +++ llvm/lib/Target/IA64/IA64.td @@ -0,0 +1,52 @@ +//===-- IA64.td - Target definition file for Intel IA64 ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel IA-64 architecture, also +// known variously as ia64, IA-64, IPF, "the Itanium architecture" etc. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing. +include "llvm/Target/Target.td" +include "llvm/TableGen/SearchableTable.td" + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "IA64RegisterInfo.td" + +// Map the PointerLikeRegClass (ptr_rc) operands of the target-independent +// pseudo-instructions to IA-64's general registers. 
+defm : RemapAllTargetPseudoPointerOperands; + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "IA64InstrInfo.td" + +def IA64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Calling Convention Description +//===----------------------------------------------------------------------===// + +include "IA64CallingConv.td" + +//===----------------------------------------------------------------------===// +// IA-64 processors +//===----------------------------------------------------------------------===// + +// The pre-removal backend had a trivial subtarget with no features; a single +// generic processor with no scheduling model is enough to drive -gen-subtarget. +def : ProcessorModel<"generic", NoSchedModel, []>; + +def IA64 : Target { + let InstructionSet = IA64InstrInfo; +} diff --git llvm/lib/Target/IA64/IA64AsmPrinter.cpp llvm/lib/Target/IA64/IA64AsmPrinter.cpp new file mode 100644 index 000000000000..7ad8b417b005 --- /dev/null +++ llvm/lib/Target/IA64/IA64AsmPrinter.cpp @@ -0,0 +1,253 @@ +//===-- IA64AsmPrinter.cpp - Print out IA64 LLVM as assembly --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts the machine-dependent LLVM code +// to GNU 'gas'-compatible IA-64 assembly. Unlike the pre-removal backend, which +// hand-formatted each MachineInstr, this lowers each MachineInstr to an MCInst +// and lets the streamer + IA64InstPrinter emit the text. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "IA64MCInstLower.h"
+#include "MCTargetDesc/IA64InstPrinter.h"
+#include "MCTargetDesc/IA64MCAsmInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "MCTargetDesc/IA64TargetStreamer.h"
+#include "TargetInfo/IA64TargetInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class IA64AsmPrinter : public AsmPrinter {
+  // Per-function state for driving the IA-64 unwind directives (see
+  // emitInstruction). Reset in emitFunctionBodyStart.
+  bool EmittedBody = false;
+  bool EmittedFFrame = false;
+  // A framed function with more than one epilogue needs .label_state /
+  // .copy_state around its '.restore sp's; otherwise gas rejects the second one.
+  bool NeedCopyState = false;
+  // Set while lowering a GlobalAlias's aliasee: an alias names the aliasee's
+  // entry-point symbol directly (`A = B`), so suppress the @fptr descriptor
+  // wrapping lowerConstant applies to functions stored in data. See
+  // emitGlobalAlias / lowerConstant.
+  bool InAliasLowering = false;
+
+  IA64TargetStreamer &getTargetStreamer() {
+    return static_cast<IA64TargetStreamer &>(*OutStreamer->getTargetStreamer());
+  }
+
+public:
+  static char ID;
+
+  explicit IA64AsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
+
+  StringRef getPassName() const override { return "IA64 Assembly Printer"; }
+
+  void emitStartOfAsmFile(Module &M) override;
+  void emitFunctionEntryLabel() override;
+  void emitFunctionBodyStart() override;
+  void emitFunctionBodyEnd() override;
+  void emitInstruction(const MachineInstr *MI) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       const char *ExtraCode, raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             const char *ExtraCode, raw_ostream &O) override;
+  void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override;
+  const MCExpr *lowerConstant(const Constant *CV, const Constant *BaseCV,
+                              uint64_t Offset) override;
+};
+} // end anonymous namespace
+
+char IA64AsmPrinter::ID = 0;
+
+void IA64AsmPrinter::emitStartOfAsmFile(Module & /*M*/) {
+  // The IA-64 assembly preamble expected by GNU gas, matching the pre-removal
+  // output. (lsb should be msb on HP-UX; we only support 64-bit.)
+  OutStreamer->emitRawText(StringRef("\t.psr\tlsb"));
+  OutStreamer->emitRawText(StringRef("\t.radix\tC"));
+  OutStreamer->emitRawText(StringRef("\t.psr\tabi64"));
+}
+
+void IA64AsmPrinter::emitFunctionEntryLabel() {
+  // Open the unwind region before the function label, the way gcc does. The
+  // prologue/body directives are emitted per-instruction in emitInstruction;
+  // .endp follows the body in emitFunctionBodyEnd.
+  getTargetStreamer().emitProc(CurrentFnSym);
+  AsmPrinter::emitFunctionEntryLabel();
+}
+
+void IA64AsmPrinter::emitFunctionBodyStart() {
+  EmittedBody = false;
+  EmittedFFrame = false;
+
+  // A '.restore sp' closes the unwind region it sits in, so a framed function
+  // with several return blocks needs .label_state/.copy_state to re-open it for
+  // each one. Single-epilogue (or frameless) functions emit a bare '.restore'
+  // (or none), matching gcc. getStackSize() != 0 is exactly the has-a-frame
+  // (and therefore has-a-'.restore sp') condition.
+  unsigned RetBlocks = 0;
+  bool Framed = MF->getFrameInfo().getStackSize() != 0;
+  if (Framed)
+    for (const MachineBasicBlock &MBB : *MF)
+      if (!MBB.empty() && MBB.back().getOpcode() == IA64::RET)
+        ++RetBlocks;
+  NeedCopyState = Framed && RetBlocks > 1;
+}
+
+void IA64AsmPrinter::emitFunctionBodyEnd() {
+  getTargetStreamer().emitEndP(CurrentFnSym);
+}
+
+void IA64AsmPrinter::emitInstruction(const MachineInstr *MI) {
+  IA64TargetStreamer &TS = getTargetStreamer();
+
+  // Emit the IA-64 unwind directive that describes this prologue/epilogue
+  // instruction, before the instruction itself, so gas associates the unwind
+  // record with the right PC. The prologue (alloc, the rp save, the stack
+  // adjust) is tagged FrameSetup by frame lowering and ISel; the stack restore
+  // is tagged FrameDestroy. The first non-prologue instruction ends the
+  // prologue region with .body.
+  if (MI->getFlag(MachineInstr::FrameSetup)) {
+    switch (MI->getOpcode()) {
+    case IA64::ALLOC:
+      // alloc copies the caller's ar.pfs into its destination register.
+      TS.emitPrologueDirective();
+      TS.emitSaveARPFS(
+          IA64InstPrinter::getRegisterName(MI->getOperand(0).getReg().asMCReg()));
+      break;
+    case IA64::MOV:
+      // The return-pointer save is 'mov rN = rp'; distinguish it from the
+      // frame-pointer setup 'mov r5 = r12' by its source register.
+      if (MI->getOperand(1).getReg() == IA64::rp)
+        TS.emitSaveRP(IA64InstPrinter::getRegisterName(
+            MI->getOperand(0).getReg().asMCReg()));
+      break;
+    case IA64::ADDIMM22:
+    case IA64::ADD:
+      // The stack-pointer adjustment writes r12; the .fframe value is the final
+      // frame size frame lowering settled on.
+      if (!EmittedFFrame && MI->getOperand(0).getReg() == IA64::r12) {
+        TS.emitFFrame(MF->getFrameInfo().getStackSize());
+        EmittedFFrame = true;
+      }
+      break;
+    }
+  } else if (!EmittedBody && !MI->isMetaInstruction() &&
+             MI->getOpcode() != IA64::STOP) {
+    // End the prologue region at the first real body instruction. Skip the
+    // bundler's STOP (';;') pseudo: one can land between prologue instructions
+    // (e.g. the forced stop after 'alloc'), and treating it as the body start
+    // would push .fframe / a late .save past .body.
+    TS.emitBody();
+    EmittedBody = true;
+    if (NeedCopyState)
+      TS.emitLabelState(1);
+  }
+
+  if (MI->getFlag(MachineInstr::FrameDestroy) &&
+      (MI->getOpcode() == IA64::ADDIMM22 || MI->getOpcode() == IA64::ADD) &&
+      MI->getOperand(0).getReg() == IA64::r12) {
+    if (NeedCopyState)
+      TS.emitCopyState(1);
+    TS.emitRestoreSP();
+  }
+
+  IA64MCInstLower Lower(OutContext, *this);
+  MCInst TmpInst;
+  Lower.Lower(MI, TmpInst);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Print an inline-asm operand referenced by a '$N' substitution. We handle the
+// no-modifier register and immediate cases (covering the 'r'/'f' and immediate
+// constraints); anything else defers to the generic AsmPrinter handler.
+bool IA64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                     const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    // We define no IA-64-specific modifiers; let the generic handler try.
+    return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << IA64InstPrinter::getRegisterName(MO.getReg().asMCReg());
+    return false;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return false;
+  default:
+    break;
+  }
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+}
+
+// An inline-asm memory operand ('m'): the address lives in a single register,
+// dereferenced as '[rN]'.
+bool IA64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, O);
+
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  if (!MO.isReg())
+    return true;
+  O << '[' << IA64InstPrinter::getRegisterName(MO.getReg().asMCReg()) << ']';
+  return false;
+}
+
+// A GlobalAlias is just another name for the aliasee's symbol; on IA-64 a
+// function alias must resolve to the aliasee's *entry point*, not its function
+// descriptor. The generic AsmPrinter lowers the aliasee through lowerConstant()
+// (which wraps functions in @fptr), so `A = @fptr(B)` would be emitted: that
+// both mis-aliases A to the descriptor and makes GNU as abort (a symbol's value
+// expression can't be an @fptr pseudo-fixup -- "Case value 64 unexpected" in
+// resolve_symbol_value). Flag the alias context so lowerConstant emits the bare
+// entry-point symbol, yielding the correct `A = B`.
+void IA64AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
+  InAliasLowering = true;
+  AsmPrinter::emitGlobalAlias(M, GA);
+  InAliasLowering = false;
+}
+
+const MCExpr *IA64AsmPrinter::lowerConstant(const Constant *CV,
+                                            const Constant *BaseCV,
+                                            uint64_t Offset) {
+  // A function pointer stored in data is the address of the function's
+  // descriptor { entry, gp }, not its entry point: emit data8 @fptr(f). The
+  // linker materializes the .opd descriptor; an indirect call dereferences it.
+  // (Skipped under alias lowering, where the alias must equal the entry point.)
+  if (const auto *F = dyn_cast<Function>(CV)) {
+    const MCExpr *E = MCSymbolRefExpr::create(getSymbol(F), OutContext);
+    if (InAliasLowering)
+      return E;
+    return MCSpecifierExpr::create(E, IA64::S_FPTR, OutContext);
+  }
+  return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeIA64AsmPrinter() {
+  RegisterAsmPrinter<IA64AsmPrinter> X(getTheIA64Target());
+}
diff --git llvm/lib/Target/IA64/IA64Bundling.cpp llvm/lib/Target/IA64/IA64Bundling.cpp
new file mode 100644
index 000000000000..93fb1be99391
--- /dev/null
+++ llvm/lib/Target/IA64/IA64Bundling.cpp
@@ -0,0 +1,121 @@
+//===-- IA64Bundling.cpp - IA-64 instruction bundling pass. --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Add stops (;;) where required to prevent read-after-write and write-after-
+// write dependencies, for registers. (The pre-removal pass noted exceptions for
+// parallel compares targeting p0; those are not reintroduced here.)
+//
+// FIXME: actual bundle formation is left to the assembler; this only inserts
+// stop bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ia64-bundling"
+
+STATISTIC(StopBitsAdded, "Number of stop bits added");
+
+namespace {
+struct IA64BundlingPass : public MachineFunctionPass {
+  static char ID;
+
+  IA64BundlingPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "IA64 (Itanium) Bundling Pass";
+  }
+
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    TII = F.getSubtarget().getInstrInfo();
+    // The pass object is reused for each function: start with no carried state.
+    PendingRegWrites.clear();
+    RSEWrite = false;
+    bool Changed = false;
+    for (MachineBasicBlock &MBB : F)
+      Changed |= runOnMachineBasicBlock(MBB);
+    return Changed;
+  }
+
+private:
+  const TargetInstrInfo *TII = nullptr;
+
+  // Ugly carried state, but pending writes can cross basic blocks. Taken
+  // branches end instruction groups, so only fallthrough code matters.
+  std::set<Register> PendingRegWrites;
+
+  // Likewise carried across blocks: an alloc writes the RSE/CFM and must be
+  // separated from a later call by a stop, and that call often sits in a
+  // fall-through successor of the block holding the alloc. Both fields are
+  // reset at function entry; RSEWrite also when a stop is emitted.
+  bool RSEWrite = false;
+};
+char IA64BundlingPass::ID = 0;
+} // end anonymous namespace
+
+/// createIA64BundlingPass - Returns a pass that adds STOP (;;) instructions
+/// where inter-instruction register dependencies require them.
+FunctionPass *llvm::createIA64BundlingPass() { return new IA64BundlingPass(); }
+
+/// Insert a STOP before each instruction in \p MBB that hits a pending write.
+bool IA64BundlingPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  for (MachineInstr &MI : MBB) {
+    // Debug instructions emit no code and must not influence stop placement.
+    if (MI.isDebugInstr())
+      continue;
+    std::set<Register> CurrentReads, CurrentWrites, OrigWrites;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.isUse()) // TODO: exclude p0
+        CurrentReads.insert(MO.getReg());
+      if (MO.isDef()) { // TODO: exclude p0
+        CurrentWrites.insert(MO.getReg());
+        OrigWrites.insert(MO.getReg());
+      }
+    }
+
+    // Keep only the registers still pending a write (no stop since the writer).
+    set_intersect(CurrentReads, PendingRegWrites);
+    set_intersect(CurrentWrites, PendingRegWrites);
+
+    if ((RSEWrite && MI.isCall()) ||
+        !(CurrentReads.empty() && CurrentWrites.empty())) {
+      // Conflict (or the forced stop after an alloc): insert a stop before this
+      // instruction and reset the pending set to this instruction's writes.
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(IA64::STOP));
+      PendingRegWrites = OrigWrites;
+      Changed = true;
+      RSEWrite = false;
+      ++StopBitsAdded;
+    } else {
+      // No conflict: accumulate this instruction's writes.
+      set_union(PendingRegWrites, OrigWrites);
+    }
+
+    // An alloc writes into the RSE and has to be separated from calls
+    if (MI.getOpcode() == IA64::ALLOC)
+      RSEWrite = true;
+  }
+
+  return Changed;
+}
diff --git llvm/lib/Target/IA64/IA64CallingConv.td llvm/lib/Target/IA64/IA64CallingConv.td
new file mode 100644
index 000000000000..c585afabc0ac
--- /dev/null
+++ llvm/lib/Target/IA64/IA64CallingConv.td
@@ -0,0 +1,96 @@
+//===-- IA64CallingConv.td - Calling conventions for IA64 --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the IA-64 architecture. The
+// pre-removal backend hand-coded this logic in IA64ISelLowering; modern LLVM
+// expresses it as CCState-driven tables, so the equivalent mapping is captured
+// here and consumed via -gen-callingconv.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// IA-64 C argument calling convention.
+//===----------------------------------------------------------------------===//
+def CC_IA64 : CallingConv<[
+  // A large aggregate return value is materialized in a caller-allocated
+  // buffer; its address arrives in r8, not a normal parameter slot (psABI
+  // §8.6). r8 is scratch, so it does not consume out0.
+  CCIfSRet<CCIfType<[i64], CCAssignToReg<[r8]>>>,
+
+  // Integer types smaller than a register are passed as i64.
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+
+  // long double (f80): one FP register, shadowing two GR parameter slots.
+  CCIfType<[f80], CCCustom<"CC_IA64_F80">>,
+
+  // The first eight integer arguments arrive in the incoming stacked GP
+  // registers r32-r39.
+  CCIfType<[i64], CCAssignToReg<[r32, r33, r34, r35, r36, r37, r38, r39]>>,
+
+  // FP scalars: one positional parameter slot each. A fixed arg arrives in
+  // F8-F15 while *reserving* its GR slot so a following integer keeps its slot;
+  // a variadic arg arrives in the GR slot in memory format (psABI 8.5.4).
+  // CCAssignToRegWithShadow cannot express the "reserve the slot, not the
+  // FP-indexed GR" rule, so use a custom hook.
+  CCIfType<[f64, f32], CCCustom<"CC_IA64_FP">>,
+
+  // Everything beyond the eight register slots is passed in 8-byte stack slots.
+  CCIfType<[i64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// IA-64 C *outgoing* (caller-side) argument convention.
+//
+// A caller places arguments in its output registers out0-out7, which the
+// callee's 'alloc' renames into that callee's incoming r32+. This mirrors
+// CC_IA64 slot for slot (after widening f32 to f64), but targets out0-out7.
+//===----------------------------------------------------------------------===//
+def CC_IA64_Call : CallingConv<[
+  // Large-aggregate-return buffer address goes in r8 (psABI §8.6), matching the
+  // incoming side in CC_IA64.
+  CCIfSRet<CCIfType<[i64], CCAssignToReg<[r8]>>>,
+
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+  CCIfType<[f32], CCPromoteToType<f64>>,
+
+  // long double (f80): a fixed arg goes in one FP register, shadowing two
+  // output parameter slots; a variadic arg goes in two general registers
+  // (memory format). LowerCall splits the variadic case.
+  CCIfType<[f80], CCCustom<"CC_IA64_Call_F80">>,
+
+  CCIfType<[i64],
+           CCAssignToReg<[out0, out1, out2, out3, out4, out5, out6, out7]>>,
+
+  // FP scalars (f64; f32 promoted above): one positional output slot each. A
+  // fixed arg goes in F8-F15 while reserving its output GR slot; a variadic arg
+  // ('...' match) goes in the output GR slot in memory format (prototyped-
+  // variadic psABI 8.5.4) -- LowerCall bit-casts it to its i64 pattern. The hook
+  // reserves the slot positionally, which CCAssignToRegWithShadow cannot do.
+  CCIfType<[f64], CCCustom<"CC_IA64_Call_FP">>,
+
+  CCIfType<[i64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// IA-64 C return-value convention.
+//===----------------------------------------------------------------------===//
+def RetCC_IA64 : CallingConv<[
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Scalar integers/pointers return in r8 (65-128 bits in r8,r9). Aggregates up
+  // to 256 bits are returned by value in r8-r11 -- clang coerces them to an
+  // [N x i64] whose elements land here in order (psABI Table 8-2). Aggregates
+  // larger than 256 bits are returned via a caller buffer whose address is
+  // passed in r8 (sret; see CCIfSRet in the argument conventions).
+  CCIfType<[i64], CCAssignToReg<[r8, r9, r10, r11]>>,
+
+  // Floating-point and homogeneous FP aggregates return in F8-F15 (a coerced
+  // [N x double]/[N x float] supplies one value per register).
+  CCIfType<[f64, f32], CCAssignToReg<[F8, F9, F10, F11, F12, F13, F14, F15]>>,
+  CCIfType<[f80], CCAssignToReg<[F8, F9, F10, F11, F12, F13, F14, F15]>>
+]>;
diff --git llvm/lib/Target/IA64/IA64FrameLowering.cpp llvm/lib/Target/IA64/IA64FrameLowering.cpp
new file mode 100644
index 000000000000..a809e1a992a9
--- /dev/null
+++ llvm/lib/Target/IA64/IA64FrameLowering.cpp
@@ -0,0 +1,282 @@
+//===-- IA64FrameLowering.cpp - IA64 Frame Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of TargetFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64FrameLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64RegisterInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.
(The pre-removal backend also forced this off +// -fomit-frame-pointer; that global is gone, so we only key off var-sized +// objects.) +bool IA64FrameLowering::hasFPImpl(const MachineFunction &MF) const { + return MF.getFrameInfo().hasVarSizedObjects(); +} + +void IA64FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + bool FP = hasFP(MF); + DebugLoc DL; + + // First, handle the 'alloc' instruction, which must be at the top of any + // function. There are 96 stacked GPRs the RSE worries about. + unsigned NumStackedGPRsUsed = 0; + for (unsigned i = 0; i != IA64NumStackedGPRs; ++i) { + // SkipRegMaskTest: count a stacked register only if it is really allocated + // to a value here, not merely clobbered by a call's regmask. A returns_twice + // (vfork/setjmp) call carries a regmask clobbering all of r32-r127 (see + // IA64TargetLowering::AdjustInstrPostInstrSelection) to keep values out of + // the RSE-backed frame across it; without skipping the mask that would size + // this 'alloc' to the full 96-register frame. + if (MF.getRegInfo().isPhysRegUsed(getIA64StackedGPR(i), + /*SkipRegMaskTest=*/true)) + NumStackedGPRsUsed = i + 1; // i+1, not ++ - consider fn(fp, fp, int) + } + + unsigned NumOutRegsUsed = MF.getInfo()->OutRegsUsed; + + IA64FunctionInfo *FInfo = MF.getInfo(); + + // Park the caller's ar.pfs in a fixed stacked local for the whole function. + // 'alloc' writes the incoming ar.pfs into its destination register, and every + // function must restore that value before br.ret so the register stack engine + // can recover the caller's frame. 
Make the destination a fresh stacked local + // just above the ones the allocator used: a register stack engine local is + // preserved across calls for free, and because the allocator never sees this + // register it is never spilled -- so the value stays in one place that the + // unwinder can name in a single '.save ar.pfs, ' directive valid for the + // entire body. + // + // The old backend instead let the allocator place the ar.pfs-save value (via + // PSEUDO_ALLOC). In a non-leaf function the allocator spilled that value to a + // stack slot across calls and reused the register, so '.save ar.pfs, ' + // named a register that no longer held ar.pfs at the call sites. That was + // invisible to gdb's read-only backtrace (which only needs the return address + // from '.save rp') but crashed libgcc's forced unwinder (pthread_exit / + // pthread_cancel), which must actually restore ar.pfs to pop the RSE frame. + Register SavedPFSReg = getIA64StackedGPR(NumStackedGPRsUsed); + ++NumStackedGPRsUsed; + FInfo->setSavedPFSReg(SavedPFSReg); + + // For a non-leaf function, br.call overwrites the return pointer (b0/rp), so + // we must preserve the caller's return address for our own br.ret. The + // register allocator already does this lazily -- it copies rp into a stacked + // local around each call -- but those copies land in a different register at + // each call site, so there is no single location the unwinder can name. Park + // rp once here, in a fresh stacked local (just like ar.pfs above), so the + // frame is describable by one '.save rp, ' directive. emitEpilogue + // restores b0 from it. + // + // hasCalls() is the right test: it covers libcalls (e.g. the __divdi3 a sdiv + // lowers to) that clobber rp without any IR-level call, which a check earlier + // than frame lowering could not see. 
+ Register SavedRPReg; + if (MFI.hasCalls()) { + SavedRPReg = getIA64StackedGPR(NumStackedGPRsUsed); + ++NumStackedGPRsUsed; + FInfo->setSavedRPReg(SavedRPReg); + } + + // The whole stacked frame -- locals (the allocator's plus our ar.pfs/rp saves) + // and the outputs (out0-out7, placed by gas above the locals) -- must fit in + // the 96-register window. getReservedRegs guarantees this by capping the + // allocator's locals: it reserves the top 10 stacked GPRs (8 outputs + the rp + // save + the ar.pfs save). + assert(NumStackedGPRsUsed + NumOutRegsUsed <= IA64NumStackedGPRs && + "stacked-GPR frame overflow: locals + saves + outputs > 96"); + + // 'alloc' must be the first instruction in the function; its destination is + // the parked ar.pfs local. Mark that operand as a Define: 'alloc' writes the + // caller's ar.pfs into it, and the bundling pass needs to see that write so it + // inserts the mandatory stop before any instruction that reads the register -- + // notably the epilogue's 'mov ar.pfs = '. (Using 'alloc's result, or any + // register it renames, in the same instruction group is illegal and faults + // with SIGILL.) The old backend's PSEUDO_ALLOC supplied this def; without it, + // a use of the addReg default would leave 'alloc' looking like a reader. + // + // Tag it (and the rest of the prologue below) as frame setup so the asm + // printer can hang the IA-64 unwind directives (.prologue / .save ar.pfs / + // .save rp / .fframe) off the right instructions. + BuildMI(MBB, MBBI, DL, TII->get(IA64::ALLOC)) + .addReg(SavedPFSReg, RegState::Define) + .addImm(0) + .addImm(NumStackedGPRsUsed) + .addImm(NumOutRegsUsed) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + // The ar.pfs local is defined by 'alloc' here and used by emitEpilogue's + // restore in another block; mark it live across the whole function so its + // value is correctly seen as live everywhere. 
+ for (MachineBasicBlock &Block : MF) + if (&Block != &MBB) + Block.addLiveIn(SavedPFSReg); + + // Save the incoming return pointer into its parked local, and likewise mark it + // live across the function. + if (SavedRPReg) { + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), SavedRPReg) + .addReg(IA64::rp) + .setMIFlag(MachineInstr::FrameSetup); + for (MachineBasicBlock &Block : MF) + if (&Block != &MBB) + Block.addLiveIn(SavedRPReg); + } + + // Get the number of bytes to allocate from the FrameInfo. + unsigned NumBytes = MFI.getStackSize(); + + if (FP) + NumBytes += 8; // reserve space for the old FP + + // Do we need to allocate space on the stack? + if (NumBytes == 0) + return; + + // Add 16 bytes at the bottom of the stack (scratch area) and round the size + // to a multiple of the alignment. + unsigned Align = getStackAlign().value(); + unsigned Size = 16 + (FP ? 8 : 0); + NumBytes = (NumBytes + Size + Align - 1) / Align * Align; + MFI.setStackSize(NumBytes); + + // Adjust the stack pointer: r12 -= NumBytes. + if (NumBytes <= 8191) { + BuildMI(MBB, MBBI, DL, TII->get(IA64::ADDIMM22), IA64::r12) + .addReg(IA64::r12) + .addImm(-(int64_t)NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { // use r22 as a scratch register + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOVLIMM64), IA64::r22) + .addImm(-(int64_t)NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(IA64::ADD), IA64::r12) + .addReg(IA64::r12) + .addReg(IA64::r22) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Now, if we need to, save the old FP and set the new one. 
+ if (FP) { + BuildMI(MBB, MBBI, DL, TII->get(IA64::ST8)) + .addReg(IA64::r12) + .addReg(IA64::r5) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), IA64::r5) + .addReg(IA64::r12) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +void IA64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + MachineBasicBlock::iterator MBBI = std::prev(MBB.end()); + assert(MBBI->getOpcode() == IA64::RET && + "Can only insert epilog into returning blocks"); + DebugLoc DL = MBBI->getDebugLoc(); + bool FP = hasFP(MF); + + unsigned NumBytes = MFI.getStackSize(); + + // Restore the caller's ar.pfs from the local 'alloc' parked it in, so our + // br.ret lets the register stack engine recover the caller's frame. Every + // function has this save (see emitPrologue), so the register is always valid. + // Keeping it live here anchors the '.save ar.pfs' unwind region across the + // whole body. + Register SavedPFSReg = MF.getInfo()->getSavedPFSReg(); + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV_TO_AR_PFS), IA64::AR_PFS) + .addReg(SavedPFSReg) + .setMIFlag(MachineInstr::FrameDestroy); + + // Restore the incoming return pointer (b0/rp) from the local the prologue + // parked it in, so our br.ret returns to the caller. This also keeps that + // local live, anchoring the '.save rp' unwind region across the whole body. + if (Register SavedRPReg = MF.getInfo()->getSavedRPReg()) + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), IA64::rp) + .addReg(SavedRPReg) + .setMIFlag(MachineInstr::FrameDestroy); + + // If we need to, restore the old FP. + if (FP) { + // Copy the FP into the SP (discards allocas). + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), IA64::r12) + .addReg(IA64::r5) + .setMIFlag(MachineInstr::FrameDestroy); + // Restore the FP. 
+ BuildMI(MBB, MBBI, DL, TII->get(IA64::LD8), IA64::r5) + .addReg(IA64::r5) + .setMIFlag(MachineInstr::FrameDestroy); + } + + if (NumBytes != 0) { + if (NumBytes <= 8191) { + BuildMI(MBB, MBBI, DL, TII->get(IA64::ADDIMM22), IA64::r12) + .addReg(IA64::r12) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + BuildMI(MBB, MBBI, DL, TII->get(IA64::MOVLIMM64), IA64::r22) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DL, TII->get(IA64::ADD), IA64::r12) + .addReg(IA64::r12) + .addReg(IA64::r22) + .setMIFlag(MachineInstr::FrameDestroy); + } + } +} + +MachineBasicBlock::iterator IA64FrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + if (hasFP(MF)) { + // If we have a frame pointer, turn the adjcallstackup instruction into a + // 'sub sp, ' and the adjcallstackdown instruction into 'add sp, '. + MachineInstr &Old = *I; + unsigned Amount = Old.getOperand(0).getImm(); + DebugLoc DL = Old.getDebugLoc(); + if (Amount != 0) { + // Keep the stack aligned: round up to the next alignment boundary. 
+ unsigned Align = getStackAlign().value(); + Amount = (Amount + Align - 1) / Align * Align; + + if (Old.getOpcode() == IA64::ADJUSTCALLSTACKDOWN) { + BuildMI(MBB, I, DL, TII->get(IA64::ADDIMM22), IA64::r12) + .addReg(IA64::r12) + .addImm(-(int64_t)Amount); + } else { + assert(Old.getOpcode() == IA64::ADJUSTCALLSTACKUP); + BuildMI(MBB, I, DL, TII->get(IA64::ADDIMM22), IA64::r12) + .addReg(IA64::r12) + .addImm(Amount); + } + } + } + + return MBB.erase(I); +} diff --git llvm/lib/Target/IA64/IA64FrameLowering.h llvm/lib/Target/IA64/IA64FrameLowering.h new file mode 100644 index 000000000000..d37a2e8421f0 --- /dev/null +++ llvm/lib/Target/IA64/IA64FrameLowering.h @@ -0,0 +1,57 @@ +//===-- IA64FrameLowering.h - Define frame lowering for IA64 ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class implements the IA64-specific bits of the TargetFrameLowering +// class. In the pre-removal backend this logic lived in IA64RegisterInfo; +// modern LLVM splits frame lowering into its own class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H +#define LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H + +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/Support/Alignment.h" + +namespace llvm { + +class IA64FrameLowering : public TargetFrameLowering { +public: + // StackRealignable=false: this backend does not dynamically realign the + // stack. sp (r12) is only 16-byte aligned and the prologue never emits an + // 'and sp, -N', so we cannot honor a local whose alignment exceeds 16 by + // placing it at a static sp+offset slot. 
If we claimed otherwise (the + // default is true), FunctionLoweringInfo would fold an over-aligned + // (e.g. #[repr(align(64))]) alloca into the static frame; SelectionDAG's + // computeKnownBits would then trust the frame-index pointer to be 64-aligned + // and rewrite field GEPs 'add base, k' into 'or base, k' -- which collide + // and corrupt fields once the runtime address is merely 16-aligned. With + // this false, such allocas are instead demoted to variable-sized objects and + // lowered via DYNAMIC_STACKALLOC (Expand emits 'sp -= size; sp &= -align'), + // so the pointer is genuinely aligned and the 'or' rewrite is valid. The + // demotion also sets hasVarSizedObjects(), which turns on hasFP so the + // epilogue restores sp from the frame pointer. + IA64FrameLowering() + : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/Align(16), + /*LocalAreaOffset=*/0, /*TransientStackAlignment=*/ + Align(16), /*StackRealignable=*/false) {} + + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H diff --git llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp new file mode 100644 index 000000000000..c2593c910fdd --- /dev/null +++ llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp @@ -0,0 +1,458 @@ +//===---- IA64ISelDAGToDAG.cpp - IA64 pattern matching inst selector ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a pattern matching instruction selector for IA64, +// converting a legalized DAG into an IA64 DAG. +// +// The pre-removal selector hand-selected a great deal (FP divide expansion, the +// BRCALL call hack, manual load/store/branch handling). Most arithmetic now +// flows through the tablegen-generated matcher (SelectCode); the cases that +// cannot be (or were not) expressed as patterns are hand-selected here, as the +// pre-removal backend did: FrameIndex (Stage 1); the branches BR/BRCOND, whose +// target is an i64imm rather than a tablegen 'bb' operand; the IA64ISD::BRCALL +// call node; and loads/stores, dispatched on the memory type (Stage C). +// +//===----------------------------------------------------------------------===// + +#include "IA64.h" +#include "IA64ISelLowering.h" +#include "MCTargetDesc/IA64MCAsmInfo.h" +#include "MCTargetDesc/IA64MCTargetDesc.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "ia64-isel" +#define PASS_NAME "IA64 (Itanium) DAG->DAG Pattern Instruction Selection" + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +class IA64DAGToDAGISel : public SelectionDAGISel { +public: + IA64DAGToDAGISel() = delete; + + explicit IA64DAGToDAGISel(TargetMachine &TM) : SelectionDAGISel(TM) {} + + void Select(SDNode *N) override; + + // Inline-asm memory operand ('m'): IA-64 dereferences a single register, so + // the address is just passed through as one operand. 
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, + InlineAsm::ConstraintCode ConstraintID, + std::vector &OutOps) override; + + // Include the pieces autogenerated from the target description. +#include "IA64GenDAGISel.inc" +}; + +class IA64DAGToDAGISelLegacy : public SelectionDAGISelLegacy { +public: + static char ID; + explicit IA64DAGToDAGISelLegacy(TargetMachine &TM) + : SelectionDAGISelLegacy(ID, std::make_unique(TM)) {} +}; +} // end anonymous namespace + +char IA64DAGToDAGISelLegacy::ID = 0; + +INITIALIZE_PASS(IA64DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) + +// Convert a target-independent node to a target-specific one, unless the +// generated matcher can do it for us. +void IA64DAGToDAGISel::Select(SDNode *N) { + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return; // Already selected. + } + + switch (N->getOpcode()) { + case ISD::FrameIndex: { + int FI = cast(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i64); + CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64, TFI); + return; + } + + case ISD::GlobalAddress: { + // Materialize a global's address out of the linkage table (GOT), anchored + // by gp (r1), transcribing the pre-removal selector: + // addl rX = , gp ;; ld8 rX = [rX] + // The ADDL_GA computes the address of the symbol's GOT slot relative to gp, + // and the LD8 loads the symbol's runtime address from it. The GOT slot is + // invariant, so the load is chained off the entry node. + const GlobalValue *GV = cast(N)->getGlobal(); + SDLoc dl(N); + // Tag the symbol with the @ltoff specifier (carried on the target flags); + // IA64MCInstLower turns it into the printed "@ltoff(sym)" so gas builds the + // GOT entry the LD8 below reads. A function's address is its descriptor, so + // the GOT entry must hold @ltoff(@fptr(f)) (the descriptor address), not the + // raw entry point -- an indirect call dereferences it as { entry, gp }. + unsigned Spec = isa(GV) ? 
IA64::S_LTOFF_FPTR : IA64::S_LTOFF; + SDValue GA = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, /*offset=*/0, + Spec); + SDValue Slot = SDValue( + CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64, + CurDAG->getRegister(IA64::r1, MVT::i64), GA), + 0); + SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other, + Slot, CurDAG->getEntryNode()); + ReplaceUses(SDValue(N, 0), SDValue(Ld, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + + case IA64ISD::TLS_GOTLOAD: { + // Load a thread-local datum (a TLS offset or module id) from the symbol's + // GOT slot: addl rX = @ltoff(@(sym)), gp ;; ld8 rX = [rX]. Identical to + // the GlobalAddress case above, but the @ltoff specifier is already carried + // on the operand's target flags (set by LowerGlobalTLSAddress); the loaded + // value is consumed by 'add tp' (initial-exec) or __tls_get_addr (dynamic). + SDLoc dl(N); + SDValue GA = N->getOperand(0); + SDValue Slot = SDValue( + CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64, + CurDAG->getRegister(IA64::r1, MVT::i64), GA), + 0); + SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other, + Slot, CurDAG->getEntryNode()); + ReplaceUses(SDValue(N, 0), SDValue(Ld, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + + case IA64ISD::TLS_TPREL: { + // Materialize the local-exec tp-relative offset directly: movl rX = + // @tprel(sym). The operand is a TargetGlobalAddress tagged S_TPREL; the + // result is added to tp (r13) by the caller. + SDLoc dl(N); + CurDAG->SelectNodeTo(N, IA64::MOVL_GA, MVT::i64, N->getOperand(0)); + return; + } + + case ISD::JumpTable: { + // Materialize a jump table's base address the same way as a global: load it + // from its GOT slot (addl @ltoff(.LJTI), gp ;; ld8). BR_JT expands to this + // base + scaled index, an LD8 of the (absolute) entry, and a BRIND. 
+ int JTI = cast(N)->getIndex(); + SDLoc dl(N); + SDValue JT = CurDAG->getTargetJumpTable(JTI, MVT::i64, IA64::S_LTOFF); + SDValue Slot = SDValue( + CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64, + CurDAG->getRegister(IA64::r1, MVT::i64), JT), + 0); + SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other, + Slot, CurDAG->getEntryNode()); + ReplaceUses(SDValue(N, 0), SDValue(Ld, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + + case ISD::ConstantPool: { + // Materialize a constant-pool entry's address the same way as a global or + // jump table: load it from its GOT slot (addl @ltoff(.LCPI), gp ;; ld8). + // The f80 ('long double') immediates that the legalizer spills here are then + // loaded with ldfe (the f80 load pattern). (f32/f64 immediates stay out of + // the pool -- see isFPImmLegal -- so this path is exercised only by f80.) + ConstantPoolSDNode *CP = cast(N); + SDLoc dl(N); + SDValue CPA = CurDAG->getTargetConstantPool( + CP->getConstVal(), MVT::i64, CP->getAlign(), CP->getOffset(), + IA64::S_LTOFF); + SDValue Slot = SDValue( + CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64, + CurDAG->getRegister(IA64::r1, MVT::i64), CPA), + 0); + SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other, + Slot, CurDAG->getEntryNode()); + ReplaceUses(SDValue(N, 0), SDValue(Ld, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + + case ISD::BR: { + // br bb -> (p0) brl.cond bb. The branch instructions carry an i64imm + // target operand (not a tablegen 'bb' operand), so they are hand-selected + // rather than pattern-matched, as the pre-removal backend did. The + // MachineBasicBlock operand is lowered to the block's symbol by + // IA64MCInstLower. Operands: (chain, BasicBlock). + SDValue Chain = N->getOperand(0); + SDValue Target = N->getOperand(1); + CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other, Target, Chain); + return; + } + + case ISD::BRCOND: { + // brcond p, bb -> (p) brl.cond bb. 
The conditional branch keeps only the + // taken edge; the fall-through to the other successor is a separate ISD::BR + // (BR_CC/SELECT_CC stay Expand, so the legalizer hands us setcc + brcond). + // Operands: (chain, predicate, BasicBlock). + SDValue Chain = N->getOperand(0); + SDValue Pred = N->getOperand(1); + SDValue Target = N->getOperand(2); + CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, Pred, Target, + Chain); + return; + } + + case ISD::BRIND: { + // brind addr -> mov b6 = addr ;; br.cond.sptk b6 (computed goto). + // Move the target address into branch register b6, glued to the branch so + // the copy stays adjacent. Operands: (chain, target address). + SDLoc dl(N); + SDValue Chain = N->getOperand(0); + SDValue Target = N->getOperand(1); + SDValue Copy = + CurDAG->getCopyToReg(Chain, dl, IA64::B6, Target, SDValue()); + CurDAG->SelectNodeTo(N, IA64::BRINDIRECT, MVT::Other, + CurDAG->getRegister(IA64::B6, MVT::i64), + Copy.getValue(0), Copy.getValue(1)); + return; + } + + case IA64ISD::BRCALL: { + // The call hack: LowerCall builds IA64ISD::BRCALL (chain, callee, + // arg-reg uses..., [glue]) and leaves the callee as a + // Target{GlobalAddress,ExternalSymbol}. A direct call selects to + // 'br.call rp = '; the argument-register operands carry through as + // the call's (precise) implicit uses. An indirect / function-descriptor + // call arrives with the callee already in b6 (a Register operand, set up by + // LowerCall) and selects to BRCALL_INDIRECT. + SDValue Chain = N->getOperand(0); + SDValue Callee = N->getOperand(1); + + // A trailing glue operand, if present, is last; everything between the + // callee and it is an argument-register use. 
+ unsigned NumOps = N->getNumOperands(); + SDValue InGlue; + if (NumOps && N->getOperand(NumOps - 1).getValueType() == MVT::Glue) + InGlue = N->getOperand(--NumOps); + + unsigned Opc; + if (Callee.getOpcode() == ISD::TargetGlobalAddress) + Opc = IA64::BRCALL_IPREL_GA; + else if (Callee.getOpcode() == ISD::TargetExternalSymbol) + Opc = IA64::BRCALL_IPREL_ES; + else if (Callee.getOpcode() == ISD::Register) + // Indirect call: LowerCall already loaded the entry point into b6 (the + // Register operand here) and the callee's gp into r1. 'br.call rp = b6'. + Opc = IA64::BRCALL_INDIRECT; + else + report_fatal_error("IA64: unhandled call target"); + + // Machine-node operands: (calltarget, arg-reg uses..., chain, [glue]); + // results: (chain, glue). + SmallVector Ops; + Ops.push_back(Callee); + for (unsigned i = 2; i < NumOps; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Chain); + if (InGlue.getNode()) + Ops.push_back(InGlue); + CurDAG->SelectNodeTo(N, Opc, MVT::Other, MVT::Glue, Ops); + return; + } + + case ISD::ATOMIC_CMP_SWAP: { + // cmpxchg: move the comparand into ar.ccv, then the size-keyed cmpxchg + // (which reads ar.ccv) returns the old word and stores $new on a match. + // Operands of the node are (chain, ptr, cmp, new). + AtomicSDNode *AN = cast(N); + SDLoc dl(N); + SDValue Chain = AN->getChain(); + SDValue Ptr = AN->getBasePtr(); + SDValue Cmp = N->getOperand(2); + SDValue New = N->getOperand(3); + + unsigned Opc; + switch (AN->getMemoryVT().getSimpleVT().SimpleTy) { + case MVT::i8: Opc = IA64::CMPXCHG1; break; + case MVT::i16: Opc = IA64::CMPXCHG2; break; + case MVT::i32: Opc = IA64::CMPXCHG4; break; + case MVT::i64: Opc = IA64::CMPXCHG8; break; + default: + report_fatal_error("IA64: cannot select a cmpxchg of this type"); + } + + // The cmpxchg itself is .acq (acquire). For release/seq_cst, prepend a full + // fence so prior memory effects are ordered before the swap; the combination + // is a correct (conservative) full barrier. 
+ if (isReleaseOrStronger(AN->getMergedOrdering())) + Chain = + SDValue(CurDAG->getMachineNode(IA64::MF, dl, MVT::Other, Chain), 0); + + // mov ar.ccv = cmp, glued to the cmpxchg so it stays immediately before it + // (and the ar.ccv physreg def/use is not separated by another writer). + SDValue Ccv = + SDValue(CurDAG->getMachineNode(IA64::MOV_TO_AR_CCV, dl, MVT::Glue, Cmp), + 0); + + SDValue Ops[] = {Ptr, New, Chain, Ccv}; + MachineSDNode *Cas = + CurDAG->getMachineNode(Opc, dl, N->getValueType(0), MVT::Other, Ops); + CurDAG->setNodeMemRefs(Cas, {AN->getMemOperand()}); + ReplaceUses(SDValue(N, 0), SDValue(Cas, 0)); // old value + ReplaceUses(SDValue(N, 1), SDValue(Cas, 1)); // chain + CurDAG->RemoveDeadNode(N); + return; + } + + case ISD::LOAD: { + // Select by the memory type. IA-64 narrow integer loads zero-extend into + // the 64-bit GR, which matches zext/any-extend loads; a sign-extending load + // (SEXTLOAD) follows the LDx with the matching sxt (handled below). An i1 + // (bool) load is the compare-against-zero trick (LD1 + cmp.ne, handled + // above). The address is a single register; a FrameIndex base is + // materialized by the FrameIndex case above and resolved by + // eliminateFrameIndex. + LoadSDNode *LD = cast(N); + SDValue Chain = LD->getChain(); + SDValue Address = LD->getBasePtr(); + SDLoc dl(N); + + // Loading a predicate: a predicate can't be loaded from memory directly, so + // load the bool byte and test it != 0 (ld1 ;; cmp.ne dst = byte, r0). 
+ if (LD->getMemoryVT() == MVT::i1) { + SDNode *Byte = CurDAG->getMachineNode(IA64::LD1, dl, MVT::i64, MVT::Other, + Address, Chain); + SDValue Pred = SDValue( + CurDAG->getMachineNode(IA64::CMPNE, dl, MVT::i1, SDValue(Byte, 0), + CurDAG->getRegister(IA64::r0, MVT::i64)), + 0); + ReplaceUses(SDValue(N, 0), Pred); // the i1 value + ReplaceUses(SDValue(N, 1), SDValue(Byte, 1)); // the chain + CurDAG->RemoveDeadNode(N); + return; + } + + unsigned Opc; + switch (LD->getMemoryVT().getSimpleVT().SimpleTy) { + case MVT::i8: Opc = IA64::LD1; break; + case MVT::i16: Opc = IA64::LD2; break; + case MVT::i32: Opc = IA64::LD4; break; + case MVT::i64: Opc = IA64::LD8; break; + case MVT::f32: Opc = IA64::LDF4; break; + case MVT::f64: Opc = IA64::LDF8; break; + case MVT::f80: Opc = IA64::LDFE; break; + default: + report_fatal_error("IA64: cannot select a load of this type"); + } + // A sign-extending narrow load: the LDx above zero-extends into the 64-bit + // GR, so follow it with the matching sxt to sign-extend. Without this a + // signed value (e.g. a negative 'int' used in a signed compare -- a Lua + // stack index) is read as a large positive number and the compare goes wrong. 
+ if (LD->getExtensionType() == ISD::SEXTLOAD) { + unsigned SxtOpc; + switch (LD->getMemoryVT().getSimpleVT().SimpleTy) { + case MVT::i8: SxtOpc = IA64::SXT1; break; + case MVT::i16: SxtOpc = IA64::SXT2; break; + case MVT::i32: SxtOpc = IA64::SXT4; break; + default: + report_fatal_error("IA64: unexpected sign-extending load width"); + } + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, MVT::i64, MVT::Other, + Address, Chain); + SDNode *Sxt = + CurDAG->getMachineNode(SxtOpc, dl, MVT::i64, SDValue(Ld, 0)); + ReplaceUses(SDValue(N, 0), SDValue(Sxt, 0)); // sign-extended value + ReplaceUses(SDValue(N, 1), SDValue(Ld, 1)); // chain + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other, Address, Chain); + return; + } + + case ISD::STORE: { + // Operands: (chain, value, address). A non-truncating store picks ST8/STF8 + // by the value type; a truncating store picks ST1/2/4 (or STF4) by the + // memory type. The address register is handled as for loads above. + StoreSDNode *ST = cast(N); + SDValue Chain = ST->getChain(); + SDValue Value = ST->getValue(); + SDValue Address = ST->getBasePtr(); + SDLoc dl(N); + + // Storing a predicate: a predicate can't be stored to memory directly, so + // widen it to a 0/1 GR (the zext-PR sequence) and store one byte (st1). 
+ if (Value.getValueType() == MVT::i1) { + SDValue Zero = SDValue( + CurDAG->getMachineNode(IA64::ADDS, dl, MVT::i64, + CurDAG->getRegister(IA64::r0, MVT::i64), + CurDAG->getTargetConstant(0, dl, MVT::i64)), + 0); + SDValue Wide = SDValue( + CurDAG->getMachineNode(IA64::TPCADDS, dl, MVT::i64, Zero, + CurDAG->getTargetConstant(1, dl, MVT::i64), + Value), + 0); + CurDAG->SelectNodeTo(N, IA64::ST1, MVT::Other, Address, Wide, Chain); + return; + } + + unsigned Opc; + if (!ST->isTruncatingStore()) { + switch (Value.getValueType().getSimpleVT().SimpleTy) { + case MVT::i64: Opc = IA64::ST8; break; + case MVT::f64: Opc = IA64::STF8; break; + case MVT::f32: Opc = IA64::STF4; break; + case MVT::f80: Opc = IA64::STFE; break; + default: + report_fatal_error("IA64: cannot select a store of this type"); + } + } else { + switch (ST->getMemoryVT().getSimpleVT().SimpleTy) { + case MVT::i8: Opc = IA64::ST1; break; + case MVT::i16: Opc = IA64::ST2; break; + case MVT::i32: Opc = IA64::ST4; break; + // NB: FP truncating stores are set to Expand in IA64TargetLowering -- + // stfs/stf8 do not round, so they must become an explicit fpround + // (FNORMS/FNORMD) plus a same-size store before reaching the selector. + default: + report_fatal_error("IA64: cannot select a truncating store of this type"); + } + } + // ST* operands are (dstPtr, value): address first, then the stored value. + CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Value, Chain); + return; + } + } + + SelectCode(N); +} + +// Implement addressing-mode selection for inline-asm memory operands. IA-64 +// loads and stores dereference a single register with no displacement +// ('[rN]'), so for the 'm' (and equivalent 'o') constraint the address operand +// is passed straight through as one register; IA64AsmPrinter:: +// PrintAsmMemoryOperand then prints it as '[rN]'. 
+bool IA64DAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, + std::vector &OutOps) { + switch (ConstraintID) { + default: + return true; + case InlineAsm::ConstraintCode::o: + case InlineAsm::ConstraintCode::m: + OutOps.push_back(Op); + return false; + } +} + +/// createIA64ISelDag - This pass converts a legalized DAG into an IA64-specific +/// DAG, ready for instruction scheduling. +FunctionPass *llvm::createIA64ISelDag(TargetMachine &TM) { + return new IA64DAGToDAGISelLegacy(TM); +} diff --git llvm/lib/Target/IA64/IA64ISelLowering.cpp llvm/lib/Target/IA64/IA64ISelLowering.cpp new file mode 100644 index 000000000000..26f13554d7cf --- /dev/null +++ llvm/lib/Target/IA64/IA64ISelLowering.cpp @@ -0,0 +1,1181 @@ +//===-- IA64ISelLowering.cpp - IA64 DAG Lowering Implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IA64TargetLowering class. +// +// Scope note: LowerFormalArguments / LowerReturn (Stage 1) and LowerCall +// (Stage C) are implemented for the integer, direct-call ABI that fib needs: +// args in r32-r39 (incoming) / out0-out7 (outgoing), return in r8, gp/sp/rp +// saved around calls; indirect calls go through the function descriptor +// (entry point into b6, callee gp into r1). TLS remains deferred. 
+// +//===----------------------------------------------------------------------===// + +#include "IA64ISelLowering.h" +#include "IA64MachineFunctionInfo.h" +#include "IA64RegisterInfo.h" +#include "MCTargetDesc/IA64MCAsmInfo.h" +#include "MCTargetDesc/IA64MCTargetDesc.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +// A floating-point scalar that is not long double (f64; f32 was promoted to +// f64 earlier) occupies exactly one parameter slot. IA-64's parameter model is +// positional: every argument, integer or FP, consumes a slot in one shared +// sequence -- the first eight slots map to r32-r39 (incoming) / out0-out7 +// (outgoing), the rest to 8-byte stack slots. A *fixed* FP value travels in the +// next floating-point register F8-F15, but it must still RESERVE its general +// parameter slot so a following integer argument keeps its positional slot. +// +// CCAssignToRegWithShadow cannot express this: it shadows the GR at the *FP +// register's* index, so the first FP arg always shadows r32 no matter how many +// integers preceded it, never reserving the slot the FP arg actually occupies. +// A trailing integer then reused that slot's register -- e.g. the long long in +// _testfunc_q_bhilfdq(b,h,i,l,f,d,q) landed in the float's slot and read back +// the float's bit pattern instead of q. +// +// A *variadic* FP arg ('...' 
match) is passed in a general register in memory
+// format: a prototyped variadic callee reads its variable arguments out of the
+// integer parameter slots / register save area, never F8-F15 (psABI 8.5.4). It
+// is bit-cast to its i64 IEEE pattern (getf.d, the BCvt in LowerCall) and put
+// in the next slot register. SlotRegs is r32-r39 (incoming) / out0-out7 (call).
+static bool CC_IA64_FP_Common(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State,
+                              ArrayRef<MCPhysReg> SlotRegs) {
+  static const MCPhysReg FPRegs[] = {IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+                                     IA64::F12, IA64::F13, IA64::F14, IA64::F15};
+  if (ArgFlags.isVarArg()) {
+    if (unsigned Reg = State.AllocateReg(SlotRegs))
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i64, CCValAssign::BCvt));
+    else
+      State.addLoc(CCValAssign::getMem(ValNo, ValVT,
+                                       State.AllocateStack(8, Align(8)),
+                                       MVT::i64, CCValAssign::BCvt));
+    return true;
+  }
+  // Fixed FP arg: reserve the positional GR slot; within the first eight slots
+  // the value rides in the next FP register. Only FP args (here and in the f80
+  // hook) take FP registers, and each one placed while slots remain also takes
+  // a slot, so an FP register is always free whenever a slot is. Once the
+  // eight slots are gone the value goes on the stack.
+  if (State.AllocateReg(SlotRegs)) {
+    unsigned FReg = State.AllocateReg(FPRegs);
+    State.addLoc(
+        CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, CCValAssign::Full));
+    return true;
+  }
+  State.addLoc(CCValAssign::getMem(
+      ValNo, ValVT, State.AllocateStack(8, Align(8)), LocVT, CCValAssign::Full));
+  return true;
+}
+
+// Incoming f64/f32: the parameter slots are the incoming stacked GP registers.
+static bool CC_IA64_FP(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo /*LocInfo*/,
+                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg SlotRegs[] = {IA64::r32, IA64::r33, IA64::r34,
+                                       IA64::r35, IA64::r36, IA64::r37,
+                                       IA64::r38, IA64::r39};
+  return CC_IA64_FP_Common(ValNo, ValVT, LocVT, ArgFlags, State, SlotRegs);
+}
+
+// Outgoing f64/f32: the parameter slots are the output registers out0-out7.
+static bool CC_IA64_Call_FP(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo /*LocInfo*/,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg SlotRegs[] = {IA64::out0, IA64::out1, IA64::out2,
+                                       IA64::out3, IA64::out4, IA64::out5,
+                                       IA64::out6, IA64::out7};
+  return CC_IA64_FP_Common(ValNo, ValVT, LocVT, ArgFlags, State, SlotRegs);
+}
+
+// A named (prototyped) f80 'long double' argument is passed in one FP register
+// in register format, but -- being 16 bytes -- it occupies TWO 16-byte-aligned
+// (Next-Even) parameter slots, so it shadows two general registers (psABI
+// 8.5.1). A variadic long double is passed in the general registers in memory
+// format (two slots). ShadowRegs is r32-r39 (incoming) or out0-out7 (outgoing).
+static bool CC_IA64_F80_Common(unsigned ValNo, MVT ValVT, MVT LocVT,
+                               ISD::ArgFlagsTy ArgFlags, CCState &State,
+                               ArrayRef<MCPhysReg> ShadowRegs) {
+  static const MCPhysReg FPRegs[] = {IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+                                     IA64::F12, IA64::F13, IA64::F14, IA64::F15};
+  // A long double (double-extended) uses the "Next Even" slot policy (psABI
+  // 8.5.1, Table 8-3): it occupies two parameter slots and must START on an
+  // even-numbered slot. The slot index equals the shadow-GR index for the
+  // first eight slots, so if the next free shadow GR is odd, burn it as a
+  // padding slot (it is not reused for any later parameter). Beyond the eight
+  // register slots the same alignment is enforced on the stack via Align(16).
+  unsigned NextSlot = State.getFirstUnallocated(ShadowRegs);
+  if (NextSlot < ShadowRegs.size() && (NextSlot & 1))
+    State.AllocateReg(ShadowRegs);
+
+  if (ArgFlags.isVarArg()) {
+    // A variadic long double is passed in the *general* registers in memory
+    // format (psABI 8.5), occupying two parameter slots; spill into the stack
+    // image if the registers are exhausted. Emit two i64 part-locations (this
+    // value gets two CCValAssigns); LowerCall splits the f80 into the two
+    // memory-format halves via an stfe/ld8 temporary. The first stack part is
+    // 16-byte aligned to keep the Next-Even policy on the stack.
+    for (int Part = 0; Part < 2; ++Part) {
+      if (unsigned Reg = State.AllocateReg(ShadowRegs))
+        State.addLoc(CCValAssign::getReg(ValNo, MVT::i64, Reg, MVT::i64,
+                                         CCValAssign::Full));
+      else
+        State.addLoc(CCValAssign::getMem(
+            ValNo, MVT::i64,
+            State.AllocateStack(8, Align(Part == 0 ? 16 : 8)), MVT::i64,
+            CCValAssign::Full));
+    }
+    return true;
+  }
+  // FP register only while a register slot remains (slots are positional).
+  bool HasSlot = State.getFirstUnallocated(ShadowRegs) < ShadowRegs.size();
+  if (unsigned FReg = HasSlot ? unsigned(State.AllocateReg(FPRegs)) : 0) {
+    State.AllocateReg(ShadowRegs); // the two (now even-aligned) shadow slots
+    State.AllocateReg(ShadowRegs); // this 16-byte value occupies
+    State.addLoc(
+        CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, CCValAssign::Full));
+    return true;
+  }
+  // Slots exhausted, or (reachable only via HFAs) no FP register left: pass
+  // the 16-byte value on the stack.
+  unsigned Off = State.AllocateStack(16, Align(16));
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, CCValAssign::Full));
+  return true;
+}
+
+// Incoming f80: shadow the incoming stacked GP registers r32-r39.
+static bool CC_IA64_F80(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo /*LocInfo*/, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + static const MCPhysReg ShadowRegs[] = {IA64::r32, IA64::r33, IA64::r34, + IA64::r35, IA64::r36, IA64::r37, + IA64::r38, IA64::r39}; + return CC_IA64_F80_Common(ValNo, ValVT, LocVT, ArgFlags, State, ShadowRegs); +} + +// Outgoing f80: shadow the output registers out0-out7. +static bool CC_IA64_Call_F80(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo /*LocInfo*/, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + static const MCPhysReg ShadowRegs[] = {IA64::out0, IA64::out1, IA64::out2, + IA64::out3, IA64::out4, IA64::out5, + IA64::out6, IA64::out7}; + return CC_IA64_F80_Common(ValNo, ValVT, LocVT, ArgFlags, State, ShadowRegs); +} + +#include "IA64GenCallingConv.inc" + +IA64TargetLowering::IA64TargetLowering(const TargetMachine &TM, + const TargetSubtargetInfo &STI) + : TargetLowering(TM, STI) { + // Register classes: general (i64), floating-point (f32/f64/f80 = long double) + // and predicate (i1). f80 is the 80-bit double-extended C 'long double', + // held natively in the 82-bit FP registers (memory format via ldfe/stfe). + addRegisterClass(MVT::i64, &IA64::GRRegClass); + addRegisterClass(MVT::f32, &IA64::FPRegClass); + addRegisterClass(MVT::f64, &IA64::FPRegClass); + addRegisterClass(MVT::f80, &IA64::FPRegClass); + addRegisterClass(MVT::i1, &IA64::PRRegClass); + + // IA-64 uses SELECT, not SELECT_CC, and has no native BR_CC / jump tables. + setOperationAction(ISD::BRIND, MVT::Other, Legal); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + + // BR_CC / SELECT_CC must be keyed by the *compare operand* value type, not + // MVT::Other. The DAGCombiner folds brcond(setcc) -> br_cc whenever BR_CC is + // legal-or-custom for that operand type (DAGCombiner::visitBRCOND), and the + // legalizer likewise queries getOperationAction by the operand type. 
The + // pre-removal backend used MVT::Other, which was right for the LLVM 2.6 + // legalizer but is now a dead no-op -- it left BR_CC/i64 at its Legal default, + // so brcond(setcc) got folded into an unselectable br_cc. Marking i64 Expand + // keeps brcond(setcc) intact, which is exactly what our setcc (CMP*) patterns + // and the hand-selected BRCOND consume. (Sparc keys these by operand type + // too; it only differs in Custom-lowering them, having native cc-branches.) + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f80, Expand); + + // FP compares: keep brcond(setcc f64) from folding into an unselectable + // br_cc, so the legalizer hands us setcc + brcond. setcc f64 selects to the + // fcmp relations (FCMP* in IA64InstrInfo.td), which cover every clang FP + // condition except SETONE/SETUEQ; expand those into a pair joined by the i1 + // and/or patterns. + setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + // f80 ('long double') compares select to the same fcmp relations. + setOperationAction(ISD::BR_CC, MVT::f80, Expand); + setCondCodeAction(ISD::SETONE, MVT::f80, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f80, Expand); + // ...and so do f32 compares (fcmp looks at the full register-format value, + // so single precision needs no separate compare path). + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + + // Comparing two predicates (i1): keep br_cc/select_cc as setcc + brcond/select, + // and custom-lower the i1 setcc to predicate logic (eq/ne -> xnor/xor). 
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Custom); + // ...but mark the i1 eq/ne conditions Expand so the combiner's rebuildSetCC + // does not turn our lowered xor back into an i1 setcc (an infinite loop, since + // that setcc is Custom-lowered to the same xor again). + setCondCodeAction(ISD::SETEQ, MVT::i1, Expand); + setCondCodeAction(ISD::SETNE, MVT::i1, Expand); + + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FDIV, MVT::f32, Expand); + setOperationAction(ISD::FDIV, MVT::f64, Expand); + // f80 ('long double') has no inline divide/remainder; use the libcall + // (__divxf3 / fmodl). fadd/fsub/fmpy/fma are native (FADD etc.). + setOperationAction(ISD::FREM, MVT::f80, Expand); + setOperationAction(ISD::FDIV, MVT::f80, Expand); + + // FP truncating stores must round first. stfs/stf8 emit fp_fr_to_mem_format, + // which *assumes the FR was already rounded* to the destination precision -- + // they do not round themselves. So storing an unrounded wider value as a + // narrower one would just slice its bits and corrupt the result. Expanding + // these turns a truncstore into an explicit fpround (FNORMS/FNORMD) followed + // by a same-size store of the now-rounded value, and also stops DAGCombiner + // from re-merging store(fpround x) back into a single truncating store. + // (The load direction needs no dual: ldfs/ldf8 always widen correctly.) + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f80, MVT::f32, Expand); + setTruncStoreAction(MVT::f80, MVT::f64, Expand); + + // IA-64 has no native half (f16). 
Convert to/from f16 via the soft-float + // libcalls (__truncsfhf2/__extendhfsf2 etc.) and never load/store f16 as an + // extended/truncated FP value -- it is handled as i16 bits. Mirrors SPARC. + // (f128 needs no such setup: with no f128 register class it is soft-floated + // to the default libgcc __*tf3 libcalls.) + for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { + setOperationAction(ISD::FP_TO_FP16, VT, Expand); + setOperationAction(ISD::FP16_TO_FP, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setTruncStoreAction(VT, MVT::f16, Expand); + } + + // We don't support sin/cos/sqrt/pow (expand to libcalls: sinl/cosl/sqrtl/...). + for (MVT VT : {MVT::f32, MVT::f64, MVT::f80}) { + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + // FIXME: IA64 supports fcopysign natively. + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + } + + // IA-64 has a native population count (popcnt); select ctpop directly. + setOperationAction(ISD::CTPOP, MVT::i64, Legal); + // ctlz/cttz have no direct instruction; let the legalizer expand them (now + // cheaply, in terms of the legal ctpop above). + setOperationAction(ISD::CTLZ, MVT::i64, Expand); + setOperationAction(ISD::CTTZ, MVT::i64, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + // FIXME: IA64 has this (mux @rev), but it is not implemented. + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + // Use toolchain built-in for integer division + for (unsigned Op : {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::UDIVREM, + ISD::SDIVREM}) + setOperationAction(Op, MVT::i64, Expand); + + // No single instruction yields both halves of a 64x64 product; expand into a + // separate low MUL and a high MULHU/MULHS (both of which we select). 
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + + // 128-bit shifts (i128, e.g. `core`'s checked_shl) legalize to a *_PARTS node + // over an i64 register pair. We have no instruction for that; mark them Expand + // so the integer legalizer emits the libgcc libcall (__ashlti3/__ashrti3/ + // __lshrti3) instead, matching how we already handle 128-bit divide/modulo. + setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); + + // va_start points the va_list at the register save area (custom); va_arg, + // va_copy and va_end use the generic load/increment/store expansion. The + // va_list is a plain pointer, so the default va_copy/va_end suffice. + // Thread-local addresses are lowered per TLS model (see LowerGlobalTLSAddress); + // there is no generic expansion, so it must be Custom. + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + + // Atomics. An aligned <=8-byte ld/st is atomic on the hardware, but LLVM + // represents an atomic access as a distinct node (ISD::ATOMIC_LOAD/STORE) + // that the selector won't turn into ld8/st8 on its own. Custom-lower them to + // a plain load/store carrying the same (atomic) memory operand; see + // LowerOperation. Ordering is handled separately: shouldInsertFencesForAtomic + // asks AtomicExpand to wrap stronger orderings with fences and demote the + // access to monotonic, so the only atomic load/store we ever lower here is + // monotonic. 
The fences become ISD::ATOMIC_FENCE, selected to 'mf'. + // + // Only the legal integer type i64 is marked Custom: a narrow (i8/i16/i32) + // atomic load/store has an illegal type and is first widened by the *type* + // legalizer (PromoteIntRes_Atomic0 / PromoteIntOp_ATOMIC_STORE) to an i64 + // access carrying the narrow memory VT, which then reaches LowerOperation as + // an i64 Custom op. Marking the narrow types Custom instead would divert type + // legalization into ReplaceNodeResults (which we do not implement) and abort. + setMaxAtomicSizeInBitsSupported(64); + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal); + + setStackPointerRegisterToSaveRestore(IA64::r12); + + // The pre-removal backend reported a Log2 function alignment of 5, i.e. a + // 32-byte alignment ('.align 32' in the reference output). + setMinFunctionAlignment(Align(32)); + + computeRegisterProperties(STI.getRegisterInfo()); + + // Note: the pre-removal backend called addLegalFPImmediate(0/±1) here; that + // API was removed (FP-immediate legality is now an isFPImmLegal override). + // plus.ll uses no FP immediates, so this is left for a later stage. +} + +const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: + return nullptr; + case IA64ISD::GETFD: + return "IA64ISD::GETFD"; + case IA64ISD::BRCALL: + return "IA64ISD::BRCALL"; + case IA64ISD::RET_FLAG: + return "IA64ISD::RET_FLAG"; + case IA64ISD::TLS_TPREL: + return "IA64ISD::TLS_TPREL"; + case IA64ISD::TLS_GOTLOAD: + return "IA64ISD::TLS_GOTLOAD"; + } +} + +EVT IA64TargetLowering::getSetCCResultType(const DataLayout & /*DL*/, + LLVMContext & /*Context*/, + EVT /*VT*/) const { + // SETCC produces a predicate register value. 
+ return MVT::i1; +} + +bool IA64TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction & /*MF*/, + EVT VT) const { + // fma/fms/fnma fuse a*b+c into one single-rounding F-unit op. f32 (fma.s), + // f64 (fma.d) and f80 (fma) each have a hardware FMA pattern, so contracting + // fmul+fadd is profitable for them. This stays an explicit whitelist (not + // `true`): f16 and f128 are soft-floated, and contracting those would form an + // fma node of a width with no hardware pattern (unselectable / soft-float). + return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80; +} + +bool IA64TargetLowering::isFPImmLegal(const APFloat & /*Imm*/, EVT VT, + bool /*ForCodeSize*/) const { + // Keep f32/f64 constants out of the constant pool: we materialise them from + // their integer bit pattern (movl + setf.d) -- see the fpimm patterns in + // IA64InstrInfo.td. f80 ('long double') is 80 bits and cannot be built from a + // single 64-bit movl, so its literals go to the constant pool (loaded by ldfe; + // see the ISD::ConstantPool selection in IA64ISelDAGToDAG). + return VT == MVT::f32 || VT == MVT::f64; +} + +SDValue IA64TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeFormalArguments(Ins, CC_IA64); + + for (CCValAssign &VA : ArgLocs) { + if (VA.isRegLoc()) { + // The argument arrives in a register. 
+ MVT RegVT = VA.getLocVT(); + const TargetRegisterClass *RC; + if (RegVT == MVT::i64) + RC = &IA64::GRRegClass; + else if (RegVT == MVT::f32 || RegVT == MVT::f64 || RegVT == MVT::f80) + RC = &IA64::FPRegClass; + else + report_fatal_error("IA64: unhandled formal-argument register type"); + + Register VReg = RegInfo.createVirtualRegister(RC); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); + + // If the argument was widened to fill the register, narrow it back to + // its declared type. + if (RegVT != VA.getValVT()) { + if (RegVT.isInteger()) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + else + ArgValue = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), ArgValue, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + } + + InVals.push_back(ArgValue); + } else { + // The argument arrives on the stack. Per the psABI (§8.5.3) parameter + // slot 8 is at sp+16, slot 9 at sp+24, and so on (the 16-byte scratch + // area sits below at [sp, sp+16)). This holds whether or not the function + // is variadic -- the variadic register-home spill area is carved out of + // *this* frame and the scratch area, not reserved by the caller (see the + // spill loop below). + assert(VA.isMemLoc() && "unexpected argument location"); + int FI = MF.getFrameInfo().CreateFixedObject( + 8, 16 + VA.getLocMemOffset(), /*IsImmutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + InVals.push_back( + DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); + } + } + + // Variadic functions: spill the unnamed incoming GP registers to their + // parameter-slot memory homes so va_start/va_arg can walk the variadic + // arguments as a single contiguous in-memory image. Per the psABI (§8.5.4) + // the callee spills in6/in7 into the 16-byte scratch area at [sp, sp+16) and + // in0-in5 into up to 48 bytes at the base of its own frame, just below sp. 
+ // This places parameter slot i at offset 8*i - 48 from the incoming sp: + // slot6 -> sp+0, slot7 -> sp+8, slot8 (first stack arg) -> sp+16, slot9 -> + // sp+24, ... -- one contiguous ascending block running from the frame base up + // into the caller's memory arguments. A va_list is just an ascending pointer, + // so it walks out of the register homes straight into the stack arguments. + // (CreateFixedObject offsets are relative to the incoming sp; negative + // offsets land in this frame, which PrologEpilogInserter sizes to cover. + // Storing the registers also marks them used, so frame lowering's 'alloc' + // keeps all eight incoming GP registers live as locals.) + if (isVarArg) { + static const MCPhysReg ArgGPRs[] = {IA64::r32, IA64::r33, IA64::r34, + IA64::r35, IA64::r36, IA64::r37, + IA64::r38, IA64::r39}; + unsigned FirstVar = CCInfo.getFirstUnallocated(ArgGPRs); + MachineFrameInfo &MFI = MF.getFrameInfo(); + int VAFI = 0; + SmallVector Stores; + for (unsigned i = FirstVar; i < 8; ++i) { + int FI = MFI.CreateFixedObject(8, 8 * (int)i - 48, /*IsImmutable=*/false); + if (i == FirstVar) + VAFI = FI; // va_start points at the first unnamed slot's home + Register VReg = RegInfo.createVirtualRegister(&IA64::GRRegClass); + RegInfo.addLiveIn(ArgGPRs[i], VReg); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + SDValue Addr = DAG.getFrameIndex(FI, MVT::i64); + Stores.push_back(DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo::getFixedStack(MF, FI))); + } + // All eight GP slots named: no register varargs, so va_start points at the + // first unnamed stack slot. That is slot 8 (sp+16) only when there are no + // *named* stack arguments; if the prototype has named parameters beyond the + // eight register slots (e.g. Links' input_field: 8 register params + 4 named + // stack args + ...), the unnamed args begin after them, at + // sp + 16 + . CCInfo.getStackSize() is exactly + // those bytes (the formals were just analyzed above). 
+ if (FirstVar == 8) + VAFI = MFI.CreateFixedObject(8, 16 + CCInfo.getStackSize(), + /*IsImmutable=*/true); + MF.getInfo()->setVarArgsFrameIndex(VAFI); + if (!Stores.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + } + + // 'alloc' (which captures the caller's ar.pfs) and its restore are now emitted + // entirely by frame lowering into a reserved stacked local, so there is + // nothing to materialise here. See IA64FrameLowering::emitPrologue. + return Chain; +} + +SDValue IA64TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); + + // No tail calls yet. + CLI.IsTailCall = false; + + // Assign the outgoing arguments to out0-out7 / F8-F15 (caller convention). + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, CC_IA64_Call); + + // A 16-byte scratch area sits at the bottom of the outgoing frame; keep the + // whole thing 16-byte aligned. Stack-passed arguments begin at sp+16 (psABI + // §8.5.3), variadic or not: the variadic register-home spill area is built by + // the callee out of its own frame and the scratch area, not reserved here + // (see LowerFormalArguments). + unsigned NumBytes = (CCInfo.getStackSize() + 16 + 15) & ~15u; + + // Record how many output registers this call needs; the prologue 'alloc' + // sizes its output region from the max over all of the function's calls. 
+ // Count the actually-allocated out registers rather than the argument count: + // an FP argument shadows (consumes) its parameter slot(s) without occupying an + // out register for the value, while a long double (f80) shadows *two* out + // slots -- so a trailing integer arg can land in a higher out register than + // the plain argument count would suggest. + static const MCPhysReg OutRegs[] = {IA64::out0, IA64::out1, IA64::out2, + IA64::out3, IA64::out4, IA64::out5, + IA64::out6, IA64::out7}; + unsigned NumOutRegs = 0; + for (unsigned i = 0; i < 8; ++i) + if (CCInfo.isAllocated(OutRegs[i])) + NumOutRegs = i + 1; + IA64FunctionInfo *FInfo = MF.getInfo(); + FInfo->OutRegsUsed = std::max(FInfo->OutRegsUsed, NumOutRegs); + + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + + // An indirect callee is a function pointer: not a GlobalAddress/ExternalSymbol + // but an ordinary i64 value pointing at a function descriptor { entry, gp }. + // Read the descriptor here, while Chain is still a plain (unglued) chain and + // before the gp save below latches the caller's r1; the entry point and the + // callee's gp are installed into b6 / r1 just before the call further down. + bool IsIndirect = !isa(Callee) && + !isa(Callee); + SDValue EntryPoint, NewGp; + if (IsIndirect) { + EntryPoint = DAG.getLoad(MVT::i64, dl, Chain, Callee, MachinePointerInfo()); + Chain = EntryPoint.getValue(1); + SDValue GpAddr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, + DAG.getIntPtrConstant(8, dl)); + NewGp = DAG.getLoad(MVT::i64, dl, Chain, GpAddr, MachinePointerInfo()); + Chain = NewGp.getValue(1); + } + + // Collect the (out-register, value) pairs to copy in just before the call, + // and the stores for any arguments that overflow onto the outgoing stack. 
+ SmallVector, 8> RegsToPass; + SmallVector MemOpChains; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + // Index by ValNo, not i: a variadic long double maps one argument value to + // two consecutive parameter-slot locations (see below), after which i and + // the argument number diverge. + SDValue Arg = OutVals[VA.getValNo()]; + + // By-value aggregate argument. The psABI passes aggregates by value; the + // frontend models this as a `byval` pointer to the caller's object and + // expects the callee to receive a pointer to a *private copy*. We currently + // realize that copy here (the callee then dereferences the pointer as usual) + // rather than flattening the aggregate into parameter slots/GRs -- that full + // ABI is still TODO (see struct-value-abi.md). The copy is mandatory: without + // it the argument aliases caller memory, and a callee that mutates or frees + // that memory corrupts the caller. Concretely, glibc regex's re_dfa_add_node + // takes an re_token_t by value and `realloc`s the very dfa->nodes array a + // by-value `dfa->nodes[org_idx]` argument points into -- so the un-copied + // pointer dangled into the freed block and read back garbage. + ISD::ArgFlagsTy Flags = Outs[VA.getValNo()].Flags; + if (Flags.isByVal()) { + unsigned Size = Flags.getByValSize(); + if (Size != 0) { + Align ByValAlign = Flags.getNonZeroByValAlign(); + int FI = MF.getFrameInfo().CreateStackObject(Size, ByValAlign, false); + SDValue Copy = DAG.getFrameIndex(FI, MVT::i64); + SDValue MemcpyChain = DAG.getMemcpy( + Chain, dl, Copy, Arg, DAG.getIntPtrConstant(Size, dl), ByValAlign, + /*isVol=*/false, /*AlwaysInline=*/false, /*CI=*/nullptr, + /*OverrideTailCall=*/std::nullopt, + MachinePointerInfo::getFixedStack(MF, FI), MachinePointerInfo()); + // Order the copy before the call (alongside the other arg stores). 
+ MemOpChains.push_back(MemcpyChain); + Arg = Copy; // pass the private copy's address per VA below + } + } + + // Variadic long double (f80): the CC gave it two consecutive i64 slots -- + // this location and the next, both tagged with the same ValNo. It is passed + // in memory format (psABI 8.5). + if (i + 1 < e && ArgLocs[i + 1].getValNo() == VA.getValNo()) { + CCValAssign &VAHi = ArgLocs[i + 1]; + + // Both halves land on the outgoing stack: store the long double straight + // to its parameter slot with stfe (memory format) -- no register + // round-trip. (The two slots are adjacent, so one 10-byte stfe covers the + // significant bytes; the callee's va_arg reads it back with ldfe.) The + // spill-and-reload path below would only DAGCombine down to this if the + // combiner forwarded an f80 store into i64 loads, which it does not. + if (VA.isMemLoc() && VAHi.isMemLoc()) { + unsigned Off = 16 + VA.getLocMemOffset(); // psABI: slot 8 at sp+16 + SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, + DAG.getRegister(IA64::r12, MVT::i64), + DAG.getIntPtrConstant(Off, dl)); + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, Addr, + MachinePointerInfo::getStack(MF, Off))); + ++i; // consumed both part-locations + continue; + } + + // At least one half goes in a general register: spill to a 16-byte + // temporary with stfe and reload the two 8-byte memory-format halves + // (ld8) into the assigned slots -- the in-memory image the callee's + // va_arg reconstructs with ldfe. (There is no register instruction to + // extract the 80-bit *memory* format into GRs, so the spill is required.) 
+ int FI = MF.getFrameInfo().CreateStackObject(16, Align(16), false); + SDValue Tmp = DAG.getFrameIndex(FI, MVT::i64); + SDValue St = DAG.getStore(Chain, dl, Arg, Tmp, + MachinePointerInfo::getFixedStack(MF, FI)); + SDValue HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i64, Tmp, + DAG.getIntPtrConstant(8, dl)); + SDValue Half[2] = { + DAG.getLoad(MVT::i64, dl, St, Tmp, + MachinePointerInfo::getFixedStack(MF, FI)), + DAG.getLoad(MVT::i64, dl, St, HiAddr, + MachinePointerInfo::getFixedStack(MF, FI, 8))}; + // Order the spill/reload before the call. + MemOpChains.push_back(Half[0].getValue(1)); + MemOpChains.push_back(Half[1].getValue(1)); + for (unsigned Part = 0; Part < 2; ++Part) { + CCValAssign &PVA = ArgLocs[i + Part]; + if (PVA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(PVA.getLocReg(), Half[Part])); + } else { + unsigned Off = 16 + PVA.getLocMemOffset(); // psABI: slot 8 at sp+16 + SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, + DAG.getRegister(IA64::r12, MVT::i64), + DAG.getIntPtrConstant(Off, dl)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, Half[Part], Addr, MachinePointerInfo::getStack(MF, Off))); + } + } + ++i; // consumed both part-locations + continue; + } + + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + // A variadic FP arg routed into a GR slot: reinterpret the f64 as its + // i64 IEEE bit pattern (selects to getf.d). See CC_IA64_FP_Common. 
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, dl, VA.getLocVT(), Arg); + break; + default: + report_fatal_error("IA64: unhandled argument CCValAssign"); + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + // Arguments beyond out0-out7 are passed on the outgoing stack, just above + // the 16-byte scratch area: parameter slot 8 at sp+16, slot 9 at sp+24, + // ... (psABI §8.5.3) -- the same layout LowerFormalArguments reads + // incoming stack arguments from. The store is sp-relative: with a reserved + // call frame (no variable-sized objects) sp is constant here; otherwise + // the call-frame pseudos adjust it around the call. + assert(VA.isMemLoc() && "argument neither in register nor on the stack"); + unsigned Off = 16 + VA.getLocMemOffset(); + SDValue StackPtr = DAG.getRegister(IA64::r12, MVT::i64); + SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, + DAG.getIntPtrConstant(Off, dl)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, Arg, Addr, MachinePointerInfo::getStack(MF, Off))); + } + } + + // Sequence all the outgoing-argument stores before the call. + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // Save gp/sp around the call. br.call may transfer into another load module + // (so the callee's gp must be reinstalled afterwards) and sp is restored + // conservatively. These reads must precede the call and the restores must + // follow it, so the whole save -> args -> call -> restore sequence is tied + // together with glue. 
Use the glue-carrying getCopyFromReg overload even for + // the first save (with a null input glue): it still gives the node a glue + // *result* to start the chain. The plain 4-operand form has no glue result, + // so reading getValue(2) off it would be out of range. + // + // We deliberately do NOT save/restore rp (b0) per call here. br.call does + // overwrite b0, but frame lowering already parks the incoming rp once in a + // stacked local for the whole function (IA64FrameLowering::emitPrologue) and + // the epilogue restores b0 from it, so our br.ret returns correctly no matter + // how many calls clobber rp in between -- the per-call save was redundant. + // Worse, it was actively wrong: rp is a member of the GR class (so that + // 'mov rN = rp' works), the save value was live across the call and coalesced + // into the physical rp, and the spiller then spilled it with a plain + // 'st8 [slot] = rp' / 'ld8 rp = [slot]'. That is illegal -- st8/ld8 require a + // general register, not the branch register b0 -- and gas rejects it + // ("Operand N of st8/ld8 should be a general register"). The only place rp is + // still read around a call is the returns_twice path below, where it is parked + // into the CSR r7 *before* the call and so is never live across it as b0. + SDValue InGlue; + SDValue GPSave = DAG.getCopyFromReg(Chain, dl, IA64::r1, MVT::i64, InGlue); + Chain = GPSave.getValue(1); + InGlue = GPSave.getValue(2); + SDValue SPSave = DAG.getCopyFromReg(Chain, dl, IA64::r12, MVT::i64, InGlue); + Chain = SPSave.getValue(1); + InGlue = SPSave.getValue(2); + + // In a function that calls setjmp (and so may be re-entered by longjmp), the + // save vregs above cannot be allowed to land in stacked locals: longjmp brings + // the stacked frame back only to its last-written values, and the register + // allocator reuses the save register right after the (singly-modeled) restore + // -- which sits before the setjmp-result branch, i.e. 
exactly the longjmp + // re-entry point -- so the restored value is garbage (observed: gp = 0, then a + // stale slot address). Park gp/sp/rp instead in the static callee-saved + // registers r4/r6/r7, which glibc's setjmp/longjmp save and restore through the + // jmpbuf: on a longjmp re-entry they come back holding the setjmp-time + // gp/sp/rp, and any reuse after the restore is harmless because longjmp + // overwrites it. Because they are true CSRs (getCalleeSavedRegs), a nested + // setjmp call saves and restores them, so it cannot clobber an outer frame's + // parked values. The restore below reads them back out of r4/r6/r7. Reading rp + // here is safe (it is parked into r7, a GR, before the call -- never spilled as + // b0 across the call). + bool ReturnsTwice = MF.exposesReturnsTwice(); + SDValue RPSave; + if (ReturnsTwice) { + RPSave = DAG.getCopyFromReg(Chain, dl, IA64::rp, MVT::i64, InGlue); + Chain = RPSave.getValue(1); + InGlue = RPSave.getValue(2); + Chain = DAG.getCopyToReg(Chain, dl, IA64::r4, GPSave, InGlue); + InGlue = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, IA64::r6, SPSave, InGlue); + InGlue = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, IA64::r7, RPSave, InGlue); + InGlue = Chain.getValue(1); + } + + // Copy the outgoing arguments into their out registers, glued before the call. + for (auto &R : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, InGlue); + InGlue = Chain.getValue(1); + } + + // Set up the br.call target. For an indirect call, install the callee's gp + // (r1) and the entry point (b6) read from the descriptor above, glued in just + // after the argument copies; BRCALL then branches to b6. For a direct call, + // make the callee a target node so the generic selector leaves it alone and + // the IA64ISD::BRCALL selection consumes it as the br.call target. 
+ if (IsIndirect) { + Chain = DAG.getCopyToReg(Chain, dl, IA64::r1, NewGp, InGlue); + InGlue = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, IA64::B6, EntryPoint, InGlue); + InGlue = Chain.getValue(1); + Callee = DAG.getRegister(IA64::B6, MVT::i64); + } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i64); + else if (ExternalSymbolSDNode *E = dyn_cast(Callee)) + Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i64); + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector Ops = {Chain, Callee}; + for (auto &R : RegsToPass) + Ops.push_back(DAG.getRegister(R.first, R.second.getValueType())); + if (InGlue.getNode()) + Ops.push_back(InGlue); + Chain = DAG.getNode(IA64ISD::BRCALL, dl, NodeTys, Ops); + InGlue = Chain.getValue(1); + + // Restore gp/sp after the call. For a returns_twice function read gp/sp/rp back + // out of r4/r6/r7 (longjmp-safe, see the save above) and reinstate rp from r7 + // (a plain GR->GR copy, never spilled as b0); otherwise restore gp/sp from the + // save vregs directly. The common path needs no rp restore -- frame lowering + // owns the function's return pointer (see the save block above). + if (ReturnsTwice) { + GPSave = DAG.getCopyFromReg(Chain, dl, IA64::r4, MVT::i64, InGlue); + Chain = GPSave.getValue(1); + InGlue = GPSave.getValue(2); + SPSave = DAG.getCopyFromReg(Chain, dl, IA64::r6, MVT::i64, InGlue); + Chain = SPSave.getValue(1); + InGlue = SPSave.getValue(2); + RPSave = DAG.getCopyFromReg(Chain, dl, IA64::r7, MVT::i64, InGlue); + Chain = RPSave.getValue(1); + InGlue = RPSave.getValue(2); + } + Chain = DAG.getCopyToReg(Chain, dl, IA64::r1, GPSave, InGlue); + InGlue = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, IA64::r12, SPSave, InGlue); + InGlue = Chain.getValue(1); + // rp last (only for returns_twice), preserving the gp -> sp -> rp restore order. 
+ if (ReturnsTwice) { + Chain = DAG.getCopyToReg(Chain, dl, IA64::rp, RPSave, InGlue); + InGlue = Chain.getValue(1); + } + + Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl); + InGlue = Chain.getValue(1); + + // Read the return value(s) out of r8 / F8. + SmallVector RVLocs; + CCState RVInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); + RVInfo.AnalyzeCallResult(Ins, RetCC_IA64); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + SDValue Val = + DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InGlue); + Chain = Val.getValue(1); + InGlue = Val.getValue(2); + + if (VA.getLocVT() != VA.getValVT()) { + if (VA.getLocVT().isInteger()) + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + else + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + } + InVals.push_back(Val); + } + + return Chain; +} + +SDValue IA64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + report_fatal_error("IA64: unimplemented custom operation lowering"); + case ISD::FRAMEADDR: { + // __builtin_frame_address(0): the address of the current frame, which we + // take to be the frame register (the frame pointer r5 if one is forced, + // else the stack pointer r12). + if (Op.getConstantOperandVal(0) != 0) + report_fatal_error("IA64: __builtin_frame_address with nonzero depth is " + "not supported"); + MachineFunction &MF = DAG.getMachineFunction(); + MF.getFrameInfo().setFrameAddressIsTaken(true); + Register FrameReg = MF.getSubtarget().getRegisterInfo()->getFrameRegister(MF); + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FrameReg, + Op.getValueType()); + } + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: { + // i1 (predicate) comparison: a != b is xor, a == b is its complement + // (xor then invert via xor with 1). Booleans only ever use eq/ne. 
+ SDLoc dl(Op); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op.getOperand(0), + Op.getOperand(1)); + if (CC == ISD::SETNE) + return Xor; + if (CC == ISD::SETEQ) + return DAG.getNode(ISD::XOR, dl, MVT::i1, Xor, + DAG.getConstant(1, dl, MVT::i1)); + report_fatal_error("IA64: unhandled i1 SETCC condition (expected eq/ne)"); + } + case ISD::ATOMIC_LOAD: { + // Lower a monotonic atomic load (AtomicExpand has already split off any + // stronger ordering into fences) to a plain load with the same atomic + // memory operand. The existing ISD::LOAD selector picks ld1/ld2/ld4/ld8 by + // the memory type and applies the zero/sign extension carried here. + AtomicSDNode *AN = cast(Op); + SDLoc dl(Op); + return DAG.getExtLoad(AN->getExtensionType(), dl, Op.getValueType(), + AN->getChain(), AN->getBasePtr(), AN->getMemoryVT(), + AN->getMemOperand()); + } + case ISD::ATOMIC_STORE: { + // Mirror of ATOMIC_LOAD: a monotonic atomic store becomes a plain (possibly + // truncating) store. The value is promoted to i64, so a narrow access is a + // truncating store keyed on the memory type, which the store selector + // handles. + AtomicSDNode *AN = cast(Op); + SDLoc dl(Op); + SDValue Val = AN->getVal(); + EVT MemVT = AN->getMemoryVT(); + if (MemVT == Val.getValueType()) + return DAG.getStore(AN->getChain(), dl, Val, AN->getBasePtr(), + AN->getMemOperand()); + return DAG.getTruncStore(AN->getChain(), dl, Val, AN->getBasePtr(), MemVT, + AN->getMemOperand()); + } + case ISD::VASTART: { + // va_start stores the address of the register save area (the first variadic + // argument slot, filled in by LowerFormalArguments) into the va_list. 
+ MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl(Op); + SDValue FR = DAG.getFrameIndex( + MF.getInfo()->getVarArgsFrameIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV)); + } + } +} + +SDValue IA64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GA = cast(Op); + const GlobalValue *GV = GA->getGlobal(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + // -femulated-tls is handled generically; otherwise emit native ELF TLS. + if (DAG.getTarget().useEmulatedTLS()) + return LowerToTLSEmulatedModel(GA, DAG); + + // Read the thread pointer (tp / r13). It is reserved, so a CopyFromReg of the + // physreg observes its live value; the per-model offset below is added to it. + auto ThreadPointer = [&]() { + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, IA64::r13, PtrVT); + }; + + switch (getTargetMachine().getTLSModel(GV)) { + case TLSModel::LocalExec: { + // The offset is a static-link-time constant materialised directly (no GOT): + // movl rX = @tprel(sym) ;; add rX = rX, tp + SDValue Sym = + DAG.getTargetGlobalAddress(GV, dl, PtrVT, /*offset=*/0, IA64::S_TPREL); + SDValue Off = DAG.getNode(IA64ISD::TLS_TPREL, dl, PtrVT, Sym); + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer(), Off); + } + case TLSModel::InitialExec: { + // The offset is resolved by the dynamic linker into a GOT slot: + // addl rX = @ltoff(@tprel(sym)), gp ;; ld8 rX = [rX] ;; add rX = rX, tp + SDValue Sym = DAG.getTargetGlobalAddress(GV, dl, PtrVT, /*offset=*/0, + IA64::S_LTOFF_TPREL); + SDValue Off = DAG.getNode(IA64ISD::TLS_GOTLOAD, dl, PtrVT, Sym); + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer(), Off); + } + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: { + // Call __tls_get_addr(module, offset): the two arguments are loaded from the + // 
@ltoff(@dtpmod)/@ltoff(@dtprel) GOT slots, and the call returns the + // variable's address. (Local-dynamic is lowered identically to + // general-dynamic -- one call per access using the variable's own + // dtpmod/dtprel -- which is correct, just without the LDM module-base + // sharing optimization.) IA-64's __tls_get_addr takes the two scalars + // directly (out0/out1), not a pointer to a tls_index struct. + SDValue ModSym = DAG.getTargetGlobalAddress(GV, dl, PtrVT, /*offset=*/0, + IA64::S_LTOFF_DTPMOD); + SDValue OffSym = DAG.getTargetGlobalAddress(GV, dl, PtrVT, /*offset=*/0, + IA64::S_LTOFF_DTPREL); + SDValue Module = DAG.getNode(IA64ISD::TLS_GOTLOAD, dl, PtrVT, ModSym); + SDValue Offset = DAG.getNode(IA64ISD::TLS_GOTLOAD, dl, PtrVT, OffSym); + + Type *I64Ty = Type::getInt64Ty(*DAG.getContext()); + ArgListTy Args; + Args.push_back(ArgListEntry(Module, I64Ty)); + Args.push_back(ArgListEntry(Offset, I64Ty)); + + // __tls_get_addr is an external symbol, so LowerCall emits a direct br.call + // and (via AdjustInstrPostInstrSelection) models the gp clobber -> the gp + // save/restore around the call survives, as GCC emits. + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, PointerType::getUnqual(*DAG.getContext()), + DAG.getExternalSymbol("__tls_get_addr", PtrVT), + std::move(Args)); + return LowerCallTo(CLI).first; + } + } + llvm_unreachable("Unknown TLS model"); +} + +SDValue IA64TargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SDLoc &dl, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_IA64); + + SDValue Glue; + SmallVector RetOps(1, Chain); // RetOps[0] is patched below. 
+ + // Copy the return values into their assigned registers (r8 / F8). + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "return value must be in a register"); + SDValue Val = OutVals[i]; + + if (VA.getLocVT() != VA.getValVT()) { + if (VA.getLocVT().isInteger()) + Val = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Val); + else + Val = DAG.getNode(ISD::FP_EXTEND, dl, VA.getLocVT(), Val); + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Val, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + RetOps[0] = Chain; + if (Glue.getNode()) + RetOps.push_back(Glue); + + return DAG.getNode(IA64ISD::RET_FLAG, dl, MVT::Other, RetOps); +} + +void IA64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode * /*Node*/) const { + unsigned Opc = MI.getOpcode(); + if (Opc != IA64::BRCALL_IPREL_GA && Opc != IA64::BRCALL_IPREL_ES) + return; + + // gp (r1) is caller-saved at any call that is *not* provably local to this + // load module: such a call may be resolved through an import stub that loads + // the callee's own gp, and whether that happens is a static-vs-dynamic + // linking decision we cannot see at compile time -- so we must conservatively + // assume it does. Marking the call as defining r1 keeps the gp save/restore + // LowerCall emits from being coalesced away (the same mechanism as rp/b0). + // + // A dso_local callee (e.g. a recursive self-call) keeps gp, so we leave it + // alone and the redundant save/restore folds away -- no per-call gp churn. + // (LTO could later prove more callees local and drop the clobber.) + // + // The call's only explicit operand (0) is the target: a GlobalAddress (direct + // call to a known function) or an ExternalSymbol (always external). 
+ const MachineOperand &Target = MI.getOperand(0); + bool IsLocal = Target.isGlobal() && Target.getGlobal()->isDSOLocal(); + if (!IsLocal) + MI.addOperand( + MachineOperand::CreateReg(IA64::r1, /*isDef=*/true, /*isImp=*/true)); + + // A returns_twice callee on IA-64 cannot preserve the caller's stacked + // register frame (r32-r127). The two cases that matter -- setjmp/longjmp and + // vfork -- both leave the caller's stacked registers holding something other + // than their call-time values: vfork in particular runs the child in the + // parent's address space while the parent is blocked, so the child's use of + // the shared register backing store overwrites the parent's stacked locals + // (observed: an 'interp' argument parked in r32 reads back as 0 -- the value + // the vfork child stored there -- in Tcl's TclpCreateProcess, freeing a + // non-heap pointer). The static callee-saved registers r4-r7 are not in the + // backing store and survive (the kernel restores them from the parent's saved + // context); only the RSE-backed stacked registers are unsafe. + // + // The fixed BRCALL clobber list deliberately omits r32-r127 because an + // ordinary call *does* preserve the caller's frame via the RSE. For a + // returns_twice call we must additionally mark every stacked register clobbered + // so the allocator keeps nothing live across the call there -- such values are + // forced into r4-r7 or spilled to memory (which the child does not touch), + // exactly as GCC's 'calls_setjmp' handling requires. This complements the + // gp/sp/rp parking LowerCall already does for returns_twice functions. + // + // Express it as a regmask rather than 96 implicit-defs: implicit-def reg + // operands make MachineRegisterInfo::isPhysRegUsed report every stacked + // register as used, which IA64FrameLowering would then size the 'alloc' frame + // around (ballooning it to the 96-register maximum). 
A regmask is tested + // separately and is skipped by the frame-sizing scan (isPhysRegUsed's + // SkipRegMaskTest), so it constrains the allocator without inflating the frame. + const Function *Callee = + Target.isGlobal() ? dyn_cast(Target.getGlobal()) : nullptr; + if (Callee && Callee->hasFnAttribute(Attribute::ReturnsTwice)) { + MachineFunction &MF = *MI.getMF(); + unsigned NumRegs = MF.getSubtarget().getRegisterInfo()->getNumRegs(); + uint32_t *Mask = MF.allocateRegMask(); + // A set bit means "preserved"; allocateRegMask zero-inits (clobber all), so + // mark everything preserved and then clear just the stacked GPRs. The fixed + // Defs above keep clobbering the caller-saved set on top of this mask. + for (unsigned I = 0, E = MachineOperand::getRegMaskSize(NumRegs); I != E; ++I) + Mask[I] = ~0u; + // Register 0 is NoRegister, not a physical register: it must stay clobbered + // (bit clear), or regmask consumers that expand preserved bits to reg units + // (e.g. MachineCopyPropagation) assert iterating reg-units of reg 0. 
+ Mask[0] &= ~1u; + for (unsigned I = 0; I != IA64NumStackedGPRs; ++I) { + MCRegister R = getIA64StackedGPR(I); + Mask[R.id() / 32] &= ~(1u << (R.id() % 32)); + } + MI.addOperand(MachineOperand::CreateRegMask(Mask)); + } +} + +//===----------------------------------------------------------------------===// +// Inline Assembly Support +//===----------------------------------------------------------------------===// + +TargetLowering::ConstraintType +IA64TargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'r': // general register + case 'f': // floating-point register + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair +IA64TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + // Any integer value (including the i1 a Rust bool / black_box produces) + // lives in a general register. The GR class only carries i64, so the + // generic exact-type search fails for the narrower types; map them here. + if (VT.isInteger() || VT == MVT::Other) + return std::make_pair(0U, &IA64::GRRegClass); + break; + case 'f': + // f80 ('long double') is wider than the FP class's representative type + // (f64), which makes the generic inline-asm register-tiling code assert. + // Hand it the f80-only class so its register type is f80; f32/f64 are no + // wider than the representative type and use the multi-typed FP class. 
+ if (VT == MVT::f80) + return std::make_pair(0U, &IA64::FP80RegClass); + if (VT == MVT::f32 || VT == MVT::f64) + return std::make_pair(0U, &IA64::FPRegClass); + break; + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} diff --git llvm/lib/Target/IA64/IA64ISelLowering.h llvm/lib/Target/IA64/IA64ISelLowering.h new file mode 100644 index 000000000000..a3e604545284 --- /dev/null +++ llvm/lib/Target/IA64/IA64ISelLowering.h @@ -0,0 +1,150 @@ +//===-- IA64ISelLowering.h - IA64 DAG Lowering Interface --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that IA64 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_IA64_IA64ISELLOWERING_H +#define LLVM_LIB_TARGET_IA64_IA64ISELLOWERING_H + +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/TargetLowering.h" + +namespace llvm { + +class MachineInstr; +class TargetSubtargetInfo; + +namespace IA64ISD { +enum NodeType : unsigned { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// GETFD - the getf.d instruction takes a floating point operand and + /// returns its 64-bit memory representation as an i64. + GETFD, + + /// BRCALL - the call hack (see the pre-removal backend). + BRCALL, + + /// RET_FLAG - Return with a flag operand. + RET_FLAG, + + /// TLS_TPREL - local-exec thread-pointer-relative offset of a thread-local + /// symbol. Its single operand is a TargetGlobalAddress tagged S_TPREL; + /// selected to 'movl rX = @tprel(sym)'. 
+ TLS_TPREL, + + /// TLS_GOTLOAD - a value loaded from the symbol's GOT slot. Its single + /// operand is a TargetGlobalAddress whose target flags carry the @ltoff(...) + /// specifier (S_LTOFF_TPREL / S_LTOFF_DTPMOD / S_LTOFF_DTPREL); selected to + /// 'addl rX = @ltoff(...), gp ;; ld8 rX = [rX]', the GlobalAddress GOT sequence. + TLS_GOTLOAD +}; +} // end namespace IA64ISD + +class IA64TargetLowering : public TargetLowering { +public: + explicit IA64TargetLowering(const TargetMachine &TM, + const TargetSubtargetInfo &STI); + + const char *getTargetNodeName(unsigned Opcode) const override; + + /// Jump-table entries are absolute code pointers (data8