# --- T2-COPYRIGHT-BEGIN ---
# t2/package/*/llvm/ia64-target.patch.ia64
# Copyright (C) 2026 The T2 SDE Project
# SPDX-License-Identifier: GPL-2.0 or patched project license
# --- T2-COPYRIGHT-END ---

diff --git llvm/CMakeLists.txt llvm/CMakeLists.txt
index fcbfed495383..5a95427b08ea 100644
--- llvm/CMakeLists.txt
+++ llvm/CMakeLists.txt
@@ -542,6 +542,7 @@ set(LLVM_ALL_EXPERIMENTAL_TARGETS
   ARC
   CSKY
   DirectX
+  IA64
   M68k
   Xtensa
 )
diff --git llvm/cmake/config-ix.cmake llvm/cmake/config-ix.cmake
index ed2bfa6df68f..387ba4bbf168 100644
--- llvm/cmake/config-ix.cmake
+++ llvm/cmake/config-ix.cmake
@@ -567,6 +567,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "msp430")
   set(LLVM_NATIVE_ARCH MSP430)
 elseif (LLVM_NATIVE_ARCH MATCHES "hexagon")
   set(LLVM_NATIVE_ARCH Hexagon)
+elseif (LLVM_NATIVE_ARCH MATCHES "ia64")
+  set(LLVM_NATIVE_ARCH IA64)
 elseif (LLVM_NATIVE_ARCH MATCHES "s390x")
   set(LLVM_NATIVE_ARCH SystemZ)
 elseif (LLVM_NATIVE_ARCH MATCHES "wasm32")
diff --git llvm/include/llvm/IR/RuntimeLibcalls.td llvm/include/llvm/IR/RuntimeLibcalls.td
index e7d636841c4b..984887bb554c 100644
--- llvm/include/llvm/IR/RuntimeLibcalls.td
+++ llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -3533,6 +3533,30 @@ def LegacyDefaultSystemLibrary
          DefaultStackProtector
 )>;
 
+//===----------------------------------------------------------------------===//
+// IA-64 Runtime Libcalls
+//===----------------------------------------------------------------------===//
+
+def isIA64 : RuntimeLibcallPredicate<"TT.getArch() == Triple::ia64">;
+
+// Same as the legacy default set, plus the F80 libcalls that the default set
+// strips: IA-64's 'long double' is the 80-bit double-extended format, so they
+// are live here. libgcc's ia64 lib1funcs provides __divxf3/__divdf3/__divsf3
+// (there is no FP divide instruction); add/sub/mul are native, so __addxf3
+// etc. do not exist and nothing expands to them. libm provides the l-suffixed
+// math functions (fmodl, sinl, sqrtl, ...).
+def IA64SystemLibrary
+    : SystemRuntimeLibrary<isIA64,
+      (add DefaultRuntimeLibcallImpls,
+         DefaultRuntimeLibcallImpls_f80,
+         LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF80,
+         LibmHasExp10F32, LibmHasExp10F64, LibmHasExp10F80,
+         LibmHasFrexpF80, LibmHasLdexpF80,
+         __powisf2, __powidf2, __powixf2,
+         Int128RTLibcalls,
+         DefaultStackProtector
+)>;
+
 //===----------------------------------------------------------------------===//
 // Vector math libraries
 //===----------------------------------------------------------------------===//
diff --git llvm/include/llvm/MC/MCAsmInfo.h llvm/include/llvm/MC/MCAsmInfo.h
index ea8ac6dbe6e3..6cdb6f998619 100644
--- llvm/include/llvm/MC/MCAsmInfo.h
+++ llvm/include/llvm/MC/MCAsmInfo.h
@@ -206,6 +206,9 @@ protected:
   /// quotes.
   bool SupportsQuotedNames = true;
 
+  /// If true, append '#' to every non-temporary symbol reference.
+  bool UseSymbolHashSuffix = false;
+
   /// This is true if data region markers should be printed as
   /// ".data_region/.end_data_region" directives. If false, use "$d/$a" labels
   /// instead.
@@ -571,6 +574,8 @@ public:
   }
   bool supportsNameQuoting() const { return SupportsQuotedNames; }
 
+  bool useSymbolHashSuffix() const { return UseSymbolHashSuffix; }
+
   bool doesSupportDataRegionDirectives() const {
     return UseDataRegionDirectives;
   }
diff --git llvm/include/llvm/TargetParser/Triple.h llvm/include/llvm/TargetParser/Triple.h
index 9c83abeeb3b1..7f9b3cae974a 100644
--- llvm/include/llvm/TargetParser/Triple.h
+++ llvm/include/llvm/TargetParser/Triple.h
@@ -61,6 +61,7 @@ public:
     csky,        // CSKY: csky
     dxil,        // DXIL 32-bit DirectX bytecode
     hexagon,     // Hexagon: hexagon
+    ia64,        // IA-64 (Itanium): ia64
     loongarch32, // LoongArch (32-bit): loongarch32
     loongarch64, // LoongArch (64-bit): loongarch64
     m68k,        // M68k: Motorola 680x0 family
diff --git llvm/lib/MC/MCSymbol.cpp llvm/lib/MC/MCSymbol.cpp
index cf44005139ab..971b4dedbd74 100644
--- llvm/lib/MC/MCSymbol.cpp
+++ llvm/lib/MC/MCSymbol.cpp
@@ -61,8 +61,17 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
   // some targets support quoting names with funny characters.  If the name
   // contains a funny character, then print it quoted.
   StringRef Name = getName();
+
+  // Some assemblers (IA-64 GNU as) parse a bare identifier matching a register
+  // name alias as that register even in symbol position, so a non-temporary
+  // symbol reference is decorated with a trailing '#' that the assembler strips.
+  // See MCAsmInfo::UseSymbolHashSuffix.
+  bool HashSuffix = MAI && MAI->useSymbolHashSuffix() && !isTemporary();
+
   if (!MAI || MAI->isValidUnquotedName(Name)) {
     OS << Name;
+    if (HashSuffix)
+      OS << '#';
     return;
   }
 
@@ -81,6 +90,8 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
       OS << C;
   }
   OS << '"';
+  if (HashSuffix)
+    OS << '#';
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git llvm/lib/Target/IA64/CMakeLists.txt llvm/lib/Target/IA64/CMakeLists.txt
new file mode 100644
index 000000000000..7fa1c25e8e84
--- /dev/null
+++ llvm/lib/Target/IA64/CMakeLists.txt
@@ -0,0 +1,51 @@
+add_llvm_component_group(IA64)
+
+# The IA64ISD SDNodes are declared by hand in IA64ISelLowering.h (as the
+# pre-removal backend did and many in-tree targets still do), so
+# -gen-sd-node-info is not used. AsmParser, Disassembler and object emission
+# (-gen-asm-matcher / -gen-disassembler / MCCodeEmitter / AsmBackend) are out of
+# Stage-1 scope (asm-output path only).
+
+set(LLVM_TARGET_DEFINITIONS IA64.td)
+
+tablegen(LLVM IA64GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM IA64GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM IA64GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM IA64GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM IA64GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM IA64GenSubtargetInfo.inc -gen-subtarget)
+
+add_public_tablegen_target(IA64CommonTableGen)
+
+add_llvm_target(IA64CodeGen
+  IA64AsmPrinter.cpp
+  IA64Bundling.cpp
+  IA64FrameLowering.cpp
+  IA64ISelDAGToDAG.cpp
+  IA64ISelLowering.cpp
+  IA64InstrInfo.cpp
+  IA64MCInstLower.cpp
+  IA64MachineFunctionInfo.cpp
+  IA64RegisterInfo.cpp
+  IA64Subtarget.cpp
+  IA64TargetMachine.cpp
+
+  LINK_COMPONENTS
+  AsmPrinter
+  CodeGen
+  CodeGenTypes
+  Core
+  MC
+  SelectionDAG
+  Support
+  Target
+  TargetParser
+  IA64Desc
+  IA64Info
+
+  ADD_TO_COMPONENT
+  IA64
+  )
+
+add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
diff --git llvm/lib/Target/IA64/IA64.h llvm/lib/Target/IA64/IA64.h
new file mode 100644
index 000000000000..de72d6fdcb02
--- /dev/null
+++ llvm/lib/Target/IA64/IA64.h
@@ -0,0 +1,29 @@
+//===-- IA64.h - Top-level interface for IA64 representation ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// IA64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64_H
+#define LLVM_LIB_TARGET_IA64_IA64_H
+
+namespace llvm {
+
+class FunctionPass;
+class PassRegistry;
+class TargetMachine;
+
+FunctionPass *createIA64ISelDag(TargetMachine &TM);
+FunctionPass *createIA64BundlingPass();
+void initializeIA64DAGToDAGISelLegacyPass(PassRegistry &);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64_H
diff --git llvm/lib/Target/IA64/IA64.td llvm/lib/Target/IA64/IA64.td
new file mode 100644
index 000000000000..7da4cb0fc3e8
--- /dev/null
+++ llvm/lib/Target/IA64/IA64.td
@@ -0,0 +1,52 @@
+//===-- IA64.td - Target definition file for Intel IA64 ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel IA-64 architecture, also
+// known variously as ia64, IA-64, IPF, "the Itanium architecture" etc.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+include "llvm/Target/Target.td"
+include "llvm/TableGen/SearchableTable.td"
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "IA64RegisterInfo.td"
+
+// Map the PointerLikeRegClass (ptr_rc) operands of the target-independent
+// pseudo-instructions to IA-64's general registers.
+defm : RemapAllTargetPseudoPointerOperands<GR>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrInfo.td"
+
+def IA64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "IA64CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// IA-64 processors
+//===----------------------------------------------------------------------===//
+
+// The pre-removal backend had a trivial subtarget with no features; a single
+// generic processor with no scheduling model is enough to drive -gen-subtarget.
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
+def IA64 : Target {
+  let InstructionSet = IA64InstrInfo;
+}
diff --git llvm/lib/Target/IA64/IA64AsmPrinter.cpp llvm/lib/Target/IA64/IA64AsmPrinter.cpp
new file mode 100644
index 000000000000..7ad8b417b005
--- /dev/null
+++ llvm/lib/Target/IA64/IA64AsmPrinter.cpp
@@ -0,0 +1,253 @@
+//===-- IA64AsmPrinter.cpp - Print out IA64 LLVM as assembly --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts the machine-dependent LLVM code
+// to GNU 'gas'-compatible IA-64 assembly. Unlike the pre-removal backend, which
+// hand-formatted each MachineInstr, this lowers each MachineInstr to an MCInst
+// and lets the streamer + IA64InstPrinter emit the text.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "IA64MCInstLower.h"
+#include "MCTargetDesc/IA64InstPrinter.h"
+#include "MCTargetDesc/IA64MCAsmInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "MCTargetDesc/IA64TargetStreamer.h"
+#include "TargetInfo/IA64TargetInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class IA64AsmPrinter : public AsmPrinter {
+  // Per-function unwind-directive state, consumed by emitInstruction and reset
+  // in emitFunctionBodyStart: has '.body' / '.fframe' been emitted yet?
+  bool EmittedBody = false;
+  bool EmittedFFrame = false;
+  // A framed function with more than one epilogue needs .label_state /
+  // .copy_state around its '.restore sp's; otherwise gas rejects the second one.
+  bool NeedCopyState = false;
+  // True only while emitGlobalAlias runs: an alias names the aliasee's
+  // entry-point symbol directly (`A = B`), so lowerConstant must not apply the
+  // @fptr descriptor wrapping it uses for functions stored in data. See
+  // emitGlobalAlias / lowerConstant.
+  bool InAliasLowering = false;
+
+  IA64TargetStreamer &getTargetStreamer() {
+    return static_cast<IA64TargetStreamer &>(*OutStreamer->getTargetStreamer());
+  }
+
+public:
+  static char ID;
+
+  explicit IA64AsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
+
+  StringRef getPassName() const override { return "IA64 Assembly Printer"; }
+
+  void emitStartOfAsmFile(Module &M) override;
+  void emitFunctionEntryLabel() override;
+  void emitFunctionBodyStart() override;
+  void emitFunctionBodyEnd() override;
+  void emitInstruction(const MachineInstr *MI) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       const char *ExtraCode, raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             const char *ExtraCode, raw_ostream &O) override;
+  void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override;
+  const MCExpr *lowerConstant(const Constant *CV, const Constant *BaseCV,
+                              uint64_t Offset) override;
+};
+} // end anonymous namespace
+
+char IA64AsmPrinter::ID = 0;
+
+void IA64AsmPrinter::emitStartOfAsmFile(Module & /*M*/) {
+  // The IA-64 preamble GNU gas expects, in the order the pre-removal backend
+  // emitted it. (HP-UX would want msb instead of lsb; only 64-bit is
+  // supported.)
+  for (const char *Directive : {"\t.psr\tlsb", "\t.radix\tC", "\t.psr\tabi64"})
+    OutStreamer->emitRawText(StringRef(Directive));
+}
+
+void IA64AsmPrinter::emitFunctionEntryLabel() {
+  // Like gcc, open the unwind region ahead of the function label. Later
+  // directives come from emitInstruction; emitFunctionBodyEnd emits .endp.
+  IA64TargetStreamer &TS = getTargetStreamer();
+  TS.emitProc(CurrentFnSym);
+  AsmPrinter::emitFunctionEntryLabel();
+}
+
+void IA64AsmPrinter::emitFunctionBodyStart() {
+  EmittedFFrame = false;
+  EmittedBody = false;
+
+  // gcc emits a bare '.restore sp' for a single epilogue and none at all for a
+  // frameless function. Because '.restore sp' closes its unwind region, only a
+  // framed function with two or more return blocks needs .label_state /
+  // .copy_state to re-open it. A non-zero getStackSize() is exactly the
+  // has-a-frame (hence has-a-'.restore sp') condition.
+  NeedCopyState = false;
+  if (MF->getFrameInfo().getStackSize() == 0)
+    return;
+  unsigned Epilogues = 0;
+  for (const MachineBasicBlock &Block : *MF)
+    Epilogues += !Block.empty() && Block.back().getOpcode() == IA64::RET;
+  NeedCopyState = Epilogues > 1;
+}
+
+void IA64AsmPrinter::emitFunctionBodyEnd() {
+  getTargetStreamer().emitEndP(CurrentFnSym); // .endp, after the body
+}
+
+void IA64AsmPrinter::emitInstruction(const MachineInstr *MI) {
+  IA64TargetStreamer &TS = getTargetStreamer();
+
+  // Unwind directives go out before the instruction they describe so that gas
+  // ties the record to the right PC. FrameSetup marks the prologue (alloc, the
+  // rp save, the stack adjust), FrameDestroy the stack restore; the first
+  // instruction that is not FrameSetup, meta or STOP ends it with .body.
+  if (MI->getFlag(MachineInstr::FrameSetup)) {
+    switch (MI->getOpcode()) {
+    case IA64::ALLOC:
+      // alloc copies the caller's ar.pfs into its destination register.
+      TS.emitPrologueDirective();
+      TS.emitSaveARPFS(
+          IA64InstPrinter::getRegisterName(MI->getOperand(0).getReg().asMCReg()));
+      break;
+    case IA64::MOV:
+      // The return-pointer save is 'mov rN = rp'; distinguish it from the
+      // frame-pointer setup 'mov r5 = r12' by its source register.
+      if (MI->getOperand(1).getReg() == IA64::rp)
+        TS.emitSaveRP(IA64InstPrinter::getRegisterName(
+            MI->getOperand(0).getReg().asMCReg()));
+      break;
+    case IA64::ADDIMM22:
+    case IA64::ADD:
+      // The stack-pointer adjustment writes r12; the .fframe value is the final
+      // frame size frame lowering settled on.
+      if (!EmittedFFrame && MI->getOperand(0).getReg() == IA64::r12) {
+        TS.emitFFrame(MF->getFrameInfo().getStackSize());
+        EmittedFFrame = true;
+      }
+      break;
+    }
+  } else if (!EmittedBody && !MI->isMetaInstruction() &&
+             MI->getOpcode() != IA64::STOP) {
+    // First real body instruction: close the prologue region. The bundler's
+    // STOP (';;') is skipped because one can sit between prologue instructions
+    // (e.g. after 'alloc') and would push .fframe / a late .save past .body.
+    TS.emitBody();
+    EmittedBody = true;
+    if (NeedCopyState)
+      TS.emitLabelState(1);
+  }
+
+  // The stack restore gets '.restore sp' (after .copy_state when there are
+  // several epilogues). NOTE(review): gcc puts .copy_state at the start of the
+  // block after an epilogue, not here -- TODO confirm this placement.
+  if (MI->getFlag(MachineInstr::FrameDestroy) &&
+      (MI->getOpcode() == IA64::ADDIMM22 || MI->getOpcode() == IA64::ADD) &&
+      MI->getOperand(0).getReg() == IA64::r12) {
+    if (NeedCopyState)
+      TS.emitCopyState(1);
+    TS.emitRestoreSP();
+  }
+
+  IA64MCInstLower Lower(OutContext, *this);
+  MCInst TmpInst;
+  Lower.Lower(MI, TmpInst);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Substitute a '$N' inline-asm operand. Only unmodified register and immediate
+// operands (the 'r'/'f' and immediate constraints) are printed here; every
+// other case is handed to the generic AsmPrinter implementation. Returns false
+// when the operand is printed here.
+bool IA64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                     const char *ExtraCode, raw_ostream &O) {
+  // IA-64 defines no operand modifiers of its own.
+  const bool HasModifier = ExtraCode && ExtraCode[0];
+  if (!HasModifier) {
+    const MachineOperand &Operand = MI->getOperand(OpNo);
+    if (Operand.isReg()) {
+      O << IA64InstPrinter::getRegisterName(Operand.getReg().asMCReg());
+      return false;
+    }
+    if (Operand.isImm()) {
+      O << Operand.getImm();
+      return false;
+    }
+  }
+
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+}
+
+// Print an inline-asm memory operand ('m') as '[rN]': the address is held in a
+// single register. Returns true (nothing printed) for a non-register operand.
+bool IA64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  if (ExtraCode && *ExtraCode)
+    return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, O);
+
+  const MachineOperand &Address = MI->getOperand(OpNo);
+  if (Address.isReg())
+    O << '[' << IA64InstPrinter::getRegisterName(Address.getReg().asMCReg())
+      << ']';
+  return !Address.isReg();
+}
+
+// On IA-64 a function alias has to resolve to the aliasee's *entry point*, not
+// to its function descriptor. The generic AsmPrinter lowers the aliasee through
+// lowerConstant(), which wraps functions in @fptr, so it would emit
+// `A = @fptr(B)`. That mis-aliases A to the descriptor and also makes GNU as
+// abort, because a symbol's value expression can't be an @fptr pseudo-fixup
+// ("Case value 64 unexpected" in resolve_symbol_value). InAliasLowering is set
+// for the duration of the generic call so that lowerConstant returns the bare
+// entry-point symbol instead, giving the correct `A = B`.
+void IA64AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
+  InAliasLowering = true;
+  AsmPrinter::emitGlobalAlias(M, GA);
+  InAliasLowering = false;
+}
+
+const MCExpr *IA64AsmPrinter::lowerConstant(const Constant *CV,
+                                            const Constant *BaseCV,
+                                            uint64_t Offset) {
+  const auto *Fn = dyn_cast<Function>(CV);
+  if (!Fn)
+    return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
+  // Data that holds a function pointer holds the address of the function's
+  // descriptor { entry, gp } rather than its entry point, hence data8 @fptr(f):
+  // the linker materializes the .opd descriptor and an indirect call
+  // dereferences it. An alias, however, must equal the entry point itself.
+  const MCExpr *Entry = MCSymbolRefExpr::create(getSymbol(Fn), OutContext);
+  if (InAliasLowering)
+    return Entry;
+  return MCSpecifierExpr::create(Entry, IA64::S_FPTR, OutContext);
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeIA64AsmPrinter() {
+  RegisterAsmPrinter<IA64AsmPrinter> Registration(getTheIA64Target());
+}
diff --git llvm/lib/Target/IA64/IA64Bundling.cpp llvm/lib/Target/IA64/IA64Bundling.cpp
new file mode 100644
index 000000000000..93fb1be99391
--- /dev/null
+++ llvm/lib/Target/IA64/IA64Bundling.cpp
@@ -0,0 +1,121 @@
+//===-- IA64Bundling.cpp - IA-64 instruction bundling pass. --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Add stops (;;) where required to prevent read-after-write and write-after-
+// write dependencies, for registers. (The pre-removal pass noted exceptions for
+// parallel compares targeting p0; those are not reintroduced here.)
+//
+// FIXME: actual bundle formation is left to the assembler; this only inserts
+// stop bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ia64-bundling"
+
+STATISTIC(StopBitsAdded, "Number of stop bits added");
+
+namespace {
+struct IA64BundlingPass : public MachineFunctionPass {
+  static char ID;
+
+  IA64BundlingPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "IA64 (Itanium) Bundling Pass";
+  }
+
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    TII = F.getSubtarget().getInstrInfo();
+    // The same pass object runs on every function, so drop all carried state:
+    // a write left pending by the previous function must not force a stop.
+    PendingRegWrites.clear();
+    RSEWrite = false;
+    bool Changed = false;
+    for (MachineBasicBlock &MBB : F)
+      Changed |= runOnMachineBasicBlock(MBB);
+    return Changed;
+  }
+
+private:
+  const TargetInstrInfo *TII = nullptr;
+
+  // Registers written since the last stop. Carried across blocks because a
+  // fall-through does not end an instruction group (a taken branch does).
+  std::set<unsigned> PendingRegWrites;
+
+  // True from an alloc until the next stop: alloc writes the RSE/CFM and must
+  // be separated from a later call, which is often in a fall-through successor
+  // of the entry block, so this is carried across blocks as well.
+  bool RSEWrite = false;
+};
+char IA64BundlingPass::ID = 0;
+} // end anonymous namespace
+
+/// createIA64BundlingPass - Factory for the pass that inserts STOP (;;)
+/// instructions wherever register dependencies between instructions need one.
+FunctionPass *llvm::createIA64BundlingPass() { return new IA64BundlingPass(); }
+
+bool IA64BundlingPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool InsertedStop = false;
+  // Walk the block in layout order. PendingRegWrites and RSEWrite are members,
+  // so the state left by the previously visited block carries into this one.
+  for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+    MachineInstr &Inst = *Next;
+    ++Next;
+    // Registers this instruction reads / writes. TODO: exclude p0 from both.
+    std::set<unsigned> Uses, Defs;
+    for (const MachineOperand &Op : Inst.operands()) {
+      if (!Op.isReg())
+        continue;
+      if (Op.isUse())
+        Uses.insert(Op.getReg());
+      if (Op.isDef())
+        Defs.insert(Op.getReg());
+    }
+
+    // Keep only the registers whose writer has not yet been separated from
+    // this instruction by a stop; Defs itself is still needed in full below.
+    std::set<unsigned> HazardDefs = Defs;
+    set_intersect(Uses, PendingRegWrites);
+    set_intersect(HazardDefs, PendingRegWrites);
+
+    const bool AllocBeforeCall = RSEWrite && Inst.isCall();
+    const bool RegHazard = !Uses.empty() || !HazardDefs.empty();
+    if (AllocBeforeCall || RegHazard) {
+      // Start a new instruction group at this instruction: emit the stop and
+      // restart the pending set from this instruction's own writes.
+      BuildMI(MBB, Inst, Inst.getDebugLoc(), TII->get(IA64::STOP));
+      PendingRegWrites = Defs;
+      RSEWrite = false;
+      InsertedStop = true;
+      ++StopBitsAdded;
+    } else {
+      // Same group: this instruction's writes become pending as well.
+      set_union(PendingRegWrites, Defs);
+    }
+
+    // An alloc writes the RSE, so a later call needs a stop in between.
+    if (Inst.getOpcode() == IA64::ALLOC)
+      RSEWrite = true;
+  }
+  return InsertedStop;
+}
diff --git llvm/lib/Target/IA64/IA64CallingConv.td llvm/lib/Target/IA64/IA64CallingConv.td
new file mode 100644
index 000000000000..c585afabc0ac
--- /dev/null
+++ llvm/lib/Target/IA64/IA64CallingConv.td
@@ -0,0 +1,96 @@
+//===-- IA64CallingConv.td - Calling conventions for IA64 --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the IA-64 architecture. The
+// pre-removal backend hand-coded this logic in IA64ISelLowering; modern LLVM
+// expresses it as CCState-driven tables, so the equivalent mapping is captured
+// here and consumed via -gen-callingconv.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// IA-64 C argument calling convention.
+//===----------------------------------------------------------------------===//
+def CC_IA64 : CallingConv<[
+  // A large aggregate return value is materialized in a caller-allocated
+  // buffer; its address arrives in r8, not a normal parameter slot (psABI
+  // §8.6). r8 is scratch, so it does not consume the first slot (r32).
+  CCIfSRet<CCIfType<[i64], CCAssignToReg<[r8]>>>,
+
+  // Integer types smaller than a register are passed as i64.
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+
+  // long double (f80): one FP register, shadowing two GR parameter slots.
+  CCIfType<[f80], CCCustom<"CC_IA64_F80">>,
+
+  // The first eight integer arguments arrive in the incoming stacked GP
+  // registers r32-r39.
+  CCIfType<[i64], CCAssignToReg<[r32, r33, r34, r35, r36, r37, r38, r39]>>,
+
+  // FP scalars: one positional parameter slot each. A fixed arg arrives in
+  // F8-F15 while *reserving* its GR slot so a following integer keeps its slot;
+  // a variadic arg arrives in the GR slot in memory format (psABI §8.5.4).
+  // CCAssignToRegWithShadow cannot express the "reserve the slot, not the
+  // FP-indexed GR" rule, so use a custom hook.
+  CCIfType<[f64, f32], CCCustom<"CC_IA64_FP">>,
+
+  // Everything beyond the eight register slots is passed in 8-byte stack slots.
+  CCIfType<[i64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// IA-64 C *outgoing* (caller-side) argument convention.
+//
+// A caller places arguments in its output registers out0-out7, which the
+// callee's 'alloc' renames into that callee's incoming r32+. This mirrors
+// CC_IA64 exactly, but targets the out registers instead of r32-r39.
+//===----------------------------------------------------------------------===//
+def CC_IA64_Call : CallingConv<[
+  // Large-aggregate-return buffer address goes in r8 (psABI §8.6), matching the
+  // incoming side in CC_IA64.
+  CCIfSRet<CCIfType<[i64], CCAssignToReg<[r8]>>>,
+
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+  CCIfType<[f32], CCPromoteToType<f64>>,
+
+  // long double (f80): a fixed arg goes in one FP register, shadowing two
+  // output parameter slots; a variadic arg goes in two general registers
+  // (memory format). LowerCall splits the variadic case.
+  CCIfType<[f80], CCCustom<"CC_IA64_Call_F80">>,
+
+  CCIfType<[i64],
+           CCAssignToReg<[out0, out1, out2, out3, out4, out5, out6, out7]>>,
+
+  // FP scalars (f64; f32 promoted above): one positional output slot each. A
+  // fixed arg goes in F8-F15 while reserving its output GR slot; an arg matched
+  // by '...' goes in the output GR slot in memory format (prototyped variadic
+  // call, psABI §8.5.4) -- LowerCall bit-casts it to its i64 pattern. The hook
+  // reserves the slot positionally, which CCAssignToRegWithShadow cannot do.
+  CCIfType<[f64], CCCustom<"CC_IA64_Call_FP">>,
+
+  CCIfType<[i64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// IA-64 C return-value convention.
+//===----------------------------------------------------------------------===//
+def RetCC_IA64 : CallingConv<[
+  CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Scalar integers/pointers return in r8 (65-128 bits in r8,r9). Aggregates up
+  // to 256 bits are returned by value in r8-r11 -- clang coerces them to an
+  // [N x i64] whose elements land here in order (psABI Table 8-2). Aggregates
+  // larger than 256 bits are returned via a caller buffer whose address is
+  // passed in r8 (sret; see CCIfSRet in the argument conventions).
+  CCIfType<[i64], CCAssignToReg<[r8, r9, r10, r11]>>,
+
+  // Floating-point and homogeneous FP aggregates return in F8-F15 (a coerced
+  // [N x double]/[N x float] supplies one value per register).
+  CCIfType<[f64, f32], CCAssignToReg<[F8, F9, F10, F11, F12, F13, F14, F15]>>,
+  CCIfType<[f80], CCAssignToReg<[F8, F9, F10, F11, F12, F13, F14, F15]>>
+]>;
diff --git llvm/lib/Target/IA64/IA64FrameLowering.cpp llvm/lib/Target/IA64/IA64FrameLowering.cpp
new file mode 100644
index 000000000000..a809e1a992a9
--- /dev/null
+++ llvm/lib/Target/IA64/IA64FrameLowering.cpp
@@ -0,0 +1,282 @@
+//===-- IA64FrameLowering.cpp - IA64 Frame Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of TargetFrameLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64FrameLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64RegisterInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+// A dedicated frame pointer is needed only when the frame holds variable-sized
+// objects. (The pre-removal backend also consulted the -fomit-frame-pointer
+// global, which no longer exists.)
+bool IA64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  return FrameInfo.hasVarSizedObjects();
+}
+
+// Emit the prologue into MBB: 'alloc', the ar.pfs/rp parking, and sp setup.
+void IA64FrameLowering::emitPrologue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  bool FP = hasFP(MF);
+  DebugLoc DL;
+
+  // First, handle the 'alloc' instruction, which must be at the top of any
+  // function. There are 96 stacked GPRs the RSE worries about.
+  unsigned NumStackedGPRsUsed = 0;
+  for (unsigned i = 0; i != IA64NumStackedGPRs; ++i) {
+    // SkipRegMaskTest: count a stacked register only if it is really allocated
+    // to a value here, not merely clobbered by a call's regmask. A returns_twice
+    // (vfork/setjmp) call carries a regmask clobbering all of r32-r127 (see
+    // IA64TargetLowering::AdjustInstrPostInstrSelection) to keep values out of
+    // the RSE-backed frame across it; without skipping the mask that would size
+    // this 'alloc' to the full 96-register frame.
+    if (MF.getRegInfo().isPhysRegUsed(getIA64StackedGPR(i),
+                                      /*SkipRegMaskTest=*/true))
+      NumStackedGPRsUsed = i + 1; // i+1, not ++ - consider fn(fp, fp, int)
+  }
+
+  unsigned NumOutRegsUsed = MF.getInfo<IA64FunctionInfo>()->OutRegsUsed;
+
+  IA64FunctionInfo *FInfo = MF.getInfo<IA64FunctionInfo>();
+
+  // Park the caller's ar.pfs in a fixed stacked local for the whole function.
+  // 'alloc' writes the incoming ar.pfs into its destination register, and every
+  // function must restore that value before br.ret so the register stack engine
+  // can recover the caller's frame. Make the destination a fresh stacked local
+  // just above the ones the allocator used: a register stack engine local is
+  // preserved across calls for free, and because the allocator never sees this
+  // register it is never spilled -- so the value stays in one place that the
+  // unwinder can name in a single '.save ar.pfs, <reg>' directive valid for the
+  // entire body.
+  //
+  // The old backend instead let the allocator place the ar.pfs-save value (via
+  // PSEUDO_ALLOC). In a non-leaf function the allocator spilled that value to a
+  // stack slot across calls and reused the register, so '.save ar.pfs, <reg>'
+  // named a register that no longer held ar.pfs at the call sites. That was
+  // invisible to gdb's read-only backtrace (which only needs the return address
+  // from '.save rp') but crashed libgcc's forced unwinder (pthread_exit /
+  // pthread_cancel), which must actually restore ar.pfs to pop the RSE frame.
+  Register SavedPFSReg = getIA64StackedGPR(NumStackedGPRsUsed);
+  ++NumStackedGPRsUsed;
+  FInfo->setSavedPFSReg(SavedPFSReg);
+
+  // For a non-leaf function, br.call overwrites the return pointer (b0/rp), so
+  // we must preserve the caller's return address for our own br.ret. The
+  // register allocator already does this lazily -- it copies rp into a stacked
+  // local around each call -- but those copies land in a different register at
+  // each call site, so there is no single location the unwinder can name. Park
+  // rp once here, in a fresh stacked local (just like ar.pfs above), so the
+  // frame is describable by one '.save rp, <reg>' directive. emitEpilogue
+  // restores b0 from it.
+  //
+  // hasCalls() is the right test: it covers libcalls (e.g. the __divdi3 a sdiv
+  // lowers to) that clobber rp without any IR-level call, which a check earlier
+  // than frame lowering could not see.
+  Register SavedRPReg;
+  if (MFI.hasCalls()) {
+    SavedRPReg = getIA64StackedGPR(NumStackedGPRsUsed);
+    ++NumStackedGPRsUsed;
+    FInfo->setSavedRPReg(SavedRPReg);
+  }
+
+  // The whole stacked frame -- locals (the allocator's plus our ar.pfs/rp saves)
+  // and the outputs (out0-out7, placed by gas above the locals) -- must fit in
+  // the 96-register window. getReservedRegs guarantees this by capping the
+  // allocator's locals: it reserves the top 10 stacked GPRs (8 outputs + the rp
+  // save + the ar.pfs save).
+  assert(NumStackedGPRsUsed + NumOutRegsUsed <= IA64NumStackedGPRs &&
+         "stacked-GPR frame overflow: locals + saves + outputs > 96");
+
+  // 'alloc' must be the first instruction in the function; its destination is
+  // the parked ar.pfs local. Mark that operand as a Define: 'alloc' writes the
+  // caller's ar.pfs into it, and the bundling pass needs to see that write so it
+  // inserts the mandatory stop before any instruction that reads the register --
+  // notably the epilogue's 'mov ar.pfs = <reg>'. (Using 'alloc's result, or any
+  // register it renames, in the same instruction group is illegal and faults
+  // with SIGILL.) The old backend's PSEUDO_ALLOC supplied this def; without it,
+  // a use of the addReg default would leave 'alloc' looking like a reader.
+  //
+  // Tag it (and the rest of the prologue below) as frame setup so the asm
+  // printer can hang the IA-64 unwind directives (.prologue / .save ar.pfs /
+  // .save rp / .fframe) off the right instructions.
+  BuildMI(MBB, MBBI, DL, TII->get(IA64::ALLOC))
+      .addReg(SavedPFSReg, RegState::Define)
+      .addImm(0)
+      .addImm(NumStackedGPRsUsed)
+      .addImm(NumOutRegsUsed)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // The ar.pfs local is defined by 'alloc' here and used by emitEpilogue's
+  // restore in another block; mark it live across the whole function so its
+  // value is correctly seen as live everywhere.
+  for (MachineBasicBlock &Block : MF)
+    if (&Block != &MBB)
+      Block.addLiveIn(SavedPFSReg);
+
+  // Save the incoming return pointer into its parked local, and likewise mark it
+  // live across the function.
+  if (SavedRPReg) {
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), SavedRPReg)
+        .addReg(IA64::rp)
+        .setMIFlag(MachineInstr::FrameSetup);
+    for (MachineBasicBlock &Block : MF)
+      if (&Block != &MBB)
+        Block.addLiveIn(SavedRPReg);
+  }
+
+  unsigned NumBytes = MFI.getStackSize();
+
+  if (FP)
+    NumBytes += 8; // reserve space for the old FP
+
+  // Do we need to allocate space on the stack?
+  if (NumBytes == 0)
+    return;
+
+  // Add the 16-byte scratch area at the bottom of the stack (and, with FP, 8
+  // more on top of the 8 above), then round up to the stack alignment.
+  unsigned Align = getStackAlign().value();
+  unsigned Size = 16 + (FP ? 8 : 0);
+  NumBytes = (NumBytes + Size + Align - 1) / Align * Align;
+  MFI.setStackSize(NumBytes);
+
+  // Adjust the stack pointer: r12 -= NumBytes.
+  if (NumBytes <= 8191) {
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::ADDIMM22), IA64::r12)
+        .addReg(IA64::r12)
+        .addImm(-(int64_t)NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+  } else { // use r22 as a scratch register
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::MOVLIMM64), IA64::r22)
+        .addImm(-(int64_t)NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::ADD), IA64::r12)
+        .addReg(IA64::r12)
+        .addReg(IA64::r22)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // With a frame pointer: store the old FP (r5) at [r12], then set r5 = r12.
+  if (FP) {
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::ST8))
+        .addReg(IA64::r12)
+        .addReg(IA64::r5)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(IA64::MOV), IA64::r5)
+        .addReg(IA64::r12)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+void IA64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  // Everything below is inserted in front of the block's terminating RET.
+  MachineBasicBlock::iterator RetPos = std::prev(MBB.end());
+  assert(RetPos->getOpcode() == IA64::RET &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc RetDL = RetPos->getDebugLoc();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  IA64FunctionInfo *FuncInfo = MF.getInfo<IA64FunctionInfo>();
+  const unsigned FrameBytes = MF.getFrameInfo().getStackSize();
+
+  // ar.pfs: copy the caller's value back from the stacked local 'alloc' wrote
+  // it to, so br.ret lets the register stack engine recover the caller's frame.
+  // emitPrologue parks it in every function, so the register is always valid;
+  // this read also anchors the '.save ar.pfs' unwind region across the body.
+  BuildMI(MBB, RetPos, RetDL, TII->get(IA64::MOV_TO_AR_PFS), IA64::AR_PFS)
+      .addReg(FuncInfo->getSavedPFSReg())
+      .setMIFlag(MachineInstr::FrameDestroy);
+
+  // rp (b0): when the prologue parked the incoming return pointer, move it back
+  // so br.ret returns to the caller. The read likewise keeps that local live
+  // for the '.save rp' unwind region.
+  Register RPLocal = FuncInfo->getSavedRPReg();
+  if (RPLocal)
+    BuildMI(MBB, RetPos, RetDL, TII->get(IA64::MOV), IA64::rp)
+        .addReg(RPLocal)
+        .setMIFlag(MachineInstr::FrameDestroy);
+
+  if (hasFP(MF)) {
+    // sp = fp: drops any dynamically allocated (alloca) space.
+    BuildMI(MBB, RetPos, RetDL, TII->get(IA64::MOV), IA64::r12)
+        .addReg(IA64::r5)
+        .setMIFlag(MachineInstr::FrameDestroy);
+    // Reload the caller's FP from the slot r5 points at.
+    BuildMI(MBB, RetPos, RetDL, TII->get(IA64::LD8), IA64::r5)
+        .addReg(IA64::r5)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  // Nothing was allocated on the memory stack: no sp adjustment to undo.
+  if (FrameBytes == 0)
+    return;
+
+  // Pop the frame: r12 += FrameBytes, through r22 when the size exceeds 8191.
+  if (FrameBytes <= 8191) {
+    BuildMI(MBB, RetPos, RetDL, TII->get(IA64::ADDIMM22), IA64::r12)
+        .addReg(IA64::r12)
+        .addImm(FrameBytes)
+        .setMIFlag(MachineInstr::FrameDestroy);
+    return;
+  }
+  BuildMI(MBB, RetPos, RetDL, TII->get(IA64::MOVLIMM64), IA64::r22)
+      .addImm(FrameBytes)
+      .setMIFlag(MachineInstr::FrameDestroy);
+  BuildMI(MBB, RetPos, RetDL, TII->get(IA64::ADD), IA64::r12)
+      .addReg(IA64::r12)
+      .addReg(IA64::r22)
+      .setMIFlag(MachineInstr::FrameDestroy);
+}
+
+MachineBasicBlock::iterator IA64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  // Without a frame pointer the pseudo is simply erased. With one, a nonzero
+  // adjcallstackdown becomes 'sp -= <amt>' and a nonzero adjcallstackup becomes
+  // 'sp += <amt>', each a single ADDIMM22 on r12 inserted before the pseudo.
+  MachineInstr &Old = *I;
+  unsigned Bytes = hasFP(MF) ? Old.getOperand(0).getImm() : 0;
+  if (Bytes == 0)
+    return MBB.erase(I);
+
+  // Keep the stack aligned: round up to the next alignment boundary.
+  const unsigned StackAlign = getStackAlign().value();
+  Bytes = (Bytes + StackAlign - 1) / StackAlign * StackAlign;
+
+  // Signed sp delta: negative for ADJUSTCALLSTACKDOWN, positive for ...UP.
+  int64_t Delta;
+  if (Old.getOpcode() == IA64::ADJUSTCALLSTACKDOWN) {
+    Delta = -(int64_t)Bytes;
+  } else {
+    assert(Old.getOpcode() == IA64::ADJUSTCALLSTACKUP);
+    Delta = Bytes;
+  }
+
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  DebugLoc DL = Old.getDebugLoc();
+  BuildMI(MBB, I, DL, TII->get(IA64::ADDIMM22), IA64::r12)
+      .addReg(IA64::r12)
+      .addImm(Delta);
+  return MBB.erase(I);
+}
diff --git llvm/lib/Target/IA64/IA64FrameLowering.h llvm/lib/Target/IA64/IA64FrameLowering.h
new file mode 100644
index 000000000000..d37a2e8421f0
--- /dev/null
+++ llvm/lib/Target/IA64/IA64FrameLowering.h
@@ -0,0 +1,57 @@
+//===-- IA64FrameLowering.h - Define frame lowering for IA64 ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the IA64-specific bits of the TargetFrameLowering
+// class. In the pre-removal backend this logic lived in IA64RegisterInfo;
+// modern LLVM splits frame lowering into its own class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/Alignment.h"
+
+namespace llvm {
+
+// IA-64 frame lowering: prologue/epilogue emission and call-frame pseudos.
+class IA64FrameLowering : public TargetFrameLowering {
+public:
+  // StackRealignable=false: this backend does not dynamically realign the
+  // stack. sp (r12) is only 16-byte aligned and the prologue never emits an
+  // 'and sp, -N', so a local whose alignment exceeds 16 cannot be honored at a
+  // static sp+offset slot. If we claimed otherwise (the default is true),
+  // FunctionLoweringInfo would fold an over-aligned (e.g. #[repr(align(64))])
+  // alloca into the static frame; SelectionDAG's computeKnownBits would then
+  // trust the frame-index pointer to be 64-aligned and rewrite field GEPs
+  // 'add base, k' into 'or base, k' -- which collide and corrupt fields once
+  // the runtime address is merely 16-aligned. With this false, such allocas are
+  // demoted to variable-sized objects and lowered via DYNAMIC_STACKALLOC (Expand
+  // emits 'sp -= size; sp &= -align'), so the pointer is genuinely aligned and
+  // the 'or' rewrite is valid. The demotion also sets hasVarSizedObjects(),
+  // which turns on hasFP so the epilogue restores sp from the frame pointer.
+  IA64FrameLowering()
+      : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/Align(16),
+                            /*LocalAreaOffset=*/0, /*TransientStackAlignment=*/
+                            Align(16), /*StackRealignable=*/false) {}
+
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+protected:
+  bool hasFPImpl(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64FRAMELOWERING_H
diff --git llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..c2593c910fdd
--- /dev/null
+++ llvm/lib/Target/IA64/IA64ISelDAGToDAG.cpp
@@ -0,0 +1,458 @@
+//===---- IA64ISelDAGToDAG.cpp - IA64 pattern matching inst selector ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for IA64,
+// converting a legalized DAG into an IA64 DAG.
+//
+// The pre-removal selector hand-selected a great deal (FP divide expansion, the
+// BRCALL call hack, manual load/store/branch handling). Most arithmetic now
+// flows through the tablegen-generated matcher (SelectCode); the cases that
+// cannot be (or were not) expressed as patterns are hand-selected here, as the
+// pre-removal backend did: FrameIndex (Stage 1); the branches BR/BRCOND, whose
+// target is an i64imm rather than a tablegen 'bb' operand; the IA64ISD::BRCALL
+// call node; and loads/stores, dispatched on the memory type (Stage C).
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "IA64ISelLowering.h"
+#include "MCTargetDesc/IA64MCAsmInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ia64-isel"
+#define PASS_NAME "IA64 (Itanium) DAG->DAG Pattern Instruction Selection"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+class IA64DAGToDAGISel : public SelectionDAGISel {
+public:
+  IA64DAGToDAGISel() = delete;
+
+  explicit IA64DAGToDAGISel(TargetMachine &TM) : SelectionDAGISel(TM) {}
+  // Per-node selection hook; defined out of line after this class.
+  void Select(SDNode *N) override;
+
+  // Inline-asm memory operand ('m'): IA-64 dereferences a single register, so
+  // the address is just passed through as one operand.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    InlineAsm::ConstraintCode ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  // Include the pieces autogenerated from the target description.
+#include "IA64GenDAGISel.inc"
+};
+// Legacy-pass wrapper: builds an IA64DAGToDAGISel and hands it to the base.
+class IA64DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
+public:
+  static char ID;
+  explicit IA64DAGToDAGISelLegacy(TargetMachine &TM)
+      : SelectionDAGISelLegacy(ID, std::make_unique<IA64DAGToDAGISel>(TM)) {}
+};
+} // end anonymous namespace
+
+char IA64DAGToDAGISelLegacy::ID = 0;
+// Register the legacy wrapper pass under DEBUG_TYPE with PASS_NAME as its name.
+INITIALIZE_PASS(IA64DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
+
+// Convert a target-independent node to a target-specific one, unless the
+// generated matcher can do it for us.
+void IA64DAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return; // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i64);
+    CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64, TFI);
+    return;
+  }
+
+  case ISD::GlobalAddress: {
+    // Materialize a global's address out of the linkage table (GOT), anchored
+    // by gp (r1), transcribing the pre-removal selector:
+    //   addl rX = <sym>, gp ;; ld8 rX = [rX]
+    // The ADDL_GA computes the address of the symbol's GOT slot relative to gp,
+    // and the LD8 loads the symbol's runtime address from it. The GOT slot is
+    // invariant, so the load is chained off the entry node.
+    const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
+    SDLoc dl(N);
+    // Tag the symbol with the @ltoff specifier (carried on the target flags);
+    // IA64MCInstLower turns it into the printed "@ltoff(sym)" so gas builds the
+    // GOT entry the LD8 below reads. A function's address is its descriptor, so
+    // the GOT entry must hold @ltoff(@fptr(f)) (the descriptor address), not the
+    // raw entry point -- an indirect call dereferences it as { entry, gp }.
+    unsigned Spec = isa<Function>(GV) ? IA64::S_LTOFF_FPTR : IA64::S_LTOFF;
+    SDValue GA = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, /*offset=*/0,
+                                                Spec);
+    SDValue Slot = SDValue(
+        CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64,
+                               CurDAG->getRegister(IA64::r1, MVT::i64), GA),
+        0);
+    SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other,
+                                        Slot, CurDAG->getEntryNode());
+    ReplaceUses(SDValue(N, 0), SDValue(Ld, 0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case IA64ISD::TLS_GOTLOAD: {
+    // Load a thread-local datum (a TLS offset or module id) from the symbol's
+    // GOT slot: addl rX = @ltoff(@<tls>(sym)), gp ;; ld8 rX = [rX]. Identical to
+    // the GlobalAddress case above, but the @ltoff specifier is already carried
+    // on the operand's target flags (set by LowerGlobalTLSAddress); the loaded
+    // value is consumed by 'add tp' (initial-exec) or __tls_get_addr (dynamic).
+    SDLoc dl(N);
+    SDValue GA = N->getOperand(0);
+    SDValue Slot = SDValue(
+        CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64,
+                               CurDAG->getRegister(IA64::r1, MVT::i64), GA),
+        0);
+    SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other,
+                                        Slot, CurDAG->getEntryNode());
+    ReplaceUses(SDValue(N, 0), SDValue(Ld, 0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case IA64ISD::TLS_TPREL: {
+    // Materialize the local-exec tp-relative offset directly: movl rX =
+    // @tprel(sym). The operand is a TargetGlobalAddress tagged S_TPREL; the
+    // result is added to tp (r13) by the caller.
+    SDLoc dl(N);
+    CurDAG->SelectNodeTo(N, IA64::MOVL_GA, MVT::i64, N->getOperand(0));
+    return;
+  }
+
+  case ISD::JumpTable: {
+    // Materialize a jump table's base address the same way as a global: load it
+    // from its GOT slot (addl @ltoff(.LJTI), gp ;; ld8). BR_JT expands to this
+    // base + scaled index, an LD8 of the (absolute) entry, and a BRIND.
+    int JTI = cast<JumpTableSDNode>(N)->getIndex();
+    SDLoc dl(N);
+    SDValue JT = CurDAG->getTargetJumpTable(JTI, MVT::i64, IA64::S_LTOFF);
+    SDValue Slot = SDValue(
+        CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64,
+                               CurDAG->getRegister(IA64::r1, MVT::i64), JT),
+        0);
+    SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other,
+                                        Slot, CurDAG->getEntryNode());
+    ReplaceUses(SDValue(N, 0), SDValue(Ld, 0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case ISD::ConstantPool: {
+    // Materialize a constant-pool entry's address the same way as a global or
+    // jump table: load it from its GOT slot (addl @ltoff(.LCPI), gp ;; ld8).
+    // The f80 ('long double') immediates that the legalizer spills here are then
+    // loaded with ldfe (the f80 load pattern). (f32/f64 immediates stay out of
+    // the pool -- see isFPImmLegal -- so this path is exercised only by f80.)
+    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+    SDLoc dl(N);
+    SDValue CPA = CurDAG->getTargetConstantPool(
+        CP->getConstVal(), MVT::i64, CP->getAlign(), CP->getOffset(),
+        IA64::S_LTOFF);
+    SDValue Slot = SDValue(
+        CurDAG->getMachineNode(IA64::ADDL_GA, dl, MVT::i64,
+                               CurDAG->getRegister(IA64::r1, MVT::i64), CPA),
+        0);
+    SDNode *Ld = CurDAG->getMachineNode(IA64::LD8, dl, MVT::i64, MVT::Other,
+                                        Slot, CurDAG->getEntryNode());
+    ReplaceUses(SDValue(N, 0), SDValue(Ld, 0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case ISD::BR: {
+    // br bb  ->  (p0) brl.cond bb.  The branch instructions carry an i64imm
+    // target operand (not a tablegen 'bb' operand), so they are hand-selected
+    // rather than pattern-matched, as the pre-removal backend did. The
+    // MachineBasicBlock operand is lowered to the block's symbol by
+    // IA64MCInstLower. Operands: (chain, BasicBlock).
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other, Target, Chain);
+    return;
+  }
+
+  case ISD::BRCOND: {
+    // brcond p, bb  ->  (p) brl.cond bb.  The conditional branch keeps only the
+    // taken edge; the fall-through to the other successor is a separate ISD::BR
+    // (BR_CC/SELECT_CC stay Expand, so the legalizer hands us setcc + brcond).
+    // Operands: (chain, predicate, BasicBlock).
+    SDValue Chain = N->getOperand(0);
+    SDValue Pred = N->getOperand(1);
+    SDValue Target = N->getOperand(2);
+    CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, Pred, Target,
+                         Chain);
+    return;
+  }
+
+  case ISD::BRIND: {
+    // brind addr  ->  mov b6 = addr ;; br.cond.sptk b6  (computed goto).
+    // Move the target address into branch register b6, glued to the branch so
+    // the copy stays adjacent. Operands: (chain, target address).
+    SDLoc dl(N);
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    SDValue Copy =
+        CurDAG->getCopyToReg(Chain, dl, IA64::B6, Target, SDValue());
+    CurDAG->SelectNodeTo(N, IA64::BRINDIRECT, MVT::Other,
+                         CurDAG->getRegister(IA64::B6, MVT::i64),
+                         Copy.getValue(0), Copy.getValue(1));
+    return;
+  }
+
+  case IA64ISD::BRCALL: {
+    // The call hack: LowerCall builds IA64ISD::BRCALL (chain, callee,
+    // arg-reg uses..., [glue]) and leaves the callee as a
+    // Target{GlobalAddress,ExternalSymbol}. A direct call selects to
+    // 'br.call rp = <target>'; the argument-register operands carry through as
+    // the call's (precise) implicit uses. An indirect / function-descriptor
+    // call arrives with the callee already in b6 (a Register operand, set up by
+    // LowerCall) and selects to BRCALL_INDIRECT.
+    SDValue Chain = N->getOperand(0);
+    SDValue Callee = N->getOperand(1);
+
+    // A trailing glue operand, if present, is last; everything between the
+    // callee and it is an argument-register use.
+    unsigned NumOps = N->getNumOperands();
+    SDValue InGlue;
+    if (NumOps && N->getOperand(NumOps - 1).getValueType() == MVT::Glue)
+      InGlue = N->getOperand(--NumOps);
+
+    unsigned Opc;
+    if (Callee.getOpcode() == ISD::TargetGlobalAddress)
+      Opc = IA64::BRCALL_IPREL_GA;
+    else if (Callee.getOpcode() == ISD::TargetExternalSymbol)
+      Opc = IA64::BRCALL_IPREL_ES;
+    else if (Callee.getOpcode() == ISD::Register)
+      // Indirect call: LowerCall already loaded the entry point into b6 (the
+      // Register operand here) and the callee's gp into r1. 'br.call rp = b6'.
+      Opc = IA64::BRCALL_INDIRECT;
+    else
+      report_fatal_error("IA64: unhandled call target");
+
+    // Machine-node operands: (calltarget, arg-reg uses..., chain, [glue]);
+    // results: (chain, glue).
+    SmallVector<SDValue, 12> Ops;
+    Ops.push_back(Callee);
+    for (unsigned i = 2; i < NumOps; ++i)
+      Ops.push_back(N->getOperand(i));
+    Ops.push_back(Chain);
+    if (InGlue.getNode())
+      Ops.push_back(InGlue);
+    CurDAG->SelectNodeTo(N, Opc, MVT::Other, MVT::Glue, Ops);
+    return;
+  }
+
+  case ISD::ATOMIC_CMP_SWAP: {
+    // cmpxchg: move the comparand into ar.ccv, then the size-keyed cmpxchg
+    // (which reads ar.ccv) returns the old word and stores $new on a match.
+    // Operands of the node are (chain, ptr, cmp, new).
+    AtomicSDNode *AN = cast<AtomicSDNode>(N);
+    SDLoc dl(N);
+    SDValue Chain = AN->getChain();
+    SDValue Ptr = AN->getBasePtr();
+    SDValue Cmp = N->getOperand(2);
+    SDValue New = N->getOperand(3);
+
+    unsigned Opc;
+    switch (AN->getMemoryVT().getSimpleVT().SimpleTy) {
+    case MVT::i8:  Opc = IA64::CMPXCHG1; break;
+    case MVT::i16: Opc = IA64::CMPXCHG2; break;
+    case MVT::i32: Opc = IA64::CMPXCHG4; break;
+    case MVT::i64: Opc = IA64::CMPXCHG8; break;
+    default:
+      report_fatal_error("IA64: cannot select a cmpxchg of this type");
+    }
+
+    // The cmpxchg itself is .acq (acquire). For release/seq_cst, prepend a full
+    // fence so prior memory effects are ordered before the swap; the combination
+    // is a correct (conservative) full barrier.
+    if (isReleaseOrStronger(AN->getMergedOrdering()))
+      Chain =
+          SDValue(CurDAG->getMachineNode(IA64::MF, dl, MVT::Other, Chain), 0);
+
+    // mov ar.ccv = cmp, glued to the cmpxchg so it stays immediately before it
+    // (and the ar.ccv physreg def/use is not separated by another writer).
+    SDValue Ccv =
+        SDValue(CurDAG->getMachineNode(IA64::MOV_TO_AR_CCV, dl, MVT::Glue, Cmp),
+                0);
+
+    SDValue Ops[] = {Ptr, New, Chain, Ccv};
+    MachineSDNode *Cas =
+        CurDAG->getMachineNode(Opc, dl, N->getValueType(0), MVT::Other, Ops);
+    CurDAG->setNodeMemRefs(Cas, {AN->getMemOperand()});
+    ReplaceUses(SDValue(N, 0), SDValue(Cas, 0)); // old value
+    ReplaceUses(SDValue(N, 1), SDValue(Cas, 1)); // chain
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case ISD::LOAD: {
+    // Select by the memory type. IA-64 narrow integer loads zero-extend into
+    // the 64-bit GR, which matches zext/any-extend loads; a sign-extending load
+    // (SEXTLOAD) follows the LDx with the matching sxt (handled below). An i1
+    // (bool) load is the compare-against-zero trick (LD1 + cmp.ne, handled
+    // first, below). The address is a single register; a FrameIndex base is
+    // materialized by the FrameIndex case above and resolved by
+    // eliminateFrameIndex.
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    SDValue Chain = LD->getChain();
+    SDValue Address = LD->getBasePtr();
+    SDLoc dl(N);
+
+    // Loading a predicate: a predicate can't be loaded from memory directly, so
+    // load the bool byte and test it != 0 (ld1 ;; cmp.ne dst = byte, r0).
+    if (LD->getMemoryVT() == MVT::i1) {
+      SDNode *Byte = CurDAG->getMachineNode(IA64::LD1, dl, MVT::i64, MVT::Other,
+                                            Address, Chain);
+      SDValue Pred = SDValue(
+          CurDAG->getMachineNode(IA64::CMPNE, dl, MVT::i1, SDValue(Byte, 0),
+                                 CurDAG->getRegister(IA64::r0, MVT::i64)),
+          0);
+      ReplaceUses(SDValue(N, 0), Pred);            // the i1 value
+      ReplaceUses(SDValue(N, 1), SDValue(Byte, 1)); // the chain
+      CurDAG->RemoveDeadNode(N);
+      return;
+    }
+
+    unsigned Opc;
+    switch (LD->getMemoryVT().getSimpleVT().SimpleTy) {
+    case MVT::i8:  Opc = IA64::LD1;  break;
+    case MVT::i16: Opc = IA64::LD2;  break;
+    case MVT::i32: Opc = IA64::LD4;  break;
+    case MVT::i64: Opc = IA64::LD8;  break;
+    case MVT::f32: Opc = IA64::LDF4; break;
+    case MVT::f64: Opc = IA64::LDF8; break;
+    case MVT::f80: Opc = IA64::LDFE; break;
+    default:
+      report_fatal_error("IA64: cannot select a load of this type");
+    }
+    // A sign-extending narrow load: the LDx above zero-extends into the 64-bit
+    // GR, so follow it with the matching sxt to sign-extend. Without this a
+    // signed value (e.g. a negative 'int' used in a signed compare -- a Lua
+    // stack index) is read as a large positive number and the compare goes wrong.
+    if (LD->getExtensionType() == ISD::SEXTLOAD) {
+      unsigned SxtOpc;
+      switch (LD->getMemoryVT().getSimpleVT().SimpleTy) {
+      case MVT::i8:  SxtOpc = IA64::SXT1; break;
+      case MVT::i16: SxtOpc = IA64::SXT2; break;
+      case MVT::i32: SxtOpc = IA64::SXT4; break;
+      default:
+        report_fatal_error("IA64: unexpected sign-extending load width");
+      }
+      SDNode *Ld = CurDAG->getMachineNode(Opc, dl, MVT::i64, MVT::Other,
+                                          Address, Chain);
+      SDNode *Sxt =
+          CurDAG->getMachineNode(SxtOpc, dl, MVT::i64, SDValue(Ld, 0));
+      ReplaceUses(SDValue(N, 0), SDValue(Sxt, 0)); // sign-extended value
+      ReplaceUses(SDValue(N, 1), SDValue(Ld, 1));  // chain
+      CurDAG->RemoveDeadNode(N);
+      return;
+    }
+    CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other, Address, Chain);
+    return;
+  }
+
+  case ISD::STORE: {
+    // Operands: (chain, value, address). A non-truncating store picks ST8/STF8
+    // by the value type; a truncating store picks ST1/2/4 (or STF4) by the
+    // memory type. The address register is handled as for loads above.
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    SDValue Chain = ST->getChain();
+    SDValue Value = ST->getValue();
+    SDValue Address = ST->getBasePtr();
+    SDLoc dl(N);
+
+    // Storing a predicate: a predicate can't be stored to memory directly, so
+    // widen it to a 0/1 GR (the zext-PR sequence) and store one byte (st1).
+    if (Value.getValueType() == MVT::i1) {
+      SDValue Zero = SDValue(
+          CurDAG->getMachineNode(IA64::ADDS, dl, MVT::i64,
+                                 CurDAG->getRegister(IA64::r0, MVT::i64),
+                                 CurDAG->getTargetConstant(0, dl, MVT::i64)),
+          0);
+      SDValue Wide = SDValue(
+          CurDAG->getMachineNode(IA64::TPCADDS, dl, MVT::i64, Zero,
+                                 CurDAG->getTargetConstant(1, dl, MVT::i64),
+                                 Value),
+          0);
+      CurDAG->SelectNodeTo(N, IA64::ST1, MVT::Other, Address, Wide, Chain);
+      return;
+    }
+
+    unsigned Opc;
+    if (!ST->isTruncatingStore()) {
+      switch (Value.getValueType().getSimpleVT().SimpleTy) {
+      case MVT::i64: Opc = IA64::ST8;  break;
+      case MVT::f64: Opc = IA64::STF8; break;
+      case MVT::f32: Opc = IA64::STF4; break;
+      case MVT::f80: Opc = IA64::STFE; break;
+      default:
+        report_fatal_error("IA64: cannot select a store of this type");
+      }
+    } else {
+      switch (ST->getMemoryVT().getSimpleVT().SimpleTy) {
+      case MVT::i8:  Opc = IA64::ST1;  break;
+      case MVT::i16: Opc = IA64::ST2;  break;
+      case MVT::i32: Opc = IA64::ST4;  break;
+      // NB: FP truncating stores are set to Expand in IA64TargetLowering --
+      // stfs/stf8 do not round, so they must become an explicit fpround
+      // (FNORMS/FNORMD) plus a same-size store before reaching the selector.
+      default:
+        report_fatal_error("IA64: cannot select a truncating store of this type");
+      }
+    }
+    // ST* operands are (dstPtr, value): address first, then the stored value.
+    CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Value, Chain);
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+// Implement addressing-mode selection for inline-asm memory operands. IA-64
+// loads and stores dereference a single register with no displacement
+// ('[rN]'), so for the 'm' (and equivalent 'o') constraint the address operand
+// is passed straight through as one register; IA64AsmPrinter::
+// PrintAsmMemoryOperand then prints it as '[rN]'.
+bool IA64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
+    std::vector<SDValue> &OutOps) {
+  // Anything other than the plain memory constraints 'm' / 'o' is rejected;
+  // note the inverted sense of the result: true means the operand was NOT
+  // selected, false means OutOps now holds it.
+  if (ConstraintID != InlineAsm::ConstraintCode::m &&
+      ConstraintID != InlineAsm::ConstraintCode::o)
+    return true;
+  OutOps.push_back(Op); // the address is already a single register
+  return false;
+}
+
+/// createIA64ISelDag - Return a newly allocated IA64DAGToDAGISelLegacy pass,
+/// which converts a legalized DAG into an IA64-specific DAG for scheduling.
+FunctionPass *llvm::createIA64ISelDag(TargetMachine &TM) {
+  return new IA64DAGToDAGISelLegacy(TM);
+}
diff --git llvm/lib/Target/IA64/IA64ISelLowering.cpp llvm/lib/Target/IA64/IA64ISelLowering.cpp
new file mode 100644
index 000000000000..26f13554d7cf
--- /dev/null
+++ llvm/lib/Target/IA64/IA64ISelLowering.cpp
@@ -0,0 +1,1181 @@
+//===-- IA64ISelLowering.cpp - IA64 DAG Lowering Implementation -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64TargetLowering class.
+//
+// Scope note: LowerFormalArguments / LowerReturn (Stage 1) and LowerCall
+// (Stage C) are implemented for the integer, direct-call ABI that fib needs:
+// args in r32-r39 (incoming) / out0-out7 (outgoing), return in r8, gp/sp/rp
+// saved around calls; indirect calls go through the function descriptor
+// (entry point into b6, callee gp into r1). TLS: see LowerGlobalTLSAddress.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64ISelLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64RegisterInfo.h"
+#include "MCTargetDesc/IA64MCAsmInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// A floating-point scalar that is not long double (f64; f32 was promoted to
+// f64 earlier) occupies exactly one parameter slot. IA-64's parameter model is
+// positional: every argument, integer or FP, consumes a slot in one shared
+// sequence -- the first eight slots map to r32-r39 (incoming) / out0-out7
+// (outgoing), the rest to 8-byte stack slots. A *fixed* FP value travels in the
+// next floating-point register F8-F15, but it must still RESERVE its general
+// parameter slot so a following integer argument keeps its positional slot.
+//
+// CCAssignToRegWithShadow cannot express this: it shadows the GR at the *FP
+// register's* index, so the first FP arg always shadows r32 no matter how many
+// integers preceded it, never reserving the slot the FP arg actually occupies.
+// A trailing integer then reused that slot's register -- e.g. the long long in
+// _testfunc_q_bhilfdq(b,h,i,l,f,d,q) landed in the float's slot and read back
+// the float's bit pattern instead of q.
+//
+// A *variadic* FP arg ('...' match) is passed in a general register in memory
+// format: a prototyped variadic callee reads its variable arguments out of the
+// integer parameter slots / register save area, never F8-F15 (psABI 8.5.4). It
+// is bit-cast to its i64 IEEE pattern (getf.d, the BCvt in LowerCall) and put
+// in the next slot register. SlotRegs is r32-r39 (incoming) / out0-out7 (call).
+static bool CC_IA64_FP_Common(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State,
+                              ArrayRef<MCPhysReg> SlotRegs) {
+  static const MCPhysReg FPRegs[] = {IA64::F8,  IA64::F9,  IA64::F10, IA64::F11,
+                                     IA64::F12, IA64::F13, IA64::F14, IA64::F15};
+  // Fixed or variadic, the value first claims the next positional parameter
+  // slot. A zero register means every entry of SlotRegs is already taken.
+  const unsigned SlotReg = State.AllocateReg(SlotRegs);
+  if (ArgFlags.isVarArg()) {
+    // Variadic: passed as its i64 bit pattern (BCvt) in the slot GR or stack.
+    CCValAssign Loc =
+        SlotReg ? CCValAssign::getReg(ValNo, ValVT, SlotReg, MVT::i64,
+                                      CCValAssign::BCvt)
+                : CCValAssign::getMem(ValNo, ValVT,
+                                      State.AllocateStack(8, Align(8)),
+                                      MVT::i64, CCValAssign::BCvt);
+    State.addLoc(Loc);
+    return true;
+  }
+  // Fixed: the slot's general register is only reserved; the value itself
+  // travels in the next FP argument register, else in an 8-byte stack slot.
+  CCValAssign Loc =
+      SlotReg ? CCValAssign::getReg(ValNo, ValVT, State.AllocateReg(FPRegs),
+                                    LocVT, CCValAssign::Full)
+              : CCValAssign::getMem(ValNo, ValVT,
+                                    State.AllocateStack(8, Align(8)), LocVT,
+                                    CCValAssign::Full);
+  State.addLoc(Loc);
+  return true;
+}
+
+// Incoming f64/f32: the parameter slots are the incoming stacked GP registers.
+static bool CC_IA64_FP(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo /*LocInfo*/,
+                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  // One stacked input register per parameter slot, in slot order.
+  static const MCPhysReg InSlots[] = {IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+                                      IA64::r36, IA64::r37, IA64::r38, IA64::r39};
+  return CC_IA64_FP_Common(ValNo, ValVT, LocVT, ArgFlags, State, InSlots);
+}
+
+// Outgoing f64/f32: the parameter slots are the output registers out0-out7.
+static bool CC_IA64_Call_FP(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo /*LocInfo*/,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg OutSlots[] = {
+      IA64::out0, IA64::out1, IA64::out2, IA64::out3,
+      IA64::out4, IA64::out5, IA64::out6, IA64::out7};
+  return CC_IA64_FP_Common(ValNo, ValVT, LocVT, ArgFlags, State, OutSlots);
+}
+
+// Assign one f80 'long double' argument. Being 16 bytes it occupies TWO
+// parameter slots starting on an even slot (Next-Even, psABI 8.5.1). Named: one
+// FP register F8-F15 while its slot pair lies in the first eight slots, else
+// the stack. Variadic: memory format, one i64 per slot. ShadowRegs is r32-r39
+// (incoming) or out0-out7 (outgoing). Always assigns, so always returns true.
+static bool CC_IA64_F80_Common(unsigned ValNo, MVT ValVT, MVT LocVT,
+                               ISD::ArgFlagsTy ArgFlags, CCState &State,
+                               ArrayRef<MCPhysReg> ShadowRegs) {
+  static const MCPhysReg FPRegs[] = {IA64::F8,  IA64::F9,  IA64::F10, IA64::F11,
+                                     IA64::F12, IA64::F13, IA64::F14, IA64::F15};
+  // Next-Even (psABI 8.5.1, Table 8-3): for the first eight slots the slot
+  // index equals the shadow-GR index, so if the next free shadow GR is odd,
+  // burn it as a padding slot (no later parameter reuses it). Beyond the
+  // register slots, Align(16) enforces the same alignment on the stack.
+  unsigned NextSlot = State.getFirstUnallocated(ShadowRegs);
+  if (NextSlot < ShadowRegs.size() && (NextSlot & 1))
+    State.AllocateReg(ShadowRegs);
+
+  if (ArgFlags.isVarArg()) {
+    // A variadic long double is passed in the *general* registers in memory
+    // format (psABI 8.5), occupying two parameter slots; spill into the stack
+    // image if the registers are exhausted. Emit two i64 part-locations (this
+    // value gets two CCValAssigns); LowerCall splits the f80 into the two
+    // memory-format halves via an stfe/ld8 temporary. The first stack part is
+    // 16-byte aligned to keep the Next-Even policy on the stack.
+    for (int Part = 0; Part < 2; ++Part) {
+      if (unsigned Reg = State.AllocateReg(ShadowRegs))
+        State.addLoc(CCValAssign::getReg(ValNo, MVT::i64, Reg, MVT::i64,
+                                         CCValAssign::Full));
+      else
+        State.addLoc(CCValAssign::getMem(
+            ValNo, MVT::i64,
+            State.AllocateStack(8, Align(Part == 0 ? 16 : 8)), MVT::i64,
+            CCValAssign::Full));
+    }
+    return true;
+  }
+  // Only the first eight parameter slots are register-passed. With no slot
+  // pair left (eight slots used, or slot 7 just burned as padding) the value
+  // goes on the stack even if an FP register is free (e.g. after 8 integers).
+  if (State.getFirstUnallocated(ShadowRegs) < ShadowRegs.size()) {
+    if (unsigned FReg = State.AllocateReg(FPRegs)) {
+      State.AllocateReg(ShadowRegs); // the (even-aligned) first slot ...
+      State.AllocateReg(ShadowRegs); // ... and the second slot of the pair
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, CCValAssign::Full));
+      return true;
+    }
+  }
+  // No slot pair (or no FP register) left: pass the 16-byte value on the stack.
+  unsigned Off = State.AllocateStack(16, Align(16));
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, CCValAssign::Full));
+  return true;
+}
+
+// Incoming f80: shadow the incoming stacked GP registers r32-r39.
+static bool CC_IA64_F80(unsigned ValNo, MVT ValVT, MVT LocVT,
+                        CCValAssign::LocInfo /*LocInfo*/,
+                        ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  // The slots shadowed on the callee side are the stacked inputs r32-r39.
+  static const MCPhysReg InSlots[] = {IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+                                      IA64::r36, IA64::r37, IA64::r38, IA64::r39};
+  return CC_IA64_F80_Common(ValNo, ValVT, LocVT, ArgFlags, State, InSlots);
+}
+
+// Outgoing f80: shadow the output registers out0-out7.
+static bool CC_IA64_Call_F80(unsigned ValNo, MVT ValVT, MVT LocVT,
+                             CCValAssign::LocInfo /*LocInfo*/,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg OutSlots[] = {
+      IA64::out0, IA64::out1, IA64::out2, IA64::out3,
+      IA64::out4, IA64::out5, IA64::out6, IA64::out7};
+  return CC_IA64_F80_Common(ValNo, ValVT, LocVT, ArgFlags, State, OutSlots);
+}
+
+#include "IA64GenCallingConv.inc"
+
+// Declares the register classes and per-type legalization actions for IA-64.
+IA64TargetLowering::IA64TargetLowering(const TargetMachine &TM,
+                                       const TargetSubtargetInfo &STI)
+    : TargetLowering(TM, STI) {
+  // Register classes: general (i64), floating-point (f32/f64/f80 = long double)
+  // and predicate (i1). f80 is the 80-bit double-extended C 'long double',
+  // held natively in the 82-bit FP registers (memory format via ldfe/stfe).
+  addRegisterClass(MVT::i64, &IA64::GRRegClass);
+  addRegisterClass(MVT::f32, &IA64::FPRegClass);
+  addRegisterClass(MVT::f64, &IA64::FPRegClass);
+  addRegisterClass(MVT::f80, &IA64::FPRegClass);
+  addRegisterClass(MVT::i1, &IA64::PRRegClass);
+
+  // IA-64 uses SELECT, not SELECT_CC, and has no native BR_CC / jump tables.
+  setOperationAction(ISD::BRIND, MVT::Other, Legal);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  // BR_CC / SELECT_CC must be keyed by the *compare operand* value type, not
+  // MVT::Other. The DAGCombiner folds brcond(setcc) -> br_cc whenever BR_CC is
+  // legal-or-custom for that operand type (DAGCombiner::visitBRCOND), and the
+  // legalizer likewise queries getOperationAction by the operand type. The
+  // pre-removal backend used MVT::Other, which was right for the LLVM 2.6
+  // legalizer but is now a dead no-op -- it left BR_CC/i64 at its Legal default,
+  // so brcond(setcc) got folded into an unselectable br_cc. Marking i64 Expand
+  // keeps brcond(setcc) intact, which is exactly what our setcc (CMP*) patterns
+  // and the hand-selected BRCOND consume. (Sparc keys these by operand type
+  // too; it only differs in Custom-lowering them, having native cc-branches.)
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f80, Expand);
+
+  // FP compares: keep brcond(setcc f64) from folding into an unselectable
+  // br_cc, so the legalizer hands us setcc + brcond. setcc f64 selects to the
+  // fcmp relations (FCMP* in IA64InstrInfo.td), which cover every clang FP
+  // condition except SETONE/SETUEQ; expand those into a pair joined by the i1
+  // and/or patterns.
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  // f80 ('long double') compares select to the same fcmp relations.
+  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f80, Expand);
+  // ...and so do f32 compares (fcmp looks at the full register-format value,
+  // so single precision needs no separate compare path).
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+
+  // Comparing two predicates (i1): keep br_cc/select_cc as setcc + brcond/select,
+  // and custom-lower the i1 setcc to predicate logic (eq/ne -> xnor/xor).
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SETCC, MVT::i1, Custom);
+  // ...but mark the i1 eq/ne conditions Expand so the combiner's rebuildSetCC
+  // does not turn our lowered xor back into an i1 setcc (an infinite loop, since
+  // that setcc is Custom-lowered to the same xor again).
+  setCondCodeAction(ISD::SETEQ, MVT::i1, Expand);
+  setCondCodeAction(ISD::SETNE, MVT::i1, Expand);
+
+  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FDIV, MVT::f32, Expand);
+  setOperationAction(ISD::FDIV, MVT::f64, Expand);
+  // f80 ('long double') has no inline divide/remainder; use the libcall
+  // (__divxf3 / fmodl). fadd/fsub/fmpy/fma are native (FADD etc.).
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
+  setOperationAction(ISD::FDIV, MVT::f80, Expand);
+
+  // FP truncating stores must round first. stfs/stf8 emit fp_fr_to_mem_format,
+  // which *assumes the FR was already rounded* to the destination precision --
+  // they do not round themselves. So storing an unrounded wider value as a
+  // narrower one would just slice its bits and corrupt the result. Expanding
+  // these turns a truncstore into an explicit fpround (FNORMS/FNORMD) followed
+  // by a same-size store of the now-rounded value, and also stops DAGCombiner
+  // from re-merging store(fpround x) back into a single truncating store.
+  // (The load direction needs no dual: ldfs/ldf8 always widen correctly.)
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f64, Expand);
+
+  // IA-64 has no native half (f16). Convert to/from f16 via the soft-float
+  // libcalls (__truncsfhf2/__extendhfsf2 etc.) and never load/store f16 as an
+  // extended/truncated FP value -- it is handled as i16 bits. Mirrors SPARC.
+  // (f128 needs no such setup: with no f128 register class it is soft-floated
+  // to the default libgcc __*tf3 libcalls.)
+  for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
+    setOperationAction(ISD::FP_TO_FP16, VT, Expand);
+    setOperationAction(ISD::FP16_TO_FP, VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+    setTruncStoreAction(VT, MVT::f16, Expand);
+  }
+
+  // We don't support sin/cos/sqrt/pow (expand to libcalls: sinl/cosl/sqrtl/...).
+  for (MVT VT : {MVT::f32, MVT::f64, MVT::f80}) {
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    // FIXME: IA64 supports fcopysign natively.
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+  }
+
+  // IA-64 has a native population count (popcnt); select ctpop directly.
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+  // ctlz/cttz have no direct instruction; let the legalizer expand them (now
+  // cheaply, in terms of the legal ctpop above).
+  setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  // FIXME: IA64 has this (mux @rev), but it is not implemented.
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Use toolchain built-in for integer division
+  for (unsigned Op : {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::UDIVREM,
+                      ISD::SDIVREM})
+    setOperationAction(Op, MVT::i64, Expand);
+
+  // No single instruction yields both halves of a 64x64 product; expand into a
+  // separate low MUL and a high MULHU/MULHS (both of which we select).
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+  // 128-bit shifts (i128, e.g. `core`'s checked_shl) legalize to a *_PARTS node
+  // over an i64 register pair. We have no instruction for that; mark them Expand
+  // so the integer legalizer emits the libgcc libcall (__ashlti3/__ashrti3/
+  // __lshrti3) instead, matching how we already handle 128-bit divide/modulo.
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+
+  // Thread-local addresses are lowered per TLS model (see LowerGlobalTLSAddress);
+  // there is no generic expansion, so it must be Custom.
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+  // va_start points the va_list at the register save area (custom); va_arg,
+  // va_copy and va_end use the generic load/increment/store expansion. The
+  // va_list is a plain pointer, so the default va_copy/va_end suffice.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // Atomics. An aligned <=8-byte ld/st is atomic on the hardware, but LLVM
+  // represents an atomic access as a distinct node (ISD::ATOMIC_LOAD/STORE)
+  // that the selector won't turn into ld8/st8 on its own. Custom-lower them to
+  // a plain load/store carrying the same (atomic) memory operand; see
+  // LowerOperation. Ordering is handled separately: shouldInsertFencesForAtomic
+  // asks AtomicExpand to wrap stronger orderings with fences and demote the
+  // access to monotonic, so the only atomic load/store we ever lower here is
+  // monotonic. The fences become ISD::ATOMIC_FENCE, selected to 'mf'.
+  //
+  // Only the legal integer type i64 is marked Custom: a narrow (i8/i16/i32)
+  // atomic load/store has an illegal type and is first widened by the *type*
+  // legalizer (PromoteIntRes_Atomic0 / PromoteIntOp_ATOMIC_STORE) to an i64
+  // access carrying the narrow memory VT, which then reaches LowerOperation as
+  // an i64 Custom op. Marking the narrow types Custom instead would divert type
+  // legalization into ReplaceNodeResults (which we do not implement) and abort.
+  setMaxAtomicSizeInBitsSupported(64);
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
+
+  setStackPointerRegisterToSaveRestore(IA64::r12);
+
+  // The pre-removal backend reported a Log2 function alignment of 5, i.e. a
+  // 32-byte alignment ('.align 32' in the reference output).
+  setMinFunctionAlignment(Align(32));
+
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  // Note: addLegalFPImmediate(0/±1), called here by the pre-removal backend,
+  // no longer exists; FP-immediate legality is the isFPImmLegal override.
+}
+
+const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  // Stringify the enumerator so each printed name cannot drift from its case.
+#define IA64_NODE_NAME(NODE)                                                   \
+  case IA64ISD::NODE:                                                          \
+    return "IA64ISD::" #NODE;
+  switch (Opcode) {
+    IA64_NODE_NAME(GETFD)
+    IA64_NODE_NAME(BRCALL)
+    IA64_NODE_NAME(RET_FLAG)
+    IA64_NODE_NAME(TLS_TPREL)
+    IA64_NODE_NAME(TLS_GOTLOAD)
+  default:
+    return nullptr; // no name is listed here for this opcode
+  }
+#undef IA64_NODE_NAME
+}
+
+EVT IA64TargetLowering::getSetCCResultType(const DataLayout & /*DL*/,
+                                           LLVMContext & /*Context*/,
+                                           EVT /*VT*/) const {
+  // Whatever the operand type, a compare defines a predicate register (i1).
+  return EVT(MVT::i1);
+}
+
+bool IA64TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction & /*MF*/,
+                                                    EVT VT) const {
+  // Whitelist of widths with a single-rounding hardware FMA (fma.s/fma.d/fma);
+  // f16/f128 are soft-floated and must not have fmul+fadd contracted to fma.
+  for (MVT FusedVT : {MVT::f32, MVT::f64, MVT::f80})
+    if (VT == FusedVT)
+      return true;
+  return false;
+}
+
+bool IA64TargetLowering::isFPImmLegal(const APFloat & /*Imm*/, EVT VT,
+                                      bool /*ForCodeSize*/) const {
+  // f32/f64 constants stay out of the constant pool: they are materialised
+  // from their integer bit pattern (movl + setf.d; see the fpimm patterns in
+  // IA64InstrInfo.td). An 80-bit f80 literal cannot come from one 64-bit movl,
+  // so it is reported illegal and loaded from the constant pool by ldfe.
+  const bool BuiltFromMovl = VT == MVT::f32 || VT == MVT::f64;
+  return BuiltFromMovl;
+}
+
+SDValue IA64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Ask the incoming convention (CC_IA64) for one location per argument value.
+  SmallVector<CCValAssign, 16> Locs;
+  CCState CCInfo(CallConv, isVarArg, MF, Locs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_IA64);
+
+  for (CCValAssign &VA : Locs) {
+    const MVT ValVT = VA.getValVT();
+    if (!VA.isRegLoc()) {
+      // Stack argument. Per the psABI (§8.5.3) parameter slot 8 is at sp+16,
+      // slot 9 at sp+24, and so on; the 16-byte scratch area sits below, at
+      // [sp, sp+16). Variadic functions are no different: their register home
+      // area is carved out of *this* frame and the scratch area rather than
+      // reserved by the caller (see the variadic spill code further down).
+      assert(VA.isMemLoc() && "unexpected argument location");
+      int SlotFI = MFI.CreateFixedObject(8, 16 + VA.getLocMemOffset(),
+                                         /*IsImmutable=*/true);
+      SDValue SlotAddr =
+          DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
+      InVals.push_back(
+          DAG.getLoad(ValVT, dl, Chain, SlotAddr, MachinePointerInfo()));
+      continue;
+    }
+
+    // Register argument: the location type decides the register class.
+    const MVT RegVT = VA.getLocVT();
+    const bool IsFPLoc =
+        RegVT == MVT::f32 || RegVT == MVT::f64 || RegVT == MVT::f80;
+    if (RegVT != MVT::i64 && !IsFPLoc)
+      report_fatal_error("IA64: unhandled formal-argument register type");
+    const TargetRegisterClass *RC =
+        IsFPLoc ? &IA64::FPRegClass : &IA64::GRRegClass;
+    Register VReg = MRI.createVirtualRegister(RC);
+    MRI.addLiveIn(VA.getLocReg(), VReg);
+    SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+    // A value widened to fill its register is narrowed back to the declared
+    // type: TRUNCATE for integers, FP_ROUND otherwise.
+    if (RegVT != ValVT) {
+      if (RegVT.isInteger())
+        Arg = DAG.getNode(ISD::TRUNCATE, dl, ValVT, Arg);
+      else
+        Arg = DAG.getNode(ISD::FP_ROUND, dl, ValVT, Arg,
+                          DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+    }
+    InVals.push_back(Arg);
+  }
+
+  // 'alloc' (which captures the caller's ar.pfs) and its restore are emitted
+  // entirely by frame lowering into a reserved stacked local (see
+  // IA64FrameLowering::emitPrologue), so a fixed-arity function is done here.
+  if (!isVarArg)
+    return Chain;
+
+  // Variadic function: store the unnamed incoming GP registers to their
+  // parameter-slot memory homes, so va_start/va_arg see every variadic
+  // argument as one contiguous in-memory image. Per the psABI (§8.5.4) the
+  // callee spills in6/in7 into the 16-byte scratch area at [sp, sp+16) and
+  // in0-in5 into up to 48 bytes at the base of its own frame, just below sp.
+  // Parameter slot i therefore lives at 8*i - 48 from the incoming sp:
+  // slot6 -> sp+0, slot7 -> sp+8, slot8 (first stack arg) -> sp+16, slot9 ->
+  // sp+24, ... A va_list is just an ascending pointer, so it walks from the
+  // register homes straight on into the caller's memory arguments.
+  // (CreateFixedObject offsets are relative to the incoming sp; negative
+  // offsets land in this frame, which PrologEpilogInserter sizes to cover.
+  // Storing the registers also marks them used, so frame lowering's 'alloc'
+  // keeps all eight incoming GP registers live as locals.)
+  static const MCPhysReg ArgGPRs[] = {IA64::r32, IA64::r33, IA64::r34,
+                                      IA64::r35, IA64::r36, IA64::r37,
+                                      IA64::r38, IA64::r39};
+  const unsigned FirstVar = CCInfo.getFirstUnallocated(ArgGPRs);
+  int VAFI = 0;
+  SmallVector<SDValue, 8> Stores;
+  for (unsigned Slot = FirstVar; Slot < 8; ++Slot) {
+    int Home = MFI.CreateFixedObject(8, 8 * static_cast<int>(Slot) - 48,
+                                     /*IsImmutable=*/false);
+    if (Slot == FirstVar)
+      VAFI = Home; // va_start points at the first unnamed slot's home
+    Register VReg = MRI.createVirtualRegister(&IA64::GRRegClass);
+    MRI.addLiveIn(ArgGPRs[Slot], VReg);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+    SDValue HomeAddr = DAG.getFrameIndex(Home, MVT::i64);
+    Stores.push_back(DAG.getStore(Val.getValue(1), dl, Val, HomeAddr,
+                                  MachinePointerInfo::getFixedStack(MF, Home)));
+  }
+  // With all eight GP slots named there are no register varargs, and va_start
+  // points at the first unnamed stack slot instead. That is slot 8 (sp+16)
+  // only if no *named* argument lives on the stack; when the prototype names
+  // parameters beyond the eight register slots (e.g. Links' input_field: 8
+  // register params + 4 named stack args + ...), the unnamed ones start after
+  // them, at sp + 16 + <bytes of named stack args>. CCInfo.getStackSize() is
+  // exactly that byte count (the formals were analyzed just above).
+  if (FirstVar == 8)
+    VAFI = MFI.CreateFixedObject(8, 16 + CCInfo.getStackSize(),
+                                 /*IsImmutable=*/true);
+  MF.getInfo<IA64FunctionInfo>()->setVarArgsFrameIndex(VAFI);
+  if (Stores.empty())
+    return Chain;
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+}
+
+SDValue IA64TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &dl = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool isVarArg = CLI.IsVarArg;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Lowers one outgoing call to an IA64ISD::BRCALL node; no tail calls yet.
+  CLI.IsTailCall = false;
+
+  // Assign the outgoing arguments to out0-out7 / F8-F15 (caller convention).
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CC_IA64_Call);
+
+  // A 16-byte scratch area sits at the bottom of the outgoing frame; keep the
+  // whole thing 16-byte aligned. Stack-passed arguments begin at sp+16 (psABI
+  // §8.5.3), variadic or not: the variadic register-home spill area is built by
+  // the callee out of its own frame and the scratch area, not reserved here
+  // (see LowerFormalArguments).
+  unsigned NumBytes = (CCInfo.getStackSize() + 16 + 15) & ~15u;
+
+  // Record how many output registers this call needs; the prologue 'alloc'
+  // sizes its output region from the max over all of the function's calls.
+  // Count the actually-allocated out registers rather than the argument count:
+  // an FP argument shadows (consumes) its parameter slot(s) without occupying an
+  // out register for the value, while a long double (f80) shadows *two* out
+  // slots -- so a trailing integer arg can land in a higher out register than
+  // the plain argument count would suggest.
+  static const MCPhysReg OutRegs[] = {IA64::out0, IA64::out1, IA64::out2,
+                                      IA64::out3, IA64::out4, IA64::out5,
+                                      IA64::out6, IA64::out7};
+  unsigned NumOutRegs = 0;
+  for (unsigned i = 0; i < 8; ++i)
+    if (CCInfo.isAllocated(OutRegs[i]))
+      NumOutRegs = i + 1;
+  IA64FunctionInfo *FInfo = MF.getInfo<IA64FunctionInfo>();
+  FInfo->OutRegsUsed = std::max(FInfo->OutRegsUsed, NumOutRegs);
+
+  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+
+  // An indirect callee is a function pointer: not a GlobalAddress/ExternalSymbol
+  // but an ordinary i64 value pointing at a function descriptor { entry, gp }.
+  // Read the descriptor here, while Chain is still a plain (unglued) chain and
+  // before the gp save below latches the caller's r1; the entry point and the
+  // callee's gp are installed into b6 / r1 just before the call further down.
+  bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
+                    !isa<ExternalSymbolSDNode>(Callee);
+  SDValue EntryPoint, NewGp;
+  if (IsIndirect) {
+    EntryPoint = DAG.getLoad(MVT::i64, dl, Chain, Callee, MachinePointerInfo());
+    Chain = EntryPoint.getValue(1);
+    SDValue GpAddr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee,
+                                 DAG.getIntPtrConstant(8, dl));
+    NewGp = DAG.getLoad(MVT::i64, dl, Chain, GpAddr, MachinePointerInfo());
+    Chain = NewGp.getValue(1);
+  }
+
+  // Collect the (out-register, value) pairs to copy in just before the call,
+  // and the stores for any arguments that overflow onto the outgoing stack.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // Index by ValNo, not i: a variadic long double maps one argument value to
+    // two consecutive parameter-slot locations (see below), after which i and
+    // the argument number diverge.
+    SDValue Arg = OutVals[VA.getValNo()];
+
+    // By-value aggregate argument. The psABI passes aggregates by value; the
+    // frontend models this as a `byval` pointer to the caller's object and
+    // expects the callee to receive a pointer to a *private copy*. We currently
+    // realize that copy here (the callee then dereferences the pointer as usual)
+    // rather than flattening the aggregate into parameter slots/GRs -- that full
+    // ABI is still TODO (see struct-value-abi.md). The copy is mandatory: without
+    // it the argument aliases caller memory, and a callee that mutates or frees
+    // that memory corrupts the caller. Concretely, glibc regex's re_dfa_add_node
+    // takes an re_token_t by value and `realloc`s the very dfa->nodes array a
+    // by-value `dfa->nodes[org_idx]` argument points into -- so the un-copied
+    // pointer dangled into the freed block and read back garbage.
+    ISD::ArgFlagsTy Flags = Outs[VA.getValNo()].Flags;
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      if (Size != 0) {
+        Align ByValAlign = Flags.getNonZeroByValAlign();
+        int FI = MF.getFrameInfo().CreateStackObject(Size, ByValAlign, false);
+        SDValue Copy = DAG.getFrameIndex(FI, MVT::i64);
+        SDValue MemcpyChain = DAG.getMemcpy(
+            Chain, dl, Copy, Arg, DAG.getIntPtrConstant(Size, dl), ByValAlign,
+            /*isVol=*/false, /*AlwaysInline=*/false, /*CI=*/nullptr,
+            /*OverrideTailCall=*/std::nullopt,
+            MachinePointerInfo::getFixedStack(MF, FI), MachinePointerInfo());
+        // Order the copy before the call (alongside the other arg stores).
+        MemOpChains.push_back(MemcpyChain);
+        Arg = Copy; // pass the private copy's address per VA below
+      }
+    }
+
+    // Variadic long double (f80): the CC gave it two consecutive i64 slots --
+    // this location and the next, both tagged with the same ValNo. It is passed
+    // in memory format (psABI 8.5).
+    if (i + 1 < e && ArgLocs[i + 1].getValNo() == VA.getValNo()) {
+      CCValAssign &VAHi = ArgLocs[i + 1];
+
+      // Both halves land on the outgoing stack: store the long double straight
+      // to its parameter slot with stfe (memory format) -- no register
+      // round-trip. (The two slots are adjacent, so one 10-byte stfe covers the
+      // significant bytes; the callee's va_arg reads it back with ldfe.) The
+      // spill-and-reload path below would only DAGCombine down to this if the
+      // combiner forwarded an f80 store into i64 loads, which it does not.
+      if (VA.isMemLoc() && VAHi.isMemLoc()) {
+        unsigned Off = 16 + VA.getLocMemOffset(); // psABI: slot 8 at sp+16
+        SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                                   DAG.getRegister(IA64::r12, MVT::i64),
+                                   DAG.getIntPtrConstant(Off, dl));
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, Addr,
+                                           MachinePointerInfo::getStack(MF, Off)));
+        ++i; // consumed both part-locations
+        continue;
+      }
+
+      // At least one half goes in a general register: spill to a 16-byte
+      // temporary with stfe and reload the two 8-byte memory-format halves
+      // (ld8) into the assigned slots -- the in-memory image the callee's
+      // va_arg reconstructs with ldfe. (There is no register instruction to
+      // extract the 80-bit *memory* format into GRs, so the spill is required.)
+      int FI = MF.getFrameInfo().CreateStackObject(16, Align(16), false);
+      SDValue Tmp = DAG.getFrameIndex(FI, MVT::i64);
+      SDValue St = DAG.getStore(Chain, dl, Arg, Tmp,
+                                MachinePointerInfo::getFixedStack(MF, FI));
+      SDValue HiAddr = DAG.getNode(ISD::ADD, dl, MVT::i64, Tmp,
+                                   DAG.getIntPtrConstant(8, dl));
+      SDValue Half[2] = {
+          DAG.getLoad(MVT::i64, dl, St, Tmp,
+                      MachinePointerInfo::getFixedStack(MF, FI)),
+          DAG.getLoad(MVT::i64, dl, St, HiAddr,
+                      MachinePointerInfo::getFixedStack(MF, FI, 8))};
+      // Order the spill/reload before the call.
+      MemOpChains.push_back(Half[0].getValue(1));
+      MemOpChains.push_back(Half[1].getValue(1));
+      for (unsigned Part = 0; Part < 2; ++Part) {
+        CCValAssign &PVA = ArgLocs[i + Part];
+        if (PVA.isRegLoc()) {
+          RegsToPass.push_back(std::make_pair(PVA.getLocReg(), Half[Part]));
+        } else {
+          unsigned Off = 16 + PVA.getLocMemOffset(); // psABI: slot 8 at sp+16
+          SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                                     DAG.getRegister(IA64::r12, MVT::i64),
+                                     DAG.getIntPtrConstant(Off, dl));
+          MemOpChains.push_back(DAG.getStore(
+              Chain, dl, Half[Part], Addr, MachinePointerInfo::getStack(MF, Off)));
+        }
+      }
+      ++i; // consumed both part-locations
+      continue;
+    }
+
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      // A variadic FP arg routed into a GR slot: reinterpret the f64 as its
+      // i64 IEEE bit pattern (selects to getf.d). See CC_IA64_FP_Common.
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::FPExt:
+      Arg = DAG.getNode(ISD::FP_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    default:
+      report_fatal_error("IA64: unhandled argument CCValAssign");
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Arguments beyond out0-out7 are passed on the outgoing stack, just above
+      // the 16-byte scratch area: parameter slot 8 at sp+16, slot 9 at sp+24,
+      // ... (psABI §8.5.3) -- the same layout LowerFormalArguments reads
+      // incoming stack arguments from. The store is sp-relative: with a reserved
+      // call frame (no variable-sized objects) sp is constant here; otherwise
+      // the call-frame pseudos adjust it around the call.
+      assert(VA.isMemLoc() && "argument neither in register nor on the stack");
+      unsigned Off = 16 + VA.getLocMemOffset();
+      SDValue StackPtr = DAG.getRegister(IA64::r12, MVT::i64);
+      SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr,
+                                 DAG.getIntPtrConstant(Off, dl));
+      MemOpChains.push_back(DAG.getStore(
+          Chain, dl, Arg, Addr, MachinePointerInfo::getStack(MF, Off)));
+    }
+  }
+
+  // Sequence all the outgoing-argument stores before the call.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Save gp/sp around the call. br.call may transfer into another load module
+  // (so the caller's gp must be reinstalled afterwards) and sp is restored
+  // conservatively. These reads must precede the call and the restores must
+  // follow it, so the whole save -> args -> call -> restore sequence is tied
+  // together with glue. Use the glue-carrying getCopyFromReg overload even for
+  // the first save (with a null input glue): it still gives the node a glue
+  // *result* to start the chain. The plain 4-operand form has no glue result,
+  // so reading getValue(2) off it would be out of range.
+  //
+  // We deliberately do NOT save/restore rp (b0) per call here. br.call does
+  // overwrite b0, but frame lowering already parks the incoming rp once in a
+  // stacked local for the whole function (IA64FrameLowering::emitPrologue) and
+  // the epilogue restores b0 from it, so our br.ret returns correctly no matter
+  // how many calls clobber rp in between -- the per-call save was redundant.
+  // Worse, it was actively wrong: rp is a member of the GR class (so that
+  // 'mov rN = rp' works), the save value was live across the call and coalesced
+  // into the physical rp, and the spiller then spilled it with a plain
+  // 'st8 [slot] = rp' / 'ld8 rp = [slot]'. That is illegal -- st8/ld8 require a
+  // general register, not the branch register b0 -- and gas rejects it
+  // ("Operand N of st8/ld8 should be a general register"). The only place rp is
+  // still read around a call is the returns_twice path below, where it is parked
+  // into the CSR r7 *before* the call and so is never live across it as b0.
+  SDValue InGlue;
+  SDValue GPSave = DAG.getCopyFromReg(Chain, dl, IA64::r1, MVT::i64, InGlue);
+  Chain = GPSave.getValue(1);
+  InGlue = GPSave.getValue(2);
+  SDValue SPSave = DAG.getCopyFromReg(Chain, dl, IA64::r12, MVT::i64, InGlue);
+  Chain = SPSave.getValue(1);
+  InGlue = SPSave.getValue(2);
+
+  // In a function that calls setjmp (and so may be re-entered by longjmp), the
+  // save vregs above cannot be allowed to land in stacked locals: longjmp brings
+  // the stacked frame back only to its last-written values, and the register
+  // allocator reuses the save register right after the (singly-modeled) restore
+  // -- which sits before the setjmp-result branch, i.e. exactly the longjmp
+  // re-entry point -- so the restored value is garbage (observed: gp = 0, then a
+  // stale slot address). Park gp/sp/rp instead in the static callee-saved
+  // registers r4/r6/r7, which glibc's setjmp/longjmp save and restore through the
+  // jmpbuf: on a longjmp re-entry they come back holding the setjmp-time
+  // gp/sp/rp, and any reuse after the restore is harmless because longjmp
+  // overwrites it. Because they are true CSRs (getCalleeSavedRegs), a nested
+  // setjmp call saves and restores them, so it cannot clobber an outer frame's
+  // parked values. The restore below reads them back out of r4/r6/r7. Reading rp
+  // here is safe (it is parked into r7, a GR, before the call -- never spilled as
+  // b0 across the call).
+  bool ReturnsTwice = MF.exposesReturnsTwice();
+  SDValue RPSave;
+  if (ReturnsTwice) {
+    RPSave = DAG.getCopyFromReg(Chain, dl, IA64::rp, MVT::i64, InGlue);
+    Chain = RPSave.getValue(1);
+    InGlue = RPSave.getValue(2);
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::r4, GPSave, InGlue);
+    InGlue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::r6, SPSave, InGlue);
+    InGlue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::r7, RPSave, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  // Copy the outgoing arguments into their out registers, glued before the call.
+  for (auto &R : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  // Set up the br.call target. For an indirect call, install the callee's gp
+  // (r1) and the entry point (b6) read from the descriptor above, glued in just
+  // after the argument copies; BRCALL then branches to b6. For a direct call,
+  // make the callee a target node so the generic selector leaves it alone and
+  // the IA64ISD::BRCALL selection consumes it as the br.call target.
+  if (IsIndirect) {
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::r1, NewGp, InGlue);
+    InGlue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::B6, EntryPoint, InGlue);
+    InGlue = Chain.getValue(1);
+    Callee = DAG.getRegister(IA64::B6, MVT::i64);
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i64);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i64);
+
+  // Emit the call: chain, target, argument registers, then any glue.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 12> Ops = {Chain, Callee};
+  for (auto &R : RegsToPass)
+    Ops.push_back(DAG.getRegister(R.first, R.second.getValueType()));
+  if (InGlue.getNode())
+    Ops.push_back(InGlue);
+  Chain = DAG.getNode(IA64ISD::BRCALL, dl, NodeTys, Ops);
+  InGlue = Chain.getValue(1);
+
+  // Restore gp/sp after the call. For a returns_twice function read gp/sp/rp back
+  // out of r4/r6/r7 (longjmp-safe, see the save above) and reinstate rp from r7
+  // (a plain GR->GR copy, never spilled as b0); otherwise restore gp/sp from the
+  // save vregs directly. The common path needs no rp restore -- frame lowering
+  // owns the function's return pointer (see the save block above).
+  if (ReturnsTwice) {
+    GPSave = DAG.getCopyFromReg(Chain, dl, IA64::r4, MVT::i64, InGlue);
+    Chain = GPSave.getValue(1);
+    InGlue = GPSave.getValue(2);
+    SPSave = DAG.getCopyFromReg(Chain, dl, IA64::r6, MVT::i64, InGlue);
+    Chain = SPSave.getValue(1);
+    InGlue = SPSave.getValue(2);
+    RPSave = DAG.getCopyFromReg(Chain, dl, IA64::r7, MVT::i64, InGlue);
+    Chain = RPSave.getValue(1);
+    InGlue = RPSave.getValue(2);
+  }
+  Chain = DAG.getCopyToReg(Chain, dl, IA64::r1, GPSave, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, IA64::r12, SPSave, InGlue);
+  InGlue = Chain.getValue(1);
+  // rp last (only for returns_twice), preserving the gp -> sp -> rp restore order.
+  if (ReturnsTwice) {
+    Chain = DAG.getCopyToReg(Chain, dl, IA64::rp, RPSave, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl);
+  InGlue = Chain.getValue(1);
+
+  // Read the return value(s) out of r8 / F8 and hand them back in InVals.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+  RVInfo.AnalyzeCallResult(Ins, RetCC_IA64);
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InGlue);
+    Chain = Val.getValue(1);
+    InGlue = Val.getValue(2);
+
+    if (VA.getLocVT() != VA.getValVT()) {
+      if (VA.getLocVT().isInteger())
+        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      else
+        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                          DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+    }
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+SDValue IA64TargetLowering::LowerOperation(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // Custom-lowering entry point; an opcode without a case below is fatal.
+  const SDLoc Loc(Op);
+  switch (Op.getOpcode()) {
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::FRAMEADDR: {
+    // __builtin_frame_address(0): the current frame's address is taken to be
+    // the frame register (the frame pointer r5 if one is forced, else the
+    // stack pointer r12). Any nonzero depth is rejected.
+    if (Op.getConstantOperandVal(0) != 0)
+      report_fatal_error("IA64: __builtin_frame_address with nonzero depth is "
+                         "not supported");
+    MachineFunction &Fn = DAG.getMachineFunction();
+    Fn.getFrameInfo().setFrameAddressIsTaken(true);
+    const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+    Register FrameReg = TRI->getFrameRegister(Fn);
+    return DAG.getCopyFromReg(DAG.getEntryNode(), Loc, FrameReg,
+                              Op.getValueType());
+  }
+  case ISD::SETCC: {
+    // i1 (predicate) comparison. Booleans only ever use eq/ne: a != b is
+    // a xor b, and a == b is its complement (that xor, xor'ed with 1).
+    const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SDValue Differ = DAG.getNode(ISD::XOR, Loc, MVT::i1, Op.getOperand(0),
+                                 Op.getOperand(1));
+    if (Cond == ISD::SETNE)
+      return Differ;
+    if (Cond != ISD::SETEQ)
+      report_fatal_error("IA64: unhandled i1 SETCC condition (expected eq/ne)");
+    SDValue One = DAG.getConstant(1, Loc, MVT::i1);
+    return DAG.getNode(ISD::XOR, Loc, MVT::i1, Differ, One);
+  }
+  case ISD::ATOMIC_LOAD: {
+    // A monotonic atomic load (AtomicExpand has already split any stronger
+    // ordering off into fences) becomes a plain load carrying the same atomic
+    // memory operand. The existing ISD::LOAD selector then picks ld1/ld2/ld4/
+    // ld8 by the memory type and applies the extension kind carried here.
+    auto *Atomic = cast<AtomicSDNode>(Op);
+    return DAG.getExtLoad(Atomic->getExtensionType(), Loc, Op.getValueType(),
+                          Atomic->getChain(), Atomic->getBasePtr(),
+                          Atomic->getMemoryVT(), Atomic->getMemOperand());
+  }
+  case ISD::ATOMIC_STORE: {
+    // Mirror image of ATOMIC_LOAD: a monotonic atomic store becomes a plain
+    // store. The value is promoted to i64, so a narrow access is expressed as
+    // a truncating store keyed on the memory type, which the store selector
+    // handles.
+    auto *Atomic = cast<AtomicSDNode>(Op);
+    SDValue Stored = Atomic->getVal();
+    const EVT MemTy = Atomic->getMemoryVT();
+    if (MemTy != Stored.getValueType())
+      return DAG.getTruncStore(Atomic->getChain(), Loc, Stored,
+                               Atomic->getBasePtr(), MemTy,
+                               Atomic->getMemOperand());
+    return DAG.getStore(Atomic->getChain(), Loc, Stored, Atomic->getBasePtr(),
+                        Atomic->getMemOperand());
+  }
+  case ISD::VASTART: {
+    // Store the address of the register save area -- the first variadic
+    // argument slot, filled in by LowerFormalArguments -- into the va_list.
+    IA64FunctionInfo *FuncInfo =
+        DAG.getMachineFunction().getInfo<IA64FunctionInfo>();
+    SDValue SaveArea = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                         getPointerTy(DAG.getDataLayout()));
+    const Value *VAListIR = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+    return DAG.getStore(Op.getOperand(0), Loc, SaveArea, Op.getOperand(1),
+                        MachinePointerInfo(VAListIR));
+  }
+  default:
+    report_fatal_error("IA64: unimplemented custom operation lowering");
+  }
+}
+
+SDValue IA64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  auto *Node = cast<GlobalAddressSDNode>(Op);
+  // -femulated-tls is handled generically; otherwise emit native ELF TLS.
+  if (DAG.getTarget().useEmulatedTLS())
+    return LowerToTLSEmulatedModel(Node, DAG);
+
+  const GlobalValue *Var = Node->getGlobal();
+  const SDLoc Loc(Op);
+  const EVT PtrTy = getPointerTy(DAG.getDataLayout());
+
+  // Wrap the variable in a TargetGlobalAddress tagged with target flags.
+  auto TaggedSymbol = [&](unsigned Flags) {
+    return DAG.getTargetGlobalAddress(Var, Loc, PtrTy, /*offset=*/0, Flags);
+  };
+  // Add the per-model offset Off to the thread pointer (tp / r13). tp is
+  // reserved, so a CopyFromReg of the physreg observes its live value.
+  auto AddThreadPointer = [&](SDValue Off) {
+    SDValue TP = DAG.getCopyFromReg(DAG.getEntryNode(), Loc, IA64::r13, PtrTy);
+    return DAG.getNode(ISD::ADD, Loc, PtrTy, TP, Off);
+  };
+
+  switch (getTargetMachine().getTLSModel(Var)) {
+  case TLSModel::LocalExec: {
+    // The offset is a static-link-time constant materialised directly (no GOT):
+    //   movl rX = @tprel(sym) ;; add rX = rX, tp
+    SDValue Off = DAG.getNode(IA64ISD::TLS_TPREL, Loc, PtrTy,
+                              TaggedSymbol(IA64::S_TPREL));
+    return AddThreadPointer(Off);
+  }
+  case TLSModel::InitialExec: {
+    // The offset is resolved by the dynamic linker into a GOT slot:
+    //   addl rX = @ltoff(@tprel(sym)), gp ;; ld8 rX = [rX] ;; add rX = rX, tp
+    SDValue Off = DAG.getNode(IA64ISD::TLS_GOTLOAD, Loc, PtrTy,
+                              TaggedSymbol(IA64::S_LTOFF_TPREL));
+    return AddThreadPointer(Off);
+  }
+  case TLSModel::GeneralDynamic:
+  case TLSModel::LocalDynamic: {
+    // Call __tls_get_addr(module, offset). Both arguments come from GOT slots
+    // (@ltoff(@dtpmod) and @ltoff(@dtprel)) and the call returns the
+    // variable's address. Local-dynamic is lowered exactly like
+    // general-dynamic -- one call per access using the variable's own
+    // dtpmod/dtprel -- which is correct, just without the LDM module-base
+    // sharing optimization. IA-64's __tls_get_addr takes the two scalars
+    // directly (out0/out1), not a pointer to a tls_index struct.
+    SDValue ModSlot = TaggedSymbol(IA64::S_LTOFF_DTPMOD);
+    SDValue OffSlot = TaggedSymbol(IA64::S_LTOFF_DTPREL);
+    SDValue Module = DAG.getNode(IA64ISD::TLS_GOTLOAD, Loc, PtrTy, ModSlot);
+    SDValue Offset = DAG.getNode(IA64ISD::TLS_GOTLOAD, Loc, PtrTy, OffSlot);
+    Type *I64Ty = Type::getInt64Ty(*DAG.getContext());
+    ArgListTy CallArgs;
+    for (SDValue Scalar : {Module, Offset})
+      CallArgs.push_back(ArgListEntry(Scalar, I64Ty));
+
+    // __tls_get_addr is an external symbol: LowerCall emits a direct br.call
+    // and AdjustInstrPostInstrSelection models its gp clobber, so the gp
+    // save/restore around the call survives, as GCC emits.
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(Loc);
+    CLI.setChain(DAG.getEntryNode());
+    CLI.setLibCallee(CallingConv::C, PointerType::getUnqual(*DAG.getContext()),
+                     DAG.getExternalSymbol("__tls_get_addr", PtrTy),
+                     std::move(CallArgs));
+    return LowerCallTo(CLI).first;
+  }
+  }
+  llvm_unreachable("Unknown TLS model");
+}
+
+SDValue IA64TargetLowering::LowerReturn(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+    SelectionDAG &DAG) const {
+  // Ask the return convention where each returned value has to live.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetState(CallConv, isVarArg, DAG.getMachineFunction(), RetLocs,
+                   *DAG.getContext());
+  RetState.AnalyzeReturn(Outs, RetCC_IA64);
+
+  // Operand 0 of the return node is the chain; it is only known once every
+  // copy below has been emitted, so the slot is reserved now and filled later.
+  SmallVector<SDValue, 4> RetOperands(1, Chain);
+  SDValue CopyGlue;
+
+  // Copy the return values into their assigned registers (r8 / F8), threading
+  // glue from each copy to the next and finally into the return node.
+  unsigned ValIdx = 0;
+  for (CCValAssign &Loc : RetLocs) {
+    assert(Loc.isRegLoc() && "return value must be in a register");
+    const MVT LocTy = Loc.getLocVT();
+    SDValue RetVal = OutVals[ValIdx++];
+    // Extend a value to its location type: integers any-extend, others FP.
+    if (LocTy != Loc.getValVT()) {
+      unsigned ExtOpc = LocTy.isInteger() ? ISD::ANY_EXTEND : ISD::FP_EXTEND;
+      RetVal = DAG.getNode(ExtOpc, dl, LocTy, RetVal);
+    }
+    Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), RetVal, CopyGlue);
+    CopyGlue = Chain.getValue(1);
+    RetOperands.push_back(DAG.getRegister(Loc.getLocReg(), LocTy));
+  }
+
+  RetOperands[0] = Chain;
+  if (CopyGlue.getNode())
+    RetOperands.push_back(CopyGlue);
+  return DAG.getNode(IA64ISD::RET_FLAG, dl, MVT::Other, RetOperands);
+}
+
+void IA64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                       SDNode * /*Node*/) const {
+  unsigned Opc = MI.getOpcode();
+  if (Opc != IA64::BRCALL_IPREL_GA && Opc != IA64::BRCALL_IPREL_ES)
+    return;
+
+  // The call's only explicit operand (0) is the target: a GlobalAddress (direct
+  // call to a known function) or an ExternalSymbol (always external). Copy the
+  // callee out of it now: MachineInstr::addOperand may reallocate the operand
+  // array, so no reference to operand 0 may be used once one has been added.
+  const MachineOperand &Target = MI.getOperand(0);
+  const GlobalValue *GV = Target.isGlobal() ? Target.getGlobal() : nullptr;
+  // gp (r1) is caller-saved at any call that is *not* provably local to this
+  // load module: such a call may be resolved through an import stub that loads
+  // the callee's own gp, and whether that happens is a static-vs-dynamic
+  // linking decision we cannot see at compile time -- so we must conservatively
+  // assume it does. Marking the call as defining r1 keeps the gp save/restore
+  // LowerCall emits from being coalesced away (the same mechanism as rp/b0).
+  //
+  // A dso_local callee (e.g. a recursive self-call) keeps gp, so we leave it
+  // alone and the redundant save/restore folds away -- no per-call gp churn.
+  // (LTO could later prove more callees local and drop the clobber.)
+  if (!GV || !GV->isDSOLocal())
+    MI.addOperand(
+        MachineOperand::CreateReg(IA64::r1, /*isDef=*/true, /*isImp=*/true));
+
+  // A returns_twice callee on IA-64 cannot preserve the caller's stacked
+  // register frame (r32-r127). The two cases that matter -- setjmp/longjmp and
+  // vfork -- both leave the caller's stacked registers holding something other
+  // than their call-time values: vfork in particular runs the child in the
+  // parent's address space while the parent is blocked, so the child's use of
+  // the shared register backing store overwrites the parent's stacked locals
+  // (observed: an 'interp' argument parked in r32 reads back as 0 -- the value
+  // the vfork child stored there -- in Tcl's TclpCreateProcess, freeing a
+  // non-heap pointer). The static callee-saved registers r4-r7 are not in the
+  // backing store and survive (the kernel restores them from the parent's saved
+  // context); only the RSE-backed stacked registers are unsafe.
+  //
+  // The fixed BRCALL clobber list deliberately omits r32-r127 because an
+  // ordinary call *does* preserve the caller's frame via the RSE. For a
+  // returns_twice call we must additionally mark every stacked register clobbered
+  // so the allocator keeps nothing live across the call there -- such values are
+  // forced into r4-r7 or spilled to memory (which the child does not touch),
+  // exactly as GCC's 'calls_setjmp' handling requires. This complements the
+  // gp/sp/rp parking LowerCall already does for returns_twice functions.
+  //
+  // Express it as a regmask rather than 96 implicit-defs: implicit-def reg
+  // operands make MachineRegisterInfo::isPhysRegUsed report every stacked
+  // register as used, which IA64FrameLowering would then size the 'alloc' frame
+  // around (ballooning it to the 96-register maximum). A regmask is tested
+  // separately and is skipped by the frame-sizing scan (isPhysRegUsed's
+  // SkipRegMaskTest), so it constrains the allocator without inflating the frame.
+  const Function *Callee = GV ? dyn_cast<Function>(GV) : nullptr;
+  if (Callee && Callee->hasFnAttribute(Attribute::ReturnsTwice)) {
+    MachineFunction &MF = *MI.getMF();
+    unsigned NumRegs = MF.getSubtarget().getRegisterInfo()->getNumRegs();
+    uint32_t *Mask = MF.allocateRegMask();
+    // A set bit means "preserved"; allocateRegMask zero-inits (clobber all), so
+    // mark everything preserved and then clear just the stacked GPRs. The fixed
+    // Defs above keep clobbering the caller-saved set on top of this mask.
+    for (unsigned I = 0, E = MachineOperand::getRegMaskSize(NumRegs); I != E; ++I)
+      Mask[I] = ~0u;
+    // Register 0 is NoRegister, not a physical register: it must stay clobbered
+    // (bit clear), or regmask consumers that expand preserved bits to reg units
+    // (e.g. MachineCopyPropagation) assert iterating reg-units of reg 0.
+    Mask[0] &= ~1u;
+    for (unsigned I = 0; I != IA64NumStackedGPRs; ++I) {
+      MCRegister R = getIA64StackedGPR(I);
+      Mask[R.id() / 32] &= ~(1u << (R.id() % 32));
+    }
+    MI.addOperand(MachineOperand::CreateRegMask(Mask));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+IA64TargetLowering::getConstraintType(StringRef Constraint) const {
+  // Only two single-letter constraints are IA64-specific register classes:
+  //   'r' - general register
+  //   'f' - floating-point register
+  // Everything else (other letters, longer strings) takes the generic path.
+  if (Constraint.size() == 1) {
+    const char Letter = Constraint[0];
+    if (Letter == 'r' || Letter == 'f')
+      return C_RegisterClass;
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+IA64TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 StringRef Constraint,
+                                                 MVT VT) const {
+  // Class chosen for a single-letter constraint; it stays null whenever the
+  // constraint/type pair is not handled here and the generic code must decide.
+  const TargetRegisterClass *RC = nullptr;
+  const char Letter = Constraint.size() == 1 ? Constraint[0] : '\0';
+  if (Letter == 'r') {
+    // The GR class only carries i64, so the generic exact-type search fails
+    // for narrower types. Any integer value (including the i1 a Rust bool /
+    // black_box produces) lives in a general register, so map them all here.
+    if (VT.isInteger() || VT == MVT::Other)
+      RC = &IA64::GRRegClass;
+  } else if (Letter == 'f') {
+    // f80 ('long double') is wider than the FP class's representative type
+    // (f64), which makes the generic inline-asm register-tiling code assert:
+    // give it the f80-only class. f32/f64 fit the multi-typed FP class.
+    if (VT == MVT::f80)
+      RC = &IA64::FP80RegClass;
+    else if (VT == MVT::f32 || VT == MVT::f64)
+      RC = &IA64::FPRegClass;
+  }
+  if (RC)
+    return std::make_pair(0U, RC);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
diff --git llvm/lib/Target/IA64/IA64ISelLowering.h llvm/lib/Target/IA64/IA64ISelLowering.h
new file mode 100644
index 000000000000..a3e604545284
--- /dev/null
+++ llvm/lib/Target/IA64/IA64ISelLowering.h
@@ -0,0 +1,150 @@
+//===-- IA64ISelLowering.h - IA64 DAG Lowering Interface --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that IA64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64ISELLOWERING_H
+#define LLVM_LIB_TARGET_IA64_IA64ISELLOWERING_H
+
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+
+class MachineInstr;
+class TargetSubtargetInfo;
+
+namespace IA64ISD {
+enum NodeType : unsigned {
+  // Target-specific node numbers start at ISD::BUILTIN_OP_END.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+  /// GETFD - the getf.d instruction takes a floating point operand and
+  /// returns its 64-bit memory representation as an i64.
+  GETFD,
+
+  /// BRCALL - the call hack (see the pre-removal backend).
+  BRCALL,
+
+  /// RET_FLAG - return with a flag (glue) operand; selected to RET (br.ret).
+  RET_FLAG,
+
+  /// TLS_TPREL - local-exec thread-pointer-relative offset of a thread-local
+  /// symbol. Its single operand is a TargetGlobalAddress tagged S_TPREL;
+  /// selected to 'movl rX = @tprel(sym)'.
+  TLS_TPREL,
+
+  /// TLS_GOTLOAD - a value loaded from the symbol's GOT slot. Its single
+  /// operand is a TargetGlobalAddress whose target flags carry the @ltoff(...)
+  /// specifier (S_LTOFF_TPREL / S_LTOFF_DTPMOD / S_LTOFF_DTPREL); selected to
+  /// 'addl rX = <spec>, gp ;; ld8 rX = [rX]', the GlobalAddress GOT sequence.
+  TLS_GOTLOAD
+};
+} // end namespace IA64ISD
+
+/// DAG lowering hooks and target queries for the IA64 backend.
+class IA64TargetLowering : public TargetLowering {
+public:
+  explicit IA64TargetLowering(const TargetMachine &TM,
+                              const TargetSubtargetInfo &STI);
+
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  /// Jump-table entries are absolute code pointers (data8 <label>), loaded with
+  /// a plain LD8 and branched to via BRIND -- the simplest path, and it avoids
+  /// the 32-bit label-difference entries (which would need a sext-load).
+  unsigned getJumpTableEncoding() const override {
+    return MachineJumpTableInfo::EK_BlockAddress;
+  }
+
+  /// The entries above are absolute, so BR_JT must branch straight to the loaded
+  /// entry -- it must NOT add the table base back. The default keys this off
+  /// isPositionIndependent() (true here, since the ABI is PIC), which would make
+  /// the expansion compute base+entry and jump to garbage; force it off.
+  bool isJumpTableRelative() const override { return false; }
+
+  /// IA-64 has a single-rounding fused multiply-add (fma/fms/fnma), so a*b+c
+  /// is cheaper (and more accurate) fused. Returning true makes llvm.fmuladd
+  /// (clang's default -ffp-contract=on form of a*b+c) lower to ISD::FMA.
+  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                  EVT VT) const override;
+
+  /// getSetCCResultType - SETCC produces a predicate (i1) on IA-64.
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+
+  /// isFPImmLegal - return true for all FP immediates so the legalizer keeps
+  /// them as ConstantFP nodes (which we materialise from their integer bit
+  /// pattern via movl + setf.d) rather than emitting a constant-pool load,
+  /// which this backend does not lower.
+  bool isFPImmLegal(const APFloat &Imm, EVT VT,
+                    bool ForCodeSize) const override;
+
+  SDValue
+  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       const SDLoc &dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  /// Custom-lowering entry point (e.g. monotonic atomic ld/st, see below).
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  /// IA-64 ld/st carry no implicit ordering, so acquire/release/seq_cst
+  /// atomics need explicit barriers. Returning true makes AtomicExpand bracket
+  /// stronger-than-monotonic accesses with fences (selected to 'mf') and
+  /// demote the access to monotonic; a monotonic, aligned <=8-byte access is
+  /// a plain ld/st on the hardware (lowered as such in LowerOperation).
+  bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+    return true;
+  }
+
+  /// The only atomic read-modify-write IA-64 has a single instruction for is
+  /// fetchadd (and only for a few immediates), so lower every atomicrmw
+  /// (add/sub/and/or/xor/nand/min/max/xchg/...) to a cmpxchg loop in IR. That
+  /// reduces all of them to the one primitive the backend selects natively,
+  /// ISD::ATOMIC_CMP_SWAP (cmpxchg{1,2,4,8}). Correct, not yet optimized.
+  AtomicExpansionKind
+  shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override {
+    return AtomicExpansionKind::CmpXChg;
+  }
+
+  /// Lower a thread-local address access (ISD::GlobalTLSAddress) per the model
+  /// TargetMachine::getTLSModel picks: local-exec / initial-exec materialise a
+  /// tp-relative offset and add tp (r13); general/local-dynamic call
+  /// __tls_get_addr(module, offset).
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                      SelectionDAG &DAG) const override;
+
+  /// Mark a call to a non-local callee as clobbering gp (r1), so the gp
+  /// save/restore that LowerCall emits survives coalescing. Local (dso_local)
+  /// callees keep gp and are left alone.
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
+
+  /// Inline-asm support: the GCC IA-64 constraints 'r' (general register) and
+  /// 'f' (floating-point register); all else falls back to generic handling.
+  ConstraintType getConstraintType(StringRef Constraint) const override;
+
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64ISELLOWERING_H
diff --git llvm/lib/Target/IA64/IA64InstrFormats.td llvm/lib/Target/IA64/IA64InstrFormats.td
new file mode 100644
index 000000000000..5a456cdaef01
--- /dev/null
+++ llvm/lib/Target/IA64/IA64InstrFormats.td
@@ -0,0 +1,78 @@
+//===- IA64InstrFormats.td - IA64 Instruction Formats -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  - Warning: the stuff in here isn't really being used, so is mostly
+//             junk. It'll get fixed as the JIT gets built.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+class InstIA64<bits<4> op, dag OOL, dag IOL, string asmstr> : Instruction {
+  // Base of every IA64 instruction class: a 41-bit encoding field named Inst.
+  field bits<41> Inst;
+  let Namespace = "IA64";
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+
+  let Inst{40-37} = op; // 'op' fills the four most significant bits
+}
+
+// "Each Itanium instruction is categorized into one of six types."
+// We should have:
+//   A, I, M, F, B, L+X
+
+class AForm<bits<4> opcode, bits<6> qpReg, dag OOL, dag IOL, string asmstr> :
+  InstIA64<opcode, OOL, IOL, asmstr> {
+
+  let Inst{5-0} = qpReg; // low six bits; presumably the qualifying predicate
+}
+
+class AForm_DAG<bits<4> opcode, bits<6> qpReg, dag OOL, dag IOL, string asmstr,
+      list<dag> pattern> :
+  InstIA64<opcode, OOL, IOL, asmstr> {
+
+  let Pattern = pattern; // AForm plus a DAG selection pattern
+  let Inst{5-0} = qpReg; // low six bits; presumably the qualifying predicate
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BForm<bits<4> opcode, bits<6> x6, bits<3> btype, dag OOL, dag IOL, string asmstr> :
+  InstIA64<opcode, OOL, IOL, asmstr> {
+  // Every BForm instruction is a branch and a terminator (enclosing 'let').
+  let Inst{32-27} = x6;
+  let Inst{8-6} = btype;
+}
+
+class MForm<bits<4> opcode, bits<6> x6, dag OOL, dag IOL, string asmstr> :
+  InstIA64<opcode, OOL, IOL, asmstr> {
+    bits<7> Ra;   // declared, but not placed into Inst by this class
+    bits<7> Rb;   // declared; its only encoding line is disabled below
+    bits<16> disp;
+
+    let Inst{35-30} = x6;
+//  let Inst{20-16} = Rb;  // disabled; note Rb is bits<7>, this range is 5 bits
+    let Inst{15-0} = disp;
+}
+
+class RawForm<bits<4> opcode, bits<26> rest, dag OOL, dag IOL, string asmstr> :
+  InstIA64<opcode, OOL, IOL, asmstr> {
+    let Inst{25-0} = rest; // low 26 bits are taken verbatim from 'rest'
+}
+
+// Pseudo instructions: the opcode field is fixed at 0; 'nm' is the asm string.
+class PseudoInstIA64<dag OOL, dag IOL, string nm> : InstIA64<0, OOL, IOL, nm>  {
+}
+
+class PseudoInstIA64_DAG<dag OOL, dag IOL, string nm, list<dag> pattern>
+  : InstIA64<0, OOL, IOL, nm> {
+  let Pattern = pattern; // pseudo (opcode field 0) with a selection pattern
+}
diff --git llvm/lib/Target/IA64/IA64InstrInfo.cpp llvm/lib/Target/IA64/IA64InstrInfo.cpp
new file mode 100644
index 000000000000..b9c889382c76
--- /dev/null
+++ llvm/lib/Target/IA64/IA64InstrInfo.cpp
@@ -0,0 +1,295 @@
+//===- IA64InstrInfo.cpp - IA64 Instruction Information -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64InstrInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "IA64GenInstrInfo.inc"
+
+// Out-of-line definition of the virtual anchor(): pins the vtable to this TU.
+void IA64InstrInfo::anchor() {}
+
+IA64InstrInfo::IA64InstrInfo(const TargetSubtargetInfo &STI)
+    : IA64GenInstrInfo(STI, RI, IA64::ADJUSTCALLSTACKDOWN,
+                       IA64::ADJUSTCALLSTACKUP),
+      RI() {} // NOTE(review): the base is handed RI before RI() has run
+
+// Copy SrcReg into DestReg at I; the opcode depends on the register classes.
+void IA64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I,
+                                const DebugLoc &DL, Register DestReg,
+                                Register SrcReg, bool KillSrc,
+                                bool /*RenamableDest*/,
+                                bool /*RenamableSrc*/) const {
+  if (IA64::PRRegClass.contains(DestReg)) {
+    if (IA64::PRRegClass.contains(SrcReg)) {
+      // Predicate -> predicate: (SrcReg) DestReg = cmp.eq.unc(r0, r0). The .unc
+      // form writes DestReg in both cases (1 when SrcReg holds, else 0).
+      BuildMI(MBB, I, DL, get(IA64::PCMPEQUNC), DestReg)
+          .addReg(IA64::r0)
+          .addReg(IA64::r0)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      // General register -> predicate: DestReg = (SrcReg != 0), the inverse of
+      // the GR<-PR copy below. There is no 'mov PR = GR'.
+      BuildMI(MBB, I, DL, get(IA64::CMPNE), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addReg(IA64::r0);
+    }
+    return;
+  }
+
+  if (IA64::ARRegClass.contains(DestReg)) {
+    // Restoring ar.pfs from a general register: 'mov ar.pfs = rN'. ar.pfs is
+    // in its own register class, so the generic GR MOV below cannot name it.
+    BuildMI(MBB, I, DL, get(IA64::MOV_TO_AR_PFS), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (IA64::BRRegClass.contains(DestReg)) {
+    // Loading a branch register (b6) for an indirect call: 'mov b6 = rN'. Like
+    // ar.pfs, b6 is in its own class, so the generic GR MOV below cannot name it.
+    BuildMI(MBB, I, DL, get(IA64::MOV_TO_BR), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (IA64::GRRegClass.contains(DestReg) && IA64::PRRegClass.contains(SrcReg)) {
+    // Predicate -> general register: materialize the 0/1 value (there is no
+    // 'mov GR = PR') as DestReg = 0 ;; (SrcReg) DestReg = 1, the zext-PR
+    // sequence from the td. The tied TPCADDS adds 1 only if SrcReg holds.
+    BuildMI(MBB, I, DL, get(IA64::ADDS), DestReg).addReg(IA64::r0).addImm(0);
+    BuildMI(MBB, I, DL, get(IA64::TPCADDS), DestReg)
+        .addReg(DestReg)
+        .addImm(1)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Otherwise MOV works for both general and FP registers.
+  BuildMI(MBB, I, DL, get(IA64::MOV), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void IA64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        Register SrcReg, bool isKill,
+                                        int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        Register /*VReg*/,
+                                        MachineInstr::MIFlag /*Flags*/) const {
+  DebugLoc DL;
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  // FP, or FP80 (handed out for f80 inline-asm operands), which used to hit
+  // llvm_unreachable. NOTE(review): assumes FP80 slots fit STF_SPILL's store.
+  if (IA64::FPRegClass.hasSubClassEq(RC) ||
+      IA64::FP80RegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, MI, DL, get(IA64::STF_SPILL))
+        .addFrameIndex(FrameIdx)
+        .addReg(SrcReg, getKillRegState(isKill));
+  } else if (IA64::GRRegClass.hasSubClassEq(RC)) {
+    // GR or any GR sub-class (e.g. GR03, r0-r3): a plain 8-byte store.
+    BuildMI(MBB, MI, DL, get(IA64::ST8))
+        .addFrameIndex(FrameIdx)
+        .addReg(SrcReg, getKillRegState(isKill));
+  } else if (RC == &IA64::PRRegClass) {
+    // A predicate is spilled as a 0/1 value built in scratch r2:
+    // r2 = 0, then (SrcReg) r2 += 1, then an 8-byte store of r2.
+    BuildMI(MBB, MI, DL, get(IA64::MOV), IA64::r2).addReg(IA64::r0);
+    BuildMI(MBB, MI, DL, get(IA64::CADDIMM22), IA64::r2)
+        .addReg(IA64::r2)
+        .addImm(1)
+        .addReg(SrcReg, getKillRegState(isKill));
+    BuildMI(MBB, MI, DL, get(IA64::ST8)).addFrameIndex(FrameIdx).addReg(IA64::r2);
+  } else {
+    llvm_unreachable("sorry, I don't know how to store this sort of reg "
+                     "in the stack");
+  }
+}
+
+void IA64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         Register DestReg, int FrameIdx,
+                                         const TargetRegisterClass *RC,
+                                         Register /*VReg*/, unsigned /*SubReg*/,
+                                         MachineInstr::MIFlag /*Flags*/) const {
+  DebugLoc DL;
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  if (IA64::FPRegClass.hasSubClassEq(RC) ||
+      IA64::FP80RegClass.hasSubClassEq(RC)) {
+    // FP or FP80, mirroring storeRegToStackSlot: refill with LDF_FILL.
+    BuildMI(MBB, MI, DL, get(IA64::LDF_FILL), DestReg).addFrameIndex(FrameIdx);
+  } else if (IA64::GRRegClass.hasSubClassEq(RC)) {
+    // GR or a GR sub-class (e.g. GR03): reload with a plain 8-byte load.
+    BuildMI(MBB, MI, DL, get(IA64::LD8), DestReg).addFrameIndex(FrameIdx);
+  } else if (RC == &IA64::PRRegClass) {
+    // Reload the spilled 0/1 value (8 bytes) into scratch r2, then rebuild
+    // the predicate as DestReg = (r2 != 0).
+    BuildMI(MBB, MI, DL, get(IA64::LD8), IA64::r2).addFrameIndex(FrameIdx);
+    BuildMI(MBB, MI, DL, get(IA64::CMPNE), DestReg)
+        .addReg(IA64::r2)
+        .addReg(IA64::r0);
+  } else {
+    llvm_unreachable("sorry, I don't know how to load this sort of reg "
+                     "from the stack");
+  }
+}
+
+// The only two branch opcodes the selector emits. BRL_NOTCALL is the
+// unconditional '(p0) brl.cond TBB' (operand 0 = target block); BRLCOND_NOTCALL
+// is '($qp) brl.cond TBB' (operand 0 = predicate, operand 1 = target block).
+static bool isUncondBranchOpcode(unsigned Opc) {
+  return Opc == IA64::BRL_NOTCALL;
+}
+static bool isCondBranchOpcode(unsigned Opc) {
+  return Opc == IA64::BRLCOND_NOTCALL;
+}
+
+// An IA-64 branch condition is just the branch's qualifying predicate.
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&TBB,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  TBB = LastInst->getOperand(1).getMBB();  // operand 1: the target block
+  Cond.push_back(LastInst->getOperand(0)); // operand 0: the predicate
+}
+
+// Returns false if the block's branches were analyzed, true if they cannot be.
+bool IA64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false; // empty block, falls through
+
+  if (!isUnpredicatedTerminator(*I))
+    return false; // last instruction isn't a terminator, falls through
+
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+
+  // Exactly one terminator: classify it directly.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      parseCondBranch(LastInst, TBB, Cond); // ends with fall-through cond branch
+      return false;
+    }
+    return true; // some other terminator (e.g. indirect branch): can't analyze
+  }
+
+  MachineInstr *SecondLastInst = &*I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // With AllowModify, drop dead unconditional branches after the first.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      }
+      SecondLastInst = &*I;
+      SecondLastOpc = SecondLastInst->getOpcode();
+    }
+  }
+
+  // Three or more terminators remain: give up.
+  if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+
+  // Conditional branch to TBB followed by an unconditional branch to FBB.
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    parseCondBranch(SecondLastInst, TBB, Cond);
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // Two unconditional branches: only the first is reachable, so report it.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true; // anything else: can't analyze
+}
+
+unsigned IA64InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                     int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+  // Scan backwards, erasing trailing branches (debug instrs are skipped).
+  unsigned NumErased = 0;
+  for (MachineBasicBlock::iterator It = MBB.end(); It != MBB.begin();) {
+    --It;
+    if (It->isDebugInstr())
+      continue;
+    const unsigned Opc = It->getOpcode();
+    if (!(isCondBranchOpcode(Opc) || isUncondBranchOpcode(Opc)))
+      break; // reached an instruction that is not a branch
+    It->eraseFromParent();
+    It = MBB.end(); // restart the scan from the end after erasing
+    ++NumErased;
+  }
+  return NumErased;
+}
+
+unsigned IA64InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *TBB,
+                                     MachineBasicBlock *FBB,
+                                     ArrayRef<MachineOperand> Cond,
+                                     const DebugLoc &DL, int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(Cond.size() <= 1 &&
+         "IA64 branch condition is a single qualifying predicate!");
+
+  if (Cond.empty()) {
+    // No predicate: a lone unconditional branch to TBB.
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(IA64::BRL_NOTCALL)).addMBB(TBB);
+    return 1;
+  }
+
+  // '($qp) brl.cond TBB', plus a branch to FBB when this is a two-way branch.
+  unsigned NumInserted = 1;
+  BuildMI(&MBB, DL, get(IA64::BRLCOND_NOTCALL)).add(Cond[0]).addMBB(TBB);
+  if (FBB) {
+    BuildMI(&MBB, DL, get(IA64::BRL_NOTCALL)).addMBB(FBB);
+    ++NumInserted;
+  }
+  return NumInserted;
+}
+
+bool IA64InstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  // Cond holds one qualifying predicate register and is left untouched here.
+  // The complement of that predicate is not available -- the CMP*
+  // instructions discard it (they write 'p0' for it) -- so the condition
+  // cannot be reversed in place. Returning true signals "cannot reverse";
+  // callers fall back accordingly (e.g. they still remove a redundant
+  // fall-through branch, which needs no reversal).
+  return true;
+}
diff --git llvm/lib/Target/IA64/IA64InstrInfo.h llvm/lib/Target/IA64/IA64InstrInfo.h
new file mode 100644
index 000000000000..ba61c67f1762
--- /dev/null
+++ llvm/lib/Target/IA64/IA64InstrInfo.h
@@ -0,0 +1,75 @@
+//===- IA64InstrInfo.h - IA64 Instruction Information -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64INSTRINFO_H
+#define LLVM_LIB_TARGET_IA64_IA64INSTRINFO_H
+
+#include "IA64RegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "IA64GenInstrInfo.inc"
+
+namespace llvm {
+
+class IA64InstrInfo : public IA64GenInstrInfo {
+  const IA64RegisterInfo RI; // owned here; exposed through getRegisterInfo()
+  virtual void anchor();
+
+public:
+  // The pre-removal backend held the InstrInfo standalone; the modern
+  // -gen-instr-info constructor needs the subtarget + register info. We take
+  // the base TargetSubtargetInfo (rather than a not-yet-existing IA64Subtarget)
+  // so this class can be built before the subtarget aggregate lands.
+  explicit IA64InstrInfo(const TargetSubtargetInfo &STI);
+
+  /// getRegisterInfo - the register info is a member of this object, so any
+  /// client that holds the instruction info can always reach the register
+  /// info as well (through this method).
+  const IA64RegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, Register DestReg, Register SrcReg,
+                   bool KillSrc, bool RenamableDest = false,
+                   bool RenamableSrc = false) const override;
+
+  void storeRegToStackSlot(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
+      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
+
+  void loadRegFromStackSlot(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
+      unsigned SubReg = 0,
+      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64INSTRINFO_H
diff --git llvm/lib/Target/IA64/IA64InstrInfo.td llvm/lib/Target/IA64/IA64InstrInfo.td
new file mode 100644
index 000000000000..c2aa7101d05b
--- /dev/null
+++ llvm/lib/Target/IA64/IA64InstrInfo.td
@@ -0,0 +1,960 @@
+//===- IA64InstrInfo.td - Describe the IA64 Instruction Set *- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+// Phase C ported the instruction *descriptions* (operands, asm strings, unit
+// type, explicit flags / Defs / Uses) and wired -gen-instr-info, leaving every
+// DAG selection pattern as an empty `[]` placeholder. Phase D (D6) restores
+// only the patterns the Stage-1 acceptance test (plus.ll) exercises — the
+// reg/reg `add` and the `retflag`/RET return — and wires -gen-dag-isel. The
+// remaining patterns, PatLeaf predicates and the GETFD/BRCALL nodes stay
+// deferred (calls, loads, FP, etc. are out of Stage-1 scope). The generic `ret`
+// SDNode no longer exists, so the old `Pat<(ret), (RET)>` is dropped; RET is now
+// matched directly from the target-specific `retflag` node.
+//
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// IA-64 specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// RET_FLAG - return with a flag (glue) operand. LowerReturn emits this after
+// copying the return values into place; it selects to the RET (br.ret)
+// instruction below. SDNPVariadic is required so the instruction selector
+// transfers the extra register operands LowerReturn glues onto this node (the
+// return-value regs, e.g. r8) onto the selected RET machine instruction as
+// implicit uses. Without it those registers look dead and the return-value
+// copies (and everything feeding them) get eliminated, leaving just the
+// 'alloc'. (ar.pfs is no longer among them: frame lowering now restores it from
+// a parked stacked local in the epilogue, alongside rp.)
+def retflag : SDNode<"IA64ISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Call-sequence markers are target-independent nodes (ISD::CALLSEQ_START/END)
+// but each target declares them with its own operand type profile. IA-64 frame
+// amounts are i64. ADJUSTCALLSTACKDOWN/UP select directly from these.
+def SDT_IA64CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>, SDTCisVT<1, i64>]>;
+def SDT_IA64CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i64>, SDTCisVT<1, i64>]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_IA64CallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_IA64CallSeqEnd,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction types
+//===----------------------------------------------------------------------===//
+
+class isA { bit A=1; } // I or M unit
+class isM { bit M=1; } // M unit
+class isI { bit I=1; } // I unit
+class isB { bit B=1; } // B unit
+class isF { bit F=1; } // F unit
+class isLX { bit LX=1; } // I/B
+
+//===----------------------------------------------------------------------===//
+// Operands
+//===----------------------------------------------------------------------===//
+
+def u2imm : Operand<i8>;
+def u6imm : Operand<i8>;
+def s8imm : Operand<i8> {
+  let PrintMethod = "printS8ImmOperand";
+}
+def s14imm  : Operand<i64> {
+  let PrintMethod = "printS14ImmOperand";
+}
+def s22imm  : Operand<i64> {
+  let PrintMethod = "printS22ImmOperand";
+}
+def u64imm  : Operand<i64> {
+  let PrintMethod = "printU64ImmOperand";
+}
+def s64imm  : Operand<i64> {
+  let PrintMethod = "printS64ImmOperand";
+}
+
+let PrintMethod = "printGlobalOperand" in
+  def globaladdress : Operand<i64>;
+
+// the asmprinter needs to know about calls
+let PrintMethod = "printCallOperand" in
+  def calltarget : Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Immediate predicates
+//===----------------------------------------------------------------------===//
+
+// The constant-materialization ladder: a 64-bit constant is realized by the
+// cheapest instruction whose signed-immediate field it fits in. The three
+// ranges nest (14 ⊂ 22 ⊂ 64); the AddedComplexity annotations on the
+// materialization patterns below make TableGen pick the narrowest fit.
+
+// immSExt14 - fits a signed 14-bit field (the 'adds' immediate).
+def immSExt14 : PatLeaf<(i64 imm), [{
+  int64_t v = (int64_t)N->getZExtValue();
+  return (v <= 8191 && v >= -8192);
+}]>;
+
+// immSExt22 - fits a signed 22-bit field. 'mov r1 = imm22' is the architectural
+// pseudo-op of 'addl r1 = imm22, r0' (the immediate is sign-extended to 64
+// bits), so a single A-slot instruction covers this range without a full 'movl'.
+def immSExt22 : PatLeaf<(i64 imm), [{
+  int64_t v = (int64_t)N->getZExtValue();
+  return (v < (1 << 21) && v >= -(1 << 21));
+}]>;
+
+// imm64 - matches any 64-bit constant; the unpredicated 'movl' catch-all.
+def imm64 : PatLeaf<(i64 imm)>;
+
+// is32ones - the constant 0x00000000FFFFFFFF, the mask of a 32->64 zero-extend
+// ('zxt4'). Lets the i32->i64 zero-extend select to a single instruction.
+def is32ones : PatLeaf<(i64 imm), [{
+  return ((uint64_t)N->getZExtValue() == 0x00000000FFFFFFFFULL);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+def ADD  : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+           "add $dst = $src1, $src2",
+           [(set GR:$dst, (add GR:$src1, GR:$src2))]>, isA;
+
+def ADD1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+           "add $dst = $src1, $src2, 1", []>, isA;
+
+// 'adds' (A4) is a register + signed-14-bit-immediate add. Its register
+// operand r3 may be any GR, so it doubles as a general add-immediate.
+let AddedComplexity = 2 in
+def ADDS : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s14imm:$imm),
+           "adds $dst = $imm, $src1",
+           [(set GR:$dst, (add GR:$src1, immSExt14:$imm))]>, isA;
+
+// 'addl' (A5) carries a wider signed-22-bit immediate, but it pays for those
+// extra bits with a 2-bit register field: its addend must be r0-r3 (in practice
+// only r0, since r1/r2 are reserved and r3 is the lone allocatable member of
+// GR03).
+// Used for constant materialization only.
+def ADDL : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR03:$src1, s22imm:$imm),
+           "addl $dst = $imm, $src1", []>, isA;
+
+def MOVL : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins s64imm:$imm),
+           "movl $dst = $imm",
+           [(set GR:$dst, imm64:$imm)]>, isLX;
+
+// Bare-constant materialization ladder, narrowest-fit first (AddedComplexity
+// 2 > 1 > 0): 'adds rX = imm14, r0' for 14-bit values, 'addl rX = imm22, r0'
+// (the 'mov rX = imm22' pseudo-op; r0 is a valid GR03 addend) for 22-bit, and
+// the unpredicated 'movl' (above) as the catch-all.
+let AddedComplexity = 2 in
+def : Pat<(i64 immSExt14:$imm), (ADDS r0, immSExt14:$imm)>;
+let AddedComplexity = 1 in
+def : Pat<(i64 immSExt22:$imm), (ADDL r0, immSExt22:$imm)>;
+
+def ADDL_GA : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, globaladdress:$imm),
+           "addl $dst = $imm, $src1", []>, isA;
+
+// movl of a relocated symbol: materializes a full 64-bit symbol-relative value
+// directly in the instruction (no GOT). Hand-selected (no DAG pattern) for the
+// local-exec TLS sequence -- 'movl rX = @tprel(sym)' (R_IA64_TPREL64I) -- which
+// must use a direct relocation since local-exec by definition skips the GOT;
+// the 64-bit form dodges the addl imm22/r0-r3 restriction for any tp offset.
+def MOVL_GA : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins globaladdress:$imm),
+           "movl $dst = $imm", []>, isLX;
+
+// addl whose immediate is a call target (printed via printCallOperand).
+def ADDL_EA : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, calltarget:$imm),
+           "addl $dst = $imm, $src1", []>, isA;
+
+def SUB  : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+           "sub $dst = $src1, $src2",
+           [(set GR:$dst, (sub GR:$src1, GR:$src2))]>, isA;
+
+def SUB1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+           "sub $dst = $src1, $src2, 1", []>, isA;
+
+// The pre-removal backend marked these "isTwoAddress"; the modern equivalent
+// is a tied-operand Constraints string (first input tied to the def).
+let Constraints = "$src1 = $dst" in {
+def TPCADDIMM22 : AForm<0x03, 0x0b,
+  (outs GR:$dst), (ins GR:$src1, s22imm:$imm, PR:$qp),
+    "($qp) add $dst = $imm, $dst">, isA;
+def TPCADDS : AForm_DAG<0x03, 0x0b,
+  (outs GR:$dst), (ins GR:$src1, s14imm:$imm, PR:$qp),
+    "($qp) adds $dst = $imm, $dst", []>, isA;
+def TPCMPIMM8NE : AForm<0x03, 0x0b,
+  (outs PR:$dst), (ins PR:$src1, s22imm:$imm, GR:$src2, PR:$qp),
+    "($qp) cmp.ne $dst , p0 = $imm, $src2">, isA;
+}
+
+// normal sign/zero-extends
+def SXT1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt1 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i8))]>, isI;
+def ZXT1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt1 $dst = $src",
+           [(set GR:$dst, (and GR:$src, 255))]>, isI;
+def SXT2 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt2 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i16))]>, isI;
+def ZXT2 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt2 $dst = $src",
+           [(set GR:$dst, (and GR:$src, 65535))]>, isI;
+def SXT4 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt4 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i32))]>, isI;
+def ZXT4 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt4 $dst = $src",
+           [(set GR:$dst, (and GR:$src, is32ones))]>, isI;
+
+// fixme: shrs vs shru?
+def MIX1L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix1.l $dst = $src1, $src2", []>, isI;
+def MIX2L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix2.l $dst = $src1, $src2", []>, isI;
+def MIX4L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix4.l $dst = $src1, $src2", []>, isI;
+def MIX1R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix1.r $dst = $src1, $src2", []>, isI;
+def MIX2R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix2.r $dst = $src1, $src2", []>, isI;
+def MIX4R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "mix4.r $dst = $src1, $src2", []>, isI;
+
+def GETFSIGD : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+  "getf.sig $dst = $src", []>, isM;
+def SETFSIGD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+  "setf.sig $dst = $src", []>, isM;
+
+def XMALD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "xma.l $dst = $src1, $src2, $src3", []>, isF;
+def XMAHD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "xma.h $dst = $src1, $src2, $src3", []>, isF;
+def XMAHUD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "xma.hu $dst = $src1, $src2, $src3", []>, isF;
+
+// IA-64 has no integer multiply unit; 'mul' is synthesized through the FP
+// fixed-point multiply-add (xma) pipe. Move both operands into FP registers
+// (setf.sig), multiply-add with F0 (the fixed +0.0) as the addend so the
+// result is just the product -- xma.l yields the low 64 bits, which is exactly
+// the low-half integer 'mul' -- then move the result back to a GR (getf.sig).
+// The high 64 bits of a 64x64 product come from the same pipe: xma.hu for the
+// unsigned high (mulhu), xma.h for the signed high (mulhs). These feed clang's
+// divide-by-constant strength reduction (a magic-number multiply-high).
+def : Pat<(mul GR:$src1, GR:$src2),
+          (GETFSIGD (f64 (XMALD (f64 (SETFSIGD GR:$src1)),
+                                (f64 (SETFSIGD GR:$src2)), (f64 F0))))>;
+def : Pat<(mulhu GR:$src1, GR:$src2),
+          (GETFSIGD (f64 (XMAHUD (f64 (SETFSIGD GR:$src1)),
+                                 (f64 (SETFSIGD GR:$src2)), (f64 F0))))>;
+def : Pat<(mulhs GR:$src1, GR:$src2),
+          (GETFSIGD (f64 (XMAHD (f64 (SETFSIGD GR:$src1)),
+                                (f64 (SETFSIGD GR:$src2)), (f64 F0))))>;
+
+def AND   : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "and $dst = $src1, $src2",
+          [(set GR:$dst, (and GR:$src1, GR:$src2))]>, isA;
+def ANDCM : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "andcm $dst = $src1, $src2", []>, isA;
+// TODO: and/andcm/or/xor/add/sub/shift immediate forms
+def OR    : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "or $dst = $src1, $src2",
+          [(set GR:$dst, (or GR:$src1, GR:$src2))]>, isA;
+
+def pOR   : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+          "($qp) or $dst = $src1, $src2">, isA;
+
+// the following are all a bit unfortunate: we throw away the complement
+// of the compare!
+def CMPEQ : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.eq $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETEQ))]>, isA;
+def CMPGT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.gt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETGT))]>, isA;
+def CMPGE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.ge $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETGE))]>, isA;
+def CMPLT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.lt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETLT))]>, isA;
+def CMPLE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.le $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETLE))]>, isA;
+def CMPNE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.ne $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETNE))]>, isA;
+def CMPLTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.ltu $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETULT))]>, isA;
+def CMPGTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.gtu $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETUGT))]>, isA;
+def CMPLEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.leu $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETULE))]>, isA;
+def CMPGEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+          "cmp.geu $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc GR:$src1, GR:$src2, SETUGE))]>, isA;
+
+// FP compares. Each sets a predicate from an IA-64 fcmp relation. The ordered
+// relations (eq/lt/le/gt/ge) are false when an operand is NaN; the negated
+// relations (nlt/nle/ngt/nge) and unord are true when unordered -- exactly the
+// U-prefixed setcc semantics. SETONE/SETUEQ have no single relation and are
+// expanded by the legalizer (setCondCodeAction) into a pair of these joined by
+// the i1 and/or patterns.
+def FCMPEQ : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.eq $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETOEQ))]>, isF;
+def FCMPGT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.gt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETOGT))]>, isF;
+def FCMPGE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.ge $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETOGE))]>, isF;
+def FCMPLT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.lt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETOLT))]>, isF;
+def FCMPLE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.le $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETOLE))]>, isF;
+def FCMPNE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.neq $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETUNE))]>, isF;
+def FCMPORD : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.ord $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETO))]>, isF;
+def FCMPUNORD : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.unord $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETUO))]>, isF;
+def FCMPNLT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.nlt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETUGE))]>, isF;
+def FCMPNLE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.nle $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETUGT))]>, isF;
+def FCMPNGT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.ngt $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETULE))]>, isF;
+def FCMPNGE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+          "fcmp.nge $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (setcc (f64 FP:$src1), (f64 FP:$src2), SETULT))]>, isF;
+
+// The "plain" (NaN-don't-care) FP conditions, which the optimizer emits when
+// NaN ordering is irrelevant (e.g. comparing a folded constant). Map them to
+// the ordered relations -- NaN compares false, matching the hardware fcmp.
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETEQ), (FCMPEQ FP:$a, FP:$b)>;
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETGT), (FCMPGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETGE), (FCMPGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETLT), (FCMPLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETLE), (FCMPLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f64 FP:$a), (f64 FP:$b), SETNE), (FCMPNE FP:$a, FP:$b)>;
+
+// f32 comparisons reuse the same (precision-independent) FCMP* instructions:
+// the register compare looks at the full 82-bit register-format value, so a
+// single-precision operand needs no separate compare. Mirror the f64 rules.
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETOEQ), (FCMPEQ FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETOGT), (FCMPGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETOGE), (FCMPGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETOLT), (FCMPLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETOLE), (FCMPLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETUNE), (FCMPNE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETO),   (FCMPORD FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETUO),  (FCMPUNORD FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETUGE), (FCMPNLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETUGT), (FCMPNLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETULE), (FCMPNGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETULT), (FCMPNGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETEQ),  (FCMPEQ FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETGT),  (FCMPGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETGE),  (FCMPGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETLT),  (FCMPLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETLE),  (FCMPLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f32 FP:$a), (f32 FP:$b), SETNE),  (FCMPNE FP:$a, FP:$b)>;
+
+def PCMPEQUNCR0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$qp),
+    "($qp) cmp.eq.unc $dst, p0 = r0, r0">, isA;
+
+// materialization of PR (i1) constants using CMPNE/CMPEQ
+def : Pat<(i1 0), (CMPNE r0, r0)>;
+def : Pat<(i1 1), (CMPEQ r0, r0)>;
+def : Pat<(i1 -1), (CMPEQ r0, r0)>;
+
+let Constraints = "$bogus = $dst" in {
+  def TPCMPEQR0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$bogus, PR:$qp),
+    "($qp) cmp.eq $dst, p0 = r0, r0">, isA;
+  def TPCMPNER0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$bogus, PR:$qp),
+    "($qp) cmp.ne $dst, p0 = r0, r0">, isA;
+}
+
+def XOR   : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "xor $dst = $src1, $src2",
+          [(set GR:$dst, (xor GR:$src1, GR:$src2))]>, isA;
+
+def SHLADD: AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1,s64imm:$imm,GR:$src2),
+          "shladd $dst = $src1, $imm, $src2", []>, isA;
+
+def SHL   : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "shl $dst = $src1, $src2",
+          [(set GR:$dst, (shl GR:$src1, GR:$src2))]>, isI;
+
+def SHRU  : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "shr.u $dst = $src1, $src2",
+          [(set GR:$dst, (srl GR:$src1, GR:$src2))]>, isI;
+
+def SHRS  : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+          "shr $dst = $src1, $src2",
+          [(set GR:$dst, (sra GR:$src1, GR:$src2))]>, isI;
+
+def MOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "mov $dst = $src">, isA;
+// Restore ar.pfs from a general register: 'mov ar.pfs = rN'. ar.pfs lives in
+// its own (AR) register class, so this is a distinct instruction from the
+// GR->GR MOV above; copyPhysReg emits it for copies into AR_PFS.
+def MOV_TO_AR_PFS : AForm<0x03, 0x0b, (outs AR:$dst), (ins GR:$src),
+  "mov $dst = $src">, isI; // ar.pfs (ar64) is an I-type AR: I-unit only
+// Move a general register into a branch register: 'mov b6 = rN'. Like ar.pfs
+// above, b6 is in its own (BR) class; copyPhysReg emits this for copies into it.
+def MOV_TO_BR : AForm<0x03, 0x0b, (outs BR:$dst), (ins GR:$src),
+  "mov $dst = $src">, isI; // 'mov bN = rN' (I21) issues on the I unit only
+def FMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "mov $dst = $src">, isF; // XXX: there _is_ no fmov
+def PMOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src, PR:$qp),
+  "($qp) mov $dst = $src">, isA;
+
+def SPILL_ALL_PREDICATES_TO_GR : AForm<0x03, 0x0b, (outs GR:$dst), (ins),
+  "mov $dst = pr">, isI;
+def FILL_ALL_PREDICATES_FROM_GR : AForm<0x03, 0x0b, (outs), (ins GR:$src),
+  "mov pr = $src">, isI;
+
+let Constraints = "$src2 = $dst" in {
+  def CMOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src2, GR:$src, PR:$qp),
+    "($qp) mov $dst = $src">, isA;
+}
+
+def PFMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src, PR:$qp),
+  "($qp) mov $dst = $src">, isF;
+
+let Constraints = "$src2 = $dst" in {
+  def CFMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src2, FP:$src, PR:$qp),
+    "($qp) mov $dst = $src">, isF;
+}
+
+// IA-64 predicates rather than branching: lower (select p, t, f) to a single
+// predicated move whose *false* value enters through the tied default operand,
+// predicating only the *true* arm:
+//   mov dst = f  ;;  (p) mov dst = t
+// The complement predicate is never needed, so no bidirectional compare is
+// required; the mandatory stop after the predicate-defining cmp covers the
+// def->use hazard regardless. The i1 condition is already a PR (our setcc
+// yields one), so select(setcc(..), t, f) consumes the cmp we already emit.
+def : Pat<(i64 (select PR:$qp, GR:$t, GR:$f)), (CMOV  GR:$f, GR:$t, PR:$qp)>;
+def : Pat<(f32 (select PR:$qp, FP:$t, FP:$f)), (CFMOV FP:$f, FP:$t, PR:$qp)>;
+def : Pat<(f64 (select PR:$qp, FP:$t, FP:$f)), (CFMOV FP:$f, FP:$t, PR:$qp)>;
+// Selecting one predicate from two: there is no predicated predicate-move, so
+// widen each arm to a 0/1 GR (the zext-PR sequence), CMOV those, and compare the
+// result back to a predicate (!= 0). Mirrors the pre-removal SELECTBOOL.
+def : Pat<(i1 (select PR:$qp, PR:$t, PR:$f)),
+          (CMPNE (CMOV (TPCADDS (ADDS r0, 0), 1, PR:$f),
+                       (TPCADDS (ADDS r0, 0), 1, PR:$t), PR:$qp),
+                 r0)>;
+
+// load constants of various sizes // FIXME: prettyprint -ve constants
+
+// TODO: support postincrement (reg, imm9) loads+stores - this needs more
+// tablegen support
+
+def IUSE : PseudoInstIA64<(outs), (ins variable_ops), "// IUSE">;
+// Call-frame setup/teardown. Selected directly from callseq_start/callseq_end
+// via these patterns (the modern idiom, as on Sparc) so TableGen threads the
+// chain and glue for us -- the pre-removal backend hand-selected these, which
+// predates the two-operand + glue callseq_end shape. They def/use sp (r12);
+// IA64FrameLowering::eliminateCallFramePseudoInstr reads the byte count from
+// operand 0 and turns them into sp adjustments (or erases them).
+let Defs = [r12], Uses = [r12] in {
+def ADJUSTCALLSTACKDOWN : PseudoInstIA64_DAG<(outs),
+                          (ins i64imm:$amt1, i64imm:$amt2),
+                          "// ADJUSTCALLSTACKDOWN",
+                          [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJUSTCALLSTACKUP : PseudoInstIA64_DAG<(outs),
+                        (ins i64imm:$amt1, i64imm:$amt2),
+                        "// ADJUSTCALLSTACKUP",
+                        [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+def ALLOC : AForm<0x03, 0x0b,
+  (outs GR:$dst), (ins i8imm:$inputs, i8imm:$locals, i8imm:$outputs, i8imm:$rotating),
+    "alloc $dst = ar.pfs,$inputs,$locals,$outputs,$rotating">, isM;
+
+let Constraints = "$src2 = $dst" in {
+  def TCMPNE : AForm<0x03, 0x0b,
+  (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4),
+    "cmp.ne $dst, p0 = $src3, $src4">, isA;
+
+  def TPCMPEQOR : AForm<0x03, 0x0b,
+  (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.eq.or $dst, p0 = $src3, $src4">, isA;
+
+  def TPCMPNE : AForm<0x03, 0x0b,
+  (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.ne $dst, p0 = $src3, $src4">, isA;
+
+  def TPCMPEQ : AForm<0x03, 0x0b,
+  (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.eq $dst, p0 = $src3, $src4">, isA;
+}
+
+def MOVSIMM14 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s14imm:$imm),
+  "mov $dst = $imm">, isA;
+def MOVSIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s22imm:$imm),
+  "mov $dst = $imm">, isA;
+def MOVLIMM64 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s64imm:$imm),
+  "movl $dst = $imm">, isLX;
+
+def SHLI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+  "shl $dst = $src1, $imm">, isI;
+def SHRUI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+  "shr.u $dst = $src1, $imm">, isI;
+def SHRSI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+  "shr $dst = $src1, $imm">, isI;
+
+def EXTRU : AForm<0x03, 0x0b,
+  (outs GR:$dst), (ins GR:$src1, u6imm:$imm1, u6imm:$imm2),
+  "extr.u $dst = $src1, $imm1, $imm2">, isI;
+
+// extend PR into GR. zext/anyext set the low bit (0/1); sext fills the whole
+// register, since for a 1-bit value the bit is the sign (set -> all-ones, -1).
+def : Pat<(zext PR:$src),   (TPCADDS (ADDS r0, 0),  1, PR:$src)>;
+def : Pat<(anyext PR:$src), (TPCADDS (ADDS r0, 0),  1, PR:$src)>;
+def : Pat<(sext PR:$src),   (TPCADDS (ADDS r0, 0), -1, PR:$src)>;
+
+// truncate GR into PR: the predicate is the low bit (extr.u then compare != 0)
+def : Pat<(i1 (trunc GR:$src)), (CMPNE (EXTRU GR:$src, 0, 1), r0)>;
+
+def DEPZ : AForm<0x03, 0x0b,
+  (outs GR:$dst), (ins GR:$src1, u6imm:$imm1, u6imm:$imm2),
+  "dep.z $dst = $src1, $imm1, $imm2">, isI;
+
+def PCMPEQOR : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.eq.or $dst, p0 = $src1, $src2">, isA;
+def PCMPEQUNC : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.eq.unc $dst, p0 = $src1, $src2">, isA;
+def PCMPNE : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.ne $dst, p0 = $src1, $src2">, isA;
+
+// two destinations: $dst1 receives the compare result, $dst2 its complement
+def BCMPEQ : AForm<0x03, 0x0b, (outs PR:$dst1, PR:$dst2), (ins GR:$src1, GR:$src2),
+  "cmp.eq $dst1, $dst2 = $src1, $src2">, isA;
+
+// Binary logical operations (implemented using conditional assignment)
+def : Pat<(i1 (or  PR:$src1, PR:$src2)),
+  // $dst <- 0
+  // if ($src1) $dst <- 1
+  // if ($src2) $dst <- 1
+  (TPCMPEQ (TPCMPEQ (CMPNE r0, r0), r0, r0, $src1), r0, r0, $src2)>;
+def : Pat<(i1 (and PR:$src1, PR:$src2)),
+  // $dst <- $src1
+  // $aux <- 1
+  // if ($src2) $aux <- 0
+  // if ($aux)  $dst <- 0
+  (TPCMPNE (TPCMPEQ (CMPNE r0, r0), r0, r0, $src1), r0, r0,
+    (TPCMPNE (CMPEQ r0, r0), r0, r0, $src2))>;
+def : Pat<(i1 (xor PR:$src1, PR:$src2)),
+  // $aux1 <- $src1
+  // if ($src2) $aux1 <- 0
+  // $aux2 <- $src2
+  // if ($src1) $aux2 <- 0
+  // $dst <- $aux1
+  // if ($aux2) $dst <- 1
+  (TPCMPEQ
+    (TPCMPNE (TPCMPEQ (CMPNE r0, r0), r0, r0, $src1), r0, r0, $src2),
+    r0, r0,
+    (TPCMPNE (TPCMPEQ (CMPNE r0, r0), r0, r0, $src2), r0, r0, $src1))>;
+
+def ADDIMM14 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s14imm:$imm),
+  "adds $dst = $imm, $src1">, isA;
+
+def ADDIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s22imm:$imm),
+  "add $dst = $imm, $src1">, isA;
+def CADDIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s22imm:$imm, PR:$qp),
+  "($qp) add $dst = $imm, $src1">, isA;
+
+def SUBIMM8 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s8imm:$imm, GR:$src2),
+  "sub $dst = $imm, $src2">, isA;
+
+let mayStore = 1 in {
+  def ST1 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+    "st1 [$dstPtr] = $value">, isM;
+  def ST2 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+    "st2 [$dstPtr] = $value">, isM;
+  def ST4 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+    "st4 [$dstPtr] = $value">, isM;
+  def ST8 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+    "st8 [$dstPtr] = $value">, isM;
+  def STF4 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+    "stfs [$dstPtr] = $value">, isM;
+  def STF8 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+    "stfd [$dstPtr] = $value">, isM;
+  def STF_SPILL : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+    "stf.spill [$dstPtr] = $value">, isM;
+  // Store the 80-bit double-extended (long double) memory format -- 10 bytes,
+  // the C 'long double' object format. Distinct from stf.spill (16-byte raw
+  // register format used for vreg spills).
+  def STFE : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+    "stfe [$dstPtr] = $value">, isM;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1 in {
+  def LD1 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+    "ld1 $dst = [$srcPtr]">, isM;
+  def LD2 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+    "ld2 $dst = [$srcPtr]">, isM;
+  def LD4 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+    "ld4 $dst = [$srcPtr]">, isM;
+  def LD8 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+    "ld8 $dst = [$srcPtr]">, isM;
+  def LDF4 : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+    "ldfs $dst = [$srcPtr]">, isM;
+  def LDF8 : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+    "ldfd $dst = [$srcPtr]">, isM;
+  def LDF_FILL : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+    "ldf.fill $dst = [$srcPtr]">, isM;
+  // Load the 80-bit double-extended (long double) memory format; see STFE.
+  def LDFE : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+    "ldfe $dst = [$srcPtr]">, isM;
+}
+
+def POPCNT : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src),
+  "popcnt $dst = $src", [(set GR:$dst, (ctpop GR:$src))]>, isI; // selects ctpop
+
+// Memory fence. IA-64 ld/st are unordered beyond data dependence, so every
+// atomic ordering above monotonic is realised as an 'mf' barrier (the type
+// legalizer / AtomicExpand reduces acquire/release/seq_cst to monotonic
+// access + fence). One full fence covers all of them. The atomic_fence node
+// carries chain + side-effect (but not mayLoad/mayStore), so hasSideEffects=1
+// matches its inferred properties and keeps the mf from being reordered or
+// deleted.
+let hasSideEffects = 1 in
+def MF : AForm<0x03, 0x0b, (outs), (ins), "mf">, isM;
+def : Pat<(atomic_fence timm, timm), (MF)>; // both fence operands are wildcards
+
+// Atomic compare-and-exchange. The IA-64 cmpxchg takes its comparand from the
+// ar.ccv application register, so the comparand is first moved there by
+// MOV_TO_AR_CCV; the cmpxchg then atomically compares ar.ccv against the memory
+// word, stores $new on a match, and returns the original word (zero-extended
+// into the GR for the narrow forms). Both are selected by hand in
+// IA64ISelDAGToDAG: the mov is glued immediately before the cmpxchg, and the
+// ar.ccv physreg def/use makes the bundler insert the required stop between
+// them. The .acq completer gives the access acquire ordering; the
+// release/seq_cst side is an 'mf' the selector prepends.
+let Defs = [AR_CCV] in // writes the comparand register read by CMPXCHG* below
+def MOV_TO_AR_CCV : AForm<0x03, 0x0b, (outs), (ins GR:$src),
+  "mov ar.ccv = $src">, isM;
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1, Uses = [AR_CCV] in {
+  def CMPXCHG1 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$ptr, GR:$new),
+    "cmpxchg1.acq $dst = [$ptr], $new, ar.ccv">, isM; // 1-byte form
+  def CMPXCHG2 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$ptr, GR:$new),
+    "cmpxchg2.acq $dst = [$ptr], $new, ar.ccv">, isM; // 2-byte form
+  def CMPXCHG4 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$ptr, GR:$new),
+    "cmpxchg4.acq $dst = [$ptr], $new, ar.ccv">, isM; // 4-byte form
+  def CMPXCHG8 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$ptr, GR:$new),
+    "cmpxchg8.acq $dst = [$ptr], $new, ar.ccv">, isM; // 8-byte form
+}
+
+// Floating-point arithmetic (double- and single-precision forms):
+// Double-precision (f64) arithmetic carries the .d completer (53-bit rounding);
+// the unsuffixed forms below are the extended (f80) ops, and the .s forms are
+// single (f32). Naming mirrors the completer: FADDD/.d, FADD/none, FADDS/.s.
+def FADDD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fadd.d $dst = $src1, $src2",
+  [(set FP:$dst, (f64 (fadd FP:$src1, FP:$src2)))]>, isF;
+def FADDS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fadd.s $dst = $src1, $src2",
+  [(set FP:$dst, (f32 (fadd FP:$src1, FP:$src2)))]>, isF;
+def FSUBD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fsub.d $dst = $src1, $src2",
+  [(set FP:$dst, (f64 (fsub FP:$src1, FP:$src2)))]>, isF;
+def FSUBS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fsub.s $dst = $src1, $src2",
+  [(set FP:$dst, (f32 (fsub FP:$src1, FP:$src2)))]>, isF;
+def FMPYD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fmpy.d $dst = $src1, $src2",
+  [(set FP:$dst, (f64 (fmul FP:$src1, FP:$src2)))]>, isF;
+def FMPYS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fmpy.s $dst = $src1, $src2",
+  [(set FP:$dst, (f32 (fmul FP:$src1, FP:$src2)))]>, isF;
+// Fused multiply-add forms. These match the ISD::FMA node (a single-rounding
+// a*b+c), which the DAG only forms when FP contraction is permitted -- so unlike
+// the pre-removal (fadd (fmul ...)) pattern they never silently fuse. fms is
+// a*b-c, fnma is c-a*b (= (-a)*b+c).
+def FMAD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fma.d $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f64 (fma FP:$src1, FP:$src2, FP:$src3)))]>, isF;
+def FMSD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fms.d $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f64 (fma FP:$src1, FP:$src2, (fneg FP:$src3))))]>, isF;
+def FNMAD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fnma.d $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f64 (fma (fneg FP:$src1), FP:$src2, FP:$src3)))]>, isF;
+// f32 single-rounding forms (fma.s). compiler_builtins' libm calls
+// llvm.fma.f32 directly, which produces an f32 ISD::FMA node regardless of
+// contraction, so f32 needs its own pattern.
+def FMAS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fma.s $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f32 (fma FP:$src1, FP:$src2, FP:$src3)))]>, isF;
+def FMSS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fms.s $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f32 (fma FP:$src1, FP:$src2, (fneg FP:$src3))))]>, isF;
+def FNMAS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fnma.s $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f32 (fma (fneg FP:$src1), FP:$src2, FP:$src3)))]>, isF;
+
+// Operations only affecting sign do not care about width, so there is just one
+// instruction for all of f32/f64/f80.
+def FABS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fabs $dst = $src", []>, isF;
+def FNEG : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fneg $dst = $src", []>, isF;
+def FNEGABS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fnegabs $dst = $src", []>, isF;
+foreach vt = [f32, f64, f80] in { // the defs have empty pattern lists; select here
+  def : Pat<(vt (fabs vt:$src)), (FABS FP:$src)>;
+  def : Pat<(vt (fneg vt:$src)), (FNEG FP:$src)>;
+  def : Pat<(vt (fneg (fabs vt:$src))), (FNEGABS FP:$src)>;
+} // fneg(fabs x) is folded into the single FNEGABS
+
+// Extended-precision (f80 / 'long double') arithmetic. With no precision
+// completer the F-unit rounds to the dynamic precision of status field sf0,
+// which the SysV ABI initialises to 80-bit double-extended -- exactly the C
+// long double. (FADDD etc. force .d / 53-bit; FADDS forces .s.) Memory format
+// is the 10-byte ldfe/stfe; the 82-bit register holds it exactly.
+def FADD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fadd $dst = $src1, $src2",
+  [(set FP:$dst, (f80 (fadd FP:$src1, FP:$src2)))]>, isF;
+def FSUB : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fsub $dst = $src1, $src2",
+  [(set FP:$dst, (f80 (fsub FP:$src1, FP:$src2)))]>, isF;
+def FMPY : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+  "fmpy $dst = $src1, $src2",
+  [(set FP:$dst, (f80 (fmul FP:$src1, FP:$src2)))]>, isF;
+def FMA : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fma $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f80 (fma FP:$src1, FP:$src2, FP:$src3)))]>, isF; // a*b + c
+def FMS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fms $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f80 (fma FP:$src1, FP:$src2, (fneg FP:$src3))))]>, isF; // a*b - c
+def FNMA : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "fnma $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (f80 (fma (fneg FP:$src1), FP:$src2, FP:$src3)))]>, isF; // c - a*b
+
+let Constraints = "$bogussrc = $dst" in { // $dst is tied to input $bogussrc
+def TCFMAS1 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def TCFMADS0 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+}
+// Predicated fma/fnma naming a status field (.s0/.s1); $dst is not tied here.
+def CFMAS1 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def CFNMAS1 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fnma.s1 $dst = $src1, $src2, $src3">, isF;
+// The same predicated forms with the .d (double-precision) completer.
+def CFMADS1 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s1 $dst = $src1, $src2, $src3">, isF;
+def CFMADS0 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+def CFNMADS1 : AForm<0x03, 0x0b,
+  (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fnma.d.s1 $dst = $src1, $src2, $src3">, isF;
+
+def FRCPAS0 : AForm<0x03, 0x0b, (outs FP:$dstFR, PR:$dstPR), (ins FP:$src1, FP:$src2),
+  "frcpa.s0 $dstFR, $dstPR = $src1, $src2">, isF; // two results: an FP and a PR
+def FRCPAS1 : AForm<0x03, 0x0b, (outs FP:$dstFR, PR:$dstPR), (ins FP:$src1, FP:$src2),
+  "frcpa.s1 $dstFR, $dstPR = $src1, $src2">, isF; // same, status field .s1
+// xma.l: three-operand multiply-add whose operands all live in FP registers.
+def XMAL : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+  "xma.l $dst = $src1, $src2, $src3">, isF;
+
+def FCVTXF : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.xf $dst = $src">, isF; // signed integer -> FP (used for sint_to_fp)
+def FCVTXUF : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.xuf $dst = $src">, isF; // unsigned integer -> FP (used for uint_to_fp)
+def FCVTXUFS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.xuf.s1 $dst = $src">, isF;
+def FCVTFX : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fx $dst = $src">, isF; // FP -> signed integer
+def FCVTFXU : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fxu $dst = $src">, isF; // FP -> unsigned integer
+// .trunc forms: FP -> integer, rounding toward zero (used for fp_to_[su]int).
+def FCVTFXTRUNC : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fx.trunc $dst = $src">, isF;
+def FCVTFXUTRUNC : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fxu.trunc $dst = $src">, isF;
+// The same truncating conversions with the .s1 status-field completer.
+def FCVTFXTRUNCS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fx.trunc.s1 $dst = $src">, isF;
+def FCVTFXUTRUNCS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fcvt.fxu.trunc.s1 $dst = $src">, isF;
+// fnorm.d / fnorm.s: round a register value to double / single precision.
+def FNORMD : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fnorm.d $dst = $src">, isF;
+def FNORMS : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+  "fnorm.s $dst = $src">, isF;
+
+// FP register<->register rounding/widening.
+// Rounding uses FNORM-type instructions (FNORMD for f80->f64, FNORMS for
+// f80/f64->f32).
+// Widening needs no instruction as all sources of f32/f64 are correctly
+// widened already.
+def : Pat<(f64 (fpround  FP:$src)),         (FNORMD FP:$src)>;  // f80 -> f64
+def : Pat<(f32 (fpround (f64 FP:$src))),    (FNORMS FP:$src)>;  // f64 -> f32
+def : Pat<(f32 (fpround (f80 FP:$src))),    (FNORMS FP:$src)>;  // f80 -> f32
+def : Pat<(f64 (fpextend FP:$src)),         (COPY_TO_REGCLASS FP:$src, FP)>;  // f32 -> f64
+def : Pat<(f80 (fpextend (f32 FP:$src))),   (COPY_TO_REGCLASS FP:$src, FP)>;  // f32 -> f80
+def : Pat<(f80 (fpextend (f64 FP:$src))),   (COPY_TO_REGCLASS FP:$src, FP)>;  // f64 -> f80
+
+// f80 comparisons reuse the (precision-independent) FCMP* instructions; the
+// register compare looks at the full 82-bit value. Mirror the f64 SETCC rules.
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETOEQ), (FCMPEQ FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETOGT), (FCMPGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETOGE), (FCMPGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETOLT), (FCMPLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETOLE), (FCMPLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETUNE), (FCMPNE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETO),   (FCMPORD FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETUO),  (FCMPUNORD FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETUGE), (FCMPNLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETUGT), (FCMPNLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETULE), (FCMPNGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETULT), (FCMPNGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETEQ),  (FCMPEQ FP:$a, FP:$b)>;  // SETEQ..SETLE: same opcode as the SETO* form
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETGT),  (FCMPGT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETGE),  (FCMPGE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETLT),  (FCMPLT FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETLE),  (FCMPLE FP:$a, FP:$b)>;
+def : Pat<(setcc (f80 FP:$a), (f80 FP:$b), SETNE),  (FCMPNE FP:$a, FP:$b)>;  // SETNE: same opcode as SETUNE
+
+// Integer <-> floating-point conversions.
+// Converted integer lands in f80 form (full 82-bit width), normalize using
+// FNORM if f64/f32 requested.
+// Converted float is always rounded toward zero (.trunc variant of fcvt.fx).
+def : Pat<(f80 (sint_to_fp GR:$src)), (FCVTXF  (SETFSIGD GR:$src))>;
+def : Pat<(f80 (uint_to_fp GR:$src)), (FCVTXUF (SETFSIGD GR:$src))>;
+def : Pat<(f64 (sint_to_fp GR:$src)), (FNORMD (FCVTXF (SETFSIGD GR:$src)))>;
+def : Pat<(f64 (uint_to_fp GR:$src)), (FNORMD (FCVTXUF (SETFSIGD GR:$src)))>;
+def : Pat<(f32 (sint_to_fp GR:$src)), (FNORMS (f64 (FCVTXF  (f64 (SETFSIGD GR:$src)))))>;
+def : Pat<(f32 (uint_to_fp GR:$src)), (FNORMS (f64 (FCVTXUF (f64 (SETFSIGD GR:$src)))))>;
+foreach vt = [f32, f64, f80] in { // one fcvt + GETFSIGD pair serves every width
+  def : Pat<(i64 (fp_to_sint (vt FP:$src))), (GETFSIGD (FCVTFXTRUNC  FP:$src))>;
+  def : Pat<(i64 (fp_to_uint (vt FP:$src))), (GETFSIGD (FCVTFXUTRUNC FP:$src))>;
+}
+
+def GETFD : AForm<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+  "getf.d $dst = $src">, isM; // FP reg -> GR (selects the f64 -> i64 bitconvert)
+def SETFD : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+  "setf.d $dst = $src">, isM; // GR -> FP reg (selects the i64 -> f64 bitconvert)
+// getf.sig / setf.sig: the same GR <-> FP register moves, '.sig' form.
+def GETFSIG : AForm<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+  "getf.sig $dst = $src">, isM;
+def SETFSIG : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+  "setf.sig $dst = $src">, isM;
+
+// f64/f32 immediates: MOVL the IEEE-double bit pattern into a GR, then SETFD.
+def fpimm_bits64 : SDNodeXForm<fpimm, [{
+  APFloat F = N->getValueAPF();
+  bool Ignored; // receives convert()'s loses-info result; not inspected
+  F.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Ignored);
+  return CurDAG->getTargetConstant(F.bitcastToAPInt().getZExtValue(),
+                                   SDLoc(N), MVT::i64);
+}]>;
+def : Pat<(f64 fpimm:$v), (SETFD (MOVL (fpimm_bits64 $v)))>;
+def : Pat<(f32 fpimm:$v), (SETFD (MOVL (fpimm_bits64 $v)))>;
+
+// Reinterpret an f64's bits as an i64 and back (getf.d / setf.d), the 64-bit
+// IEEE memory form -- used to pass a variadic double in a general register
+// (see CC_IA64_Call_VarArgFP).
+def : Pat<(i64 (bitconvert (f64 FP:$src))), (GETFD FP:$src)>;
+def : Pat<(f64 (bitconvert (i64 GR:$src))), (SETFD GR:$src)>;
+
+let isTerminator = 1, isBranch = 1 in {
+  let isBarrier = 1 in // unconditional: the asm string hard-codes (p0)
+  def BRL_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins i64imm:$dst),
+    "(p0) brl.cond.sptk $dst">, isB;
+  def BRLCOND_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, i64imm:$dst),
+    "($qp) brl.cond.sptk $dst">, isB; // conditional on $qp, so not a barrier
+  def BRCOND_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, GR:$dst),
+    "($qp) br.cond.sptk $dst">, isB; // conditional on $qp, so not a barrier
+  // Indirect branch (computed goto): the target address is moved into a branch
+  // register (b6) and branched to. Unconditional, so a barrier.
+  let isBarrier = 1, isIndirectBranch = 1 in
+  def BRINDIRECT : RawForm<0x03, 0xb0, (outs), (ins BR:$target),
+    "(p0) br.cond.sptk $target">, isB;
+}
+
+let isCall = 1, /* isTerminator = 1, isBranch = 1, */
+  // A non-local callee may clobber gp (r1) via an import stub; the call lowering
+  // can't know that statically, so AdjustInstrPostInstrSelection adds r1 to the
+  // Defs of a non-dso_local call after selection. hasPostISelHook routes the
+  // call instructions through that hook.
+  hasPostISelHook = 1,
+// All calls clobber the non-callee-saved registers; for now, they are these.
+// rp (b0) leads the list: br.call writes the return address into it, so every
+// call clobbers it. Without this the rp save/restore LowerCall emits around a
+// call is a no-op round trip (rp looks unchanged) and the coalescer deletes it,
+// leaving b0 corrupted across the call -> the function's own br.ret jumps to
+// garbage (observed as SIGILL on a recursive call).
+  Defs = [rp,
+  r2,r3,r8,r9,r10,r11,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,
+  r25,r26,r27,r28,r29,r30,r31,
+  p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,
+  F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,
+  F32,F33,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,
+  F50,F51,F52,F53,F54,F55,F56,
+  F57,F58,F59,F60,F61,F62,F63,F64,F65,F66,F67,F68,F69,F70,F71,F72,F73,F74,
+  F75,F76,F77,F78,F79,F80,F81,
+  F82,F83,F84,F85,F86,F87,F88,F89,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99,
+  F100,F101,F102,F103,F104,F105,
+  F106,F107,F108,F109,F110,F111,F112,F113,F114,F115,F116,F117,F118,F119,
+  F120,F121,F122,F123,F124,F125,F126,F127,
+  out0,out1,out2,out3,out4,out5,out6,out7] in {
+// old pattern call
+  def BRCALL: RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;   // FIXME: teach llvm about branch regs?
+
+// calls a globaladdress
+  def BRCALL_IPREL_GA : RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;       // FIXME: teach llvm about branch regs?
+// calls an externalsymbol
+  def BRCALL_IPREL_ES : RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;       // FIXME: teach llvm about branch regs?
+// calls through a function descriptor
+  def BRCALL_INDIRECT : RawForm<0x03, 0xb0, (outs), (ins BR:$branchreg),
+  "br.call.sptk rp = $branchreg">, isB;
+  def BRLCOND_CALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, i64imm:$dst),
+    "($qp) brl.cond.call.sptk $dst">, isB; // predicated call, immediate target
+  def BRCOND_CALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, GR:$dst),
+    "($qp) br.cond.call.sptk $dst">, isB; // predicated call, target in a GR
+}
+
+// Return branch: br.ret reads the return pointer (b0/rp) and ar.pfs (to restore
+// the caller's register stack frame). Declaring those uses lets the bundling
+// pass insert the mandatory stop between the epilogue's 'mov rp = <reg>' /
+// 'mov ar.pfs = <reg>' and this branch; without it br.ret could read the stale
+// values from the same instruction group (and writing ar.pfs in br.ret's group
+// is an RSE hazard).
+let isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [rp, AR_PFS] in
+  def RET : AForm_DAG<0x03, 0x0b, (outs), (ins),
+            "br.ret.sptk.many rp",
+            [(retflag)]>, isB; // return; isBarrier: control never falls through
+
+// Instruction-group stop: a pseudo that prints as ';;'.
+def STOP : PseudoInstIA64<(outs), (ins variable_ops), ";;">;
diff --git llvm/lib/Target/IA64/IA64MCInstLower.cpp llvm/lib/Target/IA64/IA64MCInstLower.cpp
new file mode 100644
index 000000000000..59cca02ab91d
--- /dev/null
+++ llvm/lib/Target/IA64/IA64MCInstLower.cpp
@@ -0,0 +1,100 @@
+//===-- IA64MCInstLower.cpp - Lower MachineInstr to MCInst ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64MCInstLower.h"
+#include "MCTargetDesc/IA64MCAsmInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+MCOperand IA64MCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+                                              MCSymbol *Sym) const {
+  // Start from the bare symbol and fold in the operand's addend, if it has
+  // one. Jump-table operands are skipped: they carry no offset and
+  // getOffset() asserts on them.
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+  if (!MO.isJTI()) {
+    if (const int64_t Addend = MO.getOffset())
+      Expr = MCBinaryExpr::createAdd(
+          Expr, MCConstantExpr::create(Addend, Ctx), Ctx);
+  }
+
+  // The relocation specifier (e.g. IA64::S_LTOFF) travels in the operand's
+  // target flags and is printed as a wrapper, e.g. "@ltoff(sym)". The
+  // S_LTOFF_* flags mark a value loaded through the GOT whose entry holds a
+  // descriptor / TLS datum; they print nested, e.g. @ltoff(@fptr(sym)) or
+  // @ltoff(@tprel(sym)). Nested stays 0 for every other flag value.
+  const unsigned Flags = MO.getTargetFlags();
+  unsigned Nested = 0;
+  if (Flags == IA64::S_LTOFF_FPTR)
+    Nested = IA64::S_FPTR;
+  else if (Flags == IA64::S_LTOFF_TPREL)
+    Nested = IA64::S_TPREL;
+  else if (Flags == IA64::S_LTOFF_DTPMOD)
+    Nested = IA64::S_DTPMOD;
+  else if (Flags == IA64::S_LTOFF_DTPREL)
+    Nested = IA64::S_DTPREL;
+
+  if (Nested != 0) {
+    // Innermost wrapper first, then the enclosing @ltoff.
+    Expr = MCSpecifierExpr::create(Expr, Nested, Ctx);
+    Expr = MCSpecifierExpr::create(Expr, IA64::S_LTOFF, Ctx);
+  } else if (Flags != 0) {
+    Expr = MCSpecifierExpr::create(Expr, Flags, Ctx);
+  }
+  return MCOperand::createExpr(Expr);
+}
+
+void IA64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  // Translate the explicit operands in order; an operand kind that is not
+  // listed below is a fatal error.
+  for (const MachineOperand &Op : MI->operands()) {
+    // Implicit registers are not part of the asm template, and call-clobber
+    // register masks have no printable form: neither yields an MCOperand.
+    if (Op.isRegMask() || (Op.isReg() && Op.isImplicit()))
+      continue;
+    MCOperand Lowered;
+    switch (Op.getType()) {
+    case MachineOperand::MO_Register:
+      Lowered = MCOperand::createReg(Op.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      Lowered = MCOperand::createImm(Op.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      Lowered = MCOperand::createExpr(
+          MCSymbolRefExpr::create(Op.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Lowered = lowerSymbolOperand(Op, Printer.getSymbol(Op.getGlobal()));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Lowered = lowerSymbolOperand(
+          Op, Printer.GetExternalSymbolSymbol(Op.getSymbolName()));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Lowered = lowerSymbolOperand(Op, Printer.GetJTISymbol(Op.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Lowered = lowerSymbolOperand(Op, Printer.GetCPISymbol(Op.getIndex()));
+      break;
+    default:
+      report_fatal_error("IA64: unsupported operand type in MCInstLower");
+    }
+    OutMI.addOperand(Lowered);
+  }
+}
diff --git llvm/lib/Target/IA64/IA64MCInstLower.h llvm/lib/Target/IA64/IA64MCInstLower.h
new file mode 100644
index 000000000000..abfb3b0e83b2
--- /dev/null
+++ llvm/lib/Target/IA64/IA64MCInstLower.h
@@ -0,0 +1,39 @@
+//===-- IA64MCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_IA64_IA64MCINSTLOWER_H
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+
+/// Lowers a MachineInstr to an MCInst for IA64. The pre-removal backend had no
+/// such class because it printed MachineInstrs directly; modern LLVM routes
+/// machine code through the MC layer instead.
+class IA64MCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+
+public:
+  IA64MCInstLower(MCContext &Context, AsmPrinter &AP)
+      : Ctx(Context), Printer(AP) {}
+
+  MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64MCINSTLOWER_H
diff --git llvm/lib/Target/IA64/IA64MachineFunctionInfo.cpp llvm/lib/Target/IA64/IA64MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..19697ee7cc39
--- /dev/null
+++ llvm/lib/Target/IA64/IA64MachineFunctionInfo.cpp
@@ -0,0 +1,19 @@
+//===-- IA64MachineFunctionInfo.cpp - IA64 machine function info ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void IA64FunctionInfo::anchor() {} // out-of-line definition of the virtual anchor
+// Clone this info into DestMF; Allocator and Src2DstMBB are not used here.
+MachineFunctionInfo *IA64FunctionInfo::clone(
+    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const {
+  return DestMF.cloneInfo<IA64FunctionInfo>(*this);
+}
diff --git llvm/lib/Target/IA64/IA64MachineFunctionInfo.h llvm/lib/Target/IA64/IA64MachineFunctionInfo.h
new file mode 100644
index 000000000000..cb9628c44e2d
--- /dev/null
+++ llvm/lib/Target/IA64/IA64MachineFunctionInfo.h
@@ -0,0 +1,70 @@
+//===-- IA64MachineFunctionInfo.h - IA64 machine function info --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares IA64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_IA64_IA64MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class IA64FunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  // Stacked local holding the incoming ar.pfs: emitPrologue makes 'alloc' write
+  // the incoming ar.pfs into it and emitEpilogue restores ar.pfs from it ahead
+  // of the return. As with SavedRPReg below, it sits just above the locals the
+  // allocator used (so the register stack engine preserves it across calls and
+  // it is never spilled) and is reserved by widening the 'alloc' frame. The
+  // unwinder thus has one fixed location for a '.save ar.pfs, <reg>' directive.
+  Register SavedPFSReg;
+
+  // Frame index of the varargs register save area, i.e. of the slot that holds
+  // the first variadic argument. LowerFormalArguments spills the unnamed
+  // incoming GP registers there; LowerVASTART passes its address to va_start.
+  int VarArgsFrameIndex = 0;
+
+  // Stacked local in which emitPrologue parks the incoming return pointer
+  // (b0/rp) of a non-leaf function. It is chosen just above the locals the
+  // allocator used (so the register stack engine preserves it across calls for
+  // free) and reserved by widening the 'alloc' frame; emitEpilogue restores b0
+  // from it before the return. That gives the unwinder one fixed location for a
+  // '.save rp, <reg>' directive, which the asm printer reads off the FrameSetup
+  // 'mov <reg> = rp'. Stays null for a leaf function, which never clobbers b0.
+  Register SavedRPReg;
+
+public:
+  // Number of 'out' registers this MachineFunction uses; it is used to compute
+  // the matching entry of the 'alloc' instruction at the top of the function.
+  // Set during call lowering.
+  unsigned OutRegsUsed = 0;
+
+  IA64FunctionInfo(const Function &, const TargetSubtargetInfo *) {}
+
+  Register getSavedRPReg() const { return SavedRPReg; }
+  void setSavedRPReg(Register R) { SavedRPReg = R; }
+
+  Register getSavedPFSReg() const { return SavedPFSReg; }
+  void setSavedPFSReg(Register R) { SavedPFSReg = R; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  MachineFunctionInfo *
+  clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+        const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+      const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64MACHINEFUNCTIONINFO_H
diff --git llvm/lib/Target/IA64/IA64RegisterInfo.cpp llvm/lib/Target/IA64/IA64RegisterInfo.cpp
new file mode 100644
index 000000000000..5c6c47508039
--- /dev/null
+++ llvm/lib/Target/IA64/IA64RegisterInfo.cpp
@@ -0,0 +1,177 @@
+//===- IA64RegisterInfo.cpp - IA64 Register Information ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64RegisterInfo.h"
+#include "IA64FrameLowering.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "IA64GenRegisterInfo.inc"
+
+using namespace llvm;
+
+// The 96 stacked general registers in register-stack order (index 0 == r32).
+static const MCPhysReg StackedGPRsInOrder[IA64NumStackedGPRs] = {
+    IA64::r32,  IA64::r33,  IA64::r34,  IA64::r35,  IA64::r36,  IA64::r37,
+    IA64::r38,  IA64::r39,  IA64::r40,  IA64::r41,  IA64::r42,  IA64::r43,
+    IA64::r44,  IA64::r45,  IA64::r46,  IA64::r47,  IA64::r48,  IA64::r49,
+    IA64::r50,  IA64::r51,  IA64::r52,  IA64::r53,  IA64::r54,  IA64::r55,
+    IA64::r56,  IA64::r57,  IA64::r58,  IA64::r59,  IA64::r60,  IA64::r61,
+    IA64::r62,  IA64::r63,  IA64::r64,  IA64::r65,  IA64::r66,  IA64::r67,
+    IA64::r68,  IA64::r69,  IA64::r70,  IA64::r71,  IA64::r72,  IA64::r73,
+    IA64::r74,  IA64::r75,  IA64::r76,  IA64::r77,  IA64::r78,  IA64::r79,
+    IA64::r80,  IA64::r81,  IA64::r82,  IA64::r83,  IA64::r84,  IA64::r85,
+    IA64::r86,  IA64::r87,  IA64::r88,  IA64::r89,  IA64::r90,  IA64::r91,
+    IA64::r92,  IA64::r93,  IA64::r94,  IA64::r95,  IA64::r96,  IA64::r97,
+    IA64::r98,  IA64::r99,  IA64::r100, IA64::r101, IA64::r102, IA64::r103,
+    IA64::r104, IA64::r105, IA64::r106, IA64::r107, IA64::r108, IA64::r109,
+    IA64::r110, IA64::r111, IA64::r112, IA64::r113, IA64::r114, IA64::r115,
+    IA64::r116, IA64::r117, IA64::r118, IA64::r119, IA64::r120, IA64::r121,
+    IA64::r122, IA64::r123, IA64::r124, IA64::r125, IA64::r126, IA64::r127};
+
+MCRegister llvm::getIA64StackedGPR(unsigned Idx) {
+  assert(Idx < IA64NumStackedGPRs && "stacked-GPR index out of range");
+  return StackedGPRsInOrder[Idx];
+}
+
+// rp (the return pointer, branch register b0) is the return-address register.
+IA64RegisterInfo::IA64RegisterInfo() : IA64GenRegisterInfo(IA64::rp) {}
+
+const MCPhysReg *
+IA64RegisterInfo::getCalleeSavedRegs(const MachineFunction * /*MF*/) const {
+  // The IA-64 SysV psABI's static callee-saved general registers are r4-r7
+  // (glibc's setjmp/longjmp carry them in the jmpbuf). They trail the GR
+  // allocation order and are seldom allocated, but in a returns_twice (setjmp)
+  // function LowerCall parks gp/sp/rp in r4/r6/r7 across calls. That is sound
+  // only if every function touching them saves and restores them: as true
+  // CSRs, a nested setjmp call cannot clobber an outer frame's parked values.
+  // r5 is also the frame pointer. The list is terminated by a 0 entry.
+  static const MCPhysReg StaticCSRs[] = {
+      IA64::r4, IA64::r5, IA64::r6, IA64::r7, /*terminator=*/0};
+  return StaticCSRs;
+}
+
+BitVector IA64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(IA64::r0);  // always zero
+  Reserved.set(IA64::r1);  // global data pointer (gp)
+  Reserved.set(IA64::r2);  // reserved for spilling/filling predicates
+  Reserved.set(IA64::r5);  // frame pointer
+  Reserved.set(IA64::r12); // stack pointer (sp)
+  Reserved.set(IA64::r13); // thread pointer (tp)
+  Reserved.set(IA64::r22); // reserved as an address-calculation scratch
+  Reserved.set(IA64::rp);  // return pointer (b0)
+
+  // F0 and F1 are the architectural fixed FP constants +0.0 and +1.0; they are
+  // members of the FP class only so they can be named as explicit operands
+  // (e.g. F0 is the addend in the xma-based integer-multiply sequence). They
+  // must never be allocated as scratch, or the constant they hold is clobbered.
+  Reserved.set(IA64::F0);  // fixed +0.0
+  Reserved.set(IA64::F1);  // fixed +1.0
+
+  // The output registers (out0-out7) are an alias for the top of the stacked
+  // register frame that 'alloc' carves out for passing arguments to callees;
+  // they are not freely allocatable. The pre-removal backend hid them from the
+  // GR allocation order via RegisterClass MethodBodies (a mechanism that no
+  // longer exists); we express that reservation here. They lead the GR
+  // allocation order, so without this the ar.pfs-save GR lands on 'out7',
+  // which is meaningless when 'alloc' declares zero output registers.
+  Reserved.set(IA64::out0);
+  Reserved.set(IA64::out1);
+  Reserved.set(IA64::out2);
+  Reserved.set(IA64::out3);
+  Reserved.set(IA64::out4);
+  Reserved.set(IA64::out5);
+  Reserved.set(IA64::out6);
+  Reserved.set(IA64::out7);
+
+  // ar.pfs is an application register, not a freely allocatable GPR. It is not
+  // a member of GR: IA64RegisterInfo.td gives it its own single-register AR
+  // class so 'mov ar.pfs = rN' / 'alloc rN = ar.pfs' can name it. Per the note
+  // there, that split makes the restore 'mov ar.pfs = rN' a cross-class copy
+  // the coalescer cannot join, so the ar.pfs-save vreg is not folded into
+  // AR_PFS (the nonsensical 'alloc ar.pfs = ar.pfs') and stays a real scratch
+  // GR (r3 for a leaf function). AR is still a register class, so AR_PFS is
+  // reserved here to keep the allocator from ever handing it out.
+  Reserved.set(IA64::AR_PFS);
+  Reserved.set(IA64::B6);  // indirect-call branch target (set up per call site)
+
+  // Cap the stacked-GPR register frame. 'alloc' carves a frame of
+  // (locals + outputs) stacked registers out of r32-r127, and the architecture
+  // limits that frame to 96 registers.
+  // Reserve the last 8 + 2 registers for out0-out7 + the saved return pointer +
+  // the saved ar.pfs. Frame lowering parks rp and ar.pfs in stacked locals just
+  // above the ones the allocator used (see IA64FrameLowering::emitPrologue), so
+  // the allocator must leave two registers below the outputs for them.
+  Reserved.set(IA64::r118);
+  Reserved.set(IA64::r119);
+  Reserved.set(IA64::r120);
+  Reserved.set(IA64::r121);
+  Reserved.set(IA64::r122);
+  Reserved.set(IA64::r123);
+  Reserved.set(IA64::r124);
+  Reserved.set(IA64::r125);
+  Reserved.set(IA64::r126);
+  Reserved.set(IA64::r127);
+  return Reserved;
+}
+
+// Replace the frame-index operand of *II with the scratch register r22 and
+// insert, ahead of *II, code computing r22 = base + object offset + stack size.
+// Returns false: the instruction is rewritten in place, not removed.
+bool IA64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                           int SPAdj, unsigned FIOperandNum,
+                                           RegScavenger * /*RS*/) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Base register: the frame pointer (r5) if the function has one, else sp.
+  Register BaseRegister = getFrameRegister(MF);
+
+  // Frame object offset plus the stack size. Kept 64 bits wide, like the values
+  // both getters return, so the movl path below never sees a truncated offset.
+  int64_t Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+  Offset += MF.getFrameInfo().getStackSize();
+
+  // We use 'r22' as an address-calculation scratch register here.
+  MI.getOperand(FIOperandNum).ChangeToRegister(IA64::r22, false);
+  if (Offset <= 8191 && Offset >= -8192) { // smallish offset
+    BuildMI(MBB, II, DL, TII->get(IA64::ADDIMM22), IA64::r22)
+        .addReg(BaseRegister)
+        .addImm(Offset);
+  } else { // it's big
+    BuildMI(MBB, II, DL, TII->get(IA64::MOVLIMM64), IA64::r22).addImm(Offset);
+    BuildMI(MBB, II, DL, TII->get(IA64::ADD), IA64::r22)
+        .addReg(BaseRegister)
+        .addReg(IA64::r22);
+  }
+
+  return false;
+}
+
+Register IA64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const bool HasFP = MF.getSubtarget().getFrameLowering()->hasFP(MF);
+  return HasFP ? IA64::r5 : IA64::r12; // frame pointer if present, else sp
+}
diff --git llvm/lib/Target/IA64/IA64RegisterInfo.h llvm/lib/Target/IA64/IA64RegisterInfo.h
new file mode 100644
index 000000000000..275d4a2eae4d
--- /dev/null
+++ llvm/lib/Target/IA64/IA64RegisterInfo.h
@@ -0,0 +1,50 @@
+//===- IA64RegisterInfo.h - IA64 Register Information Impl ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64REGISTERINFO_H
+#define LLVM_LIB_TARGET_IA64_IA64REGISTERINFO_H
+
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "IA64GenRegisterInfo.inc"
+
+namespace llvm {
+
+// Number of stacked general registers (r32-r127) the register stack engine
+// manages; 'alloc' carves its inputs/locals/outputs from this window.
+static constexpr unsigned IA64NumStackedGPRs = 96;
+
+// The stacked general register at register-stack index Idx: 0 -> r32, ...,
+// 95 -> r127. The GR enum values are not contiguous (other register classes are
+// interleaved), so this maps an index through an explicit table.
+MCRegister getIA64StackedGPR(unsigned Idx);
+
+struct IA64RegisterInfo : public IA64GenRegisterInfo {
+  IA64RegisterInfo();
+
+  // Code Generation virtual methods.
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  // Registers withheld from allocation: fixed-role GRs, out0-out7, ar.pfs, ...
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  // Rewrites a frame-index operand as r22, set up just before the instruction.
+  bool eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+  // r5 when the function has a frame pointer, otherwise r12 (sp).
+  Register getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64REGISTERINFO_H
diff --git llvm/lib/Target/IA64/IA64RegisterInfo.td llvm/lib/Target/IA64/IA64RegisterInfo.td
new file mode 100644
index 000000000000..5f473c5d1cb1
--- /dev/null
+++ llvm/lib/Target/IA64/IA64RegisterInfo.td
@@ -0,0 +1,518 @@
+//===- IA64RegisterInfo.td - Describe the IA64 Register File *- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 register file, defining the registers
+// themselves, aliases between the registers, and the register classes built
+// out of the registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+class IA64Register<string n> : Register<n> {
+  let Namespace = "IA64";
+}
+
+// GR - One of 128 64-bit general registers
+class GR<bits<7> num, string n> : IA64Register<n> {
+  let HWEncoding{6-0} = num;
+}
+
+// FP - One of 128 82-bit floating-point registers
+class FP<bits<7> num, string n> : IA64Register<n> {
+  let HWEncoding{6-0} = num;
+}
+
+// PR - One of 64 1-bit predicate registers
+class PR<bits<6> num, string n> : IA64Register<n> {
+  let HWEncoding{5-0} = num;
+}
+
+/* general registers */
+def r0 : GR< 0, "r0">, DwarfRegNum<[0]>;
+def r1 : GR< 1, "r1">, DwarfRegNum<[1]>;
+def r2 : GR< 2, "r2">, DwarfRegNum<[2]>;
+def r3 : GR< 3, "r3">, DwarfRegNum<[3]>;
+def r4 : GR< 4, "r4">, DwarfRegNum<[4]>;
+def r5 : GR< 5, "r5">, DwarfRegNum<[5]>;
+def r6 : GR< 6, "r6">, DwarfRegNum<[6]>;
+def r7 : GR< 7, "r7">, DwarfRegNum<[7]>;
+def r8 : GR< 8, "r8">, DwarfRegNum<[8]>;
+def r9 : GR< 9, "r9">, DwarfRegNum<[9]>;
+def r10 : GR< 10, "r10">, DwarfRegNum<[10]>;
+def r11 : GR< 11, "r11">, DwarfRegNum<[11]>;
+def r12 : GR< 12, "r12">, DwarfRegNum<[12]>;
+def r13 : GR< 13, "r13">, DwarfRegNum<[13]>;
+def r14 : GR< 14, "r14">, DwarfRegNum<[14]>;
+def r15 : GR< 15, "r15">, DwarfRegNum<[15]>;
+def r16 : GR< 16, "r16">, DwarfRegNum<[16]>;
+def r17 : GR< 17, "r17">, DwarfRegNum<[17]>;
+def r18 : GR< 18, "r18">, DwarfRegNum<[18]>;
+def r19 : GR< 19, "r19">, DwarfRegNum<[19]>;
+def r20 : GR< 20, "r20">, DwarfRegNum<[20]>;
+def r21 : GR< 21, "r21">, DwarfRegNum<[21]>;
+def r22 : GR< 22, "r22">, DwarfRegNum<[22]>;
+def r23 : GR< 23, "r23">, DwarfRegNum<[23]>;
+def r24 : GR< 24, "r24">, DwarfRegNum<[24]>;
+def r25 : GR< 25, "r25">, DwarfRegNum<[25]>;
+def r26 : GR< 26, "r26">, DwarfRegNum<[26]>;
+def r27 : GR< 27, "r27">, DwarfRegNum<[27]>;
+def r28 : GR< 28, "r28">, DwarfRegNum<[28]>;
+def r29 : GR< 29, "r29">, DwarfRegNum<[29]>;
+def r30 : GR< 30, "r30">, DwarfRegNum<[30]>;
+def r31 : GR< 31, "r31">, DwarfRegNum<[31]>;
+def r32 : GR< 32, "r32">, DwarfRegNum<[32]>;
+def r33 : GR< 33, "r33">, DwarfRegNum<[33]>;
+def r34 : GR< 34, "r34">, DwarfRegNum<[34]>;
+def r35 : GR< 35, "r35">, DwarfRegNum<[35]>;
+def r36 : GR< 36, "r36">, DwarfRegNum<[36]>;
+def r37 : GR< 37, "r37">, DwarfRegNum<[37]>;
+def r38 : GR< 38, "r38">, DwarfRegNum<[38]>;
+def r39 : GR< 39, "r39">, DwarfRegNum<[39]>;
+def r40 : GR< 40, "r40">, DwarfRegNum<[40]>;
+def r41 : GR< 41, "r41">, DwarfRegNum<[41]>;
+def r42 : GR< 42, "r42">, DwarfRegNum<[42]>;
+def r43 : GR< 43, "r43">, DwarfRegNum<[43]>;
+def r44 : GR< 44, "r44">, DwarfRegNum<[44]>;
+def r45 : GR< 45, "r45">, DwarfRegNum<[45]>;
+def r46 : GR< 46, "r46">, DwarfRegNum<[46]>;
+def r47 : GR< 47, "r47">, DwarfRegNum<[47]>;
+def r48 : GR< 48, "r48">, DwarfRegNum<[48]>;
+def r49 : GR< 49, "r49">, DwarfRegNum<[49]>;
+def r50 : GR< 50, "r50">, DwarfRegNum<[50]>;
+def r51 : GR< 51, "r51">, DwarfRegNum<[51]>;
+def r52 : GR< 52, "r52">, DwarfRegNum<[52]>;
+def r53 : GR< 53, "r53">, DwarfRegNum<[53]>;
+def r54 : GR< 54, "r54">, DwarfRegNum<[54]>;
+def r55 : GR< 55, "r55">, DwarfRegNum<[55]>;
+def r56 : GR< 56, "r56">, DwarfRegNum<[56]>;
+def r57 : GR< 57, "r57">, DwarfRegNum<[57]>;
+def r58 : GR< 58, "r58">, DwarfRegNum<[58]>;
+def r59 : GR< 59, "r59">, DwarfRegNum<[59]>;
+def r60 : GR< 60, "r60">, DwarfRegNum<[60]>;
+def r61 : GR< 61, "r61">, DwarfRegNum<[61]>;
+def r62 : GR< 62, "r62">, DwarfRegNum<[62]>;
+def r63 : GR< 63, "r63">, DwarfRegNum<[63]>;
+def r64 : GR< 64, "r64">, DwarfRegNum<[64]>;
+def r65 : GR< 65, "r65">, DwarfRegNum<[65]>;
+def r66 : GR< 66, "r66">, DwarfRegNum<[66]>;
+def r67 : GR< 67, "r67">, DwarfRegNum<[67]>;
+def r68 : GR< 68, "r68">, DwarfRegNum<[68]>;
+def r69 : GR< 69, "r69">, DwarfRegNum<[69]>;
+def r70 : GR< 70, "r70">, DwarfRegNum<[70]>;
+def r71 : GR< 71, "r71">, DwarfRegNum<[71]>;
+def r72 : GR< 72, "r72">, DwarfRegNum<[72]>;
+def r73 : GR< 73, "r73">, DwarfRegNum<[73]>;
+def r74 : GR< 74, "r74">, DwarfRegNum<[74]>;
+def r75 : GR< 75, "r75">, DwarfRegNum<[75]>;
+def r76 : GR< 76, "r76">, DwarfRegNum<[76]>;
+def r77 : GR< 77, "r77">, DwarfRegNum<[77]>;
+def r78 : GR< 78, "r78">, DwarfRegNum<[78]>;
+def r79 : GR< 79, "r79">, DwarfRegNum<[79]>;
+def r80 : GR< 80, "r80">, DwarfRegNum<[80]>;
+def r81 : GR< 81, "r81">, DwarfRegNum<[81]>;
+def r82 : GR< 82, "r82">, DwarfRegNum<[82]>;
+def r83 : GR< 83, "r83">, DwarfRegNum<[83]>;
+def r84 : GR< 84, "r84">, DwarfRegNum<[84]>;
+def r85 : GR< 85, "r85">, DwarfRegNum<[85]>;
+def r86 : GR< 86, "r86">, DwarfRegNum<[86]>;
+def r87 : GR< 87, "r87">, DwarfRegNum<[87]>;
+def r88 : GR< 88, "r88">, DwarfRegNum<[88]>;
+def r89 : GR< 89, "r89">, DwarfRegNum<[89]>;
+def r90 : GR< 90, "r90">, DwarfRegNum<[90]>;
+def r91 : GR< 91, "r91">, DwarfRegNum<[91]>;
+def r92 : GR< 92, "r92">, DwarfRegNum<[92]>;
+def r93 : GR< 93, "r93">, DwarfRegNum<[93]>;
+def r94 : GR< 94, "r94">, DwarfRegNum<[94]>;
+def r95 : GR< 95, "r95">, DwarfRegNum<[95]>;
+def r96 : GR< 96, "r96">, DwarfRegNum<[96]>;
+def r97 : GR< 97, "r97">, DwarfRegNum<[97]>;
+def r98 : GR< 98, "r98">, DwarfRegNum<[98]>;
+def r99 : GR< 99, "r99">, DwarfRegNum<[99]>;
+def r100 : GR< 100, "r100">, DwarfRegNum<[100]>;
+def r101 : GR< 101, "r101">, DwarfRegNum<[101]>;
+def r102 : GR< 102, "r102">, DwarfRegNum<[102]>;
+def r103 : GR< 103, "r103">, DwarfRegNum<[103]>;
+def r104 : GR< 104, "r104">, DwarfRegNum<[104]>;
+def r105 : GR< 105, "r105">, DwarfRegNum<[105]>;
+def r106 : GR< 106, "r106">, DwarfRegNum<[106]>;
+def r107 : GR< 107, "r107">, DwarfRegNum<[107]>;
+def r108 : GR< 108, "r108">, DwarfRegNum<[108]>;
+def r109 : GR< 109, "r109">, DwarfRegNum<[109]>;
+def r110 : GR< 110, "r110">, DwarfRegNum<[110]>;
+def r111 : GR< 111, "r111">, DwarfRegNum<[111]>;
+def r112 : GR< 112, "r112">, DwarfRegNum<[112]>;
+def r113 : GR< 113, "r113">, DwarfRegNum<[113]>;
+def r114 : GR< 114, "r114">, DwarfRegNum<[114]>;
+def r115 : GR< 115, "r115">, DwarfRegNum<[115]>;
+def r116 : GR< 116, "r116">, DwarfRegNum<[116]>;
+def r117 : GR< 117, "r117">, DwarfRegNum<[117]>;
+def r118 : GR< 118, "r118">, DwarfRegNum<[118]>;
+def r119 : GR< 119, "r119">, DwarfRegNum<[119]>;
+def r120 : GR< 120, "r120">, DwarfRegNum<[120]>;
+def r121 : GR< 121, "r121">, DwarfRegNum<[121]>;
+def r122 : GR< 122, "r122">, DwarfRegNum<[122]>;
+def r123 : GR< 123, "r123">, DwarfRegNum<[123]>;
+def r124 : GR< 124, "r124">, DwarfRegNum<[124]>;
+def r125 : GR< 125, "r125">, DwarfRegNum<[125]>;
+def r126 : GR< 126, "r126">, DwarfRegNum<[126]>;
+def r127 : GR< 127, "r127">, DwarfRegNum<[127]>;
+
+/* floating-point registers */
+def F0 : FP< 0, "f0">, DwarfRegNum<[128]>;
+def F1 : FP< 1, "f1">, DwarfRegNum<[129]>;
+def F2 : FP< 2, "f2">, DwarfRegNum<[130]>;
+def F3 : FP< 3, "f3">, DwarfRegNum<[131]>;
+def F4 : FP< 4, "f4">, DwarfRegNum<[132]>;
+def F5 : FP< 5, "f5">, DwarfRegNum<[133]>;
+def F6 : FP< 6, "f6">, DwarfRegNum<[134]>;
+def F7 : FP< 7, "f7">, DwarfRegNum<[135]>;
+def F8 : FP< 8, "f8">, DwarfRegNum<[136]>;
+def F9 : FP< 9, "f9">, DwarfRegNum<[137]>;
+def F10 : FP< 10, "f10">, DwarfRegNum<[138]>;
+def F11 : FP< 11, "f11">, DwarfRegNum<[139]>;
+def F12 : FP< 12, "f12">, DwarfRegNum<[140]>;
+def F13 : FP< 13, "f13">, DwarfRegNum<[141]>;
+def F14 : FP< 14, "f14">, DwarfRegNum<[142]>;
+def F15 : FP< 15, "f15">, DwarfRegNum<[143]>;
+def F16 : FP< 16, "f16">, DwarfRegNum<[144]>;
+def F17 : FP< 17, "f17">, DwarfRegNum<[145]>;
+def F18 : FP< 18, "f18">, DwarfRegNum<[146]>;
+def F19 : FP< 19, "f19">, DwarfRegNum<[147]>;
+def F20 : FP< 20, "f20">, DwarfRegNum<[148]>;
+def F21 : FP< 21, "f21">, DwarfRegNum<[149]>;
+def F22 : FP< 22, "f22">, DwarfRegNum<[150]>;
+def F23 : FP< 23, "f23">, DwarfRegNum<[151]>;
+def F24 : FP< 24, "f24">, DwarfRegNum<[152]>;
+def F25 : FP< 25, "f25">, DwarfRegNum<[153]>;
+def F26 : FP< 26, "f26">, DwarfRegNum<[154]>;
+def F27 : FP< 27, "f27">, DwarfRegNum<[155]>;
+def F28 : FP< 28, "f28">, DwarfRegNum<[156]>;
+def F29 : FP< 29, "f29">, DwarfRegNum<[157]>;
+def F30 : FP< 30, "f30">, DwarfRegNum<[158]>;
+def F31 : FP< 31, "f31">, DwarfRegNum<[159]>;
+def F32 : FP< 32, "f32">, DwarfRegNum<[160]>;
+def F33 : FP< 33, "f33">, DwarfRegNum<[161]>;
+def F34 : FP< 34, "f34">, DwarfRegNum<[162]>;
+def F35 : FP< 35, "f35">, DwarfRegNum<[163]>;
+def F36 : FP< 36, "f36">, DwarfRegNum<[164]>;
+def F37 : FP< 37, "f37">, DwarfRegNum<[165]>;
+def F38 : FP< 38, "f38">, DwarfRegNum<[166]>;
+def F39 : FP< 39, "f39">, DwarfRegNum<[167]>;
+def F40 : FP< 40, "f40">, DwarfRegNum<[168]>;
+def F41 : FP< 41, "f41">, DwarfRegNum<[169]>;
+def F42 : FP< 42, "f42">, DwarfRegNum<[170]>;
+def F43 : FP< 43, "f43">, DwarfRegNum<[171]>;
+def F44 : FP< 44, "f44">, DwarfRegNum<[172]>;
+def F45 : FP< 45, "f45">, DwarfRegNum<[173]>;
+def F46 : FP< 46, "f46">, DwarfRegNum<[174]>;
+def F47 : FP< 47, "f47">, DwarfRegNum<[175]>;
+def F48 : FP< 48, "f48">, DwarfRegNum<[176]>;
+def F49 : FP< 49, "f49">, DwarfRegNum<[177]>;
+def F50 : FP< 50, "f50">, DwarfRegNum<[178]>;
+def F51 : FP< 51, "f51">, DwarfRegNum<[179]>;
+def F52 : FP< 52, "f52">, DwarfRegNum<[180]>;
+def F53 : FP< 53, "f53">, DwarfRegNum<[181]>;
+def F54 : FP< 54, "f54">, DwarfRegNum<[182]>;
+def F55 : FP< 55, "f55">, DwarfRegNum<[183]>;
+def F56 : FP< 56, "f56">, DwarfRegNum<[184]>;
+def F57 : FP< 57, "f57">, DwarfRegNum<[185]>;
+def F58 : FP< 58, "f58">, DwarfRegNum<[186]>;
+def F59 : FP< 59, "f59">, DwarfRegNum<[187]>;
+def F60 : FP< 60, "f60">, DwarfRegNum<[188]>;
+def F61 : FP< 61, "f61">, DwarfRegNum<[189]>;
+def F62 : FP< 62, "f62">, DwarfRegNum<[190]>;
+def F63 : FP< 63, "f63">, DwarfRegNum<[191]>;
+def F64 : FP< 64, "f64">, DwarfRegNum<[192]>;
+def F65 : FP< 65, "f65">, DwarfRegNum<[193]>;
+def F66 : FP< 66, "f66">, DwarfRegNum<[194]>;
+def F67 : FP< 67, "f67">, DwarfRegNum<[195]>;
+def F68 : FP< 68, "f68">, DwarfRegNum<[196]>;
+def F69 : FP< 69, "f69">, DwarfRegNum<[197]>;
+def F70 : FP< 70, "f70">, DwarfRegNum<[198]>;
+def F71 : FP< 71, "f71">, DwarfRegNum<[199]>;
+def F72 : FP< 72, "f72">, DwarfRegNum<[200]>;
+def F73 : FP< 73, "f73">, DwarfRegNum<[201]>;
+def F74 : FP< 74, "f74">, DwarfRegNum<[202]>;
+def F75 : FP< 75, "f75">, DwarfRegNum<[203]>;
+def F76 : FP< 76, "f76">, DwarfRegNum<[204]>;
+def F77 : FP< 77, "f77">, DwarfRegNum<[205]>;
+def F78 : FP< 78, "f78">, DwarfRegNum<[206]>;
+def F79 : FP< 79, "f79">, DwarfRegNum<[207]>;
+def F80 : FP< 80, "f80">, DwarfRegNum<[208]>;
+def F81 : FP< 81, "f81">, DwarfRegNum<[209]>;
+def F82 : FP< 82, "f82">, DwarfRegNum<[210]>;
+def F83 : FP< 83, "f83">, DwarfRegNum<[211]>;
+def F84 : FP< 84, "f84">, DwarfRegNum<[212]>;
+def F85 : FP< 85, "f85">, DwarfRegNum<[213]>;
+def F86 : FP< 86, "f86">, DwarfRegNum<[214]>;
+def F87 : FP< 87, "f87">, DwarfRegNum<[215]>;
+def F88 : FP< 88, "f88">, DwarfRegNum<[216]>;
+def F89 : FP< 89, "f89">, DwarfRegNum<[217]>;
+def F90 : FP< 90, "f90">, DwarfRegNum<[218]>;
+def F91 : FP< 91, "f91">, DwarfRegNum<[219]>;
+def F92 : FP< 92, "f92">, DwarfRegNum<[220]>;
+def F93 : FP< 93, "f93">, DwarfRegNum<[221]>;
+def F94 : FP< 94, "f94">, DwarfRegNum<[222]>;
+def F95 : FP< 95, "f95">, DwarfRegNum<[223]>;
+def F96 : FP< 96, "f96">, DwarfRegNum<[224]>;
+def F97 : FP< 97, "f97">, DwarfRegNum<[225]>;
+def F98 : FP< 98, "f98">, DwarfRegNum<[226]>;
+def F99 : FP< 99, "f99">, DwarfRegNum<[227]>;
+def F100 : FP< 100, "f100">, DwarfRegNum<[228]>;
+def F101 : FP< 101, "f101">, DwarfRegNum<[229]>;
+def F102 : FP< 102, "f102">, DwarfRegNum<[230]>;
+def F103 : FP< 103, "f103">, DwarfRegNum<[231]>;
+def F104 : FP< 104, "f104">, DwarfRegNum<[232]>;
+def F105 : FP< 105, "f105">, DwarfRegNum<[233]>;
+def F106 : FP< 106, "f106">, DwarfRegNum<[234]>;
+def F107 : FP< 107, "f107">, DwarfRegNum<[235]>;
+def F108 : FP< 108, "f108">, DwarfRegNum<[236]>;
+def F109 : FP< 109, "f109">, DwarfRegNum<[237]>;
+def F110 : FP< 110, "f110">, DwarfRegNum<[238]>;
+def F111 : FP< 111, "f111">, DwarfRegNum<[239]>;
+def F112 : FP< 112, "f112">, DwarfRegNum<[240]>;
+def F113 : FP< 113, "f113">, DwarfRegNum<[241]>;
+def F114 : FP< 114, "f114">, DwarfRegNum<[242]>;
+def F115 : FP< 115, "f115">, DwarfRegNum<[243]>;
+def F116 : FP< 116, "f116">, DwarfRegNum<[244]>;
+def F117 : FP< 117, "f117">, DwarfRegNum<[245]>;
+def F118 : FP< 118, "f118">, DwarfRegNum<[246]>;
+def F119 : FP< 119, "f119">, DwarfRegNum<[247]>;
+def F120 : FP< 120, "f120">, DwarfRegNum<[248]>;
+def F121 : FP< 121, "f121">, DwarfRegNum<[249]>;
+def F122 : FP< 122, "f122">, DwarfRegNum<[250]>;
+def F123 : FP< 123, "f123">, DwarfRegNum<[251]>;
+def F124 : FP< 124, "f124">, DwarfRegNum<[252]>;
+def F125 : FP< 125, "f125">, DwarfRegNum<[253]>;
+def F126 : FP< 126, "f126">, DwarfRegNum<[254]>;
+def F127 : FP< 127, "f127">, DwarfRegNum<[255]>;
+
+/* predicate registers */
+def p0 : PR< 0, "p0">, DwarfRegNum<[256]>;
+def p1 : PR< 1, "p1">, DwarfRegNum<[257]>;
+def p2 : PR< 2, "p2">, DwarfRegNum<[258]>;
+def p3 : PR< 3, "p3">, DwarfRegNum<[259]>;
+def p4 : PR< 4, "p4">, DwarfRegNum<[260]>;
+def p5 : PR< 5, "p5">, DwarfRegNum<[261]>;
+def p6 : PR< 6, "p6">, DwarfRegNum<[262]>;
+def p7 : PR< 7, "p7">, DwarfRegNum<[263]>;
+def p8 : PR< 8, "p8">, DwarfRegNum<[264]>;
+def p9 : PR< 9, "p9">, DwarfRegNum<[265]>;
+def p10 : PR< 10, "p10">, DwarfRegNum<[266]>;
+def p11 : PR< 11, "p11">, DwarfRegNum<[267]>;
+def p12 : PR< 12, "p12">, DwarfRegNum<[268]>;
+def p13 : PR< 13, "p13">, DwarfRegNum<[269]>;
+def p14 : PR< 14, "p14">, DwarfRegNum<[270]>;
+def p15 : PR< 15, "p15">, DwarfRegNum<[271]>;
+def p16 : PR< 16, "p16">, DwarfRegNum<[272]>;
+def p17 : PR< 17, "p17">, DwarfRegNum<[273]>;
+def p18 : PR< 18, "p18">, DwarfRegNum<[274]>;
+def p19 : PR< 19, "p19">, DwarfRegNum<[275]>;
+def p20 : PR< 20, "p20">, DwarfRegNum<[276]>;
+def p21 : PR< 21, "p21">, DwarfRegNum<[277]>;
+def p22 : PR< 22, "p22">, DwarfRegNum<[278]>;
+def p23 : PR< 23, "p23">, DwarfRegNum<[279]>;
+def p24 : PR< 24, "p24">, DwarfRegNum<[280]>;
+def p25 : PR< 25, "p25">, DwarfRegNum<[281]>;
+def p26 : PR< 26, "p26">, DwarfRegNum<[282]>;
+def p27 : PR< 27, "p27">, DwarfRegNum<[283]>;
+def p28 : PR< 28, "p28">, DwarfRegNum<[284]>;
+def p29 : PR< 29, "p29">, DwarfRegNum<[285]>;
+def p30 : PR< 30, "p30">, DwarfRegNum<[286]>;
+def p31 : PR< 31, "p31">, DwarfRegNum<[287]>;
+def p32 : PR< 32, "p32">, DwarfRegNum<[288]>;
+def p33 : PR< 33, "p33">, DwarfRegNum<[289]>;
+def p34 : PR< 34, "p34">, DwarfRegNum<[290]>;
+def p35 : PR< 35, "p35">, DwarfRegNum<[291]>;
+def p36 : PR< 36, "p36">, DwarfRegNum<[292]>;
+def p37 : PR< 37, "p37">, DwarfRegNum<[293]>;
+def p38 : PR< 38, "p38">, DwarfRegNum<[294]>;
+def p39 : PR< 39, "p39">, DwarfRegNum<[295]>;
+def p40 : PR< 40, "p40">, DwarfRegNum<[296]>;
+def p41 : PR< 41, "p41">, DwarfRegNum<[297]>;
+def p42 : PR< 42, "p42">, DwarfRegNum<[298]>;
+def p43 : PR< 43, "p43">, DwarfRegNum<[299]>;
+def p44 : PR< 44, "p44">, DwarfRegNum<[300]>;
+def p45 : PR< 45, "p45">, DwarfRegNum<[301]>;
+def p46 : PR< 46, "p46">, DwarfRegNum<[302]>;
+def p47 : PR< 47, "p47">, DwarfRegNum<[303]>;
+def p48 : PR< 48, "p48">, DwarfRegNum<[304]>;
+def p49 : PR< 49, "p49">, DwarfRegNum<[305]>;
+def p50 : PR< 50, "p50">, DwarfRegNum<[306]>;
+def p51 : PR< 51, "p51">, DwarfRegNum<[307]>;
+def p52 : PR< 52, "p52">, DwarfRegNum<[308]>;
+def p53 : PR< 53, "p53">, DwarfRegNum<[309]>;
+def p54 : PR< 54, "p54">, DwarfRegNum<[310]>;
+def p55 : PR< 55, "p55">, DwarfRegNum<[311]>;
+def p56 : PR< 56, "p56">, DwarfRegNum<[312]>;
+def p57 : PR< 57, "p57">, DwarfRegNum<[313]>;
+def p58 : PR< 58, "p58">, DwarfRegNum<[314]>;
+def p59 : PR< 59, "p59">, DwarfRegNum<[315]>;
+def p60 : PR< 60, "p60">, DwarfRegNum<[316]>;
+def p61 : PR< 61, "p61">, DwarfRegNum<[317]>;
+def p62 : PR< 62, "p62">, DwarfRegNum<[318]>;
+def p63 : PR< 63, "p63">, DwarfRegNum<[319]>;
+
+// XXX: temporary modelling -- out0-out7 are separate register defs. They are in
+// the GR class below, but getReservedRegs keeps them away from the allocator.
+def out0 : GR<0, "out0">, DwarfRegNum<[120]>;
+def out1 : GR<1, "out1">, DwarfRegNum<[121]>;
+def out2 : GR<2, "out2">, DwarfRegNum<[122]>;
+def out3 : GR<3, "out3">, DwarfRegNum<[123]>;
+def out4 : GR<4, "out4">, DwarfRegNum<[124]>;
+def out5 : GR<5, "out5">, DwarfRegNum<[125]>;
+def out6 : GR<6, "out6">, DwarfRegNum<[126]>;
+def out7 : GR<7, "out7">, DwarfRegNum<[127]>;
+
+// application (special) registers:
+
+// "previous function state" application register
+def AR_PFS : GR<0, "ar.pfs">, DwarfRegNum<[331]>;
+
+// "compare and exchange compare value" application register (ar.ccv, AR[32]):
+// cmpxchg reads its comparand from here. Not a member of any allocatable class
+// (so it is implicitly reserved); written by MOV_TO_AR_CCV and read by the
+// CMPXCHG* instructions as an implicit physreg, which is enough for the bundler
+// to force the required stop between them.
+def AR_CCV : GR<0, "ar.ccv">, DwarfRegNum<[332]>;
+
+// "return pointer" (this is really branch register b0)
+def rp : GR<0, "rp">, DwarfRegNum<[-1]>;
+
+// branch reg 6
+def B6 : GR<0, "b6">, DwarfRegNum<[326]>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// these are the scratch (+stacked) general registers
+// FIXME/XXX  we also reserve a frame pointer (r5)
+// FIXME/XXX  we also reserve r2 for spilling/filling predicates
+// in IA64RegisterInfo.cpp
+// FIXME/XXX  we also reserve r22 for calculating addresses
+// in IA64RegisterInfo.cpp
+
+def GR : RegisterClass<"IA64", [i64], 64,
+       (add
+//FIXME!: for both readability and performance, we don't want the out
+//        registers to be the first ones allocated
+
+        out7, out6, out5, out4, out3, out2, out1, out0,
+        r3,  r8,  r9,  r10, r11, r14, r15,
+        r16, r17, r18, r19, r20, r21, r23,
+        r24, r25, r26, r27, r28, r29, r30, r31,
+        r32, r33, r34, r35, r36, r37, r38, r39,
+        r40, r41, r42, r43, r44, r45, r46, r47,
+        r48, r49, r50, r51, r52, r53, r54, r55,
+        r56, r57, r58, r59, r60, r61, r62, r63,
+        r64, r65, r66, r67, r68, r69, r70, r71,
+        r72, r73, r74, r75, r76, r77, r78, r79,
+        r80, r81, r82, r83, r84, r85, r86, r87,
+        r88, r89, r90, r91, r92, r93, r94, r95,
+        r96, r97, r98, r99, r100, r101, r102, r103,
+        r104, r105, r106, r107, r108, r109, r110, r111,
+        r112, r113, r114, r115, r116, r117, r118, r119,
+        // r118-r127 are reserved in getReservedRegs (frame cap: outs, rp, pfs)
+        r120, r121, r122, r123, r124, r125, r126, r127,
+        // r4/r6/r7: the static callee-saved GRs (getCalleeSavedRegs). They trail
+        // the order so the allocator reaches them only under heavy pressure, but
+        // are class members so copyPhysReg treats them as GRs -- LowerCall parks
+        // gp/sp/rp in them across a call in a returns_twice (setjmp) function,
+        // where they survive a longjmp re-entry (glibc setjmp/longjmp save and
+        // restore them via the jmpbuf).
+        r4, r6, r7,
+        r0, r1, r2, r5, r12, r13, r22, rp)>;
+// The pre-removal backend hid the 'out' registers and the reserved registers
+// (r0,r1,r2,r5,r12,r13,r22,rp) from the allocation order via RegisterClass
+// MethodBodies, a mechanism that no longer exists.  Those reservations are now
+// expressed in IA64RegisterInfo::getReservedRegs (ported in Phase D).
+//
+// ar.pfs (the "previous function state" application register) is deliberately
+// NOT a member of GR.  The pre-removal backend kept it in GR and excluded it
+// from the allocation order, but the modern register coalescer will then fold
+// the ar.pfs-save vreg straight into the AR_PFS physreg (yielding the
+// nonsensical 'alloc ar.pfs = ar.pfs'), because the vreg and physreg share a
+// class.  Giving ar.pfs its own class makes the 'mov ar.pfs = rN' restore a
+// cross-class copy the coalescer cannot join, so the save register stays a
+// real scratch GR.
+// 'addl r1 = imm22, r3' (A5 form) spends most of its instruction word on the
+// 22-bit immediate, leaving only a 2-bit field for its source register: that
+// operand must be one of r0-r3. GR03 is that restricted class, used for ADDL's
+// addend. Of its members r0 (zero), r1 (gp) and r2 (predicate-spill scratch)
+// are reserved by getReservedRegs, so the register allocator's only free pick
+// here is r3; r0 is still usable as an explicit operand (the 'addl rX = imm22,
+// r0' = 'mov rX = imm22' constant-materialization form). r3 leads the order so
+// it is preferred for allocation.
+def GR03 : RegisterClass<"IA64", [i64], 64, (add r3, r2, r0, r1)>;
+
+def AR : RegisterClass<"IA64", [i64], 64, (add AR_PFS)>;
+
+// Branch registers. b6 holds the target of an indirect (function-descriptor)
+// call; like AR it gets its own class so MOV_TO_BR can name it.
+def BR : RegisterClass<"IA64", [i64], 64, (add B6)>;
+
+
+// these are the scratch (+stacked) FP registers
+
+def FP : RegisterClass<"IA64", [f64, f32, f80], 128,
+       (add F6, F7,
+        F8, F9, F10, F11, F12, F13, F14, F15,
+        F32, F33, F34, F35, F36, F37, F38, F39,
+        F40, F41, F42, F43, F44, F45, F46, F47,
+        F48, F49, F50, F51, F52, F53, F54, F55,
+        F56, F57, F58, F59, F60, F61, F62, F63,
+        F64, F65, F66, F67, F68, F69, F70, F71,
+        F72, F73, F74, F75, F76, F77, F78, F79,
+        F80, F81, F82, F83, F84, F85, F86, F87,
+        F88, F89, F90, F91, F92, F93, F94, F95,
+        F96, F97, F98, F99, F100, F101, F102, F103,
+        F104, F105, F106, F107, F108, F109, F110, F111,
+        F112, F113, F114, F115, F116, F117, F118, F119,
+        F120, F121, F122, F123, F124, F125, F126, F127,
+        F0, F1)> {  // F0, F1 are hidden via getReservedRegs (Phase D)
+  // Size/alignment of 128 bits so that stf.spill / ldf.fill of full 82-bit FP
+  // registers to stack slots are 16-byte aligned.
+  let Size = 128;
+}
+
+// An alternate view of the same physical FP registers whose sole value type is
+// f80 ('long double'). This mirrors x86's RFP80 (a single-type class layered
+// over the x87 stack registers): the generic inline-asm ISel path derives a
+// register's type from the *first* legal type of the chosen class, and the
+// multi-typed FP class above leads with f64. Routing an 80-bit value through a
+// 64-bit-typed register slot makes getCopyToParts try to truncate it and assert
+// ("Unknown mismatch!"). Returning this class for the 'f' constraint on an f80
+// value makes the representative register type f80, so the value tiles into one
+// register cleanly. (add FP) reuses FP's registers -- no new physical registers.
+def FP80 : RegisterClass<"IA64", [f80], 128, (add FP)> {
+  let Size = 128;
+}
+
+// these are the predicate registers, p0 (1/TRUE) is not here
+def PR : RegisterClass<"IA64", [i1], 64,
+// for now, only the scratch predicate regs
+       (add p6, p7, p8, p9, p10, p11, p12, p13, p14, p15)> {
+  let Size = 64;
+}
+
+/*
+ [p1, p2, p3, p4, p5, p6, p7,
+  p8, p9, p10, p11, p12, p13, p14, p15,
+  p16, p17, p18, p19, p20, p21, p22, p23,
+  p24, p25, p26, p27, p28, p29, p30, p31,
+  p32, p33, p34, p35, p36, p37, p38, p39,
+  p40, p41, p42, p43, p44, p45, p46, p47,
+  p48, p49, p50, p51, p52, p53, p54, p55,
+  p56, p57, p58, p59, p60, p61, p62, p63]>;
+  */
diff --git llvm/lib/Target/IA64/IA64Subtarget.cpp llvm/lib/Target/IA64/IA64Subtarget.cpp
new file mode 100644
index 000000000000..f9a51bf268cc
--- /dev/null
+++ llvm/lib/Target/IA64/IA64Subtarget.cpp
@@ -0,0 +1,30 @@
+//===-- IA64Subtarget.cpp - IA64 Subtarget Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64Subtarget.h"
+#include "IA64.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ia64-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "IA64GenSubtargetInfo.inc"
+
+void IA64Subtarget::anchor() {}
+
+IA64Subtarget::IA64Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+                             StringRef FS, const TargetMachine &TM)
+    : IA64GenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering(),
+      InstrInfo(*this), TLInfo(TM, *this) { }
diff --git llvm/lib/Target/IA64/IA64Subtarget.h llvm/lib/Target/IA64/IA64Subtarget.h
new file mode 100644
index 000000000000..55e4dd05120a
--- /dev/null
+++ llvm/lib/Target/IA64/IA64Subtarget.h
@@ -0,0 +1,69 @@
+//===-- IA64Subtarget.h - Define Subtarget for the IA64 --------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IA64 specific subclass of TargetSubtargetInfo. It was
+// trivial in the pre-removal backend; modern LLVM uses it as the aggregate that
+// owns the InstrInfo, FrameLowering, RegisterInfo and TargetLowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64SUBTARGET_H
+#define LLVM_LIB_TARGET_IA64_IA64SUBTARGET_H
+
+#include "IA64FrameLowering.h"
+#include "IA64ISelLowering.h"
+#include "IA64InstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/TargetParser/Triple.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "IA64GenSubtargetInfo.inc"
+
+namespace llvm {
+
+class StringRef;
+class TargetMachine;
+
+class IA64Subtarget : public IA64GenSubtargetInfo {
+  virtual void anchor();
+
+  // Note the declaration order: InstrInfo owns the RegisterInfo, and TLInfo's
+  // constructor queries it (computeRegisterProperties), so InstrInfo must be
+  // constructed before TLInfo.
+  IA64FrameLowering FrameLowering;
+  IA64InstrInfo InstrInfo;
+  IA64TargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  IA64Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+                StringRef FS, const TargetMachine &TM);
+
+  const IA64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const IA64RegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const IA64TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options. Definition auto-generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64SUBTARGET_H
diff --git llvm/lib/Target/IA64/IA64TargetMachine.cpp llvm/lib/Target/IA64/IA64TargetMachine.cpp
new file mode 100644
index 000000000000..6ca6f1de381d
--- /dev/null
+++ llvm/lib/Target/IA64/IA64TargetMachine.cpp
@@ -0,0 +1,202 @@
+//===-- IA64TargetMachine.cpp - Define TargetMachine for IA64 -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64 specific subclass of TargetMachine. It is the
+// capstone that aggregates the subtarget and wires up the codegen pass pipeline
+// (instruction selection + the bundling pre-emit pass), and registers the
+// target machine so `llc -mtriple=ia64` can allocate one.
+//
+// The companion LLVMInitializeIA64TargetMC() lives in MCTargetDesc/.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetMachine.h"
+#include "IA64.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64RegisterInfo.h"
+#include "MCTargetDesc/IA64MCTargetDesc.h"
+#include "TargetInfo/IA64TargetInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include <optional>
+
+using namespace llvm;
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeIA64Target() {
+  // Register the target machine, so `llc -mtriple=ia64` can allocate one.
+  RegisterTargetMachine<IA64TargetMachine> X(getTheIA64Target());
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeIA64DAGToDAGISelLegacyPass(PR);
+}
+
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+  return RM.has_value() ? *RM : Reloc::Static; // unspecified -> static
+}
+
+IA64TargetMachine::IA64TargetMachine(const Target &T, const Triple &TT,
+                                     StringRef CPU, StringRef FS,
+                                     const TargetOptions &Options,
+                                     std::optional<Reloc::Model> RM,
+                                     std::optional<CodeModel::Model> CM,
+                                     CodeGenOptLevel OL, bool JIT)
+    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options,
+                               getEffectiveRelocModel(RM),
+                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
+      TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
+  initAsmInfo();
+}
+
+IA64TargetMachine::~IA64TargetMachine() = default;
+
+const IA64Subtarget *
+IA64TargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute TuneAttr = F.getFnAttribute("tune-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+  // Attributes override TargetCPU/TargetFS; tune-cpu defaults to CPU.
+  std::string CPU =
+      CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+  std::string TuneCPU =
+      TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
+  std::string FS =
+      FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+  // TuneCPU is part of the key because the subtarget is constructed from it.
+  auto &I = SubtargetMap[CPU + TuneCPU + FS];
+  if (!I) {
+    // This needs to happen before the subtarget is created, since the latter
+    // depends on the code-generation flags on the function.
+    resetTargetOptions(F);
+    I = std::make_unique<IA64Subtarget>(getTargetTriple(), CPU, TuneCPU, FS,
+                                        *this);
+  }
+  return I.get();
+}
+
+MachineFunctionInfo *IA64TargetMachine::createMachineFunctionInfo(
+    BumpPtrAllocator &Allocator, const Function &F,
+    const TargetSubtargetInfo *STI) const {
+  return IA64FunctionInfo::create<IA64FunctionInfo>(Allocator, F, STI);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Rewrite the symbolic output registers out0-out7 in debug values to the real
+// stacked register they alias. gas resolves 'out0' to r(32+inputs+locals) from
+// the 'alloc', but the .td gives out0-out7 the fixed DwarfRegNum 120-127 (=
+// physical r120-r127), so a variable that lives in an output register at some PC
+// -- e.g. a parameter already moved into place for a call -- would be read by
+// gdb from the wrong register (seen as a bogus '0x0' in test_gdb.test_pretty_
+// print). The actual stacked register has the correct DwarfRegNum, so map to it.
+// Runs in addPreEmitPass2, after LiveDebugValues has finalized the debug values.
+struct IA64FixupDebugOutRegs : public MachineFunctionPass {
+  static char ID;
+  IA64FixupDebugOutRegs() : MachineFunctionPass(ID) {}
+  StringRef getPassName() const override {
+    return "IA64 debug output-register fixup";
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (!MF.getFunction().getSubprogram())
+      return false; // nothing to rewrite when there is no debug info
+
+    // Find the entry-block 'alloc' (dst, inputs, locals, outputs, rotating).
+    const MachineInstr *Alloc = nullptr;
+    for (const MachineInstr &Candidate : MF.front())
+      if (Candidate.getOpcode() == IA64::ALLOC) {
+        Alloc = &Candidate;
+        break;
+      }
+    if (!Alloc)
+      return false;
+
+    // out_i is the stacked register just above the input+local region that
+    // the 'alloc' sized, i.e. stacked index (inputs + locals + i).
+    const unsigned FirstOut =
+        Alloc->getOperand(1).getImm() + Alloc->getOperand(2).getImm();
+    static const MCPhysReg OutRegs[8] = {
+        IA64::out0, IA64::out1, IA64::out2, IA64::out3,
+        IA64::out4, IA64::out5, IA64::out6, IA64::out7};
+
+    bool Rewrote = false;
+    for (MachineBasicBlock &Block : MF)
+      for (MachineInstr &Inst : Block) {
+        if (!Inst.isDebugValue())
+          continue;
+        for (MachineOperand &Opnd : Inst.debug_operands()) {
+          if (!Opnd.isReg() || !Opnd.getReg())
+            continue;
+          unsigned Slot = 0; // index of Opnd's register in OutRegs, 8 if none
+          while (Slot != 8 && !(Opnd.getReg() == OutRegs[Slot]))
+            ++Slot;
+          if (Slot == 8 || !(FirstOut + Slot < IA64NumStackedGPRs))
+            continue;
+          Opnd.setReg(getIA64StackedGPR(FirstOut + Slot));
+          Rewrote = true;
+        }
+      }
+    return Rewrote;
+  }
+};
+char IA64FixupDebugOutRegs::ID = 0;
+
+class IA64PassConfig : public TargetPassConfig {
+public:
+  IA64PassConfig(IA64TargetMachine &TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  IA64TargetMachine &getIA64TargetMachine() const {
+    return getTM<IA64TargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+  void addPreEmitPass2() override;
+};
+} // end anonymous namespace
+
+TargetPassConfig *IA64TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new IA64PassConfig(*this, PM);
+}
+
+void IA64PassConfig::addIRPasses() {
+  // Expand atomics the backend cannot select directly: turn every atomicrmw
+  // into a cmpxchg loop (shouldExpandAtomicRMWInIR) and bracket stronger-than-
+  // monotonic atomics with fences (shouldInsertFencesForAtomic). This is no
+  // longer part of the target-independent addIRPasses, so each target adds it
+  // (cf. SparcPassConfig); without it atomicrmw reaches isel as AtomicLoadAdd
+  // etc. and fails to select, and ordering fences are never inserted.
+  addPass(createAtomicExpandLegacyPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool IA64PassConfig::addInstSelector() {
+  addPass(createIA64ISelDag(getIA64TargetMachine()));
+  return false;
+}
+
+void IA64PassConfig::addPreEmitPass() {
+  // Insert stop bits so the assembler can bundle correctly.
+  addPass(createIA64BundlingPass());
+}
+
+void IA64PassConfig::addPreEmitPass2() {
+  // Fix up out0-out7 in debug values now that LiveDebugValues has run and the
+  // debug locations are final (see IA64FixupDebugOutRegs).
+  addPass(new IA64FixupDebugOutRegs());
+}
diff --git llvm/lib/Target/IA64/IA64TargetMachine.h llvm/lib/Target/IA64/IA64TargetMachine.h
new file mode 100644
index 000000000000..adfb09de9556
--- /dev/null
+++ llvm/lib/Target/IA64/IA64TargetMachine.h
@@ -0,0 +1,53 @@
+//===-- IA64TargetMachine.h - Define TargetMachine for IA64 ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IA64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_IA64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_IA64_IA64TARGETMACHINE_H
+
+#include "IA64Subtarget.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+#include <optional>
+
+namespace llvm {
+
+class IA64TargetMachine : public CodeGenTargetMachineImpl {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<IA64Subtarget>> SubtargetMap;
+
+public:
+  IA64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    std::optional<Reloc::Model> RM,
+                    std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
+                    bool JIT);
+  ~IA64TargetMachine() override;
+
+  const IA64Subtarget *getSubtargetImpl(const Function &F) const override;
+
+  // Pass Pipeline Configuration.
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  MachineFunctionInfo *
+  createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
+                            const TargetSubtargetInfo *STI) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_IA64TARGETMACHINE_H
diff --git llvm/lib/Target/IA64/MCTargetDesc/CMakeLists.txt llvm/lib/Target/IA64/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 000000000000..c6a1441e72a6
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_llvm_component_library(LLVMIA64Desc
+  IA64InstPrinter.cpp
+  IA64MCAsmInfo.cpp
+  IA64MCTargetDesc.cpp
+  IA64TargetStreamer.cpp
+
+  LINK_COMPONENTS
+  MC
+  IA64Info
+  Support
+  TargetParser
+
+  ADD_TO_COMPONENT
+  IA64
+  )
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.cpp llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.cpp
new file mode 100644
index 000000000000..ec013473b403
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.cpp
@@ -0,0 +1,98 @@
+//===-- IA64InstPrinter.cpp - Convert IA64 MCInst to assembly syntax ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an IA64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64InstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#include "IA64GenAsmWriter.inc"
+
+void IA64InstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) {
+  OS << getRegisterName(Reg);
+}
+
+void IA64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
+                                StringRef Annot, const MCSubtargetInfo & /*STI*/,
+                                raw_ostream &O) {
+  printInstruction(MI, Address, O);
+  printAnnotation(O, Annot);
+}
+
+void IA64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  // Registers print by name and immediates by value; anything else must be
+  // an expression and is handed to the MCAsmInfo expression printer.
+  const MCOperand &Operand = MI->getOperand(OpNo);
+  if (Operand.isReg())
+    O << getRegisterName(Operand.getReg());
+  else if (Operand.isImm())
+    O << Operand.getImm();
+  else {
+    assert(Operand.isExpr() && "Unknown operand kind in printOperand");
+    MAI.printExpr(O, *Operand.getExpr());
+  }
+}
+
+// Sign-extend and print an immediate of the given bit width. The pre-removal
+// AsmPrinter did this by hand because the operands are stored unsigned.
+void IA64InstPrinter::printS8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  // Stored values of 128 and above stand for negatives: subtract 256.
+  const int Raw = static_cast<int>(MI->getOperand(OpNo).getImm());
+  const int Signed = Raw >= 128 ? Raw - 256 : Raw;
+  O << Signed;
+}
+
+void IA64InstPrinter::printS14ImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  // Stored values of 8192 and above stand for negatives: subtract 16384.
+  const int Raw = static_cast<int>(MI->getOperand(OpNo).getImm());
+  const int Signed = Raw >= 8192 ? Raw - 16384 : Raw;
+  O << Signed;
+}
+
+void IA64InstPrinter::printS22ImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  // Stored values of 2097152 and above stand for negatives: subtract 4194304.
+  const int Raw = static_cast<int>(MI->getOperand(OpNo).getImm());
+  const int Signed = Raw >= 2097152 ? Raw - 4194304 : Raw;
+  O << Signed;
+}
+
+void IA64InstPrinter::printS64ImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  // Non-immediates (a constant-pool / symbol reference) use printOperand.
+  const bool IsImmediate = MI->getOperand(OpNo).isImm();
+  if (!IsImmediate)
+    return printOperand(MI, OpNo, O);
+  O << MI->getOperand(OpNo).getImm();
+}
+
+// plus.ll exercises no globals or calls; the @ltoff(@fptr(...)) decoration the
+// pre-removal backend applied is out of Stage-1 scope, so these defer to the
+// generic operand printer for now.
+void IA64InstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  printOperand(MI, OpNo, O);
+}
+
+void IA64InstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  printOperand(MI, OpNo, O);
+}
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.h llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.h
new file mode 100644
index 000000000000..0a2fb6c195ea
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64InstPrinter.h
@@ -0,0 +1,49 @@
+//===-- IA64InstPrinter.h - Convert IA64 MCInst to assembly syntax --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an IA64 MCInst to a .s file (GNU gas syntax). It is the
+// modern home for the tablegen -gen-asm-writer output and the hand-written
+// operand printers that used to live in the pre-removal IA64AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64INSTPRINTER_H
+#define LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class IA64InstPrinter : public MCInstPrinter {
+public:
+  IA64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, MCRegister Reg) override;
+  void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+                 const MCSubtargetInfo &STI, raw_ostream &O) override;
+
+  // Autogenerated by tblgen.
+  std::pair<const char *, uint64_t> getMnemonic(const MCInst &MI) const override;
+  void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+  static const char *getRegisterName(MCRegister Reg);
+
+  // Operand printers invoked by the autogenerated printInstruction.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS14ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS22ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS64ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64INSTPRINTER_H
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.cpp llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.cpp
new file mode 100644
index 000000000000..74e316495d56
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.cpp
@@ -0,0 +1,104 @@
+//===-- IA64MCAsmInfo.cpp - IA64 asm properties ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the IA64MCAsmInfo properties. The
+// directive set is carried over from the pre-removal IA64TargetAsmInfo; section
+// selection (text/cstring/mergeable) is now handled generically by
+// TargetLoweringObjectFileELF, so it no longer lives here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+
+using namespace llvm;
+
+StringRef IA64::getSpecifierName(uint16_t S) {
+  switch (S) {
+  case IA64::S_None:
+    return {};
+  case IA64::S_LTOFF:
+    return "ltoff";
+  case IA64::S_FPTR:
+    return "fptr";
+  case IA64::S_TPREL:
+    return "tprel";
+  case IA64::S_DTPREL:
+    return "dtprel";
+  case IA64::S_DTPMOD:
+    return "dtpmod";
+  }
+  llvm_unreachable("Unhandled IA64 relocation specifier");
+}
+
+void IA64MCAsmInfo::anchor() {}
+
+IA64MCAsmInfo::IA64MCAsmInfo(const Triple &TheTriple,
+                             const MCTargetOptions &Options)
+    : MCAsmInfoELF() {
+  // IA-64 is LP64.
+  CodePointerSize = CalleeSaveStackSlotSize = 8;
+
+  CommentString = "//";
+
+  // The IA-64 backend has no integrated assembler (no MCCodeEmitter/AsmParser);
+  // we always emit assembly text for GNU 'as'. Telling MC we don't use the
+  // integrated assembler makes the AsmPrinter emit inline asm (e.g. the empty
+  // barrier that `core::hint::black_box` lowers to) as raw text instead of
+  // trying to parse it with a (nonexistent) target asm parser.
+  UseIntegratedAssembler = false;
+
+  // GNU 'as' for IA-64 treats a bare identifier that matches a register alias
+  // (`gp`=r1, `sp`=r12, `tp`=r13, `rp`=b0, `r1`, ...) as that register even in
+  // symbol position, so e.g. a C global named `tp` in `@ltoff(tp)` or a pointer
+  // table entry `data8.ua tp` resolves to a register instead of the symbol
+  // (a silent miscompile in the data case). Decorate every non-temporary symbol
+  // with a trailing '#', which `as` strips -- the form gcc and the pre-removal
+  // backend both emit.
+  UseSymbolHashSuffix = true;
+
+  // GNU 'as' for IA-64 spells the data directives "dataN"; the ".ua" suffix
+  // requests unaligned storage (carried over from IA64TargetAsmInfo).
+  Data8bitsDirective = "\tdata1\t";
+  Data16bitsDirective = "\tdata2.ua\t";
+  Data32bitsDirective = "\tdata4.ua\t";
+  Data64bitsDirective = "\tdata8.ua\t";
+
+  ZeroDirective = "\t.skip\t";
+  AsciiDirective = "\tstring\t";
+
+  // Emit source-level DWARF (.file/.loc) so the line table maps PCs back to the
+  // C source rather than to the temporary .s we hand to GNU 'as'. Without this
+  // the AsmPrinter suppresses all .loc directives, and the external assembler --
+  // still invoked with -g -- can only synthesize a line table for the assembly
+  // file it reads, so gdb shows e.g. "ldo-cbe475.s:257" instead of "ldo.c:NNN".
+  SupportsDebugInformation = true;
+
+  // GNU 'as' for IA-64 only accepts the single-string `.file N "name"` form, not
+  // LLVM's default two-argument `.file N "dir" "name"` (it rejects the second
+  // string as "junk at end of line"). Disabling the directory form makes the
+  // MCAsmStreamer fold the directory into the filename: `.file N "dir/name"`.
+  EnableDwarfFileDirectoryDefault = false;
+}
+
+// Print a relocation specifier as "@name(subexpr)", the form GNU 'as' for
+// IA-64 expects (e.g. "@ltoff(.L.str)"). Mirrors SparcELFMCAsmInfo, which uses
+// the "%name(...)" syntax.
+void IA64MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                       const MCSpecifierExpr &Expr) const {
+  const StringRef Name = IA64::getSpecifierName(Expr.getSpecifier());
+  if (Name.empty()) // no specifier name: print the sub-expression undecorated
+    return printExpr(OS, *Expr.getSubExpr());
+  OS << '@' << Name << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  OS << ')';
+}
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.h llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.h
new file mode 100644
index 000000000000..7efdb500ae77
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64MCAsmInfo.h
@@ -0,0 +1,76 @@
+//===-- IA64MCAsmInfo.h - IA64 asm properties ------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the IA64MCAsmInfo class. It is the
+// modern (MC-layer) replacement for the pre-removal IA64TargetAsmInfo, which
+// subclassed the long-deleted ELFTargetAsmInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCASMINFO_H
+#define LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCASMINFO_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class MCSpecifierExpr;
+class MCTargetOptions;
+class Triple;
+class raw_ostream;
+
+namespace IA64 {
+// Relocation specifiers. The backend is asm-output only, so these only select
+// the printed form (e.g. "@ltoff(sym)"); the GNU assembler turns that into the
+// matching R_IA64_* relocation. The specifier value is carried on the symbol
+// operand's target flags (set in IA64ISelDAGToDAG) and read back in
+// IA64MCInstLower.
+enum Specifier : uint16_t {
+  S_None = 0,
+  // @ltoff(sym): the gp-relative offset of the symbol's linkage-table (GOT)
+  // entry; emitted for the ADDL_GA + LD8 global-address sequence.
+  S_LTOFF = MCSymbolRefExpr::FirstTargetSpecifier,
+  // @fptr(sym): the address of the function descriptor { entry, gp } for a
+  // function symbol -- what a C function pointer must hold. Emitted for
+  // function pointers stored in data (data8 @fptr(f)).
+  S_FPTR,
+  // Thread-local storage offsets. @tprel(sym) is the symbol's offset from the
+  // thread pointer (tp/r13), used directly in local-exec (movl @tprel). @dtprel
+  // and @dtpmod are the dynamic-model offset and module id, materialised through
+  // the GOT (see the S_LTOFF_* markers below) and consumed by __tls_get_addr.
+  S_TPREL,
+  S_DTPREL,
+  S_DTPMOD,
+  // Marker flags (never stored in an MCSpecifierExpr): a value loaded through
+  // the GOT, so lowerSymbolOperand nests the inner specifier inside @ltoff and
+  // prints @ltoff(@fptr(f)) / @ltoff(@tprel(x)) / @ltoff(@dtpmod(x)) / etc.
+  S_LTOFF_FPTR,
+  S_LTOFF_TPREL,
+  S_LTOFF_DTPMOD,
+  S_LTOFF_DTPREL,
+};
+
+StringRef getSpecifierName(uint16_t S);
+} // namespace IA64
+
+class IA64MCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit IA64MCAsmInfo(const Triple &TheTriple,
+                         const MCTargetOptions &Options);
+
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCASMINFO_H
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.cpp llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.cpp
new file mode 100644
index 000000000000..0ac491724b98
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.cpp
@@ -0,0 +1,119 @@
+//===-- IA64MCTargetDesc.cpp - IA64 Target Descriptions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides IA64 specific target descriptions.
+//
+// The generated Targets.def emits LLVM_TARGET(IA64), so InitializeAllTargetMCs()
+// references LLVMInitializeIA64TargetMC(). By convention this entry point lives
+// in the target's "Desc" library (LLVMIA64Desc), which object-file tools such
+// as llvm-ar link via AllTargetsDescs.
+//
+// This registers the full MC component set for the asm-output path: MCAsmInfo
+// (replacing the pre-removal IA64TargetAsmInfo), MCInstrInfo (the instruction
+// table also backs IA64InstrInfo's generated constructor), MCRegisterInfo,
+// MCInstPrinter and MCSubtargetInfo. The object-emission components
+// (MCCodeEmitter / MCAsmBackend / ELFObjectWriter) remain out of Stage-1 scope.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64MCTargetDesc.h"
+#include "IA64InstPrinter.h"
+#include "IA64MCAsmInfo.h"
+#include "IA64TargetStreamer.h"
+#include "TargetInfo/IA64TargetInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/TargetParser/Triple.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "IA64GenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "IA64GenRegisterInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "IA64GenSubtargetInfo.inc"
+
+static MCAsmInfo *createIA64MCAsmInfo(const MCRegisterInfo &MRI,
+                                      const Triple &TT,
+                                      const MCTargetOptions &Options) {
+  return new IA64MCAsmInfo(TT, Options);
+}
+
+static MCInstrInfo *createIA64MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitIA64MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createIA64MCRegisterInfo(const Triple & /*TT*/) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitIA64MCRegisterInfo(X, IA64::rp); // rp (b0) is the return-address register
+  return X;
+}
+
+static MCInstPrinter *createIA64MCInstPrinter(const Triple & /*T*/,
+                                              unsigned /*SyntaxVariant*/,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI) {
+  return new IA64InstPrinter(MAI, MII, MRI);
+}
+
+// The asm streamer carries the IA-64 unwind directives. There is no object
+// streamer (the backend has no integrated assembler), so the null streamer just
+// uses the no-op base class.
+static MCTargetStreamer *createIA64AsmTargetStreamer(MCStreamer &S,
+                                                     formatted_raw_ostream &OS,
+                                                     MCInstPrinter *) {
+  return new IA64TargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer *createIA64NullTargetStreamer(MCStreamer &S) {
+  return new IA64TargetStreamer(S);
+}
+
+static MCSubtargetInfo *createIA64MCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  // An empty CPU name falls back to "generic"; the same name is used to tune.
+  const StringRef Name = CPU.empty() ? StringRef("generic") : CPU;
+  return createIA64MCSubtargetInfoImpl(TT, Name, /*TuneCPU=*/Name, FS);
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeIA64TargetMC() {
+  Target &T = getTheIA64Target();
+
+  // Register the MC asm info (replaces the pre-removal IA64TargetAsmInfo /
+  // ELFTargetAsmInfo).
+  RegisterMCAsmInfoFn X(T, createIA64MCAsmInfo);
+
+  // Register the MC instruction info (the table also backs IA64InstrInfo).
+  TargetRegistry::RegisterMCInstrInfo(T, createIA64MCInstrInfo);
+
+  // Register the MC register info and the asm-output instruction printer.
+  TargetRegistry::RegisterMCRegInfo(T, createIA64MCRegisterInfo);
+  TargetRegistry::RegisterMCInstPrinter(T, createIA64MCInstPrinter);
+
+  // Register the target streamer that emits the IA-64 unwind directives.
+  TargetRegistry::RegisterAsmTargetStreamer(T, createIA64AsmTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(T, createIA64NullTargetStreamer);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(T, createIA64MCSubtargetInfo);
+}
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.h llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.h
new file mode 100644
index 000000000000..4fead9bf1cd6
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64MCTargetDesc.h
@@ -0,0 +1,26 @@
+//===-- IA64MCTargetDesc.h - IA64 Target Descriptions -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides IA64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCTARGETDESC_H
+
+// Defines symbolic names for IA64 registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "IA64GenRegisterInfo.inc"
+
+// Defines symbolic names for the IA64 instructions.
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
+#include "IA64GenInstrInfo.inc"
+
+#endif // LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64MCTARGETDESC_H
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.cpp llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.cpp
new file mode 100644
index 000000000000..6a55e74a8b39
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.cpp
@@ -0,0 +1,64 @@
+//===-- IA64TargetStreamer.cpp - IA64 Target Streamer Methods ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides IA64 specific target streamer methods: the textual form of
+// the IA-64 unwind directives. See IA64TargetStreamer.h for the rationale.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+IA64TargetStreamer::IA64TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+// Out-of-line definition of the virtual anchor(); pins the vtable to this file.
+void IA64TargetStreamer::anchor() {}
+
+IA64TargetAsmStreamer::IA64TargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS)
+    : IA64TargetStreamer(S), OS(OS) {}
+
+// gas references IA-64 symbols with a trailing '#' (to disambiguate them from
+// register names); the function labels are printed that way too, so the .proc /
+// .endp operands must match.
+void IA64TargetAsmStreamer::emitProc(const MCSymbol *Sym) {
+  OS << "\t.proc\t" << Sym->getName() << "#\n";
+}
+
+void IA64TargetAsmStreamer::emitEndP(const MCSymbol *Sym) {
+  OS << "\t.endp\t" << Sym->getName() << "#\n";
+}
+
+void IA64TargetAsmStreamer::emitPrologueDirective() { OS << "\t.prologue\n"; }
+
+void IA64TargetAsmStreamer::emitSaveARPFS(StringRef Reg) {
+  OS << "\t.save\tar.pfs, " << Reg << '\n';
+}
+
+void IA64TargetAsmStreamer::emitFFrame(int64_t Size) {
+  OS << "\t.fframe\t" << Size << '\n';
+}
+
+void IA64TargetAsmStreamer::emitSaveRP(StringRef Reg) {
+  OS << "\t.save\trp, " << Reg << '\n';
+}
+
+void IA64TargetAsmStreamer::emitBody() { OS << "\t.body\n"; }
+
+void IA64TargetAsmStreamer::emitLabelState(unsigned N) {
+  OS << "\t.label_state\t" << N << '\n';
+}
+
+void IA64TargetAsmStreamer::emitCopyState(unsigned N) {
+  OS << "\t.copy_state\t" << N << '\n';
+}
+
+void IA64TargetAsmStreamer::emitRestoreSP() { OS << "\t.restore\tsp\n"; }
diff --git llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.h llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.h
new file mode 100644
index 000000000000..3b8524251ad9
--- /dev/null
+++ llvm/lib/Target/IA64/MCTargetDesc/IA64TargetStreamer.h
@@ -0,0 +1,86 @@
+//===-- IA64TargetStreamer.h - IA64 Target Streamer ------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This streamer emits the IA-64 unwind directives (.proc / .prologue /
+// .save ar.pfs / .save rp / .fframe / .body / .label_state / .copy_state /
+// .restore sp / .endp) that GNU 'gas' assembles into the .IA_64.unwind /
+// .IA_64.unwind_info sections. Those sections -- not DWARF .eh_frame -- are
+// what gdb/libunwind read to walk an IA-64 stack, so emitting them is what
+// makes a backtrace work. The asm printer drives these calls off the
+// frame-setup/destroy flags on the prologue and epilogue instructions.
+//
+// Only the textual (asm) form is implemented: the IA-64 backend has no
+// integrated assembler, so the object encoding is gas's job. The base class is
+// a no-op so the null streamer (and any future object streamer) link cleanly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64TARGETSTREAMER_H
+#define LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64TARGETSTREAMER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+class MCSymbol;
+
+class IA64TargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+
+public:
+  IA64TargetStreamer(MCStreamer &S);
+
+  /// Open the unwind region for a procedure: ".proc <sym>".
+  virtual void emitProc(const MCSymbol *Sym) {}
+  /// Close it: ".endp <sym>".
+  virtual void emitEndP(const MCSymbol *Sym) {}
+  /// Begin the prologue region: ".prologue".
+  virtual void emitPrologueDirective() {}
+  /// Record where ar.pfs (the caller's frame marker) was saved:
+  /// ".save ar.pfs, <reg>".
+  virtual void emitSaveARPFS(StringRef Reg) {}
+  /// Record the fixed memory-frame size in bytes: ".fframe <size>".
+  virtual void emitFFrame(int64_t Size) {}
+  /// Record where the return pointer (b0) was saved: ".save rp, <reg>".
+  virtual void emitSaveRP(StringRef Reg) {}
+  /// End the prologue, begin the body region: ".body".
+  virtual void emitBody() {}
+  /// Snapshot the current unwind state under a label: ".label_state <n>".
+  virtual void emitLabelState(unsigned N) {}
+  /// Restore a snapshotted unwind state: ".copy_state <n>". Emitted before each
+  /// '.restore sp' in a function with several epilogues, so gas re-opens the
+  /// region the previous '.restore' closed.
+  virtual void emitCopyState(unsigned N) {}
+  /// Mark the point where sp is restored to its on-entry value: ".restore sp".
+  virtual void emitRestoreSP() {}
+};
+
+// Textual (.s) output for GNU gas.
+class IA64TargetAsmStreamer : public IA64TargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  IA64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+  void emitProc(const MCSymbol *Sym) override;
+  void emitEndP(const MCSymbol *Sym) override;
+  void emitPrologueDirective() override;
+  void emitSaveARPFS(StringRef Reg) override;
+  void emitFFrame(int64_t Size) override;
+  void emitSaveRP(StringRef Reg) override;
+  void emitBody() override;
+  void emitLabelState(unsigned N) override;
+  void emitCopyState(unsigned N) override;
+  void emitRestoreSP() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_MCTARGETDESC_IA64TARGETSTREAMER_H
diff --git llvm/lib/Target/IA64/TargetInfo/CMakeLists.txt llvm/lib/Target/IA64/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..46e017f4a4f2
--- /dev/null
+++ llvm/lib/Target/IA64/TargetInfo/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_llvm_component_library(LLVMIA64Info
+  IA64TargetInfo.cpp
+
+  LINK_COMPONENTS
+  MC
+  Support
+
+  ADD_TO_COMPONENT
+  IA64
+  )
diff --git llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.cpp llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.cpp
new file mode 100644
index 000000000000..44a88bbc49c2
--- /dev/null
+++ llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.cpp
@@ -0,0 +1,27 @@
+//===-- IA64TargetInfo.cpp - IA64 Target Implementation -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetInfo/IA64TargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+/// Returns the IA-64 Target object: one function-local static instance that
+/// every caller shares.
+Target &llvm::getTheIA64Target() {
+  static Target TheIA64Target;
+  return TheIA64Target;
+}
+
+/// C entry point that registers the IA-64 target for Triple::ia64 under the
+/// name "ia64" (description "IA-64 (Itanium)"), with JIT support disabled.
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeIA64TargetInfo() {
+  RegisterTarget<Triple::ia64, /*HasJIT=*/false> X(
+      getTheIA64Target(), "ia64", "IA-64 (Itanium)", "IA64");
+}
diff --git llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.h llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.h
new file mode 100644
index 000000000000..685b75986850
--- /dev/null
+++ llvm/lib/Target/IA64/TargetInfo/IA64TargetInfo.h
@@ -0,0 +1,20 @@
+//===-- IA64TargetInfo.h - IA64 Target Implementation -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_IA64_TARGETINFO_IA64TARGETINFO_H
+#define LLVM_LIB_TARGET_IA64_TARGETINFO_IA64TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheIA64Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_IA64_TARGETINFO_IA64TARGETINFO_H
diff --git llvm/lib/TargetParser/TargetDataLayout.cpp llvm/lib/TargetParser/TargetDataLayout.cpp
index b8c3b4325558..2ffdebeeec61 100644
--- llvm/lib/TargetParser/TargetDataLayout.cpp
+++ llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -569,6 +569,10 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
     return "e-m:e-p:32:32:32-a:0-n16:32-"
            "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
            "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048";
+  case Triple::ia64:
+    // IA-64 (Itanium): little-endian LP64, 80-bit long double (carried over
+    // from the pre-removal backend's "e-f80:128:128"), 128-bit stack align.
+    return "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128";
   case Triple::loongarch32:
   case Triple::loongarch64:
     return computeLoongArchDataLayout(*this);
diff --git llvm/lib/TargetParser/Triple.cpp llvm/lib/TargetParser/Triple.cpp
index a4f9dd42c0fe..89487cdcc4e9 100644
--- llvm/lib/TargetParser/Triple.cpp
+++ llvm/lib/TargetParser/Triple.cpp
@@ -40,6 +40,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) {
   case csky:           return "csky";
   case dxil:           return "dxil";
   case hexagon:        return "hexagon";
+  case ia64:
+    return "ia64";
   case hsail64:        return "hsail64";
   case hsail:          return "hsail";
   case kalimba:        return "kalimba";
@@ -204,8 +206,11 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
 
   case hexagon:     return "hexagon";
 
+  case ia64:
+    return "ia64";
+
   case amdgcn:      return "amdgcn";
   case r600:        return "r600";
 
   case bpfel:
   case bpfeb:       return "bpf";
@@ -481,6 +486,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
       .Case("riscv32be", riscv32be)
       .Case("riscv64be", riscv64be)
       .Case("hexagon", hexagon)
+      .Case("ia64", ia64)
       .Case("sparc", sparc)
       .Case("sparcel", sparcel)
       .Case("sparcv9", sparcv9)
@@ -632,6 +638,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
           .Case("riscv32be", Triple::riscv32be)
           .Case("riscv64be", Triple::riscv64be)
           .Case("hexagon", Triple::hexagon)
+          .Cases({"ia64", "ia-64", "ia64le"}, Triple::ia64)
           .Cases({"s390x", "systemz"}, Triple::systemz)
           .Case("sparc", Triple::sparc)
           .Case("sparcel", Triple::sparcel)
@@ -1027,6 +1034,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
   case Triple::thumbeb:
   case Triple::ve:
   case Triple::xcore:
+  case Triple::ia64:
   case Triple::xtensa:
     return Triple::ELF;
 
@@ -1779,6 +1787,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
   case llvm::Triple::systemz:
   case llvm::Triple::ve:
   case llvm::Triple::wasm64:
+  case llvm::Triple::ia64:
   case llvm::Triple::x86_64:
     return 64;
   }
@@ -1824,6 +1833,7 @@ Triple Triple::get32BitArchVariant() const {
   case Triple::bpfeb:
   case Triple::bpfel:
   case Triple::msp430:
+  case Triple::ia64:
   case Triple::systemz:
   case Triple::ve:
     T.setArch(UnknownArch);
@@ -1943,6 +1953,7 @@ Triple Triple::get64BitArchVariant() const {
   case Triple::systemz:
   case Triple::ve:
   case Triple::wasm64:
+  case Triple::ia64:
   case Triple::x86_64:
     // Already 64-bit.
     break;
@@ -2019,6 +2029,7 @@ Triple Triple::getBigEndianArchVariant() const {
   case Triple::ve:
   case Triple::csky:
   case Triple::xtensa:
+  case Triple::ia64:
 
   // ARM is intentionally unsupported here, changing the architecture would
   // drop any arch suffixes.
@@ -2139,6 +2150,7 @@ bool Triple::isLittleEndian() const {
   case Triple::x86:
   case Triple::x86_64:
   case Triple::xcore:
+  case Triple::ia64:
   case Triple::xtensa:
     return true;
   default:
diff --git llvm/test/CodeGen/IA64/arith.ll llvm/test/CodeGen/IA64/arith.ll
new file mode 100644
index 000000000000..a5afee7ac99a
--- /dev/null
+++ llvm/test/CodeGen/IA64/arith.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Basic integer arithmetic and bitwise/shift operations on i32 and i64.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @add_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: add_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    add r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp add_i64#
+  %r = add i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @sub_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: sub_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    sub r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sub_i64#
+  %r = sub i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @and_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    and r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp and_i64#
+  %r = and i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @or_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    or r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp or_i64#
+  %r = or i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @xor_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    xor r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp xor_i64#
+  %r = xor i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @shl_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: shl_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    shl r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp shl_i64#
+  %r = shl i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @lshr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: lshr_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    shr.u r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp lshr_i64#
+  %r = lshr i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @ashr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ashr_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    shr r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ashr_i64#
+  %r = ashr i64 %a, %b
+  ret i64 %r
+}
+
+define i32 @add_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: add_i32#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    add r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp add_i32#
+  %r = add i32 %a, %b
+  ret i32 %r
+}
+
+define i32 @sub_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: sub_i32#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    sub r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sub_i32#
+  %r = sub i32 %a, %b
+  ret i32 %r
+}
diff --git llvm/test/CodeGen/IA64/atomics.ll llvm/test/CodeGen/IA64/atomics.ll
new file mode 100644
index 000000000000..f28e7d71c50f
--- /dev/null
+++ llvm/test/CodeGen/IA64/atomics.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Atomic operations on IA-64.
+;
+; Acquire/release ordering is provided with a memory fence (mf) around plain
+; loads/stores; compare-exchange uses ar.ccv + cmpxchg, and atomicrmw is
+; expanded to a cmpxchg loop.
+;
+; Regression 91c64164 ("Fix narrow atomic load/store"): sub-64-bit atomic
+; load/store were marked Custom but not implemented, so they broke. Only i64 is
+; Custom; narrower widths are promoted and must still produce a valid sized
+; load/store (ld1/ld4/...), not crash or emit garbage.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @load_i64_acquire(ptr %p) {
+; CHECK-LABEL: load_i64_acquire#:
+; CHECK: ld8 {{r[0-9]+}} = {{\[}}{{r[0-9]+}}]
+; CHECK: mf
+  %v = load atomic i64, ptr %p acquire, align 8
+  ret i64 %v
+}
+
+define void @store_i64_release(ptr %p, i64 %v) {
+; CHECK-LABEL: store_i64_release#:
+; CHECK: mf
+; CHECK: st8 {{\[}}{{r[0-9]+}}] = {{r[0-9]+}}
+  store atomic i64 %v, ptr %p release, align 8
+  ret void
+}
+
+; Narrow atomic load: promoted, must emit a real 32-bit load (regression).
+define i32 @load_i32_acquire(ptr %p) {
+; CHECK-LABEL: load_i32_acquire#:
+; CHECK: ld4 {{r[0-9]+}} = {{\[}}{{r[0-9]+}}]
+; CHECK: mf
+  %v = load atomic i32, ptr %p acquire, align 4
+  ret i32 %v
+}
+
+; Narrow atomic load: promoted, must emit a real 8-bit load (regression).
+define i8 @load_i8_acquire(ptr %p) {
+; CHECK-LABEL: load_i8_acquire#:
+; CHECK: ld1 {{r[0-9]+}} = {{\[}}{{r[0-9]+}}]
+; CHECK: mf
+  %v = load atomic i8, ptr %p acquire, align 1
+  ret i8 %v
+}
+
+; cmpxchg uses the compare value register ar.ccv and cmpxchg8.
+define i64 @cmpxchg_i64(ptr %p, i64 %c, i64 %n) {
+; CHECK-LABEL: cmpxchg_i64#:
+; CHECK: mov ar.ccv = {{r[0-9]+}}
+; CHECK: cmpxchg8.acq {{r[0-9]+}} = {{\[}}{{r[0-9]+}}], {{r[0-9]+}}, ar.ccv
+  %r = cmpxchg ptr %p, i64 %c, i64 %n acq_rel acquire
+  %v = extractvalue { i64, i1 } %r, 0
+  ret i64 %v
+}
+
+; atomicrmw is expanded to a load + cmpxchg loop.
+define i64 @rmw_add(ptr %p, i64 %v) {
+; CHECK-LABEL: rmw_add#:
+; CHECK: cmpxchg8.acq {{r[0-9]+}} = {{\[}}{{r[0-9]+}}], {{r[0-9]+}}, ar.ccv
+  %r = atomicrmw add ptr %p, i64 %v acq_rel
+  ret i64 %r
+}
+
+define void @seq_cst_fence() {
+; CHECK-LABEL: seq_cst_fence#:
+; CHECK: mf
+  fence seq_cst
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/br.ll llvm/test/CodeGen/IA64/br.ll
new file mode 100644
index 000000000000..ce6623265cc5
--- /dev/null
+++ llvm/test/CodeGen/IA64/br.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Conditional and unconditional branches, and an indirect branch.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @sink(i64)
+
+define void @cond_br(i64 %a, i64 %b) {
+; CHECK-LABEL: cond_br#
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,1,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.ge p6, p0 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) brl.cond.sptk .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %then
+; CHECK-NEXT:    adds out0 = 1, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_2: // %else
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 2, r0
+; CHECK-NEXT:  .LBB0_3: // %exit
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = sink#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cond_br#
+entry:
+  %c = icmp slt i64 %a, %b
+  br i1 %c, label %then, label %else
+then:
+  call void @sink(i64 1)
+  br label %exit
+else:
+  call void @sink(i64 2)
+  br label %exit
+exit:
+  ret void
+}
+
+define i64 @loop(i64 %n) {
+; CHECK-LABEL: loop#
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r3 = r8
+; CHECK-NEXT:  .LBB1_1: // %head
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    add r8 = r8, r3
+; CHECK-NEXT:    adds r3 = 1, r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.lt p6, p0 = r3, r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) brl.cond.sptk .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp loop#
+entry:
+  br label %head
+head:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %head ]
+  %acc = phi i64 [ 0, %entry ], [ %acc.next, %head ]
+  %acc.next = add i64 %acc, %i
+  %i.next = add i64 %i, 1
+  %c = icmp slt i64 %i.next, %n
+  br i1 %c, label %head, label %exit
+exit:
+  ret i64 %acc.next
+}
+
+define void @indirect_br(ptr %addr) {
+; CHECK-LABEL: indirect_br#
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov b6 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p0) br.cond.sptk b6
+; CHECK-NEXT:  .LBB2_1: // %a
+; CHECK-NEXT:    adds out0 = 10, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB2_2
+; CHECK-NEXT:  .LBB2_3: // %b
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 20, r0
+; CHECK-NEXT:  .LBB2_2: // %a
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = sink#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp indirect_br#
+entry:
+  indirectbr ptr %addr, [ label %a, label %b ]
+a:
+  call void @sink(i64 10)
+  ret void
+b:
+  call void @sink(i64 20)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/call-stack-args.ll llvm/test/CodeGen/IA64/call-stack-args.ll
new file mode 100644
index 000000000000..8e8ee6f0fe06
--- /dev/null
+++ llvm/test/CodeGen/IA64/call-stack-args.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; The IA-64 SysV ABI passes the first 8 integer arguments in the outgoing
+; register window (out0-out7); arguments beyond that spill to the memory
+; stack starting at sp+16 (the 16-byte scratch area is reserved below it).
+;
+; Regression test for the outgoing stacked/memory-argument offset: an earlier
+; version homed memory args at sp+64/sp+80, which corrupted the argument list
+; seen by GCC-compiled variadic callees (e.g. clang-built bash calling printf).
+; The first memory arg must land at [r12+16], the second at [r12+24].
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare i64 @callee(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
+
+define i64 @caller() {
+; CHECK-LABEL: caller#:
+entry:
+; The 9th and 10th arguments go to the memory stack at sp+16 and sp+24.
+; CHECK-DAG: adds [[R9:r[0-9]+]] = 16, r12
+; CHECK-DAG: adds [[R10:r[0-9]+]] = 24, r12
+; CHECK-DAG: st8 {{\[}}[[R9]]] =
+; CHECK-DAG: st8 {{\[}}[[R10]]] =
+;
+; The first 8 arguments stay in the outgoing register window.
+; CHECK-DAG: out0 = 1,
+; CHECK-DAG: out7 = 8,
+;
+; CHECK: br.call.sptk rp = callee#
+  %r = call i64 @callee(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10)
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/calls.ll llvm/test/CodeGen/IA64/calls.ll
new file mode 100644
index 000000000000..7b461bbf61fb
--- /dev/null
+++ llvm/test/CodeGen/IA64/calls.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Direct and indirect calls.
+;
+; Integer arguments are passed in the outgoing register window out0..out7; the
+; call is a br.call through rp. An indirect call loads the function descriptor
+; (entry point + gp), sets gp, and branches through a branch register.
+;
+; Regression c7908264 ("Fix bug in BRCALL selection"): BRCALL's register-use
+; operands sit before the optional InGlue; selection used to look for InGlue at
+; a fixed position and mis-shuffled the operands. Both call forms below carry
+; glued register uses and exercise that path.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare i64 @f3(i64, i64, i64)
+
+define i64 @call_direct(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: call_direct#:
+; CHECK-DAG: mov out0 = r32
+; CHECK-DAG: mov out1 = r33
+; CHECK-DAG: mov out2 = r34
+; CHECK: br.call.sptk rp = f3#
+  %r = call i64 @f3(i64 %a, i64 %b, i64 %c)
+  ret i64 %r
+}
+
+define i64 @call_indirect(ptr %fp, i64 %a, i64 %b) {
+; CHECK-LABEL: call_indirect#:
+; CHECK-DAG: mov out0 = r33
+; CHECK-DAG: mov out1 = r34
+; The descriptor's gp is loaded into r1, the entry point into a branch register.
+; CHECK: mov b6 = {{r[0-9]+}}
+; CHECK: br.call.sptk rp = b6
+  %r = call i64 %fp(i64 %a, i64 %b)
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/dbg-out-reg.ll llvm/test/CodeGen/IA64/dbg-out-reg.ll
new file mode 100644
index 000000000000..eea312f2add8
--- /dev/null
+++ llvm/test/CodeGen/IA64/dbg-out-reg.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s --implicit-check-not='<- $out'
+
+; IA-64's output registers out0-out7 are symbolic: gas resolves 'out0' to the
+; real stacked register r(32+inputs+locals) from the 'alloc', but the .td gives
+; them the fixed DwarfRegNum 120-127. So a variable that lives in an output
+; register at some PC -- here, parameter 'x' after it is moved into out0 to be
+; passed to g() -- must be described in debug info by the actual stacked
+; register, or gdb reads the wrong register (test_gdb.test_pretty_print saw a
+; bogus '0x0'). IA64FixupDebugOutRegs rewrites out0-out7 in debug values to that
+; register. 'x' must be in a stacked GPR; --implicit-check-not bans out regs.
+
+; CHECK-LABEL: f#:
+; CHECK:       //DEBUG_VALUE: f:x <- $r32
+; CHECK:       mov out0 = r32
+; CHECK:       //DEBUG_VALUE: f:x <- $r{{[0-9]+}}
+; CHECK-NOT:   //DEBUG_VALUE: {{.*}} <- $out
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define dso_local i64 @f(i64 %x, i64 %y) local_unnamed_addr !dbg !11 {
+entry:
+    #dbg_value(i64 %x, !16, !DIExpression(), !18)
+    #dbg_value(i64 %y, !17, !DIExpression(), !18)
+  %call = tail call i64 @g(i64 %x), !dbg !19
+  %add = add nsw i64 %call, %y, !dbg !20
+  ret i64 %add, !dbg !21
+}
+
+declare !dbg !22 dso_local i64 @g(i64) local_unnamed_addr
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "arg.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !12, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15)
+!12 = !DISubroutineType(types: !13)
+!13 = !{!14, !14, !14}
+!14 = !DIBasicType(name: "long", size: 64, encoding: DW_ATE_signed)
+!15 = !{!16, !17}
+!16 = !DILocalVariable(name: "x", arg: 1, scope: !11, file: !1, line: 2, type: !14)
+!17 = !DILocalVariable(name: "y", arg: 2, scope: !11, file: !1, line: 2, type: !14)
+!18 = !DILocation(line: 0, scope: !11)
+!19 = !DILocation(line: 2, column: 33, scope: !11)
+!20 = !DILocation(line: 2, column: 38, scope: !11)
+!21 = !DILocation(line: 2, column: 26, scope: !11)
+!22 = !DISubprogram(name: "g", scope: !1, file: !1, line: 1, type: !23, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!23 = !DISubroutineType(types: !24)
+!24 = !{!14, !14}
diff --git llvm/test/CodeGen/IA64/div.ll llvm/test/CodeGen/IA64/div.ll
new file mode 100644
index 000000000000..c099e67382b1
--- /dev/null
+++ llvm/test/CodeGen/IA64/div.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer division and remainder are provided by the toolchain runtime
+; (libgcc) rather than a native instruction, so these become library calls.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @sdiv_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: sdiv_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __divdi3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sdiv_i64#
+  %r = sdiv i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @udiv_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: udiv_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __udivdi3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp udiv_i64#
+  %r = udiv i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @srem_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: srem_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __moddi3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp srem_i64#
+  %r = srem i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @urem_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: urem_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __umoddi3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp urem_i64#
+  %r = urem i64 %a, %b
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/dwarf-file.ll llvm/test/CodeGen/IA64/dwarf-file.ll
new file mode 100644
index 000000000000..176bfa4969e1
--- /dev/null
+++ llvm/test/CodeGen/IA64/dwarf-file.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; GNU 'as' for IA-64 only accepts the single-string `.file N "name"` form, not
+; LLVM's default two-argument `.file N "dir" "name"` (it rejects the second
+; string as "junk at end of line"). With EnableDwarfFileDirectoryDefault=false
+; the MCAsmStreamer folds the directory into the filename, emitting one quoted
+; string: `.file N "dir/name"`.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+; The numbered .file directive must carry a single folded "dir/name" string and
+; never a separate directory operand.
+; CHECK: .file 1 "/home/user/src/test.c"
+; CHECK-NOT: .file 1 {{.*}}" "
+
+define i64 @f(i64 %x) !dbg !4 {
+  ret i64 %x, !dbg !7
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.c", directory: "/home/user/src")
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !5, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null}
+!7 = !DILocation(line: 2, column: 1, scope: !4)
diff --git llvm/test/CodeGen/IA64/ext-trunc.ll llvm/test/CodeGen/IA64/ext-trunc.ll
new file mode 100644
index 000000000000..f26332c75a8d
--- /dev/null
+++ llvm/test/CodeGen/IA64/ext-trunc.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer sign/zero/any extension and truncation, including the i1 <-> GR
+; conversions used for predicate values.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @sext_i32_i64(i32 %a) {
+; CHECK-LABEL: sext_i32_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    sxt4 r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sext_i32_i64#
+  %r = sext i32 %a to i64
+  ret i64 %r
+}
+
+define i64 @zext_i32_i64(i32 %a) {
+; CHECK-LABEL: zext_i32_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    zxt4 r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp zext_i32_i64#
+  %r = zext i32 %a to i64
+  ret i64 %r
+}
+
+define i64 @sext_i16_i64(i16 %a) {
+; CHECK-LABEL: sext_i16_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    sxt2 r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sext_i16_i64#
+  %r = sext i16 %a to i64
+  ret i64 %r
+}
+
+define i64 @zext_i8_i64(i8 %a) {
+; CHECK-LABEL: zext_i8_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    zxt1 r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp zext_i8_i64#
+  %r = zext i8 %a to i64
+  ret i64 %r
+}
+
+define i32 @trunc_i64_i32(i64 %a) {
+; CHECK-LABEL: trunc_i64_i32#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp trunc_i64_i32#
+  %r = trunc i64 %a to i32
+  ret i32 %r
+}
+
+define i64 @zext_i1_i64(i1 %a) {
+; CHECK-LABEL: zext_i1_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    adds r3 = 1, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    and r8 = r32, r3
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp zext_i1_i64#
+  %r = zext i1 %a to i64
+  ret i64 %r
+}
+
+define i64 @sext_i1_i64(i1 %a) {
+; CHECK-LABEL: sext_i1_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    adds r3 = 1, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    and r3 = r32, r3
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    sub r8 = r8, r3
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp sext_i1_i64#
+  %r = sext i1 %a to i64
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/f16.ll llvm/test/CodeGen/IA64/f16.ll
new file mode 100644
index 000000000000..d154313521c4
--- /dev/null
+++ llvm/test/CodeGen/IA64/f16.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; IA-64 has no native half (f16). Truncation to f16 expands to the soft-float
+; libcalls __truncsfhf2/__truncdfhf2/__truncxfhf2; extension from f16 always
+; calls __extendhfsf2, even for f64/f80 results. f16 is loaded/stored as plain
+; i16 bits rather than as an extended/truncated FP value.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define half @trunc_f32_to_f16(float %x) {
+; CHECK-LABEL: trunc_f32_to_f16#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __truncsfhf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp trunc_f32_to_f16#
+  %r = fptrunc float %x to half
+  ret half %r
+}
+
+define half @trunc_f64_to_f16(double %x) {
+; CHECK-LABEL: trunc_f64_to_f16#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __truncdfhf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp trunc_f64_to_f16#
+  %r = fptrunc double %x to half
+  ret half %r
+}
+
+define half @trunc_f80_to_f16(x86_fp80 %x) {
+; CHECK-LABEL: trunc_f80_to_f16#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,2,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __truncxfhf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp trunc_f80_to_f16#
+  %r = fptrunc x86_fp80 %x to half
+  ret half %r
+}
+
+define float @ext_f16_to_f32(half %x) {
+; CHECK-LABEL: ext_f16_to_f32#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __extendhfsf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ext_f16_to_f32#
+  %r = fpext half %x to float
+  ret float %r
+}
+
+define double @ext_f16_to_f64(half %x) {
+; CHECK-LABEL: ext_f16_to_f64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __extendhfsf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ext_f16_to_f64#
+  %r = fpext half %x to double
+  ret double %r
+}
+
+define x86_fp80 @ext_f16_to_f80(half %x) {
+; CHECK-LABEL: ext_f16_to_f80#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __extendhfsf2#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ext_f16_to_f80#
+  %r = fpext half %x to x86_fp80
+  ret x86_fp80 %r
+}
+
+; f16 in memory is just i16 bits: a load/store round-trip must not introduce
+; any FP extend/truncate.
+define void @load_store_f16(ptr %p, ptr %q) {
+; CHECK-LABEL: load_store_f16#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    ld2 r3 = [r32]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    st2 [r33] = r3
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp load_store_f16#
+  %v = load half, ptr %p
+  store half %v, ptr %q
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/f80.ll llvm/test/CodeGen/IA64/f80.ll
new file mode 100644
index 000000000000..bdf270abc894
--- /dev/null
+++ llvm/test/CodeGen/IA64/f80.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Native 80-bit floating point (x86_fp80 / long double): full-precision
+; arithmetic, the two-slot memory layout (ldfe/stfe), and constant pool loads.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define x86_fp80 @add_f80(x86_fp80 %a, x86_fp80 %b) {
+; CHECK-LABEL: add_f80#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    fadd f8 = f8, f9
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp add_f80#
+  %r = fadd x86_fp80 %a, %b
+  ret x86_fp80 %r
+}
+
+define x86_fp80 @load_f80(ptr %p) {
+; CHECK-LABEL: load_f80#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    ldfe f8 = [r32]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp load_f80#
+  %v = load x86_fp80, ptr %p
+  ret x86_fp80 %v
+}
+
+define void @store_f80(x86_fp80 %v, ptr %p) {
+; CHECK-LABEL: store_f80#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,4,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    stfe [r34] = f8
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp store_f80#
+  store x86_fp80 %v, ptr %p
+  ret void
+}
+
+define x86_fp80 @const_f80() {
+; CHECK-LABEL: const_f80#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(.LCPI3_0), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r3 = [r3]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ldfe f8 = [r3]
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp const_f80#
+  ret x86_fp80 0xK4000C90FDAA22168C235
+}
diff --git llvm/test/CodeGen/IA64/fma.ll llvm/test/CodeGen/IA64/fma.ll
new file mode 100644
index 000000000000..f675f48cc726
--- /dev/null
+++ llvm/test/CodeGen/IA64/fma.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Fused multiply-add. IA-64 has a single-rounding a*b+c F-unit op whose
+; completer picks the precision: ".s" (f32), ".d" (f64), none (native 80-bit).
+; The fnma/fms variants negate an operand. An explicit llvm.fma.fN intrinsic
+; (as compiler_builtins' libm / rustc emit) must select the matching opcode for
+; all three widths; f32 in particular needs its own pattern because an
+; ISD::FMA(f32) node is produced regardless of contraction.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)
+declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80)
+
+; f32: a*b+c
+define float @fma_f32(float %a, float %b, float %c) {
+; CHECK-LABEL: fma_f32#:
+; CHECK: fma.s {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+  %r = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %r
+}
+
+; f32: a*b-c selects fms.s
+define float @fms_f32(float %a, float %b, float %c) {
+; CHECK-LABEL: fms_f32#:
+; CHECK: fms.s {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+  %nc = fneg float %c
+  %r = call float @llvm.fma.f32(float %a, float %b, float %nc)
+  ret float %r
+}
+
+; f32: -a*b+c selects fnma.s
+define float @fnma_f32(float %a, float %b, float %c) {
+; CHECK-LABEL: fnma_f32#:
+; CHECK: fnma.s {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+  %na = fneg float %a
+  %r = call float @llvm.fma.f32(float %na, float %b, float %c)
+  ret float %r
+}
+
+; A contractable fmul+fadd over f32 is fused (isFMAFasterThanFMulAndFAdd
+; returns true for f32), so it collapses to a single fma.s.
+define float @contract_f32(float %a, float %b, float %c) {
+; CHECK-LABEL: contract_f32#:
+; CHECK: fma.s {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+; CHECK-NOT: fmpy
+; CHECK-NOT: fadd
+  %m = fmul contract float %a, %b
+  %r = fadd contract float %m, %c
+  ret float %r
+}
+
+; Without a contract/fast flag, the multiply and add stay separate roundings.
+define float @nocontract_f32(float %a, float %b, float %c) {
+; CHECK-LABEL: nocontract_f32#:
+; CHECK: fmpy.s {{f[0-9]+}} =
+; CHECK: fadd.s {{f[0-9]+}} =
+; CHECK-NOT: fma.s
+  %m = fmul float %a, %b
+  %r = fadd float %m, %c
+  ret float %r
+}
+
+; f64 still selects fma.d.
+define double @fma_f64(double %a, double %b, double %c) {
+; CHECK-LABEL: fma_f64#:
+; CHECK: fma.d {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+  %r = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %r
+}
+
+; x86_fp80 selects the no-completer native fma.
+define x86_fp80 @fma_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) {
+; CHECK-LABEL: fma_f80#:
+; CHECK: fma {{f[0-9]+}} = {{f[0-9]+}}, {{f[0-9]+}}, {{f[0-9]+}}
+; CHECK-NOT: fma.
+  %r = call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c)
+  ret x86_fp80 %r
+}
diff --git llvm/test/CodeGen/IA64/fp-args.ll llvm/test/CodeGen/IA64/fp-args.ll
new file mode 100644
index 000000000000..afee9df2b045
--- /dev/null
+++ llvm/test/CodeGen/IA64/fp-args.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Floating-point argument and return passing.
+;
+; FP registers legally hold both f32 and f64, so f32 args/returns are passed
+; as-is in FRs without promotion to f64.
+;   Regression bce6f5b2: f32 args were promoted to f64, forcing a spurious
+;     TRUNCATE on a value already known to be 32-bit.
+;   Regression a4a398ee: f32 returns were promoted to f64, emitting a spurious
+;     fnorm.s on a value already f32.
+;
+; On IA-64, FP args travel in FRs but still occupy an argument *slot*: the GR
+; for that slot is skipped (shadowed), so an integer following an FP arg takes
+; the next slot's GR, not a GR keyed off the FP register number.
+;   Regression 4330a190: shadowing keyed off the FP register index put the
+;     trailing integer in the wrong GR.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+; f32 returned directly in f8, no promotion/fnorm.
+define float @f32_passthrough(float %a) {
+; CHECK-LABEL: f32_passthrough#:
+; CHECK-NOT: fnorm
+; CHECK-NOT: trunc
+  ret float %a
+}
+
+; f32 args in f8/f9, result returned directly; no extra rounding on the result.
+define float @f32_add(float %a, float %b) {
+; CHECK-LABEL: f32_add#:
+; CHECK: fadd.s f8 = f8, f9
+; CHECK-NOT: fnorm
+  %r = fadd float %a, %b
+  ret float %r
+}
+
+declare void @sink(double, i32)
+
+; Callee: incoming i32 %b occupies GR slot 1 (r33), shadowed past the double.
+define void @int_after_fp(double %a, i32 %b) {
+; CHECK-LABEL: int_after_fp#:
+; CHECK: mov out1 = r33
+  call void @sink(double %a, i32 %b)
+  ret void
+}
+
+; Caller: double in f8, the trailing i32 goes to GR out slot 1 (out1), not out2.
+define void @call_int_after_fp() {
+; CHECK-LABEL: call_int_after_fp#:
+; CHECK-DAG: setf.d f8 =
+; CHECK-DAG: out1 = 7,
+; CHECK-NOT: out2 = 7,
+  call void @sink(double 1.0, i32 7)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/fp-arith.ll llvm/test/CodeGen/IA64/fp-arith.ll
new file mode 100644
index 000000000000..fc4be6d703cd
--- /dev/null
+++ llvm/test/CodeGen/IA64/fp-arith.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Floating-point arithmetic and the precision completer.
+;
+; IA-64 FP registers are 82-bit; the arithmetic opcode carries a completer
+; selecting the rounding precision: ".s" for f32, ".d" for f64, and none for
+; the native 80-bit (full) precision used by x86_fp80.
+;
+; Regression a5dbe312 ("Set f64 precision on f64 arithmetic"): f64 ops were
+; emitted without the ".d" completer, computing at full precision.
+; Regression a33ad26c ("Remove redundant FNORM on FP widening"): fpext emitted
+; a bare fnorm even though FR values are already stored widened to 80 bits.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define double @fadd_f64(double %a, double %b) {
+; CHECK-LABEL: fadd_f64#:
+; CHECK: fadd.d {{f[0-9]+}} =
+  %r = fadd double %a, %b
+  ret double %r
+}
+
+define double @fsub_f64(double %a, double %b) {
+; CHECK-LABEL: fsub_f64#:
+; CHECK: fsub.d {{f[0-9]+}} =
+  %r = fsub double %a, %b
+  ret double %r
+}
+
+define double @fmul_f64(double %a, double %b) {
+; CHECK-LABEL: fmul_f64#:
+; CHECK: fmpy.d {{f[0-9]+}} =
+  %r = fmul double %a, %b
+  ret double %r
+}
+
+define float @fadd_f32(float %a, float %b) {
+; CHECK-LABEL: fadd_f32#:
+; CHECK: fadd.s {{f[0-9]+}} =
+  %r = fadd float %a, %b
+  ret float %r
+}
+
+define float @fmul_f32(float %a, float %b) {
+; CHECK-LABEL: fmul_f32#:
+; CHECK: fmpy.s {{f[0-9]+}} =
+  %r = fmul float %a, %b
+  ret float %r
+}
+
+; x86_fp80 is the native 80-bit precision: the opcode carries no completer.
+define x86_fp80 @fadd_f80(x86_fp80 %a, x86_fp80 %b) {
+; CHECK-LABEL: fadd_f80#:
+; CHECK: fadd {{f[0-9]+}} =
+; CHECK-NOT: fadd.
+  %r = fadd x86_fp80 %a, %b
+  ret x86_fp80 %r
+}
+
+; Widening f32->f64 is a no-op: the FR value is already 80-bit. No fnorm.
+define double @fpext_f32_f64(float %a) {
+; CHECK-LABEL: fpext_f32_f64#:
+; CHECK-NOT: fnorm
+  %r = fpext float %a to double
+  ret double %r
+}
diff --git llvm/test/CodeGen/IA64/fp-select-setcc.ll llvm/test/CodeGen/IA64/fp-select-setcc.ll
new file mode 100644
index 000000000000..86d6bd9aa1a8
--- /dev/null
+++ llvm/test/CodeGen/IA64/fp-select-setcc.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Floating-point comparison, select, and the sign-manipulation ops.
+; fneg/fabs/fnegabs are precision-agnostic (no completer); fcmp sets a predicate
+; pair; an FP select is a predicated FR move.
+;
+; Regression b85d0d0a ("Fix select of f32 FNEG/FABS/FABSNEG"): these ops were
+; implemented for f64/f80 but accidentally omitted for f32.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare float @llvm.fabs.f32(float)
+
+define float @fneg_f32(float %a) {
+; CHECK-LABEL: fneg_f32#:
+; CHECK: fneg f8 = f8
+  %r = fneg float %a
+  ret float %r
+}
+
+define float @fabs_f32(float %a) {
+; CHECK-LABEL: fabs_f32#:
+; CHECK: fabs f8 = f8
+  %r = call float @llvm.fabs.f32(float %a)
+  ret float %r
+}
+
+define float @fnegabs_f32(float %a) {
+; CHECK-LABEL: fnegabs_f32#:
+; CHECK: fnegabs f8 = f8
+  %t = call float @llvm.fabs.f32(float %a)
+  %r = fneg float %t
+  ret float %r
+}
+
+define double @fneg_f64(double %a) {
+; CHECK-LABEL: fneg_f64#:
+; CHECK: fneg f8 = f8
+  %r = fneg double %a
+  ret double %r
+}
+
+define i1 @fcmp_olt(float %a, float %b) {
+; CHECK-LABEL: fcmp_olt#:
+; CHECK: fcmp.lt p{{[0-9]+}}, p{{[0-9]+}} = f8, f9
+  %r = fcmp olt float %a, %b
+  ret i1 %r
+}
+
+; An FP select is lowered to a predicated FR move.
+define float @fselect(float %a, float %b, i1 %c) {
+; CHECK-LABEL: fselect#:
+; CHECK: ({{p[0-9]+}}) mov {{f[0-9]+}} = {{f[0-9]+}}
+  %r = select i1 %c, float %a, float %b
+  ret float %r
+}
diff --git llvm/test/CodeGen/IA64/fp-truncstore.ll llvm/test/CodeGen/IA64/fp-truncstore.ll
new file mode 100644
index 000000000000..6b4c4e3d3db4
--- /dev/null
+++ llvm/test/CodeGen/IA64/fp-truncstore.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Regression 1f31fec9 ("Fix truncating FP stores").
+;
+; Itanium FP stores (stfs/stfd) do NOT round their FR operand; they just write
+; the requested precision's bit pattern. A truncating store of a wider FP value
+; into narrower memory must therefore first round the value with fnorm, then
+; store. Selecting the truncstore directly to stfs/stfd stores an unrounded
+; value and produces garbage. The fix marks these truncstores Expand
+; (IA64ISelLowering.cpp setTruncStoreAction), so each emits fnorm-then-store.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+; f64 -> f32: round to single, then single store.
+define void @f64_to_f32(double %a, ptr %p) {
+; CHECK-LABEL: f64_to_f32#:
+; CHECK: fnorm.s [[V:f[0-9]+]] =
+; CHECK: stfs {{\[}}{{r[0-9]+}}] = [[V]]
+  %t = fptrunc double %a to float
+  store float %t, ptr %p
+  ret void
+}
+
+; f80 -> f32: round to single, then single store.
+define void @f80_to_f32(x86_fp80 %a, ptr %p) {
+; CHECK-LABEL: f80_to_f32#:
+; CHECK: fnorm.s [[V:f[0-9]+]] =
+; CHECK: stfs {{\[}}{{r[0-9]+}}] = [[V]]
+  %t = fptrunc x86_fp80 %a to float
+  store float %t, ptr %p
+  ret void
+}
+
+; f80 -> f64: round to double, then double store.
+define void @f80_to_f64(x86_fp80 %a, ptr %p) {
+; CHECK-LABEL: f80_to_f64#:
+; CHECK: fnorm.d [[V:f[0-9]+]] =
+; CHECK: stfd {{\[}}{{r[0-9]+}}] = [[V]]
+  %t = fptrunc x86_fp80 %a to double
+  store double %t, ptr %p
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/frame-overflow.ll llvm/test/CodeGen/IA64/frame-overflow.ll
new file mode 100644
index 000000000000..e697d0f2e2d3
--- /dev/null
+++ llvm/test/CodeGen/IA64/frame-overflow.ll
@@ -0,0 +1,281 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Regression test for stacked-GPR frame overflow under high register
+; pressure. The 'alloc' frame is (locals + outputs) and must be <= 96
+; stacked GPRs. A non-leaf function parks both the caller's ar.pfs and the
+; return pointer (rp) in one extra local each, and gas places the
+; outgoing-argument registers out0-out7 immediately above the locals -- so
+; locals + 1 (ar.pfs) + 1 (rp) + 8 (outputs) must fit in 96, i.e. the
+; allocator may use at most 86 stacked locals.
+;
+; getReservedRegs caps this by reserving the top 10 stacked GPRs (r118-r127):
+; the 8 outputs plus the rp save (r119) plus the ar.pfs save (r118). The worst
+; case 86 + 1 + 1 + 8 = 96 then exactly fits; one more local would push out7
+; onto the nonexistent r128, which GNU as rejects with "Size of frame exceeds
+; maximum of 96 registers".
+;
+; The 120 volatile loads below are all live across the 8-argument call, so
+; they must occupy callee-preserved stacked locals (scratch GRs do not
+; survive a call), saturating the local frame to its cap.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+@g = external global [120 x i64]
+
+declare i64 @sink8(i64, i64, i64, i64, i64, i64, i64, i64)
+
+; CHECK-LABEL: pressure#:
+; The alloc frame must be locals=88, outputs=8: 86 allocator locals + the
+; ar.pfs save (r118) + the rp save (r119). 'alloc' is preceded by its
+; '.save ar.pfs' directive and followed by '.save rp', so the checks are in
+; that order.
+; CHECK: .save{{.*}}ar.pfs, r118
+; CHECK: alloc r{{[0-9]+}} = ar.pfs,0,88,8,0
+; CHECK: .save{{.*}}rp, r119
+
+define i64 @pressure() {
+entry:
+  %p0 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 0)
+  %p1 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 1)
+  %p2 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 2)
+  %p3 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 3)
+  %p4 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 4)
+  %p5 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 5)
+  %p6 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 6)
+  %p7 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 7)
+  %p8 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 8)
+  %p9 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 9)
+  %p10 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 10)
+  %p11 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 11)
+  %p12 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 12)
+  %p13 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 13)
+  %p14 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 14)
+  %p15 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 15)
+  %p16 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 16)
+  %p17 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 17)
+  %p18 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 18)
+  %p19 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 19)
+  %p20 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 20)
+  %p21 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 21)
+  %p22 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 22)
+  %p23 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 23)
+  %p24 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 24)
+  %p25 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 25)
+  %p26 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 26)
+  %p27 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 27)
+  %p28 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 28)
+  %p29 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 29)
+  %p30 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 30)
+  %p31 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 31)
+  %p32 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 32)
+  %p33 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 33)
+  %p34 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 34)
+  %p35 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 35)
+  %p36 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 36)
+  %p37 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 37)
+  %p38 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 38)
+  %p39 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 39)
+  %p40 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 40)
+  %p41 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 41)
+  %p42 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 42)
+  %p43 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 43)
+  %p44 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 44)
+  %p45 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 45)
+  %p46 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 46)
+  %p47 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 47)
+  %p48 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 48)
+  %p49 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 49)
+  %p50 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 50)
+  %p51 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 51)
+  %p52 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 52)
+  %p53 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 53)
+  %p54 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 54)
+  %p55 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 55)
+  %p56 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 56)
+  %p57 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 57)
+  %p58 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 58)
+  %p59 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 59)
+  %p60 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 60)
+  %p61 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 61)
+  %p62 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 62)
+  %p63 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 63)
+  %p64 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 64)
+  %p65 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 65)
+  %p66 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 66)
+  %p67 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 67)
+  %p68 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 68)
+  %p69 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 69)
+  %p70 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 70)
+  %p71 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 71)
+  %p72 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 72)
+  %p73 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 73)
+  %p74 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 74)
+  %p75 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 75)
+  %p76 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 76)
+  %p77 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 77)
+  %p78 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 78)
+  %p79 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 79)
+  %p80 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 80)
+  %p81 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 81)
+  %p82 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 82)
+  %p83 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 83)
+  %p84 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 84)
+  %p85 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 85)
+  %p86 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 86)
+  %p87 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 87)
+  %p88 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 88)
+  %p89 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 89)
+  %p90 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 90)
+  %p91 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 91)
+  %p92 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 92)
+  %p93 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 93)
+  %p94 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 94)
+  %p95 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 95)
+  %p96 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 96)
+  %p97 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 97)
+  %p98 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 98)
+  %p99 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 99)
+  %p100 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 100)
+  %p101 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 101)
+  %p102 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 102)
+  %p103 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 103)
+  %p104 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 104)
+  %p105 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 105)
+  %p106 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 106)
+  %p107 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 107)
+  %p108 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 108)
+  %p109 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 109)
+  %p110 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 110)
+  %p111 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 111)
+  %p112 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 112)
+  %p113 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 113)
+  %p114 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 114)
+  %p115 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 115)
+  %p116 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 116)
+  %p117 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 117)
+  %p118 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 118)
+  %p119 = load volatile i64, ptr getelementptr inbounds ([120 x i64], ptr @g, i64 0, i64 119)
+  %c = call i64 @sink8(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8)
+  %a0 = add i64 %c, %p0
+  %a1 = add i64 %a0, %p1
+  %a2 = add i64 %a1, %p2
+  %a3 = add i64 %a2, %p3
+  %a4 = add i64 %a3, %p4
+  %a5 = add i64 %a4, %p5
+  %a6 = add i64 %a5, %p6
+  %a7 = add i64 %a6, %p7
+  %a8 = add i64 %a7, %p8
+  %a9 = add i64 %a8, %p9
+  %a10 = add i64 %a9, %p10
+  %a11 = add i64 %a10, %p11
+  %a12 = add i64 %a11, %p12
+  %a13 = add i64 %a12, %p13
+  %a14 = add i64 %a13, %p14
+  %a15 = add i64 %a14, %p15
+  %a16 = add i64 %a15, %p16
+  %a17 = add i64 %a16, %p17
+  %a18 = add i64 %a17, %p18
+  %a19 = add i64 %a18, %p19
+  %a20 = add i64 %a19, %p20
+  %a21 = add i64 %a20, %p21
+  %a22 = add i64 %a21, %p22
+  %a23 = add i64 %a22, %p23
+  %a24 = add i64 %a23, %p24
+  %a25 = add i64 %a24, %p25
+  %a26 = add i64 %a25, %p26
+  %a27 = add i64 %a26, %p27
+  %a28 = add i64 %a27, %p28
+  %a29 = add i64 %a28, %p29
+  %a30 = add i64 %a29, %p30
+  %a31 = add i64 %a30, %p31
+  %a32 = add i64 %a31, %p32
+  %a33 = add i64 %a32, %p33
+  %a34 = add i64 %a33, %p34
+  %a35 = add i64 %a34, %p35
+  %a36 = add i64 %a35, %p36
+  %a37 = add i64 %a36, %p37
+  %a38 = add i64 %a37, %p38
+  %a39 = add i64 %a38, %p39
+  %a40 = add i64 %a39, %p40
+  %a41 = add i64 %a40, %p41
+  %a42 = add i64 %a41, %p42
+  %a43 = add i64 %a42, %p43
+  %a44 = add i64 %a43, %p44
+  %a45 = add i64 %a44, %p45
+  %a46 = add i64 %a45, %p46
+  %a47 = add i64 %a46, %p47
+  %a48 = add i64 %a47, %p48
+  %a49 = add i64 %a48, %p49
+  %a50 = add i64 %a49, %p50
+  %a51 = add i64 %a50, %p51
+  %a52 = add i64 %a51, %p52
+  %a53 = add i64 %a52, %p53
+  %a54 = add i64 %a53, %p54
+  %a55 = add i64 %a54, %p55
+  %a56 = add i64 %a55, %p56
+  %a57 = add i64 %a56, %p57
+  %a58 = add i64 %a57, %p58
+  %a59 = add i64 %a58, %p59
+  %a60 = add i64 %a59, %p60
+  %a61 = add i64 %a60, %p61
+  %a62 = add i64 %a61, %p62
+  %a63 = add i64 %a62, %p63
+  %a64 = add i64 %a63, %p64
+  %a65 = add i64 %a64, %p65
+  %a66 = add i64 %a65, %p66
+  %a67 = add i64 %a66, %p67
+  %a68 = add i64 %a67, %p68
+  %a69 = add i64 %a68, %p69
+  %a70 = add i64 %a69, %p70
+  %a71 = add i64 %a70, %p71
+  %a72 = add i64 %a71, %p72
+  %a73 = add i64 %a72, %p73
+  %a74 = add i64 %a73, %p74
+  %a75 = add i64 %a74, %p75
+  %a76 = add i64 %a75, %p76
+  %a77 = add i64 %a76, %p77
+  %a78 = add i64 %a77, %p78
+  %a79 = add i64 %a78, %p79
+  %a80 = add i64 %a79, %p80
+  %a81 = add i64 %a80, %p81
+  %a82 = add i64 %a81, %p82
+  %a83 = add i64 %a82, %p83
+  %a84 = add i64 %a83, %p84
+  %a85 = add i64 %a84, %p85
+  %a86 = add i64 %a85, %p86
+  %a87 = add i64 %a86, %p87
+  %a88 = add i64 %a87, %p88
+  %a89 = add i64 %a88, %p89
+  %a90 = add i64 %a89, %p90
+  %a91 = add i64 %a90, %p91
+  %a92 = add i64 %a91, %p92
+  %a93 = add i64 %a92, %p93
+  %a94 = add i64 %a93, %p94
+  %a95 = add i64 %a94, %p95
+  %a96 = add i64 %a95, %p96
+  %a97 = add i64 %a96, %p97
+  %a98 = add i64 %a97, %p98
+  %a99 = add i64 %a98, %p99
+  %a100 = add i64 %a99, %p100
+  %a101 = add i64 %a100, %p101
+  %a102 = add i64 %a101, %p102
+  %a103 = add i64 %a102, %p103
+  %a104 = add i64 %a103, %p104
+  %a105 = add i64 %a104, %p105
+  %a106 = add i64 %a105, %p106
+  %a107 = add i64 %a106, %p107
+  %a108 = add i64 %a107, %p108
+  %a109 = add i64 %a108, %p109
+  %a110 = add i64 %a109, %p110
+  %a111 = add i64 %a110, %p111
+  %a112 = add i64 %a111, %p112
+  %a113 = add i64 %a112, %p113
+  %a114 = add i64 %a113, %p114
+  %a115 = add i64 %a114, %p115
+  %a116 = add i64 %a115, %p116
+  %a117 = add i64 %a116, %p117
+  %a118 = add i64 %a117, %p118
+  %a119 = add i64 %a118, %p119
+  ret i64 %a119
+}
diff --git llvm/test/CodeGen/IA64/frameaddr.ll llvm/test/CodeGen/IA64/frameaddr.ll
new file mode 100644
index 000000000000..85bcf768e4f9
--- /dev/null
+++ llvm/test/CodeGen/IA64/frameaddr.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Regression e1222d38 ("Implement FRAMEADDR lowering for zero depth").
+;
+; FRAMEADDR/RETURNADDR "lie about being legal", so they must be forced Custom
+; and actually lowered. __builtin_frame_address(0) is the current frame pointer,
+; which on IA-64 is the stack pointer r12.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare ptr @llvm.frameaddress(i32)
+
+define ptr @frameaddr_0() {
+; CHECK-LABEL: frameaddr_0#:
+; CHECK: mov {{r[0-9]+}} = r12
+  %r = call ptr @llvm.frameaddress(i32 0)
+  ret ptr %r
+}
diff --git llvm/test/CodeGen/IA64/function-alias.ll llvm/test/CodeGen/IA64/function-alias.ll
new file mode 100644
index 000000000000..ef72f357b9a3
--- /dev/null
+++ llvm/test/CodeGen/IA64/function-alias.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; A function pointer stored in data is the address of the function's descriptor
+; { entry, gp }, so it is emitted as @fptr(f). A GlobalAlias of a function,
+; however, is just another name for the aliasee's entry-point symbol: it must
+; resolve to the bare entry point (`A = B`), not repeat the @fptr wrapping --
+; both because the alias should equal the entry point and because GNU as rejects
+; an @fptr pseudo-fixup in a symbol-assignment expression.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define void @target() {
+  ret void
+}
+
+@falias = alias void (), ptr @target
+@fnptr = global ptr @target
+
+; A function pointer in data is wrapped in @fptr (descriptor address)...
+; CHECK-LABEL: fnptr#:
+; CHECK-NEXT:  data8.ua @fptr(target#)
+
+; ...but the alias is a plain symbol assignment to the entry point, no @fptr.
+; CHECK:       .globl falias#
+; CHECK:       .type falias#,@function
+; CHECK:       falias# = target#
+; CHECK-NOT:   @fptr
diff --git llvm/test/CodeGen/IA64/globals.ll llvm/test/CodeGen/IA64/globals.ll
new file mode 100644
index 000000000000..5a4c97f39838
--- /dev/null
+++ llvm/test/CodeGen/IA64/globals.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Global address materialization through the GOT (@ltoff) and load/store of a
+; global's value.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+@g = global i64 0
+@arr = global [4 x i64] zeroinitializer
+
+define ptr @addr_of_g() {
+; CHECK-LABEL: addr_of_g#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(g#), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r3]
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp addr_of_g#
+  ret ptr @g
+}
+
+define i64 @load_g() {
+; CHECK-LABEL: load_g#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(g#), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r3 = [r3]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r3]
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp load_g#
+  %v = load i64, ptr @g
+  ret i64 %v
+}
+
+define void @store_g(i64 %v) {
+; CHECK-LABEL: store_g#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(g#), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r3 = [r3]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    st8 [r3] = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp store_g#
+  store i64 %v, ptr @g
+  ret void
+}
+
+define ptr @addr_of_elem() {
+; CHECK-LABEL: addr_of_elem#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(arr#), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r3 = [r3]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds r8 = 16, r3
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp addr_of_elem#
+  ret ptr getelementptr inbounds ([4 x i64], ptr @arr, i64 0, i64 2)
+}
diff --git llvm/test/CodeGen/IA64/i1-ops.ll llvm/test/CodeGen/IA64/i1-ops.ll
new file mode 100644
index 000000000000..c98ea4db2874
--- /dev/null
+++ llvm/test/CodeGen/IA64/i1-ops.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Boolean (i1 / predicate) logical operations and equality, plus i1 select.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i1 @and_i1(i1 %a, i1 %b) {
+; CHECK-LABEL: and_i1#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    and r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp and_i1#
+  %r = and i1 %a, %b
+  ret i1 %r
+}
+
+define i1 @or_i1(i1 %a, i1 %b) {
+; CHECK-LABEL: or_i1#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    or r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp or_i1#
+  %r = or i1 %a, %b
+  ret i1 %r
+}
+
+define i1 @xor_i1(i1 %a, i1 %b) {
+; CHECK-LABEL: xor_i1#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    xor r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp xor_i1#
+  %r = xor i1 %a, %b
+  ret i1 %r
+}
+
+define i1 @not_i1(i1 %a) {
+; CHECK-LABEL: not_i1#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    extr.u r3 = r32, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r3, r0
+; CHECK-NEXT:    cmp.eq p7, p0 = r0, r0
+; CHECK-NEXT:    cmp.ne p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p8) cmp.eq.unc p9, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p7) cmp.eq p9, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) cmp.ne p9, p0 = r0, r0
+; CHECK-NEXT:    (p6) cmp.eq p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p7) cmp.ne p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p9) cmp.eq p8, p0 = r0, r0
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p8) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp not_i1#
+  %r = xor i1 %a, true
+  ret i1 %r
+}
+
+define i1 @cmp_i1_eq(i1 %a, i1 %b) {
+; CHECK-LABEL: cmp_i1_eq#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    xor r3 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    extr.u r3 = r3, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r3, r0
+; CHECK-NEXT:    cmp.eq p7, p0 = r0, r0
+; CHECK-NEXT:    cmp.ne p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p8) cmp.eq.unc p9, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p7) cmp.eq p9, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) cmp.ne p9, p0 = r0, r0
+; CHECK-NEXT:    (p6) cmp.eq p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p7) cmp.ne p8, p0 = r0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p9) cmp.eq p8, p0 = r0, r0
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p8) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_i1_eq#
+  %r = icmp eq i1 %a, %b
+  ret i1 %r
+}
+
+define i1 @select_i1(i1 %c, i1 %a, i1 %b) {
+; CHECK-LABEL: select_i1#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,4,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    extr.u r3 = r33, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r3, r0
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r3 = r8
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r3 = 1, r3
+; CHECK-NEXT:    extr.u r9 = r34, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r9, r0
+; CHECK-NEXT:    mov r9 = r8
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r9 = 1, r9
+; CHECK-NEXT:    extr.u r10 = r32, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r10, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) mov r9 = r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r9, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp select_i1#
+  %r = select i1 %c, i1 %a, i1 %b
+  ret i1 %r
+}
diff --git llvm/test/CodeGen/IA64/imm.ll llvm/test/CodeGen/IA64/imm.ll
new file mode 100644
index 000000000000..f6caad38b366
--- /dev/null
+++ llvm/test/CodeGen/IA64/imm.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer constant materialization.
+;   14-bit immediates: adds rX = imm, r0
+;   22-bit immediates: addl rX = imm, r0
+;   wider:             movl rX = imm
+;
+; Regression baee1399 ("Limit ADDL to constant materialization"): ADDL used to
+; have a general selection pattern and could fold a register operand, which
+; cornered the register allocator (addl's source GR must be r0-r3). It is now
+; emitted only to materialize a constant (source r0); a register add of a wide
+; constant materializes with movl and a separate add, never an addl that folds
+; the register.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @small() {
+; CHECK-LABEL: small#:
+; CHECK: adds r8 = 100, r0
+  ret i64 100
+}
+
+define i64 @medium() {
+; CHECK-LABEL: medium#:
+; CHECK: addl r8 = 1000000, r0
+  ret i64 1000000
+}
+
+define i64 @big() {
+; CHECK-LABEL: big#:
+; CHECK: movl r8 = 1234605616436508552
+  ret i64 1234605616436508552
+}
+
+; Wide constant + register: movl then add; no addl may consume r32 before that add.
+define i64 @add_wide_const(i64 %a) {
+; CHECK-LABEL: add_wide_const#:
+; CHECK-NOT: addl {{.*}}, r32
+; CHECK: movl [[C:r[0-9]+]] = 1234605616436508552
+; CHECK-NOT: addl {{.*}}, r32
+; CHECK: add r8 = r32, [[C]]
+  %r = add i64 %a, 1234605616436508552
+  ret i64 %r
+}
+
+; A small constant folds into adds, not addl.
+define i64 @add_small_const(i64 %a) {
+; CHECK-LABEL: add_small_const#:
+; CHECK: adds r8 = 100, r32
+  %r = add i64 %a, 100
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/indirect-call.ll llvm/test/CodeGen/IA64/indirect-call.ll
new file mode 100644
index 000000000000..3559556e8eca
--- /dev/null
+++ llvm/test/CodeGen/IA64/indirect-call.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Calls through a function pointer go via a function descriptor (entry point +
+; gp) and branch through a branch register; the address of a function yields
+; its @fptr descriptor.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @target()
+
+define void @call_fp(ptr %fp) {
+; CHECK-LABEL: call_fp#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    ld8 r3 = [r32]
+; CHECK-NEXT:    adds r8 = 8, r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r8]
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r8
+; CHECK-NEXT:    mov b6 = r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = b6
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp call_fp#
+  call void %fp()
+  ret void
+}
+
+define i64 @call_fp_ret(ptr %fp, i64 %a) {
+; CHECK-LABEL: call_fp_ret#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,1,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out0 = r33
+; CHECK-NEXT:    ld8 r3 = [r32]
+; CHECK-NEXT:    adds r8 = 8, r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r8]
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r8
+; CHECK-NEXT:    mov b6 = r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = b6
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp call_fp_ret#
+  %r = call i64 %fp(i64 %a)
+  ret i64 %r
+}
+
+define ptr @func_addr() {
+; CHECK-LABEL: func_addr#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r32
+; CHECK-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    addl r3 = @ltoff(@fptr(target#)), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r3]
+; CHECK-NEXT:    mov ar.pfs = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp func_addr#
+  ret ptr @target
+}
diff --git llvm/test/CodeGen/IA64/inline-asm.ll llvm/test/CodeGen/IA64/inline-asm.ll
new file mode 100644
index 000000000000..15b7a3c0ffe5
--- /dev/null
+++ llvm/test/CodeGen/IA64/inline-asm.ll
@@ -0,0 +1,134 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Inline assembly operand constraints. The backend recognises the GCC IA-64
+; register constraints 'r' (general register) and 'f' (floating-point
+; register), the immediate constraint 'i', the indirect memory constraint 'm',
+; and register clobbers.
+;
+; IA-64 symbol names carry a '#' suffix in the assembly, so function labels are
+; "name#:".
+
+; The 'r' constraint must accept integer values narrower than i64 (e.g. the
+; i8/i32 an identity "black box" asm carries, as emitted by Rust's
+; core::hint::black_box) -- the GR class only lists i64, so without a custom
+; getRegForInlineAsmConstraint hook the generic exact-type search fails with
+; "could not allocate input reg for constraint 'r'".
+; The identity tests use an empty asm body; the //APP ... //NO_APP markers are
+; still printed around it, so matching them shows the asm reached the output.
+
+; CHECK-LABEL: black_box_i8#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i8 @black_box_i8(i8 %x) {
+  %r = call i8 asm "", "=r,0"(i8 %x)
+  ret i8 %r
+}
+
+; CHECK-LABEL: black_box_i32#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i32 @black_box_i32(i32 %x) {
+  %r = call i32 asm "", "=r,0"(i32 %x)
+  ret i32 %r
+}
+
+; CHECK-LABEL: black_box_i64#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i64 @black_box_i64(i64 %x) {
+  %r = call i64 asm "", "=r,0"(i64 %x)
+  ret i64 %r
+}
+
+; Distinct output and input 'r' operands: $0, $1 and $2 each print as a GR.
+; CHECK-LABEL: add_r#:
+; CHECK: //APP
+; CHECK: add r{{[0-9]+}} = r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i64 @add_r(i64 %a, i64 %b) {
+  %r = call i64 asm "add $0 = $1, $2", "=r,r,r"(i64 %a, i64 %b)
+  ret i64 %r
+}
+
+; An integer constant bound to the 'i' constraint is printed as a bare decimal
+; literal (PrintAsmOperand's MO_Immediate case): here the 42 fed to 'adds'.
+; CHECK-LABEL: add_imm#:
+; CHECK: //APP
+; CHECK: adds r{{[0-9]+}} = 42, r{{[0-9]+}}
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i64 @add_imm(i64 %a) {
+  %r = call i64 asm "adds $0 = $2, $1", "=r,r,i"(i64 %a, i64 42)
+  ret i64 %r
+}
+
+; An explicit register clobber (~{r14}) must be honoured: the asm body may write
+; r14 freely. Just check the asm is emitted and we still return cleanly.
+; CHECK-LABEL: clobber#:
+; CHECK: //APP
+; CHECK: add r{{[0-9]+}} = r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i64 @clobber(i64 %a, i64 %b) {
+  %r = call i64 asm "add $0 = $1, $2", "=r,r,r,~{r14}"(i64 %a, i64 %b)
+  ret i64 %r
+}
+
+; The 'f' constraint must accept floating-point values in an FP register, for
+; both f32 (an fnorm.s identity here) and f64.
+; CHECK-LABEL: black_box_f32#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define float @black_box_f32(float %x) {
+  %r = call float asm "", "=f,0"(float %x)
+  ret float %r
+}
+
+; CHECK-LABEL: black_box_f64#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define double @black_box_f64(double %x) {
+  %r = call double asm "", "=f,0"(double %x)
+  ret double %r
+}
+
+; f80 ('long double') is wider than the FP class's representative type, so it is
+; routed through the dedicated f80-only register class (otherwise the generic
+; inline-asm tiling asserts). An identity black box must round-trip it in an FR.
+; CHECK-LABEL: black_box_f80#:
+; CHECK: //APP
+; CHECK: //NO_APP
+; CHECK: br.ret
+define x86_fp80 @black_box_f80(x86_fp80 %x) {
+  %r = call x86_fp80 asm "", "=f,0"(x86_fp80 %x)
+  ret x86_fp80 %r
+}
+
+; The 'm' constraint is an indirect memory operand: IA-64 dereferences a single
+; address register, printed as '[rN]'. A store through an output '=*m' operand:
+; CHECK-LABEL: mem_store#:
+; CHECK: //APP
+; CHECK: st8 [r{{[0-9]+}}] = r{{[0-9]+}}
+; CHECK: //NO_APP
+; CHECK: br.ret
+define void @mem_store(ptr %p, i64 %v) {
+  call void asm "st8 $0 = $1", "=*m,r"(ptr elementtype(i64) %p, i64 %v)
+  ret void
+}
+
+; A load through an input '*m' operand.
+; CHECK-LABEL: mem_load#:
+; CHECK: //APP
+; CHECK: ld8 r{{[0-9]+}} = [r{{[0-9]+}}]
+; CHECK: //NO_APP
+; CHECK: br.ret
+define i64 @mem_load(ptr %p) {
+  %r = call i64 asm "ld8 $0 = $1", "=r,*m"(ptr elementtype(i64) %p)
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/jumptable.ll llvm/test/CodeGen/IA64/jumptable.ll
new file mode 100644
index 000000000000..4fc0e34fae3e
--- /dev/null
+++ llvm/test/CodeGen/IA64/jumptable.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; A dense switch is lowered through a jump table of absolute 8-byte addresses.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @sink(i64)
+
+define void @jt(i64 %x) {
+; CHECK-LABEL: jt#
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    adds r3 = 4, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.gtu p6, p0 = r32, r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) brl.cond.sptk .LBB0_8
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    adds r3 = 3, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    shl r3 = r32, r3
+; CHECK-NEXT:    addl r8 = @ltoff(.LJTI0_0), r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r8 = [r8]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    add r3 = r8, r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    ld8 r3 = [r3]
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov b6 = r3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p0) br.cond.sptk b6
+; CHECK-NEXT:  .LBB0_2: // %c0
+; CHECK-NEXT:    adds out0 = 10, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_7: // %c4
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 14, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_5: // %c2
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 12, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_6: // %c3
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 13, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_4: // %c1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 11, r0
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_3
+; CHECK-NEXT:  .LBB0_8: // %def
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    adds out0 = 99, r0
+; CHECK-NEXT:  .LBB0_3: // %c0
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = sink#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp jt#
+entry:
+  switch i64 %x, label %def [
+    i64 0, label %c0
+    i64 1, label %c1
+    i64 2, label %c2
+    i64 3, label %c3
+    i64 4, label %c4
+  ]
+c0:
+  call void @sink(i64 10)
+  ret void
+c1:
+  call void @sink(i64 11)
+  ret void
+c2:
+  call void @sink(i64 12)
+  ret void
+c3:
+  call void @sink(i64 13)
+  ret void
+c4:
+  call void @sink(i64 14)
+  ret void
+def:
+  call void @sink(i64 99)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/lit.local.cfg llvm/test/CodeGen/IA64/lit.local.cfg
new file mode 100644
index 000000000000..57c4c1bf58a6
--- /dev/null
+++ llvm/test/CodeGen/IA64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "IA64" in config.root.targets:
+    config.unsupported = True
diff --git llvm/test/CodeGen/IA64/load-store.ll llvm/test/CodeGen/IA64/load-store.ll
new file mode 100644
index 000000000000..abb147366ab5
--- /dev/null
+++ llvm/test/CodeGen/IA64/load-store.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer loads and stores. ld1/ld2/ld4/ld8 zero-extend the loaded value into
+; the 64-bit GR; a signed sub-word load is sign-extended afterwards with sxt.
+;
+; Regression b9d6b47e ("Sign-extend before load if value is signed"): a signed
+; narrow load must produce the sxt so the high bits are correct; a zero-extended
+; load needs no sxt because the ld already zero-fills.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @sextload_i8(ptr %p) {
+; CHECK-LABEL: sextload_i8#:
+; CHECK: ld1 [[R:r[0-9]+]] = {{\[}}{{r[0-9]+}}]
+; CHECK: sxt1 {{r[0-9]+}} = [[R]]
+  %v = load i8, ptr %p
+  %e = sext i8 %v to i64
+  ret i64 %e
+}
+
+define i64 @zextload_i8(ptr %p) {
+; CHECK-LABEL: zextload_i8#:
+; CHECK: ld1 {{r[0-9]+}} = {{\[}}{{r[0-9]+}}]
+; CHECK-NOT: sxt
+  %v = load i8, ptr %p
+  %e = zext i8 %v to i64
+  ret i64 %e
+}
+
+define i64 @sextload_i16(ptr %p) {
+; CHECK-LABEL: sextload_i16#:
+; CHECK: ld2 [[R:r[0-9]+]] = {{\[}}{{r[0-9]+}}]
+; CHECK: sxt2 {{r[0-9]+}} = [[R]]
+  %v = load i16, ptr %p
+  %e = sext i16 %v to i64
+  ret i64 %e
+}
+
+define i64 @sextload_i32(ptr %p) {
+; CHECK-LABEL: sextload_i32#:
+; CHECK: ld4 [[R:r[0-9]+]] = {{\[}}{{r[0-9]+}}]
+; CHECK: sxt4 {{r[0-9]+}} = [[R]]
+  %v = load i32, ptr %p
+  %e = sext i32 %v to i64
+  ret i64 %e
+}
+
+define i64 @load_i64(ptr %p) {
+; CHECK-LABEL: load_i64#:
+; CHECK: ld8 {{r[0-9]+}} = {{\[}}{{r[0-9]+}}]
+  %v = load i64, ptr %p
+  ret i64 %v
+}
+
+define void @store_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: store_i8#:
+; CHECK: st1 {{\[}}{{r[0-9]+}}] = {{r[0-9]+}}
+  store i8 %v, ptr %p
+  ret void
+}
+
+define void @store_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: store_i32#:
+; CHECK: st4 {{\[}}{{r[0-9]+}}] = {{r[0-9]+}}
+  store i32 %v, ptr %p
+  ret void
+}
+
+define void @store_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: store_i64#:
+; CHECK: st8 {{\[}}{{r[0-9]+}}] = {{r[0-9]+}}
+  store i64 %v, ptr %p
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/mul.ll llvm/test/CodeGen/IA64/mul.ll
new file mode 100644
index 000000000000..cbb952a9318c
--- /dev/null
+++ llvm/test/CodeGen/IA64/mul.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Multiplication via the XMA unit: xma.l for the low 64 bits (i64 and i32) and
+; xma.hu / xma.h for the unsigned / signed high half; plus population count.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: mul_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    setf.sig f6 = r33
+; CHECK-NEXT:    setf.sig f7 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    xma.l f6 = f7, f6, f0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    getf.sig r8 = f6
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp mul_i64#
+  %r = mul i64 %a, %b
+  ret i64 %r
+}
+
+define i64 @mulhu_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: mulhu_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    setf.sig f6 = r33
+; CHECK-NEXT:    setf.sig f7 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    xma.hu f6 = f7, f6, f0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    getf.sig r8 = f6
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp mulhu_i64#
+  %za = zext i64 %a to i128
+  %zb = zext i64 %b to i128
+  %m = mul i128 %za, %zb
+  %s = lshr i128 %m, 64
+  %r = trunc i128 %s to i64
+  ret i64 %r
+}
+
+define i64 @mulhs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: mulhs_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    setf.sig f6 = r33
+; CHECK-NEXT:    setf.sig f7 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    xma.h f6 = f7, f6, f0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    getf.sig r8 = f6
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp mulhs_i64#
+  %sa = sext i64 %a to i128
+  %sb = sext i64 %b to i128
+  %m = mul i128 %sa, %sb
+  %s = lshr i128 %m, 64
+  %r = trunc i128 %s to i64
+  ret i64 %r
+}
+
+define i32 @mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_i32#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    setf.sig f6 = r33
+; CHECK-NEXT:    setf.sig f7 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    xma.l f6 = f7, f6, f0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    getf.sig r8 = f6
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp mul_i32#
+  %r = mul i32 %a, %b
+  ret i32 %r
+}
+
+define i64 @popcnt_i64(i64 %a) {
+; CHECK-LABEL: popcnt_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,2,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    popcnt r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp popcnt_i64#
+  %r = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %r
+}
+
+declare i64 @llvm.ctpop.i64(i64)
diff --git llvm/test/CodeGen/IA64/returns-twice-spill.ll llvm/test/CodeGen/IA64/returns-twice-spill.ll
new file mode 100644
index 000000000000..6abc6baaa7cd
--- /dev/null
+++ llvm/test/CodeGen/IA64/returns-twice-spill.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; A call to a returns_twice function may modify the Register Stack Engine
+; backing store: vfork() in particular runs the child in the parent's address
+; space (CLONE_VM) on the *shared* backing store, so the child's use of stacked
+; registers overwrites the parent's. Values live across such a call must
+; therefore not be kept in stacked registers (r32-r127); the backend models this
+; by clobbering all stacked registers with a regmask on the call, forcing such
+; values out to the static callee-saved registers or to the frame.
+;
+; (Companion to setjmp-doublereturn.ll, which covers the gp/sp/rp parking.)
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare i32 @setjmp(ptr) returns_twice
+declare void @use(i64, i32)
+
+define i32 @liveacross(ptr %env, i64 %keep) {
+; CHECK-LABEL: liveacross#:
+;
+; The frame is sized to the registers actually allocated (here a handful), not
+; ballooned to the full 96-register stack: the stacked-register regmask on the
+; call must not be counted as register usage by the alloc-sizing scan.
+; CHECK: alloc {{r[0-9]+}} = ar.pfs,0,4,2,0
+;
+; %keep arrives in a stacked register (r33) and is live across the call. It is
+; spilled to the frame before the call...
+; CHECK: st8 [{{r[0-9]+}}] = r33
+; CHECK: br.call.sptk rp = setjmp#
+; ...and reloaded afterwards to be passed on, rather than read back out of the
+; (now-unreliable) stacked register.
+; CHECK: ld8 out0 = [{{r[0-9]+}}]
+; CHECK: br.call.sptk rp = use#
+entry:
+  %r = call i32 @setjmp(ptr %env)
+  call void @use(i64 %keep, i32 %r)
+  ret i32 %r
+}
diff --git llvm/test/CodeGen/IA64/rp-no-direct-spill.ll llvm/test/CodeGen/IA64/rp-no-direct-spill.ll
new file mode 100644
index 000000000000..618dcedb07be
--- /dev/null
+++ llvm/test/CodeGen/IA64/rp-no-direct-spill.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s --check-prefix=NODIRECT
+
+; The return pointer (rp) is really branch register b0. It is modeled as a member
+; of the GR class so that 'mov rN = rp' / 'mov rp = rN' assemble, but it can never
+; be the operand of a plain st8/ld8: those require a *general* register, and gas
+; rejects 'st8 [rX] = rp' / 'ld8 rp = [rX]' ("Operand N should be a general
+; register").
+;
+; A non-leaf function preserves its own return pointer by parking the incoming rp
+; once in a stacked local in the prologue (mov rN = rp) and restoring it in the
+; epilogue (mov rp = rN). It must NOT save/restore rp around each individual call:
+; LowerCall used to do that, and the per-call save value -- live across the call,
+; which clobbers b0 -- coalesced into the physical rp and was then spilled by the
+; register allocator as the illegal 'st8 [..] = rp' / 'ld8 rp = [..]'. This broke
+; real builds (e.g. fish-shell) at the external-assembler step.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @callee(i64)
+
+; Two back-to-back calls so b0 is clobbered twice while the function's own return
+; pointer must survive to the epilogue.
+define void @twocalls(i64 %x) {
+; CHECK-LABEL: twocalls#:
+; The incoming rp is parked once in a stacked local in the prologue...
+; CHECK: mov [[RPSAVE:r[0-9]+]] = rp
+; ...both calls clobber b0, with no per-call rp save/restore around either...
+; CHECK: br.call.sptk rp = callee#
+; CHECK: br.call.sptk rp = callee#
+; ...and the epilogue restores b0 from that one local.
+; CHECK: mov rp = [[RPSAVE]]
+;
+; Crucially, rp is never stored to / reloaded from the stack directly: neither of
+; these illegal forms may appear anywhere in the output.
+; NODIRECT-NOT: st8{{.*}}= rp
+; NODIRECT-NOT: ld8 rp =
+entry:
+  call void @callee(i64 %x)
+  call void @callee(i64 %x)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/select.ll llvm/test/CodeGen/IA64/select.ll
new file mode 100644
index 000000000000..2d25700ed171
--- /dev/null
+++ llvm/test/CodeGen/IA64/select.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer select and select-with-comparison.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i64 @select_i64(i1 %c, i64 %a, i64 %b) {
+; CHECK-LABEL: select_i64#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,4,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r8 = r34
+; CHECK-NEXT:    extr.u r3 = r32, 0, 1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r3, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) mov r8 = r33
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp select_i64#
+  %r = select i1 %c, i64 %a, i64 %b
+  ret i64 %r
+}
+
+define i64 @select_cc(i64 %x, i64 %y, i64 %a, i64 %b) {
+; CHECK-LABEL: select_cc#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r36
+; CHECK-NEXT:    alloc r36 = ar.pfs,0,5,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r8 = r35
+; CHECK-NEXT:    cmp.lt p6, p0 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) mov r8 = r34
+; CHECK-NEXT:    mov ar.pfs = r36
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp select_cc#
+  %c = icmp slt i64 %x, %y
+  %r = select i1 %c, i64 %a, i64 %b
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/setcc.ll llvm/test/CodeGen/IA64/setcc.ll
new file mode 100644
index 000000000000..97b7353e78af
--- /dev/null
+++ llvm/test/CodeGen/IA64/setcc.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Integer comparisons. The result is computed into a predicate register and
+; materialized back into a GR for the i32 zext return.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i32 @cmp_eq(i64 %a, i64 %b) {
+; CHECK-LABEL: cmp_eq#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.eq p6, p0 = r32, r33
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_eq#
+  %c = icmp eq i64 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @cmp_ne(i64 %a, i64 %b) {
+; CHECK-LABEL: cmp_ne#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.ne p6, p0 = r32, r33
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_ne#
+  %c = icmp ne i64 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @cmp_slt(i64 %a, i64 %b) {
+; CHECK-LABEL: cmp_slt#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.lt p6, p0 = r32, r33
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_slt#
+  %c = icmp slt i64 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @cmp_ult(i64 %a, i64 %b) {
+; CHECK-LABEL: cmp_ult#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.ltu p6, p0 = r32, r33
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_ult#
+  %c = icmp ult i64 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
+
+define i32 @cmp_sgt(i64 %a, i64 %b) {
+; CHECK-LABEL: cmp_sgt#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    cmp.gt p6, p0 = r32, r33
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) adds r8 = 1, r8
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp cmp_sgt#
+  %c = icmp sgt i64 %a, %b
+  %r = zext i1 %c to i32
+  ret i32 %r
+}
diff --git llvm/test/CodeGen/IA64/setjmp-doublereturn.ll llvm/test/CodeGen/IA64/setjmp-doublereturn.ll
new file mode 100644
index 000000000000..5a0a9f9609e0
--- /dev/null
+++ llvm/test/CodeGen/IA64/setjmp-doublereturn.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Regression 48f447fc ("Save rp/gp/sp outside RSE for double return functions").
+;
+; A call to a returns_twice function (setjmp) can be re-entered via longjmp, at
+; which point the Register Stack Engine state is gone. gp (r1), sp (r12) and the
+; return pointer must therefore NOT be parked in stacked registers across such a
+; call; they are held in the static callee-saved registers r4/r6/r7 (themselves
+; spilled to the frame in the prologue), so they survive the second return.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare i32 @setjmp(ptr) returns_twice
+declare void @use(i32)
+
+define i32 @doublereturn(ptr %env) {
+; CHECK-LABEL: doublereturn#:
+; Prologue spills the callee-saved registers r4/r6/r7 to the frame.
+; CHECK-DAG: st8 {{\[}}{{r[0-9]+}}] = r4
+; CHECK-DAG: st8 {{\[}}{{r[0-9]+}}] = r6
+; CHECK-DAG: st8 {{\[}}{{r[0-9]+}}] = r7
+;
+; Park gp/sp/rp in callee-saved r4/r6/r7 (not stacked regs) around the call.
+; CHECK: mov r4 = r1
+; CHECK: mov r6 = r12
+; CHECK: mov r7 = rp
+; CHECK: br.call.sptk rp = setjmp#
+; CHECK: mov r1 = r4
+; CHECK: mov r12 = r6
+; CHECK: mov rp = r7
+entry:
+  %r = call i32 @setjmp(ptr %env)
+  call void @use(i32 %r)
+  ret i32 %r
+}
diff --git llvm/test/CodeGen/IA64/shift-parts.ll llvm/test/CodeGen/IA64/shift-parts.ll
new file mode 100644
index 000000000000..88b7fe9af412
--- /dev/null
+++ llvm/test/CodeGen/IA64/shift-parts.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; IA-64 has no 128-bit shift instruction. 128-bit shifts legalize to
+; {SHL,SRA,SRL}_PARTS over an i64 register pair, which we mark Expand so the
+; integer legalizer emits the libgcc libcall (__ashlti3/__ashrti3/__lshrti3),
+; matching how 128-bit divide/modulo are handled.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+define i128 @shl_i128(i128 %a, i128 %b) {
+; CHECK-LABEL: shl_i128#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,5,3,0
+; CHECK-NEXT:    .save rp, r36
+; CHECK-NEXT:    mov r36 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    zxt4 out2 = r34
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __ashlti3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    mov rp = r36
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp shl_i128#
+  %r = shl i128 %a, %b
+  ret i128 %r
+}
+
+define i128 @ashr_i128(i128 %a, i128 %b) {
+; CHECK-LABEL: ashr_i128#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,5,3,0
+; CHECK-NEXT:    .save rp, r36
+; CHECK-NEXT:    mov r36 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    sxt4 out2 = r34
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __ashrti3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    mov rp = r36
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ashr_i128#
+  %r = ashr i128 %a, %b
+  ret i128 %r
+}
+
+define i128 @lshr_i128(i128 %a, i128 %b) {
+; CHECK-LABEL: lshr_i128#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r35
+; CHECK-NEXT:    alloc r35 = ar.pfs,0,5,3,0
+; CHECK-NEXT:    .save rp, r36
+; CHECK-NEXT:    mov r36 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    zxt4 out2 = r34
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __lshrti3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r35
+; CHECK-NEXT:    mov rp = r36
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp lshr_i128#
+  %r = lshr i128 %a, %b
+  ret i128 %r
+}
diff --git llvm/test/CodeGen/IA64/stack-realign.ll llvm/test/CodeGen/IA64/stack-realign.ll
new file mode 100644
index 000000000000..1d0f7595c06d
--- /dev/null
+++ llvm/test/CodeGen/IA64/stack-realign.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; The backend reports StackRealignable=false: sp (r12) is only 16-byte aligned
+; and the prologue never emits an 'and sp, -N'. A local whose alignment exceeds
+; 16 therefore cannot be honoured at a static sp+offset slot. Such an alloca is
+; demoted to a dynamically-sized object lowered via DYNAMIC_STACKALLOC, whose
+; Expand emits 'sp -= size; sp &= -align' -- so the pointer is genuinely
+; aligned. (Before this, the over-aligned alloca was folded into the static
+; frame and computeKnownBits rewrote field GEPs 'add base,k' into a colliding
+; 'or base,k', corrupting fields once sp turned out to be merely 16-aligned.)
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @use(ptr)
+
+; An alloca aligned to 64 must be dynamically realigned: sp is masked with -64.
+; CHECK-LABEL: overaligned#:
+; CHECK: adds [[NEG:r[0-9]+]] = -64, r0
+; CHECK: and {{r[0-9]+}} = {{r[0-9]+}}, [[NEG]]
+define void @overaligned() {
+  %p = alloca [8 x i64], align 64
+  call void @use(ptr %p)
+  ret void
+}
+
+; A 16-aligned alloca already satisfies the stack alignment, so it stays a
+; static frame slot: sp is adjusted by a plain add, never masked.
+; CHECK-LABEL: aligned16#:
+; CHECK-NOT: and {{r[0-9]+}} = {{r[0-9]+}}, {{r[0-9]+}}
+define void @aligned16() {
+  %p = alloca [8 x i64], align 16
+  call void @use(ptr %p)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/struct-byval.ll llvm/test/CodeGen/IA64/struct-byval.ll
new file mode 100644
index 000000000000..04bf7de677cd
--- /dev/null
+++ llvm/test/CodeGen/IA64/struct-byval.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Small first-class aggregates (no 'byval' attribute) passed and returned by
+; value: the IA-64 ABI flattens them into the argument / return registers.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+%struct.two = type { i64, i64 }
+
+define i64 @pass_struct(%struct.two %s) {
+; CHECK-LABEL: pass_struct#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    add r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp pass_struct#
+  %a = extractvalue %struct.two %s, 0
+  %b = extractvalue %struct.two %s, 1
+  %r = add i64 %a, %b
+  ret i64 %r
+}
+
+define %struct.two @ret_struct(i64 %a, i64 %b) {
+; CHECK-LABEL: ret_struct#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r9 = r33
+; CHECK-NEXT:    mov r8 = r32
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp ret_struct#
+  %s0 = insertvalue %struct.two undef, i64 %a, 0
+  %s1 = insertvalue %struct.two %s0, i64 %b, 1
+  ret %struct.two %s1
+}
+
+declare i64 @take(%struct.two)
+
+define i64 @call_struct(i64 %a, i64 %b) {
+; CHECK-LABEL: call_struct#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = take#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp call_struct#
+  %s0 = insertvalue %struct.two undef, i64 %a, 0
+  %s1 = insertvalue %struct.two %s0, i64 %b, 1
+  %r = call i64 @take(%struct.two %s1)
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/tls.ll llvm/test/CodeGen/IA64/tls.ll
new file mode 100644
index 000000000000..3388643f6263
--- /dev/null
+++ llvm/test/CodeGen/IA64/tls.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s --check-prefix=STATIC
+; RUN: llc -mtriple=ia64 -relocation-model=pic < %s | FileCheck %s --check-prefix=PIC
+
+; Thread-local storage. Under the static relocation model a local TLS symbol
+; uses Local-Exec (movl @tprel) and an external one uses Initial-Exec
+; (@ltoff(@tprel) via the GOT); under PIC both use General-Dynamic
+; (@dtpmod/@dtprel + a call to __tls_get_addr). r13 is the thread pointer.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+@ext_tls = external thread_local global i64
+@loc_tls = internal thread_local global i64 0
+
+define ptr @addr_ext() {
+; STATIC-LABEL: addr_ext#
+; STATIC:       // %bb.0:
+; STATIC-NEXT:    .prologue
+; STATIC-NEXT:    .save ar.pfs, r32
+; STATIC-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; STATIC-NEXT:    .body
+; STATIC-NEXT:    addl r3 = @ltoff(@tprel(ext_tls#)), r1
+; STATIC-NEXT:    ;;
+; STATIC-NEXT:    ld8 r3 = [r3]
+; STATIC-NEXT:    ;;
+; STATIC-NEXT:    add r8 = r13, r3
+; STATIC-NEXT:    mov ar.pfs = r32
+; STATIC-NEXT:    ;;
+; STATIC-NEXT:    br.ret.sptk.many rp
+; STATIC-NEXT:    .endp addr_ext#
+;
+; PIC-LABEL: addr_ext#
+; PIC:       // %bb.0:
+; PIC-NEXT:    .prologue
+; PIC-NEXT:    .save ar.pfs, r33
+; PIC-NEXT:    alloc r33 = ar.pfs,0,3,2,0
+; PIC-NEXT:    .save rp, r34
+; PIC-NEXT:    mov r34 = rp
+; PIC-NEXT:    .fframe 32
+; PIC-NEXT:    add r12 = -32, r12
+; PIC-NEXT:    .body
+; PIC-NEXT:    addl r3 = @ltoff(@dtpmod(ext_tls#)), r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    ld8 out0 = [r3]
+; PIC-NEXT:    addl r3 = @ltoff(@dtprel(ext_tls#)), r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    ld8 out1 = [r3]
+; PIC-NEXT:    mov r32 = r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    br.call.sptk rp = __tls_get_addr#
+; PIC-NEXT:    ;;
+; PIC-NEXT:    mov r1 = r32
+; PIC-NEXT:    mov ar.pfs = r33
+; PIC-NEXT:    mov rp = r34
+; PIC-NEXT:    .restore sp
+; PIC-NEXT:    add r12 = 32, r12
+; PIC-NEXT:    ;;
+; PIC-NEXT:    br.ret.sptk.many rp
+; PIC-NEXT:    .endp addr_ext#
+  ret ptr @ext_tls
+}
+
+define ptr @addr_loc() {
+; STATIC-LABEL: addr_loc#
+; STATIC:       // %bb.0:
+; STATIC-NEXT:    .prologue
+; STATIC-NEXT:    .save ar.pfs, r32
+; STATIC-NEXT:    alloc r32 = ar.pfs,0,1,0,0
+; STATIC-NEXT:    .body
+; STATIC-NEXT:    movl r3 = @tprel(loc_tls#)
+; STATIC-NEXT:    ;;
+; STATIC-NEXT:    add r8 = r13, r3
+; STATIC-NEXT:    mov ar.pfs = r32
+; STATIC-NEXT:    ;;
+; STATIC-NEXT:    br.ret.sptk.many rp
+; STATIC-NEXT:    .endp addr_loc#
+;
+; PIC-LABEL: addr_loc#
+; PIC:       // %bb.0:
+; PIC-NEXT:    .prologue
+; PIC-NEXT:    .save ar.pfs, r33
+; PIC-NEXT:    alloc r33 = ar.pfs,0,3,2,0
+; PIC-NEXT:    .save rp, r34
+; PIC-NEXT:    mov r34 = rp
+; PIC-NEXT:    .fframe 32
+; PIC-NEXT:    add r12 = -32, r12
+; PIC-NEXT:    .body
+; PIC-NEXT:    addl r3 = @ltoff(@dtpmod(loc_tls#)), r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    ld8 out0 = [r3]
+; PIC-NEXT:    addl r3 = @ltoff(@dtprel(loc_tls#)), r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    ld8 out1 = [r3]
+; PIC-NEXT:    mov r32 = r1
+; PIC-NEXT:    ;;
+; PIC-NEXT:    br.call.sptk rp = __tls_get_addr#
+; PIC-NEXT:    ;;
+; PIC-NEXT:    mov r1 = r32
+; PIC-NEXT:    mov ar.pfs = r33
+; PIC-NEXT:    mov rp = r34
+; PIC-NEXT:    .restore sp
+; PIC-NEXT:    add r12 = 32, r12
+; PIC-NEXT:    ;;
+; PIC-NEXT:    br.ret.sptk.many rp
+; PIC-NEXT:    .endp addr_loc#
+  ret ptr @loc_tls
+}
diff --git llvm/test/CodeGen/IA64/unwind-copy-state.ll llvm/test/CodeGen/IA64/unwind-copy-state.ll
new file mode 100644
index 000000000000..0549250fa311
--- /dev/null
+++ llvm/test/CodeGen/IA64/unwind-copy-state.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=ia64 -enable-tail-merge=false < %s | FileCheck %s
+
+; A framed function with more than one epilogue needs '.label_state' after
+; '.body' and '.copy_state' before each '.restore sp', so GNU as does not reject
+; the second '.restore' (its unwind region was closed by the first). Tail-merge
+; is disabled here to keep the two epilogues from being folded into one.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @ext1(i32)
+
+define void @multi_epilogue(i32 %x) {
+; CHECK-LABEL: multi_epilogue#
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,1,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    .label_state 1
+; CHECK-NEXT:    zxt4 r3 = r32
+; CHECK-NEXT:    adds r8 = 0, r0
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    cmp.ne p6, p0 = r3, r8
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    (p6) brl.cond.sptk .LBB0_2
+; CHECK-NEXT:    (p0) brl.cond.sptk .LBB0_1
+; CHECK-NEXT:  .LBB0_2: // %b
+; CHECK-NEXT:    adds out0 = 22, r0
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = ext1#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .copy_state 1
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:  .LBB0_1: // %a
+; CHECK-NEXT:    adds out0 = 11, r0
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = ext1#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .copy_state 1
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp multi_epilogue#
+entry:
+  %c = icmp eq i32 %x, 0
+  br i1 %c, label %a, label %b
+a:
+  call void @ext1(i32 11)
+  ret void
+b:
+  call void @ext1(i32 22)
+  ret void
+}
diff --git llvm/test/CodeGen/IA64/unwind.ll llvm/test/CodeGen/IA64/unwind.ll
new file mode 100644
index 000000000000..cc58ad00f7cd
--- /dev/null
+++ llvm/test/CodeGen/IA64/unwind.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; IA-64 native stack-unwind directives (.proc / .prologue / .save ar.pfs /
+; .save rp / .fframe / .body / .restore sp / .endp). GNU as assembles these into
+; the .IA_64.unwind / .IA_64.unwind_info sections that gdb/libunwind read to walk
+; the stack, so they are what makes a backtrace step past a frame. The key record
+; is '.save rp, <reg>': a non-leaf function parks its return pointer in a fixed
+; stacked local that the unwinder can name from anywhere in the body.
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @ext()
+
+; A leaf function never clobbers b0: no '.save rp', no frame.
+define i64 @leaf(i64 %a, i64 %b) {
+; CHECK-LABEL: leaf#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    add r8 = r32, r33
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp leaf#
+  %r = add i64 %a, %b
+  ret i64 %r
+}
+
+; A non-leaf function parks rp in a stacked local: '.save rp, <reg>' with the
+; matching 'mov <reg> = rp' in the prologue and the restore before the return.
+define void @nonleaf() {
+; CHECK-LABEL: nonleaf#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r33
+; CHECK-NEXT:    alloc r33 = ar.pfs,0,3,0,0
+; CHECK-NEXT:    .save rp, r34
+; CHECK-NEXT:    mov r34 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = ext#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r33
+; CHECK-NEXT:    mov rp = r34
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp nonleaf#
+  call void @ext()
+  ret void
+}
+
+; A function whose only call is a libcall (sdiv -> __divdi3) is still non-leaf:
+; hasCalls() sees it even though there is no call at the IR level, so rp is
+; parked and '.save rp' is emitted.
+define i64 @libcall(i64 %a, i64 %b) {
+; CHECK-LABEL: libcall#
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .prologue
+; CHECK-NEXT:    .save ar.pfs, r34
+; CHECK-NEXT:    alloc r34 = ar.pfs,0,4,2,0
+; CHECK-NEXT:    .save rp, r35
+; CHECK-NEXT:    mov r35 = rp
+; CHECK-NEXT:    .fframe 32
+; CHECK-NEXT:    add r12 = -32, r12
+; CHECK-NEXT:    .body
+; CHECK-NEXT:    mov out1 = r33
+; CHECK-NEXT:    mov out0 = r32
+; CHECK-NEXT:    mov r32 = r1
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.call.sptk rp = __divdi3#
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    mov r1 = r32
+; CHECK-NEXT:    mov ar.pfs = r34
+; CHECK-NEXT:    mov rp = r35
+; CHECK-NEXT:    .restore sp
+; CHECK-NEXT:    add r12 = 32, r12
+; CHECK-NEXT:    ;;
+; CHECK-NEXT:    br.ret.sptk.many rp
+; CHECK-NEXT:    .endp libcall#
+  %r = sdiv i64 %a, %b
+  ret i64 %r
+}
diff --git llvm/test/CodeGen/IA64/varargs.ll llvm/test/CodeGen/IA64/varargs.ll
new file mode 100644
index 000000000000..faa83e6e6e3b
--- /dev/null
+++ llvm/test/CodeGen/IA64/varargs.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=ia64 < %s | FileCheck %s
+
+; Callee-side varargs lowering on IA-64.
+;
+; A variadic callee spills its incoming argument registers (out/in window
+; r32..r39) into the frame so va_list can walk a contiguous image. Per the
+; IA-64 psABI (§8.5.4) parameter slot i lives at offset 8*i-48 from the
+; *incoming* sp; the named slot 0 (here %fmt in r32) is not homed, the
+; variadic slots r33..r39 are.
+;
+; Regression test for 94b1587d ("Pass stacked arguments at sp+16 ... not
+; sp+80"): the register-home spill offsets used to carry an extra +64, which
+; mismatched gcc-built variadic callees. With a 64-byte frame here the
+; incoming sp is r12+64, so slot i homes at [r12 + (8*i-48) + 64] = [r12+8*i+16]:
+; the first variadic slot (r33, i=1) at [r12+24], the last (r39, i=7) at
+; [r12+72]. The buggy +64 scheme would have put them at [r12+88]..[r12+136].
+
+target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "ia64"
+
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_end(ptr)
+
+define i32 @va_callee(i32 %fmt, ...) {
+; CHECK-LABEL: va_callee#:
+entry:
+; First variadic slot (r33) homes at [r12+24], not [r12+88].
+; CHECK-DAG: add [[H1:r[0-9]+]] = 24, r12
+; CHECK-DAG: st8 {{\[}}[[H1]]] = r33
+; Last register slot (r39) homes at [r12+72], not [r12+136]; slots are 8 apart.
+; CHECK-DAG: add [[H7:r[0-9]+]] = 72, r12
+; CHECK-DAG: st8 {{\[}}[[H7]]] = r39
+  %ap = alloca ptr, align 8
+  call void @llvm.va_start(ptr %ap)
+  %a = va_arg ptr %ap, i32
+  %b = va_arg ptr %ap, i32
+  call void @llvm.va_end(ptr %ap)
+  %s = add i32 %a, %b
+  ret i32 %s
+}
diff --git llvm/utils/UpdateTestChecks/asm.py llvm/utils/UpdateTestChecks/asm.py
index 82377862885c..0d51c5fab83d 100644
--- llvm/utils/UpdateTestChecks/asm.py
+++ llvm/utils/UpdateTestChecks/asm.py
@@ -147,6 +147,16 @@ ASM_FUNCTION_SPARC_RE = re.compile(
     flags=(re.M | re.S),
 )
 
+ASM_FUNCTION_IA64_RE = re.compile(
+    # IA-64 symbols carry a '#' suffix, so the label is "name#:". Capture the
+    # '#' in func_name_separator so the function name still matches the IR (and
+    # the "// @name" comment) while the emitted CHECK-LABEL keeps the suffix.
+    r'^_?(?P<func>[^#:\n]+)(?P<func_name_separator>#):[ \t]*//+[ \t]*@"?(?P=func)"?\n'
+    r"(?P<body>.*?)\s*"
+    r".Lfunc_end[0-9]+:\n",
+    flags=(re.M | re.S),
+)
+
 ASM_FUNCTION_SYSTEMZ_RE = re.compile(
     r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n'
     r"(?:[ \t]+.cfi_startproc\n)?"
@@ -457,6 +467,17 @@ def scrub_asm_sparc(asm, args):
     return asm
 
 
+def scrub_asm_ia64(asm, args):
+    # Scrub runs of whitespace out of the assembly, but leave the leading
+    # whitespace in place.
+    asm = common.SCRUB_WHITESPACE_RE.sub(r" ", asm)
+    # Expand the tabs used for indentation.
+    asm = string.expandtabs(asm, 2)
+    # Strip trailing whitespace.
+    asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r"", asm)
+    return asm
+
+
 def scrub_asm_spirv(asm, args):
     # Scrub runs of whitespace out of the assembly, but leave the leading
     # whitespace in place.
@@ -594,6 +615,7 @@ def get_run_handler(triple):
         "riscv64": (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE),
         "lanai": (scrub_asm_lanai, ASM_FUNCTION_LANAI_RE),
         "sparc": (scrub_asm_sparc, ASM_FUNCTION_SPARC_RE),
+        "ia64": (scrub_asm_ia64, ASM_FUNCTION_IA64_RE),
         "spirv": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),
         "spirv32": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),
         "spirv64": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),