//===-- X86FixupInstTunings.cpp - replace instructions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file does a tuning pass replacing slower machine instructions
// with faster ones. We do this here, as opposed to during normal ISel, as
// attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed to another, it is only meant to do transformations where the old
// instruction is always replaceable with the new instructions. For example:
//
//      `vpermq ymm` -> `vshufd ymm`
//          -- BAD, not always valid (lane cross/non-repeated mask)
//
//      `vpermilps ymm` -> `vshufd ymm`
//          -- GOOD, always replaceable
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

#define DEBUG_TYPE "x86-fixup-inst-tuning"

STATISTIC(NumInstChanges, "Number of instructions changes");

namespace {
/// Machine-function pass that rewrites individual X86 instructions into
/// alternative opcodes when `processInstruction` decides the alternative is
/// preferable.
class X86FixupInstTuningPass : public MachineFunctionPass {
  // Per-function context. All three pointers are (re)assigned at the start of
  // runOnMachineFunction and read by processInstruction.
  const X86InstrInfo *TII = nullptr;
  const X86Subtarget *ST = nullptr;
  const MCSchedModel *SM = nullptr;

public:
  static char ID;

  X86FixupInstTuningPass() : MachineFunctionPass(ID) {}

  /// Human-readable name of this pass.
  StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }

  /// The pass operates after register allocation and cannot deal with virtual
  /// register operands, so it requires the NoVRegs property.
  MachineFunctionProperties getRequiredProperties() const override {
    MachineFunctionProperties Props;
    Props.setNoVRegs();
    return Props;
  }

  /// Walks every instruction of \p MF and returns true if any was rewritten.
  bool runOnMachineFunction(MachineFunction &MF) override;

  /// Tries to rewrite the instruction at \p I in place; returns true if the
  /// instruction was changed.
  bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &I);
};
} // end anonymous namespace

// Out-of-class definition of the static pass identifier declared in the class;
// the pass constructor hands it to MachineFunctionPass.
char X86FixupInstTuningPass::ID = 0;

// Registers the pass, using DEBUG_TYPE for both string arguments.
// NOTE(review): the two trailing `false` flags are presumably the usual
// "CFG only" / "is analysis" parameters of INITIALIZE_PASS -- confirm against
// the macro definition.
INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)

/// Factory: heap-allocates a fresh X86FixupInstTuningPass and returns it as a
/// generic FunctionPass pointer.
FunctionPass *llvm::createX86FixupInstTuning() {
  FunctionPass *NewPass = new X86FixupInstTuningPass();
  return NewPass;
}

/// Compares two optional metrics.
///
/// \returns `*NewVal < *CurVal` when both values are present and differ, and
/// std::nullopt when either value is missing or the two compare equal (i.e.
/// when this metric cannot decide between the candidates).
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // A missing value on either side means there is nothing to compare.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Only a strict difference yields a verdict; equality is a tie.
  if (*NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}

/// Tries to replace the instruction at \p I with a preferable opcode, rewriting
/// it in place (descriptor and, where needed, operands).
///
/// \param MF  Enclosing function; only consulted for its opt-size attribute.
/// \param MBB Enclosing block (not used by the current implementation).
/// \param I   Iterator to the instruction to inspect.
/// \returns true if the instruction was modified, false otherwise.
bool X86FixupInstTuningPass::processInstruction(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator &I) {
  MachineInstr &MI = *I;
  unsigned Opc = MI.getOpcode();
  // Number of operands declared by the instruction descriptor; the immediate
  // handled by the helpers below is always the last of these.
  unsigned NumOperands = MI.getDesc().getNumOperands();
  bool OptSize = MF.getFunction().hasOptSize();

  // Reciprocal throughput of `Opcode` according to the scheduling model.
  auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::getReciprocalThroughput(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  // Latency of `Opcode` according to the scheduling model.
  auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
    // We already checked that SchedModel exists in `NewOpcPreferable`.
    return MCSchedModel::computeInstrLatency(
        *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
  };

  // Encoded size of `Opcode` as reported by its descriptor, if known.
  auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
    if (unsigned Size = TII->get(Opcode).getSize())
      return Size;
    // Zero size means we were unable to compute it.
    return std::nullopt;
  };

  // Decides whether `NewOpc` should replace the current opcode. Metrics are
  // consulted in priority order and the first one that yields a strict
  // difference decides; `ReplaceInTie` is returned if none does.
  auto NewOpcPreferable = [&](unsigned NewOpc,
                              bool ReplaceInTie = true) -> bool {
    std::optional<bool> Res;
    if (SM->hasInstrSchedModel()) {
      // Compare tput -> lat -> code size.
      Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
      if (Res.has_value())
        return *Res;

      Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
      if (Res.has_value())
        return *Res;
    }

    // Smaller is better for code size too, so the new opcode must be passed
    // first, exactly like the throughput and latency comparisons above.
    Res = CmpOptionals(GetInstSize(NewOpc), GetInstSize(Opc));
    if (Res.has_value())
      return *Res;

    // We were either unable to get tput/lat/codesize or all values were
    // equal. Return specified option for a tie.
    return ReplaceInTie;
  };

  // `vpermilpd r, i` -> `vshufpd r, r, i`
  // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
  // `vshufpd` is always as fast or faster than `vpermilpd` and takes
  // 1 less byte of code size for VEX and EVEX encoding.
  auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
    if (!NewOpcPreferable(NewOpc))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      // Drop the immediate, duplicate the source operand (now the last
      // declared operand) and re-append the immediate after it.
      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
      MI.removeOperand(NumOperands - 1);
      MI.addOperand(MI.getOperand(NumOperands - 2));
      MI.setDesc(TII->get(NewOpc));
      MI.addOperand(MachineOperand::CreateImm(MaskImm));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // `vpermilps r, i` -> `vshufps r, r, i`
  // `vpermilps r, i, k` -> `vshufps r, r, i, k`
  // `vshufps` is always as fast or faster than `vpermilps` and takes
  // 1 less byte of code size for VEX and EVEX encoding.
  auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
    if (!NewOpcPreferable(NewOpc))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      // Same operand rewrite as for `vpermilpd` above.
      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
      MI.removeOperand(NumOperands - 1);
      MI.addOperand(MI.getOperand(NumOperands - 2));
      MI.setDesc(TII->get(NewOpc));
      MI.addOperand(MachineOperand::CreateImm(MaskImm));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
  // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
  // byte of code size.
  auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
    // TODO: Might be worth adding bypass delay if -Os/-Oz is enabled as
    // `vpshufd` saves a byte of code size.
    if (!ST->hasNoDomainDelayShuffle() ||
        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      // Operand list is unchanged; only the opcode is swapped.
      MI.setDesc(TII->get(NewOpc));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // `vunpcklpd/vmovlhps r, r` -> `vpunpcklqdq r, r`/`vshufpd r, r, 0x00`
  // `vunpckhpd r, r` -> `vpunpckhqdq r, r`/`vshufpd r, r, 0xff`
  // `vunpcklpd r, r, k` -> `vpunpcklqdq r, r, k`/`vshufpd r, r, 0x00, k`
  // `vunpckhpd r, r, k` -> `vpunpckhqdq r, r, k`/`vshufpd r, r, 0xff, k`
  // `vunpcklpd r, m` -> `vpunpcklqdq r, m`
  // `vunpckhpd r, m` -> `vpunpckhqdq r, m`
  // `vunpcklpd r, m, k` -> `vpunpcklqdq r, m, k`
  // `vunpckhpd r, m, k` -> `vpunpckhqdq r, m, k`
  // 1) If no bypass delay and `vpunpck{l|h}qdq` faster than `vunpck{l|h}pd`
  //        -> `vpunpck{l|h}qdq`
  // 2) If `vshufpd` faster than `vunpck{l|h}pd`
  //        -> `vshufpd`
  //
  // `vunpck{l|h}ps` -> `vpunpck{l|h}dq` (for all operand types if no bypass
  // delay)
  //
  // Rewrites to `NewOpc` and appends `MaskImm` as a new trailing immediate.
  auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
    if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      MI.setDesc(TII->get(NewOpc));
      MI.addOperand(MachineOperand::CreateImm(MaskImm));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // Rewrites a floating-point unpack to its integer-domain counterpart; the
  // operand list is unchanged.
  auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool {
    // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
    // downside to the integer unpck, but if someone doesn't specify exact
    // target we won't find it faster.
    if (!ST->hasNoDomainDelayShuffle() ||
        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      MI.setDesc(TII->get(NewOpc));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // Register-register low unpack: try the integer-domain form first, then
  // fall back to a shuffle with immediate 0x00.
  auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
                               unsigned NewOpc) -> bool {
    if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
      return true;
    return ProcessUNPCK(NewOpc, 0x00);
  };
  // Register-register high unpack: try the integer-domain form first, then
  // fall back to a shuffle with immediate 0xff.
  auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
                               unsigned NewOpc) -> bool {
    if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
      return true;
    return ProcessUNPCK(NewOpc, 0xff);
  };

  // Memory-operand PD unpacks only have the integer-domain alternative.
  auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
    return ProcessUNPCKToIntDomain(NewOpcIntDomain);
  };

  // PS unpacks (register and memory forms) only have the integer-domain
  // alternative.
  auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
    return ProcessUNPCKToIntDomain(NewOpc);
  };

  // Rewrites a word blend as a dword blend when AVX2 is available and the
  // word mask can be expressed exactly at dword granularity.
  auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
    if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
      return false;
    // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
    APInt MaskW =
        APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
    APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
    if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
      return false;
    APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      // Swap the opcode and replace the trailing immediate with the new mask.
      MI.setDesc(TII->get(MovOpc));
      MI.removeOperand(NumOperands - 1);
      MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  // Rewrites a blend whose immediate (restricted to `Mask`) equals `MovImm`
  // as the immediate-free move `MovOpc`. When optimizing for size the rewrite
  // is done without consulting the scheduling model.
  auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
                               unsigned MovImm) -> bool {
    if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
      return false;
    if (!OptSize && !NewOpcPreferable(MovOpc))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
    {
      MI.setDesc(TII->get(MovOpc));
      MI.removeOperand(NumOperands - 1);
    }
    LLVM_DEBUG(dbgs() << "     With: " << MI);
    return true;
  };

  switch (Opc) {
  case X86::BLENDPDrri:
    return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
  case X86::VBLENDPDrri:
    return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);

  case X86::BLENDPSrri:
    return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
           ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
  case X86::VBLENDPSrri:
    return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
           ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);

  case X86::VPBLENDWrri:
    // TODO: Add X86::VPBLENDWrmi handling
    // TODO: Add X86::VPBLENDWYrri handling
    // TODO: Add X86::VPBLENDWYrmi handling
    return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);

  case X86::VPERMILPDri:
    return ProcessVPERMILPDri(X86::VSHUFPDrri);
  case X86::VPERMILPDYri:
    return ProcessVPERMILPDri(X86::VSHUFPDYrri);
  case X86::VPERMILPDZ128ri:
    return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
  case X86::VPERMILPDZ256ri:
    return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
  case X86::VPERMILPDZri:
    return ProcessVPERMILPDri(X86::VSHUFPDZrri);
  case X86::VPERMILPDZ128rikz:
    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
  case X86::VPERMILPDZ256rikz:
    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
  case X86::VPERMILPDZrikz:
    return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
  case X86::VPERMILPDZ128rik:
    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
  case X86::VPERMILPDZ256rik:
    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
  case X86::VPERMILPDZrik:
    return ProcessVPERMILPDri(X86::VSHUFPDZrrik);

  case X86::VPERMILPSri:
    return ProcessVPERMILPSri(X86::VSHUFPSrri);
  case X86::VPERMILPSYri:
    return ProcessVPERMILPSri(X86::VSHUFPSYrri);
  case X86::VPERMILPSZ128ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
  case X86::VPERMILPSZ256ri:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
  case X86::VPERMILPSZri:
    return ProcessVPERMILPSri(X86::VSHUFPSZrri);
  case X86::VPERMILPSZ128rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
  case X86::VPERMILPSZ256rikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
  case X86::VPERMILPSZrikz:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
  case X86::VPERMILPSZ128rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
  case X86::VPERMILPSZ256rik:
    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
  case X86::VPERMILPSZrik:
    return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
  case X86::VPERMILPSmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDmi);
  case X86::VPERMILPSYmi:
    // TODO: See if there is a more generic way we can test if the replacement
    // instruction is supported.
    return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
  case X86::VPERMILPSZ128mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
  case X86::VPERMILPSZ256mi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
  case X86::VPERMILPSZmi:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
  case X86::VPERMILPSZ128mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
  case X86::VPERMILPSZ256mikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
  case X86::VPERMILPSZmikz:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
  case X86::VPERMILPSZ128mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
  case X86::VPERMILPSZ256mik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
  case X86::VPERMILPSZmik:
    return ProcessVPERMILPSmi(X86::VPSHUFDZmik);

  case X86::MOVLHPSrr:
  case X86::UNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
  case X86::VMOVLHPSrr:
  case X86::VUNPCKLPDrr:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
  case X86::VUNPCKLPDYrr:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
    // VMOVLHPS is always 128 bits.
  case X86::VMOVLHPSZrr:
  case X86::VUNPCKLPDZ128rr:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
  case X86::VUNPCKLPDZ256rr:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
  case X86::VUNPCKLPDZrr:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
  case X86::VUNPCKLPDZ128rrk:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
  case X86::VUNPCKLPDZ256rrk:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
  case X86::VUNPCKLPDZrrk:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
  case X86::VUNPCKLPDZ128rrkz:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKLPDZ256rrkz:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKLPDZrrkz:
    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
  case X86::UNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
  case X86::VUNPCKHPDrr:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
  case X86::VUNPCKHPDYrr:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
  case X86::VUNPCKHPDZ128rr:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
  case X86::VUNPCKHPDZ256rr:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
  case X86::VUNPCKHPDZrr:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
  case X86::VUNPCKHPDZ128rrk:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
  case X86::VUNPCKHPDZ256rrk:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
  case X86::VUNPCKHPDZrrk:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
  case X86::VUNPCKHPDZ128rrkz:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
  case X86::VUNPCKHPDZ256rrkz:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
  case X86::VUNPCKHPDZrrkz:
    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
  case X86::UNPCKLPDrm:
    return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
  case X86::VUNPCKLPDrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
  case X86::VUNPCKLPDYrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
  case X86::VUNPCKLPDZ128rm:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
  case X86::VUNPCKLPDZ256rm:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
  case X86::VUNPCKLPDZrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
  case X86::VUNPCKLPDZ128rmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
  case X86::VUNPCKLPDZ256rmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
  case X86::VUNPCKLPDZrmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
  case X86::VUNPCKLPDZ128rmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
  case X86::VUNPCKLPDZ256rmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
  case X86::VUNPCKLPDZrmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
  case X86::UNPCKHPDrm:
    return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
  case X86::VUNPCKHPDrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
  case X86::VUNPCKHPDYrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
  case X86::VUNPCKHPDZ128rm:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
  case X86::VUNPCKHPDZ256rm:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
  case X86::VUNPCKHPDZrm:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
  case X86::VUNPCKHPDZ128rmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
  case X86::VUNPCKHPDZ256rmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
  case X86::VUNPCKHPDZrmk:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
  case X86::VUNPCKHPDZ128rmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
  case X86::VUNPCKHPDZ256rmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
  case X86::VUNPCKHPDZrmkz:
    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);

  case X86::UNPCKLPSrr:
    return ProcessUNPCKPS(X86::PUNPCKLDQrr);
  case X86::VUNPCKLPSrr:
    return ProcessUNPCKPS(X86::VPUNPCKLDQrr);
  case X86::VUNPCKLPSYrr:
    return ProcessUNPCKPS(X86::VPUNPCKLDQYrr);
  case X86::VUNPCKLPSZ128rr:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr);
  case X86::VUNPCKLPSZ256rr:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr);
  case X86::VUNPCKLPSZrr:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrr);
  case X86::VUNPCKLPSZ128rrk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk);
  case X86::VUNPCKLPSZ256rrk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk);
  case X86::VUNPCKLPSZrrk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk);
  case X86::VUNPCKLPSZ128rrkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz);
  case X86::VUNPCKLPSZ256rrkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz);
  case X86::VUNPCKLPSZrrkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz);
  case X86::UNPCKHPSrr:
    return ProcessUNPCKPS(X86::PUNPCKHDQrr);
  case X86::VUNPCKHPSrr:
    return ProcessUNPCKPS(X86::VPUNPCKHDQrr);
  case X86::VUNPCKHPSYrr:
    return ProcessUNPCKPS(X86::VPUNPCKHDQYrr);
  case X86::VUNPCKHPSZ128rr:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr);
  case X86::VUNPCKHPSZ256rr:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr);
  case X86::VUNPCKHPSZrr:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrr);
  case X86::VUNPCKHPSZ128rrk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk);
  case X86::VUNPCKHPSZ256rrk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk);
  case X86::VUNPCKHPSZrrk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk);
  case X86::VUNPCKHPSZ128rrkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz);
  case X86::VUNPCKHPSZ256rrkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz);
  case X86::VUNPCKHPSZrrkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz);
  case X86::UNPCKLPSrm:
    return ProcessUNPCKPS(X86::PUNPCKLDQrm);
  case X86::VUNPCKLPSrm:
    return ProcessUNPCKPS(X86::VPUNPCKLDQrm);
  case X86::VUNPCKLPSYrm:
    return ProcessUNPCKPS(X86::VPUNPCKLDQYrm);
  case X86::VUNPCKLPSZ128rm:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm);
  case X86::VUNPCKLPSZ256rm:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm);
  case X86::VUNPCKLPSZrm:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrm);
  case X86::VUNPCKLPSZ128rmk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk);
  case X86::VUNPCKLPSZ256rmk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk);
  case X86::VUNPCKLPSZrmk:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk);
  case X86::VUNPCKLPSZ128rmkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz);
  case X86::VUNPCKLPSZ256rmkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz);
  case X86::VUNPCKLPSZrmkz:
    return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz);
  case X86::UNPCKHPSrm:
    return ProcessUNPCKPS(X86::PUNPCKHDQrm);
  case X86::VUNPCKHPSrm:
    return ProcessUNPCKPS(X86::VPUNPCKHDQrm);
  case X86::VUNPCKHPSYrm:
    return ProcessUNPCKPS(X86::VPUNPCKHDQYrm);
  case X86::VUNPCKHPSZ128rm:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm);
  case X86::VUNPCKHPSZ256rm:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm);
  case X86::VUNPCKHPSZrm:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrm);
  case X86::VUNPCKHPSZ128rmk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk);
  case X86::VUNPCKHPSZ256rmk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk);
  case X86::VUNPCKHPSZrmk:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk);
  case X86::VUNPCKHPSZ128rmkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz);
  case X86::VUNPCKHPSZ256rmkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
  case X86::VUNPCKHPSZrmkz:
    return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
  default:
    return false;
  }
}

/// Pass entry point: caches the subtarget, instruction info and scheduling
/// model for \p MF, then offers every instruction of every block to
/// processInstruction.
///
/// \returns true if at least one instruction was rewritten.
bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);

  // Refresh the per-function state that processInstruction reads.
  ST = &MF.getSubtarget<X86Subtarget>();
  TII = ST->getInstrInfo();
  SM = &ST->getSchedModel();

  bool MadeChange = false;
  for (MachineBasicBlock &Block : MF) {
    MachineBasicBlock::iterator It = Block.begin();
    while (It != Block.end()) {
      if (processInstruction(MF, Block, It)) {
        // Count each rewritten instruction in the pass statistic.
        ++NumInstChanges;
        MadeChange = true;
      }
      ++It;
    }
  }

  LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
  return MadeChange;
}
