//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16 bits of a dword.
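// For example, (extract_vector_elt v2i16:v, 1) and (trunc (srl i32:v, 16))
// are both recognized as reads of the high half of v.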
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure the fact that only the low 16 bits of
// a register are being read.
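// e.g. (trunc i32:v) and (extract_vector_elt v2i16:v, 0) both depend only on
// the low 16 bits of v.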
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

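// Returns true if the f16 result of \p Opc is known to leave the high 16 bits
// of its 32-bit destination register zeroed on the current subtarget.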
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

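// On targets where DS instructions require M0 to be initialized, LDS accesses
// get M0 = -1 (everything addressable) and region (GDS) accesses get the GDS
// size allocated for this function.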
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

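// Materializes a 64-bit immediate as two s_mov_b32 of its halves combined
// with a REG_SEQUENCE, e.g.:
//   %lo = S_MOV_B32 Lo_32(Imm)
//   %hi = S_MOV_B32 Hi_32(Imm)
//   %r  = REG_SEQUENCE SReg_64, %lo, sub0, %hi, sub1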
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has separate operands for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

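// A shift only reads the low ShAmtBits bits of its shift amount, so an AND
// mask on the amount is unneeded if it preserves at least those low bits,
// either directly or in combination with bits already known to be zero.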
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

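  // Indexed as OpcMap[consumes carry-in][node is divergent][is add].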
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

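  // The scalar pseudos produce their carry-out in SCC, which only the
  // matching scalar carry ops can consume directly; any other user of the
  // carry forces the VALU form.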
  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
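// smul_lohi / umul_lohi are selected as a 64-bit mad with a zero addend, and
// the two 32-bit halves of the result are then split back out with
// EXTRACT_SUBREG.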
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

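// DS instructions encode a 16-bit unsigned byte offset; anything larger has
// to be folded into the base address.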
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16 bits into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // The addr64 bit was removed for Volcanic Islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  const auto *RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode *findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

1759// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1760bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1761 SDValue Addr,
1762 SDValue &SAddr,
1763 SDValue &VOffset,
1764 SDValue &Offset) const {
1765 int64_t ImmOffset = 0;
1766
1767 // Match the immediate offset first, which canonically is moved as low as
1768 // possible.
1769
1770 SDValue LHS, RHS;
1771 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1772 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1773 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1774
1775 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1776 SIInstrFlags::FlatGlobal)) {
1777 Addr = LHS;
1778 ImmOffset = COffsetVal;
1779 } else if (!LHS->isDivergent()) {
1780 if (COffsetVal > 0) {
1781 SDLoc SL(N);
1782 // saddr + large_offset -> saddr +
1783 // (voffset = large_offset & ~MaxOffset) +
1784 // (large_offset & MaxOffset);
1785 int64_t SplitImmOffset, RemainderOffset;
1786 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1787 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1788
1789 if (isUInt<32>(RemainderOffset)) {
1790 SDNode *VMov = CurDAG->getMachineNode(
1791 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1792 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1793 VOffset = SDValue(VMov, 0);
1794 SAddr = LHS;
1795 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1796 return true;
1797 }
1798 }
1799
1800 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1801 // is 1, we would need to perform 1 or 2 extra moves for each half of
1802 // the constant, so it is better to do a scalar add and then issue a
1803 // single VALU instruction to materialize zero. Otherwise it takes fewer
1804 // instructions to perform VALU adds with immediates or inline literals.
1805 unsigned NumLiterals =
1806 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1807 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1808 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1809 return false;
1810 }
1811 }
1812
1813 // Match the variable offset.
1814 if (Addr.getOpcode() == ISD::ADD) {
1815 LHS = Addr.getOperand(0);
1816 RHS = Addr.getOperand(1);
1817
1818 if (!LHS->isDivergent()) {
1819 // add (i64 sgpr), (zero_extend (i32 vgpr))
1820 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1821 SAddr = LHS;
1822 VOffset = ZextRHS;
1823 }
1824 }
1825
1826 if (!SAddr && !RHS->isDivergent()) {
1827 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1828 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1829 SAddr = RHS;
1830 VOffset = ZextLHS;
1831 }
1832 }
1833
1834 if (SAddr) {
1835 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1836 return true;
1837 }
1838 }
1839
1840 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1841 isa<ConstantSDNode>(Addr))
1842 return false;
1843
1844 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1845 // moves required to copy a 64-bit SGPR to VGPR.
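// That is, prefer "v_mov_b32 v0, 0" plus the SGPR base over copying the
// 64-bit SGPR into a VGPR pair (two v_mov_b32 instructions).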
1846 SAddr = Addr;
1847 SDNode *VMov =
1848 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1849 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1850 VOffset = SDValue(VMov, 0);
1851 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1852 return true;
1853}
1854
1855 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1856 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1857 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1858 } else if (SAddr.getOpcode() == ISD::ADD &&
1859 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1860 // Materialize this into a scalar move for scalar address to avoid
1861 // readfirstlane.
1862 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1863 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1864 FI->getValueType(0));
1865 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1866 MVT::i32, TFI, SAddr.getOperand(1)),
1867 0);
1868 }
1869
1870 return SAddr;
1871}
1872
1873// Match (32-bit SGPR base) + sext(imm offset)
1874bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1875 SDValue &SAddr,
1876 SDValue &Offset) const {
1877 if (Addr->isDivergent())
1878 return false;
1879
1880 SDLoc DL(Addr);
1881
1882 int64_t COffsetVal = 0;
1883
1884 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1885 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1886 SAddr = Addr.getOperand(0);
1887 } else {
1888 SAddr = Addr;
1889 }
1890
1891 SAddr = SelectSAddrFI(CurDAG, SAddr);
1892
1893 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1894
1895 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1896 SIInstrFlags::FlatScratch)) {
1897 int64_t SplitImmOffset, RemainderOffset;
1898 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1899 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1900
1901 COffsetVal = SplitImmOffset;
1902
1903 SDValue AddOffset =
1904 SAddr.getOpcode() == ISD::TargetFrameIndex
1905 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1906 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
1907 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1908 SAddr, AddOffset),
1909 0);
1910 }
1911
1912 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
1913
1914 return true;
1915}
1916
1917// Check whether the flat scratch SVS swizzle bug affects this access.
1918bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1919 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1920 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1921 return false;
1922
1923 // The bug affects the swizzling of SVS accesses if there is any carry out
1924 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1925 // voffset to (soffset + inst_offset).
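// Illustrative example (values assumed): if voffset may end in 0b11
// (VMax & 3 == 3) and soffset + inst_offset may end in 0b01 (SMax & 3 == 1),
// then 3 + 1 == 4 can carry from bit 1 into bit 2, so the check below
// conservatively rejects the pair.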
1926 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1927 KnownBits SKnown =
1928 KnownBits::add(CurDAG->computeKnownBits(SAddr),
1929 KnownBits::makeConstant(APInt(32, ImmOffset)));
1930 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1931 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1932 return (VMax & 3) + (SMax & 3) >= 4;
1933}
1934
1935bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1936 SDValue &VAddr, SDValue &SAddr,
1937 SDValue &Offset) const {
1938 int64_t ImmOffset = 0;
1939
1940 SDValue LHS, RHS;
1941 SDValue OrigAddr = Addr;
1942 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1943 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1944 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1945
1946 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1947 Addr = LHS;
1948 ImmOffset = COffsetVal;
1949 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1950 SDLoc SL(N);
1951 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1952 // (large_offset & MaxOffset);
1953 int64_t SplitImmOffset, RemainderOffset;
1954 std::tie(SplitImmOffset, RemainderOffset)
1955 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1956
1957 if (isUInt<32>(RemainderOffset)) {
1958 SDNode *VMov = CurDAG->getMachineNode(
1959 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1960 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1961 VAddr = SDValue(VMov, 0);
1962 SAddr = LHS;
1963 if (!isFlatScratchBaseLegal(Addr))
1964 return false;
1965 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1966 return false;
1967 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1968 return true;
1969 }
1970 }
1971 }
1972
1973 if (Addr.getOpcode() != ISD::ADD)
1974 return false;
1975
1976 LHS = Addr.getOperand(0);
1977 RHS = Addr.getOperand(1);
1978
1979 if (!LHS->isDivergent() && RHS->isDivergent()) {
1980 SAddr = LHS;
1981 VAddr = RHS;
1982 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1983 SAddr = RHS;
1984 VAddr = LHS;
1985 } else {
1986 return false;
1987 }
1988
1989 if (OrigAddr != Addr) {
1990 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1991 return false;
1992 } else {
1993 if (!isFlatScratchBaseLegalSV(OrigAddr))
1994 return false;
1995 }
1996
1997 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1998 return false;
1999 SAddr = SelectSAddrFI(CurDAG, SAddr);
2000 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2001 return true;
2002}
2003
2004// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2005 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2006// Handle the case where the Immediate Offset + SOffset is negative.
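// Illustrative example (values assumed): with ImmOffset = -8 and an SOffset
// whose known minimum value is 4, the sum -4 is negative and the match is
// rejected; a known minimum of 16 would give 8 and be accepted.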
2007bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2008 bool Imm32Only,
2009 bool IsBuffer,
2010 int64_t ImmOffset) const {
2011 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2012 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2013 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2014 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2015 return false;
2016 }
2017
2018 return true;
2019}
2020
2021// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2022// not null) offset. If Imm32Only is true, match only 32-bit immediate
2023// offsets available on CI.
2024bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2025 SDValue *SOffset, SDValue *Offset,
2026 bool Imm32Only, bool IsBuffer,
2027 bool HasSOffset,
2028 int64_t ImmOffset) const {
2029 assert((!SOffset || !Offset) &&
2030 "Cannot match both soffset and offset at the same time!");
2031
2032 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2033 if (!C) {
2034 if (!SOffset)
2035 return false;
2036
2037 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2038 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2039 *SOffset = ByteOffsetNode;
2040 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2041 ImmOffset);
2042 }
2043 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2044 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2045 *SOffset = ByteOffsetNode.getOperand(0);
2046 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2047 ImmOffset);
2048 }
2049 }
2050 return false;
2051 }
2052
2053 SDLoc SL(ByteOffsetNode);
2054
2055 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2056 // offset for S_BUFFER instructions is unsigned.
2057 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2058 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2059 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2060 if (EncodedOffset && Offset && !Imm32Only) {
2061 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2062 return true;
2063 }
2064
2065 // SGPR and literal offsets are unsigned.
2066 if (ByteOffset < 0)
2067 return false;
2068
2069 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2070 if (EncodedOffset && Offset && Imm32Only) {
2071 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2072 return true;
2073 }
2074
2075 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2076 return false;
2077
2078 if (SOffset) {
2079 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2080 *SOffset = SDValue(
2081 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2082 return true;
2083 }
2084
2085 return false;
2086}
2087
2088SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2089 if (Addr.getValueType() != MVT::i32)
2090 return Addr;
2091
2092 // Zero-extend a 32-bit address.
2093 SDLoc SL(Addr);
2094
2095 const MachineFunction &MF = CurDAG->getMachineFunction();
2096 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2097 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2098 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2099
2100 const SDValue Ops[] = {
2101 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2102 Addr,
2103 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2104 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2105 0),
2106 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2107 };
2108
2109 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2110 Ops), 0);
2111}
2112
2113// Match a base and an immediate (if Offset is not null) or an SGPR (if
2114// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2115// true, match only 32-bit immediate offsets available on CI.
2116bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2117 SDValue *SOffset, SDValue *Offset,
2118 bool Imm32Only, bool IsBuffer,
2119 bool HasSOffset,
2120 int64_t ImmOffset) const {
2121 if (SOffset && Offset) {
2122 assert(!Imm32Only && !IsBuffer);
2123 SDValue B;
2124
2125 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2126 return false;
2127
2128 int64_t ImmOff = 0;
2129 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2130 ImmOff = C->getSExtValue();
2131
2132 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2133 ImmOff);
2134 }
2135
2136 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2137 // wraparound, because s_load instructions perform the addition in 64 bits.
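// Illustrative example (values assumed): base 0xFFFFFFF0 + offset 0x20 wraps
// to 0x10 in 32 bits, but the hardware add in 64 bits yields 0x100000010,
// so the split is only safe when the add is known not to wrap (nuw).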
2138 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2139 !Addr->getFlags().hasNoUnsignedWrap())
2140 return false;
2141
2142 SDValue N0, N1;
2143 // Extract the base and offset if possible.
2144 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2145 N0 = Addr.getOperand(0);
2146 N1 = Addr.getOperand(1);
2147 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2148 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2149 }
2150 if (!N0 || !N1)
2151 return false;
2152
2153 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2154 ImmOffset)) {
2155 SBase = N0;
2156 return true;
2157 }
2158 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2159 ImmOffset)) {
2160 SBase = N1;
2161 return true;
2162 }
2163 return false;
2164}
2165
2166bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2167 SDValue *SOffset, SDValue *Offset,
2168 bool Imm32Only) const {
2169 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2170 SBase = Expand32BitAddress(SBase);
2171 return true;
2172 }
2173
2174 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2175 SBase = Expand32BitAddress(Addr);
2176 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2177 return true;
2178 }
2179
2180 return false;
2181}
2182
2183bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2184 SDValue &Offset) const {
2185 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2186}
2187
2188bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2189 SDValue &Offset) const {
2190 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2191 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2192 /* Imm32Only */ true);
2193}
2194
2195bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2196 SDValue &SOffset) const {
2197 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2198}
2199
2200bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2201 SDValue &SOffset,
2202 SDValue &Offset) const {
2203 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2204}
2205
2206bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2207 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2208 /* Imm32Only */ false, /* IsBuffer */ true);
2209}
2210
2211bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2212 SDValue &Offset) const {
2213 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2214 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2215 /* Imm32Only */ true, /* IsBuffer */ true);
2216}
2217
2218bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2219 SDValue &Offset) const {
2220 // Match the (soffset + offset) pair as a 32-bit register base and
2221 // an immediate offset.
2222 return N.getValueType() == MVT::i32 &&
2223 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2224 &Offset, /* Imm32Only */ false,
2225 /* IsBuffer */ true);
2226}
2227
2228bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2229 SDValue &Base,
2230 SDValue &Offset) const {
2231 SDLoc DL(Index);
2232
2233 if (CurDAG->isBaseWithConstantOffset(Index)) {
2234 SDValue N0 = Index.getOperand(0);
2235 SDValue N1 = Index.getOperand(1);
2236 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2237
2238 // (add n0, c0)
2239 // Don't peel off the offset (c0) if doing so could possibly lead
2240 // the base (n0) to be negative.
2241 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2242 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2243 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2244 Base = N0;
2245 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2246 return true;
2247 }
2248 }
2249
2250 if (isa<ConstantSDNode>(Index))
2251 return false;
2252
2253 Base = Index;
2254 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2255 return true;
2256}
2257
2258SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2259 SDValue Val, uint32_t Offset,
2260 uint32_t Width) {
2261 if (Val->isDivergent()) {
2262 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2263 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2264 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2265
2266 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2267 }
2268 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2269 // Pack the offset and width of a BFE into the format expected by
2270 // S_BFE_I32 / S_BFE_U32: in the second source, bits [5:0] contain the
2271 // offset and bits [22:16] the width.
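// Illustrative example (values assumed): Offset = 8, Width = 4 packs to
// 8 | (4 << 16) == 0x40008.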
2272 uint32_t PackedVal = Offset | (Width << 16);
2273 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2274
2275 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2276}
2277
2278void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2279 // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
2280 // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
2281 // Predicate: 0 < b <= c < 32
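// Illustrative example (values assumed): for b = 8 and c = 12,
// (x << 8) >> 12 becomes BFE_U32 x, 4, 20, i.e. it extracts bits [23:4].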
2282
2283 const SDValue &Shl = N->getOperand(0);
2284 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2285 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2286
2287 if (B && C) {
2288 uint32_t BVal = B->getZExtValue();
2289 uint32_t CVal = C->getZExtValue();
2290
2291 if (0 < BVal && BVal <= CVal && CVal < 32) {
2292 bool Signed = N->getOpcode() == ISD::SRA;
2293 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2294 32 - CVal));
2295 return;
2296 }
2297 }
2298 SelectCode(N);
2299}
2300
2301void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2302 switch (N->getOpcode()) {
2303 case ISD::AND:
2304 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2305 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2306 // Predicate: isMask(mask)
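// Illustrative example (values assumed): (x >> 3) & 0xff becomes
// BFE_U32 x, 3, 8 (offset 3, width popcount(0xff) == 8).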
2307 const SDValue &Srl = N->getOperand(0);
2308 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2309 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2310
2311 if (Shift && Mask) {
2312 uint32_t ShiftVal = Shift->getZExtValue();
2313 uint32_t MaskVal = Mask->getZExtValue();
2314
2315 if (isMask_32(MaskVal)) {
2316 uint32_t WidthVal = llvm::popcount(MaskVal);
2317 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2318 WidthVal));
2319 return;
2320 }
2321 }
2322 }
2323 break;
2324 case ISD::SRL:
2325 if (N->getOperand(0).getOpcode() == ISD::AND) {
2326 // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2327 // Predicate: isMask(mask >> b)
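// Illustrative example (values assumed): (x & 0xff0) >> 4 becomes
// BFE_U32 x, 4, 8, since 0xff0 >> 4 == 0xff is a mask of width 8.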
2328 const SDValue &And = N->getOperand(0);
2329 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2330 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2331
2332 if (Shift && Mask) {
2333 uint32_t ShiftVal = Shift->getZExtValue();
2334 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2335
2336 if (isMask_32(MaskVal)) {
2337 uint32_t WidthVal = llvm::popcount(MaskVal);
2338 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2339 WidthVal));
2340 return;
2341 }
2342 }
2343 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2344 SelectS_BFEFromShifts(N);
2345 return;
2346 }
2347 break;
2348 case ISD::SRA:
2349 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2350 SelectS_BFEFromShifts(N);
2351 return;
2352 }
2353 break;
2354
2355 case ISD::SIGN_EXTEND_INREG: {
2356 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2357 SDValue Src = N->getOperand(0);
2358 if (Src.getOpcode() != ISD::SRL)
2359 break;
2360
2361 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2362 if (!Amt)
2363 break;
2364
2365 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2366 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2367 Amt->getZExtValue(), Width));
2368 return;
2369 }
2370 }
2371
2372 SelectCode(N);
2373}
2374
2375bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2376 assert(N->getOpcode() == ISD::BRCOND);
2377 if (!N->hasOneUse())
2378 return false;
2379
2380 SDValue Cond = N->getOperand(1);
2381 if (Cond.getOpcode() == ISD::CopyToReg)
2382 Cond = Cond.getOperand(2);
2383
2384 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2385 return false;
2386
2387 MVT VT = Cond.getOperand(0).getSimpleValueType();
2388 if (VT == MVT::i32)
2389 return true;
2390
2391 if (VT == MVT::i64) {
2392 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2393 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2394 Subtarget->hasScalarCompareEq64();
2395 }
2396
2397 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2398 return true;
2399
2400 return false;
2401}
2402
2403static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2404 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2405 // Special case for amdgcn.ballot:
2406 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2407 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2408 // =>
2409 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2410 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2411 // Cond becomes an i(WaveSize) full mask value.
2412 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2413 // it here for completeness, so in this case Negate is set true on return.
2414 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2415 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2416 isNullConstant(VCMP.getOperand(1))) {
2417
2418 auto Cond = VCMP.getOperand(0);
2419 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2420 Cond = Cond.getOperand(0);
2421
2422 if (isBoolSGPR(Cond)) {
2423 Negate = VCMP_CC == ISD::SETEQ;
2424 return Cond;
2425 }
2426 }
2427 return SDValue();
2428}
2429
2430void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2431 SDValue Cond = N->getOperand(1);
2432
2433 if (Cond.isUndef()) {
2434 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2435 N->getOperand(2), N->getOperand(0));
2436 return;
2437 }
2438
2439 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2440
2441 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2442 bool AndExec = !UseSCCBr;
2443 bool Negate = false;
2444
2445 if (Cond.getOpcode() == ISD::SETCC &&
2446 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2447 SDValue VCMP = Cond->getOperand(0);
2448 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2449 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2450 isNullConstant(Cond->getOperand(1)) &&
2451 // We may encounter ballot.i64 in wave32 mode on -O0.
2452 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2453 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2454 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2455 // BRCOND i1 %C, %BB
2456 // =>
2457 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2458 // VCC = COPY i(WaveSize) %VCMP
2459 // S_CBRANCH_VCCNZ/VCCZ %BB
2460 Negate = CC == ISD::SETEQ;
2461 bool NegatedBallot = false;
2462 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2463 Cond = BallotCond;
2464 UseSCCBr = !BallotCond->isDivergent();
2465 Negate = Negate ^ NegatedBallot;
2466 } else {
2467 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2468 // selected as V_CMP, but this may change for uniform condition.
2469 Cond = VCMP;
2470 UseSCCBr = false;
2471 }
2472 }
2473 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2474 // V_CMPs resulting from a ballot, or a ballot with a uniform condition for
2475 // which SCC is used.
2476 AndExec = false;
2477 }
2478
2479 unsigned BrOp =
2480 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2481 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2482 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2483 SDLoc SL(N);
2484
2485 if (AndExec) {
2486 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2487 // analyzed what generates the vcc value, so we do not know whether vcc
2488 // bits for disabled lanes are 0. Thus we need to mask out bits for
2489 // disabled lanes.
2490 //
2491 // For the case that we select S_CBRANCH_SCC1 and it gets
2492 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2493 // SIInstrInfo::moveToVALU (which inserts the S_AND).
2494 //
2495 // We could add an analysis of what generates the vcc value here and omit
2496 // the S_AND when it is unnecessary. But it would be better to add a separate
2497 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2498 // catches both cases.
2499 Cond = SDValue(
2500 CurDAG->getMachineNode(
2501 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2502 MVT::i1,
2503 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2504 : AMDGPU::EXEC,
2505 MVT::i1),
2506 Cond),
2507 0);
2508 }
2509
2510 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2511 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2512 N->getOperand(2), // Basic Block
2513 VCC.getValue(0));
2514}
2515
2516void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2517 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2518 !N->isDivergent()) {
2519 SDValue Src = N->getOperand(0);
2520 if (Src.getValueType() == MVT::f16) {
2521 if (isExtractHiElt(Src, Src)) {
2522 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2523 {Src});
2524 return;
2525 }
2526 }
2527 }
2528
2529 SelectCode(N);
2530}
2531
2532void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2533 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2534 // be copied to an SGPR with readfirstlane.
2535 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2536 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2537
2538 SDValue Chain = N->getOperand(0);
2539 SDValue Ptr = N->getOperand(2);
2540 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2541 MachineMemOperand *MMO = M->getMemOperand();
2542 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2543
2544 SDValue Offset;
2545 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2546 SDValue PtrBase = Ptr.getOperand(0);
2547 SDValue PtrOffset = Ptr.getOperand(1);
2548
2549 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2550 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2551 N = glueCopyToM0(N, PtrBase);
2552 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2553 }
2554 }
2555
2556 if (!Offset) {
2557 N = glueCopyToM0(N, Ptr);
2558 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2559 }
2560
2561 SDValue Ops[] = {
2562 Offset,
2563 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2564 Chain,
2565 N->getOperand(N->getNumOperands() - 1) // New glue
2566 };
2567
2568 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2569 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2570}
2571
2572// We need to handle this here because tablegen doesn't support matching
2573// instructions with multiple outputs.
2574void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2575 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2576 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2577 N->getOperand(5), N->getOperand(0)};
2578
2579 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2580 MachineMemOperand *MMO = M->getMemOperand();
2581 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2582 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2583}
2584
2585static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2586 switch (IntrID) {
2587 case Intrinsic::amdgcn_ds_gws_init:
2588 return AMDGPU::DS_GWS_INIT;
2589 case Intrinsic::amdgcn_ds_gws_barrier:
2590 return AMDGPU::DS_GWS_BARRIER;
2591 case Intrinsic::amdgcn_ds_gws_sema_v:
2592 return AMDGPU::DS_GWS_SEMA_V;
2593 case Intrinsic::amdgcn_ds_gws_sema_br:
2594 return AMDGPU::DS_GWS_SEMA_BR;
2595 case Intrinsic::amdgcn_ds_gws_sema_p:
2596 return AMDGPU::DS_GWS_SEMA_P;
2597 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2598 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2599 default:
2600 llvm_unreachable("not a gws intrinsic");
2601 }
2602}
2603
2604void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2605 if (!Subtarget->hasGWS() ||
2606 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2607 !Subtarget->hasGWSSemaReleaseAll())) {
2608 // Let this error.
2609 SelectCode(N);
2610 return;
2611 }
2612
2613 // Chain, intrinsic ID, vsrc, offset
2614 const bool HasVSrc = N->getNumOperands() == 4;
2615 assert(HasVSrc || N->getNumOperands() == 3);
2616
2617 SDLoc SL(N);
2618 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2619 int ImmOffset = 0;
2620 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2621 MachineMemOperand *MMO = M->getMemOperand();
2622
2623 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2624 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2625
2626 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2627 // offset field) % 64. Some versions of the programming guide omit the m0
2628 // part, or claim it's from offset 0.
2629 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2630 // If we have a constant offset, try to use the 0 in m0 as the base.
2631 // TODO: Look into changing the default m0 initialization value. If the
2632 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2633 // the immediate offset.
2634 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2635 ImmOffset = ConstOffset->getZExtValue();
2636 } else {
2637 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2638 ImmOffset = BaseOffset.getConstantOperandVal(1);
2639 BaseOffset = BaseOffset.getOperand(0);
2640 }
2641
2642 // Prefer to do the shift in an SGPR since it should be possible to use m0
2643 // as the result directly. If it's already an SGPR, it will be eliminated
2644 // later.
2645 SDNode *SGPROffset
2646 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2647 BaseOffset);
2648 // Shift to offset in m0
2649 SDNode *M0Base
2650 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2651 SDValue(SGPROffset, 0),
2652 CurDAG->getTargetConstant(16, SL, MVT::i32));
2653 glueCopyToM0(N, SDValue(M0Base, 0));
2654 }
2655
2656 SDValue Chain = N->getOperand(0);
2657 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2658
2659 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2660 SmallVector<SDValue, 5> Ops;
2661 if (HasVSrc)
2662 Ops.push_back(N->getOperand(2));
2663 Ops.push_back(OffsetField);
2664 Ops.push_back(Chain);
2665
2666 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2667 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2668}
2669
2670void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2671 if (Subtarget->getLDSBankCount() != 16) {
2672 // This is a single instruction with a pattern.
2673 SelectCode(N);
2674 return;
2675 }
2676
2677 SDLoc DL(N);
2678
2679 // This requires 2 instructions. It is possible to write a pattern to support
2680 // this, but the generated isel emitter doesn't correctly deal with multiple
2681 // output instructions using the same physical register input. The copy to m0
2682 // is incorrectly placed before the second instruction.
2683 //
2684 // TODO: Match source modifiers.
2685 //
2686 // def : Pat <
2687 // (int_amdgcn_interp_p1_f16
2688 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2689 // (i32 timm:$attrchan), (i32 timm:$attr),
2690 // (i1 timm:$high), M0),
2691 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2692 // timm:$attrchan, 0,
2693 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2694 // let Predicates = [has16BankLDS];
2695 // }
2696
2697 // 16 bank LDS
2698 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2699 N->getOperand(5), SDValue());
2700
2701 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2702
2703 SDNode *InterpMov =
2704 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2705 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2706 N->getOperand(3), // Attr
2707 N->getOperand(2), // Attrchan
2708 ToM0.getValue(1) // In glue
2709 });
2710
2711 SDNode *InterpP1LV =
2712 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2713 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2714 N->getOperand(1), // Src0
2715 N->getOperand(3), // Attr
2716 N->getOperand(2), // Attrchan
2717 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2718 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2719 N->getOperand(4), // high
2720 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2721 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2722 SDValue(InterpMov, 1)
2723 });
2724
2725 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2726}
2727
2728void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2729 unsigned IntrID = N->getConstantOperandVal(1);
2730 switch (IntrID) {
2731 case Intrinsic::amdgcn_ds_append:
2732 case Intrinsic::amdgcn_ds_consume: {
2733 if (N->getValueType(0) != MVT::i32)
2734 break;
2735 SelectDSAppendConsume(N, IntrID);
2736 return;
2737 }
2738 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2739 SelectDSBvhStackIntrinsic(N);
2740 return;
2741 case Intrinsic::amdgcn_init_whole_wave:
2742 CurDAG->getMachineFunction()
2743 .getInfo<SIMachineFunctionInfo>()
2744 ->setInitWholeWave();
2745 break;
2746 }
2747
2748 SelectCode(N);
2749}
2750
2751void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2752 unsigned IntrID = N->getConstantOperandVal(0);
2753 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2754 SDNode *ConvGlueNode = N->getGluedNode();
2755 if (ConvGlueNode) {
2756 // FIXME: Possibly iterate over multiple glue nodes?
2757 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2758 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2759 ConvGlueNode =
2760 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2761 MVT::Glue, SDValue(ConvGlueNode, 0));
2762 } else {
2763 ConvGlueNode = nullptr;
2764 }
2765 switch (IntrID) {
2766 case Intrinsic::amdgcn_wqm:
2767 Opcode = AMDGPU::WQM;
2768 break;
2769 case Intrinsic::amdgcn_softwqm:
2770 Opcode = AMDGPU::SOFT_WQM;
2771 break;
2772 case Intrinsic::amdgcn_wwm:
2773 case Intrinsic::amdgcn_strict_wwm:
2774 Opcode = AMDGPU::STRICT_WWM;
2775 break;
2776 case Intrinsic::amdgcn_strict_wqm:
2777 Opcode = AMDGPU::STRICT_WQM;
2778 break;
2779 case Intrinsic::amdgcn_interp_p1_f16:
2780 SelectInterpP1F16(N);
2781 return;
2782 case Intrinsic::amdgcn_permlane16_swap:
2783 case Intrinsic::amdgcn_permlane32_swap: {
2784 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2785 !Subtarget->hasPermlane16Swap()) ||
2786 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2787 !Subtarget->hasPermlane32Swap())) {
2788 SelectCode(N); // Hit the default error
2789 return;
2790 }
2791
2792 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2793 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2794 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2795
2796 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2797 if (ConvGlueNode)
2798 NewOps.push_back(SDValue(ConvGlueNode, 0));
2799
2800 bool FI = N->getConstantOperandVal(3);
2801 NewOps[2] = CurDAG->getTargetConstant(
2802 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(N), MVT::i32);
2803
2804 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2805 return;
2806 }
2807 default:
2808 SelectCode(N);
2809 break;
2810 }
2811
2812 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2813 SDValue Src = N->getOperand(1);
2814 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2815 }
2816
2817 if (ConvGlueNode) {
2818 SmallVector<SDValue, 4> NewOps(N->ops());
2819 NewOps.push_back(SDValue(ConvGlueNode, 0));
2820 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2821 }
2822}
2823
2824void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2825 unsigned IntrID = N->getConstantOperandVal(1);
2826 switch (IntrID) {
2827 case Intrinsic::amdgcn_ds_gws_init:
2828 case Intrinsic::amdgcn_ds_gws_barrier:
2829 case Intrinsic::amdgcn_ds_gws_sema_v:
2830 case Intrinsic::amdgcn_ds_gws_sema_br:
2831 case Intrinsic::amdgcn_ds_gws_sema_p:
2832 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2833 SelectDS_GWS(N, IntrID);
2834 return;
2835 default:
2836 break;
2837 }
2838
2839 SelectCode(N);
2840}
2841
2842void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2843 SDValue Log2WaveSize =
2844 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2845 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2846 {N->getOperand(0), Log2WaveSize});
2847}
2848
2849void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2850 SDValue SrcVal = N->getOperand(1);
2851 if (SrcVal.getValueType() != MVT::i32) {
2852 SelectCode(N); // Emit default error
2853 return;
2854 }
2855
2856 SDValue CopyVal;
2857 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2858 SDLoc SL(N);
2859
2860 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2861 CopyVal = SrcVal.getOperand(0);
2862 } else {
2863 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2864 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2865
2866 if (N->isDivergent()) {
2867 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2868 MVT::i32, SrcVal),
2869 0);
2870 }
2871
2872 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2873 {SrcVal, Log2WaveSize}),
2874 0);
2875 }
2876
2877 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2878 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2879}
2880
2881bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2882 unsigned &Mods,
2883 bool IsCanonicalizing,
2884 bool AllowAbs) const {
2885 Mods = SISrcMods::NONE;
2886 Src = In;
2887
2888 if (Src.getOpcode() == ISD::FNEG) {
2889 Mods |= SISrcMods::NEG;
2890 Src = Src.getOperand(0);
2891 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2892 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2893 // denormal mode, but we're implicitly canonicalizing in a source operand.
2894 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2895 if (LHS && LHS->isZero()) {
2896 Mods |= SISrcMods::NEG;
2897 Src = Src.getOperand(1);
2898 }
2899 }
2900
2901 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2902 Mods |= SISrcMods::ABS;
2903 Src = Src.getOperand(0);
2904 }
2905
2906 return true;
2907}
2908
2909bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2910 SDValue &SrcMods) const {
2911 unsigned Mods;
2912 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2913 /*AllowAbs=*/true)) {
2914 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2915 return true;
2916 }
2917
2918 return false;
2919}
2920
2921bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2922 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2923 unsigned Mods;
2924 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2925 /*AllowAbs=*/true)) {
2926 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2927 return true;
2928 }
2929
2930 return false;
2931}
2932
2933bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2934 SDValue &SrcMods) const {
2935 unsigned Mods;
2936 if (SelectVOP3ModsImpl(In, Src, Mods,
2937 /*IsCanonicalizing=*/true,
2938 /*AllowAbs=*/false)) {
2939 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2940 return true;
2941 }
2942
2943 return false;
2944}
2945
2946bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2947 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2948 return false;
2949
2950 Src = In;
2951 return true;
2952}
2953
2954bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2955 SDValue &SrcMods,
2956 bool OpSel) const {
2957 unsigned Mods;
2958 if (SelectVOP3ModsImpl(In, Src, Mods,
2959 /*IsCanonicalizing=*/true,
2960 /*AllowAbs=*/false)) {
2961 if (OpSel)
2962 Mods |= SISrcMods::OP_SEL_0;
2963 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2964 return true;
2965 }
2966
2967 return false;
2968}
2969
2970bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2971 SDValue &SrcMods) const {
2972 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2973}
2974
2975bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2976 SDValue &SrcMods) const {
2977 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2978}
2979
2980bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2981 SDValue &SrcMods, SDValue &Clamp,
2982 SDValue &Omod) const {
2983 SDLoc DL(In);
2984 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2985 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2986
2987 return SelectVOP3Mods(In, Src, SrcMods);
2988}
2989
2990bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2991 SDValue &SrcMods, SDValue &Clamp,
2992 SDValue &Omod) const {
2993 SDLoc DL(In);
2994 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2995 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2996
2997 return SelectVOP3BMods(In, Src, SrcMods);
2998}
2999
3000bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3001 SDValue &Clamp, SDValue &Omod) const {
3002 Src = In;
3003
3004 SDLoc DL(In);
3005 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3006 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3007
3008 return true;
3009}
3010
3011bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3012 SDValue &SrcMods, bool IsDOT) const {
3013 unsigned Mods = SISrcMods::NONE;
3014 Src = In;
3015
3016 // TODO: Handle G_FSUB 0 as fneg
3017 if (Src.getOpcode() == ISD::FNEG) {
3018 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3019 Src = Src.getOperand(0);
3020 }
3021
3022 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3023 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3024 unsigned VecMods = Mods;
3025
3026 SDValue Lo = stripBitcast(Src.getOperand(0));
3027 SDValue Hi = stripBitcast(Src.getOperand(1));
3028
3029 if (Lo.getOpcode() == ISD::FNEG) {
3030 Lo = stripBitcast(Lo.getOperand(0));
3031 Mods ^= SISrcMods::NEG;
3032 }
3033
3034 if (Hi.getOpcode() == ISD::FNEG) {
3035 Hi = stripBitcast(Hi.getOperand(0));
3036 Mods ^= SISrcMods::NEG_HI;
3037 }
3038
3039 if (isExtractHiElt(Lo, Lo))
3040 Mods |= SISrcMods::OP_SEL_0;
3041
3042 if (isExtractHiElt(Hi, Hi))
3043 Mods |= SISrcMods::OP_SEL_1;
3044
3045 unsigned VecSize = Src.getValueSizeInBits();
3046 Lo = stripExtractLoElt(Lo);
3047 Hi = stripExtractLoElt(Hi);
3048
3049 if (Lo.getValueSizeInBits() > VecSize) {
3050 Lo = CurDAG->getTargetExtractSubreg(
3051 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3052 MVT::getIntegerVT(VecSize), Lo);
3053 }
3054
3055 if (Hi.getValueSizeInBits() > VecSize) {
3056 Hi = CurDAG->getTargetExtractSubreg(
3057 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3058 MVT::getIntegerVT(VecSize), Hi);
3059 }
3060
3061 assert(Lo.getValueSizeInBits() <= VecSize &&
3062 Hi.getValueSizeInBits() <= VecSize);
3063
3064 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3065 // Really a scalar input. Just select from the low half of the register to
3066 // avoid packing.
3067
3068 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3069 Src = Lo;
3070 } else {
3071 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3072
3073 SDLoc SL(In);
3074 SDValue Undef = SDValue(
3075 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3076 Lo.getValueType()), 0);
3077 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3078 : AMDGPU::SReg_64RegClassID;
3079 const SDValue Ops[] = {
3080 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3081 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3082 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3083
3084 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3085 Src.getValueType(), Ops), 0);
3086 }
3087 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3088 return true;
3089 }
3090
3091 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3092 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3093 .bitcastToAPInt().getZExtValue();
3094 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3095 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3096 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3097 return true;
3098 }
3099 }
3100
3101 Mods = VecMods;
3102 }
3103
3104 // Packed instructions do not have abs modifiers.
3105 Mods |= SISrcMods::OP_SEL_1;
3106
3107 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3108 return true;
3109}
3110
3111bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3112 SDValue &SrcMods) const {
3113 return SelectVOP3PMods(In, Src, SrcMods, true);
3114}
3115
3116bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3117 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3118 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3119 // 1 promotes packed values to signed, 0 treats them as unsigned.
3120 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3121
3122 unsigned Mods = SISrcMods::OP_SEL_1;
3123 unsigned SrcSign = C->getZExtValue();
3124 if (SrcSign == 1)
3125 Mods ^= SISrcMods::NEG;
3126
3127 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3128 return true;
3129}
3130
3131bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3132 SDValue &Src) const {
3133 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3134 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3135
3136 unsigned Mods = SISrcMods::OP_SEL_1;
3137 unsigned SrcVal = C->getZExtValue();
3138 if (SrcVal == 1)
3139 Mods |= SISrcMods::OP_SEL_0;
3140
3141 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3142 return true;
3143}
3144
3145 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3146 llvm::SelectionDAG *CurDAG,
3147 const SDLoc &DL) {
3148 unsigned DstRegClass;
3149 EVT DstTy;
3150 switch (Elts.size()) {
3151 case 8:
3152 DstRegClass = AMDGPU::VReg_256RegClassID;
3153 DstTy = MVT::v8i32;
3154 break;
3155 case 4:
3156 DstRegClass = AMDGPU::VReg_128RegClassID;
3157 DstTy = MVT::v4i32;
3158 break;
3159 case 2:
3160 DstRegClass = AMDGPU::VReg_64RegClassID;
3161 DstTy = MVT::v2i32;
3162 break;
3163 default:
3164 llvm_unreachable("unhandled Reg sequence size");
3165 }
3166
3167 SmallVector<SDValue, 8 + 1> Ops;
3168 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3169 for (unsigned i = 0; i < Elts.size(); ++i) {
3170 Ops.push_back(Elts[i]);
3171 Ops.push_back(CurDAG->getTargetConstant(
3172 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3173 }
3174 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3175}
3176
3177 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3178 llvm::SelectionDAG *CurDAG,
3179 const SDLoc &DL) {
3180 SmallVector<SDValue, 8> PackedElts;
3181 assert((Elts.size() == 8 || Elts.size() == 16) &&
3182 "unhandled Reg sequence size");
3183
3184 // Pack pairs of 16-bit elements into a 32-bit register. If both elements are
3185 // unpacked from the same 32-bit source, use it; otherwise pack them with v_perm.
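// Illustrative note (assumed operand order): for the V_PERM_B32 below,
// selector 0x05040100 places bytes 1:0 of src1 (Elts[i]) in the result's low
// half and bytes 1:0 of src0 (Elts[i + 1]) in the high half, packing two low
// f16 values into one dword.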
3186 for (unsigned i = 0; i < Elts.size(); i += 2) {
3187 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3188 SDValue HiSrc;
3189 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3190 PackedElts.push_back(HiSrc);
3191 } else {
3192 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3193 MachineSDNode *Packed =
3194 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3195 {Elts[i + 1], Elts[i], PackLoLo});
3196 PackedElts.push_back(SDValue(Packed, 0));
3197 }
3198 }
3199
3200 return buildRegSequence32(PackedElts, CurDAG, DL);
3201}
3202
3203 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3204 llvm::SelectionDAG *CurDAG,
3205 const SDLoc &DL, unsigned ElementSize) {
3206 if (ElementSize == 16)
3207 return buildRegSequence16(Elts, CurDAG, DL);
3208 if (ElementSize == 32)
3209 return buildRegSequence32(Elts, CurDAG, DL);
3210 llvm_unreachable("Unhandled element size");
3211}
3212
3213static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3214 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3215 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3216 unsigned ElementSize) {
3217 if (ModOpcode == ISD::FNEG) {
3218 Mods |= SISrcMods::NEG;
3219 // Check if all elements also have abs modifier
3220 SmallVector<SDValue, 8> NegAbsElts;
3221 for (auto El : Elts) {
3222 if (El.getOpcode() != ISD::FABS)
3223 break;
3224 NegAbsElts.push_back(El->getOperand(0));
3225 }
3226 if (Elts.size() != NegAbsElts.size()) {
3227 // Neg
3228 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3229 } else {
3230 // Neg and Abs
3231 Mods |= SISrcMods::NEG_HI;
3232 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3233 }
3234 } else {
3235 assert(ModOpcode == ISD::FABS);
3236 // Abs
3237 Mods |= SISrcMods::NEG_HI;
3238 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3239 }
3240}
3241
3242// Check all f16 elements for modifiers while looking through b32 and v2b16
3243 // build vectors; stop if an element does not satisfy ModifierCheck.
3244static void
3245 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3246 std::function<bool(SDValue)> ModifierCheck) {
3247 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3248 if (auto *F16Pair =
3249 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3250 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3251 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3252 if (!ModifierCheck(ElF16))
3253 break;
3254 }
3255 }
3256 }
3257}
3258
3259bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3260 SDValue &SrcMods) const {
3261 Src = In;
3262 unsigned Mods = SISrcMods::OP_SEL_1;
3263
3264 // mods are on f16 elements
3265 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3266 SmallVector<SDValue, 8> EltsF16;
3267
3268 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3269 if (Element.getOpcode() != ISD::FNEG)
3270 return false;
3271 EltsF16.push_back(Element.getOperand(0));
3272 return true;
3273 });
3274
3275 // All elements have neg modifier
3276 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3277 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3278 Mods |= SISrcMods::NEG;
3279 Mods |= SISrcMods::NEG_HI;
3280 }
3281 }
3282
3283 // mods are on v2f16 elements
3284 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3285 SmallVector<SDValue, 8> EltsV2F16;
3286 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3287 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3288 // This variant only folds neg; stop at the first element that is not fneg.
3289 if (ElV2f16.getOpcode() != ISD::FNEG)
3290 break;
3291 EltsV2F16.push_back(ElV2f16.getOperand(0));
3292 }
3293
3294 // All pairs of elements have neg modifier
3295 if (BV->getNumOperands() == EltsV2F16.size()) {
3296 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3297 Mods |= SISrcMods::NEG;
3298 Mods |= SISrcMods::NEG_HI;
3299 }
3300 }
3301
3302 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3303 return true;
3304}
3305
3306bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3307 SDValue &SrcMods) const {
3308 Src = In;
3309 unsigned Mods = SISrcMods::OP_SEL_1;
3310 unsigned ModOpcode;
3311
3312 // mods are on f16 elements
3313 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3314 SmallVector<SDValue, 8> EltsF16;
3315 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3316 // Based on first element decide which mod we match, neg or abs
3317 if (EltsF16.empty())
3318 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3319 if (ElF16.getOpcode() != ModOpcode)
3320 return false;
3321 EltsF16.push_back(ElF16.getOperand(0));
3322 return true;
3323 });
3324
3325 // All elements have ModOpcode modifier
3326 if (BV->getNumOperands() * 2 == EltsF16.size())
3327 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3328 16);
3329 }
3330
3331 // mods are on v2f16 elements
3332 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3333 SmallVector<SDValue, 8> EltsV2F16;
3334
3335 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3336 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3337 // Based on first element decide which mod we match, neg or abs
3338 if (EltsV2F16.empty())
3339 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3340 if (ElV2f16->getOpcode() != ModOpcode)
3341 break;
3342 EltsV2F16.push_back(ElV2f16->getOperand(0));
3343 }
3344
3345 // All elements have ModOpcode modifier
3346 if (BV->getNumOperands() == EltsV2F16.size())
3347 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3348 32);
3349 }
3350
3351 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3352 return true;
3353}
3354
3355bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3356 SDValue &SrcMods) const {
3357 Src = In;
3358 unsigned Mods = SISrcMods::OP_SEL_1;
3359 SmallVector<SDValue, 8> EltsF32;
3360
3361 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3362 assert(BV->getNumOperands() > 0);
3363 // Based on first element decide which mod we match, neg or abs
3364 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3365 unsigned ModOpcode =
3366 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3367 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3368 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3369 if (ElF32.getOpcode() != ModOpcode)
3370 break;
3371 EltsF32.push_back(ElF32.getOperand(0));
3372 }
3373
3374 // All elements had ModOpcode modifier
3375 if (BV->getNumOperands() == EltsF32.size())
3376 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3377 32);
3378 }
3379
3380 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3381 return true;
3382}
3383
3384bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3385 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3386 BitVector UndefElements;
3387 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3388 if (isInlineImmediate(Splat.getNode())) {
3389 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3390 unsigned Imm = C->getAPIntValue().getSExtValue();
3391 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3392 return true;
3393 }
3394 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3395 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3396 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3397 return true;
3398 }
3399 llvm_unreachable("unhandled Constant node");
3400 }
3401 }
3402
3403 // 16 bit splat
3404 SDValue SplatSrc32 = stripBitcast(In);
3405 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3406 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3407 SDValue SplatSrc16 = stripBitcast(Splat32);
3408 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3409 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3411 std::optional<APInt> RawValue;
3412 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3413 RawValue = C->getValueAPF().bitcastToAPInt();
3414 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3415 RawValue = C->getAPIntValue();
3416
3417 if (RawValue.has_value()) {
3418 EVT VT = In.getValueType().getScalarType();
3419 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3420 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3421 ? APFloat::IEEEhalf()
3422 : APFloat::BFloat(),
3423 RawValue.value());
3424 if (TII->isInlineConstant(FloatVal)) {
3425 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3426 MVT::i16);
3427 return true;
3428 }
3429 } else if (VT.getSimpleVT() == MVT::i16) {
3430 if (TII->isInlineConstant(RawValue.value())) {
3431 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3432 MVT::i16);
3433 return true;
3434 }
3435 } else
3436 llvm_unreachable("unknown 16-bit type");
3437 }
3438 }
3439 }
3440
3441 return false;
3442}
3443
3444bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3445 SDValue &IndexKey) const {
3446 unsigned Key = 0;
3447 Src = In;
3448
3449 if (In.getOpcode() == ISD::SRL) {
3450 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3451 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3452 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3453 ShiftAmt->getZExtValue() % 8 == 0) {
3454 Key = ShiftAmt->getZExtValue() / 8;
3455 Src = ShiftSrc;
3456 }
3457 }
3458
3459 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3460 return true;
3461}
3462
3463bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3464 SDValue &IndexKey) const {
3465 unsigned Key = 0;
3466 Src = In;
3467
3468 if (In.getOpcode() == ISD::SRL) {
3469 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3470 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3471 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3472 ShiftAmt->getZExtValue() == 16) {
3473 Key = 1;
3474 Src = ShiftSrc;
3475 }
3476 }
3477
3478 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3479 return true;
3480}
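
// --- Illustrative sketch (editorial addition, not part of the original file).
// Both index selectors above turn a right shift into a small "index key": for
// the 8-bit form a shift by 8*K selects byte K, and for the 16-bit form a
// shift by 16 selects the high half. The arithmetic, in isolation:
#include <cstdint>

namespace sketch_index_key {
constexpr unsigned indexKey8(uint64_t ShiftAmt) {
  // Only whole-byte shifts are matched; anything else leaves the key at 0 and
  // the unshifted value is used, as in the selectors above.
  return (ShiftAmt % 8 == 0) ? unsigned(ShiftAmt / 8) : 0u;
}
constexpr unsigned indexKey16(uint64_t ShiftAmt) {
  return ShiftAmt == 16 ? 1u : 0u;
}
static_assert(indexKey8(24) == 3 && indexKey8(7) == 0, "");
static_assert(indexKey16(16) == 1 && indexKey16(8) == 0, "");
} // namespace sketch_index_key
// --- end sketch ---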
3481
3482bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3483 SDValue &SrcMods) const {
3484 Src = In;
3485 // FIXME: Handle op_sel
3486 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3487 return true;
3488}
3489
3490bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3491 SDValue &SrcMods) const {
3492 // FIXME: Handle op_sel
3493 return SelectVOP3Mods(In, Src, SrcMods);
3494}
3495
3496 // The return value is not whether the match is possible (which it always is),
3497 // but whether or not a conversion is really used.
3498bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3499 unsigned &Mods) const {
3500 Mods = 0;
3501 SelectVOP3ModsImpl(In, Src, Mods);
3502
3503 if (Src.getOpcode() == ISD::FP_EXTEND) {
3504 Src = Src.getOperand(0);
3505 assert(Src.getValueType() == MVT::f16);
3506 Src = stripBitcast(Src);
3507
3508 // Be careful about folding modifiers if we already have an abs. fneg is
3509 // applied last, so we don't want to apply an earlier fneg.
3510 if ((Mods & SISrcMods::ABS) == 0) {
3511 unsigned ModsTmp;
3512 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3513
3514 if ((ModsTmp & SISrcMods::NEG) != 0)
3515 Mods ^= SISrcMods::NEG;
3516
3517 if ((ModsTmp & SISrcMods::ABS) != 0)
3518 Mods |= SISrcMods::ABS;
3519 }
3520
3521 // op_sel/op_sel_hi decide the source type and source.
3522 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3523 // If the source's op_sel is set, it picks the high half of the source
3524 // register.
3525
3526 Mods |= SISrcMods::OP_SEL_1;
3527 if (isExtractHiElt(Src, Src)) {
3528 Mods |= SISrcMods::OP_SEL_0;
3529
3530 // TODO: Should we try to look for neg/abs here?
3531 }
3532
3533 return true;
3534 }
3535
3536 return false;
3537}
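
// --- Illustrative sketch (editorial addition, not part of the original file).
// The folding above obeys two composition rules: a nested fneg toggles the NEG
// bit (fneg(fneg(x)) == x), a nested fabs ORs in the ABS bit (abs is
// idempotent), and nothing is folded through an outer abs since |fneg(x)| ==
// |x|. With hypothetical bit masks mirroring the shape of SISrcMods:
namespace sketch_mods_fold {
enum : unsigned { NEG = 1u << 0, ABS = 1u << 1 }; // hypothetical encoding

constexpr unsigned foldInner(unsigned Outer, unsigned Inner) {
  return (Outer & ABS)
             ? Outer // an outer abs hides any inner fneg; fold nothing
             : ((Inner & NEG ? Outer ^ NEG : Outer) | (Inner & ABS));
}
// fneg(fp_extend(fneg(x))) folds to a plain extend of x:
static_assert(foldInner(NEG, NEG) == 0, "");
// fabs(fp_extend(fneg(x))): the inner fneg is not folded through the abs:
static_assert(foldInner(ABS, NEG) == ABS, "");
} // namespace sketch_mods_fold
// --- end sketch ---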
3538
3539bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3540 SDValue &SrcMods) const {
3541 unsigned Mods = 0;
3542 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3543 return false;
3544 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3545 return true;
3546}
3547
3548bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3549 SDValue &SrcMods) const {
3550 unsigned Mods = 0;
3551 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3552 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3553 return true;
3554}
3555
3556 // Match a BITOP3 operation and return the number of matched instructions
3557 // plus the truth table.
3558 static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3559                                               SmallVectorImpl<SDValue> &Src) {
3560   unsigned NumOpcodes = 0;
3561 uint8_t LHSBits, RHSBits;
3562
3563 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3564 // Define the truth table given Src0, Src1, Src2 bit permutations:
3565 // 0 0 0
3566 // 0 0 1
3567 // 0 1 0
3568 // 0 1 1
3569 // 1 0 0
3570 // 1 0 1
3571 // 1 1 0
3572 // 1 1 1
3573 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3574
3575 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3576 if (C->isAllOnes()) {
3577 Bits = 0xff;
3578 return true;
3579 }
3580 if (C->isZero()) {
3581 Bits = 0;
3582 return true;
3583 }
3584 }
3585
3586 for (unsigned I = 0; I < Src.size(); ++I) {
3587 // Try to find an existing reused operand
3588 if (Src[I] == Op) {
3589 Bits = SrcBits[I];
3590 return true;
3591 }
3592 // Try to replace the parent operator
3593 if (Src[I] == In) {
3594 Bits = SrcBits[I];
3595 Src[I] = Op;
3596 return true;
3597 }
3598 }
3599
3600 if (Src.size() == 3) {
3601 // No room left for operands. Try one last time; there can be a 'not' of
3602 // one of our source operands. In that case we can compute the bits
3603 // without growing the Src vector.
3604 if (Op.getOpcode() == ISD::XOR) {
3605 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3606 if (C->isAllOnes()) {
3607 SDValue LHS = Op.getOperand(0);
3608 for (unsigned I = 0; I < Src.size(); ++I) {
3609 if (Src[I] == LHS) {
3610 Bits = ~SrcBits[I];
3611 return true;
3612 }
3613 }
3614 }
3615 }
3616 }
3617
3618 return false;
3619 }
3620
3621 Bits = SrcBits[Src.size()];
3622 Src.push_back(Op);
3623 return true;
3624 };
3625
3626 switch (In.getOpcode()) {
3627 case ISD::AND:
3628 case ISD::OR:
3629 case ISD::XOR: {
3630 SDValue LHS = In.getOperand(0);
3631 SDValue RHS = In.getOperand(1);
3632
3633 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3634 if (!getOperandBits(LHS, LHSBits) ||
3635 !getOperandBits(RHS, RHSBits)) {
3636 Src = Backup;
3637 return std::make_pair(0, 0);
3638 }
3639
3640 // Recursion is naturally limited by the size of the operand vector.
3641 auto Op = BitOp3_Op(LHS, Src);
3642 if (Op.first) {
3643 NumOpcodes += Op.first;
3644 LHSBits = Op.second;
3645 }
3646
3647 Op = BitOp3_Op(RHS, Src);
3648 if (Op.first) {
3649 NumOpcodes += Op.first;
3650 RHSBits = Op.second;
3651 }
3652 break;
3653 }
3654 default:
3655 return std::make_pair(0, 0);
3656 }
3657
3658 uint8_t TTbl;
3659 switch (In.getOpcode()) {
3660 case ISD::AND:
3661 TTbl = LHSBits & RHSBits;
3662 break;
3663 case ISD::OR:
3664 TTbl = LHSBits | RHSBits;
3665 break;
3666 case ISD::XOR:
3667 TTbl = LHSBits ^ RHSBits;
3668 break;
3669 default:
3670 break;
3671 }
3672
3673 return std::make_pair(NumOpcodes + 1, TTbl);
3674}
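
// --- Illustrative sketch (editorial addition, not part of the original file).
// BitOp3_Op assigns each distinct source a characteristic byte (Src0 = 0xf0,
// Src1 = 0xcc, Src2 = 0xaa, one bit per row of the 3-input truth table), so
// any and/or/xor tree over at most three values folds into byte-wide bit
// operations on those constants. For example, for (a & b) | c:
#include <cstdint>

namespace sketch_bitop3_table {
constexpr uint8_t A = 0xf0, B = 0xcc, C = 0xaa;
// The BITOP3 truth table for (a & b) | c:
static_assert(uint8_t((A & B) | C) == 0xea, "");
// A 'not' of an already-matched source is just the complement of its byte,
// which is why the Src.size() == 3 case above can absorb it for free:
static_assert(uint8_t(~A) == 0x0f, "");
} // namespace sketch_bitop3_table
// --- end sketch ---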
3675
3676bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3677 SDValue &Src2, SDValue &Tbl) const {
3678   SmallVector<SDValue, 3> Src;
3679   uint8_t TTbl;
3680 unsigned NumOpcodes;
3681
3682 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3683
3684   // The Src.empty() case can happen if all operands are all-zeros or all-ones.
3685   // Normally it should have been optimized out before reaching this point.
3686 if (NumOpcodes < 2 || Src.empty())
3687 return false;
3688
3689   // For the uniform case the threshold should be higher to account for moves
3690   // between VGPRs and SGPRs: one operand must be in a VGPR, the other two can
3691   // be in SGPRs, and a readfirstlane is needed afterwards.
3692 if (NumOpcodes < 4 && !In->isDivergent())
3693 return false;
3694
3695 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3696     // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
3697     // makes the asm more readable. This cannot be modeled with AddedComplexity
3698     // because the selector does not know how many operations we matched.
3699 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3700 (In.getOperand(0).getOpcode() == In.getOpcode() ||
3701 In.getOperand(1).getOpcode() == In.getOpcode()))
3702 return false;
3703
3704 if (In.getOpcode() == ISD::OR &&
3705 (In.getOperand(0).getOpcode() == ISD::AND ||
3706 In.getOperand(1).getOpcode() == ISD::AND))
3707 return false;
3708 }
3709
3710   // The last operand can be ignored, turning a ternary operation into a
3711   // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
3712   // replace 'c' with 'a' here without changing the answer. In some
3713   // pathological cases it should even be possible to get an operation with a
3714   // single operand, if the optimizer has not caught it.
3715 while (Src.size() < 3)
3716 Src.push_back(Src[0]);
3717
3718 Src0 = Src[0];
3719 Src1 = Src[1];
3720 Src2 = Src[2];
3721
3722 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3723 return true;
3724}
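
// --- Illustrative sketch (editorial addition, not part of the original file).
// Padding Src with a copy of Src[0] above is sound because a table computed
// from fewer than three sources never consults the unused input: rows that
// differ only in that input carry identical bits. Exhaustive check for a
// two-source table, using the row indexing from the comment in BitOp3_Op:
#include <cstdint>

namespace sketch_bitop3_pad {
constexpr uint8_t A = 0xf0, B = 0xcc;
constexpr uint8_t TTbl = uint8_t(~A & B); // table for (~a & b); c unused

constexpr bool rowsIgnoreC() {
  for (int a = 0; a < 2; ++a)
    for (int b = 0; b < 2; ++b)
      for (int c = 0; c < 2; ++c) {
        int Row = (a << 2) | (b << 1) | c;
        if (bool((TTbl >> Row) & 1) != (!a && b))
          return false;
      }
  return true;
}
static_assert(rowsIgnoreC(), "output must be independent of the padded input");
} // namespace sketch_bitop3_pad
// --- end sketch ---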
3725
3726SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3727 if (In.isUndef())
3728 return CurDAG->getUNDEF(MVT::i32);
3729
3730 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3731 SDLoc SL(In);
3732 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3733 }
3734
3735 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3736 SDLoc SL(In);
3737 return CurDAG->getConstant(
3738 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3739 }
3740
3741 SDValue Src;
3742 if (isExtractHiElt(In, Src))
3743 return Src;
3744
3745 return SDValue();
3746}
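
// --- Illustrative sketch (editorial addition, not part of the original file).
// getHi16Elt re-expresses a 16-bit constant as a 32-bit value with the payload
// in the upper half, the form expected for the high element of a packed pair.
// E.g. the f16 constant 1.0 (bit pattern 0x3c00):
#include <cstdint>

namespace sketch_hi16 {
constexpr uint32_t toHi16(uint16_t Bits) { return uint32_t(Bits) << 16; }
static_assert(toHi16(0x3c00) == 0x3c000000u, "");
} // namespace sketch_hi16
// --- end sketch ---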
3747
3748bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3749   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3750
3751 const SIRegisterInfo *SIRI =
3752 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3753 const SIInstrInfo * SII =
3754 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3755
3756 unsigned Limit = 0;
3757 bool AllUsesAcceptSReg = true;
3758 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3759 Limit < 10 && U != E; ++U, ++Limit) {
3760 const TargetRegisterClass *RC =
3761 getOperandRegClass(U->getUser(), U->getOperandNo());
3762
3763     // If the register class is unknown, it could be a register class that
3764     // needs to be an SGPR, e.g. an inline asm constraint.
3766 if (!RC || SIRI->isSGPRClass(RC))
3767 return false;
3768
3769 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3770 AllUsesAcceptSReg = false;
3771 SDNode *User = U->getUser();
3772 if (User->isMachineOpcode()) {
3773 unsigned Opc = User->getMachineOpcode();
3774 const MCInstrDesc &Desc = SII->get(Opc);
3775 if (Desc.isCommutable()) {
3776 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3777 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3778 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3779 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3780 const TargetRegisterClass *CommutedRC =
3781 getOperandRegClass(U->getUser(), CommutedOpNo);
3782 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3783 CommutedRC == &AMDGPU::VS_64RegClass)
3784 AllUsesAcceptSReg = true;
3785 }
3786 }
3787 }
3788       // If AllUsesAcceptSReg is still false, we have not succeeded in
3789       // commuting the current user. This means there is at least one use
3790       // that strictly requires a VGPR, so we will not attempt to commute
3791       // any other user instructions.
3792 if (!AllUsesAcceptSReg)
3793 break;
3794 }
3795 }
3796 return !AllUsesAcceptSReg && (Limit < 10);
3797}
3798
3799bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3800 const auto *Ld = cast<LoadSDNode>(N);
3801
3802 const MachineMemOperand *MMO = Ld->getMemOperand();
3803 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3804 return false;
3805
3806 return MMO->getSize().hasValue() &&
3807 Ld->getAlign() >=
3808 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3809 uint64_t(4))) &&
3810 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3811 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3812 (Subtarget->getScalarizeGlobalBehavior() &&
3813 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3814 Ld->isSimple() &&
3815 static_cast<const SITargetLowering *>(getTargetLowering())
3816 ->isMemOpHasNoClobberedMemOperand(N)));
3817}
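
// --- Illustrative sketch (editorial addition, not part of the original file).
// The alignment clause above only demands natural alignment up to four bytes:
// a load is acceptable when its alignment is at least min(size-in-bytes, 4).
#include <algorithm>
#include <cstdint>

namespace sketch_uniform_align {
constexpr bool alignmentOK(uint64_t SizeInBytes, uint64_t AlignInBytes) {
  return AlignInBytes >= std::min<uint64_t>(SizeInBytes, 4);
}
// A 16-byte load only needs 4-byte alignment; a 4-byte load needs all 4.
static_assert(alignmentOK(16, 4) && !alignmentOK(4, 2), "");
} // namespace sketch_uniform_align
// --- end sketch ---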
3818
3819 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3820   const AMDGPUTargetLowering& Lowering =
3821       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3822 bool IsModified = false;
3823 do {
3824 IsModified = false;
3825
3826 // Go over all selected nodes and try to fold them a bit more
3827     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3828     while (Position != CurDAG->allnodes_end()) {
3829 SDNode *Node = &*Position++;
3830 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3831 if (!MachineNode)
3832 continue;
3833
3834 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3835 if (ResNode != Node) {
3836 if (ResNode)
3837 ReplaceUses(Node, ResNode);
3838 IsModified = true;
3839 }
3840 }
3841     CurDAG->RemoveDeadNodes();
3842   } while (IsModified);
3843}
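
// --- Illustrative sketch (editorial addition, not part of the original file).
// PostprocessISelDAG is a run-to-fixpoint loop: it reapplies a local rewrite
// until one full sweep over the node list makes no change. The same shape on a
// plain container, with Fold returning true when it changed a node:
#include <vector>

namespace sketch_fixpoint {
template <typename T, typename Rewrite>
void foldToFixpoint(std::vector<T> &Nodes, Rewrite Fold) {
  bool Modified;
  do {
    Modified = false;
    for (T &N : Nodes)
      Modified |= Fold(N);
  } while (Modified); // terminates once a whole sweep is change-free
}
} // namespace sketch_fixpoint
// --- end sketch ---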
3844
3845 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3846                                                    CodeGenOptLevel OptLevel)
3847     : SelectionDAGISelLegacy(
3848           ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3849