1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
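// Illustrative examples of the mapping above (added for clarity, not part of
// the upstream source): a v4i8 store (32 bits) maps to i32, a v4i16 store
// (64 bits) maps to v2i32, and a v3i16 store (48 bits) is left unchanged
// because its size is neither <= 32 bits nor a multiple of 32 bits.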
50
53}
54
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
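// Added note (assumed usage, inferred from the comment above): the value
// returned here is the minimum signed bit width of the operand, so a caller
// can treat an operand as a signed 24-bit value when this returns <= 24.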
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
378 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
380 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
382
383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
385
387
388 // For R600, this is totally unsupported, just custom lower to produce an
389 // error.
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
396 MVT::f32, Legal);
397
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
401 {MVT::f16, MVT::f32, MVT::f64}, Expand);
402
405 Custom);
406
407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412 Expand);
413
414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
418 else {
419 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
421 }
422
424 Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
430 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
433 MVT::v16f64},
434 Custom);
435
436 if (isTypeLegal(MVT::f16))
438 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
439 Custom);
440
441 // Expand to fneg + fadd.
443
445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
454 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
461 Custom);
462
464 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
465
466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
467 for (MVT VT : ScalarIntVTs) {
468 // These should use [SU]DIVREM, so set them to expand
470 Expand);
471
472 // GPU does not have divrem function for signed or unsigned.
474
475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
477
479
480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
482 }
483
484 // The hardware supports 32-bit FSHR, but not FSHL.
486
487 // The hardware supports 32-bit ROTR, but not ROTL.
488 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
490
492
496 MVT::i64, Custom);
498
500 Legal);
501
504 MVT::i64, Custom);
505
506 for (auto VT : {MVT::i8, MVT::i16})
508
509 static const MVT::SimpleValueType VectorIntTypes[] = {
510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
512
513 for (MVT VT : VectorIntTypes) {
514 // Expand the following operations for the current type by default.
527 VT, Expand);
528 }
529
530 static const MVT::SimpleValueType FloatVectorTypes[] = {
531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
533
534 for (MVT VT : FloatVectorTypes) {
547 VT, Expand);
548 }
549
550 // This causes using an unrolled select operation rather than expansion with
551 // bit operations. This is in general better, but the alternative using BFI
552 // instructions may be better if the select sources are SGPRs.
554 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
555
557 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
558
560 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
582
584 setJumpIsExpensive(true);
585
586 // FIXME: This is only partially true. If we have to do vector compares, any
587 // SGPR pair can be a condition register. If we have a uniform condition, we
588 // are better off doing SALU operations, where there is only one SCC. For now,
589 // we don't have a way of knowing during instruction selection if a condition
590 // will be uniform and we always use vector compares. Assume we are using
591 // vector compares until that is fixed.
593
596
598
599 // We want to find all load dependencies for long chains of stores to enable
600 // merging into very wide vectors. The problem is with vectors with > 4
601 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
602 // vectors are a legal type, even though we have to split the loads
603 // usually. When we can more precisely specify load legality per address
604 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
605 // smarter so that they can figure out what to do in 2 iterations without all
606 // N > 4 stores on the same chain.
608
609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
610 // about these during lowering.
611 MaxStoresPerMemcpy = 0xffffffff;
612 MaxStoresPerMemmove = 0xffffffff;
613 MaxStoresPerMemset = 0xffffffff;
614
615 // The expansion for 64-bit division is enormous.
617 addBypassSlowDiv(64, 32);
618
629
633}
634
636 if (getTargetMachine().Options.NoSignedZerosFPMath)
637 return true;
638
639 const auto Flags = Op.getNode()->getFlags();
640 if (Flags.hasNoSignedZeros())
641 return true;
642
643 return false;
644}
645
646//===----------------------------------------------------------------------===//
647// Target Information
648//===----------------------------------------------------------------------===//
649
651static bool fnegFoldsIntoOpcode(unsigned Opc) {
652 switch (Opc) {
653 case ISD::FADD:
654 case ISD::FSUB:
655 case ISD::FMUL:
656 case ISD::FMA:
657 case ISD::FMAD:
658 case ISD::FMINNUM:
659 case ISD::FMAXNUM:
662 case ISD::FMINIMUM:
663 case ISD::FMAXIMUM:
664 case ISD::SELECT:
665 case ISD::FSIN:
666 case ISD::FTRUNC:
667 case ISD::FRINT:
668 case ISD::FNEARBYINT:
669 case ISD::FROUNDEVEN:
671 case AMDGPUISD::RCP:
678 case AMDGPUISD::FMED3:
679 // TODO: handle llvm.amdgcn.fma.legacy
680 return true;
681 case ISD::BITCAST:
682 llvm_unreachable("bitcast is special cased");
683 default:
684 return false;
685 }
686}
687
688static bool fnegFoldsIntoOp(const SDNode *N) {
689 unsigned Opc = N->getOpcode();
690 if (Opc == ISD::BITCAST) {
691 // TODO: Is there a benefit to checking the conditions performFNegCombine
692 // does? We don't for the other cases.
693 SDValue BCSrc = N->getOperand(0);
694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
695 return BCSrc.getNumOperands() == 2 &&
696 BCSrc.getOperand(1).getValueSizeInBits() == 32;
697 }
698
699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
700 }
701
702 return fnegFoldsIntoOpcode(Opc);
703}
704
705/// \returns true if the operation will definitely need to use a 64-bit
706/// encoding, and thus will use a VOP3 encoding regardless of the source
707/// modifiers.
709static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
711 VT == MVT::f64;
712}
713
714/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
715/// the given type when used for ISD::SELECT.
717static bool selectSupportsSourceMods(const SDNode *N) {
718 // TODO: Only applies if select will be vector
719 return N->getValueType(0) == MVT::f32;
720}
721
722// Most FP instructions support source modifiers, but this could be refined
723// slightly.
725static bool hasSourceMods(const SDNode *N) {
726 if (isa<MemSDNode>(N))
727 return false;
728
729 switch (N->getOpcode()) {
730 case ISD::CopyToReg:
731 case ISD::FDIV:
732 case ISD::FREM:
733 case ISD::INLINEASM:
737
738 // TODO: Should really be looking at the users of the bitcast. These are
739 // problematic because bitcasts are used to legalize all stores to integer
740 // types.
741 case ISD::BITCAST:
742 return false;
744 switch (N->getConstantOperandVal(0)) {
745 case Intrinsic::amdgcn_interp_p1:
746 case Intrinsic::amdgcn_interp_p2:
747 case Intrinsic::amdgcn_interp_mov:
748 case Intrinsic::amdgcn_interp_p1_f16:
749 case Intrinsic::amdgcn_interp_p2_f16:
750 return false;
751 default:
752 return true;
753 }
754 }
755 case ISD::SELECT:
757 default:
758 return true;
759 }
760}
761
763 unsigned CostThreshold) {
764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
765 // a source modifier is truly free for them in all cases. If there are
766 // multiple users and each of them would need VOP3 only to fold the modifier,
767 // there will be a code size increase. Try to avoid increasing code size
768 // unless we know it will save on the instruction count.
769 unsigned NumMayIncreaseSize = 0;
770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
771
772 assert(!N->use_empty());
773
774 // XXX - Should this limit number of uses to check?
775 for (const SDNode *U : N->users()) {
776 if (!hasSourceMods(U))
777 return false;
778
779 if (!opMustUseVOP3Encoding(U, VT)) {
780 if (++NumMayIncreaseSize > CostThreshold)
781 return false;
782 }
783 }
784
785 return true;
786}
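// Added illustration (assumes the default CostThreshold used by callers in
// this file): if an fneg result feeds several VOP2-encodable users such as
// f32 fadd, each of those users can only absorb the negate by growing to the
// 64-bit VOP3 encoding, so once more than CostThreshold users would have to
// grow, this helper reports that folding the modifier is not worthwhile.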
787
789 ISD::NodeType ExtendKind) const {
790 assert(!VT.isVector() && "only scalar expected");
791
792 // Round to the next multiple of 32-bits.
793 unsigned Size = VT.getSizeInBits();
794 if (Size <= 32)
795 return MVT::i32;
796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
797}
798
800 return MVT::i32;
801}
802
804 return true;
805}
806
807// The backend supports 32 and 64 bit floating point immediates.
808// FIXME: Why are we reporting vectors of FP immediates as legal?
810 bool ForCodeSize) const {
811 EVT ScalarVT = VT.getScalarType();
812 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
813 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
814}
815
816// We don't want to shrink f64 / f32 constants.
818 EVT ScalarVT = VT.getScalarType();
819 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
820}
821
823 ISD::LoadExtType ExtTy,
824 EVT NewVT) const {
825 // TODO: This may be worth removing. Check regression tests for diffs.
827 return false;
828
829 unsigned NewSize = NewVT.getStoreSizeInBits();
830
831 // If we are reducing to a 32-bit load or a smaller multi-dword load,
832 // this is always better.
833 if (NewSize >= 32)
834 return true;
835
836 EVT OldVT = N->getValueType(0);
837 unsigned OldSize = OldVT.getStoreSizeInBits();
838
839 MemSDNode *MN = cast<MemSDNode>(N);
840 unsigned AS = MN->getAddressSpace();
841 // Do not shrink an aligned scalar load to sub-dword.
842 // Scalar engine cannot do sub-dword loads.
843 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
844 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
848 MN->isInvariant())) &&
850 return false;
851
852 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
853 // extloads, so doing one requires using a buffer_load. In cases where we
854 // still couldn't use a scalar load, using the wider load shouldn't really
855 // hurt anything.
856
857 // If the old size already had to be an extload, there's no harm in continuing
858 // to reduce the width.
859 return (OldSize < 32);
860}
861
863 const SelectionDAG &DAG,
864 const MachineMemOperand &MMO) const {
865
866 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
867
868 if (LoadTy.getScalarType() == MVT::i32)
869 return false;
870
871 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
872 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
873
874 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
875 return false;
876
877 unsigned Fast = 0;
879 CastTy, MMO, &Fast) &&
880 Fast;
881}
882
883// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
884// profitable with the expansion for 64-bit since it's generally good to
885// speculate things.
887 return true;
888}
889
891 return true;
892}
893
895 switch (N->getOpcode()) {
896 case ISD::EntryToken:
897 case ISD::TokenFactor:
898 return true;
900 unsigned IntrID = N->getConstantOperandVal(0);
902 }
904 unsigned IntrID = N->getConstantOperandVal(1);
906 }
907 case ISD::LOAD:
908 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
910 return true;
911 return false;
912 case AMDGPUISD::SETCC: // ballot-style instruction
913 return true;
914 }
915 return false;
916}
917
919 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
920 NegatibleCost &Cost, unsigned Depth) const {
921
922 switch (Op.getOpcode()) {
923 case ISD::FMA:
924 case ISD::FMAD: {
925 // Negating a fma is not free if it has users without source mods.
926 if (!allUsesHaveSourceMods(Op.getNode()))
927 return SDValue();
928 break;
929 }
930 case AMDGPUISD::RCP: {
931 SDValue Src = Op.getOperand(0);
932 EVT VT = Op.getValueType();
933 SDLoc SL(Op);
934
935 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
936 ForCodeSize, Cost, Depth + 1);
937 if (NegSrc)
938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
939 return SDValue();
940 }
941 default:
942 break;
943 }
944
945 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
946 ForCodeSize, Cost, Depth);
947}
948
949//===---------------------------------------------------------------------===//
950// Target Properties
951//===---------------------------------------------------------------------===//
952
955
956 // Packed operations do not have a fabs modifier.
957 return VT == MVT::f32 || VT == MVT::f64 ||
958 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
959}
960
963 // Report this based on the end legalized type.
964 VT = VT.getScalarType();
965 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
966}
967
969 unsigned NumElem,
970 unsigned AS) const {
971 return true;
972}
973
975 // There are few operations which truly have vector input operands. Any vector
976 // operation is going to involve operations on each component, and a
977 // build_vector will be a copy per element, so it always makes sense to use a
978 // build_vector input in place of the extracted element to avoid a copy into a
979 // super register.
980 //
981 // We should probably only do this if all users are extracts only, but this
982 // should be the common case.
983 return true;
984}
985
987 // Truncate is just accessing a subregister.
988
989 unsigned SrcSize = Source.getSizeInBits();
990 unsigned DestSize = Dest.getSizeInBits();
991
992 return DestSize < SrcSize && DestSize % 32 == 0;
993}
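// Added example: an i64 -> i32 truncate simply reads the low 32-bit half of
// the 64-bit register pair, so it is reported as free here; the same holds
// for vector truncates whose destination elements are a multiple of 32 bits.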
994
996 // Truncate is just accessing a subregister.
997
998 unsigned SrcSize = Source->getScalarSizeInBits();
999 unsigned DestSize = Dest->getScalarSizeInBits();
1000
1001 if (DestSize == 16 && Subtarget->has16BitInsts())
1002 return SrcSize >= 32;
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 unsigned SrcSize = Src->getScalarSizeInBits();
1009 unsigned DestSize = Dest->getScalarSizeInBits();
1010
1011 if (SrcSize == 16 && Subtarget->has16BitInsts())
1012 return DestSize >= 32;
1013
1014 return SrcSize == 32 && DestSize == 64;
1015}
1016
1018 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1019 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1020 // this will enable reducing 64-bit operations to 32-bit, which is always
1021 // good.
1022
1023 if (Src == MVT::i16)
1024 return Dest == MVT::i32 || Dest == MVT::i64;
1025
1026 return Src == MVT::i32 && Dest == MVT::i64;
1027}
1028
1030 EVT DestVT) const {
1031 switch (N->getOpcode()) {
1032 case ISD::ADD:
1033 case ISD::SUB:
1034 case ISD::SHL:
1035 case ISD::SRL:
1036 case ISD::SRA:
1037 case ISD::AND:
1038 case ISD::OR:
1039 case ISD::XOR:
1040 case ISD::MUL:
1041 case ISD::SETCC:
1042 case ISD::SELECT:
1043 if (Subtarget->has16BitInsts() &&
1044 (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
1045 // Don't narrow back down to i16 if promoted to i32 already.
1046 if (!N->isDivergent() && DestVT.isInteger() &&
1047 DestVT.getScalarSizeInBits() > 1 &&
1048 DestVT.getScalarSizeInBits() <= 16 &&
1049 SrcVT.getScalarSizeInBits() > 16) {
1050 return false;
1051 }
1052 }
1053 return true;
1054 default:
1055 break;
1056 }
1057
1058 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1059 // limited number of native 64-bit operations. Shrinking an operation to fit
1060 // in a single 32-bit register should always be helpful. As currently used,
1061 // this is much less general than the name suggests, and is only used in
1062 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1063 // not profitable, and may actually be harmful.
1064 if (isa<LoadSDNode>(N))
1065 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1066
1067 return true;
1068}
1069
1071 const SDNode* N, CombineLevel Level) const {
1072 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1073 N->getOpcode() == ISD::SRL) &&
1074 "Expected shift op");
1075
1076 SDValue ShiftLHS = N->getOperand(0);
1077 if (!ShiftLHS->hasOneUse())
1078 return false;
1079
1080 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1081 !ShiftLHS.getOperand(0)->hasOneUse())
1082 return false;
1083
1084 // Always commute pre-type legalization and right shifts.
1085 // We're looking for shl(or(x,y),z) patterns.
1087 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1088 return true;
1089
1090 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1091 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1092 (N->user_begin()->getOpcode() == ISD::SRA ||
1093 N->user_begin()->getOpcode() == ISD::SRL))
1094 return false;
1095
1096 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1097 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1098 if (LHS.getOpcode() != ISD::SHL)
1099 return false;
1100 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1101 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1102 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1103 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1104 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1105 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1106 };
1107 SDValue LHS = N->getOperand(0).getOperand(0);
1108 SDValue RHS = N->getOperand(0).getOperand(1);
1109 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1110}
1111
1112//===---------------------------------------------------------------------===//
1113// TargetLowering Callbacks
1114//===---------------------------------------------------------------------===//
1115
1117 bool IsVarArg) {
1118 switch (CC) {
1126 return CC_AMDGPU;
1129 return CC_AMDGPU_CS_CHAIN;
1130 case CallingConv::C:
1131 case CallingConv::Fast:
1132 case CallingConv::Cold:
1133 return CC_AMDGPU_Func;
1135 return CC_SI_Gfx;
1138 default:
1139 report_fatal_error("Unsupported calling convention for call");
1140 }
1141}
1142
1144 bool IsVarArg) {
1145 switch (CC) {
1148 llvm_unreachable("kernels should not be handled here");
1158 return RetCC_SI_Shader;
1160 return RetCC_SI_Gfx;
1161 case CallingConv::C:
1162 case CallingConv::Fast:
1163 case CallingConv::Cold:
1164 return RetCC_AMDGPU_Func;
1165 default:
1166 report_fatal_error("Unsupported calling convention.");
1167 }
1168}
1169
1170/// The SelectionDAGBuilder will automatically promote function arguments
1171/// with illegal types. However, this does not work for the AMDGPU targets
1172/// since the function arguments are stored in memory as these illegal types.
1173/// In order to handle this properly we need to get the original type sizes
1174/// from the LLVM IR Function and fix up the ISD::InputArg values before
1175/// passing them to AnalyzeFormalArguments().
1176
1177/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1178/// input values across multiple registers. Each item in the Ins array
1179/// represents a single value that will be stored in registers. Ins[x].VT is
1180/// the value type of the value that will be stored in the register, so
1181/// whatever SDNode we lower the argument to needs to be this type.
1182///
1183/// In order to correctly lower the arguments we need to know the size of each
1184/// argument. Since Ins[x].VT gives us the size of the register that will
1185/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1186/// for the original function argument so that we can deduce the correct memory
1187/// type to use for Ins[x]. In most cases the correct memory type will be
1188/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1189/// we have a kernel argument of type v8i8, this argument will be split into
1190/// 8 parts and each part will be represented by its own item in the Ins array.
1191/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1192/// the argument before it was split. From this, we deduce that the memory type
1193/// for each individual part is i8. We pass the memory type as LocVT to the
1194/// calling convention analysis function and the register type (Ins[x].VT) as
1195/// the ValVT.
1197 CCState &State,
1198 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1199 const MachineFunction &MF = State.getMachineFunction();
1200 const Function &Fn = MF.getFunction();
1201 LLVMContext &Ctx = Fn.getParent()->getContext();
1202 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1203 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1205
1206 Align MaxAlign = Align(1);
1207 uint64_t ExplicitArgOffset = 0;
1208 const DataLayout &DL = Fn.getDataLayout();
1209
1210 unsigned InIndex = 0;
1211
1212 for (const Argument &Arg : Fn.args()) {
1213 const bool IsByRef = Arg.hasByRefAttr();
1214 Type *BaseArgTy = Arg.getType();
1215 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1216 Align Alignment = DL.getValueOrABITypeAlignment(
1217 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1218 MaxAlign = std::max(Alignment, MaxAlign);
1219 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1220
1221 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1222 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1223
1224 // We're basically throwing away everything passed into us and starting over
1225 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1226 // to us as computed in Ins.
1227 //
1228 // We also need to figure out what type legalization is trying to do to get
1229 // the correct memory offsets.
1230
1231 SmallVector<EVT, 16> ValueVTs;
1233 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1234
1235 for (unsigned Value = 0, NumValues = ValueVTs.size();
1236 Value != NumValues; ++Value) {
1237 uint64_t BasePartOffset = Offsets[Value];
1238
1239 EVT ArgVT = ValueVTs[Value];
1240 EVT MemVT = ArgVT;
1241 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1242 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1243
1244 if (NumRegs == 1) {
1245 // This argument is not split, so the IR type is the memory type.
1246 if (ArgVT.isExtended()) {
1247 // We have an extended type, like i24, so we should just use the
1248 // register type.
1249 MemVT = RegisterVT;
1250 } else {
1251 MemVT = ArgVT;
1252 }
1253 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1254 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1255 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1256 // We have a vector value which has been split into a vector with
1257 // the same scalar type, but fewer elements. This should handle
1258 // all the floating-point vector types.
1259 MemVT = RegisterVT;
1260 } else if (ArgVT.isVector() &&
1261 ArgVT.getVectorNumElements() == NumRegs) {
1262 // This arg has been split so that each element is stored in a separate
1263 // register.
1264 MemVT = ArgVT.getScalarType();
1265 } else if (ArgVT.isExtended()) {
1266 // We have an extended type, like i65.
1267 MemVT = RegisterVT;
1268 } else {
1269 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1270 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1271 if (RegisterVT.isInteger()) {
1272 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1273 } else if (RegisterVT.isVector()) {
1274 assert(!RegisterVT.getScalarType().isFloatingPoint());
1275 unsigned NumElements = RegisterVT.getVectorNumElements();
1276 assert(MemoryBits % NumElements == 0);
1277 // This vector type has been split into another vector type with
1278 // a different element size.
1279 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1280 MemoryBits / NumElements);
1281 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1282 } else {
1283 llvm_unreachable("cannot deduce memory type.");
1284 }
1285 }
1286
1287 // Convert one element vectors to scalar.
1288 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1289 MemVT = MemVT.getScalarType();
1290
1291 // Round up vec3/vec5 argument.
1292 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1293 MemVT = MemVT.getPow2VectorType(State.getContext());
1294 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1295 MemVT = MemVT.getRoundIntegerType(State.getContext());
1296 }
1297
1298 unsigned PartOffset = 0;
1299 for (unsigned i = 0; i != NumRegs; ++i) {
1300 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1301 BasePartOffset + PartOffset,
1302 MemVT.getSimpleVT(),
1304 PartOffset += MemVT.getStoreSize();
1305 }
1306 }
1307 }
1308}
1309
1311 SDValue Chain, CallingConv::ID CallConv,
1312 bool isVarArg,
1314 const SmallVectorImpl<SDValue> &OutVals,
1315 const SDLoc &DL, SelectionDAG &DAG) const {
1316 // FIXME: Fails for r600 tests
1317 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1318 // "wave terminate should not have return values");
1319 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1320}
1321
1322//===---------------------------------------------------------------------===//
1323// Target specific lowering
1324//===---------------------------------------------------------------------===//
1325
1326/// Selects the correct CCAssignFn for a given CallingConvention value.
1328 bool IsVarArg) {
1329 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1330}
1331
1333 bool IsVarArg) {
1335}
1336
1338 SelectionDAG &DAG,
1339 MachineFrameInfo &MFI,
1340 int ClobberedFI) const {
1341 SmallVector<SDValue, 8> ArgChains;
1342 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1343 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1344
1345 // Include the original chain at the beginning of the list. When this is
1346 // used by target LowerCall hooks, this helps legalize find the
1347 // CALLSEQ_BEGIN node.
1348 ArgChains.push_back(Chain);
1349
1350 // Add a chain value for each stack argument load that may overlap the
1351 // clobbered frame object.
1351 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1352 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1353 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1354 if (FI->getIndex() < 0) {
1355 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1356 int64_t InLastByte = InFirstByte;
1357 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1358
1359 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1360 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1361 ArgChains.push_back(SDValue(L, 1));
1362 }
1363 }
1364 }
1365 }
1366
1367 // Build a tokenfactor for all the chains.
1368 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1369}
1370
1373 StringRef Reason) const {
1374 SDValue Callee = CLI.Callee;
1375 SelectionDAG &DAG = CLI.DAG;
1376
1377 const Function &Fn = DAG.getMachineFunction().getFunction();
1378
1379 StringRef FuncName("<unknown>");
1380
1381 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1382 FuncName = G->getSymbol();
1383 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1384 FuncName = G->getGlobal()->getName();
1385
1387 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1388 DAG.getContext()->diagnose(NoCalls);
1389
1390 if (!CLI.IsTailCall) {
1391 for (ISD::InputArg &Arg : CLI.Ins)
1392 InVals.push_back(DAG.getUNDEF(Arg.VT));
1393 }
1394
1395 return DAG.getEntryNode();
1396}
1397
1399 SmallVectorImpl<SDValue> &InVals) const {
1400 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1401}
1402
1404 SelectionDAG &DAG) const {
1405 const Function &Fn = DAG.getMachineFunction().getFunction();
1406
1407 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1408 SDLoc(Op).getDebugLoc());
1409 DAG.getContext()->diagnose(NoDynamicAlloca);
1410 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1411 return DAG.getMergeValues(Ops, SDLoc());
1412}
1413
1415 SelectionDAG &DAG) const {
1416 switch (Op.getOpcode()) {
1417 default:
1418 Op->print(errs(), &DAG);
1419 llvm_unreachable("Custom lowering code for this "
1420 "instruction is not implemented yet!");
1421 break;
1423 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1425 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1426 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1427 case ISD::FREM: return LowerFREM(Op, DAG);
1428 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1429 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1430 case ISD::FRINT: return LowerFRINT(Op, DAG);
1431 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1432 case ISD::FROUNDEVEN:
1433 return LowerFROUNDEVEN(Op, DAG);
1434 case ISD::FROUND: return LowerFROUND(Op, DAG);
1435 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1436 case ISD::FLOG2:
1437 return LowerFLOG2(Op, DAG);
1438 case ISD::FLOG:
1439 case ISD::FLOG10:
1440 return LowerFLOGCommon(Op, DAG);
1441 case ISD::FEXP:
1442 case ISD::FEXP10:
1443 return lowerFEXP(Op, DAG);
1444 case ISD::FEXP2:
1445 return lowerFEXP2(Op, DAG);
1446 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1447 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1448 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1449 case ISD::FP_TO_SINT:
1450 case ISD::FP_TO_UINT:
1451 return LowerFP_TO_INT(Op, DAG);
1452 case ISD::CTTZ:
1454 case ISD::CTLZ:
1456 return LowerCTLZ_CTTZ(Op, DAG);
1458 }
1459 return Op;
1460}
1461
1464 SelectionDAG &DAG) const {
1465 switch (N->getOpcode()) {
1467 // Different parts of legalization seem to interpret which type of
1468 // sign_extend_inreg is the one to check for custom lowering. The extended
1469 // from type is what really matters, but some places check for custom
1470 // lowering of the result type. This results in trying to use
1471 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1472 // nothing here and let the illegal result integer be handled normally.
1473 return;
1474 case ISD::FLOG2:
1475 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1476 Results.push_back(Lowered);
1477 return;
1478 case ISD::FLOG:
1479 case ISD::FLOG10:
1480 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1481 Results.push_back(Lowered);
1482 return;
1483 case ISD::FEXP2:
1484 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1485 Results.push_back(Lowered);
1486 return;
1487 case ISD::FEXP:
1488 case ISD::FEXP10:
1489 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1490 Results.push_back(Lowered);
1491 return;
1492 case ISD::CTLZ:
1494 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1495 Results.push_back(Lowered);
1496 return;
1497 default:
1498 return;
1499 }
1500}
1501
1503 SDValue Op,
1504 SelectionDAG &DAG) const {
1505
1506 const DataLayout &DL = DAG.getDataLayout();
1507 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1508 const GlobalValue *GV = G->getGlobal();
1509
1510 if (!MFI->isModuleEntryFunction()) {
1511 if (std::optional<uint32_t> Address =
1513 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1514 }
1515 }
1516
1517 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1518 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1519 if (!MFI->isModuleEntryFunction() &&
1520 GV->getName() != "llvm.amdgcn.module.lds" &&
1521 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
1522 SDLoc DL(Op);
1523 const Function &Fn = DAG.getMachineFunction().getFunction();
1524 DiagnosticInfoUnsupported BadLDSDecl(
1525 Fn, "local memory global used by non-kernel function",
1526 DL.getDebugLoc(), DS_Warning);
1527 DAG.getContext()->diagnose(BadLDSDecl);
1528
1529 // We currently don't have a way to correctly allocate LDS objects that
1530 // aren't directly associated with a kernel. We do force inlining of
1531 // functions that use local objects. However, if these dead functions are
1532 // not eliminated, we don't want a compile time error. Just emit a warning
1533 // and a trap, since there should be no callable path here.
1534 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1535 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1536 Trap, DAG.getRoot());
1537 DAG.setRoot(OutputChain);
1538 return DAG.getUNDEF(Op.getValueType());
1539 }
1540
1541 // XXX: What does the value of G->getOffset() mean?
1542 assert(G->getOffset() == 0 &&
1543 "Do not know what to do with an non-zero offset");
1544
1545 // TODO: We could emit code to handle the initialization somewhere.
1546 // We ignore the initializer for now and legalize it to allow selection.
1547 // The initializer will be diagnosed as an error during assembly emission anyway.
1548 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1549 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1550 }
1551 return SDValue();
1552}
1553
1555 SelectionDAG &DAG) const {
1557 SDLoc SL(Op);
1558
1559 EVT VT = Op.getValueType();
1560 if (VT.getVectorElementType().getSizeInBits() < 32) {
1561 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1562 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1563 unsigned NewNumElt = OpBitSize / 32;
1564 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1566 MVT::i32, NewNumElt);
1567 for (const SDUse &U : Op->ops()) {
1568 SDValue In = U.get();
1569 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1570 if (NewNumElt > 1)
1571 DAG.ExtractVectorElements(NewIn, Args);
1572 else
1573 Args.push_back(NewIn);
1574 }
1575
1576 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1577 NewNumElt * Op.getNumOperands());
1578 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1579 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1580 }
1581 }
1582
1583 for (const SDUse &U : Op->ops())
1584 DAG.ExtractVectorElements(U.get(), Args);
1585
1586 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1587}
1588
1590 SelectionDAG &DAG) const {
1591 SDLoc SL(Op);
1593 unsigned Start = Op.getConstantOperandVal(1);
1594 EVT VT = Op.getValueType();
1595 EVT SrcVT = Op.getOperand(0).getValueType();
1596
1597 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1598 unsigned NumElt = VT.getVectorNumElements();
1599 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1600 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1601
1602 // Extract 32-bit registers at a time.
1603 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1604 EVT NewVT = NumElt == 2
1605 ? MVT::i32
1606 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1607 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1608
1609 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1610 if (NumElt == 2)
1611 Tmp = Args[0];
1612 else
1613 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1614
1615 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1616 }
1617
1618 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1620
1621 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1622}
1623
1624// TODO: Handle fabs too
1626 if (Val.getOpcode() == ISD::FNEG)
1627 return Val.getOperand(0);
1628
1629 return Val;
1630}
1631
1633 if (Val.getOpcode() == ISD::FNEG)
1634 Val = Val.getOperand(0);
1635 if (Val.getOpcode() == ISD::FABS)
1636 Val = Val.getOperand(0);
1637 if (Val.getOpcode() == ISD::FCOPYSIGN)
1638 Val = Val.getOperand(0);
1639 return Val;
1640}
1641
1643 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1644 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1645 SelectionDAG &DAG = DCI.DAG;
1646 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1647 switch (CCOpcode) {
1648 case ISD::SETOEQ:
1649 case ISD::SETONE:
1650 case ISD::SETUNE:
1651 case ISD::SETNE:
1652 case ISD::SETUEQ:
1653 case ISD::SETEQ:
1654 case ISD::SETFALSE:
1655 case ISD::SETFALSE2:
1656 case ISD::SETTRUE:
1657 case ISD::SETTRUE2:
1658 case ISD::SETUO:
1659 case ISD::SETO:
1660 break;
1661 case ISD::SETULE:
1662 case ISD::SETULT: {
1663 if (LHS == True)
1664 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1665 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1666 }
1667 case ISD::SETOLE:
1668 case ISD::SETOLT:
1669 case ISD::SETLE:
1670 case ISD::SETLT: {
1671 // Ordered. Assume ordered for undefined.
1672
1673 // Only do this after legalization to avoid interfering with other combines
1674 // which might occur.
1676 !DCI.isCalledByLegalizer())
1677 return SDValue();
1678
1679 // We need to permute the operands to get the correct NaN behavior. The
1680 // selected operand is the second one based on the failing compare with NaN,
1681 // so permute it based on the compare type the hardware uses.
1682 if (LHS == True)
1683 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1684 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1685 }
1686 case ISD::SETUGE:
1687 case ISD::SETUGT: {
1688 if (LHS == True)
1689 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1690 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1691 }
1692 case ISD::SETGT:
1693 case ISD::SETGE:
1694 case ISD::SETOGE:
1695 case ISD::SETOGT: {
1697 !DCI.isCalledByLegalizer())
1698 return SDValue();
1699
1700 if (LHS == True)
1701 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1702 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1703 }
1704 case ISD::SETCC_INVALID:
1705 llvm_unreachable("Invalid setcc condcode!");
1706 }
1707 return SDValue();
1708}
1709
1710/// Generate Min/Max node
1712 SDValue LHS, SDValue RHS,
1713 SDValue True, SDValue False,
1714 SDValue CC,
1715 DAGCombinerInfo &DCI) const {
1716 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1717 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1718
1719 SelectionDAG &DAG = DCI.DAG;
1720
1721 // If we can't directly match this, try to see if we can fold an fneg to
1722 // match.
1723
1724 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1725 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1726 SDValue NegTrue = peekFNeg(True);
1727
1728 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1729 // fmin/fmax.
1730 //
1731 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1732 // -> fneg (fmin_legacy lhs, K)
1733 //
1734 // TODO: Use getNegatedExpression
1735 if (LHS == NegTrue && CFalse && CRHS) {
1736 APFloat NegRHS = neg(CRHS->getValueAPF());
1737 if (NegRHS == CFalse->getValueAPF()) {
1738 SDValue Combined =
1739 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1740 if (Combined)
1741 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1742 return SDValue();
1743 }
1744 }
1745
1746 return SDValue();
1747}
1748
1749std::pair<SDValue, SDValue>
1751 SDLoc SL(Op);
1752
1753 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1754
1755 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1756 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1757
1758 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1759 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1760
1761 return std::pair(Lo, Hi);
1762}
1763
1765 SDLoc SL(Op);
1766
1767 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1768 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1770}
1771
1773 SDLoc SL(Op);
1774
1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1776 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1777 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1778}
1779
1780// Split a vector type into two parts. The first part is a power of two vector.
1781// The second part is whatever is left over, and is a scalar if it would
1782// otherwise be a 1-vector.
1783std::pair<EVT, EVT>
1785 EVT LoVT, HiVT;
1786 EVT EltVT = VT.getVectorElementType();
1787 unsigned NumElts = VT.getVectorNumElements();
1788 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1789 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1790 HiVT = NumElts - LoNumElts == 1
1791 ? EltVT
1792 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1793 return std::pair(LoVT, HiVT);
1794}
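// Added examples of the split performed above: v8i32 -> (v4i32, v4i32),
// v7i32 -> (v4i32, v3i32), and v3f32 -> (v2f32, f32); the single leftover
// element in the last case is returned as a scalar rather than a 1-element
// vector.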
1795
1796// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1797// scalar.
1798std::pair<SDValue, SDValue>
1800 const EVT &LoVT, const EVT &HiVT,
1801 SelectionDAG &DAG) const {
1803 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1804 N.getValueType().getVectorNumElements() &&
1805 "More vector elements requested than available!");
1807 DAG.getVectorIdxConstant(0, DL));
1808 SDValue Hi = DAG.getNode(
1810 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1811 return std::pair(Lo, Hi);
1812}
1813
1815 SelectionDAG &DAG) const {
1816 LoadSDNode *Load = cast<LoadSDNode>(Op);
1817 EVT VT = Op.getValueType();
1818 SDLoc SL(Op);
1819
1820
1821 // If this is a 2 element vector, we really want to scalarize and not create
1822 // weird 1 element vectors.
1823 if (VT.getVectorNumElements() == 2) {
1824 SDValue Ops[2];
1825 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1826 return DAG.getMergeValues(Ops, SL);
1827 }
1828
1829 SDValue BasePtr = Load->getBasePtr();
1830 EVT MemVT = Load->getMemoryVT();
1831
1832 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1833
1834 EVT LoVT, HiVT;
1835 EVT LoMemVT, HiMemVT;
1836 SDValue Lo, Hi;
1837
1838 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1839 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1840 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1841
1842 unsigned Size = LoMemVT.getStoreSize();
1843 Align BaseAlign = Load->getAlign();
1844 Align HiAlign = commonAlignment(BaseAlign, Size);
1845
1846 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1847 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1848 BaseAlign, Load->getMemOperand()->getFlags());
1849 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1850 SDValue HiLoad =
1851 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1852 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1853 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1854
1855 SDValue Join;
1856 if (LoVT == HiVT) {
1857 // This is the case that the vector is power of two so was evenly split.
1858 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1859 } else {
1860 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1861 DAG.getVectorIdxConstant(0, SL));
1862 Join = DAG.getNode(
1864 VT, Join, HiLoad,
1866 }
1867
1868 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1869 LoLoad.getValue(1), HiLoad.getValue(1))};
1870
1871 return DAG.getMergeValues(Ops, SL);
1872}
1873
1875 SelectionDAG &DAG) const {
1876 LoadSDNode *Load = cast<LoadSDNode>(Op);
1877 EVT VT = Op.getValueType();
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880 SDLoc SL(Op);
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882 Align BaseAlign = Load->getAlign();
1883 unsigned NumElements = MemVT.getVectorNumElements();
1884
1885 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1886 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1887 if (NumElements != 3 ||
1888 (BaseAlign < Align(8) &&
1889 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1890 return SplitVectorLoad(Op, DAG);
1891
1892 assert(NumElements == 3);
1893
1894 EVT WideVT =
1895 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1896 EVT WideMemVT =
1897 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1898 SDValue WideLoad = DAG.getExtLoad(
1899 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1900 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1901 return DAG.getMergeValues(
1902 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1903 DAG.getVectorIdxConstant(0, SL)),
1904 WideLoad.getValue(1)},
1905 SL);
1906}
1907
1908 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1909 SelectionDAG &DAG) const {
1910 StoreSDNode *Store = cast<StoreSDNode>(Op);
1911 SDValue Val = Store->getValue();
1912 EVT VT = Val.getValueType();
1913
1914 // If this is a 2 element vector, we really want to scalarize and not create
1915 // weird 1 element vectors.
1916 if (VT.getVectorNumElements() == 2)
1917 return scalarizeVectorStore(Store, DAG);
1918
1919 EVT MemVT = Store->getMemoryVT();
1920 SDValue Chain = Store->getChain();
1921 SDValue BasePtr = Store->getBasePtr();
1922 SDLoc SL(Op);
1923
1924 EVT LoVT, HiVT;
1925 EVT LoMemVT, HiMemVT;
1926 SDValue Lo, Hi;
1927
1928 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1929 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1930 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1931
1932 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1933
1934 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1935 Align BaseAlign = Store->getAlign();
1936 unsigned Size = LoMemVT.getStoreSize();
1937 Align HiAlign = commonAlignment(BaseAlign, Size);
1938
1939 SDValue LoStore =
1940 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1941 Store->getMemOperand()->getFlags());
1942 SDValue HiStore =
1943 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1944 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1945
1946 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1947}
1948
1949// This is a shortcut for integer division because we have fast i32<->f32
1950// conversions, and fast f32 reciprocal instructions. The fractional part of a
1951// float is enough to accurately represent up to a 24-bit signed integer.
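// For example, 100 udiv 7: fq = 100.0 * rcp(7.0) ~= 14.28, so iq = 14 after
// truncation and the remainder is recomputed exactly as 100 - 14 * 7 = 2. The
// jq correction below only adds 1 when |fa - fq * fb| >= |fb|, i.e. when the
// truncated quotient was one too small.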
1952 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1953 bool Sign) const {
1954 SDLoc DL(Op);
1955 EVT VT = Op.getValueType();
1956 SDValue LHS = Op.getOperand(0);
1957 SDValue RHS = Op.getOperand(1);
1958 MVT IntVT = MVT::i32;
1959 MVT FltVT = MVT::f32;
1960
1961 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1962 if (LHSSignBits < 9)
1963 return SDValue();
1964
1965 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1966 if (RHSSignBits < 9)
1967 return SDValue();
1968
1969 unsigned BitSize = VT.getSizeInBits();
1970 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1971 unsigned DivBits = BitSize - SignBits;
1972 if (Sign)
1973 ++DivBits;
1974
1975 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1976 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1977
1978 SDValue jq = DAG.getConstant(1, DL, IntVT);
1979
1980 if (Sign) {
1981 // char|short jq = ia ^ ib;
1982 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1983
1984 // jq = jq >> (bitsize - 2)
1985 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1986 DAG.getConstant(BitSize - 2, DL, VT));
1987
1988 // jq = jq | 0x1
1989 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1990 }
1991
1992 // int ia = (int)LHS;
1993 SDValue ia = LHS;
1994
1995 // int ib = (int)RHS;
1996 SDValue ib = RHS;
1997
1998 // float fa = (float)ia;
1999 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2000
2001 // float fb = (float)ib;
2002 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2003
2004 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2005 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2006
2007 // fq = trunc(fq);
2008 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2009
2010 // float fqneg = -fq;
2011 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2012
2014
2015 bool UseFmadFtz = false;
2016 if (Subtarget->isGCN()) {
2018 UseFmadFtz =
2020 }
2021
2022 // float fr = mad(fqneg, fb, fa);
2023 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2024 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2025 : (unsigned)ISD::FMAD;
2026 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2027
2028 // int iq = (int)fq;
2029 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2030
2031 // fr = fabs(fr);
2032 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2033
2034 // fb = fabs(fb);
2035 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2036
2037 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2038
2039 // int cv = fr >= fb;
2040 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2041
2042 // jq = (cv ? jq : 0);
2043 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2044
2045 // dst = iq + jq;
2046 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2047
2048 // Rem needs compensation, it's easier to recompute it
2049 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2050 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2051
2052 // Truncate to number of bits this divide really is.
2053 if (Sign) {
2054 SDValue InRegSize
2055 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2056 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2057 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2058 } else {
2059 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2060 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2061 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2062 }
2063
2064 return DAG.getMergeValues({ Div, Rem }, DL);
2065}
2066
2067 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2068 SelectionDAG &DAG,
2069 SmallVectorImpl<SDValue> &Results) const {
2070 SDLoc DL(Op);
2071 EVT VT = Op.getValueType();
2072
2073 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2074
2075 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2076
2077 SDValue One = DAG.getConstant(1, DL, HalfVT);
2078 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2079
2080 //HiLo split
2081 SDValue LHS_Lo, LHS_Hi;
2082 SDValue LHS = Op.getOperand(0);
2083 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2084
2085 SDValue RHS_Lo, RHS_Hi;
2086 SDValue RHS = Op.getOperand(1);
2087 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2088
2089 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2090 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2091
2092 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2093 LHS_Lo, RHS_Lo);
2094
2095 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2096 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2097
2098 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2099 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2100 return;
2101 }
2102
2103 if (isTypeLegal(MVT::i64)) {
2104 // The algorithm here is based on ideas from "Software Integer Division",
2105 // Tom Rodeheffer, August 2008.
2106
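// In the GCN path below, the reciprocal estimate Rcp64 is seeded by
// converting the denominator to f32 (Cvt_Hi * 2^32 + Cvt_Lo), taking its
// hardware reciprocal, and scaling that into 64-bit fixed point; two
// Newton-Raphson ("UNR") rounds refine it before the quotient and remainder
// are corrected by the selects at the end.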
2109
2110 // Compute denominator reciprocal.
2111 unsigned FMAD =
2112 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2115 : (unsigned)AMDGPUISD::FMAD_FTZ;
2116
2117 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2118 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2119 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2120 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2121 Cvt_Lo);
2122 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2123 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2124 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2125 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2126 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2127 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2128 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2129 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2130 Mul1);
2131 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2132 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2133 SDValue Rcp64 = DAG.getBitcast(VT,
2134 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2135
2136 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2137 SDValue One64 = DAG.getConstant(1, DL, VT);
2138 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2139 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2140
2141 // First round of UNR (Unsigned integer Newton-Raphson).
2142 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2143 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2144 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2145 SDValue Mulhi1_Lo, Mulhi1_Hi;
2146 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2147 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2148 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2149 Mulhi1_Lo, Zero1);
2150 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2151 Mulhi1_Hi, Add1_Lo.getValue(1));
2152 SDValue Add1 = DAG.getBitcast(VT,
2153 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2154
2155 // Second round of UNR.
2156 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2157 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2158 SDValue Mulhi2_Lo, Mulhi2_Hi;
2159 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2160 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2161 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2162 Mulhi2_Lo, Zero1);
2163 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2164 Mulhi2_Hi, Add2_Lo.getValue(1));
2165 SDValue Add2 = DAG.getBitcast(VT,
2166 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2167
2168 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2169
2170 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2171
2172 SDValue Mul3_Lo, Mul3_Hi;
2173 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2174 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2175 Mul3_Lo, Zero1);
2176 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2177 Mul3_Hi, Sub1_Lo.getValue(1));
2178 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2179 SDValue Sub1 = DAG.getBitcast(VT,
2180 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2181
2182 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2183 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2184 ISD::SETUGE);
2185 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2186 ISD::SETUGE);
2187 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2188
2189 // TODO: Here and below portions of the code can be enclosed into if/endif.
2190 // Currently control flow is unconditional and we have 4 selects after
2191 // potential endif to substitute PHIs.
2192
2193 // if C3 != 0 ...
2194 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2195 RHS_Lo, Zero1);
2196 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2197 RHS_Hi, Sub1_Lo.getValue(1));
2198 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2199 Zero, Sub2_Lo.getValue(1));
2200 SDValue Sub2 = DAG.getBitcast(VT,
2201 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2202
2203 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2204
2205 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2206 ISD::SETUGE);
2207 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2208 ISD::SETUGE);
2209 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2210
2211 // if (C6 != 0)
2212 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2213
2214 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2215 RHS_Lo, Zero1);
2216 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2217 RHS_Hi, Sub2_Lo.getValue(1));
2218 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2219 Zero, Sub3_Lo.getValue(1));
2220 SDValue Sub3 = DAG.getBitcast(VT,
2221 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2222
2223 // endif C6
2224 // endif C3
2225
2226 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2227 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2228
2229 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2230 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2231
2232 Results.push_back(Div);
2233 Results.push_back(Rem);
2234
2235 return;
2236 }
2237
2238 // r600 expansion.
2239 // Get Speculative values
2240 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2241 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2242
2243 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2244 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2245 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2246
2247 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2248 SDValue DIV_Lo = Zero;
2249
2250 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2251
2252 for (unsigned i = 0; i < halfBitWidth; ++i) {
2253 const unsigned bitPos = halfBitWidth - i - 1;
2254 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2255 // Get value of high bit
2256 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2257 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2258 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2259
2260 // Shift
2261 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2262 // Add LHS high bit
2263 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2264
2265 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2266 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2267
2268 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2269
2270 // Update REM
2271 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2272 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2273 }
2274
2275 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2276 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2277 Results.push_back(DIV);
2278 Results.push_back(REM);
2279}
2280
2281 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2282 SelectionDAG &DAG) const {
2283 SDLoc DL(Op);
2284 EVT VT = Op.getValueType();
2285
2286 if (VT == MVT::i64) {
2287 SmallVector<SDValue, 2> Results;
2288 LowerUDIVREM64(Op, DAG, Results);
2289 return DAG.getMergeValues(Results, DL);
2290 }
2291
2292 if (VT == MVT::i32) {
2293 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2294 return Res;
2295 }
2296
2297 SDValue X = Op.getOperand(0);
2298 SDValue Y = Op.getOperand(1);
2299
2300 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2301 // algorithm used here.
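// In short: one Newton-Raphson step refines the URECIP estimate of 1/y, a
// quotient/remainder estimate is formed from it, and the two conditional
// add/sub refinements below bring both into range.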
2302
2303 // Initial estimate of inv(y).
2304 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2305
2306 // One round of UNR.
2307 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2308 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2309 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2310 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2311
2312 // Quotient/remainder estimate.
2313 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2314 SDValue R =
2315 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2316
2317 // First quotient/remainder refinement.
2318 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2319 SDValue One = DAG.getConstant(1, DL, VT);
2320 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2321 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2322 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2323 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2324 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2325
2326 // Second quotient/remainder refinement.
2327 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2328 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2329 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2330 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2331 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2332
2333 return DAG.getMergeValues({Q, R}, DL);
2334}
2335
2336 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2337 SelectionDAG &DAG) const {
2338 SDLoc DL(Op);
2339 EVT VT = Op.getValueType();
2340
2341 SDValue LHS = Op.getOperand(0);
2342 SDValue RHS = Op.getOperand(1);
2343
2344 SDValue Zero = DAG.getConstant(0, DL, VT);
2345 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2346
2347 if (VT == MVT::i32) {
2348 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2349 return Res;
2350 }
2351
2352 if (VT == MVT::i64 &&
2353 DAG.ComputeNumSignBits(LHS) > 32 &&
2354 DAG.ComputeNumSignBits(RHS) > 32) {
2355 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2356
2357 //HiLo split
2358 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2359 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2360 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2361 LHS_Lo, RHS_Lo);
2362 SDValue Res[2] = {
2363 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2364 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2365 };
2366 return DAG.getMergeValues(Res, DL);
2367 }
2368
2369 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2370 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2371 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2372 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2373
2374 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2375 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2376
2377 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2378 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2379
2380 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2381 SDValue Rem = Div.getValue(1);
2382
2383 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2384 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2385
2386 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2387 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2388
2389 SDValue Res[2] = {
2390 Div,
2391 Rem
2392 };
2393 return DAG.getMergeValues(Res, DL);
2394}
2395
2396// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2397 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2398 SDLoc SL(Op);
2399 EVT VT = Op.getValueType();
2400 auto Flags = Op->getFlags();
2401 SDValue X = Op.getOperand(0);
2402 SDValue Y = Op.getOperand(1);
2403
2404 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2405 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2406 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2407 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2408 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2409}
2410
2411 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2412 SDLoc SL(Op);
2413 SDValue Src = Op.getOperand(0);
2414
2415 // result = trunc(src)
2416 // if (src > 0.0 && src != result)
2417 // result += 1.0
2418
2419 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2420
2421 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2422 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2423
2424 EVT SetCCVT =
2425 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2426
2427 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2428 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2429 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2430
2431 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2432 // TODO: Should this propagate fast-math-flags?
2433 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2434}
2435
2436 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2437 SelectionDAG &DAG) {
2438 const unsigned FractBits = 52;
2439 const unsigned ExpBits = 11;
2440
2441 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2442 Hi,
2443 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2444 DAG.getConstant(ExpBits, SL, MVT::i32));
2445 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2446 DAG.getConstant(1023, SL, MVT::i32));
2447
2448 return Exp;
2449}
2450
2451 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2452 SDLoc SL(Op);
2453 SDValue Src = Op.getOperand(0);
2454
2455 assert(Op.getValueType() == MVT::f64);
2456
2457 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2458
2459 // Extract the upper half, since this is where we will find the sign and
2460 // exponent.
2461 SDValue Hi = getHiHalf64(Src, DAG);
2462
2463 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2464
2465 const unsigned FractBits = 52;
2466
2467 // Extract the sign bit.
2468 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2469 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2470
2471 // Extend back to 64-bits.
2472 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2473 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2474
2475 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2476 const SDValue FractMask
2477 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2478
2479 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2480 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2481 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2482
2483 EVT SetCCVT =
2484 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2485
2486 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2487
2488 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2489 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2490
2491 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2492 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2493
2494 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2495}
2496
2497 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2498 SelectionDAG &DAG) const {
2499 SDLoc SL(Op);
2500 SDValue Src = Op.getOperand(0);
2501
2502 assert(Op.getValueType() == MVT::f64);
2503
2504 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2505 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2506 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2507
2508 // TODO: Should this propagate fast-math-flags?
2509
2510 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2511 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2512
2513 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2514
2515 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2516 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2517
2518 EVT SetCCVT =
2519 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2520 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2521
2522 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2523}
2524
2525 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2526 SelectionDAG &DAG) const {
2527 // FNEARBYINT and FRINT are the same, except in their handling of FP
2528 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2529 // rint, so just treat them as equivalent.
2530 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2531 Op.getOperand(0));
2532}
2533
2534 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2535 auto VT = Op.getValueType();
2536 auto Arg = Op.getOperand(0u);
2537 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2538}
2539
2540// XXX - May require not supporting f32 denormals?
2541
2542// Don't handle v2f16. The extra instructions to scalarize and repack around the
2543// compare and vselect end up producing worse code than scalarizing the whole
2544// operation.
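// The expansion below computes round-half-away-from-zero as
// trunc(x) + copysign(fabs(x - trunc(x)) >= 0.5 ? 1.0 : 0.0, x).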
2545 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2546 SDLoc SL(Op);
2547 SDValue X = Op.getOperand(0);
2548 EVT VT = Op.getValueType();
2549
2550 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2551
2552 // TODO: Should this propagate fast-math-flags?
2553
2554 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2555
2556 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2557
2558 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2559 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2560
2561 EVT SetCCVT =
2562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2563
2564 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2565 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2566 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2567
2568 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2569 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2570}
2571
2572 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2573 SDLoc SL(Op);
2574 SDValue Src = Op.getOperand(0);
2575
2576 // result = trunc(src);
2577 // if (src < 0.0 && src != result)
2578 // result += -1.0.
2579
2580 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2581
2582 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2583 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2584
2585 EVT SetCCVT =
2586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2587
2588 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2589 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2590 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2591
2592 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2593 // TODO: Should this propagate fast-math-flags?
2594 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2595}
2596
2597/// Return true if it's known that \p Src can never be an f32 denormal value.
2598 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2599 switch (Src.getOpcode()) {
2600 case ISD::FP_EXTEND:
2601 return Src.getOperand(0).getValueType() == MVT::f16;
2602 case ISD::FP16_TO_FP:
2603 case ISD::FFREXP:
2604 return true;
2605 case ISD::INTRINSIC_WO_CHAIN: {
2606 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2607 switch (IntrinsicID) {
2608 case Intrinsic::amdgcn_frexp_mant:
2609 return true;
2610 default:
2611 return false;
2612 }
2613 }
2614 default:
2615 return false;
2616 }
2617
2618 llvm_unreachable("covered opcode switch");
2619}
2620
2621 static bool allowApproxFunc(const SelectionDAG &DAG,
2622 SDNodeFlags Flags) {
2623 if (Flags.hasApproximateFuncs())
2624 return true;
2625 auto &Options = DAG.getTarget().Options;
2626 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2627}
2628
2629 static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2630 SDValue Src,
2631 SDNodeFlags Flags) {
2632 return !valueIsKnownNeverF32Denorm(Src) &&
2633 DAG.getMachineFunction()
2636}
2637
2638 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2639 SDValue Src,
2640 SDNodeFlags Flags) const {
2641 SDLoc SL(Src);
2642 EVT VT = Src.getValueType();
2643 const fltSemantics &Semantics = VT.getFltSemantics();
2644 SDValue SmallestNormal =
2645 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2646
2647 // Want to scale denormals up, but negatives and 0 work just as well on the
2648 // scaled path.
2649 SDValue IsLtSmallestNormal = DAG.getSetCC(
2650 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2651 SmallestNormal, ISD::SETOLT);
2652
2653 return IsLtSmallestNormal;
2654}
2655
2656 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2657 SDNodeFlags Flags) const {
2658 SDLoc SL(Src);
2659 EVT VT = Src.getValueType();
2660 const fltSemantics &Semantics = VT.getFltSemantics();
2661 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2662
2663 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2664 SDValue IsFinite = DAG.getSetCC(
2665 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2666 Inf, ISD::SETOLT);
2667 return IsFinite;
2668}
2669
2670/// If denormal handling is required return the scaled input to FLOG2, and the
2671/// check for denormal range. Otherwise, return null values.
2672std::pair<SDValue, SDValue>
2673 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2674 SDValue Src, SDNodeFlags Flags) const {
2675 if (!needsDenormHandlingF32(DAG, Src, Flags))
2676 return {};
2677
2678 MVT VT = MVT::f32;
2679 const fltSemantics &Semantics = APFloat::IEEEsingle();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2682
2683 SDValue IsLtSmallestNormal = DAG.getSetCC(
2684 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2685 SmallestNormal, ISD::SETOLT);
2686
2687 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2688 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2689 SDValue ScaleFactor =
2690 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2691
2692 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2693 return {ScaledInput, IsLtSmallestNormal};
2694}
2695
2696 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2697 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2698 // If we have to handle denormals, scale up the input and adjust the result.
2699
2700 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2701 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2702
2703 SDLoc SL(Op);
2704 EVT VT = Op.getValueType();
2705 SDValue Src = Op.getOperand(0);
2706 SDNodeFlags Flags = Op->getFlags();
2707
2708 if (VT == MVT::f16) {
2709 // Nothing in half is a denormal when promoted to f32.
2710 assert(!Subtarget->has16BitInsts());
2711 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2712 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2713 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2714 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2715 }
2716
2717 auto [ScaledInput, IsLtSmallestNormal] =
2718 getScaledLogInput(DAG, SL, Src, Flags);
2719 if (!ScaledInput)
2720 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2721
2722 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2723
2724 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2725 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2726 SDValue ResultOffset =
2727 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2728 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2729}
2730
2731static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2732 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2733 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2734 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2735}
2736
2737 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2738 SelectionDAG &DAG) const {
2739 SDValue X = Op.getOperand(0);
2740 EVT VT = Op.getValueType();
2741 SDNodeFlags Flags = Op->getFlags();
2742 SDLoc DL(Op);
2743
2744 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2745 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2746
2747 const auto &Options = getTargetMachine().Options;
2748 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2749 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2750
2751 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2752 // Log and multiply in f32 is good enough for f16.
2753 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2754 }
2755
2756 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2757 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2758 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2759 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2760 }
2761
2762 return Lowered;
2763 }
2764
2765 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2766 if (ScaledInput)
2767 X = ScaledInput;
2768
2769 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2770
2771 SDValue R;
2772 if (Subtarget->hasFastFMAF32()) {
2773 // c+cc are ln(2)/ln(10) to more than 49 bits
2774 const float c_log10 = 0x1.344134p-2f;
2775 const float cc_log10 = 0x1.09f79ep-26f;
2776
2777 // c + cc is ln(2) to more than 49 bits
2778 const float c_log = 0x1.62e42ep-1f;
2779 const float cc_log = 0x1.efa39ep-25f;
2780
2781 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2782 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2783
2784 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2785 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2786 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2787 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2788 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2789 } else {
2790 // ch+ct is ln(2)/ln(10) to more than 36 bits
2791 const float ch_log10 = 0x1.344000p-2f;
2792 const float ct_log10 = 0x1.3509f6p-18f;
2793
2794 // ch + ct is ln(2) to more than 36 bits
2795 const float ch_log = 0x1.62e000p-1f;
2796 const float ct_log = 0x1.0bfbe8p-15f;
2797
2798 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2799 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2800
2801 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2802 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2803 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2804 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2805 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2806
2807 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2808 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2809 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2810 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2811 }
2812
2813 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2814 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2815
2816 // TODO: Check if known finite from source value.
2817 if (!IsFiniteOnly) {
2818 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2819 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2820 }
2821
2822 if (IsScaled) {
2823 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2824 SDValue ShiftK =
2825 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2826 SDValue Shift =
2827 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2828 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2829 }
2830
2831 return R;
2832}
2833
2834 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2835 return LowerFLOGCommon(Op, DAG);
2836}
2837
2838// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2839 // promoted f16 operation.
2840 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2841 SelectionDAG &DAG, bool IsLog10,
2842 SDNodeFlags Flags) const {
2843 EVT VT = Src.getValueType();
2844 unsigned LogOp =
2845 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2846
2847 double Log2BaseInverted =
2849
2850 if (VT == MVT::f32) {
2851 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2852 if (ScaledInput) {
2853 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2854 SDValue ScaledResultOffset =
2855 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2856
2857 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2858
2859 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2860 ScaledResultOffset, Zero, Flags);
2861
2862 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2863
2864 if (Subtarget->hasFastFMAF32())
2865 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2866 Flags);
2867 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2868 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2869 }
2870 }
2871
2872 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2873 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2874
2875 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2876 Flags);
2877}
2878
2879 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2880 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2881 // If we have to handle denormals, scale up the input and adjust the result.
2882
2883 SDLoc SL(Op);
2884 EVT VT = Op.getValueType();
2885 SDValue Src = Op.getOperand(0);
2886 SDNodeFlags Flags = Op->getFlags();
2887
2888 if (VT == MVT::f16) {
2889 // Nothing in half is a denormal when promoted to f32.
2890 assert(!Subtarget->has16BitInsts());
2891 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2892 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2893 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2894 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2895 }
2896
2897 assert(VT == MVT::f32);
2898
2899 if (!needsDenormHandlingF32(DAG, Src, Flags))
2900 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2901
2902 // bool needs_scaling = x < -0x1.f80000p+6f;
2903 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2904
2905 // -nextafter(128.0, -1)
2906 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2907
2908 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2909
2910 SDValue NeedsScaling =
2911 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2912
2913 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2914 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2915
2916 SDValue AddOffset =
2917 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2918
2919 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2920 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2921
2922 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2923 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2924 SDValue ResultScale =
2925 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2926
2927 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2928}
2929
2930 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2931 SelectionDAG &DAG,
2932 SDNodeFlags Flags) const {
2933 EVT VT = X.getValueType();
2934 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2935
2936 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2937 // exp2(M_LOG2E_F * f);
2938 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2939 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2940 : (unsigned)ISD::FEXP2,
2941 SL, VT, Mul, Flags);
2942 }
2943
2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2945
2946 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2948
2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2950
2951 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2952
2953 SDValue AdjustedX =
2954 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2955
2956 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2957
2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2959
2960 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2961 SDValue AdjustedResult =
2962 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2963
2964 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2965 Flags);
2966}
2967
2968 /// Emit an approx-funcs-appropriate lowering for exp10. Inf/NaN should still be
2969/// handled correctly.
2970 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2971 SelectionDAG &DAG,
2972 SDNodeFlags Flags) const {
2973 const EVT VT = X.getValueType();
2974 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2975
2976 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2977 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2978 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2979 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2980
2981 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2982 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2983 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2984 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2985 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2986 }
2987
2988 // bool s = x < -0x1.2f7030p+5f;
2989 // x += s ? 0x1.0p+5f : 0.0f;
2990 // exp10 = exp2(x * 0x1.a92000p+1f) *
2991 // exp2(x * 0x1.4f0978p-11f) *
2992 // (s ? 0x1.9f623ep-107f : 1.0f);
2993
2994 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2995
2996 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2997 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2998
2999 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3000 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3001 SDValue AdjustedX =
3002 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3003
3004 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3005 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3006
3007 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3008 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3009 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3010 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3011
3012 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3013
3014 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3015 SDValue AdjustedResult =
3016 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3017
3018 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3019 Flags);
3020}
3021
3022 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3023 EVT VT = Op.getValueType();
3024 SDLoc SL(Op);
3025 SDValue X = Op.getOperand(0);
3026 SDNodeFlags Flags = Op->getFlags();
3027 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3028
3029 if (VT.getScalarType() == MVT::f16) {
3030 // v_exp_f16 (fmul x, log2e)
3031 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3032 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3033
3034 if (VT.isVector())
3035 return SDValue();
3036
3037 // exp(f16 x) ->
3038 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3039
3040 // Nothing in half is a denormal when promoted to f32.
3041 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3042 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3043 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3044 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3045 }
3046
3047 assert(VT == MVT::f32);
3048
3049 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3050 // library behavior. Also, is known-not-daz source sufficient?
3051 if (allowApproxFunc(DAG, Flags)) {
3052 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3053 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3054 }
3055
3056 // Algorithm:
3057 //
3058 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3059 //
3060 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3061 // n = 64*m + j, 0 <= j < 64
3062 //
3063 // e^x = 2^((64*m + j + f)/64)
3064 // = (2^m) * (2^(j/64)) * 2^(f/64)
3065 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3066 //
3067 // f = x*(64/ln(2)) - n
3068 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3069 //
3070 // e^x = (2^m) * (2^(j/64)) * e^r
3071 //
3072 // (2^(j/64)) is precomputed
3073 //
3074 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3075 // e^r = 1 + q
3076 //
3077 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3078 //
3079 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
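// Note: the lowering below does not use the precomputed 2^(j/64) table from
// the description above. Instead it splits x * log2(e) (or x * log2(10)) into
// a high part PH and a low part PL, takes E = roundeven(PH), evaluates
// exp2((PH - E) + PL) with the hardware instruction, and scales by 2^E via
// ldexp, clamping the result for underflow and overflow afterwards.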
3080 SDNodeFlags FlagsNoContract = Flags;
3081 FlagsNoContract.setAllowContract(false);
3082
3083 SDValue PH, PL;
3084 if (Subtarget->hasFastFMAF32()) {
3085 const float c_exp = numbers::log2ef;
3086 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3087 const float c_exp10 = 0x1.a934f0p+1f;
3088 const float cc_exp10 = 0x1.2f346ep-24f;
3089
3090 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3091 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3092
3093 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3094 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3095 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3096 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3097 } else {
3098 const float ch_exp = 0x1.714000p+0f;
3099 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3100
3101 const float ch_exp10 = 0x1.a92000p+1f;
3102 const float cl_exp10 = 0x1.4f0978p-11f;
3103
3104 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3105 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3106
3107 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3108 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3109 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3110 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3111 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3112
3113 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3114
3115 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3116 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3117 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3118 }
3119
3120 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3121
3122 // It is unsafe to contract this fsub into the PH multiply.
3123 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3124
3125 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3126 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3127 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3128
3129 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3130
3131 SDValue UnderflowCheckConst =
3132 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3133
3134 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3135 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3136 SDValue Underflow =
3137 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3138
3139 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3140 const auto &Options = getTargetMachine().Options;
3141
3142 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3143 SDValue OverflowCheckConst =
3144 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3145 SDValue Overflow =
3146 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3147 SDValue Inf =
3148 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3149 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3150 }
3151
3152 return R;
3153}
3154
3155static bool isCtlzOpc(unsigned Opc) {
3156 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3157}
3158
3159static bool isCttzOpc(unsigned Opc) {
3160 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3161}
3162
3163 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3164 SelectionDAG &DAG) const {
3165 auto SL = SDLoc(Op);
3166 auto Opc = Op.getOpcode();
3167 auto Arg = Op.getOperand(0u);
3168 auto ResultVT = Op.getValueType();
3169
3170 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3171 return {};
3172
3173 assert(isCtlzOpc(Opc));
3174 assert(ResultVT == Arg.getValueType());
3175
3176 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3177 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3178 SDValue NewOp;
3179
3180 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3181 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3182 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3183 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3184 } else {
3185 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3186 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3187 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3188 }
3189
3190 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3191}
3192
3193 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3194 SDLoc SL(Op);
3195 SDValue Src = Op.getOperand(0);
3196
3197 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3198 bool Ctlz = isCtlzOpc(Op.getOpcode());
3199 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3200
3201 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3202 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3203 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3204
3205 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3206 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3207 // (cttz hi:lo) -> (umin (ffbl src), 32)
3208 // (ctlz_zero_undef src) -> (ffbh src)
3209 // (cttz_zero_undef src) -> (ffbl src)
3210
3211 // The 64-bit scalar version produces a 32-bit result:
3212 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3213 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3214 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3215 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3216 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3217 if (!ZeroUndef) {
3218 const SDValue ConstVal = DAG.getConstant(
3219 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3220 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3221 }
3222 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3223 }
3224
3225 SDValue Lo, Hi;
3226 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3227
3228 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3229 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3230
3231 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3232 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3233 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3234 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3235
3236 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3237 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3238 if (Ctlz)
3239 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3240 else
3241 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3242
3243 SDValue NewOpr;
3244 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3245 if (!ZeroUndef) {
3246 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3247 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3248 }
3249
3250 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3251}
3252
3253 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3254 bool Signed) const {
3255 // The regular method of converting a 64-bit integer to float roughly consists of
3256 // 2 steps: normalization and rounding. In fact, after normalization, the
3257 // conversion from a 64-bit integer to a float is essentially the same as the
3258 // one from a 32-bit integer. The only difference is that it has more
3259 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3260 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3261 // converted into the correct float number. The basic steps for the unsigned
3262 // conversion are illustrated in the following pseudo code:
3263 //
3264 // f32 uitofp(i64 u) {
3265 // i32 hi, lo = split(u);
3266 // // Only count the leading zeros in hi as we have native support of the
3267 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3268 // // reduced to a 32-bit one automatically.
3269 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3270 // u <<= shamt;
3271 // hi, lo = split(u);
3272 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3273 // // convert it as a 32-bit integer and scale the result back.
3274 // return uitofp(hi) * 2^(32 - shamt);
3275 // }
3276 //
3277 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3278 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3279 // converted instead, followed by negation based on its sign bit.
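// For example, with u = 2^40 + 1: hi = 0x100, so shamt = 23; after the shift
// hi = 0x80000000 and lo = 0x00800000, so the rounding adjustment sets the
// low bit of hi. uitofp(0x80000001) rounds to 2^31, and scaling by
// 2^(32 - 23) gives 2^40, the correctly rounded f32 result.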
3280
3281 SDLoc SL(Op);
3282 SDValue Src = Op.getOperand(0);
3283
3284 SDValue Lo, Hi;
3285 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3286 SDValue Sign;
3287 SDValue ShAmt;
3288 if (Signed && Subtarget->isGCN()) {
3289 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3290 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3291 // account. That is, the maximal shift is
3292 // - 32 if Lo and Hi have opposite signs;
3293 // - 33 if Lo and Hi have the same sign.
3294 //
3295 // Or, MaxShAmt = 33 + OppositeSign, where
3296 //
3297 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3298 // - -1 if Lo and Hi have opposite signs; and
3299 // - 0 otherwise.
3300 //
3301 // All in all, ShAmt is calculated as
3302 //
3303 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3304 //
3305 // or
3306 //
3307 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3308 //
3309 // to reduce the critical path.
3310 SDValue OppositeSign = DAG.getNode(
3311 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3312 DAG.getConstant(31, SL, MVT::i32));
3313 SDValue MaxShAmt =
3314 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3315 OppositeSign);
3316 // Count the leading sign bits.
3317 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3318 // Different from unsigned conversion, the shift should be one bit less to
3319 // preserve the sign bit.
3320 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3321 DAG.getConstant(1, SL, MVT::i32));
3322 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3323 } else {
3324 if (Signed) {
3325 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3326 // absolute value first.
3327 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3328 DAG.getConstant(63, SL, MVT::i64));
3329 SDValue Abs =
3330 DAG.getNode(ISD::XOR, SL, MVT::i64,
3331 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3332 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3333 }
3334 // Count the leading zeros.
3335 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3336 // The shift amount for signed integers is [0, 32].
3337 }
3338 // Normalize the given 64-bit integer.
3339 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3340 // Split it again.
3341 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3342 // Calculate the adjust bit for rounding.
3343 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3344 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3345 DAG.getConstant(1, SL, MVT::i32), Lo);
3346 // Get the 32-bit normalized integer.
3347 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3348 // Convert the normalized 32-bit integer into f32.
3349 unsigned Opc =
3350 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3351 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3352
3353 // Finally, need to scale back the converted floating number as the original
3354 // 64-bit integer is converted as a 32-bit one.
3355 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3356 ShAmt);
3357 // On GCN, use LDEXP directly.
3358 if (Subtarget->isGCN())
3359 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3360
3361 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3362 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3363 // exponent is enough to avoid overflowing into the sign bit.
3364 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3365 DAG.getConstant(23, SL, MVT::i32));
3366 SDValue IVal =
3367 DAG.getNode(ISD::ADD, SL, MVT::i32,
3368 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3369 if (Signed) {
3370 // Set the sign bit.
3371 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3372 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3373 DAG.getConstant(31, SL, MVT::i32));
3374 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3375 }
3376 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3377}
3378
3379 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3380 bool Signed) const {
3381 SDLoc SL(Op);
3382 SDValue Src = Op.getOperand(0);
3383
3384 SDValue Lo, Hi;
3385 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3386
3387 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3388 SL, MVT::f64, Hi);
3389
3390 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3391
3392 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3393 DAG.getConstant(32, SL, MVT::i32));
3394 // TODO: Should this propagate fast-math-flags?
3395 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3396}
3397
3398 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3399 SelectionDAG &DAG) const {
3400 // TODO: Factor out code common with LowerSINT_TO_FP.
3401 EVT DestVT = Op.getValueType();
3402 SDValue Src = Op.getOperand(0);
3403 EVT SrcVT = Src.getValueType();
3404
3405 if (SrcVT == MVT::i16) {
3406 if (DestVT == MVT::f16)
3407 return Op;
3408 SDLoc DL(Op);
3409
3410 // Promote src to i32
3411 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3412 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3413 }
3414
3415 if (DestVT == MVT::bf16) {
3416 SDLoc SL(Op);
3417 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3418 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3419 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3420 }
3421
3422 if (SrcVT != MVT::i64)
3423 return Op;
3424
3425 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3426 SDLoc DL(Op);
3427
3428 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3429 SDValue FPRoundFlag =
3430 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3431 SDValue FPRound =
3432 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3433
3434 return FPRound;
3435 }
3436
3437 if (DestVT == MVT::f32)
3438 return LowerINT_TO_FP32(Op, DAG, false);
3439
3440 assert(DestVT == MVT::f64);
3441 return LowerINT_TO_FP64(Op, DAG, false);
3442}
3443
3444SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3445                                              SelectionDAG &DAG) const {
3446 EVT DestVT = Op.getValueType();
3447
3448 SDValue Src = Op.getOperand(0);
3449 EVT SrcVT = Src.getValueType();
3450
3451 if (SrcVT == MVT::i16) {
3452 if (DestVT == MVT::f16)
3453 return Op;
3454
3455 SDLoc DL(Op);
3456 // Promote src to i32
3457 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3458 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3459 }
3460
3461 if (DestVT == MVT::bf16) {
3462 SDLoc SL(Op);
3463 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3465 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3466 }
3467
3468 if (SrcVT != MVT::i64)
3469 return Op;
3470
3471 // TODO: Factor out code common with LowerUINT_TO_FP.
3472
3473 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3474 SDLoc DL(Op);
3475 SDValue Src = Op.getOperand(0);
3476
3477 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3478 SDValue FPRoundFlag =
3479 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3480 SDValue FPRound =
3481 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3482
3483 return FPRound;
3484 }
3485
3486 if (DestVT == MVT::f32)
3487 return LowerINT_TO_FP32(Op, DAG, true);
3488
3489 assert(DestVT == MVT::f64);
3490 return LowerINT_TO_FP64(Op, DAG, true);
3491}
3492
3493SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3494                                               bool Signed) const {
3495 SDLoc SL(Op);
3496
3497 SDValue Src = Op.getOperand(0);
3498 EVT SrcVT = Src.getValueType();
3499
3500 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3501
3502 // The basic idea of converting a floating point number into a pair of 32-bit
3503 // integers is illustrated as follows:
3504 //
3505 // tf := trunc(val);
3506 // hif := floor(tf * 2^-32);
3507 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3508 // hi := fptoi(hif);
3509 // lo := fptoi(lof);
3510 //
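  // Illustrative note (added): for val = 4294967299.75 (= 2^32 + 3.75),
  // tf = 4294967299.0, hif = floor(tf * 2^-32) = 1.0, lof = tf - 2^32 = 3.0,
  // so hi = 1 and lo = 3, giving the i64 value (1 << 32) | 3 = 4294967299.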
3511 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3512 SDValue Sign;
3513 if (Signed && SrcVT == MVT::f32) {
3514    // However, a 32-bit floating point number has only a 23-bit mantissa, which
3515    // is not enough to hold all the significant bits of `lof` if val is
3516    // negative. To avoid the loss of precision, we need to take the absolute
3517 // value after truncating and flip the result back based on the original
3518 // signedness.
3519 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3520 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3521 DAG.getConstant(31, SL, MVT::i32));
3522 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3523 }
3524
3525 SDValue K0, K1;
3526 if (SrcVT == MVT::f64) {
3527 K0 = DAG.getConstantFP(
3528 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3529 SrcVT);
3530 K1 = DAG.getConstantFP(
3531 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3532 SrcVT);
3533 } else {
3534 K0 = DAG.getConstantFP(
3535 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3536 K1 = DAG.getConstantFP(
3537 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3538 }
3539 // TODO: Should this propagate fast-math-flags?
3540 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3541
3542 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3543
3544 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3545
3546 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3547                                                         : ISD::FP_TO_UINT,
3548                           SL, MVT::i32, FloorMul);
3549 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3550
3551 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3552 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3553
3554 if (Signed && SrcVT == MVT::f32) {
3555 assert(Sign);
3556 // Flip the result based on the signedness, which is either all 0s or 1s.
3557 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3558 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3559 // r := xor(r, sign) - sign;
3560 Result =
3561 DAG.getNode(ISD::SUB, SL, MVT::i64,
3562 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3563 }
3564
3565 return Result;
3566}
3567
3568SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3569  SDLoc DL(Op);
3570 SDValue N0 = Op.getOperand(0);
3571
3572 // Convert to target node to get known bits
3573 if (N0.getValueType() == MVT::f32)
3574 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3575
3576 if (getTargetMachine().Options.UnsafeFPMath) {
3577 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3578 return SDValue();
3579 }
3580
3581 assert(N0.getSimpleValueType() == MVT::f64);
3582
3583 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3584 const unsigned ExpMask = 0x7ff;
3585 const unsigned ExpBiasf64 = 1023;
3586 const unsigned ExpBiasf16 = 15;
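  // Illustrative note (added): IEEE binary64 packs 1 sign bit, 11 exponent
  // bits (bias 1023) and 52 mantissa bits; binary16 packs 1 sign bit, 5
  // exponent bits (bias 15) and 10 mantissa bits. The code below rebuilds the
  // f16 exponent and mantissa from the high 32 bits of the f64 and folds the
  // low bits in only as a sticky bit for rounding.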
3587 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3588 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3589 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3590 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3591 DAG.getConstant(32, DL, MVT::i64));
3592 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3593 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3594 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3595 DAG.getConstant(20, DL, MVT::i64));
3596 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3597 DAG.getConstant(ExpMask, DL, MVT::i32));
3598 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3599 // add the f16 bias (15) to get the biased exponent for the f16 format.
3600 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3601 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3602
3603 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3604 DAG.getConstant(8, DL, MVT::i32));
3605 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3606 DAG.getConstant(0xffe, DL, MVT::i32));
3607
3608 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3609 DAG.getConstant(0x1ff, DL, MVT::i32));
3610 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3611
3612 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3613 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3614
3615 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3616 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3617 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3618 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3619
3620 // N = M | (E << 12);
3621 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3622 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3623 DAG.getConstant(12, DL, MVT::i32)));
3624
3625 // B = clamp(1-E, 0, 13);
3626 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3627 One, E);
3628 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3629 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3630 DAG.getConstant(13, DL, MVT::i32));
3631
3632 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3633 DAG.getConstant(0x1000, DL, MVT::i32));
3634
3635 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3636 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3637 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3638 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3639
3640 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3641 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3642 DAG.getConstant(0x7, DL, MVT::i32));
3643 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3644 DAG.getConstant(2, DL, MVT::i32));
3645 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3646 One, Zero, ISD::SETEQ);
3647 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3648 One, Zero, ISD::SETGT);
3649 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3650 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3651
3652 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3653 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3654 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3655 I, V, ISD::SETEQ);
3656
3657 // Extract the sign bit.
3658 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3659 DAG.getConstant(16, DL, MVT::i32));
3660 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3661 DAG.getConstant(0x8000, DL, MVT::i32));
3662
3663 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3664 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3665}
3666
3667SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3668                                             SelectionDAG &DAG) const {
3669 SDValue Src = Op.getOperand(0);
3670 unsigned OpOpcode = Op.getOpcode();
3671 EVT SrcVT = Src.getValueType();
3672 EVT DestVT = Op.getValueType();
3673
3674 // Will be selected natively
3675 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3676 return Op;
3677
3678 if (SrcVT == MVT::bf16) {
3679 SDLoc DL(Op);
3680 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3681 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3682 }
3683
3684 // Promote i16 to i32
3685 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3686 SDLoc DL(Op);
3687
3688 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3689 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3690 }
3691
3692 if (DestVT != MVT::i64)
3693 return Op;
3694
3695 if (SrcVT == MVT::f16 ||
3696 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3697 SDLoc DL(Op);
3698
3699 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3700 unsigned Ext =
3701        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3702    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3703 }
3704
3705 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3706 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3707
3708 return SDValue();
3709}
3710
3711SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3712                                                     SelectionDAG &DAG) const {
3713 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3714 MVT VT = Op.getSimpleValueType();
3715 MVT ScalarVT = VT.getScalarType();
3716
3717 assert(VT.isVector());
3718
3719 SDValue Src = Op.getOperand(0);
3720 SDLoc DL(Op);
3721
3722 // TODO: Don't scalarize on Evergreen?
3723 unsigned NElts = VT.getVectorNumElements();
3724  SmallVector<SDValue, 8> Args;
3725  DAG.ExtractVectorElements(Src, Args, 0, NElts);
3726
3727 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3728 for (unsigned I = 0; I < NElts; ++I)
3729 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3730
3731 return DAG.getBuildVector(VT, DL, Args);
3732}
3733
3734//===----------------------------------------------------------------------===//
3735// Custom DAG optimizations
3736//===----------------------------------------------------------------------===//
3737
3738static bool isU24(SDValue Op, SelectionDAG &DAG) {
3739 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3740}
3741
3742static bool isI24(SDValue Op, SelectionDAG &DAG) {
3743 EVT VT = Op.getValueType();
3744 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3745 // as unsigned 24-bit values.
3746         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3747}
3748
3749SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3750                                            TargetLowering::DAGCombinerInfo &DCI) const {
3751  SelectionDAG &DAG = DCI.DAG;
3752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3753 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3754
3755 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3756 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3757 unsigned NewOpcode = Node24->getOpcode();
3758 if (IsIntrin) {
3759 unsigned IID = Node24->getConstantOperandVal(0);
3760 switch (IID) {
3761 case Intrinsic::amdgcn_mul_i24:
3762 NewOpcode = AMDGPUISD::MUL_I24;
3763 break;
3764 case Intrinsic::amdgcn_mul_u24:
3765 NewOpcode = AMDGPUISD::MUL_U24;
3766 break;
3767 case Intrinsic::amdgcn_mulhi_i24:
3768 NewOpcode = AMDGPUISD::MULHI_I24;
3769 break;
3770 case Intrinsic::amdgcn_mulhi_u24:
3771 NewOpcode = AMDGPUISD::MULHI_U24;
3772 break;
3773 default:
3774 llvm_unreachable("Expected 24-bit mul intrinsic");
3775 }
3776 }
3777
3778 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3779
3780 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3781 // the operands to have other uses, but will only perform simplifications that
3782 // involve bypassing some nodes for this user.
3783 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3784 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3785 if (DemandedLHS || DemandedRHS)
3786 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3787 DemandedLHS ? DemandedLHS : LHS,
3788 DemandedRHS ? DemandedRHS : RHS);
3789
3790 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3791 // operands if this node is the only user.
3792 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3793 return SDValue(Node24, 0);
3794 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3795 return SDValue(Node24, 0);
3796
3797 return SDValue();
3798}
3799
3800template <typename IntTy>
3801static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3802                               uint32_t Width, const SDLoc &DL) {
3803 if (Width + Offset < 32) {
3804 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3805 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3806 if constexpr (std::is_signed_v<IntTy>) {
3807 return DAG.getSignedConstant(Result, DL, MVT::i32);
3808 } else {
3809 return DAG.getConstant(Result, DL, MVT::i32);
3810 }
3811 }
3812
3813 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3814}
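// Illustrative note (added): the helper above mirrors the hardware bitfield
// extract: it pulls Width bits starting at bit Offset out of Src0 and sign- or
// zero-extends them to 32 bits depending on IntTy. E.g. with Src0 = 0x0000F234,
// Offset = 8 and Width = 8 the extracted field is 0xF2, so the unsigned result
// is 242 while the signed result is -14 (0xFFFFFFF2).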
3815
3816static bool hasVolatileUser(SDNode *Val) {
3817 for (SDNode *U : Val->users()) {
3818 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3819 if (M->isVolatile())
3820 return true;
3821 }
3822 }
3823
3824 return false;
3825}
3826
3827bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3828  // i32 vectors are the canonical memory type.
3829 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3830 return false;
3831
3832 if (!VT.isByteSized())
3833 return false;
3834
3835 unsigned Size = VT.getStoreSize();
3836
3837 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3838 return false;
3839
3840 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3841 return false;
3842
3843 return true;
3844}
3845
3846// Replace a load of an illegal type with a load of a friendlier type, bitcast
3847// back to the original type.
3848SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3849                                                 DAGCombinerInfo &DCI) const {
3850 if (!DCI.isBeforeLegalize())
3851 return SDValue();
3852
3853 LoadSDNode *LN = cast<LoadSDNode>(N);
3854 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3855 return SDValue();
3856
3857 SDLoc SL(N);
3858 SelectionDAG &DAG = DCI.DAG;
3859 EVT VT = LN->getMemoryVT();
3860
3861 unsigned Size = VT.getStoreSize();
3862 Align Alignment = LN->getAlign();
3863 if (Alignment < Size && isTypeLegal(VT)) {
3864 unsigned IsFast;
3865 unsigned AS = LN->getAddressSpace();
3866
3867 // Expand unaligned loads earlier than legalization. Due to visitation order
3868 // problems during legalization, the emitted instructions to pack and unpack
3869 // the bytes again are not eliminated in the case of an unaligned copy.
3870    if (!allowsMisalignedMemoryAccesses(
3871            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3872 if (VT.isVector())
3873 return SplitVectorLoad(SDValue(LN, 0), DAG);
3874
3875 SDValue Ops[2];
3876 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3877
3878 return DAG.getMergeValues(Ops, SDLoc(N));
3879 }
3880
3881 if (!IsFast)
3882 return SDValue();
3883 }
3884
3885 if (!shouldCombineMemoryType(VT))
3886 return SDValue();
3887
3888 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3889
3890 SDValue NewLoad
3891 = DAG.getLoad(NewVT, SL, LN->getChain(),
3892 LN->getBasePtr(), LN->getMemOperand());
3893
3894 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3895 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3896 return SDValue(N, 0);
3897}
3898
3899// Replace store of an illegal type with a store of a bitcast to a friendlier
3900// type.
3901SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3902                                                  DAGCombinerInfo &DCI) const {
3903 if (!DCI.isBeforeLegalize())
3904 return SDValue();
3905
3906 StoreSDNode *SN = cast<StoreSDNode>(N);
3907 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3908 return SDValue();
3909
3910 EVT VT = SN->getMemoryVT();
3911 unsigned Size = VT.getStoreSize();
3912
3913 SDLoc SL(N);
3914 SelectionDAG &DAG = DCI.DAG;
3915 Align Alignment = SN->getAlign();
3916 if (Alignment < Size && isTypeLegal(VT)) {
3917 unsigned IsFast;
3918 unsigned AS = SN->getAddressSpace();
3919
3920 // Expand unaligned stores earlier than legalization. Due to visitation
3921 // order problems during legalization, the emitted instructions to pack and
3922 // unpack the bytes again are not eliminated in the case of an unaligned
3923 // copy.
3924    if (!allowsMisalignedMemoryAccesses(
3925            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3926 if (VT.isVector())
3927 return SplitVectorStore(SDValue(SN, 0), DAG);
3928
3929 return expandUnalignedStore(SN, DAG);
3930 }
3931
3932 if (!IsFast)
3933 return SDValue();
3934 }
3935
3936 if (!shouldCombineMemoryType(VT))
3937 return SDValue();
3938
3939 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3940 SDValue Val = SN->getValue();
3941
3942 //DCI.AddToWorklist(Val.getNode());
3943
3944 bool OtherUses = !Val.hasOneUse();
3945 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3946 if (OtherUses) {
3947 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3948 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3949 }
3950
3951 return DAG.getStore(SN->getChain(), SL, CastVal,
3952 SN->getBasePtr(), SN->getMemOperand());
3953}
3954
3955// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3956// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3957// issues.
3958SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3959                                                        DAGCombinerInfo &DCI) const {
3960 SelectionDAG &DAG = DCI.DAG;
3961 SDValue N0 = N->getOperand(0);
3962
3963 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3964 // (vt2 (truncate (assertzext vt0:x, vt1)))
3965 if (N0.getOpcode() == ISD::TRUNCATE) {
3966 SDValue N1 = N->getOperand(1);
3967 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3968 SDLoc SL(N);
3969
3970 SDValue Src = N0.getOperand(0);
3971 EVT SrcVT = Src.getValueType();
3972 if (SrcVT.bitsGE(ExtVT)) {
3973 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3974 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3975 }
3976 }
3977
3978 return SDValue();
3979}
3980
3981SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3982    SDNode *N, DAGCombinerInfo &DCI) const {
3983 unsigned IID = N->getConstantOperandVal(0);
3984 switch (IID) {
3985 case Intrinsic::amdgcn_mul_i24:
3986 case Intrinsic::amdgcn_mul_u24:
3987 case Intrinsic::amdgcn_mulhi_i24:
3988 case Intrinsic::amdgcn_mulhi_u24:
3989 return simplifyMul24(N, DCI);
3990 case Intrinsic::amdgcn_fract:
3991 case Intrinsic::amdgcn_rsq:
3992 case Intrinsic::amdgcn_rcp_legacy:
3993 case Intrinsic::amdgcn_rsq_legacy:
3994 case Intrinsic::amdgcn_rsq_clamp: {
3995 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3996 SDValue Src = N->getOperand(1);
3997 return Src.isUndef() ? Src : SDValue();
3998 }
3999 case Intrinsic::amdgcn_frexp_exp: {
4000 // frexp_exp (fneg x) -> frexp_exp x
4001 // frexp_exp (fabs x) -> frexp_exp x
4002 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4003 SDValue Src = N->getOperand(1);
4004 SDValue PeekSign = peekFPSignOps(Src);
4005 if (PeekSign == Src)
4006 return SDValue();
4007 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4008 0);
4009 }
4010 default:
4011 return SDValue();
4012 }
4013}
4014
4015/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4016/// binary operation \p Opc to it with the corresponding constant operands.
4017SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4018    DAGCombinerInfo &DCI, const SDLoc &SL,
4019 unsigned Opc, SDValue LHS,
4020 uint32_t ValLo, uint32_t ValHi) const {
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDValue Lo, Hi;
4023 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4024
4025 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4026 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4027
4028 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4029 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4030
4031 // Re-visit the ands. It's possible we eliminated one of them and it could
4032 // simplify the vector.
4033 DCI.AddToWorklist(Lo.getNode());
4034 DCI.AddToWorklist(Hi.getNode());
4035
4036 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4037 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4038}
4039
4040SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4041                                                DAGCombinerInfo &DCI) const {
4042 EVT VT = N->getValueType(0);
4043
4044 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4045 if (!RHS)
4046 return SDValue();
4047
4048 SDValue LHS = N->getOperand(0);
4049 unsigned RHSVal = RHS->getZExtValue();
4050 if (!RHSVal)
4051 return LHS;
4052
4053 SDLoc SL(N);
4054 SelectionDAG &DAG = DCI.DAG;
4055
4056 switch (LHS->getOpcode()) {
4057 default:
4058 break;
4059 case ISD::ZERO_EXTEND:
4060 case ISD::SIGN_EXTEND:
4061 case ISD::ANY_EXTEND: {
4062 SDValue X = LHS->getOperand(0);
4063
4064 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4065 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4066 // Prefer build_vector as the canonical form if packed types are legal.
4067      // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4068 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4069 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4070 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4071 }
4072
4073 // shl (ext x) => zext (shl x), if shift does not overflow int
4074 if (VT != MVT::i64)
4075 break;
4076 KnownBits Known = DAG.computeKnownBits(X);
4077 unsigned LZ = Known.countMinLeadingZeros();
4078 if (LZ < RHSVal)
4079 break;
4080 EVT XVT = X.getValueType();
4081 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4082 return DAG.getZExtOrTrunc(Shl, SL, VT);
4083 }
4084 }
4085
4086 if (VT != MVT::i64)
4087 return SDValue();
4088
4089 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4090
4091 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4092 // common case, splitting this into a move and a 32-bit shift is faster and
4093 // the same code size.
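  // Illustrative note (added): e.g. for (shl i64:x, 40) the low 32 result bits
  // are zero and the high 32 bits are (lo_32(x) << 8), so the whole shift
  // reduces to one 32-bit shift plus building the {0, hi} pair.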
4094 if (RHSVal < 32)
4095 return SDValue();
4096
4097 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4098
4099 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4100 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4101
4102 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4103
4104 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4105 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4106}
4107
4108SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4109                                                DAGCombinerInfo &DCI) const {
4110 if (N->getValueType(0) != MVT::i64)
4111 return SDValue();
4112
4113 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4114 if (!RHS)
4115 return SDValue();
4116
4117 SelectionDAG &DAG = DCI.DAG;
4118 SDLoc SL(N);
4119 unsigned RHSVal = RHS->getZExtValue();
4120
4121 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4122 if (RHSVal == 32) {
4123 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4124 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4125 DAG.getConstant(31, SL, MVT::i32));
4126
4127 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4128 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4129 }
4130
4131 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4132 if (RHSVal == 63) {
4133 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4134 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4135 DAG.getConstant(31, SL, MVT::i32));
4136 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4137 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4138 }
4139
4140 return SDValue();
4141}
4142
4143SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4144                                                DAGCombinerInfo &DCI) const {
4145 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4146 if (!RHS)
4147 return SDValue();
4148
4149 EVT VT = N->getValueType(0);
4150 SDValue LHS = N->getOperand(0);
4151 unsigned ShiftAmt = RHS->getZExtValue();
4152 SelectionDAG &DAG = DCI.DAG;
4153 SDLoc SL(N);
4154
4155  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4156 // this improves the ability to match BFE patterns in isel.
4157 if (LHS.getOpcode() == ISD::AND) {
4158 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4159 unsigned MaskIdx, MaskLen;
4160 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4161 MaskIdx == ShiftAmt) {
4162 return DAG.getNode(
4163 ISD::AND, SL, VT,
4164 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4165 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4166 }
4167 }
4168 }
4169
4170 if (VT != MVT::i64)
4171 return SDValue();
4172
4173 if (ShiftAmt < 32)
4174 return SDValue();
4175
4176 // srl i64:x, C for C >= 32
4177 // =>
4178 // build_pair (srl hi_32(x), C - 32), 0
4179 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4180
4181 SDValue Hi = getHiHalf64(LHS, DAG);
4182
4183 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4184 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4185
4186 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4187
4188 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4189}
4190
4191SDValue AMDGPUTargetLowering::performTruncateCombine(
4192    SDNode *N, DAGCombinerInfo &DCI) const {
4193 SDLoc SL(N);
4194 SelectionDAG &DAG = DCI.DAG;
4195 EVT VT = N->getValueType(0);
4196 SDValue Src = N->getOperand(0);
4197
4198 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4199 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4200 SDValue Vec = Src.getOperand(0);
4201 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4202 SDValue Elt0 = Vec.getOperand(0);
4203 EVT EltVT = Elt0.getValueType();
4204 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4205 if (EltVT.isFloatingPoint()) {
4206 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4207 EltVT.changeTypeToInteger(), Elt0);
4208 }
4209
4210 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4211 }
4212 }
4213 }
4214
4215 // Equivalent of above for accessing the high element of a vector as an
4216 // integer operation.
4217 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4218 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4219 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4220 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4221 SDValue BV = stripBitcast(Src.getOperand(0));
4222 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4223 BV.getValueType().getVectorNumElements() == 2) {
4224 SDValue SrcElt = BV.getOperand(1);
4225 EVT SrcEltVT = SrcElt.getValueType();
4226 if (SrcEltVT.isFloatingPoint()) {
4227 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4228 SrcEltVT.changeTypeToInteger(), SrcElt);
4229 }
4230
4231 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4232 }
4233 }
4234 }
4235 }
4236
4237 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4238 //
4239 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4240 // i16 (trunc (srl (i32 (trunc x), K)))
4241 if (VT.getScalarSizeInBits() < 32) {
4242 EVT SrcVT = Src.getValueType();
4243 if (SrcVT.getScalarSizeInBits() > 32 &&
4244 (Src.getOpcode() == ISD::SRL ||
4245 Src.getOpcode() == ISD::SRA ||
4246 Src.getOpcode() == ISD::SHL)) {
4247 SDValue Amt = Src.getOperand(1);
4248 KnownBits Known = DAG.computeKnownBits(Amt);
4249
4250 // - For left shifts, do the transform as long as the shift
4251 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4252 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4253 // losing information stored in the high bits when truncating.
4254 const unsigned MaxCstSize =
4255 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4256 if (Known.getMaxValue().ule(MaxCstSize)) {
4257 EVT MidVT = VT.isVector() ?
4258 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4259 VT.getVectorNumElements()) : MVT::i32;
4260
4261 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4262 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4263 Src.getOperand(0));
4264 DCI.AddToWorklist(Trunc.getNode());
4265
4266 if (Amt.getValueType() != NewShiftVT) {
4267 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4268 DCI.AddToWorklist(Amt.getNode());
4269 }
4270
4271 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4272 Trunc, Amt);
4273 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4274 }
4275 }
4276 }
4277
4278 return SDValue();
4279}
4280
4281// We need to specifically handle i64 mul here to avoid unnecessary conversion
4282// instructions. If we only match on the legalized i64 mul expansion,
4283// SimplifyDemandedBits will be unable to remove them because there will be
4284// multiple uses due to the separate mul + mulh[su].
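// Illustrative note (added): the MUL_*24 / MULHI_*24 nodes are assumed here to
// map to the hardware 24-bit multiply instructions (e.g. v_mul_u32_u24 and
// v_mul_hi_u32_u24), which only read the low 24 bits of each operand; that is
// why they are only formed once both operands are known to fit in 24 bits.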
4285static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4286 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4287 if (Size <= 32) {
4288 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4289 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4290 }
4291
4292 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4293 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4294
4295 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4296 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4297
4298 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4299}
4300
4301/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4302/// return SDValue().
4303static SDValue getAddOneOp(const SDNode *V) {
4304 if (V->getOpcode() != ISD::ADD)
4305 return SDValue();
4306
4307 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4308}
4309
4310SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4311                                                DAGCombinerInfo &DCI) const {
4312 assert(N->getOpcode() == ISD::MUL);
4313 EVT VT = N->getValueType(0);
4314
4315 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4316 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4317 // unnecessarily). isDivergent() is used as an approximation of whether the
4318 // value is in an SGPR.
4319 if (!N->isDivergent())
4320 return SDValue();
4321
4322 unsigned Size = VT.getSizeInBits();
4323 if (VT.isVector() || Size > 64)
4324 return SDValue();
4325
4326 SelectionDAG &DAG = DCI.DAG;
4327 SDLoc DL(N);
4328
4329 SDValue N0 = N->getOperand(0);
4330 SDValue N1 = N->getOperand(1);
4331
4332 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4333 // matching.
4334
4335 // mul x, (add y, 1) -> add (mul x, y), x
4336 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4337 SDValue AddOp = getAddOneOp(V.getNode());
4338 if (!AddOp)
4339 return SDValue();
4340
4341 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4342 return U->getOpcode() == ISD::MUL;
4343 }))
4344 return AddOp;
4345
4346 return SDValue();
4347 };
4348
4349 // FIXME: The selection pattern is not properly checking for commuted
4350 // operands, so we have to place the mul in the LHS
4351 if (SDValue MulOper = IsFoldableAdd(N0)) {
4352 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4353 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4354 }
4355
4356 if (SDValue MulOper = IsFoldableAdd(N1)) {
4357 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4358 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4359 }
4360
4361 // There are i16 integer mul/mad.
4362 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4363 return SDValue();
4364
4365 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4366 // in the source into any_extends if the result of the mul is truncated. Since
4367 // we can assume the high bits are whatever we want, use the underlying value
4368 // to avoid the unknown high bits from interfering.
4369 if (N0.getOpcode() == ISD::ANY_EXTEND)
4370 N0 = N0.getOperand(0);
4371
4372 if (N1.getOpcode() == ISD::ANY_EXTEND)
4373 N1 = N1.getOperand(0);
4374
4375 SDValue Mul;
4376
4377 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4378 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4379 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4380 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4381 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4382 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4383 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4384 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4385 } else {
4386 return SDValue();
4387 }
4388
4389 // We need to use sext even for MUL_U24, because MUL_U24 is used
4390 // for signed multiply of 8 and 16-bit types.
4391 return DAG.getSExtOrTrunc(Mul, DL, VT);
4392}
4393
4394SDValue
4395AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4396                                            DAGCombinerInfo &DCI) const {
4397 if (N->getValueType(0) != MVT::i32)
4398 return SDValue();
4399
4400 SelectionDAG &DAG = DCI.DAG;
4401 SDLoc DL(N);
4402
4403 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4404 SDValue N0 = N->getOperand(0);
4405 SDValue N1 = N->getOperand(1);
4406
4407 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4408 // in the source into any_extends if the result of the mul is truncated. Since
4409 // we can assume the high bits are whatever we want, use the underlying value
4410 // to avoid the unknown high bits from interfering.
4411 if (N0.getOpcode() == ISD::ANY_EXTEND)
4412 N0 = N0.getOperand(0);
4413 if (N1.getOpcode() == ISD::ANY_EXTEND)
4414 N1 = N1.getOperand(0);
4415
4416 // Try to use two fast 24-bit multiplies (one for each half of the result)
4417 // instead of one slow extending multiply.
4418 unsigned LoOpcode = 0;
4419 unsigned HiOpcode = 0;
4420 if (Signed) {
4421 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4422 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4423 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4424 LoOpcode = AMDGPUISD::MUL_I24;
4425 HiOpcode = AMDGPUISD::MULHI_I24;
4426 }
4427 } else {
4428 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4429 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4430 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4431 LoOpcode = AMDGPUISD::MUL_U24;
4432 HiOpcode = AMDGPUISD::MULHI_U24;
4433 }
4434 }
4435 if (!LoOpcode)
4436 return SDValue();
4437
4438 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4439 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4440 DCI.CombineTo(N, Lo, Hi);
4441 return SDValue(N, 0);
4442}
4443
4444SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4445                                                  DAGCombinerInfo &DCI) const {
4446 EVT VT = N->getValueType(0);
4447
4448 if (!Subtarget->hasMulI24() || VT.isVector())
4449 return SDValue();
4450
4451 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4452 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4453 // unnecessarily). isDivergent() is used as an approximation of whether the
4454 // value is in an SGPR.
4455 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4456 // valu op anyway)
4457 if (Subtarget->hasSMulHi() && !N->isDivergent())
4458 return SDValue();
4459
4460 SelectionDAG &DAG = DCI.DAG;
4461 SDLoc DL(N);
4462
4463 SDValue N0 = N->getOperand(0);
4464 SDValue N1 = N->getOperand(1);
4465
4466 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4467 return SDValue();
4468
4469 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4470 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4471
4472 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4473 DCI.AddToWorklist(Mulhi.getNode());
4474 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4475}
4476
4477SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4478                                                  DAGCombinerInfo &DCI) const {
4479 EVT VT = N->getValueType(0);
4480
4481 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4482 return SDValue();
4483
4484 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4485 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4486 // unnecessarily). isDivergent() is used as an approximation of whether the
4487 // value is in an SGPR.
4488 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4489 // valu op anyway)
4490 if (Subtarget->hasSMulHi() && !N->isDivergent())
4491 return SDValue();
4492
4493 SelectionDAG &DAG = DCI.DAG;
4494 SDLoc DL(N);
4495
4496 SDValue N0 = N->getOperand(0);
4497 SDValue N1 = N->getOperand(1);
4498
4499 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4500 return SDValue();
4501
4502 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4503 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4504
4505 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4506 DCI.AddToWorklist(Mulhi.getNode());
4507 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4508}
4509
4510SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4511 SDValue Op,
4512 const SDLoc &DL,
4513 unsigned Opc) const {
4514 EVT VT = Op.getValueType();
4515 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4516 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4517 LegalVT != MVT::i16))
4518 return SDValue();
4519
4520 if (VT != MVT::i32)
4521 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4522
4523 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4524 if (VT != MVT::i32)
4525 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4526
4527 return FFBX;
4528}
4529
4530// The native instructions return -1 on 0 input. Optimize out a select that
4531// produces -1 on 0.
4532//
4533// TODO: If zero is not undef, we could also do this if the output is compared
4534// against the bitwidth.
4535//
4536// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4537SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4538                                                      SDValue LHS, SDValue RHS,
4539 DAGCombinerInfo &DCI) const {
4540 if (!isNullConstant(Cond.getOperand(1)))
4541 return SDValue();
4542
4543 SelectionDAG &DAG = DCI.DAG;
4544 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4545 SDValue CmpLHS = Cond.getOperand(0);
4546
4547 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4548 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4549 if (CCOpcode == ISD::SETEQ &&
4550 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4551 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4552    unsigned Opc =
4553        isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4554 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4555 }
4556
4557 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4558 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4559 if (CCOpcode == ISD::SETNE &&
4560 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4561 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4562    unsigned Opc =
4563        isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4564
4565 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4566 }
4567
4568 return SDValue();
4569}
4570
4571static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4572                                         unsigned Op,
4573 const SDLoc &SL,
4574 SDValue Cond,
4575 SDValue N1,
4576 SDValue N2) {
4577 SelectionDAG &DAG = DCI.DAG;
4578 EVT VT = N1.getValueType();
4579
4580 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4581 N1.getOperand(0), N2.getOperand(0));
4582 DCI.AddToWorklist(NewSelect.getNode());
4583 return DAG.getNode(Op, SL, VT, NewSelect);
4584}
4585
4586// Pull a free FP operation out of a select so it may fold into uses.
4587//
4588// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4589// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4590//
4591// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4592// select c, (fabs x), +k -> fabs (select c, x, k)
4593SDValue
4594AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4595                                           SDValue N) const {
4596 SelectionDAG &DAG = DCI.DAG;
4597 SDValue Cond = N.getOperand(0);
4598 SDValue LHS = N.getOperand(1);
4599 SDValue RHS = N.getOperand(2);
4600
4601 EVT VT = N.getValueType();
4602 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4603 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4604    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4605      return SDValue();
4606
4607 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4608 SDLoc(N), Cond, LHS, RHS);
4609 }
4610
4611 bool Inv = false;
4612 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4613 std::swap(LHS, RHS);
4614 Inv = true;
4615 }
4616
4617 // TODO: Support vector constants.
4618 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4619 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4620 !selectSupportsSourceMods(N.getNode())) {
4621 SDLoc SL(N);
4622 // If one side is an fneg/fabs and the other is a constant, we can push the
4623 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4624 SDValue NewLHS = LHS.getOperand(0);
4625 SDValue NewRHS = RHS;
4626
4627 // Careful: if the neg can be folded up, don't try to pull it back down.
4628 bool ShouldFoldNeg = true;
4629
4630 if (NewLHS.hasOneUse()) {
4631 unsigned Opc = NewLHS.getOpcode();
4632 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4633 ShouldFoldNeg = false;
4634 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4635 ShouldFoldNeg = false;
4636 }
4637
4638 if (ShouldFoldNeg) {
4639 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4640 return SDValue();
4641
4642 // We're going to be forced to use a source modifier anyway, there's no
4643 // point to pulling the negate out unless we can get a size reduction by
4644 // negating the constant.
4645 //
4646 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4647 // about cheaper constants.
4648 if (NewLHS.getOpcode() == ISD::FABS &&
4649          getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4650        return SDValue();
4651
4652      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4653        return SDValue();
4654
4655 if (LHS.getOpcode() == ISD::FNEG)
4656 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4657
4658 if (Inv)
4659 std::swap(NewLHS, NewRHS);
4660
4661 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4662 Cond, NewLHS, NewRHS);
4663 DCI.AddToWorklist(NewSelect.getNode());
4664 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4665 }
4666 }
4667
4668 return SDValue();
4669}
4670
4671SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4672                                                   DAGCombinerInfo &DCI) const {
4673 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4674 return Folded;
4675
4676 SDValue Cond = N->getOperand(0);
4677 if (Cond.getOpcode() != ISD::SETCC)
4678 return SDValue();
4679
4680 EVT VT = N->getValueType(0);
4681 SDValue LHS = Cond.getOperand(0);
4682 SDValue RHS = Cond.getOperand(1);
4683 SDValue CC = Cond.getOperand(2);
4684
4685 SDValue True = N->getOperand(1);
4686 SDValue False = N->getOperand(2);
4687
4688 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4689 SelectionDAG &DAG = DCI.DAG;
4690 if (DAG.isConstantValueOfAnyType(True) &&
4691 !DAG.isConstantValueOfAnyType(False)) {
4692 // Swap cmp + select pair to move constant to false input.
4693 // This will allow using VOPC cndmasks more often.
4694 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4695
4696 SDLoc SL(N);
4697 ISD::CondCode NewCC =
4698 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4699
4700 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4701 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4702 }
4703
4704 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4705      SDValue MinMax
4706        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4707 // Revisit this node so we can catch min3/max3/med3 patterns.
4708 //DCI.AddToWorklist(MinMax.getNode());
4709 return MinMax;
4710 }
4711 }
4712
4713 // There's no reason to not do this if the condition has other uses.
4714 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4715}
4716
4717static bool isInv2Pi(const APFloat &APF) {
4718 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4719 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4720 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
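  // Illustrative note (added): these bit patterns all encode 1/(2*pi), roughly
  // 0.15915494, in half, single and double precision respectively; this is the
  // value for which Subtarget->hasInv2PiInlineImm() reports a dedicated inline
  // constant on some subtargets.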
4721
4722 return APF.bitwiseIsEqual(KF16) ||
4723 APF.bitwiseIsEqual(KF32) ||
4724 APF.bitwiseIsEqual(KF64);
4725}
4726
4727// 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
4728// additional cost to negate them.
4729TargetLowering::NegatibleCost
4730AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4731  if (C->isZero())
4732 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4733
4734 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4735 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4736
4737  return NegatibleCost::Neutral;
4738}
4739
4740bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4741  if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4742    return getConstantNegateCost(C) == NegatibleCost::Expensive;
4743  return false;
4744}
4745
4746bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4747  if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4748    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4749  return false;
4750}
4751
4752static unsigned inverseMinMax(unsigned Opc) {
4753 switch (Opc) {
4754 case ISD::FMAXNUM:
4755 return ISD::FMINNUM;
4756 case ISD::FMINNUM:
4757 return ISD::FMAXNUM;
4758 case ISD::FMAXNUM_IEEE:
4759 return ISD::FMINNUM_IEEE;
4760 case ISD::FMINNUM_IEEE:
4761 return ISD::FMAXNUM_IEEE;
4762 case ISD::FMAXIMUM:
4763 return ISD::FMINIMUM;
4764 case ISD::FMINIMUM:
4765 return ISD::FMAXIMUM;
4766  case AMDGPUISD::FMAX_LEGACY:
4767    return AMDGPUISD::FMIN_LEGACY;
4768  case AMDGPUISD::FMIN_LEGACY:
4769    return AMDGPUISD::FMAX_LEGACY;
4770  default:
4771 llvm_unreachable("invalid min/max opcode");
4772 }
4773}
4774
4775/// \return true if it's profitable to try to push an fneg into its source
4776/// instruction.
4777static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4778  // If the input has multiple uses and we can either fold the negate down, or
4779 // the other uses cannot, give up. This both prevents unprofitable
4780 // transformations and infinite loops: we won't repeatedly try to fold around
4781 // a negate that has no 'good' form.
4782 if (N0.hasOneUse()) {
4783 // This may be able to fold into the source, but at a code size cost. Don't
4784 // fold if the fold into the user is free.
4785 if (allUsesHaveSourceMods(N, 0))
4786 return false;
4787 } else {
4788 if (fnegFoldsIntoOp(N0.getNode()) &&
4789        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4790      return false;
4791 }
4792
4793 return true;
4794}
4795
4796SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4797                                                 DAGCombinerInfo &DCI) const {
4798 SelectionDAG &DAG = DCI.DAG;
4799 SDValue N0 = N->getOperand(0);
4800 EVT VT = N->getValueType(0);
4801
4802 unsigned Opc = N0.getOpcode();
4803
4804 if (!shouldFoldFNegIntoSrc(N, N0))
4805 return SDValue();
4806
4807 SDLoc SL(N);
4808 switch (Opc) {
4809 case ISD::FADD: {
4810 if (!mayIgnoreSignedZero(N0))
4811 return SDValue();
4812
4813 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4814 SDValue LHS = N0.getOperand(0);
4815 SDValue RHS = N0.getOperand(1);
4816
4817 if (LHS.getOpcode() != ISD::FNEG)
4818 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4819 else
4820 LHS = LHS.getOperand(0);
4821
4822 if (RHS.getOpcode() != ISD::FNEG)
4823 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4824 else
4825 RHS = RHS.getOperand(0);
4826
4827 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4828 if (Res.getOpcode() != ISD::FADD)
4829 return SDValue(); // Op got folded away.
4830 if (!N0.hasOneUse())
4831 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4832 return Res;
4833 }
4834 case ISD::FMUL:
4835  case AMDGPUISD::FMUL_LEGACY: {
4836    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4837 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4838 SDValue LHS = N0.getOperand(0);
4839 SDValue RHS = N0.getOperand(1);
4840
4841 if (LHS.getOpcode() == ISD::FNEG)
4842 LHS = LHS.getOperand(0);
4843 else if (RHS.getOpcode() == ISD::FNEG)
4844 RHS = RHS.getOperand(0);
4845 else
4846 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4847
4848 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4849 if (Res.getOpcode() != Opc)
4850 return SDValue(); // Op got folded away.
4851 if (!N0.hasOneUse())
4852 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4853 return Res;
4854 }
4855 case ISD::FMA:
4856 case ISD::FMAD: {
4857 // TODO: handle llvm.amdgcn.fma.legacy
4858 if (!mayIgnoreSignedZero(N0))
4859 return SDValue();
4860
4861 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4862 SDValue LHS = N0.getOperand(0);
4863 SDValue MHS = N0.getOperand(1);
4864 SDValue RHS = N0.getOperand(2);
4865
4866 if (LHS.getOpcode() == ISD::FNEG)
4867 LHS = LHS.getOperand(0);
4868 else if (MHS.getOpcode() == ISD::FNEG)
4869 MHS = MHS.getOperand(0);
4870 else
4871 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4872
4873 if (RHS.getOpcode() != ISD::FNEG)
4874 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4875 else
4876 RHS = RHS.getOperand(0);
4877
4878 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4879 if (Res.getOpcode() != Opc)
4880 return SDValue(); // Op got folded away.
4881 if (!N0.hasOneUse())
4882 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4883 return Res;
4884 }
4885 case ISD::FMAXNUM:
4886 case ISD::FMINNUM:
4887 case ISD::FMAXNUM_IEEE:
4888 case ISD::FMINNUM_IEEE:
4889 case ISD::FMINIMUM:
4890 case ISD::FMAXIMUM:
4891  case AMDGPUISD::FMAX_LEGACY:
4892  case AMDGPUISD::FMIN_LEGACY: {
4893    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4894 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4895 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4896 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4897
4898 SDValue LHS = N0.getOperand(0);
4899 SDValue RHS = N0.getOperand(1);
4900
4901 // 0 doesn't have a negated inline immediate.
4902 // TODO: This constant check should be generalized to other operations.
4904 return SDValue();
4905
4906 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4907 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4908 unsigned Opposite = inverseMinMax(Opc);
4909
4910 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4911 if (Res.getOpcode() != Opposite)
4912 return SDValue(); // Op got folded away.
4913 if (!N0.hasOneUse())
4914 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4915 return Res;
4916 }
4917 case AMDGPUISD::FMED3: {
4918 SDValue Ops[3];
4919 for (unsigned I = 0; I < 3; ++I)
4920 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4921
4922 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4923 if (Res.getOpcode() != AMDGPUISD::FMED3)
4924 return SDValue(); // Op got folded away.
4925
4926 if (!N0.hasOneUse()) {
4927 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4928 DAG.ReplaceAllUsesWith(N0, Neg);
4929
4930 for (SDNode *U : Neg->users())
4931 DCI.AddToWorklist(U);
4932 }
4933
4934 return Res;
4935 }
4936 case ISD::FP_EXTEND:
4937 case ISD::FTRUNC:
4938 case ISD::FRINT:
4939 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4940 case ISD::FROUNDEVEN:
4941 case ISD::FSIN:
4942 case ISD::FCANONICALIZE:
4943 case AMDGPUISD::RCP:
4944  case AMDGPUISD::RCP_LEGACY:
4945  case AMDGPUISD::RCP_IFLAG:
4946  case AMDGPUISD::SIN_HW: {
4947 SDValue CvtSrc = N0.getOperand(0);
4948 if (CvtSrc.getOpcode() == ISD::FNEG) {
4949 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4950 // (fneg (rcp (fneg x))) -> (rcp x)
4951 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4952 }
4953
4954 if (!N0.hasOneUse())
4955 return SDValue();
4956
4957 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4958 // (fneg (rcp x)) -> (rcp (fneg x))
4959 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4960 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4961 }
4962 case ISD::FP_ROUND: {
4963 SDValue CvtSrc = N0.getOperand(0);
4964
4965 if (CvtSrc.getOpcode() == ISD::FNEG) {
4966 // (fneg (fp_round (fneg x))) -> (fp_round x)
4967 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4968 CvtSrc.getOperand(0), N0.getOperand(1));
4969 }
4970
4971 if (!N0.hasOneUse())
4972 return SDValue();
4973
4974 // (fneg (fp_round x)) -> (fp_round (fneg x))
4975 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4976 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4977 }
4978 case ISD::FP16_TO_FP: {
4979 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4980 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4981 // Put the fneg back as a legal source operation that can be matched later.
4982 SDLoc SL(N);
4983
4984 SDValue Src = N0.getOperand(0);
4985 EVT SrcVT = Src.getValueType();
4986
4987 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4988 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4989 DAG.getConstant(0x8000, SL, SrcVT));
4990 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4991 }
4992 case ISD::SELECT: {
4993 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4994 // TODO: Invert conditions of foldFreeOpFromSelect
4995 return SDValue();
4996 }
4997 case ISD::BITCAST: {
4998 SDLoc SL(N);
4999 SDValue BCSrc = N0.getOperand(0);
5000 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5001 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5002 if (HighBits.getValueType().getSizeInBits() != 32 ||
5003 !fnegFoldsIntoOp(HighBits.getNode()))
5004 return SDValue();
5005
5006      // f64 fneg only really needs to operate on the high half of the
5007 // register, so try to force it to an f32 operation to help make use of
5008 // source modifiers.
5009 //
5010 //
5011 // fneg (f64 (bitcast (build_vector x, y))) ->
5012 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5013 // (fneg (bitcast i32:y to f32)))
5014
5015 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5016 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5017 SDValue CastBack =
5018 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5019
5020 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5021 Ops.back() = CastBack;
5022 DCI.AddToWorklist(NegHi.getNode());
5023 SDValue Build =
5024 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5025 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5026
5027 if (!N0.hasOneUse())
5028 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5029 return Result;
5030 }
5031
5032 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5033 BCSrc.hasOneUse()) {
5034 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5035 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5036
5037 // TODO: Cast back result for multiple uses is beneficial in some cases.
5038
5039 SDValue LHS =
5040 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5041 SDValue RHS =
5042 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5043
5044 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5045 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5046
5047 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5048 NegRHS);
5049 }
5050
5051 return SDValue();
5052 }
5053 default:
5054 return SDValue();
5055 }
5056}
5057
5058SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5059                                                 DAGCombinerInfo &DCI) const {
5060 SelectionDAG &DAG = DCI.DAG;
5061 SDValue N0 = N->getOperand(0);
5062
5063 if (!N0.hasOneUse())
5064 return SDValue();
5065
5066 switch (N0.getOpcode()) {
5067 case ISD::FP16_TO_FP: {
5068 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5069 SDLoc SL(N);
5070 SDValue Src = N0.getOperand(0);
5071 EVT SrcVT = Src.getValueType();
5072
5073 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5074 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5075 DAG.getConstant(0x7fff, SL, SrcVT));
5076 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5077 }
5078 default:
5079 return SDValue();
5080 }
5081}
5082
5083SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5084                                                DAGCombinerInfo &DCI) const {
5085 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5086 if (!CFP)
5087 return SDValue();
5088
5089 // XXX - Should this flush denormals?
5090 const APFloat &Val = CFP->getValueAPF();
5091 APFloat One(Val.getSemantics(), "1.0");
5092 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5093}
5094
5095SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5096                                                DAGCombinerInfo &DCI) const {
5097 SelectionDAG &DAG = DCI.DAG;
5098 SDLoc DL(N);
5099
5100 switch(N->getOpcode()) {
5101 default:
5102 break;
5103 case ISD::BITCAST: {
5104 EVT DestVT = N->getValueType(0);
5105
5106 // Push casts through vector builds. This helps avoid emitting a large
5107 // number of copies when materializing floating point vector constants.
5108 //
5109 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5110 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5111 if (DestVT.isVector()) {
5112 SDValue Src = N->getOperand(0);
5113 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5114          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5115           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5116        EVT SrcVT = Src.getValueType();
5117 unsigned NElts = DestVT.getVectorNumElements();
5118
5119 if (SrcVT.getVectorNumElements() == NElts) {
5120 EVT DestEltVT = DestVT.getVectorElementType();
5121
5122 SmallVector<SDValue, 8> CastedElts;
5123 SDLoc SL(N);
5124 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5125 SDValue Elt = Src.getOperand(I);
5126 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5127 }
5128
5129 return DAG.getBuildVector(DestVT, SL, CastedElts);
5130 }
5131 }
5132 }
5133
5134 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5135 break;
5136
5137 // Fold bitcasts of constants.
5138 //
5139 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5140 // TODO: Generalize and move to DAGCombiner
5141 SDValue Src = N->getOperand(0);
5142 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5143 SDLoc SL(N);
5144 uint64_t CVal = C->getZExtValue();
5145 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5146 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5147 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5148 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5149 }
5150
5151 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5152 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5153 SDLoc SL(N);
5154 uint64_t CVal = Val.getZExtValue();
5155 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5156 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5157 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5158
5159 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5160 }
5161
5162 break;
5163 }
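// Worked example of the constant split above (illustrative value): for
// k = 0x0000000100000002, Lo_32(k) = 2 becomes element 0 and Hi_32(k) = 1
// becomes element 1 of the v2i32 build_vector, which is then bitcast to
// the requested 64-bit vector type.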
5164 case ISD::SHL: {
5165 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5166 break;
5167
5168 return performShlCombine(N, DCI);
5169 }
5170 case ISD::SRL: {
5171 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5172 break;
5173
5174 return performSrlCombine(N, DCI);
5175 }
5176 case ISD::SRA: {
5177 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5178 break;
5179
5180 return performSraCombine(N, DCI);
5181 }
5182 case ISD::TRUNCATE:
5183 return performTruncateCombine(N, DCI);
5184 case ISD::MUL:
5185 return performMulCombine(N, DCI);
5186 case AMDGPUISD::MUL_U24:
5187 case AMDGPUISD::MUL_I24: {
5188 if (SDValue Simplified = simplifyMul24(N, DCI))
5189 return Simplified;
5190 break;
5191 }
5192 case AMDGPUISD::MULHI_I24:
5193 case AMDGPUISD::MULHI_U24:
5194 return simplifyMul24(N, DCI);
5195 case ISD::SMUL_LOHI:
5196 case ISD::UMUL_LOHI:
5197 return performMulLoHiCombine(N, DCI);
5198 case ISD::MULHS:
5199 return performMulhsCombine(N, DCI);
5200 case ISD::MULHU:
5201 return performMulhuCombine(N, DCI);
5202 case ISD::SELECT:
5203 return performSelectCombine(N, DCI);
5204 case ISD::FNEG:
5205 return performFNegCombine(N, DCI);
5206 case ISD::FABS:
5207 return performFAbsCombine(N, DCI);
5208 case AMDGPUISD::BFE_I32:
5209 case AMDGPUISD::BFE_U32: {
5210 assert(!N->getValueType(0).isVector() &&
5211 "Vector handling of BFE not implemented");
5212 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5213 if (!Width)
5214 break;
5215
5216 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5217 if (WidthVal == 0)
5218 return DAG.getConstant(0, DL, MVT::i32);
5219
5220 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5221 if (!Offset)
5222 break;
5223
5224 SDValue BitsFrom = N->getOperand(0);
5225 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5226
5227 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5228
5229 if (OffsetVal == 0) {
5230 // This is already sign / zero extended, so try to fold away extra BFEs.
5231 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5232
5233 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5234 if (OpSignBits >= SignBits)
5235 return BitsFrom;
5236
5237 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5238 if (Signed) {
5239 // This is a sign_extend_inreg. Replace it to take advantage of existing
5240 // DAG Combines. If not eliminated, we will match back to BFE during
5241 // selection.
5242
5243 // TODO: The sext_inreg of extended types ends, although we could
5244 // handle them in a single BFE.
5245 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5246 DAG.getValueType(SmallVT));
5247 }
5248
5249 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5250 }
5251
5252 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5253 if (Signed) {
5254 return constantFoldBFE<int32_t>(DAG,
5255 CVal->getSExtValue(),
5256 OffsetVal,
5257 WidthVal,
5258 DL);
5259 }
5260
5261 return constantFoldBFE<uint32_t>(DAG,
5262 CVal->getZExtValue(),
5263 OffsetVal,
5264 WidthVal,
5265 DL);
5266 }
5267
5268 if ((OffsetVal + WidthVal) >= 32 &&
5269 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5270 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5271 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5272 BitsFrom, ShiftVal);
5273 }
5274
5275 if (BitsFrom.hasOneUse()) {
5276 APInt Demanded = APInt::getBitsSet(32,
5277 OffsetVal,
5278 OffsetVal + WidthVal);
5279
5280 KnownBits Known;
5281 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5282 !DCI.isBeforeLegalizeOps());
5283 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5284 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5285 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5286 DCI.CommitTargetLoweringOpt(TLO);
5287 }
5288 }
5289
5290 break;
5291 }
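// For illustration, the scalar semantics these BFE combines rely on can be
// sketched as follows (assuming Offset + Width <= 32; the overflowing case
// is rewritten into a plain shift above):
//
//   uint32_t bfe_u32(uint32_t Src, uint32_t Offset, uint32_t Width) {
//     if (Width == 0)
//       return 0;
//     uint32_t Mask = Width == 32 ? ~0u : (1u << Width) - 1u;
//     return (Src >> Offset) & Mask;
//   }
//
// bfe_i32 additionally sign-extends the extracted field from bit
// (Width - 1), which is why the zero-offset form is replaced with a
// sign_extend_inreg / zero-extend-in-reg above.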
5292 case ISD::LOAD:
5293 return performLoadCombine(N, DCI);
5294 case ISD::STORE:
5295 return performStoreCombine(N, DCI);
5296 case AMDGPUISD::RCP:
5297 case AMDGPUISD::RCP_IFLAG:
5298 return performRcpCombine(N, DCI);
5299 case ISD::AssertZext:
5300 case ISD::AssertSext:
5301 return performAssertSZExtCombine(N, DCI);
5302 case ISD::INTRINSIC_WO_CHAIN:
5303 return performIntrinsicWOChainCombine(N, DCI);
5304 case AMDGPUISD::FMAD_FTZ: {
5305 SDValue N0 = N->getOperand(0);
5306 SDValue N1 = N->getOperand(1);
5307 SDValue N2 = N->getOperand(2);
5308 EVT VT = N->getValueType(0);
5309
5310 // FMAD_FTZ is a FMAD + flush denormals to zero.
5311 // We flush the inputs, the intermediate step, and the output.
5312 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5313 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5314 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5315 if (N0CFP && N1CFP && N2CFP) {
5316 const auto FTZ = [](const APFloat &V) {
5317 if (V.isDenormal()) {
5318 APFloat Zero(V.getSemantics(), 0);
5319 return V.isNegative() ? -Zero : Zero;
5320 }
5321 return V;
5322 };
5323
5324 APFloat V0 = FTZ(N0CFP->getValueAPF());
5325 APFloat V1 = FTZ(N1CFP->getValueAPF());
5326 APFloat V2 = FTZ(N2CFP->getValueAPF());
5327 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5328 V0 = FTZ(V0);
5329 V0.add(V2, APFloat::rmNearestTiesToEven);
5330 return DAG.getConstantFP(FTZ(V0), DL, VT);
5331 }
5332 break;
5333 }
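// Worked example of the FTZ helper above (illustrative values): for f32,
// the denormal 1.0e-40f flushes to +0.0f and -1.0e-40f flushes to -0.0f,
// while a normal value such as 1.5f is left unchanged. The flush is applied
// to each input, to the intermediate product, and to the final sum.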
5334 }
5335 return SDValue();
5336}
5337
5338//===----------------------------------------------------------------------===//
5339// Helper functions
5340//===----------------------------------------------------------------------===//
5341
5342 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5343 const TargetRegisterClass *RC,
5344 Register Reg, EVT VT,
5345 const SDLoc &SL,
5346 bool RawReg) const {
5347 MachineFunction &MF = DAG.getMachineFunction();
5348 MachineRegisterInfo &MRI = MF.getRegInfo();
5349 Register VReg;
5350
5351 if (!MRI.isLiveIn(Reg)) {
5352 VReg = MRI.createVirtualRegister(RC);
5353 MRI.addLiveIn(Reg, VReg);
5354 } else {
5355 VReg = MRI.getLiveInVirtReg(Reg);
5356 }
5357
5358 if (RawReg)
5359 return DAG.getRegister(VReg, VT);
5360
5361 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5362}
5363
5364// This may be called multiple times, and nothing prevents creating multiple
5365// objects at the same offset. See if we already defined this object.
5366 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5367 int64_t Offset) {
5368 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5369 if (MFI.getObjectOffset(I) == Offset) {
5370 assert(MFI.getObjectSize(I) == Size);
5371 return I;
5372 }
5373 }
5374
5375 return MFI.CreateFixedObject(Size, Offset, true);
5376}
5377
5378 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5379 EVT VT,
5380 const SDLoc &SL,
5381 int64_t Offset) const {
5382 MachineFunction &MF = DAG.getMachineFunction();
5383 MachineFrameInfo &MFI = MF.getFrameInfo();
5384 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5385
5386 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5387 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5388
5389 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5392}
5393
5394 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5395 const SDLoc &SL,
5396 SDValue Chain,
5397 SDValue ArgVal,
5398 int64_t Offset) const {
5402
5403 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5404 // Stores to the argument stack area are relative to the stack pointer.
5405 SDValue SP =
5406 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5407 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5408 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5410 return Store;
5411}
5412
5413 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5414 const TargetRegisterClass *RC,
5415 EVT VT, const SDLoc &SL,
5416 const ArgDescriptor &Arg) const {
5417 assert(Arg && "Attempting to load missing argument");
5418
5419 SDValue V = Arg.isRegister() ?
5420 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5421 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5422
5423 if (!Arg.isMasked())
5424 return V;
5425
5426 unsigned Mask = Arg.getMask();
5427 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5428 V = DAG.getNode(ISD::SRL, SL, VT, V,
5429 DAG.getShiftAmountConstant(Shift, VT, SL));
5430 return DAG.getNode(ISD::AND, SL, VT, V,
5431 DAG.getConstant(Mask >> Shift, SL, VT));
5432}
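// Worked example of the masked case above (illustrative mask): for a value
// packed in bits [19:10], Arg.getMask() would be 0x000ffc00, so
// Shift = countr_zero(0x000ffc00) = 10 and Mask >> Shift = 0x3ff, and the
// returned node computes (V >> 10) & 0x3ff.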
5433
5434 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5435 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5436 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5437 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5438 uint64_t ArgOffset =
5439 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5440 switch (Param) {
5441 case FIRST_IMPLICIT:
5442 return ArgOffset;
5443 case PRIVATE_BASE:
5444 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5445 case SHARED_BASE:
5446 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5447 case QUEUE_PTR:
5448 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5449 }
5450 llvm_unreachable("unexpected implicit parameter type");
5451}
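// Worked example with illustrative numbers: for ExplicitKernArgSize = 36,
// an implicit-argument alignment of 8, and an explicit kernarg offset of 0,
// the implicit parameters start at alignTo(36, 8) + 0 = 40, so QUEUE_PTR
// resolves to 40 + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET.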
5452
5453 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5454 const MachineFunction &MF, const ImplicitParameter Param) const {
5455 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5456 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5457}
5458
5459#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5460
5461const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5462 switch ((AMDGPUISD::NodeType)Opcode) {
5463 case AMDGPUISD::FIRST_NUMBER: break;
5464 // AMDIL DAG nodes
5465 NODE_NAME_CASE(BRANCH_COND);
5466
5467 // AMDGPU DAG nodes
5468 NODE_NAME_CASE(IF)
5469 NODE_NAME_CASE(ELSE)
5470 NODE_NAME_CASE(LOOP)
5471 NODE_NAME_CASE(CALL)
5472 NODE_NAME_CASE(TC_RETURN)
5473 NODE_NAME_CASE(TC_RETURN_GFX)
5474 NODE_NAME_CASE(TC_RETURN_CHAIN)
5475 NODE_NAME_CASE(TRAP)
5476 NODE_NAME_CASE(RET_GLUE)
5477 NODE_NAME_CASE(WAVE_ADDRESS)
5478 NODE_NAME_CASE(RETURN_TO_EPILOG)
5479 NODE_NAME_CASE(ENDPGM)
5480 NODE_NAME_CASE(ENDPGM_TRAP)
5481 NODE_NAME_CASE(SIMULATED_TRAP)
5482 NODE_NAME_CASE(DWORDADDR)
5483 NODE_NAME_CASE(FRACT)
5484 NODE_NAME_CASE(SETCC)
5485 NODE_NAME_CASE(DENORM_MODE)
5486 NODE_NAME_CASE(FMA_W_CHAIN)
5487 NODE_NAME_CASE(FMUL_W_CHAIN)
5488 NODE_NAME_CASE(CLAMP)
5489 NODE_NAME_CASE(COS_HW)
5490 NODE_NAME_CASE(SIN_HW)
5491 NODE_NAME_CASE(FMAX_LEGACY)
5492 NODE_NAME_CASE(FMIN_LEGACY)
5493 NODE_NAME_CASE(FMAX3)
5494 NODE_NAME_CASE(SMAX3)
5495 NODE_NAME_CASE(UMAX3)
5496 NODE_NAME_CASE(FMIN3)
5497 NODE_NAME_CASE(SMIN3)
5498 NODE_NAME_CASE(UMIN3)
5499 NODE_NAME_CASE(FMED3)
5500 NODE_NAME_CASE(SMED3)
5501 NODE_NAME_CASE(UMED3)
5502 NODE_NAME_CASE(FMAXIMUM3)
5503 NODE_NAME_CASE(FMINIMUM3)
5504 NODE_NAME_CASE(FDOT2)
5505 NODE_NAME_CASE(URECIP)
5506 NODE_NAME_CASE(DIV_SCALE)
5507 NODE_NAME_CASE(DIV_FMAS)
5508 NODE_NAME_CASE(DIV_FIXUP)
5509 NODE_NAME_CASE(FMAD_FTZ)
5510 NODE_NAME_CASE(RCP)
5511 NODE_NAME_CASE(RSQ)
5512 NODE_NAME_CASE(RCP_LEGACY)
5513 NODE_NAME_CASE(RCP_IFLAG)
5514 NODE_NAME_CASE(LOG)
5515 NODE_NAME_CASE(EXP)
5516 NODE_NAME_CASE(FMUL_LEGACY)
5517 NODE_NAME_CASE(RSQ_CLAMP)
5518 NODE_NAME_CASE(FP_CLASS)
5519 NODE_NAME_CASE(DOT4)
5520 NODE_NAME_CASE(CARRY)
5521 NODE_NAME_CASE(BORROW)
5522 NODE_NAME_CASE(BFE_U32)
5523 NODE_NAME_CASE(BFE_I32)
5524 NODE_NAME_CASE(BFI)
5525 NODE_NAME_CASE(BFM)
5526 NODE_NAME_CASE(FFBH_U32)
5527 NODE_NAME_CASE(FFBH_I32)
5528 NODE_NAME_CASE(FFBL_B32)
5529 NODE_NAME_CASE(MUL_U24)
5530 NODE_NAME_CASE(MUL_I24)
5531 NODE_NAME_CASE(MULHI_U24)
5532 NODE_NAME_CASE(MULHI_I24)
5533 NODE_NAME_CASE(MAD_U24)
5534 NODE_NAME_CASE(MAD_I24)
5535 NODE_NAME_CASE(MAD_I64_I32)
5536 NODE_NAME_CASE(MAD_U64_U32)
5537 NODE_NAME_CASE(PERM)
5538 NODE_NAME_CASE(TEXTURE_FETCH)
5539 NODE_NAME_CASE(R600_EXPORT)
5540 NODE_NAME_CASE(CONST_ADDRESS)
5541 NODE_NAME_CASE(REGISTER_LOAD)
5542 NODE_NAME_CASE(REGISTER_STORE)
5543 NODE_NAME_CASE(CVT_F32_UBYTE0)
5544 NODE_NAME_CASE(CVT_F32_UBYTE1)
5545 NODE_NAME_CASE(CVT_F32_UBYTE2)
5546 NODE_NAME_CASE(CVT_F32_UBYTE3)
5547 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5548 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5549 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5550 NODE_NAME_CASE(CVT_PK_I16_I32)
5551 NODE_NAME_CASE(CVT_PK_U16_U32)
5552 NODE_NAME_CASE(FP_TO_FP16)
5553 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5554 NODE_NAME_CASE(CONST_DATA_PTR)
5555 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5557 NODE_NAME_CASE(DUMMY_CHAIN)
5559 NODE_NAME_CASE(LOAD_D16_HI)
5560 NODE_NAME_CASE(LOAD_D16_LO)
5561 NODE_NAME_CASE(LOAD_D16_HI_I8)
5562 NODE_NAME_CASE(LOAD_D16_HI_U8)
5563 NODE_NAME_CASE(LOAD_D16_LO_I8)
5564 NODE_NAME_CASE(LOAD_D16_LO_U8)
5565 NODE_NAME_CASE(STORE_MSKOR)
5566 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5567 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5568 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5569 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5570 NODE_NAME_CASE(DS_ORDERED_COUNT)
5571 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5572 NODE_NAME_CASE(BUFFER_LOAD)
5573 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5574 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5575 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5576 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5577 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5578 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5579 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5580 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5581 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5582 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5583 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5584 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5585 NODE_NAME_CASE(SBUFFER_LOAD)
5586 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5587 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5588 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5589 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5590 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5591 NODE_NAME_CASE(BUFFER_STORE)
5592 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5593 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5594 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5595 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5596 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5597 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5598 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5599 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5600 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5601 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5602 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5603 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5604 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5605 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5606 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5607 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5608 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5609 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5610 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5611 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5612 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5613 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5614 }
5615 return nullptr;
5616}
5617
5618 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5619 SelectionDAG &DAG, int Enabled,
5620 int &RefinementSteps,
5621 bool &UseOneConstNR,
5622 bool Reciprocal) const {
5623 EVT VT = Operand.getValueType();
5624
5625 if (VT == MVT::f32) {
5626 RefinementSteps = 0;
5627 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5628 }
5629
5630 // TODO: There is also an f64 rsq instruction, but the documentation is less
5631 // clear on its precision.
5632
5633 return SDValue();
5634}
5635
5636 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5637 SelectionDAG &DAG, int Enabled,
5638 int &RefinementSteps) const {
5639 EVT VT = Operand.getValueType();
5640
5641 if (VT == MVT::f32) {
5642 // Reciprocal, < 1 ulp error.
5643 //
5644 // This reciprocal approximation converges to < 0.5 ulp error with one
5645 // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5646
5647 RefinementSteps = 0;
5648 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5649 }
5650
5651 // TODO: There is also an f64 rcp instruction, but the documentation is less
5652 // clear on its precision.
5653
5654 return SDValue();
5655}
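// For reference, the single Newton-Raphson step mentioned above can be
// written with two FMAs (a sketch of the math, not code emitted here):
//   e  = fma(-d, x0, 1.0f);  // e  = 1 - d*x0, the residual error
//   x1 = fma(x0,  e, x0);    // x1 = x0 + x0*e, the refined estimate of 1/d
// which roughly squares the relative error of the initial estimate x0.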
5656
5657static unsigned workitemIntrinsicDim(unsigned ID) {
5658 switch (ID) {
5659 case Intrinsic::amdgcn_workitem_id_x:
5660 return 0;
5661 case Intrinsic::amdgcn_workitem_id_y:
5662 return 1;
5663 case Intrinsic::amdgcn_workitem_id_z:
5664 return 2;
5665 default:
5666 llvm_unreachable("not a workitem intrinsic");
5667 }
5668}
5669
5670 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5671 const SDValue Op, KnownBits &Known,
5672 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5673
5674 Known.resetAll(); // Don't know anything.
5675
5676 unsigned Opc = Op.getOpcode();
5677
5678 switch (Opc) {
5679 default:
5680 break;
5681 case AMDGPUISD::CARRY:
5682 case AMDGPUISD::BORROW: {
5683 Known.Zero = APInt::getHighBitsSet(32, 31);
5684 break;
5685 }
5686
5687 case AMDGPUISD::BFE_I32:
5688 case AMDGPUISD::BFE_U32: {
5689 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5690 if (!CWidth)
5691 return;
5692
5693 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5694
5695 if (Opc == AMDGPUISD::BFE_U32)
5696 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5697
5698 break;
5699 }
5700 case AMDGPUISD::FP_TO_FP16: {
5701 unsigned BitWidth = Known.getBitWidth();
5702
5703 // High bits are zero.
5704 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5705 break;
5706 }
5707 case AMDGPUISD::MUL_U24:
5708 case AMDGPUISD::MUL_I24: {
5709 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5710 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5711 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5712 RHSKnown.countMinTrailingZeros();
5713 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5714 // Skip extra check if all bits are known zeros.
5715 if (TrailZ >= 32)
5716 break;
5717
5718 // Truncate to 24 bits.
5719 LHSKnown = LHSKnown.trunc(24);
5720 RHSKnown = RHSKnown.trunc(24);
5721
5722 if (Opc == AMDGPUISD::MUL_I24) {
5723 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5724 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5725 unsigned MaxValBits = LHSValBits + RHSValBits;
5726 if (MaxValBits > 32)
5727 break;
5728 unsigned SignBits = 32 - MaxValBits + 1;
5729 bool LHSNegative = LHSKnown.isNegative();
5730 bool LHSNonNegative = LHSKnown.isNonNegative();
5731 bool LHSPositive = LHSKnown.isStrictlyPositive();
5732 bool RHSNegative = RHSKnown.isNegative();
5733 bool RHSNonNegative = RHSKnown.isNonNegative();
5734 bool RHSPositive = RHSKnown.isStrictlyPositive();
5735
5736 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5737 Known.Zero.setHighBits(SignBits);
5738 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5739 Known.One.setHighBits(SignBits);
5740 } else {
5741 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5742 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5743 unsigned MaxValBits = LHSValBits + RHSValBits;
5744 if (MaxValBits >= 32)
5745 break;
5746 Known.Zero.setBitsFrom(MaxValBits);
5747 }
5748 break;
5749 }
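// Worked example for the unsigned path above (illustrative bit counts): if
// the LHS has at most 10 active bits and the RHS at most 8, the 24-bit
// product is < 2^18, so MaxValBits = 18 and bits 18 and above are known
// zero.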
5750 case AMDGPUISD::PERM: {
5751 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5752 if (!CMask)
5753 return;
5754
5755 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5756 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5757 unsigned Sel = CMask->getZExtValue();
5758
5759 for (unsigned I = 0; I < 32; I += 8) {
5760 unsigned SelBits = Sel & 0xff;
5761 if (SelBits < 4) {
5762 SelBits *= 8;
5763 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5764 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5765 } else if (SelBits < 7) {
5766 SelBits = (SelBits & 3) * 8;
5767 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5768 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5769 } else if (SelBits == 0x0c) {
5770 Known.Zero |= 0xFFull << I;
5771 } else if (SelBits > 0x0c) {
5772 Known.One |= 0xFFull << I;
5773 }
5774 Sel >>= 8;
5775 }
5776 break;
5777 }
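// Worked example of the selector decoding above (illustrative selector):
// for Sel = 0x05040100 the result bytes are, from low to high, byte 0 and
// byte 1 of operand 1 followed by byte 0 and byte 1 of operand 0, the usual
// pack-two-low-halves pattern. A selector byte of 0x0c contributes a known
// zero byte, and larger selector bytes a known 0xff byte, matching the last
// two branches.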
5778 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5779 Known.Zero.setHighBits(24);
5780 break;
5781 }
5782 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5783 Known.Zero.setHighBits(16);
5784 break;
5785 }
5786 case AMDGPUISD::LDS: {
5787 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5788 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5789
5790 Known.Zero.setHighBits(16);
5791 Known.Zero.setLowBits(Log2(Alignment));
5792 break;
5793 }
5794 case AMDGPUISD::SMIN3:
5795 case AMDGPUISD::SMAX3:
5796 case AMDGPUISD::SMED3:
5797 case AMDGPUISD::UMIN3:
5798 case AMDGPUISD::UMAX3:
5799 case AMDGPUISD::UMED3: {
5800 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5801 if (Known2.isUnknown())
5802 break;
5803
5804 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5805 if (Known1.isUnknown())
5806 break;
5807
5808 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5809 if (Known0.isUnknown())
5810 break;
5811
5812 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5813 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5814 Known.One = Known0.One & Known1.One & Known2.One;
5815 break;
5816 }
5817 case ISD::INTRINSIC_WO_CHAIN: {
5818 unsigned IID = Op.getConstantOperandVal(0);
5819 switch (IID) {
5820 case Intrinsic::amdgcn_workitem_id_x:
5821 case Intrinsic::amdgcn_workitem_id_y:
5822 case Intrinsic::amdgcn_workitem_id_z: {
5823 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5824 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5825 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5826 break;
5827 }
5828 default:
5829 break;
5830 }
5831 }
5832 }
5833}
5834
5835 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5836 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5837 unsigned Depth) const {
5838 switch (Op.getOpcode()) {
5839 case AMDGPUISD::BFE_I32: {
5840 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5841 if (!Width)
5842 return 1;
5843
5844 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5845 if (!isNullConstant(Op.getOperand(1)))
5846 return SignBits;
5847
5848 // TODO: Could probably figure something out with non-0 offsets.
5849 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5850 return std::max(SignBits, Op0SignBits);
5851 }
5852
5853 case AMDGPUISD::BFE_U32: {
5854 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5855 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5856 }
5857
5858 case AMDGPUISD::CARRY:
5859 case AMDGPUISD::BORROW:
5860 return 31;
5861 case AMDGPUISD::BUFFER_LOAD_BYTE:
5862 return 25;
5863 case AMDGPUISD::BUFFER_LOAD_SHORT:
5864 return 17;
5865 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5866 return 24;
5867 case AMDGPUISD::BUFFER_LOAD_USHORT:
5868 return 16;
5869 case AMDGPUISD::FP_TO_FP16:
5870 return 16;
5871 case AMDGPUISD::SMIN3:
5872 case AMDGPUISD::SMAX3:
5873 case AMDGPUISD::SMED3:
5874 case AMDGPUISD::UMIN3:
5875 case AMDGPUISD::UMAX3:
5876 case AMDGPUISD::UMED3: {
5877 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5878 if (Tmp2 == 1)
5879 return 1; // Early out.
5880
5881 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5882 if (Tmp1 == 1)
5883 return 1; // Early out.
5884
5885 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5886 if (Tmp0 == 1)
5887 return 1; // Early out.
5888
5889 return std::min({Tmp0, Tmp1, Tmp2});
5890 }
5891 default:
5892 return 1;
5893 }
5894}
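// Worked example for the BFE_I32 case above (illustrative width): an
// extract of width 8 at offset 0 yields a value sign-extended from bit 7,
// so at least 32 - 8 + 1 = 25 of the 32 result bits are copies of the sign
// bit.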
5895
5896 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5897 GISelKnownBits &Analysis, Register R,
5898 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5899 unsigned Depth) const {
5900 const MachineInstr *MI = MRI.getVRegDef(R);
5901 if (!MI)
5902 return 1;
5903
5904 // TODO: Check range metadata on MMO.
5905 switch (MI->getOpcode()) {
5906 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5907 return 25;
5908 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5909 return 17;
5910 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5911 return 24;
5912 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5913 return 16;
5914 case AMDGPU::G_AMDGPU_SMED3:
5915 case AMDGPU::G_AMDGPU_UMED3: {
5916 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5917 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5918 if (Tmp2 == 1)
5919 return 1;
5920 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5921 if (Tmp1 == 1)
5922 return 1;
5923 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5924 if (Tmp0 == 1)
5925 return 1;
5926 return std::min({Tmp0, Tmp1, Tmp2});
5927 }
5928 default:
5929 return 1;
5930 }
5931}
5932
5933 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5934 const SelectionDAG &DAG,
5935 bool SNaN,
5936 unsigned Depth) const {
5937 unsigned Opcode = Op.getOpcode();
5938 switch (Opcode) {
5939 case AMDGPUISD::FMIN_LEGACY:
5940 case AMDGPUISD::FMAX_LEGACY: {
5941 if (SNaN)
5942 return true;
5943
5944 // TODO: Can check no nans on one of the operands for each one, but which
5945 // one?
5946 return false;
5947 }
5950 if (SNaN)
5951 return true;
5952 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5953 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5954 }
5955 case AMDGPUISD::FMED3:
5956 case AMDGPUISD::FMIN3:
5957 case AMDGPUISD::FMAX3:
5960 case AMDGPUISD::FMAD_FTZ: {
5961 if (SNaN)
5962 return true;
5963 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5964 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5965 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5966 }
5971 return true;
5972
5973 case AMDGPUISD::RCP:
5974 case AMDGPUISD::RSQ:
5975 case AMDGPUISD::RCP_LEGACY:
5976 case AMDGPUISD::RSQ_CLAMP: {
5977 if (SNaN)
5978 return true;
5979
5980 // TODO: Need an is-known-positive check.
5981 return false;
5982 }
5983 case ISD::FLDEXP:
5984 case AMDGPUISD::FRACT: {
5985 if (SNaN)
5986 return true;
5987 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5988 }
5992 // TODO: Refine on operands.
5993 return SNaN;
5994 case AMDGPUISD::SIN_HW:
5995 case AMDGPUISD::COS_HW: {
5996 // TODO: Need check for infinity
5997 return SNaN;
5998 }
5999 case ISD::INTRINSIC_WO_CHAIN: {
6000 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6001 // TODO: Handle more intrinsics
6002 switch (IntrinsicID) {
6003 case Intrinsic::amdgcn_cubeid:
6004 return true;
6005
6006 case Intrinsic::amdgcn_frexp_mant: {
6007 if (SNaN)
6008 return true;
6009 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6010 }
6011 case Intrinsic::amdgcn_cvt_pkrtz: {
6012 if (SNaN)
6013 return true;
6014 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6015 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6016 }
6017 case Intrinsic::amdgcn_rcp:
6018 case Intrinsic::amdgcn_rsq:
6019 case Intrinsic::amdgcn_rcp_legacy:
6020 case Intrinsic::amdgcn_rsq_legacy:
6021 case Intrinsic::amdgcn_rsq_clamp: {
6022 if (SNaN)
6023 return true;
6024
6025 // TODO: Need an is-known-positive check.
6026 return false;
6027 }
6028 case Intrinsic::amdgcn_trig_preop:
6029 case Intrinsic::amdgcn_fdot2:
6030 // TODO: Refine on operand
6031 return SNaN;
6032 case Intrinsic::amdgcn_fma_legacy:
6033 if (SNaN)
6034 return true;
6035 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6036 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6037 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6038 default:
6039 return false;
6040 }
6041 }
6042 default:
6043 return false;
6044 }
6045}
6046
6048 Register N0, Register N1) const {
6049 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6050}
unsigned const MachineRegisterInfo * MRI
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:299
#define LLVM_READONLY
Definition: Compiler.h:306
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1405
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1178
const fltSemantics & getSemantics() const
Definition: APFloat.h:1448
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1196
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1155
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1095
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:575
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:501
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:854
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:825
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:495
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:496
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:698
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:490
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:508
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:584
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:578
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
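SplitScalar and BUILD_PAIR are the usual round trip for handling 64-bit scalars on a 32-bit datapath; a minimal sketch under that assumption (hypothetical helpers, same assumed SelectionDAG context as above):

// Break an i64 value into i32 halves and stitch it back together.
static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG,
                                            const SDLoc &DL, SDValue Val) {
  return DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);
}

static SDValue joinI64(SelectionDAG &DAG, const SDLoc &DL, SDValue Lo,
                       SDValue Hi) {
  // BUILD_PAIR is the inverse of the EXTRACT_ELEMENT pair SplitScalar emits.
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}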
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
unsigned MaxStoresPerMemcpyOptSize
Maximum number of store instructions per memcpy call in functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Maximum number of store instructions per memset call in functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Maximum number of store instructions per memmove call in functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
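Hooks such as setLoadExtAction, setTruncStoreAction, and addBypassSlowDiv above are all meant to be called from a target's TargetLowering constructor. The following fragment is a hypothetical configuration sketch (illustrative types and actions, not this target's actual settings):

// Inside a hypothetical MyTargetLowering constructor:

// Extending i8 loads into i32 are not handled natively in this sketch, so
// have the legalizer expand them into a plain load plus an explicit extend.
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand);

// Truncating i32 -> i8 stores are likewise expanded.
setTruncStoreAction(MVT::i32, MVT::i8, Expand);

// Route dynamic 64-bit divides through a 32-bit sequence when the runtime
// values happen to fit in 32 bits.
addBypassSlowDiv(64, 32);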
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
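The expandUnalignedStore and allowsMisalignedMemoryAccesses entries above usually meet in a target's custom store lowering; a hedged sketch of that shape (hypothetical helper, not this file's implementation):

// Fall back to the generic split when the target reports this access as
// unsupported (or merely slow) at its actual alignment.
static SDValue lowerStoreSketch(const TargetLowering &TLI, StoreSDNode *ST,
                                SelectionDAG &DAG) {
  unsigned IsFast = 0;
  if (!TLI.allowsMisalignedMemoryAccesses(
          ST->getMemoryVT(), ST->getAddressSpace(), ST->getAlign(),
          ST->getMemOperand()->getFlags(), &IsFast) ||
      !IsFast)
    return TLI.expandUnalignedStore(ST, DAG);
  return SDValue(); // Already fine; let the common path handle it.
}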
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1613
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1593
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
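Constants like log2e typically appear when exp/log get rewritten in terms of the exp2/log2 operations hardware actually provides; a small sketch under that assumption (hypothetical helper, not this file's exact expansion):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

// exp(x) == exp2(x * log2(e)): scale the input, then use the native exp2.
static SDValue lowerExpViaExp2(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Log2E = DAG.getConstantFP(numbers::log2e, DL, VT);
  SDValue Scaled = DAG.getNode(ISD::FMUL, DL, VT, X, Log2E);
  return DAG.getNode(ISD::FEXP2, DL, VT, Scaled);
}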
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
MaybeAlign getAlign(const Function &F, unsigned Index)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
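Hi_32 and Lo_32 are the plain-integer counterparts of the DAG-level 64-bit splitting shown earlier; for example:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Split a 64-bit immediate into the halves a 32-bit ALU consumes.
static void splitImm(uint64_t Imm) {
  uint32_t Lo = llvm::Lo_32(Imm); // bits [31:0]
  uint32_t Hi = llvm::Hi_32(Imm); // bits [63:32]
  assert((((uint64_t)Hi << 32) | Lo) == Imm && "halves round-trip exactly");
  (void)Lo;
  (void)Hi;
}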
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
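Predicates like isNullConstant, isAllOnesConstant, and isConstOrConstSplat are the standard guards at the top of a DAG combine; a hypothetical fragment (same assumed SelectionDAG.h context as the earlier sketches):

// Only combine "x op C" when C is a genuine constant (or a constant splat
// for vectors) and is not one of the trivial identities.
static bool isInterestingConstantRHS(SDValue RHS) {
  if (isNullConstant(RHS) || isAllOnesConstant(RHS))
    return false; // 'x op 0' and 'x op -1' are assumed handled elsewhere.
  return isConstOrConstSplat(RHS) != nullptr;
}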
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1535
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:265
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:297
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:266
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:472
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:414
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:142
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
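A sketch of how these EVT queries chain together when widening an awkward type to something the legalizer handles (hypothetical helper; LLVMContext and integer/vector inputs are assumptions):

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;

// Round integers up to a power-of-two width (at least 8 bits) and vectors
// up to a power-of-two element count.
static EVT widenToConvenientType(LLVMContext &Ctx, EVT VT) {
  if (VT.isVector())
    return VT.isPow2VectorType() ? VT : VT.getPow2VectorType(Ctx);
  assert(VT.isInteger() && "sketch only handles integer and vector types");
  return VT.getRoundIntegerType(Ctx);
}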
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:100
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:234
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:137
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:106
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:97
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:261
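These KnownBits queries are normally fed by SelectionDAG::computeKnownBits when deciding whether a narrower operation is safe; a hedged sketch (hypothetical helper in the same assumed context):

// Could Op be treated as an unsigned 24-bit quantity? This mirrors the kind
// of query a mul24-style combine would make.
static bool fitsInUnsigned24(SelectionDAG &DAG, SDValue Op) {
  KnownBits Known = DAG.computeKnownBits(Op);
  // countMaxActiveBits() == bit width minus the known leading zeros.
  return Known.countMaxActiveBits() <= 24;
}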
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
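MachinePointerInfo::getStack usually pairs with getObjectPtrOffset when a value is written to a stack-pointer-relative slot, e.g. for stack-passed call arguments; a minimal sketch (hypothetical helper; StackPtr, Chain, the non-negative offset, and the 4-byte alignment are all assumptions):

// Store Arg at byte offset Off from the incoming stack pointer value.
static SDValue storeOutgoingArg(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue Chain, SDValue StackPtr, SDValue Arg,
                                int64_t Off) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Addr = DAG.getObjectPtrOffset(DL, StackPtr, TypeSize::getFixed(Off));
  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Off);
  return DAG.getStore(Chain, DL, Arg, Addr, DstInfo, Align(4));
}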
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...