Skip to content

Port canCombineStoreAndExtract to AArch64#194656

Open
LumioseSil wants to merge 2 commits intollvm:mainfrom
LumioseSil:i
Open

Port canCombineStoreAndExtract to AArch64#194656
LumioseSil wants to merge 2 commits intollvm:mainfrom
LumioseSil:i

Conversation

@LumioseSil
Copy link
Copy Markdown
Contributor

No description provided.

A
Revert "A"

This reverts commit 571275bf1b89f411520633d0fb1e1afbff3f9894.

Pre-commit test (NFC)
@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Apr 28, 2026

@llvm/pr-subscribers-backend-aarch64

Author: LumioseSil (LumioseSil)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/194656.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+31)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+3)
  • (added) llvm/test/CodeGen/AArch64/vector-promotion.ll (+69)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52b416fdd755b..7c5997f7cfe51 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33214,6 +33214,37 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
 }
 
+bool AArch64TargetLowering::canCombineStoreAndExtract(Type *VectorTy,
+                                                      Value *Idx,
+                                                      unsigned &Cost) const {
+  // If we do not have NEON, fixed-width vector types are not natively
+  // supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy() || VectorTy->isScalableTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;  
+}
+
 bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 58efdd3e18fc0..50d4541405d36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -894,6 +894,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldLocalize(const MachineInstr &MI,
                       const TargetTransformInfo *TTI) const override;
 
+  bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                 unsigned &Cost) const override;
+
   bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                          const APInt &OriginalDemandedBits,
                                          const APInt &OriginalDemandedElts,
diff --git a/llvm/test/CodeGen/AArch64/vector-promotion.ll b/llvm/test/CodeGen/AArch64/vector-promotion.ll
new file mode 100644
index 0000000000000..610f8ba0224e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-promotion.ll
@@ -0,0 +1,69 @@
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck --check-prefix=ASM %s
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 poison, i32 1>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM-NOT: umov
+define void @simpleOneInstructionPromotion(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-BOTH-NEXT: store i1 [[CMP]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @chainOfInstructionsToPromote
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 poison>
+; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 poison>
+; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 poison>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR3]], i32 0
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @chainOfInstructionsToPromote(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  store i32 %out3, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; Vector version:
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], splat (i32 1)
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
+; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionVariableIdx(ptr %addr1, ptr %dest, i32 %idx) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}

@github-actions
Copy link
Copy Markdown

github-actions Bot commented Apr 28, 2026

✅ With the latest revision this PR passed the C/C++ code formatter.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants