fix ARM64 tails=true build

2025-04-16 07:27:17 +00:00 · 2014-12-30 09:37:26 -07:00 · 2014-12-30 09:37:26 -07:00 · e3ea60fc31
commit e3ea60fc31
parent 76bfcaa8c0
2 changed files with 46 additions and 16 deletions
--- a/src/codegen/target/arm/assembler.cpp
+++ b/src/codegen/target/arm/assembler.cpp
@@ -130,10 +130,10 @@ void nextFrame(ArchitectureContext* con,
      unsigned shift = (*instruction >> 22) & 1;
      switch (shift) {
      case 0:
-        offset -= value;
+        offset -= value / TargetBytesPerWord;
        break;
      case 1:
-        offset -= value << 12;
+        offset -= (value << 12) / TargetBytesPerWord;
        break;
      default:
        abort(con);
@@ -769,6 +769,11 @@ class MyAssembler : public Assembler {
    // how to handle them:
    assertT(&con, footprint < 256);

+    // todo: ARM64 frame allocation should be of the form:
+    //   stp   x29, x30, [sp,#size]!
+    // and deallocation should be of the form:
+    //   ldp   x29, x30, [sp],#size
+
    lir::RegisterPair stack(StackRegister);
    ResolvedPromise footprintPromise(footprint * TargetBytesPerWord);
    lir::Constant footprintConstant(&footprintPromise);
@@ -875,10 +880,19 @@ class MyAssembler : public Assembler {
    return_(&con);
  }

-  virtual void popFrameAndUpdateStackAndReturn(unsigned frameFootprint,
+  virtual void popFrameAndUpdateStackAndReturn(unsigned footprint,
                                               unsigned stackOffsetFromThread)
  {
-    popFrame(frameFootprint);
+    footprint += FrameHeaderSize;
+
+    lir::RegisterPair returnAddress(LinkRegister);
+    lir::Memory returnAddressSrc(StackRegister,
+                                 (footprint - 1) * TargetBytesPerWord);
+    moveMR(&con,
+           TargetBytesPerWord,
+           &returnAddressSrc,
+           TargetBytesPerWord,
+           &returnAddress);

    lir::RegisterPair stack(StackRegister);
    lir::Memory newStackSrc(ThreadRegister, stackOffsetFromThread);
--- a/src/codegen/target/arm/operations64.cpp
+++ b/src/codegen/target/arm/operations64.cpp
@@ -139,8 +139,8 @@ uint32_t addi(Register Rd, Register Rn, int value, int shift, unsigned size)

 uint32_t mov(Register Rd, Register Rn, unsigned size)
 {
-  return Rn.index() == 31 ? addi(Rd, Rn, 0, 0, size)
-                          : orr(Rd, Register(31), Rn, size);
+  return Rn.index() == 31 or Rd.index() == 31 ? addi(Rd, Rn, 0, 0, size)
+    : orr(Rd, Register(31), Rn, size);
 }

 uint32_t movz(Register Rd, int value, unsigned shift, unsigned size)
@@ -653,6 +653,10 @@ void moveCR2(Context* c,
    moveRR(c, size, &tmp, size, dst);
    c->client->releaseTemporary(tmp.low);
  } else if (callOffset == 0 and src->value->resolved()) {
+    // todo: Is it better performance-wise to load using immediate
+    // moves or via a PC-relative constant pool?  Does it depend on
+    // how many significant bits there are?
+
    int64_t value = src->value->value();
    if (value >= 0) {
      append(c, movz(dst->low, value & 0xFFFF, 0, size));
@@ -1195,16 +1199,28 @@ void moveMR(Context* c,
            unsigned dstSize,
            lir::RegisterPair* dst)
 {
-  load(c,
-       srcSize,
-       src->base,
-       src->offset,
-       src->index,
-       src->scale,
-       dstSize,
-       dst,
-       true,
-       true);
+  if (dst->low.index() == 31) {
+    assertT(c, c->client == 0);  // the compiler should never ask us to
+                                 // load the SP; we'll only get here
+                                 // when assembling a thunk
+
+    lir::RegisterPair tmp(Register(9));  // we're in a thunk, so we can
+                                         // clobber this
+
+    load(c, srcSize, src->base, src->offset, src->index, src->scale, dstSize, &tmp, true, true);
+    moveRR(c, dstSize, &tmp, dstSize, dst);
+  } else {
+    load(c,
+         srcSize,
+         src->base,
+         src->offset,
+         src->index,
+         src->scale,
+         dstSize,
+         dst,
+         true,
+         true);
+  }
 }

 void moveZMR(Context* c,