Merge pull request #383 from dicej/master

add ARM64 JIT support
Joshua Warner 2014-12-30 15:56:26 -07:00
commit 7b1bdf127e
14 changed files with 2070 additions and 241 deletions

View File

@ -38,7 +38,7 @@ public class Classes {
public static native VMClass primitiveClass(char name);
public static native void initialize(VMClass vmClass);
public static native boolean isAssignableFrom(VMClass a, VMClass b);
public static native VMClass getVMClass(Object o);
@ -134,7 +134,7 @@ public class Classes {
array[i] = parseAnnotationValue(loader, pool, in);
}
return array;
}
}
default: throw new AssertionError();
}
@ -207,7 +207,7 @@ public class Classes {
while (spec[end] != ';') ++ end;
++ end;
break;
default:
++ end;
}
@ -295,9 +295,9 @@ public class Classes {
}
Class c = loader.loadClass(name);
VMClass vmc = SystemClassLoader.vmClass(c);
Classes.link(vmc, loader);
link(vmc, loader);
if (initialize) {
Classes.initialize(vmc);
initialize(vmc);
}
return c;
}
@ -315,7 +315,7 @@ public class Classes {
} else {
if (name.length() == 1) {
return SystemClassLoader.getClass
(Classes.primitiveClass(name.charAt(0)));
(primitiveClass(name.charAt(0)));
} else {
throw new ClassNotFoundException(name);
}
@ -378,7 +378,7 @@ public class Classes {
public static int findField(VMClass vmClass, String name) {
if (vmClass.fieldTable != null) {
Classes.link(vmClass);
link(vmClass);
for (int i = 0; i < vmClass.fieldTable.length; ++i) {
if (toString(vmClass.fieldTable[i].name).equals(name)) {
@ -426,7 +426,7 @@ public class Classes {
{
VMMethod[] methodTable = vmClass.methodTable;
if (methodTable != null) {
Classes.link(vmClass);
link(vmClass);
if (parameterTypes == null) {
parameterTypes = new Class[0];
@ -464,7 +464,7 @@ public class Classes {
Method[] array = new Method[countMethods(vmClass, publicOnly)];
VMMethod[] methodTable = vmClass.methodTable;
if (methodTable != null) {
Classes.link(vmClass);
link(vmClass);
int ai = 0;
for (int i = 0, j = declaredMethodCount(vmClass); i < j; ++i) {
@ -498,7 +498,7 @@ public class Classes {
public static Field[] getFields(VMClass vmClass, boolean publicOnly) {
Field[] array = new Field[countFields(vmClass, publicOnly)];
if (vmClass.fieldTable != null) {
Classes.link(vmClass);
link(vmClass);
int ai = 0;
for (int i = 0; i < vmClass.fieldTable.length; ++i) {
@ -568,9 +568,9 @@ public class Classes {
return new ProtectionDomain(source, p);
}
public static native Method makeMethod(Class c, int slot);
public static native Field makeField(Class c, int slot);
private static native void acquireClassLock();

View File

@ -28,8 +28,6 @@ namespace codegen {
class Assembler;
class RegisterFile;
class OperandMask {
public:
uint8_t typeMask;

View File

@ -35,6 +35,7 @@ GLOBAL(vmNativeCall):
// allocate frame
stp x29, x30, [sp,#-64]!
mov x29, sp
// save callee-saved register values so we can clobber them
stp x19, x20, [sp,#16]
@ -118,6 +119,7 @@ GLOBAL(vmRun):
// allocate frame
stp x29, x30, [sp,#-96]!
mov x29, sp
// save callee-saved register values
stp x19, x20, [sp,#16]

View File

@ -123,7 +123,7 @@ class Site {
virtual RegisterMask registerMask(Context*)
{
return 0;
return RegisterMask(0);
}
virtual bool isVolatile(Context*)

View File

@ -4,5 +4,6 @@ add_library(avian_codegen_arm
context.cpp
fixup.cpp
multimethod.cpp
operations.cpp
operations32.cpp
operations64.cpp
)

View File

@ -39,7 +39,7 @@ namespace isa {
bool vfpSupported()
{
// TODO: Use runtime detection
#if defined(__ARM_PCS_VFP)
#if (defined __ARM_PCS_VFP) || (defined ARCH_arm64)
// armhf
return true;
#else
@ -55,9 +55,9 @@ bool vfpSupported()
const RegisterFile MyRegisterFileWithoutFloats(GPR_MASK, 0);
const RegisterFile MyRegisterFileWithFloats(GPR_MASK, FPR_MASK);
const unsigned FrameHeaderSize = 1;
const unsigned FrameHeaderSize = TargetBytesPerWord / 4;
const unsigned StackAlignmentInBytes = 8;
const unsigned StackAlignmentInBytes = TargetBytesPerWord * 2;
const unsigned StackAlignmentInWords = StackAlignmentInBytes
/ TargetBytesPerWord;
@ -89,11 +89,11 @@ void nextFrame(ArchitectureContext* con,
void** stack)
{
assertT(con, *ip >= start);
assertT(con, *ip <= start + (size / TargetBytesPerWord));
assertT(con, *ip <= start + (size / 4));
uint32_t* instruction = static_cast<uint32_t*>(*ip);
if ((*start >> 20) == 0xe59) {
if ((*start >> 20) == (TargetBytesPerWord == 8 ? 0xf94 : 0xe59)) {
// skip stack overflow check
start += 3;
}
@ -111,7 +111,8 @@ void nextFrame(ArchitectureContext* con,
return;
}
if (*instruction == 0xe12fff1e) { // return
if (*instruction == (TargetBytesPerWord == 8 ? 0xd61f03c0 : 0xe12fff1e)) {
// return
*ip = link;
return;
}
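
For reference, a small stand-alone sketch (plain C++ with an assumed helper name, not part of the patch) of the two return encodings this check distinguishes: A32 "bx lr" assembles to 0xe12fff1e and the A64 equivalent "br x30" to 0xd61f03c0:

#include <cstdint>

// Classify the word at the return site the same way nextFrame does.
inline bool isReturnInstruction(uint32_t insn, unsigned targetBytesPerWord)
{
  return insn == (targetBytesPerWord == 8 ? 0xd61f03c0u   // A64: br x30
                                          : 0xe12fff1eu); // A32: bx lr
}
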
@ -124,7 +125,20 @@ void nextFrame(ArchitectureContext* con,
// check for post-non-tail-call stack adjustment of the form "sub
// sp, sp, #offset":
if ((*instruction >> 12) == 0xe24dd) {
if (TargetBytesPerWord == 8 and (*instruction & 0xff0003ff) == 0xd10003ff) {
unsigned value = (*instruction >> 10) & 0xfff;
unsigned shift = (*instruction >> 22) & 1;
switch (shift) {
case 0:
offset -= value / TargetBytesPerWord;
break;
case 1:
offset -= (value << 12) / TargetBytesPerWord;
break;
default:
abort(con);
}
} else if (TargetBytesPerWord == 4 and (*instruction >> 12) == 0xe24dd) {
unsigned value = *instruction & 0xff;
unsigned rotation = (*instruction >> 8) & 0xf;
switch (rotation) {
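
For the A64 case a few lines above, a stand-alone sketch (illustrative C++, not part of the patch) of the SUB (immediate) layout it decodes: imm12 sits in bits 10..21, bit 22 selects an optional LSL #12, and Rn = Rd = 31 (sp) identifies the stack adjustment:

#include <cstdint>

// Returns the byte count subtracted from sp, or -1 if the word is not
// "sub sp, sp, #imm{, lsl #12}".
inline int64_t a64StackAdjustment(uint32_t insn)
{
  if ((insn & 0xff0003ff) != 0xd10003ff) {  // SUB (immediate), Rn = Rd = sp
    return -1;
  }
  uint32_t imm12 = (insn >> 10) & 0xfff;
  uint32_t shift = (insn >> 22) & 1;        // 1 means the immediate is LSL #12
  return shift ? (static_cast<int64_t>(imm12) << 12) : imm12;
}
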
@ -218,6 +232,7 @@ class MyArchitecture : public Architecture {
{
switch (register_.index()) {
case LinkRegister.index():
case FrameRegister.index():
case StackRegister.index():
case ThreadRegister.index():
case ProgramCounter.index():
@ -258,7 +273,7 @@ class MyArchitecture : public Architecture {
virtual unsigned argumentRegisterCount()
{
return 4;
return TargetBytesPerWord;
}
virtual Register argumentRegister(unsigned index)
@ -306,8 +321,13 @@ class MyArchitecture : public Architecture {
case lir::AlignedLongCall:
case lir::AlignedLongJump: {
uint32_t* p = static_cast<uint32_t*>(returnAddress) - 2;
*reinterpret_cast<void**>(p + (((*p & PoolOffsetMask) + 8) / 4))
= newTarget;
if (TargetBytesPerWord == 8) {
const int32_t mask = (PoolOffsetMask >> 2) << 5;
*reinterpret_cast<void**>(p + ((*p & mask) >> 5)) = newTarget;
} else {
*reinterpret_cast<void**>(p + (((*p & PoolOffsetMask) + 8) / 4))
= newTarget;
}
} break;
default:
@ -434,11 +454,11 @@ class MyArchitecture : public Architecture {
break;
case lir::Float2Int:
// todo: Java requires different semantics than SSE for
// todo: Java requires different semantics than VFP for
// converting floats to integers, so we need to either use
// thunks or produce inline machine code which handles edge
// cases properly.
if (false && vfpSupported() && bSize == 4) {
if (false && vfpSupported() && bSize <= TargetBytesPerWord) {
aMask.typeMask = lir::Operand::RegisterPairMask;
aMask.setLowHighRegisterMasks(FPR_MASK, FPR_MASK);
} else {
@ -447,7 +467,7 @@ class MyArchitecture : public Architecture {
break;
case lir::Int2Float:
if (vfpSupported() && aSize == 4) {
if (vfpSupported() && aSize <= TargetBytesPerWord) {
aMask.typeMask = lir::Operand::RegisterPairMask;
aMask.setLowHighRegisterMasks(GPR_MASK, GPR_MASK);
} else {
@ -544,7 +564,7 @@ class MyArchitecture : public Architecture {
case lir::ShiftLeft:
case lir::ShiftRight:
case lir::UnsignedShiftRight:
if (bSize == 8)
if (bSize > TargetBytesPerWord)
aMask.typeMask = bMask.typeMask = lir::Operand::RegisterPairMask;
break;
@ -556,6 +576,11 @@ class MyArchitecture : public Architecture {
aMask.typeMask = bMask.typeMask = lir::Operand::RegisterPairMask;
break;
// todo: Although ARM has instructions for integer division and
// remainder, they don't trap on division by zero, which is why
// we use thunks. Alternatively, we could generate inline code
// with an explicit zero check, which would probably be a bit
// faster.
case lir::Divide:
case lir::Remainder:
case lir::FloatRemainder:
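
As the comment above notes, Divide and Remainder are routed through thunks because ARM's integer divide instructions quietly produce 0 on a zero divisor instead of trapping. A minimal stand-alone sketch (plain C++; the name and exception mapping are illustrative, not the VM's actual thunk) of the behavior such a thunk has to provide:

#include <cstdint>
#include <stdexcept>

inline int64_t checkedDivide(int64_t a, int64_t b)
{
  if (b == 0) {
    // the JIT relies on the thunk to raise java.lang.ArithmeticException here
    throw std::runtime_error("/ by zero");
  }
  return a / b;
}
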
@ -567,7 +592,7 @@ class MyArchitecture : public Architecture {
case lir::FloatMultiply:
case lir::FloatDivide:
if (vfpSupported()) {
bMask.typeMask = lir::Operand::RegisterPairMask;
aMask.typeMask = lir::Operand::RegisterPairMask;
aMask.setLowHighRegisterMasks(FPR_MASK, FPR_MASK);
bMask = aMask;
} else {
@ -745,19 +770,45 @@ class MyAssembler : public Assembler {
// how to handle them:
assertT(&con, footprint < 256);
lir::RegisterPair stack(StackRegister);
ResolvedPromise footprintPromise(footprint * TargetBytesPerWord);
lir::Constant footprintConstant(&footprintPromise);
subC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack);
// todo: the ARM ABI says the frame preamble should be of the form
//
// stp x29, x30, [sp,#-footprint]!
// mov x29, sp
//
// and the frame should be popped with e.g.
//
// ldp x29, x30, [sp],#footprint
// br x30
//
// However, that will invalidate a lot of assumptions elsewhere
// about the return address being stored at the opposite end of
// the frame, so lots of other code will need to change before we
// can do that. The code below can be enabled as a starting point
// when we're ready to tackle that.
if (false and TargetBytesPerWord == 8) {
// stp x29, x30, [sp,#-footprint]!
con.code.append4(0xa9800000 | ((-footprint & 0x7f) << 15)
| (StackRegister.index() << 5)
| (LinkRegister.index() << 10) | FrameRegister.index());
lir::RegisterPair returnAddress(LinkRegister);
lir::Memory returnAddressDst(StackRegister,
(footprint - 1) * TargetBytesPerWord);
moveRM(&con,
TargetBytesPerWord,
&returnAddress,
TargetBytesPerWord,
&returnAddressDst);
lir::RegisterPair stack(StackRegister);
lir::RegisterPair frame(FrameRegister);
moveRR(&con, TargetBytesPerWord, &stack, TargetBytesPerWord, &frame);
} else {
lir::RegisterPair stack(StackRegister);
ResolvedPromise footprintPromise(footprint * TargetBytesPerWord);
lir::Constant footprintConstant(&footprintPromise);
subC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack);
lir::RegisterPair returnAddress(LinkRegister);
lir::Memory returnAddressDst(StackRegister,
(footprint - 1) * TargetBytesPerWord);
moveRM(&con,
TargetBytesPerWord,
&returnAddress,
TargetBytesPerWord,
&returnAddressDst);
}
}
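
The raw words appended above follow the A64 load/store-pair encodings. A stand-alone sketch (assumed helper names, not part of the patch) of the field layout they rely on: a signed 7-bit immediate, scaled by 8 bytes for X registers, at bits 15..21, with Rt2 at bits 10..14, Rn at bits 5..9, and Rt at bits 0..4:

#include <cstdint>

// stp rt, rt2, [rn, #imm7*8]!  (pre-indexed, 64-bit registers)
inline uint32_t stpPreIndex(int imm7, int rn, int rt2, int rt)
{
  return 0xa9800000 | ((imm7 & 0x7f) << 15) | (rt2 << 10) | (rn << 5) | rt;
}

// ldp rt, rt2, [rn], #imm7*8   (post-indexed, 64-bit registers)
inline uint32_t ldpPostIndex(int imm7, int rn, int rt2, int rt)
{
  return 0xa8c00000 | ((imm7 & 0x7f) << 15) | (rt2 << 10) | (rn << 5) | rt;
}

With imm7 = -footprint, rn = 31 (sp), rt2 = 30 (x30), and rt = 29 (x29), stpPreIndex reproduces the word built in allocateFrame, and ldpPostIndex with a positive footprint reproduces the one used in popFrame below.
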
virtual void adjustFrame(unsigned difference)
@ -772,19 +823,26 @@ class MyAssembler : public Assembler {
{
footprint += FrameHeaderSize;
lir::RegisterPair returnAddress(LinkRegister);
lir::Memory returnAddressSrc(StackRegister,
(footprint - 1) * TargetBytesPerWord);
moveMR(&con,
TargetBytesPerWord,
&returnAddressSrc,
TargetBytesPerWord,
&returnAddress);
// see comment regarding the ARM64 ABI in allocateFrame
if (false and TargetBytesPerWord == 8) {
// ldp x29, x30, [sp],#footprint
con.code.append4(0xa8c00000 | (footprint << 15) | (31 << 5) | (30 << 10)
| 29);
} else {
lir::RegisterPair returnAddress(LinkRegister);
lir::Memory returnAddressSrc(StackRegister,
(footprint - 1) * TargetBytesPerWord);
moveMR(&con,
TargetBytesPerWord,
&returnAddressSrc,
TargetBytesPerWord,
&returnAddress);
lir::RegisterPair stack(StackRegister);
ResolvedPromise footprintPromise(footprint * TargetBytesPerWord);
lir::Constant footprintConstant(&footprintPromise);
addC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack);
lir::RegisterPair stack(StackRegister);
ResolvedPromise footprintPromise(footprint * TargetBytesPerWord);
lir::Constant footprintConstant(&footprintPromise);
addC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack);
}
}
virtual void popFrameForTailCall(unsigned footprint,
@ -851,10 +909,26 @@ class MyAssembler : public Assembler {
return_(&con);
}
virtual void popFrameAndUpdateStackAndReturn(unsigned frameFootprint,
virtual void popFrameAndUpdateStackAndReturn(unsigned footprint,
unsigned stackOffsetFromThread)
{
popFrame(frameFootprint);
footprint += FrameHeaderSize;
// see comment regarding the ARM64 ABI in allocateFrame
if (false and TargetBytesPerWord == 8) {
// ldp x29, x30, [sp],#footprint
con.code.append4(0xa8c00000 | (footprint << 15) | (31 << 5) | (30 << 10)
| 29);
} else {
lir::RegisterPair returnAddress(LinkRegister);
lir::Memory returnAddressSrc(StackRegister,
(footprint - 1) * TargetBytesPerWord);
moveMR(&con,
TargetBytesPerWord,
&returnAddressSrc,
TargetBytesPerWord,
&returnAddress);
}
lir::RegisterPair stack(StackRegister);
lir::Memory newStackSrc(ThreadRegister, stackOffsetFromThread);
@ -946,17 +1020,28 @@ class MyAssembler : public Assembler {
unsigned instruction = o->block->start + padding(o->block, o->offset)
+ o->offset;
int32_t v = (entry - 8) - instruction;
expect(&con, v == (v & PoolOffsetMask));
int32_t* p = reinterpret_cast<int32_t*>(dst + instruction);
*p = (v & PoolOffsetMask) | ((~PoolOffsetMask) & *p);
if (TargetBytesPerWord == 8) {
int32_t v = entry - instruction;
expect(&con, v == (v & PoolOffsetMask));
const int32_t mask = (PoolOffsetMask >> 2) << 5;
*p = (((v >> 2) << 5) & mask) | ((~mask) & *p);
} else {
int32_t v = (entry - 8) - instruction;
expect(&con, v == (v & PoolOffsetMask));
*p = (v & PoolOffsetMask) | ((~PoolOffsetMask) & *p);
}
poolSize += TargetBytesPerWord;
}
bool jump = needJump(b);
if (jump) {
expect(&con, TargetBytesPerWord == 4);
write4(dst + dstOffset,
isa::b((poolSize + TargetBytesPerWord - 8) >> 2));
}

View File

@ -12,6 +12,12 @@
#include "fixup.h"
#include "block.h"
namespace {
const unsigned InstructionSize = 4;
} // namespace
namespace avian {
namespace codegen {
namespace arm {
@ -38,8 +44,7 @@ int64_t OffsetPromise::value()
assertT(con, resolved());
unsigned o = offset - block->offset;
return block->start
+ padding(block, forTrace ? o - vm::TargetBytesPerWord : o) + o;
return block->start + padding(block, forTrace ? o - InstructionSize : o) + o;
}
Promise* offsetPromise(Context* con, bool forTrace)
@ -92,17 +97,30 @@ bool bounded(int right, int left, int32_t v)
void* updateOffset(vm::System* s, uint8_t* instruction, int64_t value)
{
// On 32-bit ARM the PC reads two words (8 bytes) ahead; on both A32 and
// A64, branch offsets drop the bottom 2 bits.
int32_t v = (reinterpret_cast<uint8_t*>(value) - (instruction + 8)) >> 2;
int32_t mask;
expect(s, bounded(0, 8, v));
mask = 0xFFFFFF;
int32_t* p = reinterpret_cast<int32_t*>(instruction);
int32_t v;
int32_t mask;
if (vm::TargetBytesPerWord == 8) {
if ((*p >> 24) == 0x54) {
// conditional branch
v = ((reinterpret_cast<uint8_t*>(value) - instruction) >> 2) << 5;
mask = 0xFFFFE0;
} else {
// unconditional branch
v = (reinterpret_cast<uint8_t*>(value) - instruction) >> 2;
mask = 0x3FFFFFF;
}
} else {
v = (reinterpret_cast<uint8_t*>(value) - (instruction + 8)) >> 2;
mask = 0xFFFFFF;
}
expect(s, bounded(0, 8, v));
*p = (v & mask) | ((~mask) & *p);
return instruction + 4;
return instruction + InstructionSize;
}
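
A compact sketch (plain C++, assumed helper name, not part of the patch) of the three branch layouts updateOffset distinguishes above: A64 B.cond keeps a 19-bit word offset in bits 5..23, A64 B keeps a 26-bit word offset in bits 0..25, and A32 B/BL keeps a 24-bit word offset measured from pc+8:

#include <cstdint>

// Mask of the branch-offset field for the given instruction word.
inline uint32_t branchOffsetMask(uint32_t insn, bool arm64)
{
  if (arm64) {
    return ((insn >> 24) == 0x54) ? 0xFFFFE0u    // B.cond: imm19 at bits 5..23
                                  : 0x3FFFFFFu;  // B: imm26 at bits 0..25
  }
  return 0xFFFFFFu;                              // A32 B/BL: imm24 at bits 0..23
}
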
ConstantPoolEntry::ConstantPoolEntry(Context* con,
@ -214,6 +232,101 @@ void appendPoolEvent(Context* con,
b->poolEventTail = e;
}
bool needJump(MyBlock* b)
{
return b->next or b->size != (b->size & PoolOffsetMask);
}
unsigned padding(MyBlock* b, unsigned offset)
{
unsigned total = 0;
for (PoolEvent* e = b->poolEventHead; e; e = e->next) {
if (e->offset <= offset) {
if (needJump(b)) {
total += vm::TargetBytesPerWord;
}
for (PoolOffset* o = e->poolOffsetHead; o; o = o->next) {
total += vm::TargetBytesPerWord;
}
} else {
break;
}
}
return total;
}
void resolve(MyBlock* b)
{
Context* con = b->context;
if (b->poolOffsetHead) {
if (con->poolOffsetTail) {
con->poolOffsetTail->next = b->poolOffsetHead;
} else {
con->poolOffsetHead = b->poolOffsetHead;
}
con->poolOffsetTail = b->poolOffsetTail;
}
if (con->poolOffsetHead) {
bool append;
if (b->next == 0 or b->next->poolEventHead) {
append = true;
} else {
int32_t v
= (b->start + b->size + b->next->size + vm::TargetBytesPerWord - 8)
- (con->poolOffsetHead->offset + con->poolOffsetHead->block->start);
append = (v != (v & PoolOffsetMask));
if (DebugPool) {
fprintf(stderr,
"current %p %d %d next %p %d %d\n",
b,
b->start,
b->size,
b->next,
b->start + b->size,
b->next->size);
fprintf(stderr,
"offset %p %d is of distance %d to next block; append? %d\n",
con->poolOffsetHead,
con->poolOffsetHead->offset,
v,
append);
}
}
if (append) {
#ifndef NDEBUG
int32_t v
= (b->start + b->size - 8)
- (con->poolOffsetHead->offset + con->poolOffsetHead->block->start);
expect(con, v == (v & PoolOffsetMask));
#endif // not NDEBUG
appendPoolEvent(
con, b, b->size, con->poolOffsetHead, con->poolOffsetTail);
if (DebugPool) {
for (PoolOffset* o = con->poolOffsetHead; o; o = o->next) {
fprintf(stderr,
"include %p %d in pool event %p at offset %d in block %p\n",
o,
o->offset,
b->poolEventTail,
b->size,
b);
}
}
con->poolOffsetHead = 0;
con->poolOffsetTail = 0;
}
}
}
} // namespace arm
} // namespace codegen
} // namespace avian

View File

@ -27,7 +27,7 @@ namespace arm {
const bool DebugPool = false;
const int32_t PoolOffsetMask = 0xFFF;
const int32_t PoolOffsetMask = vm::TargetBytesPerWord == 8 ? 0x1FFFFF : 0xFFF;
class Task {
public:

View File

@ -15,6 +15,8 @@
#include "fixup.h"
#include "multimethod.h"
#if TARGET_BYTES_PER_WORD == 4
namespace avian {
namespace codegen {
namespace arm {
@ -179,101 +181,6 @@ void unsignedShiftRightC(Context* con,
}
}
bool needJump(MyBlock* b)
{
return b->next or b->size != (b->size & PoolOffsetMask);
}
unsigned padding(MyBlock* b, unsigned offset)
{
unsigned total = 0;
for (PoolEvent* e = b->poolEventHead; e; e = e->next) {
if (e->offset <= offset) {
if (needJump(b)) {
total += vm::TargetBytesPerWord;
}
for (PoolOffset* o = e->poolOffsetHead; o; o = o->next) {
total += vm::TargetBytesPerWord;
}
} else {
break;
}
}
return total;
}
void resolve(MyBlock* b)
{
Context* con = b->context;
if (b->poolOffsetHead) {
if (con->poolOffsetTail) {
con->poolOffsetTail->next = b->poolOffsetHead;
} else {
con->poolOffsetHead = b->poolOffsetHead;
}
con->poolOffsetTail = b->poolOffsetTail;
}
if (con->poolOffsetHead) {
bool append;
if (b->next == 0 or b->next->poolEventHead) {
append = true;
} else {
int32_t v
= (b->start + b->size + b->next->size + vm::TargetBytesPerWord - 8)
- (con->poolOffsetHead->offset + con->poolOffsetHead->block->start);
append = (v != (v & PoolOffsetMask));
if (DebugPool) {
fprintf(stderr,
"current %p %d %d next %p %d %d\n",
b,
b->start,
b->size,
b->next,
b->start + b->size,
b->next->size);
fprintf(stderr,
"offset %p %d is of distance %d to next block; append? %d\n",
con->poolOffsetHead,
con->poolOffsetHead->offset,
v,
append);
}
}
if (append) {
#ifndef NDEBUG
int32_t v
= (b->start + b->size - 8)
- (con->poolOffsetHead->offset + con->poolOffsetHead->block->start);
expect(con, v == (v & PoolOffsetMask));
#endif // not NDEBUG
appendPoolEvent(
con, b, b->size, con->poolOffsetHead, con->poolOffsetTail);
if (DebugPool) {
for (PoolOffset* o = con->poolOffsetHead; o; o = o->next) {
fprintf(stderr,
"include %p %d in pool event %p at offset %d in block %p\n",
o,
o->offset,
b->poolEventTail,
b->size,
b);
}
}
con->poolOffsetHead = 0;
con->poolOffsetTail = 0;
}
}
}
void jumpR(Context* con, unsigned size UNUSED, lir::RegisterPair* target)
{
assertT(con, size == vm::TargetBytesPerWord);
@ -410,7 +317,8 @@ void moveCR2(Context* con,
lir::RegisterPair dstHi(dst->high);
moveCR(con, 4, &srcLo, 4, dst);
moveCR(con, 4, &srcHi, 4, &dstHi);
} else if (src->value->resolved() and isOfWidth(getValue(src), 8)) {
} else if (callOffset == 0 and src->value->resolved()
and isOfWidth(getValue(src), 8)) {
emit(con, movi(dst->low, lo8(getValue(src)))); // fits in immediate
} else {
appendConstantPoolEntry(con, src->value, callOffset);
@ -510,9 +418,9 @@ void multiplyR(Context* con,
if (size == 8) {
bool useTemporaries = b->low == t->low;
Register tmpLow = useTemporaries ? con->client->acquireTemporary(GPR_MASK)
: t->low;
: t->low;
Register tmpHigh = useTemporaries ? con->client->acquireTemporary(GPR_MASK)
: t->high;
: t->high;
emit(con, umull(tmpLow, tmpHigh, a->low, b->low));
emit(con, mla(tmpHigh, a->low, b->high, tmpHigh));
@ -665,11 +573,11 @@ void floatDivideR(Context* con,
}
Register normalize(Context* con,
int offset,
Register index,
unsigned scale,
bool* preserveIndex,
bool* release)
int offset,
Register index,
unsigned scale,
bool* preserveIndex,
bool* release)
{
if (offset != 0 or scale != 1) {
lir::RegisterPair normalizedIndex(
@ -947,26 +855,8 @@ void load(Context* con,
case 8: {
if (dstSize == 8) {
lir::RegisterPair dstHigh(dst->high);
load(con,
4,
base,
offset,
NoRegister,
1,
4,
&dstHigh,
false,
false);
load(con,
4,
base,
offset + 4,
NoRegister,
1,
4,
dst,
false,
false);
load(con, 4, base, offset, NoRegister, 1, 4, &dstHigh, false, false);
load(con, 4, base, offset + 4, NoRegister, 1, 4, dst, false, false);
} else {
emit(con, ldri(dst->low, base, offset));
}
@ -1496,15 +1386,26 @@ void longCallC(Context* con, unsigned size UNUSED, lir::Constant* target)
callR(con, vm::TargetBytesPerWord, &tmp);
}
void alignedLongCallC(Context* con, unsigned size, lir::Constant* target)
{
longCallC(con, size, target);
}
void longJumpC(Context* con, unsigned size UNUSED, lir::Constant* target)
{
assertT(con, size == vm::TargetBytesPerWord);
lir::RegisterPair tmp(Register(4)); // a non-arg reg that we don't mind clobbering
lir::RegisterPair tmp(
Register(4)); // a non-arg reg that we don't mind clobbering
moveCR2(con, vm::TargetBytesPerWord, target, &tmp, offsetPromise(con));
jumpR(con, vm::TargetBytesPerWord, &tmp);
}
void alignedLongJumpC(Context* con, unsigned size, lir::Constant* target)
{
longJumpC(con, size, target);
}
void jumpC(Context* con, unsigned size UNUSED, lir::Constant* target)
{
assertT(con, size == vm::TargetBytesPerWord);
@ -1554,3 +1455,5 @@ void storeLoadBarrier(Context* con)
} // namespace arm
} // namespace codegen
} // namespace avian
#endif // TARGET_BYTES_PER_WORD == 4

File diff suppressed because it is too large

View File

@ -14,6 +14,8 @@
#include <avian/codegen/lir.h>
#include <avian/codegen/assembler.h>
#include "avian/environment.h"
namespace avian {
namespace codegen {
namespace arm {
@ -21,16 +23,30 @@ namespace arm {
const uint64_t MASK_LO32 = 0xffffffff;
const unsigned MASK_LO8 = 0xff;
#if TARGET_BYTES_PER_WORD == 8
constexpr Register ThreadRegister(19);
constexpr Register StackRegister(31);
constexpr Register LinkRegister(30);
constexpr Register FrameRegister(29);
constexpr Register ProgramCounter(0xFE); // i.e. unaddressable
const int N_GPRS = 32;
const int N_FPRS = 32;
const RegisterMask GPR_MASK = 0xffffffff;
const RegisterMask FPR_MASK = 0xffffffff00000000;
#else
constexpr Register ThreadRegister(8);
constexpr Register StackRegister(13);
constexpr Register LinkRegister(14);
constexpr Register FrameRegister(0xFE); // i.e. there is none
constexpr Register ProgramCounter(15);
const int N_GPRS = 16;
const int N_FPRS = 16;
const RegisterMask GPR_MASK = 0xffff;
const RegisterMask FPR_MASK = 0xffff0000;
inline bool isFpr(lir::RegisterPair* reg)
{
return reg->low.index() >= N_GPRS;
}
inline int fpr64(Register reg)
{
return reg.index() - N_GPRS;
@ -47,19 +63,13 @@ inline int fpr32(lir::RegisterPair* reg)
{
return fpr64(reg) << 1;
}
#ifdef ARCH_arm64
constexpr Register ThreadRegister(19);
constexpr Register StackRegister(31);
constexpr Register LinkRegister(30);
constexpr Register ProgramCounter(0xFE); // i.e. unaddressable
#else
constexpr Register ThreadRegister(8);
constexpr Register StackRegister(13);
constexpr Register LinkRegister(14);
constexpr Register ProgramCounter(15);
#endif
inline bool isFpr(lir::RegisterPair* reg)
{
return reg->low.index() >= N_GPRS;
}
} // namespace arm
} // namespace codegen
} // namespace avian
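
As a quick orientation on the constants in this header (a stand-alone sketch with assumed names, not part of the patch): a single index space covers both register files, with general-purpose registers first and floating-point registers after them, which is why GPR_MASK and FPR_MASK split the 64-bit RegisterMask into a low half and a high half (32/32 on ARM64, 16/16 on 32-bit ARM).

// Illustrative ARM64 numbers mirroring the header: x0..x30 and sp occupy
// indices 0..31, v0..v31 occupy indices 32..63.
constexpr int kGprs = 32;

constexpr bool isFloatRegister(int index)
{
  return index >= kGprs;      // same test as isFpr() above
}

constexpr int vectorRegisterNumber(int index)
{
  return index - kGprs;       // same mapping as fpr64() above
}
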

View File

@ -16,11 +16,11 @@
#define BYTES_PER_WORD 4
#define LOCAL(x) .L##x
#ifdef __APPLE__
# define GLOBAL(x) _##x
#else
# define GLOBAL(x) x
# define GLOBAL(x) x
#endif
#define CONTINUATION_NEXT 4
@ -29,7 +29,7 @@
#define CONTINUATION_FRAME_POINTER_OFFSET 24
#define CONTINUATION_LENGTH 28
#define CONTINUATION_BODY 32
.globl GLOBAL(vmInvoke)
.align 2
GLOBAL(vmInvoke):
@ -56,7 +56,7 @@ GLOBAL(vmInvoke):
eor r4, sp, r3
tst r4, #4
subne sp, sp, #4
// copy arguments into place
sub sp, r3
mov r4, #0
@ -87,7 +87,7 @@ LOCAL(vmInvoke_argumentTest):
GLOBAL(vmInvoke_returnAddress):
// restore stack pointer
ldr sp, [r8, #TARGET_THREAD_SCRATCH]
// clear MyThread::stack to avoid confusing another thread calling
// java.lang.Thread.getStackTrace on this one. See
// MyProcess::getStackTrace in compile.cpp for details on how we get
@ -109,7 +109,7 @@ GLOBAL(vmInvoke_safeStack):
ldr r6,[r5,#CONTINUATION_LENGTH]
lsl r6,r6,#2
neg r7,r6
add r7,r7,#-80
add r7,r7,#-80 // 80 bytes for callee-saved register values
mov r4,sp
str r4,[sp,r7]!
@ -167,10 +167,10 @@ LOCAL(vmInvoke_handleException):
bx r7
LOCAL(vmInvoke_exit):
#endif // AVIAN_CONTINUATIONS
mov ip, #0
str ip, [r8, #TARGET_THREAD_STACK]
#endif // AVIAN_CONTINUATIONS
// restore return type
ldr ip, [sp], #4
@ -201,7 +201,7 @@ GLOBAL(vmJumpAndInvoke):
// which is not true in this case
sub r2,r2,r6
sub r2,r2,#84
mov r8,r0
// copy arguments into place
@ -220,7 +220,7 @@ LOCAL(vmJumpAndInvoke_argumentTest):
// the arguments have been copied, so we can set the real stack
// pointer now
mov sp,r2
// set return address to vmInvoke_returnAddress
#ifdef __APPLE__
movw r11, :lower16:(GLOBAL(vmInvoke_returnAddress)-(LOCAL(vmJumpAndInvoke_getAddress)+8))
@ -246,7 +246,7 @@ LOCAL(vmInvoke_getAddress_word):
LOCAL(vmJumpAndInvoke_getAddress_word):
.word _GLOBAL_OFFSET_TABLE_-(LOCAL(vmJumpAndInvoke_getAddress)+8)
#endif // not __APPLE__
#else // not AVIAN_CONTINUATIONS
// vmJumpAndInvoke should only be called when continuations are
// enabled, so we force a crash if we reach here:

View File

@ -13,23 +13,23 @@
.text
#define BYTES_PER_WORD 4
#define BYTES_PER_WORD 8
#define LOCAL(x) .L##x
#ifdef __APPLE__
# define GLOBAL(x) _##x
#else
# define GLOBAL(x) x
# define GLOBAL(x) x
#endif
#define CONTINUATION_NEXT 4
#define CONTINUATION_ADDRESS 16
#define CONTINUATION_RETURN_ADDRESS_OFFSET 20
#define CONTINUATION_FRAME_POINTER_OFFSET 24
#define CONTINUATION_LENGTH 28
#define CONTINUATION_BODY 32
#define CONTINUATION_NEXT 8
#define CONTINUATION_ADDRESS 32
#define CONTINUATION_RETURN_ADDRESS_OFFSET 40
#define CONTINUATION_FRAME_POINTER_OFFSET 48
#define CONTINUATION_LENGTH 56
#define CONTINUATION_BODY 64
.globl GLOBAL(vmInvoke)
.align 2
GLOBAL(vmInvoke):
@ -43,6 +43,7 @@ GLOBAL(vmInvoke):
// allocate frame
stp x29, x30, [sp,#-96]!
mov x29, sp
// save callee-saved register values
stp x19, x20, [sp,#16]
@ -59,7 +60,7 @@ GLOBAL(vmInvoke):
// copy arguments into place
sub sp, sp, w3, uxtw
mov x5, #0
mov x4, #0
b LOCAL(vmInvoke_argumentTest)
LOCAL(vmInvoke_argumentLoop):
@ -89,22 +90,74 @@ GLOBAL(vmInvoke_returnAddress):
// MyProcess::getStackTrace in compile.cpp for details on how we get
// a reliable stack trace from a thread that might be interrupted at
// any point in its execution.
mov x5, #0
str x5, [x19, #TARGET_THREAD_STACK]
str xzr, [x19, #TARGET_THREAD_STACK]
.globl GLOBAL(vmInvoke_safeStack)
.align 2
GLOBAL(vmInvoke_safeStack):
#ifdef AVIAN_CONTINUATIONS
#error todo
// call the next continuation, if any
ldr x5, [x19,#TARGET_THREAD_CONTINUATION]
cmp x5, xzr
b.eq LOCAL(vmInvoke_exit)
ldr x6, [x5,#CONTINUATION_LENGTH]
lsl x6, x6, #3
neg x7, x6
add x7, x7, #-128 // 128 bytes for callee-saved register values
mov x4, sp
add sp, sp, x7
str x4, [sp]
add x7, x5, #CONTINUATION_BODY
mov x11, xzr
b LOCAL(vmInvoke_continuationTest)
LOCAL(vmInvoke_continuationLoop):
ldr x9, [x7,x11]
str x9, [sp,x11]
add x11, x11, #8
LOCAL(vmInvoke_continuationTest):
cmp x11, x6
b.le LOCAL(vmInvoke_continuationLoop)
ldr x7, [x5,#CONTINUATION_RETURN_ADDRESS_OFFSET]
adr x11, GLOBAL(vmInvoke_returnAddress)
str x11, [sp,x7]
ldr x7, [x5,#CONTINUATION_NEXT]
str x7, [x19,#TARGET_THREAD_CONTINUATION]
// call the continuation unless we're handling an exception
ldr x7, [x19,#TARGET_THREAD_EXCEPTION]
cmp x7, xzr
b.ne LOCAL(vmInvoke_handleException)
ldr x7, [x5,#CONTINUATION_ADDRESS]
br x7
LOCAL(vmInvoke_handleException):
// we're handling an exception - call the exception handler instead
str xzr, [x19,#TARGET_THREAD_EXCEPTION]
ldr x11, [x19,#TARGET_THREAD_EXCEPTIONSTACKADJUSTMENT]
ldr x9, [sp]
neg x11, x11
add sp, sp, x11
str x9, [sp]
ldr x11, [x19,#TARGET_THREAD_EXCEPTIONOFFSET]
str x7, [sp,x11]
ldr x7, [x19,#TARGET_THREAD_EXCEPTIONHANDLER]
br x7
LOCAL(vmInvoke_exit):
str xzr, [x19, #TARGET_THREAD_STACK]
#endif // AVIAN_CONTINUATIONS
mov x5, #0
str x5, [x19, #TARGET_THREAD_STACK]
// restore return type
ldr w5, [sp], #4
ldr w5, [sp],#16
// restore callee-saved register values
ldp x19, x20, [sp,#16]
@ -121,7 +174,44 @@ LOCAL(vmInvoke_return):
.align 2
GLOBAL(vmJumpAndInvoke):
#ifdef AVIAN_CONTINUATIONS
#error todo
// x0: thread
// x1: address
// x2: stack
// x3: argumentFootprint
// x4: arguments
// x5: frameSize
// allocate new frame, adding room for callee-saved registers, plus
// 8 bytes of padding since the calculation of frameSize assumes 8
// bytes have already been allocated to save the return address,
// which is not true in this case
sub x2, x2, x5
sub x2, x2, #136
mov x19, x0
// copy arguments into place
mov x6, xzr
b LOCAL(vmJumpAndInvoke_argumentTest)
LOCAL(vmJumpAndInvoke_argumentLoop):
ldr x12, [x4,x6]
str x12, [x2,x6]
add x6, x6, #4
LOCAL(vmJumpAndInvoke_argumentTest):
cmp x6, x3
ble LOCAL(vmJumpAndInvoke_argumentLoop)
// the arguments have been copied, so we can set the real stack
// pointer now
mov sp, x2
// set return address to vmInvoke_returnAddress
adr x30, GLOBAL(vmInvoke_returnAddress)
br x1
#else // not AVIAN_CONTINUATIONS
// vmJumpAndInvoke should only be called when continuations are
// enabled, so we force a crash if we reach here:

View File

@ -2189,6 +2189,8 @@ GcContinuation* makeCurrentContinuation(MyThread* t,
*targetIp = 0;
while (*targetIp == 0) {
assertT(t, ip);
GcMethod* method = methodForIp(t, ip);
if (method) {
PROTECT(t, method);