diff --git a/classpath/avian/Classes.java b/classpath/avian/Classes.java index 34877d5ef2..3fc9f37144 100644 --- a/classpath/avian/Classes.java +++ b/classpath/avian/Classes.java @@ -38,7 +38,7 @@ public class Classes { public static native VMClass primitiveClass(char name); public static native void initialize(VMClass vmClass); - + public static native boolean isAssignableFrom(VMClass a, VMClass b); public static native VMClass getVMClass(Object o); @@ -134,7 +134,7 @@ public class Classes { array[i] = parseAnnotationValue(loader, pool, in); } return array; - } + } default: throw new AssertionError(); } @@ -207,7 +207,7 @@ public class Classes { while (spec[end] != ';') ++ end; ++ end; break; - + default: ++ end; } @@ -295,9 +295,9 @@ public class Classes { } Class c = loader.loadClass(name); VMClass vmc = SystemClassLoader.vmClass(c); - Classes.link(vmc, loader); + link(vmc, loader); if (initialize) { - Classes.initialize(vmc); + initialize(vmc); } return c; } @@ -315,7 +315,7 @@ public class Classes { } else { if (name.length() == 1) { return SystemClassLoader.getClass - (Classes.primitiveClass(name.charAt(0))); + (primitiveClass(name.charAt(0))); } else { throw new ClassNotFoundException(name); } @@ -378,7 +378,7 @@ public class Classes { public static int findField(VMClass vmClass, String name) { if (vmClass.fieldTable != null) { - Classes.link(vmClass); + link(vmClass); for (int i = 0; i < vmClass.fieldTable.length; ++i) { if (toString(vmClass.fieldTable[i].name).equals(name)) { @@ -426,7 +426,7 @@ public class Classes { { VMMethod[] methodTable = vmClass.methodTable; if (methodTable != null) { - Classes.link(vmClass); + link(vmClass); if (parameterTypes == null) { parameterTypes = new Class[0]; @@ -464,7 +464,7 @@ public class Classes { Method[] array = new Method[countMethods(vmClass, publicOnly)]; VMMethod[] methodTable = vmClass.methodTable; if (methodTable != null) { - Classes.link(vmClass); + link(vmClass); int ai = 0; for (int i = 0, j = declaredMethodCount(vmClass); i < j; ++i) { @@ -498,7 +498,7 @@ public class Classes { public static Field[] getFields(VMClass vmClass, boolean publicOnly) { Field[] array = new Field[countFields(vmClass, publicOnly)]; if (vmClass.fieldTable != null) { - Classes.link(vmClass); + link(vmClass); int ai = 0; for (int i = 0; i < vmClass.fieldTable.length; ++i) { @@ -568,9 +568,9 @@ public class Classes { return new ProtectionDomain(source, p); } - + public static native Method makeMethod(Class c, int slot); - + public static native Field makeField(Class c, int slot); private static native void acquireClassLock(); diff --git a/include/avian/codegen/architecture.h b/include/avian/codegen/architecture.h index 47687aefaf..528be74858 100644 --- a/include/avian/codegen/architecture.h +++ b/include/avian/codegen/architecture.h @@ -28,8 +28,6 @@ namespace codegen { class Assembler; -class RegisterFile; - class OperandMask { public: uint8_t typeMask; diff --git a/src/arm64.S b/src/arm64.S index 6953ea0cf6..b5ce9a5000 100644 --- a/src/arm64.S +++ b/src/arm64.S @@ -35,6 +35,7 @@ GLOBAL(vmNativeCall): // allocate frame stp x29, x30, [sp,#-64]! + mov x29, sp // save callee-saved register values so we can clobber them stp x19, x20, [sp,#16] @@ -118,6 +119,7 @@ GLOBAL(vmRun): // allocate frame stp x29, x30, [sp,#-96]! 
+ mov x29, sp // save callee-saved register values stp x19, x20, [sp,#16] diff --git a/src/codegen/compiler/site.h b/src/codegen/compiler/site.h index b2c10ddc39..5099704a34 100644 --- a/src/codegen/compiler/site.h +++ b/src/codegen/compiler/site.h @@ -123,7 +123,7 @@ class Site { virtual RegisterMask registerMask(Context*) { - return 0; + return RegisterMask(0); } virtual bool isVolatile(Context*) diff --git a/src/codegen/target/arm/CMakeLists.txt b/src/codegen/target/arm/CMakeLists.txt index bc26352adc..23faf6694f 100644 --- a/src/codegen/target/arm/CMakeLists.txt +++ b/src/codegen/target/arm/CMakeLists.txt @@ -4,5 +4,6 @@ add_library(avian_codegen_arm context.cpp fixup.cpp multimethod.cpp - operations.cpp + operations32.cpp + operations64.cpp ) diff --git a/src/codegen/target/arm/assembler.cpp b/src/codegen/target/arm/assembler.cpp index a6c7491279..3130662073 100644 --- a/src/codegen/target/arm/assembler.cpp +++ b/src/codegen/target/arm/assembler.cpp @@ -39,7 +39,7 @@ namespace isa { bool vfpSupported() { // TODO: Use at runtime detection -#if defined(__ARM_PCS_VFP) +#if (defined __ARM_PCS_VFP) || (defined ARCH_arm64) // armhf return true; #else @@ -55,9 +55,9 @@ bool vfpSupported() const RegisterFile MyRegisterFileWithoutFloats(GPR_MASK, 0); const RegisterFile MyRegisterFileWithFloats(GPR_MASK, FPR_MASK); -const unsigned FrameHeaderSize = 1; +const unsigned FrameHeaderSize = TargetBytesPerWord / 4; -const unsigned StackAlignmentInBytes = 8; +const unsigned StackAlignmentInBytes = TargetBytesPerWord * 2; const unsigned StackAlignmentInWords = StackAlignmentInBytes / TargetBytesPerWord; @@ -89,11 +89,11 @@ void nextFrame(ArchitectureContext* con, void** stack) { assertT(con, *ip >= start); - assertT(con, *ip <= start + (size / TargetBytesPerWord)); + assertT(con, *ip <= start + (size / 4)); uint32_t* instruction = static_cast(*ip); - if ((*start >> 20) == 0xe59) { + if ((*start >> 20) == (TargetBytesPerWord == 8 ? 0xf94 : 0xe59)) { // skip stack overflow check start += 3; } @@ -111,7 +111,8 @@ void nextFrame(ArchitectureContext* con, return; } - if (*instruction == 0xe12fff1e) { // return + if (*instruction == (TargetBytesPerWord == 8 ? 
0xd61f03c0 : 0xe12fff1e)) { + // return *ip = link; return; } @@ -124,7 +125,20 @@ void nextFrame(ArchitectureContext* con, // check for post-non-tail-call stack adjustment of the form "sub // sp, sp, #offset": - if ((*instruction >> 12) == 0xe24dd) { + if (TargetBytesPerWord == 8 and (*instruction & 0xff0003ff) == 0xd10003ff) { + unsigned value = (*instruction >> 10) & 0xfff; + unsigned shift = (*instruction >> 22) & 1; + switch (shift) { + case 0: + offset -= value / TargetBytesPerWord; + break; + case 1: + offset -= (value << 12) / TargetBytesPerWord; + break; + default: + abort(con); + } + } else if (TargetBytesPerWord == 4 and (*instruction >> 12) == 0xe24dd) { unsigned value = *instruction & 0xff; unsigned rotation = (*instruction >> 8) & 0xf; switch (rotation) { @@ -218,6 +232,7 @@ class MyArchitecture : public Architecture { { switch (register_.index()) { case LinkRegister.index(): + case FrameRegister.index(): case StackRegister.index(): case ThreadRegister.index(): case ProgramCounter.index(): @@ -258,7 +273,7 @@ class MyArchitecture : public Architecture { virtual unsigned argumentRegisterCount() { - return 4; + return TargetBytesPerWord; } virtual Register argumentRegister(unsigned index) @@ -306,8 +321,13 @@ class MyArchitecture : public Architecture { case lir::AlignedLongCall: case lir::AlignedLongJump: { uint32_t* p = static_cast(returnAddress) - 2; - *reinterpret_cast(p + (((*p & PoolOffsetMask) + 8) / 4)) - = newTarget; + if (TargetBytesPerWord == 8) { + const int32_t mask = (PoolOffsetMask >> 2) << 5; + *reinterpret_cast(p + ((*p & mask) >> 5)) = newTarget; + } else { + *reinterpret_cast(p + (((*p & PoolOffsetMask) + 8) / 4)) + = newTarget; + } } break; default: @@ -434,11 +454,11 @@ class MyArchitecture : public Architecture { break; case lir::Float2Int: - // todo: Java requires different semantics than SSE for + // todo: Java requires different semantics than VFP for // converting floats to integers, we we need to either use // thunks or produce inline machine code which handles edge // cases properly. - if (false && vfpSupported() && bSize == 4) { + if (false && vfpSupported() && bSize <= TargetBytesPerWord) { aMask.typeMask = lir::Operand::RegisterPairMask; aMask.setLowHighRegisterMasks(FPR_MASK, FPR_MASK); } else { @@ -447,7 +467,7 @@ class MyArchitecture : public Architecture { break; case lir::Int2Float: - if (vfpSupported() && aSize == 4) { + if (vfpSupported() && aSize <= TargetBytesPerWord) { aMask.typeMask = lir::Operand::RegisterPairMask; aMask.setLowHighRegisterMasks(GPR_MASK, GPR_MASK); } else { @@ -544,7 +564,7 @@ class MyArchitecture : public Architecture { case lir::ShiftLeft: case lir::ShiftRight: case lir::UnsignedShiftRight: - if (bSize == 8) + if (bSize > TargetBytesPerWord) aMask.typeMask = bMask.typeMask = lir::Operand::RegisterPairMask; break; @@ -556,6 +576,11 @@ class MyArchitecture : public Architecture { aMask.typeMask = bMask.typeMask = lir::Operand::RegisterPairMask; break; + // todo: Although ARM has instructions for integer division and + // remainder, they don't trap on division by zero, which is why + // we use thunks. Alternatively, we could generate inline code + // with an explicit zero check, which would probably be a bit + // faster. 
case lir::Divide: case lir::Remainder: case lir::FloatRemainder: @@ -567,7 +592,7 @@ class MyArchitecture : public Architecture { case lir::FloatMultiply: case lir::FloatDivide: if (vfpSupported()) { - bMask.typeMask = lir::Operand::RegisterPairMask; + aMask.typeMask = lir::Operand::RegisterPairMask; aMask.setLowHighRegisterMasks(FPR_MASK, FPR_MASK); bMask = aMask; } else { @@ -745,19 +770,45 @@ class MyAssembler : public Assembler { // how to handle them: assertT(&con, footprint < 256); - lir::RegisterPair stack(StackRegister); - ResolvedPromise footprintPromise(footprint * TargetBytesPerWord); - lir::Constant footprintConstant(&footprintPromise); - subC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack); + // todo: the ARM ABI says the frame preamble should be of the form + // + // stp x29, x30, [sp,#-footprint]! + // mov x29, sp + // + // and the frame should be popped with e.g. + // + // ldp x29, x30, [sp],#footprint + // br x30 + // + // However, that will invalidate a lot of assumptions elsewhere + // about the return address being stored at the opposite end of + // the frame, so lots of other code will need to change before we + // can do that. The code below can be enabled as a starting point + // when we're ready to tackle that. + if (false and TargetBytesPerWord == 8) { + // stp x29, x30, [sp,#-footprint]! + con.code.append4(0xa9800000 | ((-footprint & 0x7f) << 15) + | (StackRegister.index() << 5) + | (LinkRegister.index() << 10) | FrameRegister.index()); - lir::RegisterPair returnAddress(LinkRegister); - lir::Memory returnAddressDst(StackRegister, - (footprint - 1) * TargetBytesPerWord); - moveRM(&con, - TargetBytesPerWord, - &returnAddress, - TargetBytesPerWord, - &returnAddressDst); + lir::RegisterPair stack(StackRegister); + lir::RegisterPair frame(FrameRegister); + moveRR(&con, TargetBytesPerWord, &stack, TargetBytesPerWord, &frame); + } else { + lir::RegisterPair stack(StackRegister); + ResolvedPromise footprintPromise(footprint * TargetBytesPerWord); + lir::Constant footprintConstant(&footprintPromise); + subC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack); + + lir::RegisterPair returnAddress(LinkRegister); + lir::Memory returnAddressDst(StackRegister, + (footprint - 1) * TargetBytesPerWord); + moveRM(&con, + TargetBytesPerWord, + &returnAddress, + TargetBytesPerWord, + &returnAddressDst); + } } virtual void adjustFrame(unsigned difference) @@ -772,19 +823,26 @@ class MyAssembler : public Assembler { { footprint += FrameHeaderSize; - lir::RegisterPair returnAddress(LinkRegister); - lir::Memory returnAddressSrc(StackRegister, - (footprint - 1) * TargetBytesPerWord); - moveMR(&con, - TargetBytesPerWord, - &returnAddressSrc, - TargetBytesPerWord, - &returnAddress); + // see comment regarding the ARM64 ABI in allocateFrame + if (false and TargetBytesPerWord == 8) { + // ldp x29, x30, [sp],#footprint + con.code.append4(0xa8c00000 | (footprint << 15) | (31 << 5) | (30 << 10) + | 29); + } else { + lir::RegisterPair returnAddress(LinkRegister); + lir::Memory returnAddressSrc(StackRegister, + (footprint - 1) * TargetBytesPerWord); + moveMR(&con, + TargetBytesPerWord, + &returnAddressSrc, + TargetBytesPerWord, + &returnAddress); - lir::RegisterPair stack(StackRegister); - ResolvedPromise footprintPromise(footprint * TargetBytesPerWord); - lir::Constant footprintConstant(&footprintPromise); - addC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack); + lir::RegisterPair stack(StackRegister); + ResolvedPromise footprintPromise(footprint * 
TargetBytesPerWord); + lir::Constant footprintConstant(&footprintPromise); + addC(&con, TargetBytesPerWord, &footprintConstant, &stack, &stack); + } } virtual void popFrameForTailCall(unsigned footprint, @@ -851,10 +909,26 @@ class MyAssembler : public Assembler { return_(&con); } - virtual void popFrameAndUpdateStackAndReturn(unsigned frameFootprint, + virtual void popFrameAndUpdateStackAndReturn(unsigned footprint, unsigned stackOffsetFromThread) { - popFrame(frameFootprint); + footprint += FrameHeaderSize; + + // see comment regarding the ARM64 ABI in allocateFrame + if (false and TargetBytesPerWord == 8) { + // ldp x29, x30, [sp],#footprint + con.code.append4(0xa8c00000 | (footprint << 15) | (31 << 5) | (30 << 10) + | 29); + } else { + lir::RegisterPair returnAddress(LinkRegister); + lir::Memory returnAddressSrc(StackRegister, + (footprint - 1) * TargetBytesPerWord); + moveMR(&con, + TargetBytesPerWord, + &returnAddressSrc, + TargetBytesPerWord, + &returnAddress); + } lir::RegisterPair stack(StackRegister); lir::Memory newStackSrc(ThreadRegister, stackOffsetFromThread); @@ -946,17 +1020,28 @@ class MyAssembler : public Assembler { unsigned instruction = o->block->start + padding(o->block, o->offset) + o->offset; - int32_t v = (entry - 8) - instruction; - expect(&con, v == (v & PoolOffsetMask)); - int32_t* p = reinterpret_cast(dst + instruction); - *p = (v & PoolOffsetMask) | ((~PoolOffsetMask) & *p); + + if (TargetBytesPerWord == 8) { + int32_t v = entry - instruction; + expect(&con, v == (v & PoolOffsetMask)); + + const int32_t mask = (PoolOffsetMask >> 2) << 5; + *p = (((v >> 2) << 5) & mask) | ((~mask) & *p); + } else { + int32_t v = (entry - 8) - instruction; + expect(&con, v == (v & PoolOffsetMask)); + + *p = (v & PoolOffsetMask) | ((~PoolOffsetMask) & *p); + } poolSize += TargetBytesPerWord; } bool jump = needJump(b); if (jump) { + expect(&con, TargetBytesPerWord == 4); + write4(dst + dstOffset, isa::b((poolSize + TargetBytesPerWord - 8) >> 2)); } diff --git a/src/codegen/target/arm/fixup.cpp b/src/codegen/target/arm/fixup.cpp index e1d41b6eb4..3117688b15 100644 --- a/src/codegen/target/arm/fixup.cpp +++ b/src/codegen/target/arm/fixup.cpp @@ -12,6 +12,12 @@ #include "fixup.h" #include "block.h" +namespace { + +const unsigned InstructionSize = 4; + +} // namespace + namespace avian { namespace codegen { namespace arm { @@ -38,8 +44,7 @@ int64_t OffsetPromise::value() assertT(con, resolved()); unsigned o = offset - block->offset; - return block->start - + padding(block, forTrace ? o - vm::TargetBytesPerWord : o) + o; + return block->start + padding(block, forTrace ? o - InstructionSize : o) + o; } Promise* offsetPromise(Context* con, bool forTrace) @@ -92,17 +97,30 @@ bool bounded(int right, int left, int32_t v) void* updateOffset(vm::System* s, uint8_t* instruction, int64_t value) { - // ARM's PC is two words ahead, and branches drop the bottom 2 bits. 
- int32_t v = (reinterpret_cast(value) - (instruction + 8)) >> 2; - - int32_t mask; - expect(s, bounded(0, 8, v)); - mask = 0xFFFFFF; - int32_t* p = reinterpret_cast(instruction); + + int32_t v; + int32_t mask; + if (vm::TargetBytesPerWord == 8) { + if ((*p >> 24) == 0x54) { + // conditional branch + v = ((reinterpret_cast(value) - instruction) >> 2) << 5; + mask = 0xFFFFE0; + } else { + // unconditional branch + v = (reinterpret_cast(value) - instruction) >> 2; + mask = 0x3FFFFFF; + } + } else { + v = (reinterpret_cast(value) - (instruction + 8)) >> 2; + mask = 0xFFFFFF; + } + + expect(s, bounded(0, 8, v)); + *p = (v & mask) | ((~mask) & *p); - return instruction + 4; + return instruction + InstructionSize; } ConstantPoolEntry::ConstantPoolEntry(Context* con, @@ -214,6 +232,101 @@ void appendPoolEvent(Context* con, b->poolEventTail = e; } +bool needJump(MyBlock* b) +{ + return b->next or b->size != (b->size & PoolOffsetMask); +} + +unsigned padding(MyBlock* b, unsigned offset) +{ + unsigned total = 0; + for (PoolEvent* e = b->poolEventHead; e; e = e->next) { + if (e->offset <= offset) { + if (needJump(b)) { + total += vm::TargetBytesPerWord; + } + for (PoolOffset* o = e->poolOffsetHead; o; o = o->next) { + total += vm::TargetBytesPerWord; + } + } else { + break; + } + } + return total; +} + +void resolve(MyBlock* b) +{ + Context* con = b->context; + + if (b->poolOffsetHead) { + if (con->poolOffsetTail) { + con->poolOffsetTail->next = b->poolOffsetHead; + } else { + con->poolOffsetHead = b->poolOffsetHead; + } + con->poolOffsetTail = b->poolOffsetTail; + } + + if (con->poolOffsetHead) { + bool append; + if (b->next == 0 or b->next->poolEventHead) { + append = true; + } else { + int32_t v + = (b->start + b->size + b->next->size + vm::TargetBytesPerWord - 8) + - (con->poolOffsetHead->offset + con->poolOffsetHead->block->start); + + append = (v != (v & PoolOffsetMask)); + + if (DebugPool) { + fprintf(stderr, + "current %p %d %d next %p %d %d\n", + b, + b->start, + b->size, + b->next, + b->start + b->size, + b->next->size); + fprintf(stderr, + "offset %p %d is of distance %d to next block; append? %d\n", + con->poolOffsetHead, + con->poolOffsetHead->offset, + v, + append); + } + } + + if (append) { +#ifndef NDEBUG + int32_t v + = (b->start + b->size - 8) + - (con->poolOffsetHead->offset + con->poolOffsetHead->block->start); + + expect(con, v == (v & PoolOffsetMask)); +#endif // not NDEBUG + + appendPoolEvent( + con, b, b->size, con->poolOffsetHead, con->poolOffsetTail); + + if (DebugPool) { + for (PoolOffset* o = con->poolOffsetHead; o; o = o->next) { + fprintf(stderr, + "include %p %d in pool event %p at offset %d in block %p\n", + o, + o->offset, + b->poolEventTail, + b->size, + b); + } + } + + con->poolOffsetHead = 0; + con->poolOffsetTail = 0; + } + } +} + } // namespace arm } // namespace codegen } // namespace avian diff --git a/src/codegen/target/arm/fixup.h b/src/codegen/target/arm/fixup.h index 5460295d95..cce2b59dce 100644 --- a/src/codegen/target/arm/fixup.h +++ b/src/codegen/target/arm/fixup.h @@ -27,7 +27,7 @@ namespace arm { const bool DebugPool = false; -const int32_t PoolOffsetMask = 0xFFF; +const int32_t PoolOffsetMask = vm::TargetBytesPerWord == 8 ? 
0x1FFFFF : 0xFFF; class Task { public: diff --git a/src/codegen/target/arm/operations.cpp b/src/codegen/target/arm/operations32.cpp similarity index 92% rename from src/codegen/target/arm/operations.cpp rename to src/codegen/target/arm/operations32.cpp index 87d88613fd..e9cd601fe3 100644 --- a/src/codegen/target/arm/operations.cpp +++ b/src/codegen/target/arm/operations32.cpp @@ -15,6 +15,8 @@ #include "fixup.h" #include "multimethod.h" +#if TARGET_BYTES_PER_WORD == 4 + namespace avian { namespace codegen { namespace arm { @@ -179,101 +181,6 @@ void unsignedShiftRightC(Context* con, } } -bool needJump(MyBlock* b) -{ - return b->next or b->size != (b->size & PoolOffsetMask); -} - -unsigned padding(MyBlock* b, unsigned offset) -{ - unsigned total = 0; - for (PoolEvent* e = b->poolEventHead; e; e = e->next) { - if (e->offset <= offset) { - if (needJump(b)) { - total += vm::TargetBytesPerWord; - } - for (PoolOffset* o = e->poolOffsetHead; o; o = o->next) { - total += vm::TargetBytesPerWord; - } - } else { - break; - } - } - return total; -} - -void resolve(MyBlock* b) -{ - Context* con = b->context; - - if (b->poolOffsetHead) { - if (con->poolOffsetTail) { - con->poolOffsetTail->next = b->poolOffsetHead; - } else { - con->poolOffsetHead = b->poolOffsetHead; - } - con->poolOffsetTail = b->poolOffsetTail; - } - - if (con->poolOffsetHead) { - bool append; - if (b->next == 0 or b->next->poolEventHead) { - append = true; - } else { - int32_t v - = (b->start + b->size + b->next->size + vm::TargetBytesPerWord - 8) - - (con->poolOffsetHead->offset + con->poolOffsetHead->block->start); - - append = (v != (v & PoolOffsetMask)); - - if (DebugPool) { - fprintf(stderr, - "current %p %d %d next %p %d %d\n", - b, - b->start, - b->size, - b->next, - b->start + b->size, - b->next->size); - fprintf(stderr, - "offset %p %d is of distance %d to next block; append? %d\n", - con->poolOffsetHead, - con->poolOffsetHead->offset, - v, - append); - } - } - - if (append) { -#ifndef NDEBUG - int32_t v - = (b->start + b->size - 8) - - (con->poolOffsetHead->offset + con->poolOffsetHead->block->start); - - expect(con, v == (v & PoolOffsetMask)); -#endif // not NDEBUG - - appendPoolEvent( - con, b, b->size, con->poolOffsetHead, con->poolOffsetTail); - - if (DebugPool) { - for (PoolOffset* o = con->poolOffsetHead; o; o = o->next) { - fprintf(stderr, - "include %p %d in pool event %p at offset %d in block %p\n", - o, - o->offset, - b->poolEventTail, - b->size, - b); - } - } - - con->poolOffsetHead = 0; - con->poolOffsetTail = 0; - } - } -} - void jumpR(Context* con, unsigned size UNUSED, lir::RegisterPair* target) { assertT(con, size == vm::TargetBytesPerWord); @@ -410,7 +317,8 @@ void moveCR2(Context* con, lir::RegisterPair dstHi(dst->high); moveCR(con, 4, &srcLo, 4, dst); moveCR(con, 4, &srcHi, 4, &dstHi); - } else if (src->value->resolved() and isOfWidth(getValue(src), 8)) { + } else if (callOffset == 0 and src->value->resolved() + and isOfWidth(getValue(src), 8)) { emit(con, movi(dst->low, lo8(getValue(src)))); // fits in immediate } else { appendConstantPoolEntry(con, src->value, callOffset); @@ -510,9 +418,9 @@ void multiplyR(Context* con, if (size == 8) { bool useTemporaries = b->low == t->low; Register tmpLow = useTemporaries ? con->client->acquireTemporary(GPR_MASK) - : t->low; + : t->low; Register tmpHigh = useTemporaries ? 
con->client->acquireTemporary(GPR_MASK) - : t->high; + : t->high; emit(con, umull(tmpLow, tmpHigh, a->low, b->low)); emit(con, mla(tmpHigh, a->low, b->high, tmpHigh)); @@ -665,11 +573,11 @@ void floatDivideR(Context* con, } Register normalize(Context* con, - int offset, - Register index, - unsigned scale, - bool* preserveIndex, - bool* release) + int offset, + Register index, + unsigned scale, + bool* preserveIndex, + bool* release) { if (offset != 0 or scale != 1) { lir::RegisterPair normalizedIndex( @@ -947,26 +855,8 @@ void load(Context* con, case 8: { if (dstSize == 8) { lir::RegisterPair dstHigh(dst->high); - load(con, - 4, - base, - offset, - NoRegister, - 1, - 4, - &dstHigh, - false, - false); - load(con, - 4, - base, - offset + 4, - NoRegister, - 1, - 4, - dst, - false, - false); + load(con, 4, base, offset, NoRegister, 1, 4, &dstHigh, false, false); + load(con, 4, base, offset + 4, NoRegister, 1, 4, dst, false, false); } else { emit(con, ldri(dst->low, base, offset)); } @@ -1496,15 +1386,26 @@ void longCallC(Context* con, unsigned size UNUSED, lir::Constant* target) callR(con, vm::TargetBytesPerWord, &tmp); } +void alignedLongCallC(Context* con, unsigned size, lir::Constant* target) +{ + longCallC(con, size, target); +} + void longJumpC(Context* con, unsigned size UNUSED, lir::Constant* target) { assertT(con, size == vm::TargetBytesPerWord); - lir::RegisterPair tmp(Register(4)); // a non-arg reg that we don't mind clobbering + lir::RegisterPair tmp( + Register(4)); // a non-arg reg that we don't mind clobbering moveCR2(con, vm::TargetBytesPerWord, target, &tmp, offsetPromise(con)); jumpR(con, vm::TargetBytesPerWord, &tmp); } +void alignedLongJumpC(Context* con, unsigned size, lir::Constant* target) +{ + longJumpC(con, size, target); +} + void jumpC(Context* con, unsigned size UNUSED, lir::Constant* target) { assertT(con, size == vm::TargetBytesPerWord); @@ -1554,3 +1455,5 @@ void storeLoadBarrier(Context* con) } // namespace arm } // namespace codegen } // namespace avian + +#endif // TARGET_BYTES_PER_WORD == 4 diff --git a/src/codegen/target/arm/operations64.cpp b/src/codegen/target/arm/operations64.cpp new file mode 100644 index 0000000000..e0c4a69ed6 --- /dev/null +++ b/src/codegen/target/arm/operations64.cpp @@ -0,0 +1,1625 @@ +/* Copyright (c) 2008-2014, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +#include "context.h" +#include "operations.h" +#include "block.h" +#include "fixup.h" +#include "multimethod.h" + +#if TARGET_BYTES_PER_WORD == 8 + +namespace { + +using namespace avian::codegen; +using namespace avian::codegen::arm; + +Register fpr(Register reg) +{ + return Register(reg.index() - N_GPRS); +} + +Register fpr(lir::RegisterPair* reg) +{ + return fpr(reg->low); +} + +void append(Context* c, uint32_t instruction) +{ + c->code.append4(instruction); +} + +uint32_t lslv(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0x9ac02000 : 0x1ac02000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ubfm(Register Rd, Register Rn, int r, int s, unsigned size) +{ + return (size == 8 ? 
0xd3400000 : 0x53000000) | (r << 16) | (s << 10) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t sbfm(Register Rd, Register Rn, int r, int s, unsigned size) +{ + return (size == 8 ? 0x93400000 : 0x13000000) | (r << 16) | (s << 10) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t lsli(Register Rd, Register Rn, int shift, unsigned size) +{ + if (size == 4) { + return ubfm(Rd, Rn, (32 - shift) & 0x1f, 31 - shift, size); + } else { + return ubfm(Rd, Rn, (64 - shift) & 0x3f, 63 - shift, size); + } +} + +uint32_t asrv(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0x9ac02800 : 0x1ac02800) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t lsrv(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0x9ac02400 : 0x1ac02400) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t lsri(Register Rd, Register Rn, int shift, unsigned size) +{ + return ubfm(Rd, Rn, shift, size == 8 ? 63 : 31, size); +} + +uint32_t asri(Register Rd, Register Rn, int shift, unsigned size) +{ + return sbfm(Rd, Rn, shift, size == 8 ? 63 : 31, size); +} + +uint32_t sxtb(Register Rd, Register Rn) +{ + return sbfm(Rd, Rn, 0, 7, 8); +} + +uint32_t sxth(Register Rd, Register Rn) +{ + return sbfm(Rd, Rn, 0, 15, 8); +} + +uint32_t uxth(Register Rd, Register Rn) +{ + return ubfm(Rd, Rn, 0, 15, 4); +} + +uint32_t sxtw(Register Rd, Register Rn) +{ + return sbfm(Rd, Rn, 0, 31, 8); +} + +uint32_t br(Register Rn) +{ + return 0xd61f0000 | (Rn.index() << 5); +} + +uint32_t fmovFdFn(Register Fd, Register Fn, unsigned size) +{ + return (size == 8 ? 0x1e604000 : 0x1e204000) | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fmovRdFn(Register Rd, Register Fn, unsigned size) +{ + return (size == 8 ? 0x9e660000 : 0x1e260000) | (Fn.index() << 5) | Rd.index(); +} + +uint32_t fmovFdRn(Register Fd, Register Rn, unsigned size) +{ + return (size == 8 ? 0x9e670000 : 0x1e270000) | (Rn.index() << 5) | Fd.index(); +} + +uint32_t orr(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xaa000000 : 0x2a000000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t addi(Register Rd, Register Rn, int value, int shift, unsigned size) +{ + return (size == 8 ? 0x91000000 : 0x11000000) | (shift ? 0x400000 : 0) + | (value << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t mov(Register Rd, Register Rn, unsigned size) +{ + return Rn.index() == 31 or Rd.index() == 31 ? addi(Rd, Rn, 0, 0, size) + : orr(Rd, Register(31), Rn, size); +} + +uint32_t movz(Register Rd, int value, unsigned shift, unsigned size) +{ + return (size == 8 ? 0xd2800000 : 0x52800000) | ((shift >> 4) << 21) + | (value << 5) | Rd.index(); +} + +uint32_t movn(Register Rd, int value, unsigned shift, unsigned size) +{ + return (size == 8 ? 0x92800000 : 0x12800000) | ((shift >> 4) << 21) + | (value << 5) | Rd.index(); +} + +uint32_t movk(Register Rd, int value, unsigned shift, unsigned size) +{ + return (size == 8 ? 0xf2800000 : 0x72800000) | ((shift >> 4) << 21) + | (value << 5) | Rd.index(); +} + +uint32_t ldrPCRel(Register Rd, int offset, unsigned size) +{ + return (size == 8 ? 0x58000000 : 0x18000000) | ((offset >> 2) << 5) + | Rd.index(); +} + +uint32_t add(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0x8b000000 : 0x0b000000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t sub(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 
0xcb000000 : 0x4b000000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t and_(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0x8a000000 : 0x0a000000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t eor(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xca000000 : 0x4a000000) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t madd(Register Rd, Register Rn, Register Rm, Register Ra, unsigned size) +{ + return (size == 8 ? 0x9b000000 : 0x1b000000) | (Rm.index() << 16) + | (Ra.index() << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t mul(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return madd(Rd, Rn, Rm, Register(31), size); +} + +uint32_t subi(Register Rd, Register Rn, int value, int shift, unsigned size) +{ + return (size == 8 ? 0xd1000000 : 0x51000000) | (shift ? 0x400000 : 0) + | (value << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t fabs_(Register Fd, Register Fn, unsigned size) +{ + return (size == 8 ? 0x1e60c000 : 0x1e20c000) | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fneg(Register Fd, Register Fn, unsigned size) +{ + return (size == 8 ? 0x1e614000 : 0x1e214000) | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fsqrt(Register Fd, Register Fn, unsigned size) +{ + return (size == 8 ? 0x1e61c000 : 0x1e21c000) | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fadd(Register Fd, Register Fn, Register Fm, unsigned size) +{ + return (size == 8 ? 0x1e602800 : 0x1e202800) | (Fm.index() << 16) + | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fsub(Register Fd, Register Fn, Register Fm, unsigned size) +{ + return (size == 8 ? 0x1e603800 : 0x1e203800) | (Fm.index() << 16) + | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fmul(Register Fd, Register Fn, Register Fm, unsigned size) +{ + return (size == 8 ? 0x1e600800 : 0x1e200800) | (Fm.index() << 16) + | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fdiv(Register Fd, Register Fn, Register Fm, unsigned size) +{ + return (size == 8 ? 0x1e601800 : 0x1e201800) | (Fm.index() << 16) + | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fcvtSdDn(Register Fd, Register Fn) +{ + return 0x1e624000 | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fcvtDdSn(Register Fd, Register Fn) +{ + return 0x1e22c000 | (Fn.index() << 5) | Fd.index(); +} + +uint32_t fcvtasXdDn(Register Rd, Register Fn) +{ + return 0x9e640000 | (Fn.index() << 5) | Rd.index(); +} + +uint32_t fcvtasWdSn(Register Rd, Register Fn) +{ + return 0x1e240000 | (Fn.index() << 5) | Rd.index(); +} + +uint32_t scvtfDdXn(Register Fd, Register Rn) +{ + return 0x9e620000 | (Rn.index() << 5) | Fd.index(); +} + +uint32_t scvtfSdWn(Register Fd, Register Rn) +{ + return 0x1e220000 | (Rn.index() << 5) | Fd.index(); +} + +uint32_t strFs(Register Fs, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xfc206800 : 0xbc206800) | (Rm.index() << 16) + | (Rn.index() << 5) | Fs.index(); +} + +uint32_t strb(Register Rs, Register Rn, Register Rm) +{ + return 0x38206800 | (Rm.index() << 16) | (Rn.index() << 5) | Rs.index(); +} + +uint32_t strh(Register Rs, Register Rn, Register Rm) +{ + return 0x78206800 | (Rm.index() << 16) | (Rn.index() << 5) | Rs.index(); +} + +uint32_t striFs(Register Fs, Register Rn, int offset, unsigned size) +{ + return (size == 8 ? 0xfd000000 : 0xbd000000) + | ((offset >> (size == 8 ? 
3 : 2)) << 10) | (Rn.index() << 5) + | Fs.index(); +} + +uint32_t str(Register Rs, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xf8206800 : 0xb8206800) | (Rm.index() << 16) + | (Rn.index() << 5) | Rs.index(); +} + +uint32_t strbi(Register Rs, Register Rn, int offset) +{ + return 0x39000000 | (offset << 10) | (Rn.index() << 5) | Rs.index(); +} + +uint32_t strhi(Register Rs, Register Rn, int offset) +{ + return 0x79000000 | ((offset >> 1) << 10) | (Rn.index() << 5) | Rs.index(); +} + +uint32_t stri(Register Rs, Register Rn, int offset, unsigned size) +{ + return (size == 8 ? 0xf9000000 : 0xb9000000) + | ((offset >> (size == 8 ? 3 : 2)) << 10) | (Rn.index() << 5) + | Rs.index(); +} + +uint32_t ldrFd(Register Fd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xfc606800 : 0xbc606800) | (Rm.index() << 16) + | (Rn.index() << 5) | Fd.index(); +} + +uint32_t ldrb(Register Rd, Register Rn, Register Rm) +{ + return 0x38606800 | (Rm.index() << 16) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrsb(Register Rd, Register Rn, Register Rm) +{ + return 0x38e06800 | (Rm.index() << 16) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrh(Register Rd, Register Rn, Register Rm) +{ + return 0x78606800 | (Rm.index() << 16) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrsh(Register Rd, Register Rn, Register Rm) +{ + return 0x78e06800 | (Rm.index() << 16) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrsw(Register Rd, Register Rn, Register Rm) +{ + return 0xb8a06800 | (Rm.index() << 16) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldr(Register Rd, Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xf8606800 : 0xb8606800) | (Rm.index() << 16) + | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldriFd(Register Fd, Register Rn, int offset, unsigned size) +{ + return (size == 8 ? 0xfd400000 : 0xbd400000) + | ((offset >> (size == 8 ? 3 : 2)) << 10) | (Rn.index() << 5) + | Fd.index(); +} + +uint32_t ldrbi(Register Rd, Register Rn, int offset) +{ + return 0x39400000 | (offset << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrsbi(Register Rd, Register Rn, int offset) +{ + return 0x39c00000 | (offset << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrhi(Register Rd, Register Rn, int offset) +{ + return 0x79400000 | ((offset >> 1) << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrshi(Register Rd, Register Rn, int offset) +{ + return 0x79c00000 | ((offset >> 1) << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldrswi(Register Rd, Register Rn, int offset) +{ + return 0xb9800000 | ((offset >> 2) << 10) | (Rn.index() << 5) | Rd.index(); +} + +uint32_t ldri(Register Rd, Register Rn, int offset, unsigned size) +{ + return (size == 8 ? 0xf9400000 : 0xb9400000) + | ((offset >> (size == 8 ? 3 : 2)) << 10) | (Rn.index() << 5) + | Rd.index(); +} + +uint32_t fcmp(Register Fn, Register Fm, unsigned size) +{ + return (size == 8 ? 0x1e602000 : 0x1e202000) | (Fm.index() << 16) + | (Fn.index() << 5); +} + +uint32_t neg(Register Rd, Register Rm, unsigned size) +{ + return (size == 8 ? 0xcb0003e0 : 0x4b0003e0) | (Rm.index() << 16) + | Rd.index(); +} + +uint32_t cmp(Register Rn, Register Rm, unsigned size) +{ + return (size == 8 ? 0xeb00001f : 0x6b00001f) | (Rm.index() << 16) + | (Rn.index() == 31 ? 0x2063ff : (Rn.index() << 5)); +} + +uint32_t cmpi(Register Rn, int value, unsigned shift, unsigned size) +{ + return (size == 8 ? 0xf100001f : 0x7100001f) | (shift == 12 ? 
0x400000 : 0) + | (value << 10) | (Rn.index() << 5); +} + +uint32_t b(int offset) +{ + return 0x14000000 | (offset >> 2); +} + +uint32_t bl(int offset) +{ + return 0x94000000 | (offset >> 2); +} + +uint32_t blr(Register Rn) +{ + return 0xd63f0000 | (Rn.index() << 5); +} + +uint32_t beq(int offset) +{ + return 0x54000000 | ((offset >> 2) << 5); +} + +uint32_t bne(int offset) +{ + return 0x54000001 | ((offset >> 2) << 5); +} + +uint32_t blt(int offset) +{ + return 0x5400000b | ((offset >> 2) << 5); +} + +uint32_t bgt(int offset) +{ + return 0x5400000c | ((offset >> 2) << 5); +} + +uint32_t ble(int offset) +{ + return 0x5400000d | ((offset >> 2) << 5); +} + +uint32_t bge(int offset) +{ + return 0x5400000a | ((offset >> 2) << 5); +} + +uint32_t bhi(int offset) +{ + return 0x54000008 | ((offset >> 2) << 5); +} + +uint32_t bpl(int offset) +{ + return 0x54000005 | ((offset >> 2) << 5); +} + +uint32_t brk(int flag) +{ + return 0xd4200020 | (flag << 5); +} + +uint32_t dmb(int flag) +{ + return 0xd50330bf | (flag << 8); +} + +} // namespace + +namespace avian { +namespace codegen { +namespace arm { + +using namespace avian::util; + +void shiftLeftR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, lslv(dst->low, b->low, a->low, size)); +} + +void shiftLeftC(Context* c, + unsigned size, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + uint64_t value = a->value->value(); + if (size == 4 and (value & 0x1F)) { + append(c, lsli(dst->low, b->low, value & 0x1F, 4)); + } else if (size == 8 and (value & 0x3F)) { + append(c, lsli(dst->low, b->low, value & 0x3F, 8)); + } else { + moveRR(c, size, b, size, dst); + } +} + +void shiftRightR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, asrv(dst->low, b->low, a->low, size)); +} + +void shiftRightC(Context* c, + unsigned size UNUSED, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + uint64_t value = a->value->value(); + if (size == 4 and (value & 0x1F)) { + append(c, asri(dst->low, b->low, value & 0x1F, 4)); + } else if (size == 8 and (value & 0x3F)) { + append(c, asri(dst->low, b->low, value & 0x3F, 8)); + } else { + moveRR(c, size, b, size, dst); + } +} + +void unsignedShiftRightR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, lsrv(dst->low, b->low, a->low, size)); +} + +void unsignedShiftRightC(Context* c, + unsigned size UNUSED, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + uint64_t value = a->value->value(); + if (size == 4 and (value & 0x1F)) { + append(c, lsri(dst->low, b->low, value & 0x1F, 4)); + } else if (size == 8 and (value & 0x3F)) { + append(c, lsri(dst->low, b->low, value & 0x3F, 8)); + } else { + moveRR(c, size, b, size, dst); + } +} + +void jumpR(Context* c, unsigned size UNUSED, lir::RegisterPair* target) +{ + assertT(c, size == vm::TargetBytesPerWord); + append(c, br(target->low)); +} + +void moveRR(Context* c, + unsigned srcSize, + lir::RegisterPair* src, + unsigned dstSize, + lir::RegisterPair* dst) +{ + bool srcIsFpr = isFpr(src); + bool dstIsFpr = isFpr(dst); + if (srcIsFpr or dstIsFpr) { + assertT(c, srcSize == dstSize); + + if (srcIsFpr and dstIsFpr) { + append(c, fmovFdFn(fpr(dst), fpr(src), srcSize)); + } else if (srcIsFpr) { + append(c, fmovRdFn(dst->low, fpr(src), srcSize)); + } else { + append(c, fmovFdRn(fpr(dst), src->low, srcSize)); + } + } else { 
+ switch (srcSize) { + case 1: + append(c, sxtb(dst->low, src->low)); + break; + + case 2: + append(c, sxth(dst->low, src->low)); + break; + + case 4: + if (dstSize == 4) { + append(c, mov(dst->low, src->low, srcSize)); + } else { + append(c, sxtw(dst->low, src->low)); + } + break; + + case 8: + append(c, mov(dst->low, src->low, srcSize)); + break; + + default: + abort(c); + } + } +} + +void moveZRR(Context* c, + unsigned srcSize, + lir::RegisterPair* src, + unsigned, + lir::RegisterPair* dst) +{ + switch (srcSize) { + case 2: + append(c, uxth(dst->low, src->low)); + break; + + default: + abort(c); + } +} + +void moveCR2(Context* c, + unsigned size, + lir::Constant* src, + lir::RegisterPair* dst, + Promise* callOffset) +{ + if (isFpr(dst)) { + // todo: could use a single fmov here and avoid the temporary for + // constants that fit + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveCR(c, size, src, size, &tmp); + moveRR(c, size, &tmp, size, dst); + c->client->releaseTemporary(tmp.low); + } else if (callOffset == 0 and src->value->resolved()) { + // todo: Is it better performance-wise to load using immediate + // moves or via a PC-relative constant pool? Does it depend on + // how many significant bits there are? + + int64_t value = src->value->value(); + if (value >= 0) { + append(c, movz(dst->low, value & 0xFFFF, 0, size)); + if (value >> 16) { + if ((value >> 16) & 0xFFFF) { + append(c, movk(dst->low, (value >> 16) & 0xFFFF, 16, size)); + } + if (value >> 32) { + if ((value >> 32) & 0xFFFF) { + append(c, movk(dst->low, (value >> 32) & 0xFFFF, 32, size)); + } + if (value >> 48) { + append(c, movk(dst->low, (value >> 48) & 0xFFFF, 48, size)); + } + } + } + } else { + append(c, movn(dst->low, (~value) & 0xFFFF, 0, size)); + if (~(value >> 16)) { + if (((value >> 16) & 0xFFFF) != 0xFFFF) { + append(c, movk(dst->low, (value >> 16) & 0xFFFF, 16, size)); + } + if (~(value >> 32)) { + if (((value >> 32) & 0xFFFF) != 0xFFFF) { + append(c, movk(dst->low, (value >> 32) & 0xFFFF, 32, size)); + } + if (~(value >> 48)) { + append(c, movk(dst->low, (value >> 48) & 0xFFFF, 48, size)); + } + } + } + } + } else { + appendConstantPoolEntry(c, src->value, callOffset); + append(c, ldrPCRel(dst->low, 0, size)); + } +} + +void moveCR(Context* c, + unsigned size, + lir::Constant* src, + unsigned, + lir::RegisterPair* dst) +{ + moveCR2(c, size, src, dst, 0); +} + +void addR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, add(dst->low, a->low, b->low, size)); +} + +void subR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, sub(dst->low, b->low, a->low, size)); +} + +void addC(Context* c, + unsigned size, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + int64_t v = a->value->value(); + if (v) { + if (v > 0 and v < 0x1000) { + append(c, addi(dst->low, b->low, v, 0, size)); + } else if (v > 0 and v < 0x1000000 and v % 0x1000 == 0) { + append(c, addi(dst->low, b->low, v >> 12, 12, size)); + } else { + // todo + abort(c); + } + } else { + moveRR(c, size, b, size, dst); + } +} + +void subC(Context* c, + unsigned size, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + int64_t v = a->value->value(); + if (v) { + if (v > 0 and v < 0x1000) { + append(c, subi(dst->low, b->low, v, 0, size)); + } else if (v > 0 and v < 0x1000000 and v % 0x1000 == 0) { + append(c, subi(dst->low, b->low, v >> 12, 12, size)); + } else { + 
// todo + abort(c); + } + } else { + moveRR(c, size, b, size, dst); + } +} + +void multiplyR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, mul(dst->low, a->low, b->low, size)); +} + +void floatAbsoluteRR(Context* c, + unsigned size, + lir::RegisterPair* a, + unsigned, + lir::RegisterPair* b) +{ + append(c, fabs_(fpr(b), fpr(a), size)); +} + +void floatNegateRR(Context* c, + unsigned size, + lir::RegisterPair* a, + unsigned, + lir::RegisterPair* b) +{ + append(c, fneg(fpr(b), fpr(a), size)); +} + +void float2FloatRR(Context* c, + unsigned size, + lir::RegisterPair* a, + unsigned, + lir::RegisterPair* b) +{ + if (size == 8) { + append(c, fcvtSdDn(fpr(b), fpr(a))); + } else { + append(c, fcvtDdSn(fpr(b), fpr(a))); + } +} + +void float2IntRR(Context* c, + unsigned size, + lir::RegisterPair* a, + unsigned, + lir::RegisterPair* b) +{ + if (size == 8) { + append(c, fcvtasXdDn(b->low, fpr(a))); + } else { + append(c, fcvtasWdSn(b->low, fpr(a))); + } +} + +void int2FloatRR(Context* c, + unsigned, + lir::RegisterPair* a, + unsigned size, + lir::RegisterPair* b) +{ + if (size == 8) { + append(c, scvtfDdXn(fpr(b), a->low)); + } else { + append(c, scvtfSdWn(fpr(b), a->low)); + } +} + +void floatSqrtRR(Context* c, + unsigned size, + lir::RegisterPair* a, + unsigned, + lir::RegisterPair* b) +{ + append(c, fsqrt(fpr(b), fpr(a), size)); +} + +void floatAddR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, fadd(fpr(dst), fpr(b), fpr(a), size)); +} + +void floatSubtractR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, fsub(fpr(dst), fpr(b), fpr(a), size)); +} + +void floatMultiplyR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, fmul(fpr(dst), fpr(b), fpr(a), size)); +} + +void floatDivideR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, fdiv(fpr(dst), fpr(b), fpr(a), size)); +} + +Register normalize(Context* c, + int offset, + Register index, + unsigned scale, + bool* preserveIndex, + bool* release) +{ + if (offset != 0 or scale != 1) { + lir::RegisterPair normalizedIndex( + *preserveIndex ? 
c->client->acquireTemporary(GPR_MASK) : index); + + if (*preserveIndex) { + *release = true; + *preserveIndex = false; + } else { + *release = false; + } + + Register scaled; + + if (scale != 1) { + lir::RegisterPair unscaledIndex(index); + + ResolvedPromise scalePromise(log(scale)); + lir::Constant scaleConstant(&scalePromise); + + shiftLeftC(c, + vm::TargetBytesPerWord, + &scaleConstant, + &unscaledIndex, + &normalizedIndex); + + scaled = normalizedIndex.low; + } else { + scaled = index; + } + + if (offset != 0) { + lir::RegisterPair untranslatedIndex(scaled); + + ResolvedPromise offsetPromise(offset); + lir::Constant offsetConstant(&offsetPromise); + + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveCR(c, + vm::TargetBytesPerWord, + &offsetConstant, + vm::TargetBytesPerWord, + &tmp); + addR(c, + vm::TargetBytesPerWord, + &tmp, + &untranslatedIndex, + &normalizedIndex); + c->client->releaseTemporary(tmp.low); + } + + return normalizedIndex.low; + } else { + *release = false; + return index; + } +} + +void store(Context* c, + unsigned size, + lir::RegisterPair* src, + Register base, + int offset, + Register index, + unsigned scale, + bool preserveIndex) +{ + if (index != NoRegister) { + bool release; + + // todo: browsing the instruction set, it looks like we could do a + // scaled store or load in a single instruction if the offset is + // zero, and we could simplify things for the case of non-zero + // offsets also + + Register normalized + = normalize(c, offset, index, scale, &preserveIndex, &release); + + if (isFpr(src)) { + switch (size) { + case 4: + case 8: + append(c, strFs(fpr(src->low), base, normalized, size)); + break; + + default: + abort(c); + } + } else { + switch (size) { + case 1: + append(c, strb(src->low, base, normalized)); + break; + + case 2: + append(c, strh(src->low, base, normalized)); + break; + + case 4: + case 8: + append(c, str(src->low, base, normalized, size)); + break; + + default: + abort(c); + } + } + + if (release) { + c->client->releaseTemporary(normalized); + } + } else if (abs(offset) == (abs(offset) & 0xFFF)) { + if (isFpr(src)) { + switch (size) { + case 4: + case 8: + assertT(c, offset == (offset & (size == 8 ? 
(~7) : (~3)))); + append(c, striFs(fpr(src->low), base, offset, size)); + break; + + default: + abort(c); + } + } else { // FPR store + switch (size) { + case 1: + append(c, strbi(src->low, base, offset)); + break; + + case 2: + assertT(c, offset == (offset & (~1))); + append(c, strhi(src->low, base, offset)); + break; + + case 4: + assertT(c, offset == (offset & (~3))); + append(c, stri(src->low, base, offset, size)); + break; + + case 8: + assertT(c, offset == (offset & (~7))); + append(c, stri(src->low, base, offset, size)); + break; + + default: + abort(c); + } + } + } else { + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + ResolvedPromise offsetPromise(offset); + lir::Constant offsetConstant(&offsetPromise); + moveCR(c, + vm::TargetBytesPerWord, + &offsetConstant, + vm::TargetBytesPerWord, + &tmp); + + store(c, size, src, base, 0, tmp.low, 1, false); + + c->client->releaseTemporary(tmp.low); + } +} + +void moveRM(Context* c, + unsigned srcSize, + lir::RegisterPair* src, + unsigned dstSize UNUSED, + lir::Memory* dst) +{ + assertT(c, srcSize == dstSize); + + if (src->low.index() == 31) { + assertT(c, c->client == 0); // the compiler should never ask us to + // store the SP; we'll only get here + // when assembling a thunk + + lir::RegisterPair tmp(Register(9)); // we're in a thunk, so we can + // clobber this + + moveRR(c, srcSize, src, srcSize, &tmp); + store( + c, srcSize, &tmp, dst->base, dst->offset, dst->index, dst->scale, true); + } else { + store( + c, srcSize, src, dst->base, dst->offset, dst->index, dst->scale, true); + } +} + +void load(Context* c, + unsigned srcSize, + Register base, + int offset, + Register index, + unsigned scale, + unsigned dstSize, + lir::RegisterPair* dst, + bool preserveIndex, + bool signExtend) +{ + if (index != NoRegister) { + bool release; + Register normalized + = normalize(c, offset, index, scale, &preserveIndex, &release); + + if (isFpr(dst)) { // FPR load + switch (srcSize) { + case 4: + case 8: + append(c, ldrFd(fpr(dst->low), base, normalized, srcSize)); + break; + + default: + abort(c); + } + } else { + switch (srcSize) { + case 1: + if (signExtend) { + append(c, ldrsb(dst->low, base, normalized)); + } else { + append(c, ldrb(dst->low, base, normalized)); + } + break; + + case 2: + if (signExtend) { + append(c, ldrsh(dst->low, base, normalized)); + } else { + append(c, ldrh(dst->low, base, normalized)); + } + break; + + case 4: + case 8: + if (signExtend and srcSize == 4 and dstSize == 8) { + append(c, ldrsw(dst->low, base, normalized)); + } else { + append(c, ldr(dst->low, base, normalized, srcSize)); + } + break; + + default: + abort(c); + } + } + + if (release) { + c->client->releaseTemporary(normalized); + } + } else if (abs(offset) == (abs(offset) & 0xFFF)) { + if (isFpr(dst)) { + switch (srcSize) { + case 4: + case 8: + assertT(c, offset == (offset & (srcSize == 8 ? 
(~7) : (~3)))); + append(c, ldriFd(fpr(dst->low), base, offset, srcSize)); + break; + + default: + abort(c); + } + } else { + switch (srcSize) { + case 1: + if (signExtend) { + append(c, ldrsbi(dst->low, base, offset)); + } else { + append(c, ldrbi(dst->low, base, offset)); + } + break; + + case 2: + assertT(c, offset == (offset & (~1))); + if (signExtend) { + append(c, ldrshi(dst->low, base, offset)); + } else { + append(c, ldrhi(dst->low, base, offset)); + } + break; + + case 4: + case 8: + if (signExtend and srcSize == 4 and dstSize == 8) { + assertT(c, offset == (offset & (~3))); + append(c, ldrswi(dst->low, base, offset)); + } else { + assertT(c, offset == (offset & (srcSize == 8 ? (~7) : (~3)))); + append(c, ldri(dst->low, base, offset, srcSize)); + } + break; + + default: + abort(c); + } + } + } else { + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + ResolvedPromise offsetPromise(offset); + lir::Constant offsetConstant(&offsetPromise); + moveCR(c, + vm::TargetBytesPerWord, + &offsetConstant, + vm::TargetBytesPerWord, + &tmp); + + load(c, srcSize, base, 0, tmp.low, 1, dstSize, dst, false, signExtend); + + c->client->releaseTemporary(tmp.low); + } +} + +void moveMR(Context* c, + unsigned srcSize, + lir::Memory* src, + unsigned dstSize, + lir::RegisterPair* dst) +{ + if (dst->low.index() == 31) { + assertT(c, c->client == 0); // the compiler should never ask us to + // load the SP; we'll only get here + // when assembling a thunk + + lir::RegisterPair tmp(Register(9)); // we're in a thunk, so we can + // clobber this + + load(c, srcSize, src->base, src->offset, src->index, src->scale, dstSize, &tmp, true, true); + moveRR(c, dstSize, &tmp, dstSize, dst); + } else { + load(c, + srcSize, + src->base, + src->offset, + src->index, + src->scale, + dstSize, + dst, + true, + true); + } +} + +void moveZMR(Context* c, + unsigned srcSize, + lir::Memory* src, + unsigned dstSize, + lir::RegisterPair* dst) +{ + load(c, + srcSize, + src->base, + src->offset, + src->index, + src->scale, + dstSize, + dst, + true, + false); +} + +void andR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, and_(dst->low, a->low, b->low, size)); +} + +void andC(Context* c, + unsigned size, + lir::Constant* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + int64_t v = a->value->value(); + + if (~v) { + bool useTemporary = b->low == dst->low; + lir::RegisterPair tmp(dst->low); + if (useTemporary) { + tmp.low = c->client->acquireTemporary(GPR_MASK); + } + + moveCR(c, size, a, size, &tmp); + andR(c, size, b, &tmp, dst); + + if (useTemporary) { + c->client->releaseTemporary(tmp.low); + } + } else { + moveRR(c, size, b, size, dst); + } +} + +void orR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, orr(dst->low, a->low, b->low, size)); +} + +void xorR(Context* c, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::RegisterPair* dst) +{ + append(c, eor(dst->low, a->low, b->low, size)); +} + +void moveAR(Context* c, + unsigned srcSize, + lir::Address* src, + unsigned dstSize, + lir::RegisterPair* dst) +{ + assertT( + c, + srcSize == vm::TargetBytesPerWord and dstSize == vm::TargetBytesPerWord); + + lir::Constant constant(src->address); + moveCR(c, srcSize, &constant, dstSize, dst); + + lir::Memory memory(dst->low, 0, NoRegister, 0); + moveMR(c, dstSize, &memory, dstSize, dst); +} + +void compareRR(Context* c, + unsigned aSize, + lir::RegisterPair* a, 
+ unsigned bSize UNUSED, + lir::RegisterPair* b) +{ + assertT(c, not(isFpr(a) xor isFpr(b))); + assertT(c, aSize == bSize); + + if (isFpr(a)) { + append(c, fcmp(fpr(b), fpr(a), aSize)); + } else { + append(c, cmp(b->low, a->low, aSize)); + } +} + +void compareCR(Context* c, + unsigned aSize, + lir::Constant* a, + unsigned bSize UNUSED, + lir::RegisterPair* b) +{ + assertT(c, aSize == bSize); + + if (!isFpr(b) && a->value->resolved()) { + int64_t v = a->value->value(); + if (v == 0) { + append(c, cmp(b->low, Register(31), aSize)); + return; + } else if (v > 0 and v < 0x1000) { + append(c, cmpi(b->low, v, 0, aSize)); + return; + } else if (v > 0 and v < 0x1000000 and v % 0x1000 == 0) { + append(c, cmpi(b->low, v >> 12, 12, aSize)); + return; + } + } + + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveCR(c, aSize, a, bSize, &tmp); + compareRR(c, bSize, &tmp, bSize, b); + c->client->releaseTemporary(tmp.low); +} + +void compareCM(Context* c, + unsigned aSize, + lir::Constant* a, + unsigned bSize, + lir::Memory* b) +{ + assertT(c, aSize == bSize); + + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveMR(c, bSize, b, bSize, &tmp); + compareCR(c, aSize, a, bSize, &tmp); + c->client->releaseTemporary(tmp.low); +} + +void compareRM(Context* c, + unsigned aSize, + lir::RegisterPair* a, + unsigned bSize, + lir::Memory* b) +{ + assertT(c, aSize == bSize); + + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveMR(c, bSize, b, bSize, &tmp); + compareRR(c, aSize, a, bSize, &tmp); + c->client->releaseTemporary(tmp.low); +} + +void compareMR(Context* c, + unsigned aSize, + lir::Memory* a, + unsigned bSize, + lir::RegisterPair* b) +{ + assertT(c, aSize == bSize); + + lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK)); + moveMR(c, aSize, a, aSize, &tmp); + compareRR(c, aSize, &tmp, bSize, b); + c->client->releaseTemporary(tmp.low); +} + +int32_t branch(Context* c, lir::TernaryOperation op) +{ + switch (op) { + case lir::JumpIfEqual: + case lir::JumpIfFloatEqual: + return beq(0); + + case lir::JumpIfNotEqual: + case lir::JumpIfFloatNotEqual: + return bne(0); + + case lir::JumpIfLess: + case lir::JumpIfFloatLess: + case lir::JumpIfFloatLessOrUnordered: + return blt(0); + + case lir::JumpIfGreater: + case lir::JumpIfFloatGreater: + return bgt(0); + + case lir::JumpIfLessOrEqual: + case lir::JumpIfFloatLessOrEqual: + case lir::JumpIfFloatLessOrEqualOrUnordered: + return ble(0); + + case lir::JumpIfGreaterOrEqual: + case lir::JumpIfFloatGreaterOrEqual: + return bge(0); + + case lir::JumpIfFloatGreaterOrUnordered: + return bhi(0); + + case lir::JumpIfFloatGreaterOrEqualOrUnordered: + return bpl(0); + + default: + abort(c); + } +} + +void conditional(Context* c, int32_t branch, lir::Constant* target) +{ + appendOffsetTask(c, target->value, offsetPromise(c)); + append(c, branch); +} + +void branch(Context* c, lir::TernaryOperation op, lir::Constant* target) +{ + conditional(c, branch(c, op), target); +} + +void branchRR(Context* c, + lir::TernaryOperation op, + unsigned size, + lir::RegisterPair* a, + lir::RegisterPair* b, + lir::Constant* target) +{ + compareRR(c, size, a, size, b); + branch(c, op, target); +} + +void branchCR(Context* c, + lir::TernaryOperation op, + unsigned size, + lir::Constant* a, + lir::RegisterPair* b, + lir::Constant* target) +{ + assertT(c, not isFloatBranch(op)); + + compareCR(c, size, a, size, b); + branch(c, op, target); +} + +void branchRM(Context* c, + lir::TernaryOperation op, + unsigned size, + lir::RegisterPair* a, + 
+              lir::Memory* b,
+              lir::Constant* target)
+{
+  assertT(c, not isFloatBranch(op));
+  assertT(c, size <= vm::TargetBytesPerWord);
+
+  if (a->low.index() == 31) {
+    // stack overflow checks need to compare to the stack pointer, but
+    // we can only encode that in the opposite operand order we're
+    // given, so we need to reverse everything:
+    assertT(c, op == lir::JumpIfGreaterOrEqual);
+    compareMR(c, size, b, size, a);
+    branch(c, lir::JumpIfLess, target);
+  } else {
+    compareRM(c, size, a, size, b);
+    branch(c, op, target);
+  }
+}
+
+void branchCM(Context* c,
+              lir::TernaryOperation op,
+              unsigned size,
+              lir::Constant* a,
+              lir::Memory* b,
+              lir::Constant* target)
+{
+  assertT(c, not isFloatBranch(op));
+  assertT(c, size <= vm::TargetBytesPerWord);
+
+  compareCM(c, size, a, size, b);
+  branch(c, op, target);
+}
+
+ShiftMaskPromise* shiftMaskPromise(Context* c,
+                                   Promise* base,
+                                   unsigned shift,
+                                   int64_t mask)
+{
+  return new (c->zone) ShiftMaskPromise(base, shift, mask);
+}
+
+void moveCM(Context* c,
+            unsigned srcSize,
+            lir::Constant* src,
+            unsigned dstSize,
+            lir::Memory* dst)
+{
+  lir::RegisterPair tmp(c->client->acquireTemporary(GPR_MASK));
+  moveCR(c, srcSize, src, dstSize, &tmp);
+  moveRM(c, dstSize, &tmp, dstSize, dst);
+  c->client->releaseTemporary(tmp.low);
+}
+
+void negateRR(Context* c,
+              unsigned srcSize,
+              lir::RegisterPair* src,
+              unsigned dstSize UNUSED,
+              lir::RegisterPair* dst)
+{
+  assertT(c, srcSize == dstSize);
+
+  append(c, neg(dst->low, src->low, srcSize));
+}
+
+void callR(Context* c, unsigned size UNUSED, lir::RegisterPair* target)
+{
+  assertT(c, size == vm::TargetBytesPerWord);
+  append(c, blr(target->low));
+}
+
+void callC(Context* c, unsigned size UNUSED, lir::Constant* target)
+{
+  assertT(c, size == vm::TargetBytesPerWord);
+
+  appendOffsetTask(c, target->value, offsetPromise(c));
+  append(c, bl(0));
+}
+
+void longCallC(Context* c, unsigned size UNUSED, lir::Constant* target)
+{
+  assertT(c, size == vm::TargetBytesPerWord);
+
+  lir::RegisterPair tmp(
+      Register(9));  // a non-arg reg that we don't mind clobbering
+  moveCR2(c, vm::TargetBytesPerWord, target, &tmp, offsetPromise(c));
+  callR(c, vm::TargetBytesPerWord, &tmp);
+}
+
+void longJumpC(Context* c, unsigned size UNUSED, lir::Constant* target)
+{
+  assertT(c, size == vm::TargetBytesPerWord);
+
+  lir::RegisterPair tmp(
+      Register(9));  // a non-arg reg that we don't mind clobbering
+  moveCR2(c, vm::TargetBytesPerWord, target, &tmp, offsetPromise(c));
+  jumpR(c, vm::TargetBytesPerWord, &tmp);
+}
+
+void jumpC(Context* c, unsigned size UNUSED, lir::Constant* target)
+{
+  assertT(c, size == vm::TargetBytesPerWord);
+
+  appendOffsetTask(c, target->value, offsetPromise(c));
+  append(c, b(0));
+}
+
+void return_(Context* c)
+{
+  append(c, br(LinkRegister));
+}
+
+void trap(Context* c)
+{
+  append(c, brk(0));
+}
+
+// todo: determine the minimal operation types and domains needed to
+// implement the following barriers (see
+// http://community.arm.com/groups/processors/blog/2011/10/19/memory-access-ordering-part-3--memory-access-ordering-in-the-arm-architecture).
+// For now, we just use DMB SY as a conservative but not necessarily
+// performant choice.
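+//
+// A possible refinement (not part of this patch): dmb(0xF) below encodes
+// DMB SY (CRm = 0b1111).  Assuming dmb() simply takes the CRm option value,
+// a less conservative mapping could plausibly be:
+//
+//   void loadBarrier(Context* c)       { append(c, dmb(0xD)); }  // DMB LD
+//   void storeStoreBarrier(Context* c) { append(c, dmb(0xE)); }  // DMB ST
+//   void storeLoadBarrier(Context* c)  { append(c, dmb(0xF)); }  // store-load still needs a full DMB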
+
+void memoryBarrier(Context* c)
+{
+  append(c, dmb(0xF));
+}
+
+void loadBarrier(Context* c)
+{
+  memoryBarrier(c);
+}
+
+void storeStoreBarrier(Context* c)
+{
+  memoryBarrier(c);
+}
+
+void storeLoadBarrier(Context* c)
+{
+  memoryBarrier(c);
+}
+
+} // namespace arm
+} // namespace codegen
+} // namespace avian
+
+#endif // TARGET_BYTES_PER_WORD == 8
diff --git a/src/codegen/target/arm/registers.h b/src/codegen/target/arm/registers.h
index ad13db466a..3bf4dc4041 100644
--- a/src/codegen/target/arm/registers.h
+++ b/src/codegen/target/arm/registers.h
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include "avian/environment.h"
+
 namespace avian {
 namespace codegen {
 namespace arm {
@@ -21,16 +23,30 @@ namespace arm {
 const uint64_t MASK_LO32 = 0xffffffff;
 const unsigned MASK_LO8 = 0xff;
 
+#if TARGET_BYTES_PER_WORD == 8
+constexpr Register ThreadRegister(19);
+constexpr Register StackRegister(31);
+constexpr Register LinkRegister(30);
+constexpr Register FrameRegister(29);
+constexpr Register ProgramCounter(0xFE); // i.e. unaddressable
+
+const int N_GPRS = 32;
+const int N_FPRS = 32;
+const RegisterMask GPR_MASK = 0xffffffff;
+const RegisterMask FPR_MASK = 0xffffffff00000000;
+
+#else
+constexpr Register ThreadRegister(8);
+constexpr Register StackRegister(13);
+constexpr Register LinkRegister(14);
+constexpr Register FrameRegister(0xFE); // i.e. there is none
+constexpr Register ProgramCounter(15);
+
 const int N_GPRS = 16;
 const int N_FPRS = 16;
 const RegisterMask GPR_MASK = 0xffff;
 const RegisterMask FPR_MASK = 0xffff0000;
 
-inline bool isFpr(lir::RegisterPair* reg)
-{
-  return reg->low.index() >= N_GPRS;
-}
-
 inline int fpr64(Register reg)
 {
   return reg.index() - N_GPRS;
@@ -47,19 +63,13 @@ inline int fpr32(lir::RegisterPair* reg)
 {
   return fpr64(reg) << 1;
 }
-
-#ifdef ARCH_arm64
-constexpr Register ThreadRegister(19);
-constexpr Register StackRegister(31);
-constexpr Register LinkRegister(30);
-constexpr Register ProgramCounter(0xFE); // i.e. unaddressable
-#else
-constexpr Register ThreadRegister(8);
-constexpr Register StackRegister(13);
-constexpr Register LinkRegister(14);
-constexpr Register ProgramCounter(15);
 #endif
 
+inline bool isFpr(lir::RegisterPair* reg)
+{
+  return reg->low.index() >= N_GPRS;
+}
+
 } // namespace arm
 } // namespace codegen
 } // namespace avian
diff --git a/src/compile-arm.S b/src/compile-arm.S
index 37b61da454..83703af607 100644
--- a/src/compile-arm.S
+++ b/src/compile-arm.S
@@ -16,11 +16,11 @@
 #define BYTES_PER_WORD 4
 
 #define LOCAL(x) .L##x
-
+
 #ifdef __APPLE__
 # define GLOBAL(x) _##x
 #else
-# define GLOBAL(x) x
+# define GLOBAL(x) x
 #endif
 
 #define CONTINUATION_NEXT 4
@@ -29,7 +29,7 @@
 #define CONTINUATION_FRAME_POINTER_OFFSET 24
 #define CONTINUATION_LENGTH 28
 #define CONTINUATION_BODY 32
-
+
 .globl GLOBAL(vmInvoke)
 .align 2
 GLOBAL(vmInvoke):
@@ -56,7 +56,7 @@ GLOBAL(vmInvoke):
   eor r4, sp, r3
   tst r4, #4
   subne sp, sp, #4
-
+
   // copy arguments into place
   sub sp, r3
   mov r4, #0
@@ -87,7 +87,7 @@ LOCAL(vmInvoke_argumentTest):
 GLOBAL(vmInvoke_returnAddress):
   // restore stack pointer
   ldr sp, [r8, #TARGET_THREAD_SCRATCH]
-
+
   // clear MyThread::stack to avoid confusing another thread calling
   // java.lang.Thread.getStackTrace on this one. See
   // MyProcess::getStackTrace in compile.cpp for details on how we get
@@ -109,7 +109,7 @@ GLOBAL(vmInvoke_safeStack):
   ldr r6,[r5,#CONTINUATION_LENGTH]
   lsl r6,r6,#2
   neg r7,r6
-  add r7,r7,#-80
+  add r7,r7,#-80 // 80 bytes for callee-saved register values
   mov r4,sp
   str r4,[sp,r7]!
@@ -167,10 +167,10 @@ LOCAL(vmInvoke_handleException):
   bx r7
 
 LOCAL(vmInvoke_exit):
-#endif // AVIAN_CONTINUATIONS
 
   mov ip, #0
   str ip, [r8, #TARGET_THREAD_STACK]
+#endif // AVIAN_CONTINUATIONS
 
   // restore return type
   ldr ip, [sp], #4
@@ -201,7 +201,7 @@ GLOBAL(vmJumpAndInvoke):
   // which is not true in this case
   sub r2,r2,r6
   sub r2,r2,#84
-
+
   mov r8,r0
 
   // copy arguments into place
@@ -220,7 +220,7 @@ LOCAL(vmJumpAndInvoke_argumentTest):
   // the arguments have been copied, so we can set the real stack
   // pointer now
   mov sp,r2
-
+
   // set return address to vmInvoke_returnAddress
 #ifdef __APPLE__
   movw r11, :lower16:(GLOBAL(vmInvoke_returnAddress)-(LOCAL(vmJumpAndInvoke_getAddress)+8))
@@ -246,7 +246,7 @@ LOCAL(vmInvoke_getAddress_word):
 LOCAL(vmJumpAndInvoke_getAddress_word):
   .word _GLOBAL_OFFSET_TABLE_-(LOCAL(vmJumpAndInvoke_getAddress)+8)
 #endif // not __APPLE__
-
+
 #else // not AVIAN_CONTINUATIONS
   // vmJumpAndInvoke should only be called when continuations are
   // enabled, so we force a crash if we reach here:
diff --git a/src/compile-arm64.S b/src/compile-arm64.S
index 65f76df6f3..c1c9c942b2 100644
--- a/src/compile-arm64.S
+++ b/src/compile-arm64.S
@@ -13,23 +13,23 @@
 
 .text
 
-#define BYTES_PER_WORD 4
+#define BYTES_PER_WORD 8
 
 #define LOCAL(x) .L##x
-
+
 #ifdef __APPLE__
 # define GLOBAL(x) _##x
 #else
-# define GLOBAL(x) x
+# define GLOBAL(x) x
 #endif
 
-#define CONTINUATION_NEXT 4
-#define CONTINUATION_ADDRESS 16
-#define CONTINUATION_RETURN_ADDRESS_OFFSET 20
-#define CONTINUATION_FRAME_POINTER_OFFSET 24
-#define CONTINUATION_LENGTH 28
-#define CONTINUATION_BODY 32
-
+#define CONTINUATION_NEXT 8
+#define CONTINUATION_ADDRESS 32
+#define CONTINUATION_RETURN_ADDRESS_OFFSET 40
+#define CONTINUATION_FRAME_POINTER_OFFSET 48
+#define CONTINUATION_LENGTH 56
+#define CONTINUATION_BODY 64
+
 .globl GLOBAL(vmInvoke)
 .align 2
 GLOBAL(vmInvoke):
@@ -43,6 +43,7 @@ GLOBAL(vmInvoke):
 
   // allocate frame
   stp x29, x30, [sp,#-96]!
+  mov x29, sp
 
   // save callee-saved register values
   stp x19, x20, [sp,#16]
@@ -59,7 +60,7 @@ GLOBAL(vmInvoke):
 
   // copy arguments into place
   sub sp, sp, w3, uxtw
-  mov x5, #0
+  mov x4, #0
   b LOCAL(vmInvoke_argumentTest)
 
 LOCAL(vmInvoke_argumentLoop):
@@ -89,22 +90,74 @@ GLOBAL(vmInvoke_returnAddress):
   // MyProcess::getStackTrace in compile.cpp for details on how we get
   // a reliable stack trace from a thread that might be interrupted at
   // any point in its execution.
-  mov x5, #0
-  str x5, [x19, #TARGET_THREAD_STACK]
+  str xzr, [x19, #TARGET_THREAD_STACK]
 
 .globl GLOBAL(vmInvoke_safeStack)
 .align 2
 GLOBAL(vmInvoke_safeStack):
 
 #ifdef AVIAN_CONTINUATIONS
-#error todo
+  // call the next continuation, if any
+  ldr x5, [x19,#TARGET_THREAD_CONTINUATION]
+  cmp x5, xzr
+  b.eq LOCAL(vmInvoke_exit)
+
+  ldr x6, [x5,#CONTINUATION_LENGTH]
+  lsl x6, x6, #3
+  neg x7, x6
+  add x7, x7, #-128 // 128 bytes for callee-saved register values
+  mov x4, sp
+  add sp, sp, x7
+  str x4, [sp]
+
+  add x7, x5, #CONTINUATION_BODY
+  mov x11, xzr
+  b LOCAL(vmInvoke_continuationTest)
+
+LOCAL(vmInvoke_continuationLoop):
+  ldr x9, [x7,x11]
+  str x9, [sp,x11]
+  add x11, x11, #8
+
+LOCAL(vmInvoke_continuationTest):
+  cmp x11, x6
+  b.le LOCAL(vmInvoke_continuationLoop)
+
+  ldr x7, [x5,#CONTINUATION_RETURN_ADDRESS_OFFSET]
+  adr x11, GLOBAL(vmInvoke_returnAddress)
+  str x11, [sp,x7]
+
+  ldr x7, [x5,#CONTINUATION_NEXT]
+  str x7, [x19,#TARGET_THREAD_CONTINUATION]
+
+  // call the continuation unless we're handling an exception
+  ldr x7, [x19,#TARGET_THREAD_EXCEPTION]
+  cmp x7, xzr
+  b.ne LOCAL(vmInvoke_handleException)
+  ldr x7, [x5,#CONTINUATION_ADDRESS]
+  br x7
+
+LOCAL(vmInvoke_handleException):
+  // we're handling an exception - call the exception handler instead
+  str xzr, [x19,#TARGET_THREAD_EXCEPTION]
+  ldr x11, [x19,#TARGET_THREAD_EXCEPTIONSTACKADJUSTMENT]
+  ldr x9, [sp]
+  neg x11, x11
+  add sp, sp, x11
+  str x9, [sp]
+  ldr x11, [x19,#TARGET_THREAD_EXCEPTIONOFFSET]
+  str x7, [sp,x11]
+
+  ldr x7, [x19,#TARGET_THREAD_EXCEPTIONHANDLER]
+  br x7
+
+LOCAL(vmInvoke_exit):
+  str xzr, [x19, #TARGET_THREAD_STACK]
+
 #endif // AVIAN_CONTINUATIONS
 
-  mov x5, #0
-  str x5, [x19, #TARGET_THREAD_STACK]
-
   // restore return type
-  ldr w5, [sp], #4
+  ldr w5, [sp],#16
 
   // restore callee-saved register values
   ldp x19, x20, [sp,#16]
@@ -121,7 +174,44 @@ LOCAL(vmInvoke_return):
 .align 2
 GLOBAL(vmJumpAndInvoke):
 #ifdef AVIAN_CONTINUATIONS
-#error todo
+  // x0: thread
+  // x1: address
+  // x2: stack
+  // x3: argumentFootprint
+  // x4: arguments
+  // x5: frameSize
+
+  // allocate new frame, adding room for callee-saved registers, plus
+  // 8 bytes of padding since the calculation of frameSize assumes 8
+  // bytes have already been allocated to save the return address,
+  // which is not true in this case
+  sub x2, x2, x5
+  sub x2, x2, #136
+
+  mov x19, x0
+
+  // copy arguments into place
+  mov x6, xzr
+  b LOCAL(vmJumpAndInvoke_argumentTest)
+
+LOCAL(vmJumpAndInvoke_argumentLoop):
+  ldr x12, [x4,x6]
+  str x12, [x2,x6]
+  add x6, x6, #4
+
+LOCAL(vmJumpAndInvoke_argumentTest):
+  cmp x6, x3
+  ble LOCAL(vmJumpAndInvoke_argumentLoop)
+
+  // the arguments have been copied, so we can set the real stack
+  // pointer now
+  mov sp, x2
+
+  // set return address to vmInvoke_returnAddress
+  adr x30, GLOBAL(vmInvoke_returnAddress)
+
+  br x1
+
 #else // not AVIAN_CONTINUATIONS
   // vmJumpAndInvoke should only be called when continuations are
   // enabled, so we force a crash if we reach here:
diff --git a/src/compile.cpp b/src/compile.cpp
index 47b55574e7..51790bb0b3 100644
--- a/src/compile.cpp
+++ b/src/compile.cpp
@@ -2189,6 +2189,8 @@ GcContinuation* makeCurrentContinuation(MyThread* t,
 
   *targetIp = 0;
   while (*targetIp == 0) {
+    assertT(t, ip);
+
     GcMethod* method = methodForIp(t, ip);
     if (method) {
       PROTECT(t, method);
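A note on the constant paths in compareCR above: AArch64 cmp (an alias of subs) only accepts a 12-bit unsigned immediate, optionally shifted left by 12 bits, which is why the patch emits cmpi only for values below 0x1000, or for multiples of 0x1000 below 0x1000000, and otherwise materializes the constant in a temporary register. A minimal standalone sketch of that encodability check follows; the helper name is illustrative and not part of the patch.

    #include <cstdint>

    // Mirrors the conditions used in compareCR: returns true if `v` can be
    // encoded directly as a cmp/subs immediate, along with the 12-bit value
    // and the shift (0 or 12) to use.  Zero is handled separately in the
    // patch by comparing against the zero register.
    bool encodableAsCmpImmediate(int64_t v, unsigned* imm12, unsigned* shift)
    {
      if (v > 0 && v < 0x1000) {  // fits in imm12, LSL #0
        *imm12 = static_cast<unsigned>(v);
        *shift = 0;
        return true;
      }
      if (v > 0 && v < 0x1000000 && v % 0x1000 == 0) {  // imm12, LSL #12
        *imm12 = static_cast<unsigned>(v >> 12);
        *shift = 12;
        return true;
      }
      return false;  // otherwise move the constant into a register first
    }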