diff --git a/classpath/jni-util.h b/classpath/jni-util.h
index 4270976131..d373aaf268 100644
--- a/classpath/jni-util.h
+++ b/classpath/jni-util.h
@@ -64,7 +64,7 @@ typedef unsigned __int64 uint64_t;
 #    define ARCH_x86_32
 #  elif defined __x86_64__
 #    define ARCH_x86_64
-#  elif defined __POWERPC__
+#  elif (defined __POWERPC__) || (defined __powerpc__)
 #    define ARCH_powerpc
 #  elif defined __arm__
 #    define ARCH_arm
diff --git a/makefile b/makefile
index ccd6c8ada9..35db15f9f9 100644
--- a/makefile
+++ b/makefile
@@ -3,7 +3,11 @@ MAKEFLAGS = -s
 name = avian
 version = 0.4
 
-build-arch := $(shell uname -m | sed 's/^i.86$$/i386/' | sed 's/^arm.*$$/arm/')
+build-arch := $(shell uname -m \
+	| sed 's/^i.86$$/i386/' \
+	| sed 's/^arm.*$$/arm/' \
+	| sed 's/ppc/powerpc/')
+
 ifeq (Power,$(filter Power,$(build-arch)))
 	build-arch = powerpc
 endif
@@ -165,7 +169,7 @@ endif
 build-cflags = $(common-cflags) -fPIC -fvisibility=hidden \
 	"-I$(JAVA_HOME)/include/linux" -I$(src) -pthread
 
-converter-cflags = -D__STDC_CONSTANT_MACROS
+converter-cflags = -D__STDC_CONSTANT_MACROS -Isrc/binaryToObject
 
 cflags = $(build-cflags)
 
@@ -197,6 +201,17 @@ endif
 ifeq ($(arch),powerpc)
 	asm = powerpc
 	pointer-size = 4
+
+	ifneq ($(platform),darwin)
+		ifneq ($(arch),$(build-arch))
+			converter-cflags += -DOPPOSITE_ENDIAN
+			cxx = powerpc-linux-gnu-g++
+			cc = powerpc-linux-gnu-gcc
+			ar = powerpc-linux-gnu-ar
+			ranlib = powerpc-linux-gnu-ranlib
+			strip = powerpc-linux-gnu-strip
+		endif
+	endif
 endif
 ifeq ($(arch),arm)
 	asm = arm
@@ -239,7 +254,7 @@ ifeq ($(platform),darwin)
 	shared = -dynamiclib
 
 	ifeq ($(arch),powerpc)
-		ifneq (,$(filter i386 x86_64,$(build-arch)))
+		ifneq (,$(filter i386 x86_64 arm,$(build-arch)))
 			converter-cflags += -DOPPOSITE_ENDIAN
 		endif
 		openjdk-extra-cflags += -arch ppc -mmacosx-version-min=10.4
diff --git a/src/arm.S b/src/arm.S
index 458ece75cb..ece98ea386 100644
--- a/src/arm.S
+++ b/src/arm.S
@@ -66,6 +66,8 @@ vmRun:
   // r1: arguments
   // r2: checkpoint
   stmfd sp!, {r4-r11, lr}
+  // align stack
+  sub   sp, sp, #12
    
   str   sp, [r2, #CHECKPOINT_STACK]
 
@@ -76,5 +78,6 @@ vmRun:
 
 .globl vmRun_returnAddress
 vmRun_returnAddress:
+  add   sp, sp, #12
   ldmfd sp!, {r4-r11, lr}
   bx    lr
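
Why 12 bytes: stmfd sp!, {r4-r11, lr} pushes 9 registers, i.e. 9 x 4 = 36 bytes, which is not a multiple of 8. Reserving 12 more bytes makes the frame 36 + 12 = 48 bytes, so the stack handed to the generated code keeps the 8-byte alignment the ARM EABI expects (assuming sp was 8-byte aligned on entry to vmRun).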
diff --git a/src/arm.cpp b/src/arm.cpp
index d0bb5f02c7..38ba3a6ce4 100644
--- a/src/arm.cpp
+++ b/src/arm.cpp
@@ -670,7 +670,9 @@ padding(MyBlock* b, unsigned offset)
   unsigned total = 0;
   for (PoolEvent* e = b->poolEventHead; e; e = e->next) {
     if (e->offset <= offset) {
-      total += BytesPerWord;
+      if (b->next) {
+        total += BytesPerWord;
+      }
       for (PoolOffset* o = e->poolOffsetHead; o; o = o->next) {
         total += BytesPerWord;
       }
@@ -1838,7 +1840,11 @@ class MyArchitecture: public Assembler::Architecture {
 
     return index;
   }
-  
+
+  virtual bool hasLinkRegister() {
+    return true;
+  }
+
   virtual unsigned stackAlignmentInWords() {
     return StackAlignmentInWords;
   }
@@ -2126,7 +2132,11 @@ class MyAssembler: public Assembler {
              &handlerConstant);
   }
 
-  virtual void saveFrame(unsigned stackOffset) {
+  virtual void saveFrame(unsigned stackOffset, unsigned ipOffset) {
+    Register link(LinkRegister);
+    Memory linkDst(ThreadRegister, ipOffset);
+    moveRM(&c, BytesPerWord, &link, BytesPerWord, &linkDst);
+
     Register stack(StackRegister);
     Memory stackDst(ThreadRegister, stackOffset);
     moveRM(&c, BytesPerWord, &stack, BytesPerWord, &stackDst);
@@ -2325,9 +2335,12 @@ class MyAssembler: public Assembler {
     }
   }
 
-  virtual void writeTo(uint8_t* dst) {
+  virtual void setDestination(uint8_t* dst) {
     c.result = dst;
-    
+  }
+
+  virtual void write() {
+    uint8_t* dst = c.result;
     unsigned dstOffset = 0;
     for (MyBlock* b = c.firstBlock; b; b = b->next) {
       if (DebugPool) {
@@ -2348,10 +2361,12 @@ class MyAssembler: public Assembler {
                     o, o->offset, b);
           }
 
-          poolSize += BytesPerWord;
-
           unsigned entry = dstOffset + poolSize;
 
+          if (b->next) {
+            entry += BytesPerWord;
+          }
+
           o->entry->address = dst + entry;
 
           unsigned instruction = o->block->start
@@ -2362,9 +2377,13 @@ class MyAssembler: public Assembler {
 
           int32_t* p = reinterpret_cast<int32_t*>(dst + instruction);
           *p = (v & PoolOffsetMask) | ((~PoolOffsetMask) & *p);
+
+          poolSize += BytesPerWord;
         }
 
-        write4(dst + dstOffset, ::b((poolSize + BytesPerWord - 8) >> 2));
+        if (b->next) {
+          write4(dst + dstOffset, ::b((poolSize + BytesPerWord - 8) >> 2));
+        }
 
         dstOffset += poolSize + BytesPerWord;
       }
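
The b->next checks added to padding() and write() above both follow from the per-block layout this assembler emits; a sketch, as inferred from this code:

    ... last instruction of the block ...
    b    <past the pool>        // emitted only when another block follows
    .word <pool entry 0>        // one word per PoolOffset
    .word <pool entry 1>
    ... first instruction of the next block ...

For the final block nothing executes past the pool, so the branch over it is no longer emitted, and the word it would occupy is no longer counted when computing padding or the pool entries' addresses.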
diff --git a/src/assembler.h b/src/assembler.h
index d122a53207..f57636cb14 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -22,7 +22,7 @@ const bool TailCalls = true;
 const bool TailCalls = false;
 #endif
 
-#ifdef AVIAN_USE_FRAME_POINTER
+#if (defined AVIAN_USE_FRAME_POINTER) || (defined ARCH_powerpc)
 const bool UseFramePointer = true;
 #else
 const bool UseFramePointer = false;
@@ -333,6 +333,8 @@ class Assembler {
     virtual unsigned argumentRegisterCount() = 0;
     virtual int argumentRegister(unsigned index) = 0;
 
+    virtual bool hasLinkRegister() = 0;
+
     virtual unsigned stackAlignmentInWords() = 0;
 
     virtual bool matchCall(void* returnAddress, void* target) = 0;
@@ -397,7 +399,7 @@ class Assembler {
 
   virtual void checkStackOverflow(uintptr_t handler,
                                   unsigned stackLimitOffsetFromThread) = 0;
-  virtual void saveFrame(unsigned stackOffset) = 0;
+  virtual void saveFrame(unsigned stackOffset, unsigned ipOffset) = 0;
   virtual void pushFrame(unsigned argumentCount, ...) = 0;
   virtual void allocateFrame(unsigned footprint) = 0;
   virtual void adjustFrame(unsigned difference) = 0;
@@ -426,7 +428,9 @@ class Assembler {
                      unsigned bSize, OperandType bType, Operand* bOperand,
                      unsigned cSize, OperandType cType, Operand* cOperand) = 0;
 
-  virtual void writeTo(uint8_t* dst) = 0;
+  virtual void setDestination(uint8_t* dst) = 0;
+
+  virtual void write() = 0;
 
   virtual Promise* offset(bool forTrace = false) = 0;
 
diff --git a/src/binaryToObject/elf.cpp b/src/binaryToObject/elf.cpp
index 8af5c0cd58..51193dd2c7 100644
--- a/src/binaryToObject/elf.cpp
+++ b/src/binaryToObject/elf.cpp
@@ -12,6 +12,8 @@
 #include "stdio.h"
 #include "string.h"
 
+#include "endianness.h"
+
 #define EI_NIDENT 16
 
 #define EI_MAG0 0
@@ -35,6 +37,7 @@
 #define EV_CURRENT 1
 
 #define ELFDATA2LSB 1
+#define ELFDATA2MSB 2
 
 #define ELFOSABI_SYSV 0
 
@@ -43,6 +46,7 @@
 #define EM_386 3
 #define EM_X86_64 62
 #define EM_ARM 40
+#define EM_PPC 20
 
 #define SHT_PROGBITS 1
 #define SHT_SYMTAB 2
@@ -77,7 +81,6 @@
 #  error
 #endif
 
-#define Data ELFDATA2LSB
 #define OSABI ELFOSABI_SYSV
 
 namespace {
@@ -178,7 +181,7 @@ void
 writeObject(const uint8_t* data, unsigned size, FILE* out,
             const char* startName, const char* endName,
             const char* sectionName, unsigned sectionFlags,
-            unsigned alignment, int machine)
+            unsigned alignment, int machine, int encoding)
 {
   const unsigned sectionCount = 5;
   const unsigned symbolCount = 2;
@@ -217,99 +220,108 @@ writeObject(const uint8_t* data, unsigned size, FILE* out,
   const unsigned stringTableSectionNumber = 3;
 
   FileHeader fileHeader;
-  fileHeader.e_ident[EI_MAG0] = ELFMAG0;
-  fileHeader.e_ident[EI_MAG1] = ELFMAG1;
-  fileHeader.e_ident[EI_MAG2] = ELFMAG2;
-  fileHeader.e_ident[EI_MAG3] = ELFMAG3;
-  fileHeader.e_ident[EI_CLASS] = Class;
-  fileHeader.e_ident[EI_DATA] = Data;
-  fileHeader.e_ident[EI_VERSION] = EV_CURRENT;
-  fileHeader.e_ident[EI_OSABI] = OSABI;
-  fileHeader.e_ident[EI_ABIVERSION] = 0;
-  fileHeader.e_type = ET_REL;
-  fileHeader.e_machine = machine;
-  fileHeader.e_version = EV_CURRENT;
-  fileHeader.e_entry = 0;
-  fileHeader.e_phoff = 0;
-  fileHeader.e_shoff = sizeof(FileHeader);
-  fileHeader.e_flags = (machine == EM_ARM ? 0x04000000 : 0);
-  fileHeader.e_ehsize = sizeof(FileHeader);
-  fileHeader.e_phentsize = 0;
-  fileHeader.e_phnum = 0;
-  fileHeader.e_shentsize = sizeof(SectionHeader);
-  fileHeader.e_shnum = sectionCount;
-  fileHeader.e_shstrndx = sectionStringTableSectionNumber;
+  memset(&fileHeader, 0, sizeof(FileHeader));
+  fileHeader.e_ident[EI_MAG0] = V1(ELFMAG0);
+  fileHeader.e_ident[EI_MAG1] = V1(ELFMAG1);
+  fileHeader.e_ident[EI_MAG2] = V1(ELFMAG2);
+  fileHeader.e_ident[EI_MAG3] = V1(ELFMAG3);
+  fileHeader.e_ident[EI_CLASS] = V1(Class);
+  fileHeader.e_ident[EI_DATA] = V1(encoding);
+  fileHeader.e_ident[EI_VERSION] = V1(EV_CURRENT);
+  fileHeader.e_ident[EI_OSABI] = V1(OSABI);
+  fileHeader.e_ident[EI_ABIVERSION] = V1(0);
+  fileHeader.e_type = V2(ET_REL);
+  fileHeader.e_machine = V2(machine);
+  fileHeader.e_version = V4(EV_CURRENT);
+  fileHeader.e_entry = VW(0);
+  fileHeader.e_phoff = VW(0);
+  fileHeader.e_shoff = VW(sizeof(FileHeader));
+  fileHeader.e_flags = V4(machine == EM_ARM ? 0x04000000 : 0);
+  fileHeader.e_ehsize = V2(sizeof(FileHeader));
+  fileHeader.e_phentsize = V2(0);
+  fileHeader.e_phnum = V2(0);
+  fileHeader.e_shentsize = V2(sizeof(SectionHeader));
+  fileHeader.e_shnum = V2(sectionCount);
+  fileHeader.e_shstrndx = V2(sectionStringTableSectionNumber);
 
   SectionHeader nullSection;
   memset(&nullSection, 0, sizeof(SectionHeader));
 
   SectionHeader bodySection;
-  bodySection.sh_name = sectionNameOffset;
-  bodySection.sh_type = SHT_PROGBITS;
-  bodySection.sh_flags = sectionFlags;
-  bodySection.sh_addr = 0;
-  bodySection.sh_offset = sizeof(FileHeader)
-    + (sizeof(SectionHeader) * sectionCount);
-  bodySection.sh_size = size;
-  bodySection.sh_link = 0;
-  bodySection.sh_info = 0;
-  bodySection.sh_addralign = alignment;
-  bodySection.sh_entsize = 0;
+  bodySection.sh_name = V4(sectionNameOffset);
+  bodySection.sh_type = V4(SHT_PROGBITS);
+  bodySection.sh_flags = VW(sectionFlags);
+  bodySection.sh_addr = VW(0);
+  unsigned bodySectionOffset
+    = sizeof(FileHeader) + (sizeof(SectionHeader) * sectionCount);
+  bodySection.sh_offset = VW(bodySectionOffset);
+  unsigned bodySectionSize = size;
+  bodySection.sh_size = VW(bodySectionSize);
+  bodySection.sh_link = V4(0);
+  bodySection.sh_info = V4(0);
+  bodySection.sh_addralign = VW(alignment);
+  bodySection.sh_entsize = VW(0);
 
   SectionHeader sectionStringTableSection;
-  sectionStringTableSection.sh_name = sectionStringTableNameOffset;
-  sectionStringTableSection.sh_type = SHT_STRTAB;
-  sectionStringTableSection.sh_flags = 0;
-  sectionStringTableSection.sh_addr = 0;
-  sectionStringTableSection.sh_offset
-    = bodySection.sh_offset + bodySection.sh_size;
-  sectionStringTableSection.sh_size = sectionStringTableLength;
-  sectionStringTableSection.sh_link = 0;
-  sectionStringTableSection.sh_info = 0;
-  sectionStringTableSection.sh_addralign = 1;
-  sectionStringTableSection.sh_entsize = 0;
+  sectionStringTableSection.sh_name = V4(sectionStringTableNameOffset);
+  sectionStringTableSection.sh_type = V4(SHT_STRTAB);
+  sectionStringTableSection.sh_flags = VW(0);
+  sectionStringTableSection.sh_addr = VW(0);
+  unsigned sectionStringTableSectionOffset
+    = bodySectionOffset + bodySectionSize;
+  sectionStringTableSection.sh_offset = VW(sectionStringTableSectionOffset);
+  unsigned sectionStringTableSectionSize = sectionStringTableLength;
+  sectionStringTableSection.sh_size = VW(sectionStringTableSectionSize);
+  sectionStringTableSection.sh_link = V4(0);
+  sectionStringTableSection.sh_info = V4(0);
+  sectionStringTableSection.sh_addralign = VW(1);
+  sectionStringTableSection.sh_entsize = VW(0);
 
   SectionHeader stringTableSection;
-  stringTableSection.sh_name = stringTableNameOffset;
-  stringTableSection.sh_type = SHT_STRTAB;
-  stringTableSection.sh_flags = 0;
-  stringTableSection.sh_addr = 0;
-  stringTableSection.sh_offset = sectionStringTableSection.sh_offset
-    + sectionStringTableSection.sh_size;
-  stringTableSection.sh_size = stringTableLength;
-  stringTableSection.sh_link = 0;
-  stringTableSection.sh_info = 0;
-  stringTableSection.sh_addralign = 1;
-  stringTableSection.sh_entsize = 0;
+  stringTableSection.sh_name = V4(stringTableNameOffset);
+  stringTableSection.sh_type = V4(SHT_STRTAB);
+  stringTableSection.sh_flags = VW(0);
+  stringTableSection.sh_addr = VW(0);
+  unsigned stringTableSectionOffset
+    = sectionStringTableSectionOffset + sectionStringTableSectionSize;
+  stringTableSection.sh_offset = VW(stringTableSectionOffset);
+  unsigned stringTableSectionSize = stringTableLength;
+  stringTableSection.sh_size = VW(stringTableSectionSize);
+  stringTableSection.sh_link = V4(0);
+  stringTableSection.sh_info = V4(0);
+  stringTableSection.sh_addralign = VW(1);
+  stringTableSection.sh_entsize = VW(0);
 
   SectionHeader symbolTableSection;
-  symbolTableSection.sh_name = symbolTableNameOffset;
-  symbolTableSection.sh_type = SHT_SYMTAB;
-  symbolTableSection.sh_flags = 0;
-  symbolTableSection.sh_addr = 0;
-  symbolTableSection.sh_offset = stringTableSection.sh_offset
-    + stringTableSection.sh_size;
-  symbolTableSection.sh_size = sizeof(Symbol) * symbolCount;
-  symbolTableSection.sh_link = stringTableSectionNumber;
-  symbolTableSection.sh_info = 0;
-  symbolTableSection.sh_addralign = BITS_PER_WORD / 8;
-  symbolTableSection.sh_entsize = sizeof(Symbol);
+  symbolTableSection.sh_name = V4(symbolTableNameOffset);
+  symbolTableSection.sh_type = V4(SHT_SYMTAB);
+  symbolTableSection.sh_flags = VW(0);
+  symbolTableSection.sh_addr = VW(0);
+  unsigned symbolTableSectionOffset
+    = stringTableSectionOffset + stringTableSectionSize;
+  symbolTableSection.sh_offset = VW(symbolTableSectionOffset);
+  unsigned symbolTableSectionSize = sizeof(Symbol) * symbolCount;
+  symbolTableSection.sh_size = VW(symbolTableSectionSize);
+  symbolTableSection.sh_link = V4(stringTableSectionNumber);
+  symbolTableSection.sh_info = V4(0);
+  symbolTableSection.sh_addralign = VW(BITS_PER_WORD / 8);
+  symbolTableSection.sh_entsize = VW(sizeof(Symbol));
 
   Symbol startSymbol;
-  startSymbol.st_name = startNameOffset;
-  startSymbol.st_value = 0;
-  startSymbol.st_size = 0;
-  startSymbol.st_info = SYMBOL_INFO(STB_GLOBAL, STT_NOTYPE);
-  startSymbol.st_other = STV_DEFAULT;
-  startSymbol.st_shndx = bodySectionNumber;
+  startSymbol.st_name = V4(startNameOffset);
+  startSymbol.st_value = VW(0);
+  startSymbol.st_size = VW(0);
+  startSymbol.st_info = V1(SYMBOL_INFO(STB_GLOBAL, STT_NOTYPE));
+  startSymbol.st_other = V1(STV_DEFAULT);
+  startSymbol.st_shndx = V2(bodySectionNumber);
 
   Symbol endSymbol;
-  endSymbol.st_name = endNameOffset;
-  endSymbol.st_value = size;
-  endSymbol.st_size = 0;
-  endSymbol.st_info = SYMBOL_INFO(STB_GLOBAL, STT_NOTYPE);
-  endSymbol.st_other = STV_DEFAULT;
-  endSymbol.st_shndx = bodySectionNumber;
+  endSymbol.st_name = V4(endNameOffset);
+  endSymbol.st_value = VW(size);
+  endSymbol.st_size = VW(0);
+  endSymbol.st_info = V1(SYMBOL_INFO(STB_GLOBAL, STT_NOTYPE));
+  endSymbol.st_other = V1(STV_DEFAULT);
+  endSymbol.st_shndx = V2(bodySectionNumber);
 
   fwrite(&fileHeader, 1, sizeof(fileHeader), out);
   fwrite(&nullSection, 1, sizeof(nullSection), out);
@@ -349,12 +361,19 @@ MAKE_NAME(writeElf, BITS_PER_WORD, Object)
    bool writable, bool executable)
 {
   int machine;
+  int encoding;
   if (strcmp(architecture, "x86_64") == 0) {
     machine = EM_X86_64;
+    encoding = ELFDATA2LSB;
   } else if (strcmp(architecture, "i386") == 0) {
     machine = EM_386;
+    encoding = ELFDATA2LSB;
   } else if (strcmp(architecture, "arm") == 0) {
     machine = EM_ARM;
+    encoding = ELFDATA2LSB;
+  } else if (strcmp(architecture, "powerpc") == 0) {
+    machine = EM_PPC;
+    encoding = ELFDATA2MSB;
   } else {
     fprintf(stderr, "unsupported architecture: %s\n", architecture);
     return false;
@@ -376,7 +395,7 @@ MAKE_NAME(writeElf, BITS_PER_WORD, Object)
   }
 
   writeObject(data, size, out, startName, endName, sectionName, sectionFlags,
-              alignment, machine);
+              alignment, machine, encoding);
 
   return true;
 }
diff --git a/src/binaryToObject/endianness.h b/src/binaryToObject/endianness.h
new file mode 100644
index 0000000000..46cfcd8b46
--- /dev/null
+++ b/src/binaryToObject/endianness.h
@@ -0,0 +1,38 @@
+#ifndef ENDIANNESS_H
+#define ENDIANNESS_H
+
+#define V1(v) (v)
+
+#ifdef OPPOSITE_ENDIAN
+#  define V2(v) \
+  ((((v) >> 8) & 0xFF) | \
+   (((v) << 8)))
+#  define V4(v) \
+  ((((v) >> 24) & 0x000000FF) | \
+   (((v) >>  8) & 0x0000FF00) | \
+   (((v) <<  8) & 0x00FF0000) | \
+   (((v) << 24)))
+#  define V8(v) \
+  (((static_cast<uint64_t>(v) >> 56) & UINT64_C(0x00000000000000FF)) | \
+   ((static_cast<uint64_t>(v) >> 40) & UINT64_C(0x000000000000FF00)) | \
+   ((static_cast<uint64_t>(v) >> 24) & UINT64_C(0x0000000000FF0000)) | \
+   ((static_cast<uint64_t>(v) >>  8) & UINT64_C(0x00000000FF000000)) | \
+   ((static_cast<uint64_t>(v) <<  8) & UINT64_C(0x000000FF00000000)) | \
+   ((static_cast<uint64_t>(v) << 24) & UINT64_C(0x0000FF0000000000)) | \
+   ((static_cast<uint64_t>(v) << 40) & UINT64_C(0x00FF000000000000)) | \
+   ((static_cast<uint64_t>(v) << 56)))
+#else
+#  define V2(v) (v)
+#  define V4(v) (v)
+#  define V8(v) (v)
+#endif
+
+#if (BITS_PER_WORD == 64)
+#  define VW(v) V8(v)
+#elif (BITS_PER_WORD == 32)
+#  define VW(v) V4(v)
+#else
+#  error
+#endif
+
+#endif//ENDIANNESS_H
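
A quick host-side sanity check for these macros (hypothetical test program, not part of the patch; compile as C++ with -Isrc/binaryToObject):

    #define __STDC_CONSTANT_MACROS
    #include <stdint.h>
    #include <assert.h>

    #define OPPOSITE_ENDIAN
    #define BITS_PER_WORD 32
    #include "endianness.h"

    int main() {
      // V2 leaves the shifted-up byte unmasked (0x123412 here), which is
      // harmless only because the result lands in a 16-bit header field;
      // truncate explicitly to compare:
      assert(static_cast<uint16_t>(V2(0x1234u)) == 0x3412);
      assert(V4(0x11223344u) == 0x44332211u);
      assert(V8(UINT64_C(0x1122334455667788)) == UINT64_C(0x8877665544332211));
      return 0;
    }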
diff --git a/src/binaryToObject/mach-o.cpp b/src/binaryToObject/mach-o.cpp
index d832703200..742e3870d1 100644
--- a/src/binaryToObject/mach-o.cpp
+++ b/src/binaryToObject/mach-o.cpp
@@ -12,31 +12,7 @@
 #include "stdio.h"
 #include "string.h"
 
-#define V1(v) v
-
-#ifdef OPPOSITE_ENDIAN
-#  define V2(v) \
-  ((((v) >> 8) & 0xFF) | \
-   (((v) << 8)))
-#  define V4(v) \
-  ((((v) >> 24) & 0x000000FF) | \
-   (((v) >>  8) & 0x0000FF00) | \
-   (((v) <<  8) & 0x00FF0000) | \
-   (((v) << 24)))
-#  define V8(v) \
-  (((static_cast<uint64_t>(v) >> 56) & UINT64_C(0x00000000000000FF)) | \
-   ((static_cast<uint64_t>(v) >> 40) & UINT64_C(0x000000000000FF00)) | \
-   ((static_cast<uint64_t>(v) >> 24) & UINT64_C(0x0000000000FF0000)) | \
-   ((static_cast<uint64_t>(v) >>  8) & UINT64_C(0x00000000FF000000)) | \
-   ((static_cast<uint64_t>(v) <<  8) & UINT64_C(0x000000FF00000000)) | \
-   ((static_cast<uint64_t>(v) << 24) & UINT64_C(0x0000FF0000000000)) | \
-   ((static_cast<uint64_t>(v) << 40) & UINT64_C(0x00FF000000000000)) | \
-   ((static_cast<uint64_t>(v) << 56)))
-#else
-#  define V2(v) v
-#  define V4(v) v
-#  define V8(v) v
-#endif
+#include "endianness.h"
 
 #define MH_MAGIC_64 0xfeedfacf
 #define MH_MAGIC 0xfeedface
@@ -63,7 +39,6 @@
 #define CPU_SUBTYPE_POWERPC_ALL 0
 
 #if (BITS_PER_WORD == 64)
-#  define VW(v) V8(v)
 #  define Magic MH_MAGIC_64
 #  define Segment LC_SEGMENT_64
 #  define FileHeader mach_header_64
@@ -71,7 +46,6 @@
 #  define Section section_64
 #  define NList struct nlist_64
 #elif (BITS_PER_WORD == 32)
-#  define VW(v) V4(v)
 #  define Magic MH_MAGIC
 #  define Segment LC_SEGMENT
 #  define FileHeader mach_header
diff --git a/src/binaryToObject/main.cpp b/src/binaryToObject/main.cpp
index c95f193d3d..41f2951645 100644
--- a/src/binaryToObject/main.cpp
+++ b/src/binaryToObject/main.cpp
@@ -73,8 +73,10 @@ writeObject(uint8_t* data, unsigned size, FILE* out, const char* startName,
       success = writeElf64Object
         (data, size, out, startName, endName, architecture, alignment,
          writable, executable);
-    } else if (strcmp("i386", architecture) == 0 ||
-               strcmp("arm", architecture) == 0) {
+    } else if (strcmp("i386", architecture) == 0
+               or strcmp("arm", architecture) == 0
+               or strcmp("powerpc", architecture) == 0)
+    {
       found = true;
       success = writeElf32Object
         (data, size, out, startName, endName, architecture, alignment,
diff --git a/src/bootimage.cpp b/src/bootimage.cpp
index 13a278ea8b..e7f0d9510c 100644
--- a/src/bootimage.cpp
+++ b/src/bootimage.cpp
@@ -23,7 +23,7 @@ using namespace vm;
 
 namespace {
 
-const unsigned HeapCapacity = 768 * 1024 * 1024;
+const unsigned HeapCapacity = 128 * 1024 * 1024;
 
 // Notes on immutable references in the heap image:
 //
@@ -560,9 +560,15 @@ main(int ac, const char** av)
   Finder* f = makeFinder(s, h, av[1], 0);
   Processor* p = makeProcessor(s, h, false);
 
-  BootImage image;
-  const unsigned CodeCapacity = 128 * 1024 * 1024;
+  // todo: currently, the compiler cannot compile code with jumps or
+  // calls spanning more than the maximum size of an immediate value
+  // in a branch instruction for the target architecture (~32MB on
+  // PowerPC and ARM).  When that limitation is removed, we'll be able
+  // to specify a capacity as large as we like here:
+  const unsigned CodeCapacity = 30 * 1024 * 1024;
+
   uint8_t* code = static_cast<uint8_t*>(h->allocate(CodeCapacity));
+  BootImage image;
   p->initialize(&image, code, CodeCapacity);
 
   Machine* m = new (h->allocate(sizeof(Machine))) Machine
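
The 30MB figure follows from the branch encodings: both the PowerPC I-form branch and the ARM B/BL instruction carry a 24-bit signed immediate that is shifted left by 2 bits, so a direct branch reaches +/-2^25 bytes = 33,554,432 bytes = 32MB. Keeping CodeCapacity at 30 * 1024 * 1024 keeps the whole AOT-compiled region reachable by a single direct branch, with a little margin.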
diff --git a/src/classpath-avian.cpp b/src/classpath-avian.cpp
index 95b2a32703..9efba455e4 100644
--- a/src/classpath-avian.cpp
+++ b/src/classpath-avian.cpp
@@ -312,8 +312,11 @@ Avian_java_lang_reflect_Method_invoke
       }
     });
 
+  unsigned returnCode = methodReturnCode(t, method);
+
   return reinterpret_cast<int64_t>
-    (t->m->processor->invokeArray(t, method, instance, args));
+    (translateInvokeResult
+     (t, returnCode, t->m->processor->invokeArray(t, method, instance, args)));
 }
 
 extern "C" JNIEXPORT int64_t JNICALL
diff --git a/src/classpath-common.h b/src/classpath-common.h
index 6efc0a1424..6fdcdf3742 100644
--- a/src/classpath-common.h
+++ b/src/classpath-common.h
@@ -286,6 +286,39 @@ makeStackTraceElement(Thread* t, object e)
   return makeStackTraceElement(t, class_, method, file, line);
 }
 
+object
+translateInvokeResult(Thread* t, unsigned returnCode, object o)
+{
+  switch (returnCode) {
+  case ByteField:
+    return makeByte(t, intValue(t, o));
+
+  case BooleanField:
+    return makeBoolean(t, intValue(t, o) != 0);
+
+  case CharField:
+    return makeChar(t, intValue(t, o));
+
+  case ShortField:
+    return makeShort(t, intValue(t, o));
+
+  case FloatField:
+    return makeFloat(t, intValue(t, o));
+
+  case IntField:
+  case LongField:
+  case ObjectField:
+  case VoidField:
+    return o;
+
+  case DoubleField:
+    return makeDouble(t, longValue(t, o));
+
+  default:
+    abort(t);
+  }
+}
+
 } // namespace vm
 
 #endif//CLASSPATH_COMMON_H
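
Judging by the conversions above, invokeArray/invoke return sub-int and float results in an Integer box (and doubles in a Long box), so reflective calls to methods declared to return byte, boolean, char, short or float would otherwise surface as the wrong wrapper type. A condensed sketch of the pattern now used at both call sites (the local names are illustrative; the functions are from this patch):

    // raw box, e.g. an Integer whose low bits hold a char or raw float bits
    object raw = t->m->processor->invokeArray(t, method, instance, args);
    // re-boxed to match the declared return type via makeChar, makeFloat, etc.
    object result = translateInvokeResult(t, methodReturnCode(t, method), raw);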
diff --git a/src/classpath-openjdk.cpp b/src/classpath-openjdk.cpp
index ab7ba61179..61bbc06dbf 100644
--- a/src/classpath-openjdk.cpp
+++ b/src/classpath-openjdk.cpp
@@ -1395,6 +1395,22 @@ pipeAvailable(int fd, int* available)
 #endif
 }
 
+object
+fieldForOffset(Thread* t, object o, unsigned offset)
+{
+  object table = classFieldTable(t, objectClass(t, o));
+  for (unsigned i = 0; i < objectArrayLength(t, table); ++i) {
+    object field = objectArrayBody(t, table, i);
+    if ((fieldFlags(t, field) & ACC_STATIC) == 0
+        and fieldOffset(t, field) == offset)
+    {
+      return field;
+    }
+  }
+  
+  abort(t);
+}
+
 } // namespace local
 
 } // namespace
@@ -1569,6 +1585,32 @@ Avian_sun_misc_Unsafe_getIntVolatile
   return result;
 }
 
+extern "C" JNIEXPORT int64_t JNICALL
+Avian_sun_misc_Unsafe_getLongVolatile
+(Thread* t, object, uintptr_t* arguments)
+{
+  object o = reinterpret_cast<object>(arguments[1]);
+  int64_t offset; memcpy(&offset, arguments + 2, 8);
+
+  object field;
+  if (BytesPerWord < 8) {
+    field = local::fieldForOffset(t, o, offset);
+
+    PROTECT(t, field);
+    acquire(t, field);        
+  }
+
+  int64_t result = cast<int64_t>(o, offset);
+
+  if (BytesPerWord < 8) {
+    release(t, field);        
+  } else {
+    loadMemoryBarrier();
+  }
+
+  return result;
+}
+
 extern "C" JNIEXPORT void JNICALL
 Avian_sun_misc_Unsafe_putInt__Ljava_lang_Object_2JI
 (Thread*, object, uintptr_t* arguments)
@@ -1803,7 +1845,7 @@ EXPORT(JVM_IHashCode)(Thread* t, jobject o)
 {
   ENTER(t, Thread::ActiveState);
 
-  return objectHash(t, *o);
+  return o ? objectHash(t, *o) : 0;
 }
 
 uint64_t
@@ -2871,10 +2913,18 @@ extern "C" JNIEXPORT jobjectArray JNICALL
 EXPORT(JVM_GetDeclaredClasses)(Thread*, jclass) { abort(); }
 
 extern "C" JNIEXPORT jclass JNICALL
-EXPORT(JVM_GetDeclaringClass)(Thread*, jclass) { abort(); }
+EXPORT(JVM_GetDeclaringClass)(Thread*, jclass)
+{
+  // todo: implement properly
+  return 0;
+}
 
 extern "C" JNIEXPORT jstring JNICALL
-EXPORT(JVM_GetClassSignature)(Thread*, jclass) { abort(); }
+EXPORT(JVM_GetClassSignature)(Thread*, jclass)
+{
+  // todo: implement properly
+  return 0;
+}
 
 extern "C" JNIEXPORT jbyteArray JNICALL
 EXPORT(JVM_GetClassAnnotations)(Thread* t, jclass c)
@@ -3162,6 +3212,8 @@ jvmInvokeMethod(Thread* t, uintptr_t* arguments)
     instance = 0;
   }
 
+  unsigned returnCode = methodReturnCode(t, vmMethod);
+
   object result;
   if (args) {
     result = t->m->processor->invokeArray
@@ -3170,7 +3222,8 @@ jvmInvokeMethod(Thread* t, uintptr_t* arguments)
     result = t->m->processor->invoke(t, vmMethod, instance ? *instance : 0);
   }
 
-  return reinterpret_cast<uint64_t>(makeLocalReference(t, result));
+  return reinterpret_cast<uint64_t>
+    (makeLocalReference(t, translateInvokeResult(t, returnCode, result)));
 }
 
 extern "C" JNIEXPORT jobject JNICALL
@@ -3724,10 +3777,21 @@ EXPORT(JVM_GetSockName)(jint socket, struct sockaddr* address,
 }
 
 extern "C" JNIEXPORT jint JNICALL
-EXPORT(JVM_GetSockOpt)(jint, int, int, char*, int*) { abort(); }
+EXPORT(JVM_GetSockOpt)(jint socket, int level, int optionName,
+                       char* optionValue, int* optionLength)
+{
+  socklen_t length = *optionLength;
+  int rv = getsockopt(socket, level, optionName, optionValue, &length);
+  *optionLength = length;
+  return rv;
+}
 
 extern "C" JNIEXPORT jint JNICALL
-EXPORT(JVM_SetSockOpt)(jint, int, int, const char*, int) { abort(); }
+EXPORT(JVM_SetSockOpt)(jint socket, int level, int optionName,
+                       const char* optionValue, int optionLength)
+{
+  return setsockopt(socket, level, optionName, optionValue, optionLength);
+}
 
 extern "C" JNIEXPORT struct protoent* JNICALL
 EXPORT(JVM_GetProtoByName)(char*) { abort(); }
@@ -3787,7 +3851,11 @@ extern "C" JNIEXPORT jobject JNICALL
 EXPORT(JVM_InitAgentProperties)(Thread*, jobject) { abort(); }
 
 extern "C" JNIEXPORT jobjectArray JNICALL
-EXPORT(JVM_GetEnclosingMethodInfo)(JNIEnv*, jclass) { abort(); }
+EXPORT(JVM_GetEnclosingMethodInfo)(JNIEnv*, jclass)
+{
+  // todo: implement properly
+  return 0;
+}
 
 extern "C" JNIEXPORT jintArray JNICALL
 EXPORT(JVM_GetThreadStateValues)(JNIEnv*, jint) { abort(); }
diff --git a/src/common.h b/src/common.h
index 13957bedfd..5c7160e54c 100644
--- a/src/common.h
+++ b/src/common.h
@@ -88,7 +88,7 @@ alias(void* p, unsigned offset)
 #    define ARCH_x86_32
 #  elif defined __x86_64__
 #    define ARCH_x86_64
-#  elif defined __POWERPC__
+#  elif (defined __POWERPC__) || (defined __powerpc__)
 #    define ARCH_powerpc
 #  elif defined __arm__
 #    define ARCH_arm
diff --git a/src/compile-arm.S b/src/compile-arm.S
index 8af887fdbb..c96ef2bc29 100644
--- a/src/compile-arm.S
+++ b/src/compile-arm.S
@@ -170,17 +170,6 @@ LOCAL(vmInvoke_exit):
   // restore callee-saved registers
   ldmfd sp!, {r4-r11, lr}
 
-LOCAL(vmInvoke_void):
-  cmp   ip, #VOID_TYPE
-  beq   LOCAL(vmInvoke_return)
-
-LOCAL(vmInvoke_int64):
-  cmp   ip, #INT64_TYPE
-  beq   LOCAL(vmInvoke_return)
-
-LOCAL(vmInvoke_int32):
-  mov   r1, #0
-
 LOCAL(vmInvoke_return):
   bx    lr
 
@@ -197,9 +186,12 @@ GLOBAL(vmJumpAndInvoke):
    ldr  r5,[sp,#0]
    ldr  r6,[sp,#4]
 
-   // allocate new frame, adding room for callee-saved registers
+   // allocate new frame, adding room for callee-saved registers, plus
+   // 4 bytes of padding since the calculation of frameSize assumes 4
+   // bytes have already been allocated to save the return address,
+   // which is not true in this case
    sub  r2,r2,r6
-   sub  r2,r2,#80
+   sub  r2,r2,#84
    
    mov  r8,r0
 
@@ -213,7 +205,7 @@ LOCAL(vmJumpAndInvoke_argumentLoop):
    add  r6,r6,#4
 
 LOCAL(vmJumpAndInvoke_argumentTest):
-   cmp  r6,r4
+   cmp  r6,r3
    ble  LOCAL(vmJumpAndInvoke_argumentLoop)
 
    // the arguments have been copied, so we can set the real stack
diff --git a/src/compile-powerpc.S b/src/compile-powerpc.S
index 7e40816105..a074a5f186 100644
--- a/src/compile-powerpc.S
+++ b/src/compile-powerpc.S
@@ -13,16 +13,21 @@
 .text
 
 #define BYTES_PER_WORD 4
-#define LINKAGE_AREA 6
-#define ARGUMENT_BASE BYTES_PER_WORD * LINKAGE_AREA
-
-#define LOCAL(x) L##x
    
 #ifdef __APPLE__
 #  define GLOBAL(x) _##x
+#  define LOCAL(x) L##x
+#  define LINKAGE_AREA 6
+#  define RETURN_ADDRESS_OFFSET 8
 #else
-#  define GLOBAL(x) x   
+#  define GLOBAL(x) x
+#  define LOCAL(x) .L##x
+#  define LINKAGE_AREA 2
+#  define RETURN_ADDRESS_OFFSET 4
+#  include "powerpc-regs.S"
 #endif
+
+#define ARGUMENT_BASE BYTES_PER_WORD * LINKAGE_AREA
       
 #define THREAD_STACK 2148
 #define THREAD_CONTINUATION 2156
@@ -42,7 +47,7 @@
 GLOBAL(vmInvoke):
    // save return address
    mflr r0
-   stw  r0,8(r1)
+   stw  r0,RETURN_ADDRESS_OFFSET(r1)
 
    // r3: thread
    // r4: function
@@ -53,12 +58,10 @@ GLOBAL(vmInvoke):
 
    // r9: temporary
 
-   // save return type
-   stw  r8,44(r1)
-
-   // allocate stack space, adding room for callee-saved registers
+   // allocate stack space, adding room for callee-saved registers and
+   // return type
    subfic r9,r7,-80
-	 stwux r1,r1,r9
+   stwux r1,r1,r9
    
    // save callee-saved registers
    add  r9,r7,r1
@@ -83,6 +86,9 @@ GLOBAL(vmInvoke):
    stw  r30,68(r9)
    stw  r31,72(r9)
 
+   // save return type
+   stw  r8,76(r9)
+
    // we use r13 to hold the thread pointer, by convention
    mr   r13,r3
    
@@ -151,7 +157,11 @@ LOCAL(vmInvoke_continuationTest):
    
 LOCAL(vmInvoke_getPC):
    mflr r10
+#ifdef __APPLE__
    la   r10,lo16(GLOBAL(vmInvoke_returnAddress)-LOCAL(vmInvoke_getPC))(r10)
+#else
+   lwz  r10,LOCAL(vmInvoke_returnAddress_address)-LOCAL(vmInvoke_getPC)(r10)
+#endif
    stwx r10,r1,r7
 
    lwz  r7,CONTINUATION_FRAME_POINTER_OFFSET(r5)
@@ -213,29 +223,21 @@ LOCAL(vmInvoke_exit):
    lwz  r31,72(r9)
    
    // handle return value based on expected type
-   lwz  r8,44(r1)
+   lwz  r8,76(r9)
    
-LOCAL(vmInvoke_void):
-   cmplwi r8,VOID_TYPE
-   bne    LOCAL(vmInvoke_int64)
-   b      LOCAL(vmInvoke_return)
-
-LOCAL(vmInvoke_int64):
-   cmplwi r8,INT64_TYPE
-   bne    LOCAL(vmInvoke_int32)
-   b      LOCAL(vmInvoke_return)
-   
-LOCAL(vmInvoke_int32):
-   li   r3,0
-
 LOCAL(vmInvoke_return):
    // load return address
-   lwz  r0,8(r1)
+   lwz  r0,RETURN_ADDRESS_OFFSET(r1)
    mtlr r0
 
    // return
    blr
 
+#ifndef __APPLE__
+LOCAL(vmInvoke_returnAddress_address):
+   .long GLOBAL(vmInvoke_returnAddress)
+#endif
+
 .globl GLOBAL(vmJumpAndInvoke)
 GLOBAL(vmJumpAndInvoke):
 #ifdef AVIAN_CONTINUATIONS
@@ -283,7 +285,11 @@ LOCAL(vmJumpAndInvoke_argumentTest):
    
 LOCAL(vmJumpAndInvoke_getPC):
    mflr r10
+#ifdef __APPLE__
    la   r10,lo16(GLOBAL(vmInvoke_returnAddress)-LOCAL(vmJumpAndInvoke_getPC))(r10)
+#else
+   lwz  r10,LOCAL(vmInvoke_returnAddress_address)-LOCAL(vmJumpAndInvoke_getPC)(r10)
+#endif
    mtlr r10
 
    mtctr r4
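
The LINKAGE_AREA / RETURN_ADDRESS_OFFSET split reflects the two 32-bit PowerPC ABIs involved, as encoded by the macro values above:

    Darwin:      24-byte (6-word) linkage area; saved LR at 8(r1)
    Linux/SVR4:   8-byte (2-word) linkage area; back chain at 0(r1), saved LR at 4(r1)

so every load or store of the return address, and ARGUMENT_BASE itself, has to go through these macros rather than hard-coded Darwin offsets.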
diff --git a/src/compile-x86.S b/src/compile-x86.S
index 3f839ce3c5..77d681298d 100644
--- a/src/compile-x86.S
+++ b/src/compile-x86.S
@@ -385,21 +385,7 @@ GLOBAL(vmInvoke_safeStack):
 
    // handle return value based on expected type
    movl   28(%esp),%ecx
-   
-LOCAL(vmInvoke_void):
-   cmpl   $VOID_TYPE,%ecx
-   jne    LOCAL(vmInvoke_int64)
-   jmp    LOCAL(vmInvoke_return)
 
-LOCAL(vmInvoke_int64):
-   cmpl   $INT64_TYPE,%ecx
-   jne    LOCAL(vmInvoke_int32)
-   jmp    LOCAL(vmInvoke_return)
-   
-LOCAL(vmInvoke_int32):
-   movl   $0,%edx
-
-LOCAL(vmInvoke_return):
    popl   %ebp
    ret
 
diff --git a/src/compile.cpp b/src/compile.cpp
index 37374f03c0..c9b00f1d9b 100644
--- a/src/compile.cpp
+++ b/src/compile.cpp
@@ -81,12 +81,18 @@ isVmInvokeUnsafeStack(void* ip)
     < reinterpret_cast<uintptr_t> (voidPointer(vmInvoke_safeStack));
 }
 
+class MyThread;
+
+void*
+getIp(MyThread*);
+
 class MyThread: public Thread {
  public:
   class CallTrace {
    public:
     CallTrace(MyThread* t, object method):
       t(t),
+      ip(getIp(t)),
       stack(t->stack),
       scratch(t->scratch),
       continuation(t->continuation),
@@ -103,10 +109,11 @@ class MyThread: public Thread {
 
       t->scratch = scratch;
 
-      doTransition(t, 0, stack, continuation, next);
+      doTransition(t, ip, stack, continuation, next);
     }
 
     MyThread* t;
+    void* ip;
     void* stack;
     void* scratch;
     object continuation;
@@ -367,7 +374,7 @@ methodForIp(MyThread* t, void* ip)
 
   // we must use a version of the method tree at least as recent as the
   // compiled form of the method containing the specified address (see
-  // compile(MyThread*, Allocator*, BootContext*, object)):
+  // compile(MyThread*, FixedAllocator*, BootContext*, object)):
   loadMemoryBarrier();
 
   return treeQuery(t, root(t, MethodTree), reinterpret_cast<intptr_t>(ip),
@@ -435,6 +442,23 @@ nextFrame(MyThread* t, void** ip, void** sp, object method, object target)
   // fprintf(stderr, "next frame ip %p sp %p\n", *ip, *sp);
 }
 
+void*
+getIp(MyThread* t, void* ip, void* stack)
+{
+  // Here we use the convention that, if the return address is neither
+  // pushed on to the stack automatically as part of the call nor
+  // stored in the caller's frame, it will be saved in MyThread::ip
+  // instead of on the stack.  See the various implementations of
+  // Assembler::saveFrame for details on how this is done.
+  return t->arch->returnAddressOffset() < 0 ? ip : t->arch->frameIp(stack);
+}
+
+void*
+getIp(MyThread* t)
+{
+  return getIp(t, t->ip, t->stack);
+}
+
 class MyStackWalker: public Processor::StackWalker {
  public:
   enum State {
@@ -475,7 +499,7 @@ class MyStackWalker: public Processor::StackWalker {
       trace = t->traceContext->trace;
       continuation = t->traceContext->continuation;
     } else {
-      ip_ = 0;
+      ip_ = getIp(t);
       stack = t->stack;
       trace = t->trace;
       continuation = t->continuation;      
@@ -509,10 +533,6 @@ class MyStackWalker: public Processor::StackWalker {
 //       fprintf(stderr, "state: %d\n", state);
       switch (state) {
       case Start:
-        if (ip_ == 0) {
-          ip_ = t->arch->frameIp(stack);
-        }
-
         if (trace and trace->nativeMethod) {
           method_ = trace->nativeMethod;
           state = NativeMethod;
@@ -542,7 +562,7 @@ class MyStackWalker: public Processor::StackWalker {
         if (trace) {
           continuation = trace->continuation;
           stack = trace->stack;
-          ip_ = t->arch->frameIp(stack);
+          ip_ = trace->ip;
           trace = trace->next;
 
           state = Start;
@@ -1981,15 +2001,11 @@ findUnwindTarget(MyThread* t, void** targetIp, void** targetFrame,
     stack = t->traceContext->stack;
     continuation = t->traceContext->continuation;
   } else {
-    ip = 0;
+    ip = getIp(t);
     stack = t->stack;
     continuation = t->continuation;      
   }
 
-  if (ip == 0) {
-    ip = t->arch->frameIp(stack);
-  }
-
   object target = t->trace->targetMethod;
 
   *targetIp = 0;
@@ -2024,6 +2040,7 @@ findUnwindTarget(MyThread* t, void** targetIp, void** targetFrame,
         target = method;
       }
     } else {
+      expect(t, ip);
       *targetIp = ip;
       *targetFrame = 0;
       *targetStack = static_cast<void**>(stack)
@@ -2068,7 +2085,7 @@ findUnwindTarget(MyThread* t, void** targetIp, void** targetFrame,
 object
 makeCurrentContinuation(MyThread* t, void** targetIp, void** targetStack)
 {
-  void* ip = t->arch->frameIp(t->stack);
+  void* ip = getIp(t);
   void* stack = t->stack;
 
   object context = t->continuation
@@ -2212,7 +2229,7 @@ FixedAllocator*
 codeAllocator(MyThread* t);
 
 void
-compile(MyThread* t, Allocator* allocator, BootContext* bootContext,
+compile(MyThread* t, FixedAllocator* allocator, BootContext* bootContext,
         object method);
 
 int64_t
@@ -5566,7 +5583,8 @@ finish(MyThread* t, Allocator* allocator, Assembler* a, const char* name,
 {
   uint8_t* start = static_cast<uint8_t*>(allocator->allocate(pad(length)));
 
-  a->writeTo(start);
+  a->setDestination(start);
+  a->write();
 
   logCompile(t, start, length, 0, name, 0);
 
@@ -5834,7 +5852,7 @@ makeSimpleFrameMapTable(MyThread* t, Context* context, uint8_t* start,
 }
 
 void
-finish(MyThread* t, Allocator* allocator, Context* context)
+finish(MyThread* t, FixedAllocator* allocator, Context* context)
 {
   Compiler* c = context->compiler;
 
@@ -5868,9 +5886,13 @@ finish(MyThread* t, Allocator* allocator, Context* context)
   // parallelism (the downside being that it may end up being a waste
   // of cycles if another thread compiles the same method in parallel,
   // which might be mitigated by fine-grained, per-method locking):
-  unsigned codeSize = c->compile
-    (context->leaf ? 0 : stackOverflowThunk(t),
-     difference(&(t->stackLimit), t));
+  c->compile(context->leaf ? 0 : stackOverflowThunk(t),
+             difference(&(t->stackLimit), t));
+
+  // we must acquire the class lock here at the latest
+ 
+  unsigned codeSize = c->resolve
+    (allocator->base + allocator->offset + BytesPerWord);
 
   unsigned total = pad(codeSize) + pad(c->poolSize()) + BytesPerWord;
 
@@ -5904,7 +5926,7 @@ finish(MyThread* t, Allocator* allocator, Context* context)
     }
   }
 
-  c->writeTo(start);
+  c->write();
 
   BootContext* bc = context->bootContext;
   if (bc) {
@@ -6207,7 +6229,7 @@ compileMethod(MyThread* t)
     ip = t->tailAddress;
     t->tailAddress = 0;
   } else {
-    ip = t->arch->frameIp(t->stack);
+    ip = getIp(t);
   }
 
   return reinterpret_cast<uintptr_t>(compileMethod2(t, ip));
@@ -6458,7 +6480,7 @@ invokeNative(MyThread* t)
       ip = t->tailAddress;
       t->tailAddress = 0;
     } else {
-      ip = t->arch->frameIp(t->stack);
+      ip = getIp(t);
     }
 
     object node = findCallNode(t, ip);
@@ -6499,7 +6521,7 @@ invokeNative(MyThread* t)
 
   stack += t->arch->frameReturnAddressSize();
 
-  transition(t, t->arch->frameIp(t->stack), stack, t->continuation, t->trace);
+  transition(t, getIp(t), stack, t->continuation, t->trace);
 
   return result;
 }
@@ -6669,7 +6691,7 @@ visitArguments(MyThread* t, Heap::Visitor* v, void* stack, object method)
 void
 visitStack(MyThread* t, Heap::Visitor* v)
 {
-  void* ip = t->arch->frameIp(t->stack);
+  void* ip = getIp(t);
   void* stack = t->stack;
 
   MyThread::CallTrace* trace = t->trace;
@@ -6696,7 +6718,7 @@ visitStack(MyThread* t, Heap::Visitor* v)
       target = method;
     } else if (trace) {
       stack = trace->stack;
-      ip = t->arch->frameIp(stack);
+      ip = trace->ip;
       trace = trace->next;
 
       if (trace) {
@@ -7685,12 +7707,20 @@ class MyProcessor: public Processor {
           // we caught the thread in a thunk or native code, and the
           // saved stack pointer indicates the most recent Java frame
           // on the stack
-          c.ip = t->arch->frameIp(target->stack);
+          c.ip = getIp(target);
           c.stack = target->stack;
         } else if (isThunk(t, ip) or isVirtualThunk(t, ip)) {
           // we caught the thread in a thunk where the stack register
           // indicates the most recent Java frame on the stack
-          c.ip = t->arch->frameIp(stack);
+          
+          // On e.g. x86, the return address will have already been
+          // pushed onto the stack, in which case we use getIp to
+          // retrieve it.  On e.g. PowerPC and ARM, it will be in the
+          // link register.  Note that we can't just check if the link
+          // argument is null here, since we use ecx/rcx as a
+          // pseudo-link register on x86 for the purpose of tail
+          // calls.
+          c.ip = t->arch->hasLinkRegister() ? link : getIp(t, link, stack);
           c.stack = stack;
         } else {
           // we caught the thread in native code, and the most recent
@@ -8398,7 +8428,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
 
   { Assembler* a = defaultContext.context.assembler;
     
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.default_.frameSavedOffset = a->length();
 
@@ -8442,7 +8472,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
     a->apply(Move, BytesPerWord, RegisterOperand, &index,
              BytesPerWord, MemoryOperand, &virtualCallIndex);
     
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.defaultVirtual.frameSavedOffset = a->length();
 
@@ -8464,7 +8494,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
 
   { Assembler* a = nativeContext.context.assembler;
 
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.native.frameSavedOffset = a->length();
 
@@ -8484,7 +8514,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
 
   { Assembler* a = aioobContext.context.assembler;
       
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.aioob.frameSavedOffset = a->length();
 
@@ -8501,7 +8531,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
 
   { Assembler* a = stackOverflowContext.context.assembler;
       
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.stackOverflow.frameSavedOffset = a->length();
 
@@ -8518,7 +8548,7 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
 
   { Assembler* a = tableContext.context.assembler;
   
-    a->saveFrame(difference(&(t->stack), t));
+    a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
 
     p->thunks.table.frameSavedOffset = a->length();
 
@@ -8621,7 +8651,8 @@ compileThunks(MyThread* t, Allocator* allocator, MyProcessor* p)
   uint8_t* start = p->thunks.table.start;
 
 #define THUNK(s)                                                        \
-  tableContext.context.assembler->writeTo(start);                       \
+  tableContext.context.assembler->setDestination(start);                \
+  tableContext.context.assembler->write();                              \
   start += p->thunks.table.length;                                      \
   { void* call;                                                         \
     tableContext.promise.listener->resolve                              \
@@ -8712,7 +8743,8 @@ compileVirtualThunk(MyThread* t, unsigned index, unsigned* size)
 
   uint8_t* start = static_cast<uint8_t*>(codeAllocator(t)->allocate(*size));
 
-  a->writeTo(start);
+  a->setDestination(start);
+  a->write();
 
   logCompile(t, start, *size, 0, "virtualThunk", 0);
 
@@ -8749,7 +8781,7 @@ virtualThunk(MyThread* t, unsigned index)
 }
 
 void
-compile(MyThread* t, Allocator* allocator, BootContext* bootContext,
+compile(MyThread* t, FixedAllocator* allocator, BootContext* bootContext,
         object method)
 {
   PROTECT(t, method);
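
The ip plumbing in this file exists for architectures whose calls leave the return address in a link register instead of pushing it, where it only reaches memory if a prologue spills it. A condensed, comment-annotated restatement of the convention (all names are from this patch):

    // Thunk prologues now spill the frame state including the return address:
    //   a->saveFrame(difference(&(t->stack), t), difference(&(t->ip), t));
    // Stack walks then start from whichever copy of the return address exists:
    void* getIp(MyThread* t, void* ip, void* stack) {
      return t->arch->returnAddressOffset() < 0
        ? ip                        // not stored in the caller's frame: use MyThread::ip
        : t->arch->frameIp(stack);  // stored in the frame: read it from there
    }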
diff --git a/src/compiler.cpp b/src/compiler.cpp
index 18dc98e06f..01522a8ef7 100644
--- a/src/compiler.cpp
+++ b/src/compiler.cpp
@@ -386,6 +386,7 @@ class Context {
     lastEvent(0),
     forkState(0),
     subroutine(0),
+    firstBlock(0),
     logicalIp(-1),
     constantCount(0),
     logicalCodeLength(0),
@@ -432,6 +433,7 @@ class Context {
   Event* lastEvent;
   ForkState* forkState;
   MySubroutine* subroutine;
+  Block* firstBlock;
   int logicalIp;
   unsigned constantCount;
   unsigned logicalCodeLength;
@@ -2171,9 +2173,9 @@ class MemorySite: public Site {
   }
 
   virtual Site* makeNextWord(Context* c, unsigned index) {
-    // todo: endianness?
     return memorySite
-      (c, base, offset + (index == 1 ? BytesPerWord : -BytesPerWord),
+      (c, base, offset + ((index == 1) xor c->arch->bigEndian()
+                          ? BytesPerWord : -BytesPerWord),
        this->index, scale);
   }
 
@@ -2184,12 +2186,11 @@ class MemorySite: public Site {
   }
 
   virtual SiteMask nextWordMask(Context* c, unsigned index) {
-    // todo: endianness?
     int frameIndex;
     if (base == c->arch->stack()) {
       assert(c, this->index == NoRegister);
       frameIndex = static_cast<int>(offsetToFrameIndex(c, offset))
-        + (index == 1 ? 1 : -1);
+        + ((index == 1) xor c->arch->bigEndian() ? 1 : -1);
     } else {
       frameIndex = NoFrameIndex;
     }
@@ -5707,7 +5708,7 @@ block(Context* c, Event* head)
   return new (c->zone->allocate(sizeof(Block))) Block(head);
 }
 
-unsigned
+void
 compile(Context* c, uintptr_t stackOverflowHandler, unsigned stackLimitOffset)
 {
   if (c->logicalCode[c->logicalIp]->lastEvent == 0) {
@@ -5837,19 +5838,7 @@ compile(Context* c, uintptr_t stackOverflowHandler, unsigned stackLimitOffset)
     }
   }
 
-  block = firstBlock;
-  while (block->nextBlock or block->nextInstruction) {
-    Block* next = block->nextBlock
-      ? block->nextBlock
-      : block->nextInstruction->firstEvent->block;
-
-    next->start = block->assemblerBlock->resolve
-      (block->start, next->assemblerBlock);
-
-    block = next;
-  }
-
-  return block->assemblerBlock->resolve(block->start, 0) + a->footerSize();
+  c->firstBlock = firstBlock;
 }
 
 unsigned
@@ -6884,25 +6873,43 @@ class MyCompiler: public Compiler {
     appendBarrier(&c, StoreLoadBarrier);
   }
 
-  virtual unsigned compile(uintptr_t stackOverflowHandler,
-                           unsigned stackLimitOffset)
+  virtual void compile(uintptr_t stackOverflowHandler,
+                       unsigned stackLimitOffset)
   {
-    return c.machineCodeSize = local::compile
-      (&c, stackOverflowHandler, stackLimitOffset);
+    local::compile(&c, stackOverflowHandler, stackLimitOffset);
+  }
+
+  virtual unsigned resolve(uint8_t* dst) {
+    c.machineCode = dst;
+    c.assembler->setDestination(dst);
+
+    Block* block = c.firstBlock;
+    while (block->nextBlock or block->nextInstruction) {
+      Block* next = block->nextBlock
+        ? block->nextBlock
+        : block->nextInstruction->firstEvent->block;
+
+      next->start = block->assemblerBlock->resolve
+        (block->start, next->assemblerBlock);
+
+      block = next;
+    }
+
+    return c.machineCodeSize = block->assemblerBlock->resolve
+      (block->start, 0) + c.assembler->footerSize();
   }
 
   virtual unsigned poolSize() {
     return c.constantCount * BytesPerWord;
   }
 
-  virtual void writeTo(uint8_t* dst) {
-    c.machineCode = dst;
-    c.assembler->writeTo(dst);
+  virtual void write() {
+    c.assembler->write();
 
     int i = 0;
     for (ConstantPoolNode* n = c.firstConstant; n; n = n->next) {
       intptr_t* target = reinterpret_cast<intptr_t*>
-        (dst + pad(c.machineCodeSize) + i);
+        (c.machineCode + pad(c.machineCodeSize) + i);
 
       if (n->promise->resolved()) {
         *target = n->promise->value();
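
The xor with c->arch->bigEndian() in the two methods above encodes where the other half of a two-word value lives. Worked example: a long spilled at stack offset 16 on a 32-bit target occupies offsets 16 and 20 either way, but a little-endian target puts the least significant word at 16 while big-endian PowerPC puts the most significant word there. The offset of a given logical word therefore swaps between the two halves when the endianness flips, which is what (index == 1) xor bigEndian() selects here (the +1/-1 frame-slot adjustment in nextWordMask mirrors the same rule).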
diff --git a/src/compiler.h b/src/compiler.h
index 282941b722..3349c5e713 100644
--- a/src/compiler.h
+++ b/src/compiler.h
@@ -188,10 +188,11 @@ class Compiler {
   virtual void storeStoreBarrier() = 0;
   virtual void storeLoadBarrier() = 0;
 
-  virtual unsigned compile(uintptr_t stackOverflowHandler,
-                           unsigned stackLimitOffset) = 0;
+  virtual void compile(uintptr_t stackOverflowHandler,
+                       unsigned stackLimitOffset) = 0;
+  virtual unsigned resolve(uint8_t* dst) = 0;
   virtual unsigned poolSize() = 0;
-  virtual void writeTo(uint8_t* dst) = 0;
+  virtual void write() = 0;
 
   virtual void dispose() = 0;
 };
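
With writeTo() split into compile()/resolve()/write() at the Compiler level (and setDestination()/write() on Assembler), code generation becomes two-phase: the final load address is supplied before addresses are resolved, rather than after the size is already fixed. A minimal sketch of the sequence, pieced together from finish() in compile.cpp (allocator handling simplified, local names illustrative):

    c->compile(stackOverflowHandler, stackLimitOffset);  // build and schedule blocks
    uint8_t* where = allocatorBase + allocatorOffset + BytesPerWord;
    unsigned codeSize = c->resolve(where);               // fix addresses, learn the size
    unsigned total = pad(codeSize) + pad(c->poolSize()) + BytesPerWord;
    // ... reserve `total` bytes so the code really lands at `where` ...
    c->write();                                          // emit machine code and pool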
diff --git a/src/finder.cpp b/src/finder.cpp
index b0e25ef82a..2199dd5207 100644
--- a/src/finder.cpp
+++ b/src/finder.cpp
@@ -632,59 +632,139 @@ class BuiltinElement: public JarElement {
   const char* libraryName;
 };
 
+void
+add(Element** first, Element** last, Element* e)
+{
+  if (*last) {
+    (*last)->next = e;
+  } else {
+    *first = e;
+  }
+  *last = e;
+}
+
+unsigned
+baseName(const char* name, char fileSeparator)
+{
+  const char* p = name;
+  const char* last = 0;
+  while (*p) {
+    if (*p == fileSeparator) {
+      last = p;
+    }
+    ++p;
+  }
+
+  return last ? (last + 1) - name : 0;
+}
+
+void
+add(System* s, Element** first, Element** last, Allocator* allocator,
+    const char* name, unsigned nameLength, const char* bootLibrary);
+
+void
+addJar(System* s, Element** first, Element** last, Allocator* allocator,
+       const char* name, const char* bootLibrary)
+{
+  if (DebugFind) {
+    fprintf(stderr, "add jar %s\n", name);
+  }
+
+  JarElement* e = new (allocator->allocate(sizeof(JarElement)))
+    JarElement(s, allocator, name);
+
+  add(first, last, e);
+
+  System::Region* region = e->find("META-INF/MANIFEST.MF");
+  if (region) {
+    unsigned start = 0;
+    unsigned length;
+    while (readLine(region->start(), region->length(), &start, &length)) {
+      const unsigned PrefixLength = 12;
+      if (strncmp("Class-Path: ", reinterpret_cast<const char*>
+                  (region->start() + start), PrefixLength) == 0)
+      {
+        for (Tokenizer t(reinterpret_cast<const char*>
+                         (region->start() + start + PrefixLength),
+                         length - PrefixLength, ' ');
+             t.hasMore();)
+        {
+          Tokenizer::Token token(t.next());
+
+          unsigned base = baseName(name, s->fileSeparator());
+
+          RUNTIME_ARRAY(char, n, base + token.length + 1);
+          memcpy(RUNTIME_ARRAY_BODY(n), name, base);
+          memcpy(RUNTIME_ARRAY_BODY(n) + base, token.s, token.length);
+          RUNTIME_ARRAY_BODY(n)[base + token.length] = 0;
+          
+          add(s, first, last, allocator, RUNTIME_ARRAY_BODY(n),
+              base + token.length, bootLibrary);
+        }
+      }
+      start += length;
+    }
+
+    region->dispose();
+  }
+}
+
+void
+add(System* s, Element** first, Element** last, Allocator* allocator,
+    const char* token, unsigned tokenLength, const char* bootLibrary)
+{
+  if (*token == '[' and token[tokenLength - 1] == ']') {
+    char* name = static_cast<char*>(allocator->allocate(tokenLength - 1));
+    memcpy(name, token + 1, tokenLength - 1);
+    name[tokenLength - 2] = 0; 
+
+    if (DebugFind) {
+      fprintf(stderr, "add builtin %s\n", name);
+    }
+  
+    add(first, last, new (allocator->allocate(sizeof(BuiltinElement)))
+        BuiltinElement(s, allocator, name, bootLibrary));
+  } else {
+    char* name = static_cast<char*>(allocator->allocate(tokenLength + 1));
+    memcpy(name, token, tokenLength);
+    name[tokenLength] = 0;
+
+    unsigned length;
+    switch (s->stat(name, &length)) {
+    case System::TypeFile: {
+      addJar(s, first, last, allocator, name, bootLibrary);
+    } break;
+
+    case System::TypeDirectory: {
+      if (DebugFind) {
+        fprintf(stderr, "add directory %s\n", name);
+      }
+
+      add(first, last, new (allocator->allocate(sizeof(DirectoryElement)))
+          DirectoryElement(s, allocator, name));
+    } break;
+
+    default: {
+      if (DebugFind) {
+        fprintf(stderr, "ignore nonexistent %s\n", name);
+      }
+
+      allocator->free(name, strlen(name) + 1);
+    } break;
+    }
+  }
+}
+
 Element*
 parsePath(System* s, Allocator* allocator, const char* path,
           const char* bootLibrary)
 {
   Element* first = 0;
-  Element* prev = 0;
+  Element* last = 0;
   for (Tokenizer t(path, s->pathSeparator()); t.hasMore();) {
     Tokenizer::Token token(t.next());
 
-    Element* e;
-    if (*token.s == '[' and token.s[token.length - 1] == ']') {
-      char* name = static_cast<char*>(allocator->allocate(token.length - 1));
-      memcpy(name, token.s + 1, token.length - 1);
-      name[token.length - 2] = 0; 
-  
-      e = new (allocator->allocate(sizeof(BuiltinElement)))
-        BuiltinElement(s, allocator, name, bootLibrary);
-    } else {
-      char* name = static_cast<char*>(allocator->allocate(token.length + 1));
-      memcpy(name, token.s, token.length);
-      name[token.length] = 0;
-
-      unsigned length;
-      switch (s->stat(name, &length)) {
-      case System::TypeFile: {
-        e = new (allocator->allocate(sizeof(JarElement)))
-          JarElement(s, allocator, name);
-      } break;
-
-      case System::TypeDirectory: {
-        e = new (allocator->allocate(sizeof(DirectoryElement)))
-          DirectoryElement(s, allocator, name);
-      } break;
-
-      default: {
-        allocator->free(name, strlen(name) + 1);
-        e = 0;
-      } break;
-      }
-    }
-
-    if (DebugFind) {
-      fprintf(stderr, "add element %.*s %p\n", token.length, token.s, e);
-    }
-
-    if (e) {
-      if (prev) {
-        prev->next = e;
-      } else {
-        first = e;
-      }
-      prev = e;
-    }
+    add(s, &first, &last, allocator, token.s, token.length, bootLibrary);
   }
 
   return first;
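
A worked example of the new Class-Path handling (the jar names are hypothetical): for a classpath entry lib/app.jar whose META-INF/MANIFEST.MF contains

    Class-Path: util.jar ext/log.jar

addJar() also registers lib/util.jar and lib/ext/log.jar, because baseName("lib/app.jar", '/') returns 4 (the length of "lib/") and each space-separated token is appended to that prefix and fed back through add(), so nested jars, directories and builtin [name] entries are all handled recursively.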
diff --git a/src/finder.h b/src/finder.h
index 203abb49e3..7546fb8b9c 100644
--- a/src/finder.h
+++ b/src/finder.h
@@ -17,6 +17,22 @@
 
 namespace vm {
 
+inline bool
+readLine(const uint8_t* base, unsigned total, unsigned* start,
+         unsigned* length)
+{
+  const uint8_t* p = base + *start;
+  const uint8_t* end = base + total;
+  while (p != end and (*p == '\n' or *p == '\r')) ++ p;
+
+  *start = p - base;
+  while (p != end and not (*p == '\n' or *p == '\r')) ++ p;
+
+  *length = (p - base) - *start;
+
+  return *length != 0;
+}
+
 class Finder {
  public:
   class IteratorImp {
diff --git a/src/heap.cpp b/src/heap.cpp
index 02e0e99a19..c2474cf653 100644
--- a/src/heap.cpp
+++ b/src/heap.cpp
@@ -1746,7 +1746,10 @@ tryAllocate(Context* c, unsigned size)
 void*
 allocate(Context* c, unsigned size)
 {
-  return allocate(c, size, false);
+  void* p = allocate(c, size, false);
+  expect(c->system, p);
+
+  return p;
 }
 
 void
diff --git a/src/jnienv.cpp b/src/jnienv.cpp
index eebbf2ad6d..3fb07db9ad 100644
--- a/src/jnienv.cpp
+++ b/src/jnienv.cpp
@@ -2287,11 +2287,9 @@ append(char** p, const char* value, unsigned length, char tail)
   }
 }
 
-void
-boot(Thread* t)
+uint64_t
+boot(Thread* t, uintptr_t*)
 {
-  enter(t, Thread::ActiveState);
-
   t->javaThread = t->m->classpath->makeThread(t, 0);
 
   setRoot(t, Machine::NullPointerException, makeThrowable
@@ -2313,6 +2311,8 @@ boot(Thread* t)
   t->m->classpath->boot(t);
 
   enter(t, Thread::IdleState);
+
+  return 1;
 }
 
 } // namespace local
@@ -2628,7 +2628,8 @@ JNI_CreateJavaVM(Machine** m, Thread** t, void* args)
 
   *t = p->makeThread(*m, 0, 0);
 
-  local::boot(*t);
+  enter(*t, Thread::ActiveState);
+  enter(*t, Thread::IdleState);
 
-  return 0;
+  return run(*t, local::boot, 0) ? 0 : -1;
 }
diff --git a/src/machine.cpp b/src/machine.cpp
index c174f4aafd..a561006ca7 100644
--- a/src/machine.cpp
+++ b/src/machine.cpp
@@ -2413,20 +2413,23 @@ Thread::exit()
     } else {
       threadPeer(this, javaThread) = 0;
 
+      System::Monitor* myLock = lock;
+      System::Thread* mySystemThread = systemThread;
+
       { ACQUIRE_RAW(this, m->stateLock);
 
         while (flags & SystemFlag) {
           m->stateLock->wait(systemThread, 0);
         }
+
+        atomicOr(&flags, Thread::DisposeFlag);
       
         enter(this, Thread::ZombieState);
       }
 
-      lock->dispose();
-      lock = 0;
+      myLock->dispose();
 
-      systemThread->dispose();
-      systemThread = 0;
+      mySystemThread->dispose();
     }
   }
 }
@@ -2434,12 +2437,14 @@ Thread::exit()
 void
 Thread::dispose()
 {
-  if (lock) {
-    lock->dispose();
-  }
+  if ((flags & Thread::DisposeFlag) == 0) {
+    if (lock) {
+      lock->dispose();
+    }
 
-  if (systemThread) {
-    systemThread->dispose();
+    if (systemThread) {
+      systemThread->dispose();
+    }
   }
 
   m->heap->free(defaultHeap, ThreadHeapSizeInBytes);
diff --git a/src/machine.h b/src/machine.h
index c342b9f695..6c3633000f 100644
--- a/src/machine.h
+++ b/src/machine.h
@@ -1369,6 +1369,7 @@ class Thread {
   static const unsigned StressFlag = 1 << 4;
   static const unsigned ActiveFlag = 1 << 5;
   static const unsigned SystemFlag = 1 << 6;
+  static const unsigned DisposeFlag = 1 << 7;
 
   class Protector {
    public:
diff --git a/src/main.cpp b/src/main.cpp
index 9996e6078d..d04fbe3f34 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -74,22 +74,6 @@ vmNativeCall(void*, void*, unsigned, unsigned)
 
 namespace {
 
-bool
-readLine(const uint8_t* base, unsigned total, unsigned* start,
-         unsigned* length)
-{
-  const uint8_t* p = base + *start;
-  const uint8_t* end = base + total;
-  while (p != end and (*p == '\n' or *p == '\r')) ++ p;
-
-  *start = p - base;
-  while (p != end and not (*p == '\n' or *p == '\r')) ++ p;
-
-  *length = (p - base) - *start;
-
-  return *length != 0;
-}
-
 const char*
 mainClass(const char* jar)
 {
diff --git a/src/posix.cpp b/src/posix.cpp
index b885e422fc..3fcbb218de 100644
--- a/src/posix.cpp
+++ b/src/posix.cpp
@@ -924,10 +924,9 @@ handleSignal(int signal, siginfo_t* info, void* context)
       // supposed to work.
 
       sigset_t set;
-
       sigemptyset(&set);
       sigaddset(&set, signal);
-      sigprocmask(SIG_UNBLOCK, &set, 0);
+      pthread_sigmask(SIG_UNBLOCK, &set, 0);
 
       vmJump(ip, frame, stack, thread, 0, 0);
     }
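
POSIX leaves sigprocmask unspecified in multithreaded processes; pthread_sigmask is the per-thread form, which matters here because the handler is about to vmJump out of itself and must unmask the signal for the current thread before doing so. A stand-alone sketch of unmasking a single signal for the calling thread:

    #include <csignal>
    #include <cstdio>
    #include <pthread.h>   // link with -pthread

    // unblock `signal` for the calling thread only; returns true on success
    static bool unblockForThisThread(int signal) {
      sigset_t set;
      sigemptyset(&set);
      sigaddset(&set, signal);
      return pthread_sigmask(SIG_UNBLOCK, &set, 0) == 0;
    }

    int main() {
      if (unblockForThisThread(SIGSEGV)) {
        printf("SIGSEGV unblocked for this thread\n");
      }
      return 0;
    }
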
diff --git a/src/powerpc-regs.S b/src/powerpc-regs.S
new file mode 100644
index 0000000000..da5940f403
--- /dev/null
+++ b/src/powerpc-regs.S
@@ -0,0 +1,64 @@
+#define r0 0
+#define r1 1
+#define r2 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+#define r13 13
+#define r14 14
+#define r15 15
+#define r16 16
+#define r17 17
+#define r18 18
+#define r19 19
+#define r20 20
+#define r21 21
+#define r22 22
+#define r23 23
+#define r24 24
+#define r25 25
+#define r26 26
+#define r27 27
+#define r28 28
+#define r29 29
+#define r30 30
+#define r31 31
+#define f0 0
+#define f1 1
+#define f2 2
+#define f3 3
+#define f4 4
+#define f5 5
+#define f6 6
+#define f7 7
+#define f8 8
+#define f9 9
+#define f10 10
+#define f11 11
+#define f12 12
+#define f13 13
+#define f14 14
+#define f15 15
+#define f16 16
+#define f17 17
+#define f18 18
+#define f19 19
+#define f20 20
+#define f21 21
+#define f22 22
+#define f23 23
+#define f24 24
+#define f25 25
+#define f26 26
+#define f27 27
+#define f28 28
+#define f29 29
+#define f30 30
+#define f31 31
diff --git a/src/powerpc.S b/src/powerpc.S
index 2b26f7985f..96bf382cfb 100644
--- a/src/powerpc.S
+++ b/src/powerpc.S
@@ -13,22 +13,28 @@
 .text
 
 #define BYTES_PER_WORD 4
-#define LINKAGE_AREA 6
 #define GPR_COUNT 8
-#define MEMORY_BASE BYTES_PER_WORD * (LINKAGE_AREA + GPR_COUNT)
-#define LOCAL(x) L##x
    
 #ifdef __APPLE__
 #  define GLOBAL(x) _##x
+#  define LOCAL(x) L##x
+#  define LINKAGE_AREA 6
+#  define MEMORY_BASE BYTES_PER_WORD * (LINKAGE_AREA + GPR_COUNT)
+#  define RETURN_ADDRESS_OFFSET 8
 #else
-#  define GLOBAL(x) x   
+#  define GLOBAL(x) x
+#  define LOCAL(x) .L##x
+#  define LINKAGE_AREA 2
+#  define MEMORY_BASE BYTES_PER_WORD * LINKAGE_AREA
+#  define RETURN_ADDRESS_OFFSET 4
+#  include "powerpc-regs.S"
 #endif
 
 .globl GLOBAL(vmNativeCall)
 GLOBAL(vmNativeCall):
    // save return address
    mflr r0
-   stw  r0,8(r1)
+   stw  r0,RETURN_ADDRESS_OFFSET(r1)
    
    // r3 aka r13: function
    // r4        : stackTotal
@@ -42,17 +48,26 @@ GLOBAL(vmNativeCall):
    // r16       : temporary
    // r17       : temporary
    // r18       : temporary
-   
-   // save registers used for local variables
-   stw  r13,24(r1)
-   stw  r14,28(r1)
-   stw  r15,32(r1)
-   stw  r16,36(r1)
-   stw  r17,40(r1)
-   stw  r18,44(r1)
 
-   // allocate stack space
-	 stwux	r1,r1,r4
+   // allocate stack space, adding room for callee-saved registers and
+   // scratch space for copying a FP return value into GPRs
+   subfic r10,r4,-48
+	 stwux r1,r1,r10
+
+   // save callee-saved registers used for local variables
+   // compute the base address of the callee-saved register save area
+   add  r10,r4,r1
+
+   // save registers used for local variables
+   stw  r13,0(r10)
+   stw  r14,4(r10)
+   stw  r15,8(r10)
+   stw  r16,12(r10)
+   stw  r17,16(r10)
+   stw  r18,20(r10)
+   stw  r19,24(r10)
+
+   // remember where we saved the local variables
+   mr   r19,r10
 
    // save our argument registers so we can clobber them
    mr   r13,r3
@@ -84,12 +99,14 @@ LOCAL(test):
    lfd  f6,40(r8)
    lfd  f7,48(r8)
    lfd  f8,56(r8)
+#ifdef __APPLE__
    lfd  f9,64(r8)
    lfd  f10,72(r8)
    lfd  f11,80(r8)
    lfd  f12,88(r8)
    lfd  f13,96(r8)
-   
+#endif
+
 LOCAL(gpr):
    // do we need to load the general-purpose registers?
    cmpwi r7,0
@@ -128,25 +145,26 @@ LOCAL(float):
 
 LOCAL(copy):
    // move floating point return value to GPRs via memory
-   stfd f1,8(r1)
-   lwz  r3,8(r1)
-   lwz  r4,12(r1)
+   stfd f1,32(r19)
+   lwz  r3,32(r19)
+   lwz  r4,36(r19)
    b    LOCAL(exit)
 
 LOCAL(exit):
+   // restore callee-saved registers used for local variables
+   lwz  r13,0(r19)
+   lwz  r14,4(r19)
+   lwz  r15,8(r19)
+   lwz  r16,12(r19)
+   lwz  r17,16(r19)
+   lwz  r18,20(r19)
+   lwz  r19,24(r19)
+
    // restore stack pointer
 	 lwz	r1,0(r1)
 
-   // restore registers used for local variables
-   lwz  r13,24(r1)
-   lwz  r14,28(r1)
-   lwz  r15,32(r1)
-   lwz  r16,36(r1)
-   lwz  r17,40(r1)
-   lwz  r18,44(r1)
-
    // load return address
-   lwz  r0,8(r1)
+   lwz  r0,RETURN_ADDRESS_OFFSET(r1)
    mtlr r0
 
    // return
@@ -171,7 +189,7 @@ GLOBAL(vmRun):
    // r5: checkpoint
 
    mflr r0
-   stw  r0,8(r1)
+   stw  r0,RETURN_ADDRESS_OFFSET(r1)
 
    stwu r1,-(MEMORY_BASE+88)(r1)
       
@@ -226,6 +244,6 @@ GLOBAL(vmRun_returnAddress):
    lwz  r31,MEMORY_BASE+72(r1)
 
    lwz	r1,0(r1)
-   lwz  r0,8(r1)
+   lwz  r0,RETURN_ADDRESS_OFFSET(r1)
    mtlr r0
    blr
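
RETURN_ADDRESS_OFFSET here, and the ReturnAddressOffset constant introduced in powerpc.cpp just below, encode an ABI difference: the Darwin PowerPC linkage area is 24 bytes with the saved LR at byte 8, while the SysV/Linux PPC32 frame header is just a back-chain word plus an LR save word at byte 4. The sketch below reads a saved return address out of a fake frame using the same word offsets; the frame contents are invented:

    #include <cstdio>

    // word offset of the saved link register within a caller's frame header:
    // Darwin keeps it at byte 8 (third word of the 24-byte linkage area),
    // SysV/Linux at byte 4 (right after the back-chain word)
    #ifdef __APPLE__
    const unsigned ReturnAddressOffset = 2;
    #else
    const unsigned ReturnAddressOffset = 1;
    #endif

    // fetch the return address a callee saved into this frame's header
    // (same shape as MyArchitecture::frameIp in powerpc.cpp)
    static void* frameIp(void* stack) {
      return stack ? static_cast<void**>(stack)[ReturnAddressOffset] : 0;
    }

    int main() {
      void* fakeFrame[6] = { };   // large enough for either ABI's header
      fakeFrame[ReturnAddressOffset] = reinterpret_cast<void*>(0x100024d8);
      printf("saved return address: %p\n", frameIp(fakeFrame));
      return 0;
    }
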
diff --git a/src/powerpc.cpp b/src/powerpc.cpp
index edcd587132..e7e247dacb 100644
--- a/src/powerpc.cpp
+++ b/src/powerpc.cpp
@@ -162,7 +162,15 @@ carry16(intptr_t v)
   return static_cast<int16_t>(v) < 0 ? 1 : 0;
 }
 
+#ifdef __APPLE__
 const unsigned FrameFooterSize = 6;
+const unsigned ReturnAddressOffset = 2;
+const unsigned AlignArguments = false;
+#else
+const unsigned FrameFooterSize = 2;
+const unsigned ReturnAddressOffset = 1;
+const unsigned AlignArguments = true;
+#endif
 
 const unsigned StackAlignmentInBytes = 16;
 const unsigned StackAlignmentInWords = StackAlignmentInBytes / BytesPerWord;
@@ -170,20 +178,44 @@ const unsigned StackAlignmentInWords = StackAlignmentInBytes / BytesPerWord;
 const int StackRegister = 1;
 const int ThreadRegister = 13;
 
+const bool DebugJumps = false;
+
+class Context;
+class MyBlock;
+class JumpOffset;
+class JumpEvent;
+
+void
+resolve(MyBlock*);
+
+unsigned
+padding(MyBlock*, unsigned);
+
 class MyBlock: public Assembler::Block {
  public:
-  MyBlock(unsigned offset):
-    next(0), offset(offset), start(~0), size(0)
+  MyBlock(Context* context, unsigned offset):
+    context(context), next(0), jumpOffsetHead(0), jumpOffsetTail(0),
+    lastJumpOffsetTail(0), jumpEventHead(0), jumpEventTail(0),
+    lastEventOffset(0), offset(offset), start(~0), size(0)
   { }
 
   virtual unsigned resolve(unsigned start, Assembler::Block* next) {
     this->start = start;
     this->next = static_cast<MyBlock*>(next);
 
-    return start + size;
+    ::resolve(this);
+
+    return start + size + padding(this, size);
   }
 
+  Context* context;
   MyBlock* next;
+  JumpOffset* jumpOffsetHead;
+  JumpOffset* jumpOffsetTail;
+  JumpOffset* lastJumpOffsetTail;
+  JumpEvent* jumpEventHead;
+  JumpEvent* jumpEventTail;
+  unsigned lastEventOffset;
   unsigned offset;
   unsigned start;
   unsigned size;
@@ -191,15 +223,14 @@ class MyBlock: public Assembler::Block {
 
 class Task;
 class ConstantPoolEntry;
-class JumpPromise;
 
 class Context {
  public:
   Context(System* s, Allocator* a, Zone* zone):
     s(s), zone(zone), client(0), code(s, a, 1024), tasks(0), result(0),
-    firstBlock(new (zone->allocate(sizeof(MyBlock))) MyBlock(0)),
-    lastBlock(firstBlock), constantPool(0), jumps(0), constantPoolCount(0),
-    jumpCount(0)
+    firstBlock(new (zone->allocate(sizeof(MyBlock))) MyBlock(this, 0)),
+    lastBlock(firstBlock), jumpOffsetHead(0), jumpOffsetTail(0),
+    constantPool(0), constantPoolCount(0)
   { }
 
   System* s;
@@ -210,10 +241,10 @@ class Context {
   uint8_t* result;
   MyBlock* firstBlock;
   MyBlock* lastBlock;
+  JumpOffset* jumpOffsetHead;
+  JumpOffset* jumpOffsetTail;
   ConstantPoolEntry* constantPool;
-  JumpPromise* jumps;
   unsigned constantPoolCount;
-  unsigned jumpCount;
 };
 
 class Task {
@@ -316,38 +347,6 @@ offset(Context* c)
     Offset(c, c->lastBlock, c->code.length());
 }
 
-class JumpPromise: public Promise {
- public:
-  JumpPromise(Context* c, uintptr_t target):
-    c(c), target(target), next(c->jumps), index(c->jumpCount++)
-  {
-    c->jumps = this;
-  }
-
-  virtual bool resolved() {
-    return c->result != 0;
-  }
-  
-  virtual int64_t value() {
-    assert(c, resolved());
-
-    return reinterpret_cast<intptr_t>
-      (c->result + c->code.length() + (index * BytesPerWord));
-  }
-
-  Context* c;
-  uintptr_t target;
-  JumpPromise* next;
-  unsigned index;
-};
-
-Promise*
-jump(Context* c, uintptr_t target)
-{
-  return new (c->zone->allocate(sizeof(JumpPromise)))
-    JumpPromise(c, target);
-}
-
 bool
 bounded(int right, int left, int32_t v)
 {
@@ -355,13 +354,21 @@ bounded(int right, int left, int32_t v)
 }
 
 void*
-updateOffset(System* s, uint8_t* instruction, bool conditional, int64_t value)
+updateOffset(System* s, uint8_t* instruction, bool conditional, int64_t value,
+             void* jumpAddress)
 {
   int32_t v = reinterpret_cast<uint8_t*>(value) - instruction;
    
   int32_t mask;
   if (conditional) {
-    expect(s, bounded(2, 16, v));
+    if (not bounded(2, 16, v)) {
+      *static_cast<uint32_t*>(jumpAddress) = isa::b(0);
+      updateOffset(s, static_cast<uint8_t*>(jumpAddress), false, value, 0);
+
+      v = static_cast<uint8_t*>(jumpAddress) - instruction;
+
+      expect(s, bounded(2, 16, v));
+    }
     mask = 0xFFFC;
   } else {
     expect(s, bounded(2, 6, v));
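
The new jumpAddress argument gives updateOffset an escape hatch: a conditional branch (bc) only has a signed 16-bit, word-aligned displacement, so when the target is out of range the code writes an unconditional b, whose 26-bit displacement reaches much further, into the reserved jumpAddress slot and points the bc at that instead. A stand-alone sketch of the range check and the fallback, with made-up addresses:

    #include <cstdint>
    #include <cstdio>

    // can a PowerPC conditional branch (bc) reach `displacement` bytes away?
    // bc carries a signed 16-bit, word-aligned displacement: roughly +/-32KB
    static bool conditionalBranchReaches(int32_t displacement) {
      return displacement >= -0x8000 and displacement <= 0x7FFC
        and (displacement & 3) == 0;
    }

    int main() {
      // hypothetical layout: the bc sits at 0x1000, its real target at
      // 0x90000 (too far), and the assembler reserved a veneer slot at 0x1040
      const int32_t bcAddress = 0x1000;
      const int32_t target = 0x90000;
      const int32_t veneer = 0x1040;

      int32_t direct = target - bcAddress;
      if (conditionalBranchReaches(direct)) {
        printf("bc jumps straight to the target\n");
      } else {
        // same idea as the patch: point the bc at a nearby unconditional b
        // (26-bit displacement, +/-32MB), and let that b carry the long hop
        int32_t toVeneer = veneer - bcAddress;
        int32_t fromVeneer = target - veneer;
        printf("bc -> veneer (%+d bytes), b -> target (%+d bytes)\n",
               (int) toVeneer, (int) fromVeneer);
      }
      return 0;
    }
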
@@ -376,20 +383,23 @@ updateOffset(System* s, uint8_t* instruction, bool conditional, int64_t value)
 
 class OffsetListener: public Promise::Listener {
  public:
-  OffsetListener(System* s, uint8_t* instruction, bool conditional):
+  OffsetListener(System* s, uint8_t* instruction, bool conditional,
+                 void* jumpAddress):
     s(s),
     instruction(instruction),
+    jumpAddress(jumpAddress),
     conditional(conditional)
   { }
 
   virtual bool resolve(int64_t value, void** location) {
-    void* p = updateOffset(s, instruction, conditional, value);
+    void* p = updateOffset(s, instruction, conditional, value, jumpAddress);
     if (location) *location = p;
     return false;
   }
 
   System* s;
   uint8_t* instruction;
+  void* jumpAddress;
   bool conditional;
 };
 
@@ -400,6 +410,7 @@ class OffsetTask: public Task {
     Task(next),
     promise(promise),
     instructionOffset(instructionOffset),
+    jumpAddress(0),
     conditional(conditional)
   { }
 
@@ -407,25 +418,181 @@ class OffsetTask: public Task {
     if (promise->resolved()) {
       updateOffset
         (c->s, c->result + instructionOffset->value(), conditional,
-         promise->value());
+         promise->value(), jumpAddress);
     } else {
       new (promise->listen(sizeof(OffsetListener)))
         OffsetListener(c->s, c->result + instructionOffset->value(),
-                       conditional);
+                       conditional, jumpAddress);
     }
   }
 
   Promise* promise;
   Promise* instructionOffset;
+  void* jumpAddress;
   bool conditional;
 };
 
+class JumpOffset {
+ public:
+  JumpOffset(MyBlock* block, OffsetTask* task, unsigned offset):
+    block(block), task(task), next(0), offset(offset)
+  { }
+
+  MyBlock* block;
+  OffsetTask* task;
+  JumpOffset* next;
+  unsigned offset;  
+};
+
+class JumpEvent {
+ public:
+  JumpEvent(JumpOffset* jumpOffsetHead, JumpOffset* jumpOffsetTail,
+            unsigned offset):
+    jumpOffsetHead(jumpOffsetHead), jumpOffsetTail(jumpOffsetTail), next(0),
+    offset(offset)
+  { }
+
+  JumpOffset* jumpOffsetHead;
+  JumpOffset* jumpOffsetTail;
+  JumpEvent* next;
+  unsigned offset;
+};
+
 void
 appendOffsetTask(Context* c, Promise* promise, Promise* instructionOffset,
                  bool conditional)
 {
-  c->tasks = new (c->zone->allocate(sizeof(OffsetTask))) OffsetTask
+  OffsetTask* task = new (c->zone->allocate(sizeof(OffsetTask))) OffsetTask
     (c->tasks, promise, instructionOffset, conditional);
+
+  c->tasks = task;
+
+  if (conditional) {
+    JumpOffset* offset = new (c->zone->allocate(sizeof(JumpOffset))) JumpOffset
+      (c->lastBlock, task, c->code.length() - c->lastBlock->offset);
+
+    if (c->lastBlock->jumpOffsetTail) {
+      c->lastBlock->jumpOffsetTail->next = offset;
+    } else {
+      c->lastBlock->jumpOffsetHead = offset;
+    }
+    c->lastBlock->jumpOffsetTail = offset;
+  }
+}
+
+void
+appendJumpEvent(Context* c, MyBlock* b, unsigned offset, JumpOffset* head,
+                JumpOffset* tail)
+{
+  JumpEvent* e = new (c->zone->allocate(sizeof(JumpEvent))) JumpEvent
+    (head, tail, offset);
+
+  if (b->jumpEventTail) {
+    b->jumpEventTail->next = e;
+  } else {
+    b->jumpEventHead = e;
+  }
+  b->jumpEventTail = e;
+}
+
+unsigned
+padding(MyBlock* b, unsigned offset)
+{
+  unsigned total = 0;
+  for (JumpEvent** e = &(b->jumpEventHead); *e;) {
+    if ((*e)->offset <= offset) {
+      for (JumpOffset** o = &((*e)->jumpOffsetHead); *o;) {
+        if ((*o)->task->promise->resolved()
+            and (*o)->task->instructionOffset->resolved())
+        {
+          int32_t v = reinterpret_cast<uint8_t*>((*o)->task->promise->value())
+            - (b->context->result + (*o)->task->instructionOffset->value());
+
+          if (bounded(2, 16, v)) {
+            // this conditional jump needs no indirection -- a direct
+            // jump will suffice
+            *o = (*o)->next;
+            continue;
+          }
+        }
+
+        total += BytesPerWord;
+        o = &((*o)->next);
+      }
+
+      if ((*e)->jumpOffsetHead == 0) {
+        *e = (*e)->next;
+      } else {
+        if (b->next) {
+          total += BytesPerWord;
+        }
+        e = &((*e)->next);
+      }
+    } else {
+      break;
+    }
+  }
+
+  return total;
+}
+
+void
+resolve(MyBlock* b)
+{
+  Context* c = b->context;
+
+  if (b->jumpOffsetHead) {
+    if (c->jumpOffsetTail) {
+      c->jumpOffsetTail->next = b->jumpOffsetHead;
+    } else {
+      c->jumpOffsetHead = b->jumpOffsetHead;
+    }
+    c->jumpOffsetTail = b->jumpOffsetTail;
+  }
+
+  if (c->jumpOffsetHead) {
+    bool append;
+    if (b->next == 0 or b->next->jumpEventHead) {
+      append = true;
+    } else {
+      int32_t v = (b->start + b->size + b->next->size + BytesPerWord)
+        - (c->jumpOffsetHead->offset + c->jumpOffsetHead->block->start);
+
+      append = not bounded(2, 16, v);
+
+      if (DebugJumps) {
+        fprintf(stderr,
+                "current %p %d %d next %p %d %d\n",
+                b, b->start, b->size, b->next, b->start + b->size,
+                b->next->size);
+        fprintf(stderr,
+                "offset %p %d is of distance %d to next block; append? %d\n",
+                c->jumpOffsetHead, c->jumpOffsetHead->offset, v, append);
+      }
+    }
+
+    if (append) {
+#ifndef NDEBUG
+      int32_t v = (b->start + b->size)
+        - (c->jumpOffsetHead->offset + c->jumpOffsetHead->block->start);
+      
+      expect(c, bounded(2, 16, v));
+#endif // not NDEBUG
+
+      appendJumpEvent(c, b, b->size, c->jumpOffsetHead, c->jumpOffsetTail);
+
+      if (DebugJumps) {
+        for (JumpOffset* o = c->jumpOffsetHead; o; o = o->next) {
+          fprintf(stderr,
+                  "include %p %d in jump event %p at offset %d in block %p\n",
+                  o, o->offset, b->jumpEventTail, b->size, b);
+        }
+      }
+
+      c->jumpOffsetHead = 0;
+      c->jumpOffsetTail = 0;
+    }
+  }
 }
 
 inline unsigned
@@ -483,6 +650,11 @@ inline int newTemp(Context* con) { return con->client->acquireTemporary(); }
 inline void freeTemp(Context* con, int r) { con->client->releaseTemporary(r); }
 inline int64_t getValue(Assembler::Constant* c) { return c->value->value(); }
 
+inline void
+write4(uint8_t* dst, uint32_t v)
+{
+  memcpy(dst, &v, 4);
+}
 
 void shiftLeftR(Context* con, unsigned size, Assembler::Register* a, Assembler::Register* b, Assembler::Register* t)
 {
@@ -1525,7 +1697,7 @@ branchLong(Context* c, TernaryOperation op, Assembler::Operand* al,
   if (next) {
     updateOffset
       (c->s, c->code.data + next, true, reinterpret_cast<intptr_t>
-       (c->code.data + c->code.length()));
+       (c->code.data + c->code.length()), 0);
   }
 }
 
@@ -1767,17 +1939,17 @@ nextFrame(ArchitectureContext* c UNUSED, int32_t* start, unsigned size,
     // check for post-non-tail-call stack adjustment of the form "lwzx
     // r0,0(r1); stwu r0,offset(r1)":
     if (instruction < start + (size / BytesPerWord) - 1
-        and (static_cast<uint32_t>(instruction[1]) >> 16) == 0x9421)
+        and (static_cast<uint32_t>(instruction[1]) >> 16) == 0x9401)
     {
-      offset += static_cast<int16_t>(instruction[1]);
-    } else if ((static_cast<uint32_t>(*instruction) >> 16) == 0x9421) {
-      offset += static_cast<int16_t>(*instruction);
+      offset += static_cast<int16_t>(instruction[1]) / BytesPerWord;
+    } else if ((static_cast<uint32_t>(*instruction) >> 16) == 0x9401) {
+      offset += static_cast<int16_t>(*instruction) / BytesPerWord;
     }
 
     // todo: check for and handle tail calls
   }
 
-  *ip = static_cast<void**>(*stack)[offset + 2];
+  *ip = static_cast<void**>(*stack)[offset + ReturnAddressOffset];
   *stack = static_cast<void**>(*stack) + offset;
 }
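
nextFrame now matches stack adjustments of the form stwu r0,d(r1) — top instruction halfword 0x9401 (primary opcode 37, rS = 0, rA = 1) — rather than stwu r1,d(r1) (0x9421), and converts the byte displacement to words, since offset is counted in words. A small stand-alone decoder for that pattern; the sample instruction word is invented:

    #include <cstdint>
    #include <cstdio>

    // true if `insn` is "stwu r0,d(r1)": primary opcode 37 (stwu), rS = 0,
    // rA = 1, i.e. the form the patch now matches (top halfword 0x9401)
    static bool isStwuR0R1(uint32_t insn) {
      return (insn >> 16) == 0x9401;
    }

    // the stwu displacement, converted from bytes to words as nextFrame does
    static int displacementInWords(uint32_t insn, unsigned bytesPerWord) {
      return static_cast<int16_t>(insn) / static_cast<int>(bytesPerWord);
    }

    int main() {
      const uint32_t insn = 0x9401ffa0;   // stwu r0,-96(r1)
      if (isStwuR0R1(insn)) {
        printf("stack adjusted by %d words\n", displacementInWords(insn, 4));
      }
      return 0;
    }
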
 
@@ -1924,6 +2096,10 @@ class MyArchitecture: public Assembler::Architecture {
     case 0: // r0 has special meaning in addi and other instructions
     case StackRegister:
     case ThreadRegister:
+#ifndef __APPLE__
+      // r2 is reserved for system uses on SYSV
+    case 2:
+#endif
       return true;
 
     default:
@@ -1940,7 +2116,7 @@ class MyArchitecture: public Assembler::Architecture {
   }
 
   virtual bool argumentAlignment() {
-    return false;
+    return AlignArguments;
   }
 
   virtual unsigned argumentRegisterCount() {
@@ -1952,6 +2128,10 @@ class MyArchitecture: public Assembler::Architecture {
 
     return index + 3;
   }
+
+  virtual bool hasLinkRegister() {
+    return true;
+  }
   
   virtual unsigned stackAlignmentInWords() {
     return StackAlignmentInWords;
@@ -1975,7 +2155,7 @@ class MyArchitecture: public Assembler::Architecture {
     case AlignedCall:
     case AlignedJump: {
       updateOffset(c.s, static_cast<uint8_t*>(returnAddress) - 4, false,
-                   reinterpret_cast<intptr_t>(newTarget));
+                   reinterpret_cast<intptr_t>(newTarget), 0);
     } break;
 
     case LongCall:
@@ -2019,7 +2199,7 @@ class MyArchitecture: public Assembler::Architecture {
   }
 
   virtual void* frameIp(void* stack) {
-    return stack ? static_cast<void**>(stack)[2] : 0;
+    return stack ? static_cast<void**>(stack)[ReturnAddressOffset] : 0;
   }
 
   virtual unsigned frameHeaderSize() {
@@ -2035,7 +2215,7 @@ class MyArchitecture: public Assembler::Architecture {
   }
 
   virtual int returnAddressOffset() {
-    return 8 / BytesPerWord;
+    return ReturnAddressOffset;
   }
 
   virtual int framePointerOffset() {
@@ -2248,16 +2428,18 @@ class MyAssembler: public Assembler {
   {
     Register stack(StackRegister);
     Memory stackLimit(ThreadRegister, stackLimitOffsetFromThread);
-    Constant handlerConstant(jump(&c, handler));
+    Constant handlerConstant
+      (new (c.zone->allocate(sizeof(ResolvedPromise)))
+       ResolvedPromise(handler));
     branchRM(&c, JumpIfGreaterOrEqual, BytesPerWord, &stack, &stackLimit,
              &handlerConstant);
   }
 
-  virtual void saveFrame(unsigned stackOffset) {
+  virtual void saveFrame(unsigned stackOffset, unsigned) {
     Register returnAddress(0);
     emit(&c, mflr(returnAddress.low));
 
-    Memory returnAddressDst(StackRegister, 8);
+    Memory returnAddressDst(StackRegister, ReturnAddressOffset * BytesPerWord);
     moveRM(&c, BytesPerWord, &returnAddress, BytesPerWord, &returnAddressDst);
 
     Register stack(StackRegister);
@@ -2310,7 +2492,7 @@ class MyAssembler: public Assembler {
     Register returnAddress(0);
     emit(&c, mflr(returnAddress.low));
 
-    Memory returnAddressDst(StackRegister, 8);
+    Memory returnAddressDst(StackRegister, ReturnAddressOffset * BytesPerWord);
     moveRM(&c, BytesPerWord, &returnAddress, BytesPerWord, &returnAddressDst);
 
     Register stack(StackRegister);
@@ -2333,7 +2515,7 @@ class MyAssembler: public Assembler {
     moveMR(&c, BytesPerWord, &stackSrc, BytesPerWord, &stack);
 
     Register returnAddress(0);
-    Memory returnAddressSrc(StackRegister, 8);
+    Memory returnAddressSrc(StackRegister, ReturnAddressOffset * BytesPerWord);
     moveMR(&c, BytesPerWord, &returnAddressSrc, BytesPerWord, &returnAddress);
     
     emit(&c, mtlr(returnAddress.low));
@@ -2347,7 +2529,8 @@ class MyAssembler: public Assembler {
     if (TailCalls) {
       if (offset) {
         Register tmp(0);
-        Memory returnAddressSrc(StackRegister, 8 + (footprint * BytesPerWord));
+        Memory returnAddressSrc
+          (StackRegister, (ReturnAddressOffset + footprint) * BytesPerWord);
         moveMR(&c, BytesPerWord, &returnAddressSrc, BytesPerWord, &tmp);
     
         emit(&c, mtlr(tmp.low));
@@ -2362,7 +2545,8 @@ class MyAssembler: public Assembler {
           assert(&c, offset > 0);
 
           Register ras(returnAddressSurrogate);
-          Memory dst(StackRegister, 8 + (offset * BytesPerWord));
+          Memory dst
+            (StackRegister, (ReturnAddressOffset + offset) * BytesPerWord);
           moveRM(&c, BytesPerWord, &ras, BytesPerWord, &dst);
         }
 
@@ -2468,22 +2652,62 @@ class MyAssembler: public Assembler {
     }
   }
 
-  virtual void writeTo(uint8_t* dst) {
+  virtual void setDestination(uint8_t* dst) {
     c.result = dst;
+  }
 
+  virtual void write() {
+    uint8_t* dst = c.result;
+    unsigned dstOffset = 0;
     for (MyBlock* b = c.firstBlock; b; b = b->next) {
-      memcpy(dst + b->start, c.code.data + b->offset, b->size);
-    }
+      if (DebugJumps) {
+        fprintf(stderr, "write block %p\n", b);
+      }
 
-    for (JumpPromise* j = c.jumps; j; j = j->next) {
-      uint8_t* instruction
-        = dst + c.code.length() + (c.jumpCount - j->index - 1);
-      int32_t op = ::b(0);
-      memcpy(instruction, &op, BytesPerWord);
-      updateOffset(c.s, instruction, false, j->target);
+      unsigned blockOffset = 0;
+      for (JumpEvent* e = b->jumpEventHead; e; e = e->next) {
+        unsigned size = e->offset - blockOffset;
+        memcpy(dst + dstOffset, c.code.data + b->offset + blockOffset, size);
+        blockOffset = e->offset;
+        dstOffset += size;
+
+        unsigned jumpTableSize = 0;
+        for (JumpOffset* o = e->jumpOffsetHead; o; o = o->next) {
+          if (DebugJumps) {
+            fprintf(stderr, "visit offset %p %d in block %p\n",
+                    o, o->offset, b);
+          }
+
+          uint8_t* address = dst + dstOffset + jumpTableSize;
+
+          if (b->next) {
+            address += BytesPerWord;
+          }
+
+          o->task->jumpAddress = address;
+
+          jumpTableSize += BytesPerWord;
+        }
+
+        assert(&c, jumpTableSize);
+
+        if (b->next) {
+          write4(dst + dstOffset, ::b(jumpTableSize + BytesPerWord));
+        }
+
+        dstOffset += jumpTableSize + BytesPerWord;
+      }
+
+      unsigned size = b->size - blockOffset;
+
+      memcpy(dst + dstOffset,
+             c.code.data + b->offset + blockOffset,
+             size);
+
+      dstOffset += size;
     }
     
-    unsigned index = c.code.length() + (c.jumpCount * BytesPerWord);
+    unsigned index = c.code.length();
     assert(&c, index % BytesPerWord == 0);
     for (ConstantPoolEntry* e = c.constantPool; e; e = e->next) {
       e->address = dst + index;
@@ -2509,7 +2733,7 @@ class MyAssembler: public Assembler {
     b->size = c.code.length() - b->offset;
     if (startNew) {
       c.lastBlock = new (c.zone->allocate(sizeof(MyBlock)))
-        MyBlock(c.code.length());
+        MyBlock(&c, c.code.length());
     } else {
       c.lastBlock = 0;
     }
@@ -2517,7 +2741,37 @@ class MyAssembler: public Assembler {
   }
 
   virtual void endEvent() {
-    // ignore
+    MyBlock* b = c.lastBlock;
+    unsigned thisEventOffset = c.code.length() - b->offset;
+    if (b->jumpOffsetHead) {
+      int32_t v = (thisEventOffset + BytesPerWord)
+        - b->jumpOffsetHead->offset;
+
+      if (v > 0 and not bounded(2, 16, v)) {
+        appendJumpEvent
+          (&c, b, b->lastEventOffset, b->jumpOffsetHead,
+           b->lastJumpOffsetTail);
+
+        if (DebugJumps) {
+          for (JumpOffset* o = b->jumpOffsetHead;
+               o != b->lastJumpOffsetTail->next; o = o->next)
+          {
+            fprintf(stderr,
+                    "in endEvent, include %p %d in jump event %p "
+                    "at offset %d in block %p\n",
+                    o, o->offset, b->jumpEventTail, b->lastEventOffset, b);
+          }
+        }
+
+        b->jumpOffsetHead = b->lastJumpOffsetTail->next;
+        b->lastJumpOffsetTail->next = 0;
+        if (b->jumpOffsetHead == 0) {
+          b->jumpOffsetTail = 0;
+        }
+      }
+    }
+    b->lastEventOffset = thisEventOffset;
+    b->lastJumpOffsetTail = b->jumpOffsetTail;
   }
 
   virtual unsigned length() {
@@ -2525,7 +2779,7 @@ class MyAssembler: public Assembler {
   }
 
   virtual unsigned footerSize() {
-    return (c.jumpCount + c.constantPoolCount) * BytesPerWord;
+    return c.constantPoolCount * BytesPerWord;
   }
 
   virtual void dispose() {
diff --git a/src/powerpc.h b/src/powerpc.h
index b519ac8064..5fe4543cc7 100644
--- a/src/powerpc.h
+++ b/src/powerpc.h
@@ -14,8 +14,6 @@
 #include "types.h"
 #include "common.h"
 
-#define VA_LIST(x) (&(x))
-
 #ifdef __APPLE__
 #  include "mach/mach_types.h"
 #  include "mach/ppc/thread_act.h"
@@ -45,9 +43,17 @@
 #  define LINK_REGISTER(context) \
   THREAD_STATE_LINK(context->uc_mcontext->FIELD(ss))
 
-#else
-#  error "non-Apple PowerPC-based platforms not yet supported"
-#endif
+#define VA_LIST(x) (&(x))
+
+#else // not __APPLE__
+#  define IP_REGISTER(context) (context->uc_mcontext.regs->gpr[32])
+#  define STACK_REGISTER(context) (context->uc_mcontext.regs->gpr[1])
+#  define THREAD_REGISTER(context) (context->uc_mcontext.regs->gpr[13])
+#  define LINK_REGISTER(context) (context->uc_mcontext.regs->gpr[36])
+
+#define VA_LIST(x) (x)
+
+#endif // not __APPLE__
 
 extern "C" uint64_t
 vmNativeCall(void* function, unsigned stackTotal, void* memoryTable,
@@ -150,13 +156,22 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
             unsigned argumentCount, unsigned argumentsSize,
             unsigned returnType)
 {
+#ifdef __APPLE__
+#  define SKIP(var, count) var += count;
+#  define ALIGN(var)
   const unsigned LinkageArea = 24;
+  const unsigned FprCount = 13;
+#else
+#  define SKIP(var, count)
+#  define ALIGN(var) if (var & 1) ++var;
+  const unsigned LinkageArea = 8;
+  const unsigned FprCount = 8;
+#endif
 
   const unsigned GprCount = 8;
   uintptr_t gprTable[GprCount];
   unsigned gprIndex = 0;
 
-  const unsigned FprCount = 13;
   uint64_t fprTable[FprCount];
   unsigned fprIndex = 0;
 
@@ -172,8 +187,8 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
         double d = bitsToFloat(arguments[ai]);
         memcpy(fprTable + fprIndex, &d, 8);
         ++ fprIndex;
-        ++ gprIndex;
-        ++ stackSkip;
+        SKIP(gprIndex, 1);
+        SKIP(stackSkip, 1);
       } else {
         stack[stackIndex++] = arguments[ai];
       }
@@ -184,9 +199,10 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
       if (fprIndex + (8 / BytesPerWord) <= FprCount) {
         memcpy(fprTable + fprIndex, arguments + ai, 8);
         ++ fprIndex;
-        gprIndex += 8 / BytesPerWord;
-        stackSkip += 8 / BytesPerWord;
+        SKIP(gprIndex, 8 / BytesPerWord);
+        SKIP(stackSkip, 8 / BytesPerWord);
       } else {
+        ALIGN(stackIndex);
         memcpy(stack + stackIndex, arguments + ai, 8);
         stackIndex += 8 / BytesPerWord;
       }
@@ -195,10 +211,12 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
 
     case INT64_TYPE: {
       if (gprIndex + (8 / BytesPerWord) <= GprCount) {
+        ALIGN(gprIndex);
         memcpy(gprTable + gprIndex, arguments + ai, 8);
         gprIndex += 8 / BytesPerWord;
-        stackSkip += 8 / BytesPerWord;
+        SKIP(stackSkip, 8 / BytesPerWord);
       } else {
+        ALIGN(stackIndex);
         memcpy(stack + stackIndex, arguments + ai, 8);
         stackIndex += 8 / BytesPerWord;
       }
@@ -208,7 +226,7 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
     default: {
       if (gprIndex < GprCount) {
         gprTable[gprIndex++] = arguments[ai];
-        ++ stackSkip;
+        SKIP(stackSkip, 1);
       } else {
         stack[stackIndex++] = arguments[ai];
       }
@@ -219,8 +237,7 @@ dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
 
   return vmNativeCall
     (function,
-     - ((((1 + stackSkip + stackIndex) * BytesPerWord) + LinkageArea + 15)
-        & -16),
+     (((1 + stackSkip + stackIndex) * BytesPerWord) + LinkageArea + 15) & -16,
      stack, stackIndex * BytesPerWord,
      (gprIndex ? gprTable : 0),
      (fprIndex ? fprTable : 0), returnType);
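
The SKIP and ALIGN macros capture two calling-convention differences in how dynamicCall lays out arguments: on Darwin a floating-point argument also consumes the corresponding GPR and stack slots (SKIP), while on SysV/Linux it does not, but 64-bit integer arguments must begin at an even GPR index and 64-bit values spilled to the stack start on an 8-byte boundary (ALIGN). A toy model of the GPR accounting for a hypothetical call f(int, double, long long) on 32-bit PowerPC:

    #include <cstdio>

    // toy model of the GPR accounting that the SKIP/ALIGN macros switch
    // between in dynamicCall(), for a hypothetical (int, double, long long)
    static unsigned gprSlotsUsed(bool darwin) {
      unsigned gpr = 0;

      gpr += 1;                  // int: one GPR under both conventions

      // double: goes in an FPR either way; Darwin also reserves the two
      // corresponding GPRs (SKIP), SysV does not
      if (darwin) gpr += 2;

      // long long: a GPR pair; SysV requires the pair to start at an even
      // index (r3/r4, r5/r6, ...), so pad first if necessary (ALIGN)
      if (not darwin and (gpr & 1)) gpr += 1;
      gpr += 2;

      return gpr;
    }

    int main() {
      printf("Darwin: %u GPR slots consumed\n", gprSlotsUsed(true));   // 5
      printf("SysV:   %u GPR slots consumed\n", gprSlotsUsed(false));  // 4
      return 0;
    }
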
diff --git a/src/tokenizer.h b/src/tokenizer.h
index 4036e7d197..ac008055ff 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -23,20 +23,27 @@ class Tokenizer {
     unsigned length;
   };
 
-  Tokenizer(const char* s, char delimiter): s(s), delimiter(delimiter) { }
+  Tokenizer(const char* s, char delimiter):
+    s(s), limit(0), delimiter(delimiter)
+  { }
+
+  Tokenizer(const char* s, unsigned length, char delimiter):
+    s(s), limit(s + length), delimiter(delimiter)
+  { }
 
   bool hasMore() {
-    while (*s == delimiter) ++s;
-    return *s != 0;
+    while (s != limit and *s == delimiter) ++s;
+    return s != limit and *s != 0;
   }
 
   Token next() {
     const char* p = s;
-    while (*s and *s != delimiter) ++s;
+    while (s != limit and *s and *s != delimiter) ++s;
     return Token(p, s - p);
   }
 
   const char* s;
+  const char* limit;
   char delimiter;
 };
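
The new constructor lets the Tokenizer walk a buffer that is referenced by pointer and length rather than NUL-terminated, which is what the manifest-reading code needs; the limit check precedes the dereference so the byte at limit is never read. A stand-alone copy of the bounded form plus a usage sketch; the Class-Path line is made up:

    #include <cstdio>

    // same shape as vm::Tokenizer after the patch: an optional limit pointer
    // lets it walk a buffer that is not NUL-terminated
    class Tokenizer {
     public:
      struct Token {
        Token(const char* s, unsigned length): s(s), length(length) { }
        const char* s;
        unsigned length;
      };

      Tokenizer(const char* s, unsigned length, char delimiter):
        s(s), limit(s + length), delimiter(delimiter)
      { }

      bool hasMore() {
        while (s != limit and *s == delimiter) ++s;
        return s != limit and *s != 0;
      }

      Token next() {
        const char* p = s;
        while (s != limit and *s and *s != delimiter) ++s;
        return Token(p, s - p);
      }

      const char* s;
      const char* limit;
      char delimiter;
    };

    int main() {
      // e.g. the value of a manifest Class-Path attribute, referenced by
      // pointer and length into a larger buffer rather than copied out
      const char line[] = "lib/a.jar lib/b.jar lib/c.jar";
      Tokenizer t(line, sizeof(line) - 1, ' ');
      while (t.hasMore()) {
        Tokenizer::Token token = t.next();
        printf("%.*s\n", (int) token.length, token.s);
      }
      return 0;
    }
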
 
diff --git a/src/x86.cpp b/src/x86.cpp
index 97ef3c2bb3..5c250f449a 100644
--- a/src/x86.cpp
+++ b/src/x86.cpp
@@ -2846,6 +2846,10 @@ class MyArchitecture: public Assembler::Architecture {
     }
   }
 
+  virtual bool hasLinkRegister() {
+    return false;
+  }
+
   virtual unsigned stackAlignmentInWords() {
     return StackAlignmentInWords;
   }
@@ -3382,7 +3386,7 @@ class MyAssembler: public Assembler {
              &handlerConstant);
   }
 
-  virtual void saveFrame(unsigned stackOffset) {
+  virtual void saveFrame(unsigned stackOffset, unsigned) {
     Register stack(rsp);
     Memory stackDst(rbx, stackOffset);
     apply(Move, BytesPerWord, RegisterOperand, &stack,
@@ -3612,9 +3616,12 @@ class MyAssembler: public Assembler {
     }
   }
 
-  virtual void writeTo(uint8_t* dst) {
+  virtual void setDestination(uint8_t* dst) {
     c.result = dst;
-    
+  }
+
+  virtual void write() {
+    uint8_t* dst = c.result;
     for (MyBlock* b = c.firstBlock; b; b = b->next) {
       unsigned index = 0;
       unsigned padding = 0;
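
writeTo is split into setDestination and write on both backends so the final output address is known before any bytes are copied; the PowerPC backend benefits from this, since padding() consults c->result to decide which conditional branches still need veneers, which is only meaningful once the destination has been published. A minimal sketch of the two-phase shape with an invented assembler type:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // sketch of the writeTo() -> setDestination()/write() split: publish the
    // final address first, so address-dependent decisions (such as whether a
    // branch needs a veneer) can be made before any bytes are copied
    class ExampleAssembler {
     public:
      void setDestination(uint8_t* dst) { result = dst; }

      void write() {
        // by now `result` is known, so offsets computed here are final
        std::memcpy(result, code.data(), code.size());
      }

      std::vector<uint8_t> code;
      uint8_t* result = nullptr;
    };

    int main() {
      ExampleAssembler a;
      a.code = {0x90, 0x90, 0xc3};   // placeholder bytes
      uint8_t buffer[16];
      a.setDestination(buffer);      // decide layout against real addresses
      a.write();                     // then emit
      return 0;
    }
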
diff --git a/test/Initializers.java b/test/Initializers.java
new file mode 100644
index 0000000000..5b6fe2c2cb
--- /dev/null
+++ b/test/Initializers.java
@@ -0,0 +1,24 @@
+public class Initializers {
+  private static class Static2 {
+    public static String foo = "Static2.foo";
+
+    static {
+      System.gc();
+      new Exception().printStackTrace();
+    }
+  }
+
+  private static class Static1 {
+    public static String foo = "Static1.foo";
+
+    static {
+      System.out.println(Static2.foo);
+    }
+  }
+
+  public static void main(String[] args) {
+    Object x = new Object();
+    System.out.println(Static1.foo);
+    x.toString();
+  }
+}
diff --git a/test/Reflection.java b/test/Reflection.java
index 71a70db815..f71b39e81b 100644
--- a/test/Reflection.java
+++ b/test/Reflection.java
@@ -2,6 +2,42 @@ import java.lang.reflect.Method;
 import java.lang.reflect.Field;
 
 public class Reflection {
+  public static boolean booleanMethod() {
+    return true;
+  }
+
+  public static byte byteMethod() {
+    return 1;
+  }
+
+  public static char charMethod() {
+    return '2';
+  }
+
+  public static short shortMethod() {
+    return 3;
+  }
+
+  public static int intMethod() {
+    return 4;
+  }
+
+  public static float floatMethod() {
+    return 5.0f;
+  }
+
+  public static long longMethod() {
+    return 6;
+  }
+
+  public static double doubleMethod() {
+    return 7.0;
+  }
+
+  public static void expect(boolean v) {
+    if (! v) throw new RuntimeException();
+  }
+
   public static void main(String[] args) throws Exception {
     Class system = Class.forName("java.lang.System");
     Field out = system.getDeclaredField("out");
@@ -9,5 +45,27 @@ public class Reflection {
     Method println = output.getDeclaredMethod("println", String.class);
 
     println.invoke(out.get(null), "Hello, World!");
+
+    expect((Boolean) Reflection.class.getMethod("booleanMethod").invoke(null));
+
+    expect(1 == (Byte) Reflection.class.getMethod("byteMethod").invoke(null));
+
+    expect('2' == (Character) Reflection.class.getMethod
+           ("charMethod").invoke(null));
+
+    expect(3 == (Short) Reflection.class.getMethod
+           ("shortMethod").invoke(null));
+
+    expect(4 == (Integer) Reflection.class.getMethod
+           ("intMethod").invoke(null));
+
+    expect(5.0 == (Float) Reflection.class.getMethod
+           ("floatMethod").invoke(null));
+
+    expect(6 == (Long) Reflection.class.getMethod
+           ("longMethod").invoke(null));
+
+    expect(7.0 == (Double) Reflection.class.getMethod
+           ("doubleMethod").invoke(null));
   }
 }
diff --git a/test/Trace.java b/test/Trace.java
index 79aa3b421d..e07976888b 100644
--- a/test/Trace.java
+++ b/test/Trace.java
@@ -67,10 +67,16 @@ public class Trace implements Runnable {
         if (i % 100 == 0) {
           System.out.print("r");
           System.out.flush();
+          synchronized (this) {
+            notifyAll();
+          }
         }
       }
     } finally {
-      alive = false;
+      synchronized (this) {
+        alive = false;
+        notifyAll();
+      }
     }
   }
 
@@ -88,7 +94,7 @@ public class Trace implements Runnable {
         ++ count;
         
         if (count % 100 == 0) {
-          Thread.yield();
+          trace.wait();
           System.out.print("t");
           System.out.flush();
         }