From c7a1a7af77a94d7ddeeb9e75612793d737fa2014 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 08:44:15 -0600 Subject: [PATCH 01/16] added floating point support, split plan function. --- src/assembler.h | 70 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/src/assembler.h b/src/assembler.h index c915cfe894..78a807ebd6 100644 --- a/src/assembler.h +++ b/src/assembler.h @@ -36,7 +36,8 @@ enum UnaryOperation { JumpIfLessOrEqual, JumpIfGreaterOrEqual, JumpIfEqual, - JumpIfNotEqual + JumpIfNotEqual, + JumpIfUnordered }; const unsigned UnaryOperationCount = JumpIfNotEqual + 1; @@ -45,10 +46,24 @@ enum BinaryOperation { Move, MoveZ, Compare, - Negate + Negate, + + //extensions: + FloatNegate, + FloatCompare, + Float2Float, + Float2Int, + Int2Float, + + //intrinsic functions: + FloatSqrt, + FloatAbs, + Abs, + + NoBinaryOperation = -1 }; -const unsigned BinaryOperationCount = Negate + 1; +const unsigned BinaryOperationCount = Abs + 1; enum TernaryOperation { LongCompare, @@ -62,10 +77,23 @@ enum TernaryOperation { UnsignedShiftRight, And, Or, - Xor + Xor, + + //extensions: + FloatAdd, + FloatSubtract, + FloatMultiply, + FloatDivide, + FloatRemainder, + + //intrinsic functions: + FloatMax, + FloatMin, + + NoTernaryOperation = -1 }; -const unsigned TernaryOperationCount = Xor + 1; +const unsigned TernaryOperationCount = FloatMin + 1; enum OperandType { ConstantOperand, @@ -258,15 +286,19 @@ class Assembler { class Architecture { public: virtual unsigned registerCount() = 0; + virtual unsigned generalRegisterCount() = 0; + virtual unsigned floatRegisterCount() = 0; + virtual uint64_t generalRegisters() = 0; + virtual uint64_t floatRegisters() = 0; virtual int stack() = 0; virtual int thread() = 0; virtual int returnLow() = 0; virtual int returnHigh() = 0; - virtual bool condensedAddressing() = 0; - virtual bool bigEndian() = 0; + + virtual bool supportsFloatCompare(unsigned size) = 0; virtual bool 
reserved(int register_) = 0; @@ -287,24 +319,36 @@ class Assembler { virtual unsigned frameReturnAddressSize() = 0; virtual unsigned frameFooterSize() = 0; virtual void nextFrame(void** stack, void** base) = 0; + + virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method) = 0; + virtual TernaryOperation hasTernaryIntrinsic(Thread* t, object method) = 0; virtual void plan (UnaryOperation op, unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, bool* thunk) = 0; - virtual void plan + virtual void planSource (BinaryOperation op, unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, - unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask, - bool* thunk) = 0; + unsigned bSize, bool* thunk) = 0; + + virtual void planDestination + (BinaryOperation op, + unsigned aSize, const uint8_t* aTypeMask, const uint64_t* aRegisterMask, + unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask) = 0; - virtual void plan + virtual void planSource (TernaryOperation op, unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask, - unsigned cSize, uint8_t* cTypeMask, uint64_t* cRegisterMask, - bool* thunk) = 0; + unsigned cSize, bool* thunk) = 0; + + virtual void planDestination + (TernaryOperation op, + unsigned aSize, const uint8_t* aTypeMask, const uint64_t* aRegisterMask, + unsigned bSize, const uint8_t* bTypeMask, const uint64_t* bRegisterMask, + unsigned cSize, uint8_t* cTypeMask, uint64_t* cRegisterMask) = 0; virtual void acquire() = 0; virtual void release() = 0; From 5cc605b56df43a959669274638e3fd8613d77840 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 08:48:15 -0600 Subject: [PATCH 02/16] added floating point support. 
--- src/compiler.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/compiler.h b/src/compiler.h index db5f5963be..b2a702596d 100644 --- a/src/compiler.h +++ b/src/compiler.h @@ -21,8 +21,9 @@ class Compiler { public: class Client { public: - virtual intptr_t getThunk(UnaryOperation op, unsigned size) = 0; - virtual intptr_t getThunk(TernaryOperation op, unsigned size) = 0; + virtual intptr_t getThunk(UnaryOperation op, unsigned size, unsigned resultSize) = 0; + virtual intptr_t getThunk(BinaryOperation op, unsigned size, unsigned resultSize) = 0; + virtual intptr_t getThunk(TernaryOperation op, unsigned size, unsigned resultSize) = 0; }; static const unsigned Aligned = 1 << 0; @@ -106,18 +107,25 @@ class Compiler { unsigned dstSize) = 0; virtual Operand* lcmp(Operand* a, Operand* b) = 0; virtual void cmp(unsigned size, Operand* a, Operand* b) = 0; + virtual void fcmp(unsigned size, Operand* a, Operand* b) = 0; virtual void jl(Operand* address) = 0; virtual void jg(Operand* address) = 0; virtual void jle(Operand* address) = 0; virtual void jge(Operand* address) = 0; virtual void je(Operand* address) = 0; virtual void jne(Operand* address) = 0; + virtual void juo(Operand* address) = 0; virtual void jmp(Operand* address) = 0; virtual Operand* add(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* sub(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* mul(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* div(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* rem(unsigned size, Operand* a, Operand* b) = 0; + virtual Operand* fadd(unsigned size, Operand* a, Operand* b) = 0; + virtual Operand* fsub(unsigned size, Operand* a, Operand* b) = 0; + virtual Operand* fmul(unsigned size, Operand* a, Operand* b) = 0; + virtual Operand* fdiv(unsigned size, Operand* a, Operand* b) = 0; + virtual Operand* frem(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* shl(unsigned size, Operand* a, 
Operand* b) = 0; virtual Operand* shr(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* ushr(unsigned size, Operand* a, Operand* b) = 0; @@ -125,6 +133,12 @@ class Compiler { virtual Operand* or_(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* xor_(unsigned size, Operand* a, Operand* b) = 0; virtual Operand* neg(unsigned size, Operand* a) = 0; + virtual Operand* fneg(unsigned size, Operand* a) = 0; + virtual Operand* operation(BinaryOperation op, unsigned aSize, unsigned resSize, Operand* a) = 0; + virtual Operand* operation(TernaryOperation op, unsigned aSize, unsigned bSize, unsigned resSize, Operand* a, Operand* b) = 0; + virtual Operand* f2f(unsigned aSize, unsigned resSize, Operand* a) = 0; + virtual Operand* f2i(unsigned aSize, unsigned resSize, Operand* a) = 0; + virtual Operand* i2f(unsigned aSize, unsigned resSize, Operand* a) = 0; virtual void loadBarrier() = 0; virtual void storeStoreBarrier() = 0; From c042354ea0028cf0991bc2512e5f9a4d40b7d6c9 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 08:49:26 -0600 Subject: [PATCH 03/16] added detectFeature function, used to detect sse in x86.cpp --- src/x86.S | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/src/x86.S b/src/x86.S index 09abdb5b94..01d53894fa 100644 --- a/src/x86.S +++ b/src/x86.S @@ -18,6 +18,42 @@ #ifdef __x86_64__ #ifdef __WINDOWS__ +# if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ +.globl _detectFeature +_detectFeature: +# else +.globl detectFeature +detectFeature: +# endif + pushq %rbp + movq %rsp, %rbp + pushq %rdx + pushq %rcx + pushq %rbx + pushq %rsi + pushq %rdi + movl %ecx, %edi + movl %edx, %esi + movl $1, %eax + cpuid + andl %esi, %edx + andl %edi, %ecx + orl %edx, %ecx + test %ecx, %ecx + je LOCAL(NOSSE) + movl $1, %eax + jmp LOCAL(SSEEND) +LOCAL(NOSSE): + movl $0, %eax +LOCAL(SSEEND): + popq %rdi + popq %rsi + popq %rbx + popq %rcx + popq %rdx + movq 
%rbp,%rsp + popq %rbp + ret # if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ .globl _vmNativeCall @@ -141,6 +177,36 @@ _vmJump: jmp *%rcx #elif defined __LINUX__ +# if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ +.globl _detectFeature +_detectFeature: +# else +.globl detectFeature +detectFeature: +# endif + pushq %rbp + movq %rsp, %rbp + pushq %rdx + pushq %rcx + pushq %rbx + movl $1, %eax + cpuid + andl %esi, %edx + andl %edi, %ecx + orl %edx, %ecx + test %ecx, %ecx + je LOCAL(NOSSE) + movl $1, %eax + jmp LOCAL(SSEEND) +LOCAL(NOSSE): + movl $0, %eax +LOCAL(SSEEND): + popq %rbx + popq %rcx + popq %rdx + movq %rbp,%rsp + popq %rbp + ret # if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ .globl _vmNativeCall @@ -252,8 +318,44 @@ vmJump: jmp *%rdi #endif //def __WINDOWS__ - #elif defined __i386__ +# if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ +.globl _detectFeature +_detectFeature: +# else +.globl detectFeature +detectFeature: +# endif + pushl %ebp + movl %esp, %ebp + pushl %edx + pushl %ecx + pushl %ebx + pushl %esi + pushl %edi + movl 12(%ebp), %esi + movl 8(%ebp), %edi + movl $1, %eax + cpuid + andl %esi, %edx + andl %edi, %ecx + orl %edx, %ecx + test %ecx, %ecx + je LOCAL(NOSSE) + movl $1, %eax + jmp LOCAL(SSEEND) +LOCAL(NOSSE): + movl $0, %eax +LOCAL(SSEEND): + popl %edi + popl %esi + popl %ebx + popl %ecx + popl %edx + movl %ebp,%esp + popl %ebp + ret + # if defined __APPLE__ || defined __MINGW32__ || defined __CYGWIN32__ .globl _vmNativeCall _vmNativeCall: From c3a389429e5ad6ba2c519978c653bf2c21fdd98b Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 08:54:23 -0600 Subject: [PATCH 04/16] split source function, update interface for floating point / instrinsic support --- src/powerpc.cpp | 71 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/src/powerpc.cpp b/src/powerpc.cpp index 01f5a3e862..da368961b7 
100644 --- a/src/powerpc.cpp +++ b/src/powerpc.cpp @@ -1679,10 +1679,6 @@ class MyArchitecture: public Assembler::Architecture { return (BytesPerWord == 4 ? 3 : NoRegister); } - virtual bool condensedAddressing() { - return false; - } - virtual bool bigEndian() { return true; } @@ -1770,6 +1766,18 @@ class MyArchitecture: public Assembler::Architecture { *stack = *static_cast(*stack); } + virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method) { + return NoBinaryOperation; + } + + virtual TernaryOperation hasTernaryIntrinsic(Thread* t UNUSED, object method UNUSED) { + return NoTernaryOperation; + } + + virtual bool supportsFloatCompare(unsigned size) { + return false; + } + virtual void plan (UnaryOperation, unsigned, uint8_t* aTypeMask, uint64_t* aRegisterMask, @@ -1780,42 +1788,62 @@ class MyArchitecture: public Assembler::Architecture { *thunk = false; } - virtual void plan + virtual void planSource (BinaryOperation op, unsigned, uint8_t* aTypeMask, uint64_t* aRegisterMask, - unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask, - bool* thunk) + unsigned, bool* thunk) { *aTypeMask = ~0; *aRegisterMask = ~static_cast(0); - *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); - *bRegisterMask = ~static_cast(0); - *thunk = false; switch (op) { case Compare: *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand); - *bTypeMask = (1 << RegisterOperand); break; case Negate: *aTypeMask = (1 << RegisterOperand); + break; + case FloatCompare: + case FloatNegate: + case Float2Float: + case Float2Int: + case Int2Float: + *thunk = true; + break; + default: + break; + } + } + + virtual void planDestination + (BinaryOperation op, + unsigned, const uint8_t* aTypeMask, const uint64_t* aRegisterMask, + unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask) + { + *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + *bRegisterMask = ~static_cast(0); + + switch (op) { + case Compare: *bTypeMask = (1 << RegisterOperand); break; + case Negate: + 
*bTypeMask = (1 << RegisterOperand); + break; default: break; } } - virtual void plan + virtual void planSource (TernaryOperation op, unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask, - unsigned, uint8_t* cTypeMask, uint64_t* cRegisterMask, - bool* thunk) + unsigned, bool* thunk) { *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand); *aRegisterMask = ~static_cast(0); @@ -1851,10 +1879,25 @@ class MyArchitecture: public Assembler::Architecture { } break; + case FloatAdd: + case FloatSubtract: + case FloatMultiply: + case FloatDivide: + case FloatRemainder: + *bTypeMask = ~0; + *thunk = true; + break; default: break; } + } + virtual void planDestination + (TernaryOperation op, + unsigned, const uint8_t*, const uint64_t*, + unsigned, const uint8_t* bTypeMask, const uint64_t* bRegisterMask, + unsigned, uint8_t* cTypeMask, uint64_t* cRegisterMask) + { *cTypeMask = *bTypeMask; *cRegisterMask = *bRegisterMask; } From 7483fa154d4181d9f68850e8011e292258a47852 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:01:57 -0600 Subject: [PATCH 05/16] added floating point support, instrinsics support --- src/compile.cpp | 346 +++++++++++++++++++++++++++++------------------- 1 file changed, 212 insertions(+), 134 deletions(-) diff --git a/src/compile.cpp b/src/compile.cpp index 6113452a7e..3b8637c872 100644 --- a/src/compile.cpp +++ b/src/compile.cpp @@ -27,7 +27,7 @@ vmCall(); namespace { -const bool DebugCompile = false; +const bool DebugCompile = true; const bool DebugNatives = false; const bool DebugCallTable = false; const bool DebugMethodTree = false; @@ -562,20 +562,86 @@ class Context { virtual intptr_t getThunk(UnaryOperation, unsigned) { abort(t); } + + virtual intptr_t getThunk(BinaryOperation op, unsigned size, unsigned resultSize) { + switch(op) { + case FloatNegate: + if (size == 4) { + return ::getThunk(t, negateFloatThunk); + } else { + return ::getThunk(t, negateDoubleThunk); 
+ } + case Float2Float: + if (size == 4 && resultSize == 8) { + return ::getThunk(t, floatToDoubleThunk); + } else if(size == 8 && resultSize == 4) { + return ::getThunk(t, doubleToFloatThunk); + } + case Float2Int: + if (size == 4 && resultSize == 4) { + return ::getThunk(t, floatToIntThunk); + } else if(size == 4 && resultSize == 8) { + return ::getThunk(t, floatToLongThunk); + } else if(size == 8 && resultSize == 4) { + return ::getThunk(t, doubleToIntThunk); + } else if(size == 8 && resultSize == 8) { + return ::getThunk(t, doubleToLongThunk); + } + case Int2Float: + if (size == 4 && resultSize == 4) { + return ::getThunk(t, intToFloatThunk); + } else if(size == 4 && resultSize == 8) { + return ::getThunk(t, intToDoubleThunk); + } else if(size == 8 && resultSize == 4) { + return ::getThunk(t, longToFloatThunk); + } else if(size == 8 && resultSize == 8) { + return ::getThunk(t, longToDoubleThunk); + } + + default: break; + } + + abort(t); + } - virtual intptr_t getThunk(TernaryOperation op, unsigned size) { + virtual intptr_t getThunk(TernaryOperation op, unsigned size UNUSED, unsigned resultSize) { switch (op) { case Divide: - if (size == 8) { + if (resultSize == 8) { return ::getThunk(t, divideLongThunk); } break; case Remainder: - if (size == 8) { + if (resultSize == 8) { return ::getThunk(t, moduloLongThunk); } break; + + case FloatAdd: + if(resultSize == 4) { + return ::getThunk(t, addFloatThunk); + } else { + return ::getThunk(t, addDoubleThunk); + } + case FloatSubtract: + if(resultSize == 4) { + return ::getThunk(t, subtractFloatThunk); + } else { + return ::getThunk(t, subtractDoubleThunk); + } + case FloatMultiply: + if(resultSize == 4) { + return ::getThunk(t, multiplyFloatThunk); + } else { + return ::getThunk(t, multiplyDoubleThunk); + } + case FloatDivide: + if(resultSize == 4) { + return ::getThunk(t, divideFloatThunk); + } else { + return ::getThunk(t, divideDoubleThunk); + } default: break; } @@ -1008,7 +1074,7 @@ class Frame { poppedLong(); 
return popLongQuiet(); } - + Compiler::Operand* popObject() { poppedObject(); return popQuiet(1); @@ -2071,6 +2137,22 @@ saveStateAndCompile(MyThread* t, Frame* initialFrame, unsigned ip) initialFrame->c->restoreState(state); } +bool +isCJump(unsigned instruction) +{ + switch(instruction) { + case ifeq: + case ifne: + case ifgt: + case ifge: + case iflt: + case ifle: + return true; + default: + return false; + } +} + void compile(MyThread* t, Frame* initialFrame, unsigned ip, int exceptionHandlerStart) @@ -2084,6 +2166,8 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, object code = methodCode(t, context->method); PROTECT(t, code); + + int lastFcmpl = 1, lastFcmpg = 1; while (ip < codeLength(t, code)) { if (context->visitTable[ip] ++) { @@ -2108,6 +2192,9 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, 0, 1, c->thread()); } + + ++ lastFcmpl; + ++ lastFcmpg; // fprintf(stderr, "ip: %d map: %ld\n", ip, *(frame->map)); @@ -2341,63 +2428,56 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, } break; case d2f: { - frame->pushInt - (c->call - (c->constant(getThunk(t, doubleToFloatThunk)), - 0, 0, 4, 2, - static_cast(0), frame->popLong())); + frame->pushInt(c->f2f(8, 4, frame->popLong())); } break; case d2i: { - frame->pushInt - (c->call - (c->constant(getThunk(t, doubleToIntThunk)), - 0, 0, 4, 2, - static_cast(0), frame->popLong())); + frame->pushInt(c->f2i(8, 4, frame->popLong())); } break; case d2l: { - frame->pushLong - (c->call - (c->constant(getThunk(t, doubleToLongThunk)), - 0, 0, 8, 2, - static_cast(0), frame->popLong())); + frame->pushLong(c->f2i(8, 8, frame->popLong())); } break; case dadd: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushLong - (c->call - (c->constant(getThunk(t, addDoubleThunk)), - 0, 0, 8, 4, - static_cast(0), a, - static_cast(0), b)); + frame->pushLong(c->fadd(8, a, b)); } break; case dcmpg: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = 
frame->popLong(); - frame->pushInt - (c->call - (c->constant(getThunk(t, compareDoublesGThunk)), - 0, 0, 4, 4, - static_cast(0), a, - static_cast(0), b)); + if(t->arch->supportsFloatCompare(8) && isCJump(codeBody(t, code, ip))) { + c->fcmp(8, a, b); + lastFcmpg = 0; + } else { + frame->pushInt + (c->call + (c->constant(getThunk(t, compareDoublesGThunk)), + 0, 0, 4, 4, + static_cast(0), a, + static_cast(0), b)); + } } break; case dcmpl: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushInt - (c->call - (c->constant(getThunk(t, compareDoublesLThunk)), - 0, 0, 4, 4, - static_cast(0), a, - static_cast(0), b)); + if(t->arch->supportsFloatCompare(8) && isCJump(codeBody(t, code, ip))) { + c->fcmp(8, a, b); + lastFcmpl = 0; + } else { + frame->pushInt + (c->call + (c->constant(getThunk(t, compareDoublesLThunk)), + 0, 0, 4, 4, + static_cast(0), a, + static_cast(0), b)); + } } break; case dconst_0: @@ -2412,56 +2492,32 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushLong - (c->call - (c->constant(getThunk(t, divideDoubleThunk)), - 0, 0, 8, 4, - static_cast(0), a, - static_cast(0), b)); + frame->pushLong(c->fdiv(8, a, b)); } break; case dmul: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushLong - (c->call - (c->constant(getThunk(t, multiplyDoubleThunk)), - 0, 0, 8, 4, - static_cast(0), a, - static_cast(0), b)); + frame->pushLong(c->fmul(8, a, b)); } break; case dneg: { - frame->pushLong - (c->call - (c->constant(getThunk(t, negateDoubleThunk)), - 0, 0, 8, 2, - static_cast(0), frame->popLong())); + frame->pushLong(c->fneg(8, frame->popLong())); } break; case vm::drem: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushLong - (c->call - (c->constant(getThunk(t, moduloDoubleThunk)), - 0, 0, 8, 4, - static_cast(0), a, - static_cast(0), 
b)); + frame->pushLong(c->frem(8, a, b)); } break; case dsub: { Compiler::Operand* a = frame->popLong(); Compiler::Operand* b = frame->popLong(); - frame->pushLong - (c->call - (c->constant(getThunk(t, subtractDoubleThunk)), - 0, 0, 8, 4, - static_cast(0), a, - static_cast(0), b)); + frame->pushLong(c->fsub(8, a, b)); } break; case dup: @@ -2489,54 +2545,52 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, break; case f2d: { - frame->pushLong - (c->call - (c->constant(getThunk(t, floatToDoubleThunk)), - 0, 0, 8, 1, frame->popInt())); + frame->pushLong(c->f2f(4, 8, frame->popInt())); } break; case f2i: { - frame->pushInt - (c->call - (c->constant(getThunk(t, floatToIntThunk)), - 0, 0, 4, 1, frame->popInt())); + frame->pushInt(c->f2i(4, 4, frame->popInt())); } break; case f2l: { - frame->pushLong - (c->call - (c->constant(getThunk(t, floatToLongThunk)), - 0, 0, 8, 1, frame->popInt())); + frame->pushLong(c->f2i(4, 8, frame->popInt())); } break; case fadd: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, addFloatThunk)), - 0, 0, 4, 2, a, b)); + frame->pushInt(c->fadd(4, a, b)); } break; case fcmpg: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, compareFloatsGThunk)), - 0, 0, 4, 2, a, b)); + if(t->arch->supportsFloatCompare(4) && isCJump(codeBody(t, code, ip))) { + c->fcmp(4, a, b); + lastFcmpg = 0; + } else { + frame->pushInt + (c->call + (c->constant(getThunk(t, compareFloatsGThunk)), + 0, 0, 4, 2, a, b)); + } } break; case fcmpl: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, compareFloatsLThunk)), - 0, 0, 4, 2, a, b)); + if(t->arch->supportsFloatCompare(4) && isCJump(codeBody(t, code, ip))) { + c->fcmp(4, a, b); + lastFcmpl = 0; + } else { + frame->pushInt + (c->call + (c->constant(getThunk(t, 
compareFloatsLThunk)), + 0, 0, 4, 2, a, b)); + } } break; case fconst_0: @@ -2555,47 +2609,32 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, divideFloatThunk)), - 0, 0, 4, 2, a, b)); + frame->pushInt(c->fdiv(4, a, b)); } break; case fmul: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, multiplyFloatThunk)), - 0, 0, 4, 2, a, b)); + frame->pushInt(c->fmul(4, a, b)); } break; case fneg: { - frame->pushInt - (c->call - (c->constant(getThunk(t, negateFloatThunk)), - 0, 0, 4, 1, frame->popInt())); + frame->pushInt(c->fneg(4, frame->popInt())); } break; case vm::frem: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, moduloFloatThunk)), - 0, 0, 4, 2, a, b)); + frame->pushInt(c->frem(4, a, b)); } break; case fsub: { Compiler::Operand* a = frame->popInt(); Compiler::Operand* b = frame->popInt(); - frame->pushInt - (c->call - (c->constant(getThunk(t, subtractFloatThunk)), - 0, 0, 4, 2, a, b)); + frame->pushInt(c->fsub(4, a, b)); } break; case getfield: @@ -2731,17 +2770,11 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, } break; case i2d: { - frame->pushLong - (c->call - (c->constant(getThunk(t, intToDoubleThunk)), - 0, 0, 8, 1, frame->popInt())); + frame->pushLong(c->i2f(4, 8, frame->popInt())); } break; case i2f: { - frame->pushInt - (c->call - (c->constant(getThunk(t, intToFloatThunk)), - 0, 0, 4, 1, frame->popInt())); + frame->pushInt(c->i2f(4, 4, frame->popInt())); } break; case i2l: @@ -2869,27 +2902,48 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, uint32_t newIp = (ip - 3) + offset; assert(t, newIp < codeLength(t, code)); - Compiler::Operand* a = frame->popInt(); Compiler::Operand* target = frame->machineIp(newIp); + 
Compiler::Operand* cont = frame->machineIp(ip); - c->cmp(4, c->constant(0), a); + if(lastFcmpl != 1 && lastFcmpg != 1) { + Compiler::Operand* a = frame->popInt(); + c->cmp(4, c->constant(0), a); + } switch (instruction) { case ifeq: + if(lastFcmpl == 1 || lastFcmpg == 1) { + c->juo(cont); + } c->je(target); break; case ifne: + if(lastFcmpl == 1 || lastFcmpg == 1) { + c->juo(cont); + } c->jne(target); break; case ifgt: + if(lastFcmpl == 1) { + c->juo(cont); + } c->jg(target); break; case ifge: + if(lastFcmpl == 1) { + c->juo(cont); + } c->jge(target); break; case iflt: + if(lastFcmpg == 1) { + c->juo(cont); + } c->jl(target); break; case ifle: + if(lastFcmpg == 1) { + c->juo(cont); + } c->jle(target); break; } @@ -3033,8 +3087,40 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, if (UNLIKELY(t->exception)) return; assert(t, methodFlags(t, target) & ACC_STATIC); - - compileDirectInvoke(t, frame, target); + int params = methodParameterCount(t, target); + if(params == 1) {//TODO: Get number of method params + BinaryOperation op = t->arch->hasBinaryIntrinsic(t, target); + if(op != NoBinaryOperation) { + printf("Could use binary intrinsic %i.\n", op); + int opSize = methodParameterFootprint(t, target) * BytesPerWord; + int resSize = resultSize(t, methodReturnCode(t, target)); + Compiler::Operand* param; + if(opSize == 4) { + param = frame->popInt(); + } else { + param = frame->popLong(); + } + if(resSize == 4) { + frame->pushInt(c->operation(op, opSize, resSize, param)); + } else { + frame->pushLong(c->operation(op, opSize, resSize, param)); + } + } else { + compileDirectInvoke(t, frame, target); + } + } else if(params == 2) { //TODO: Get number of method params + TernaryOperation op = t->arch->hasTernaryIntrinsic(t, target); + if(op != NoTernaryOperation) { + printf("Could use ternary intrinsic %i.\n", op); + //int aSize, bSize; + //int resSize = resultSize(t, methodReturnCode(t, target)); + compileDirectInvoke(t, frame, target); //TODO: use intrinsic + } else { 
+ compileDirectInvoke(t, frame, target); + } + } else { + compileDirectInvoke(t, frame, target); + } } break; case invokevirtual: { @@ -3187,19 +3273,11 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, } break; case l2d: { - frame->pushLong - (c->call - (c->constant(getThunk(t, longToDoubleThunk)), - 0, 0, 8, 2, - static_cast(0), frame->popLong())); + frame->pushLong(c->i2f(8, 8, frame->popLong())); } break; case l2f: { - frame->pushInt - (c->call - (c->constant(getThunk(t, longToFloatThunk)), - 0, 0, 4, 2, - static_cast(0), frame->popLong())); + frame->pushInt(c->i2f(8, 4, frame->popLong())); } break; case l2i: @@ -4095,7 +4173,6 @@ clearBit(MyThread* t, object map, unsigned count, unsigned size, unsigned i, intArrayBody(t, map, count + (index / 32)) &= ~(static_cast(1) << (index % 32)); } - uint8_t* finish(MyThread* t, Allocator* allocator, Context* context) { @@ -4260,6 +4337,7 @@ finish(MyThread* t, Allocator* allocator, Context* context) "printStackTrace") == 0) { trap(); + printf("Address: %p\n", ::vmAddressFromLine(t, (object)(context->method), 1176)); } syncInstructionCache(start, codeSize); From 53c0656ee7225c4c80374524af2ba8a0100e1e39 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:14:31 -0600 Subject: [PATCH 06/16] added floating point support, split plan method --- src/compiler.cpp | 333 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 256 insertions(+), 77 deletions(-) diff --git a/src/compiler.cpp b/src/compiler.cpp index f48fa446a6..6ef7751355 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -286,11 +286,16 @@ intersect(const SiteMask& a, const SiteMask& b) intersectFrameIndexes(a.frameIndex, b.frameIndex)); } +enum ValueType { + ValueGeneral, + ValueFloat +}; + class Value: public Compiler::Operand { public: Value(Site* site, Site* target): reads(0), lastRead(0), sites(site), source(0), target(target), buddy(this), - high(0), home(NoFrameIndex) + high(0), home(NoFrameIndex), type(ValueGeneral) { } 
virtual void addPredecessor(Context*, Event*) { } @@ -303,6 +308,7 @@ class Value: public Compiler::Operand { Value* buddy; Value* high; int8_t home; + ValueType type; }; class Context { @@ -338,12 +344,19 @@ class Context { machineCodeSize(0), alignedFrameSize(0), availableRegisterCount(arch->registerCount()), + floatRegisterCount(arch->floatRegisterCount()), + generalRegisterCount(arch->generalRegisterCount()), constantCompare(CompareNone) { for (unsigned i = 0; i < arch->registerCount(); ++i) { new (registerResources + i) RegisterResource(arch->reserved(i)); if (registerResources[i].reserved) { -- availableRegisterCount; + if (arch->generalRegisters() & (1 << i)) { + -- generalRegisterCount; + } else if (arch->floatRegisters() & (1 << i)) { + -- floatRegisterCount; + } } } } @@ -375,6 +388,8 @@ class Context { unsigned machineCodeSize; unsigned alignedFrameSize; unsigned availableRegisterCount; + unsigned floatRegisterCount; + unsigned generalRegisterCount; ConstantCompare constantCompare; }; @@ -949,20 +964,41 @@ buddies(Value* a, Value* b) } void -decrementAvailableRegisterCount(Context* c) +decrementAvailableRegisterCount(Context* c, Value* v) { assert(c, c->availableRegisterCount); -- c->availableRegisterCount; + if (v) { + if (v->type == ValueGeneral) { + -- c->generalRegisterCount; + } else if (v->type == ValueFloat) { + -- c->floatRegisterCount; + } + } else { + -- c->generalRegisterCount; + } + + if (DebugResources) { - fprintf(stderr, "%d registers available\n", c->availableRegisterCount); + fprintf(stderr, "%d registers available - %d float, %d general\n", c->availableRegisterCount, c->floatRegisterCount, c->generalRegisterCount); } } void -incrementAvailableRegisterCount(Context* c) +incrementAvailableRegisterCount(Context* c, Value* v) { ++ c->availableRegisterCount; + + if (v) { + if (v->type == ValueGeneral) { + ++ c->generalRegisterCount; + } else if (v->type == ValueFloat) { + ++ c->floatRegisterCount; + } + } else { + ++ c->generalRegisterCount; 
+ } if (DebugResources) { fprintf(stderr, "%d registers available\n", c->availableRegisterCount); @@ -981,7 +1017,7 @@ increment(Context* c, RegisterResource* r) ++ r->referenceCount; if (r->referenceCount == 1) { - decrementAvailableRegisterCount(c); + decrementAvailableRegisterCount(c, r->value); } } } @@ -1000,7 +1036,7 @@ decrement(Context* c, Resource* r) -- r->referenceCount; if (r->referenceCount == 0) { - incrementAvailableRegisterCount(c); + incrementAvailableRegisterCount(c, r->value); } } } @@ -1023,7 +1059,7 @@ RegisterResource::freeze(Context* c, Value* v) freezeResource(c, this, v); if (freezeCount == 1) { - decrementAvailableRegisterCount(c); + decrementAvailableRegisterCount(c, v); } } } @@ -1056,7 +1092,7 @@ RegisterResource::thaw(Context* c, Value* v) thawResource(c, this, v); if (freezeCount == 0) { - incrementAvailableRegisterCount(c); + incrementAvailableRegisterCount(c, v); } } } @@ -1112,6 +1148,13 @@ pickRegisterTarget(Context* c, Value* v, uint32_t mask, unsigned* cost) { int target = NoRegister; unsigned bestCost = Target::Impossible; + if (v) { + if (v->type == ValueFloat) { + mask &= (c->arch->floatRegisters() | c->arch->generalRegisters()); + } else if(v->type == ValueGeneral) { + mask &= c->arch->generalRegisters(); + } + } for (int i = c->arch->registerCount() - 1; i >= 0; --i) { if ((1 << i) & mask) { RegisterResource* r = c->registerResources + i; @@ -1190,15 +1233,30 @@ pickTarget(Context* c, Read* read, bool intersectRead, SiteMask mask; read->intersect(&mask); - unsigned registerPenalty = (c->availableRegisterCount > registerReserveCount + unsigned registerPenalty; + if(read->value) { + if(read->value->type == ValueGeneral) { + registerPenalty = (c->generalRegisterCount > registerReserveCount ? 0 : Target::Penalty); + } else if(read->value->type == ValueFloat) { + registerPenalty = (c->floatRegisterCount > registerReserveCount + ? 
0 : Target::Penalty); + } else { + registerPenalty = (c->availableRegisterCount > registerReserveCount + ? 0 : Target::Penalty); + } + } else { + registerPenalty = (c->availableRegisterCount > registerReserveCount + ? 0 : Target::Penalty); + } + Target best; if ((mask.typeMask & (1 << RegisterOperand))) { Target mine = pickRegisterTarget(c, read->value, mask.registerMask); mine.cost += registerPenalty; - + if(mine.cost == Target::Impossible) asm("int3"); if (mine.cost == 0) { return mine; } else if (mine.cost < best.cost) { @@ -1916,17 +1974,17 @@ read(Context* c, const SiteMask& mask) } Read* -anyRegisterRead(Context* c) +generalRegisterRead(Context* c) { - return read(c, SiteMask(1 << RegisterOperand, ~0, NoFrameIndex)); + return read(c, SiteMask(1 << RegisterOperand, c->arch->generalRegisters(), NoFrameIndex)); } Read* -registerOrConstantRead(Context* c) +generalRegisterOrConstantRead(Context* c) { return read (c, SiteMask - ((1 << RegisterOperand) | (1 << ConstantOperand), ~0, NoFrameIndex)); + ((1 << RegisterOperand) | (1 << ConstantOperand), c->arch->generalRegisters(), NoFrameIndex)); } Read* @@ -2524,12 +2582,9 @@ maybeMove(Context* c, BinaryOperation type, unsigned srcSize, bool thunk; uint8_t srcTypeMask; uint64_t srcRegisterMask; - uint8_t dstTypeMask; - uint64_t dstRegisterMask; - c->arch->plan(type, dstSize, &srcTypeMask, &srcRegisterMask, - dstSize, &dstTypeMask, &dstRegisterMask, - &thunk); + c->arch->planSource(type, dstSize, &srcTypeMask, &srcRegisterMask, + dstSize, &thunk); assert(c, dstMask.typeMask & srcTypeMask & (1 << RegisterOperand)); @@ -2758,12 +2813,14 @@ appendMove(Context* c, BinaryOperation type, unsigned srcSize, uint8_t dstTypeMask; uint64_t dstRegisterMask; - c->arch->plan(type, srcSelectSize, &srcTypeMask, &srcRegisterMask, - dstSize, &dstTypeMask, &dstRegisterMask, - &thunk); + c->arch->planSource(type, srcSelectSize, &srcTypeMask, &srcRegisterMask, + dstSize, &thunk); assert(c, not thunk); + c->arch->planDestination(type, 
srcSelectSize, &srcTypeMask, &srcRegisterMask, + dstSize, &dstTypeMask, &dstRegisterMask); + append(c, new (c->zone->allocate(sizeof(MoveEvent))) MoveEvent (c, type, srcSize, srcSelectSize, src, dstSize, dst, @@ -2787,10 +2844,11 @@ findConstantSite(Context* c, Value* v) class CompareEvent: public Event { public: - CompareEvent(Context* c, unsigned size, Value* first, Value* second, + CompareEvent(Context* c, BinaryOperation type, unsigned size, Value* first, Value* second, const SiteMask& firstMask, const SiteMask& secondMask): - Event(c), size(size), first(first), second(second) + Event(c), type(type), size(size), first(first), second(second) { + assert(c, type != FloatCompare || (first->type == ValueFloat && first->type == ValueFloat)); addRead(c, this, first, read(c, firstMask)); addRead(c, this, second, read(c, secondMask)); } @@ -2817,20 +2875,21 @@ class CompareEvent: public Event { } else { c->constantCompare = CompareNone; - apply(c, Compare, size, first->source, 0, size, second->source, 0); + apply(c, type, size, first->source, 0, size, second->source, 0); } popRead(c, this, first); popRead(c, this, second); } - + + BinaryOperation type; unsigned size; Value* first; Value* second; }; void -appendCompare(Context* c, unsigned size, Value* first, Value* second) +appendCompare(Context* c, BinaryOperation op, unsigned size, Value* first, Value* second) { bool thunk; uint8_t firstTypeMask; @@ -2838,15 +2897,17 @@ appendCompare(Context* c, unsigned size, Value* first, Value* second) uint8_t secondTypeMask; uint64_t secondRegisterMask; - c->arch->plan(Compare, size, &firstTypeMask, &firstRegisterMask, - size, &secondTypeMask, &secondRegisterMask, - &thunk); + c->arch->planSource(op, size, &firstTypeMask, &firstRegisterMask, + size, &thunk); assert(c, not thunk); // todo + c->arch->planDestination(op, size, &firstTypeMask, &firstRegisterMask, + size, &secondTypeMask, &secondRegisterMask); + append(c, new (c->zone->allocate(sizeof(CompareEvent))) CompareEvent - (c, 
size, first, second, + (c, op, size, first, second, SiteMask(firstTypeMask, firstRegisterMask, AnyFrameIndex), SiteMask(secondTypeMask, secondRegisterMask, AnyFrameIndex))); } @@ -2867,7 +2928,7 @@ getTarget(Context* c, Value* value, Value* result, const SiteMask& resultMask) Site* s; Value* v; Read* r = liveNext(c, value); - if (c->arch->condensedAddressing() or r == 0) { + if (r == 0 and value->source->match(c, static_cast(resultMask))) { s = value->source; v = value; if (r and not hasMoreThanOneSite(v)) { @@ -2911,6 +2972,13 @@ thawSource(Context* c, unsigned size, Value* v) } } +uint64_t +registerMask(Value* v) { + Site* s = source(v); + if(!s) return 0; + else return static_cast(1) << ((RegisterSite*)s)->number; +} + class CombineEvent: public Event { public: CombineEvent(Context* c, TernaryOperation type, @@ -2920,13 +2988,10 @@ class CombineEvent: public Event { const SiteMask& firstLowMask, const SiteMask& firstHighMask, const SiteMask& secondLowMask, - const SiteMask& secondHighMask, - const SiteMask& resultLowMask, - const SiteMask& resultHighMask): + const SiteMask& secondHighMask): Event(c), type(type), firstSize(firstSize), first(first), secondSize(secondSize), second(second), resultSize(resultSize), - result(result), resultLowMask(resultLowMask), - resultHighMask(resultHighMask) + result(result) { addRead(c, this, first, read(c, firstLowMask)); if (firstSize > BytesPerWord) { @@ -2949,6 +3014,17 @@ class CombineEvent: public Event { virtual void compile(Context* c) { freezeSource(c, firstSize, first); + + uint8_t aTypeMask = first->source->type(c); + uint8_t bTypeMask = second->source->type(c); + uint8_t cTypeMask; + uint64_t aRegisterMask = (registerMask(first->high) << 32) | registerMask(first); + uint64_t bRegisterMask = (registerMask(second->high) << 32) | registerMask(second); + uint64_t cRegisterMask; + + c->arch->planDestination(type, firstSize, &aTypeMask, &aRegisterMask, secondSize, &bTypeMask, &bRegisterMask, resultSize, &cTypeMask, 
&cRegisterMask); + SiteMask resultLowMask(cTypeMask, cRegisterMask, AnyFrameIndex); + SiteMask resultHighMask(cTypeMask, cRegisterMask >> 32, AnyFrameIndex); Site* low = getTarget(c, second, result, resultLowMask); Site* high @@ -2987,8 +3063,6 @@ class CombineEvent: public Event { Value* second; unsigned resultSize; Value* result; - SiteMask resultLowMask; - SiteMask resultHighMask; }; void @@ -3284,13 +3358,10 @@ appendCombine(Context* c, TernaryOperation type, uint64_t firstRegisterMask; uint8_t secondTypeMask; uint64_t secondRegisterMask; - uint8_t resultTypeMask; - uint64_t resultRegisterMask; - c->arch->plan(type, firstSize, &firstTypeMask, &firstRegisterMask, + c->arch->planSource(type, firstSize, &firstTypeMask, &firstRegisterMask, secondSize, &secondTypeMask, &secondRegisterMask, - resultSize, &resultTypeMask, &resultRegisterMask, - &thunk); + resultSize, &thunk); if (thunk) { Stack* oldStack = c->stack; @@ -3302,7 +3373,7 @@ appendCombine(Context* c, TernaryOperation type, c->stack = oldStack; appendCall - (c, value(c, constantSite(c, c->client->getThunk(type, resultSize))), + (c, value(c, constantSite(c, c->client->getThunk(type, firstSize, resultSize))), 0, 0, result, resultSize, argumentStack, ceiling(secondSize, BytesPerWord) + ceiling(firstSize, BytesPerWord), 0); @@ -3317,22 +3388,17 @@ appendCombine(Context* c, TernaryOperation type, SiteMask(firstTypeMask, firstRegisterMask, AnyFrameIndex), SiteMask(firstTypeMask, firstRegisterMask >> 32, AnyFrameIndex), SiteMask(secondTypeMask, secondRegisterMask, AnyFrameIndex), - SiteMask(secondTypeMask, secondRegisterMask >> 32, AnyFrameIndex), - SiteMask(resultTypeMask, resultRegisterMask, AnyFrameIndex), - SiteMask(resultTypeMask, resultRegisterMask >> 32, AnyFrameIndex))); + SiteMask(secondTypeMask, secondRegisterMask >> 32, AnyFrameIndex))); } } class TranslateEvent: public Event { public: - TranslateEvent(Context* c, BinaryOperation type, unsigned size, Value* value, + TranslateEvent(Context* c, 
BinaryOperation type, unsigned size, unsigned resSize, Value* value, Value* result, const SiteMask& valueLowMask, - const SiteMask& valueHighMask, - const SiteMask& resultLowMask, - const SiteMask& resultHighMask): - Event(c), type(type), size(size), value(value), result(result), - resultLowMask(resultLowMask), resultHighMask(resultHighMask) + const SiteMask& valueHighMask): + Event(c), type(type), size(size), resSize(resSize), value(value), result(result) { addRead(c, this, value, read(c, valueLowMask)); if (size > BytesPerWord) { @@ -3346,6 +3412,15 @@ class TranslateEvent: public Event { } virtual void compile(Context* c) { + uint8_t aTypeMask = value->source->type(c); + uint8_t bTypeMask; + uint64_t aRegisterMask = (registerMask(value->high) << 32) | registerMask(value); + uint64_t bRegisterMask; + + c->arch->planDestination(type, size, &aTypeMask, &aRegisterMask, resSize, &bTypeMask, &bRegisterMask); + SiteMask resultLowMask(bTypeMask, bRegisterMask, AnyFrameIndex); + SiteMask resultHighMask(bTypeMask, bRegisterMask >> 32, AnyFrameIndex); + Site* low = getTarget(c, value, result, resultLowMask); Site* high = (size > BytesPerWord @@ -3375,6 +3450,7 @@ class TranslateEvent: public Event { BinaryOperation type; unsigned size; + unsigned resSize; Value* value; Value* result; Read* resultRead; @@ -3383,28 +3459,35 @@ class TranslateEvent: public Event { }; void -appendTranslate(Context* c, BinaryOperation type, unsigned size, Value* value, - Value* result) +appendTranslate(Context* c, BinaryOperation type, unsigned firstSize, Value* first, + unsigned resultSize, Value* result) { bool thunk; uint8_t firstTypeMask; uint64_t firstRegisterMask; - uint8_t resultTypeMask; - uint64_t resultRegisterMask; - c->arch->plan(type, size, &firstTypeMask, &firstRegisterMask, - size, &resultTypeMask, &resultRegisterMask, - &thunk); + c->arch->planSource(type, firstSize, &firstTypeMask, &firstRegisterMask, + resultSize, &thunk); - assert(c, not thunk); // todo + if (thunk) { + 
Stack* oldStack = c->stack; - append(c, new (c->zone->allocate(sizeof(TranslateEvent))) - TranslateEvent - (c, type, size, value, result, - SiteMask(firstTypeMask, firstRegisterMask, AnyFrameIndex), - SiteMask(firstTypeMask, firstRegisterMask >> 32, AnyFrameIndex), - SiteMask(resultTypeMask, resultRegisterMask, AnyFrameIndex), - SiteMask(resultTypeMask, resultRegisterMask >> 32, AnyFrameIndex))); + ::push(c, ceiling(firstSize, BytesPerWord), first); + + Stack* argumentStack = c->stack; + c->stack = oldStack; + + appendCall + (c, value(c, constantSite(c, c->client->getThunk(type, firstSize, resultSize))), + 0, 0, result, resultSize, argumentStack, + ceiling(firstSize, BytesPerWord), 0); + } else { + append(c, new (c->zone->allocate(sizeof(TranslateEvent))) + TranslateEvent + (c, type, firstSize, resultSize, first, result, + SiteMask(firstTypeMask, firstRegisterMask, AnyFrameIndex), + SiteMask(firstTypeMask, firstRegisterMask >> 32, AnyFrameIndex))); + } } class BarrierEvent: public Event { @@ -3437,9 +3520,9 @@ class MemoryEvent: public Event { Event(c), base(base), displacement(displacement), index(index), scale(scale), result(result) { - addRead(c, this, base, anyRegisterRead(c)); + addRead(c, this, base, generalRegisterRead(c)); if (index) { - addRead(c, this, index, registerOrConstantRead(c)); + addRead(c, this, index, generalRegisterOrConstantRead(c)); } } @@ -3617,8 +3700,8 @@ class BoundsCheckEvent: public Event { Event(c), object(object), lengthOffset(lengthOffset), index(index), handler(handler) { - addRead(c, this, object, anyRegisterRead(c)); - addRead(c, this, index, registerOrConstantRead(c)); + addRead(c, this, object, generalRegisterRead(c)); + addRead(c, this, index, generalRegisterOrConstantRead(c)); } virtual const char* name() { @@ -4322,10 +4405,8 @@ populateSources(Context* c, Event* e) { SiteRecord frozenRecords[e->readCount]; SiteRecordList frozen(frozenRecords, e->readCount); - for (Read* r = e->reads; r; r = r->eventNext) { r->value->source 
= readSource(c, r); - if (r->value->source) { if (DebugReads) { char buffer[256]; r->value->source->toString(c, buffer, 256); @@ -5188,10 +5269,20 @@ class MyCompiler: public Compiler { } virtual void cmp(unsigned size, Operand* a, Operand* b) { - appendCompare(&c, size, static_cast(a), + appendCompare(&c, Compare, size, static_cast(a), static_cast(b)); } + virtual void fcmp(unsigned size, Operand* a, Operand* b) { + static_cast(a)->type = ValueFloat; + static_cast(b)->type = ValueFloat; + appendCompare(&c, FloatCompare, size, static_cast(a), + static_cast(b)); + //static_cast(a)->type = ValueGeneral; + //static_cast(b)->type = ValueGeneral; + } + + virtual void jl(Operand* address) { appendBranch(&c, JumpIfLess, static_cast(address)); } @@ -5215,6 +5306,10 @@ class MyCompiler: public Compiler { virtual void jne(Operand* address) { appendBranch(&c, JumpIfNotEqual, static_cast(address)); } + + virtual void juo(Operand* address) { + appendBranch(&c, JumpIfUnordered, static_cast(address)); + } virtual void jmp(Operand* address) { appendBranch(&c, Jump, static_cast(address)); @@ -5255,6 +5350,46 @@ class MyCompiler: public Compiler { return result; } + virtual Operand* fadd(unsigned size, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, FloatAdd, size, static_cast(a), + size, static_cast(b), size, result); + return result; + } + + virtual Operand* fsub(unsigned size, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, FloatSubtract, size, static_cast(a), + size, static_cast(b), size, result); + return result; + } + + virtual Operand* fmul(unsigned size, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, FloatMultiply, size, static_cast(a), + size, static_cast(b), size, result); + return result; + } + + virtual Operand* 
fdiv(unsigned size, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, FloatDivide, size, static_cast(a), + size, static_cast(b), size, result); + return result; + } + + virtual Operand* frem(unsigned size, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, FloatRemainder, size, static_cast(a), + size, static_cast(b), size, result); + return result; + } + virtual Operand* shl(unsigned size, Operand* a, Operand* b) { Value* result = value(&c); appendCombine(&c, ShiftLeft, BytesPerWord, static_cast(a), @@ -5299,7 +5434,51 @@ class MyCompiler: public Compiler { virtual Operand* neg(unsigned size, Operand* a) { Value* result = value(&c); - appendTranslate(&c, Negate, size, static_cast(a), result); + appendTranslate(&c, Negate, size, static_cast(a), size, result); + return result; + } + + virtual Operand* fneg(unsigned size, Operand* a) { + Value* result = value(&c); + static_cast(a)->type = ValueFloat; + appendTranslate(&c, FloatNegate, size, static_cast(a), size, result); + return result; + } + + virtual Operand* operation(BinaryOperation op, unsigned aSize, unsigned resSize, Operand* a) { + Value* result = value(&c); + static_cast(a)->type = ValueFloat; + appendTranslate(&c, op, aSize, static_cast(a), resSize, result); + return result; + } + + virtual Operand* operation(TernaryOperation op, unsigned aSize, unsigned bSize, unsigned resSize, Operand* a, Operand* b) { + Value* result = value(&c); + static_cast(a)->type = static_cast(b)->type = ValueFloat; + appendCombine(&c, op, aSize, static_cast(a), + bSize, static_cast(b), resSize, result); + return result; + } + + virtual Operand* f2f(unsigned aSize, unsigned resSize, Operand* a) { + Value* result = value(&c); + static_cast(a)->type = ValueFloat; + appendTranslate(&c, Float2Float, aSize, static_cast(a), resSize, result); + return result; + } + + 
virtual Operand* f2i(unsigned aSize, unsigned resSize, Operand* a) { + Value* result = value(&c); + static_cast(a)->type = ValueFloat; + appendTranslate(&c, Float2Int, aSize, static_cast(a), resSize, result); + return result; + } + + virtual Operand* i2f(unsigned aSize, unsigned resSize, Operand* a) { + Value* result = value(&c); + //result->type = ValueFloat; + appendTranslate(&c, Int2Float, aSize, static_cast(a), resSize, result); + //result->type = ValueGeneral; return result; } From 61bc7299743047ffc6c8f7995fe343ced3556189 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:17:48 -0600 Subject: [PATCH 07/16] added floating point support, split plan method --- src/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler.h b/src/compiler.h index b2a702596d..8d78720e25 100644 --- a/src/compiler.h +++ b/src/compiler.h @@ -21,7 +21,7 @@ class Compiler { public: class Client { public: - virtual intptr_t getThunk(UnaryOperation op, unsigned size, unsigned resultSize) = 0; + virtual intptr_t getThunk(UnaryOperation op, unsigned size) = 0; virtual intptr_t getThunk(BinaryOperation op, unsigned size, unsigned resultSize) = 0; virtual intptr_t getThunk(TernaryOperation op, unsigned size, unsigned resultSize) = 0; }; From a2e639a2d21498c2e11f5cdc5a380f1c274e6897 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:26:22 -0600 Subject: [PATCH 08/16] added floating point support, split plan function --- src/x86.cpp | 690 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 639 insertions(+), 51 deletions(-) diff --git a/src/x86.cpp b/src/x86.cpp index 7fc7a343dc..c57e3b42fe 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -13,10 +13,15 @@ #include "assembler.h" #include "vector.h" +#include "machine.h" #define CAST1(x) reinterpret_cast(x) #define CAST2(x) reinterpret_cast(x) +const bool DebugSSE = false; +const bool EnableSSE = true; +const bool EnableSSE2 = true; + using namespace vm; namespace { 
@@ -40,6 +45,28 @@ enum { r15 = 15, }; +enum { + xmm0 = r15 + 1, + xmm1, + xmm2, + xmm3, + xmm4, + xmm5, + xmm6, + xmm7, + xmm8, + xmm9, + xmm10, + xmm11, + xmm12, + xmm13, + xmm14, + xmm15, +}; + +const unsigned GeneralRegisterMask = BytesPerWord == 4 ? 0x000000ff : 0x0000ffff; +const unsigned FloatRegisterMask = BytesPerWord == 4 ? 0x00ff0000 : 0xffff0000; + const unsigned FrameHeaderSize = 2; inline bool @@ -399,6 +426,35 @@ padding(AlignmentPadding* p, unsigned start, unsigned offset, return padding; } +extern "C" +bool detectFeature(unsigned ecx, unsigned edx); + +inline bool +supportsSSE() +{ + static int supported = -1; + if(supported == -1) { + supported = EnableSSE && detectFeature(0, 0x2000000); + if(DebugSSE) { + fprintf(stderr, "sse %sdetected.\n", supported ? "" : "not "); + } + } + return supported; +} + +inline bool +supportsSSE2() +{ + static int supported = -1; + if(supported == -1) { + supported = EnableSSE2 && detectFeature(0, 0x4000000); + if(DebugSSE) { + fprintf(stderr, "sse2 %sdetected.\n", supported ? 
"" : "not "); + } + } + return supported; +} + #define REX_W 0x48 #define REX_R 0x44 #define REX_X 0x42 @@ -503,6 +559,12 @@ inline void opcode(Context* c, uint8_t op1, uint8_t op2) { c->code.append(op2); } +inline void opcode(Context* c, uint8_t op1, uint8_t op2, uint8_t op3) { + c->code.append(op1); + c->code.append(op2); + c->code.append(op3); +} + void return_(Context* c) { @@ -667,6 +729,14 @@ jumpIfLessOrEqualC(Context* c, unsigned size UNUSED, Assembler::Constant* a) conditional(c, 0x8e, a); } +void +jumpIfUnorderedC(Context* c, unsigned size UNUSED, Assembler::Constant* a) +{ + assert(c, size == BytesPerWord); + + conditional(c, 0x8a, a); +} + void longJumpC(Context* c, unsigned size, Assembler::Constant* a) { @@ -806,11 +876,59 @@ moveCR2(Context* c, UNUSED unsigned aSize, Assembler::Constant* a, } } +inline bool floatReg(Assembler::Register* a) { + return a->low >= xmm0; +} + +void +sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + if(floatReg(a) && floatReg(b)) { + if(aSize == 4) { + opcode(c, 0xf3); + maybeRex(c, 4, a, b); + opcode(c, 0x0f, 0x10); + modrm(c, 0xc0, b, a); + } else { + opcode(c, 0xf2); + maybeRex(c, 4, a, b); + opcode(c, 0x0f, 0x10); + modrm(c, 0xc0, b, a); + } + } else if(floatReg(a)) { + opcode(c, 0x66); + maybeRex(c, aSize, a, b); + opcode(c, 0x0f, 0x7e); + modrm(c, 0xc0, b, a); + } else { + opcode(c, 0x66); + maybeRex(c, aSize, a, b); + opcode(c, 0x0f, 0x6e); + modrm(c, 0xc0, a, b); + } +} + +void +sseMoveCR(Context* c, unsigned aSize, Assembler::Constant* a, + unsigned bSize, Assembler::Register* b) +{ + assert(c, aSize <= BytesPerWord); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); + moveCR2(c, aSize, a, aSize, &tmp, 0); + sseMoveRR(c, aSize, &tmp, bSize, b); + c->client->releaseTemporary(tmp.low); +} + void moveCR(Context* c, unsigned aSize, Assembler::Constant* a, unsigned bSize, Assembler::Register* b) { - moveCR2(c, aSize, a, bSize, 
b, 0); + if(floatReg(b)) { + sseMoveCR(c, aSize, a, bSize, b); + } else { + moveCR2(c, aSize, a, bSize, b, 0); + } } void @@ -829,7 +947,11 @@ void moveRR(Context* c, unsigned aSize, Assembler::Register* a, UNUSED unsigned bSize, Assembler::Register* b) { - + if(floatReg(a) or floatReg(b)) { + sseMoveRR(c, aSize, a, bSize, b); + return; + } + if (BytesPerWord == 4 and aSize == 8 and bSize == 8) { Assembler::Register ah(a->high); Assembler::Register bh(b->high); @@ -902,10 +1024,25 @@ moveRR(Context* c, unsigned aSize, Assembler::Register* a, } } +void +sseMoveMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + opcode(c, 0x66); + maybeRex(c, aSize, b, a); + opcode(c, 0x0f, 0x6e); + modrmSibImm(c, b, a); +} + void moveMR(Context* c, unsigned aSize, Assembler::Memory* a, unsigned bSize, Assembler::Register* b) { + if(floatReg(b)) { + sseMoveMR(c, aSize, a, bSize, b); + return; + } + switch (aSize) { case 1: maybeRex(c, bSize, b, a); @@ -956,12 +1093,27 @@ moveMR(Context* c, unsigned aSize, Assembler::Memory* a, } } +void +sseMoveRM(Context* c, unsigned aSize, Assembler::Register* a, + UNUSED unsigned bSize, Assembler::Memory* b) +{ + opcode(c, 0x66); + maybeRex(c, aSize, a, b); + opcode(c, 0x0f, 0x7e); + modrmSibImm(c, a, b); +} + void moveRM(Context* c, unsigned aSize, Assembler::Register* a, unsigned bSize UNUSED, Assembler::Memory* b) { assert(c, aSize == bSize); + if(floatReg(a)) { + sseMoveRM(c, aSize, a, bSize, b); + return; + } + switch (aSize) { case 1: maybeRex(c, bSize, a, b); @@ -1066,7 +1218,7 @@ moveCM(Context* c, unsigned aSize UNUSED, Assembler::Constant* a, modrmSibImm(c, 0, b->scale, b->index, b->base, b->offset); c->code.append4(a->value->value()); } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, 8, a, 8, &tmp); moveRM(c, 8, &tmp, 8, b); c->client->releaseTemporary(tmp.low); @@ -1188,7 +1340,7 
@@ addCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); addRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1246,7 +1398,7 @@ subtractCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); subtractRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1335,7 +1487,7 @@ andCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); andRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1392,7 +1544,7 @@ orCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); orRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1448,7 +1600,7 @@ xorCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); xorRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1523,7 +1675,7 @@ compareCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); 
moveCR(c, aSize, a, aSize, &tmp); compareRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1537,7 +1689,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a, assert(c, aSize == bSize); if (BytesPerWord == 4 and aSize == 8) { - const uint32_t mask = ~((1 << rax) | (1 << rdx)); + const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx)); Assembler::Register tmp(c->client->acquireTemporary(mask), c->client->acquireTemporary(mask)); @@ -1560,7 +1712,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a, c->code.append4(v); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, aSize, &tmp); multiplyRR(c, aSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1605,7 +1757,7 @@ compareCM(Context* c, unsigned aSize, Assembler::Constant* a, abort(c); } } else { - Assembler::Register tmp(c->client->acquireTemporary()); + Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask)); moveCR(c, aSize, a, bSize, &tmp); compareRM(c, bSize, &tmp, bSize, b); c->client->releaseTemporary(tmp.low); @@ -1928,6 +2080,219 @@ unsignedShiftRightCR(Context* c, unsigned aSize UNUSED, Assembler::Constant* a, doShift(c, unsignedShiftRightRR, 0xe8, aSize, a, bSize, b); } +inline void floatRegOp(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b, uint8_t op, uint8_t mod = 0xc0) +{ + if(aSize == 4) { + opcode(c, 0xf3); + } else { + opcode(c, 0xf2); + } + maybeRex(c, bSize, a, b); + opcode(c, 0x0f, op); + modrm(c, mod, a, b); +} + +inline void floatMemOp(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b, uint8_t op) +{ + if(aSize == 4) { + opcode(c, 0xf3); + } else { + opcode(c, 0xf2); + } + maybeRex(c, bSize, b, a); + opcode(c, 0x0f, op); + modrmSibImm(c, b, a); +} + +void +floatSqrtRR(Context* c, unsigned 
aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatRegOp(c, aSize, a, 4, b, 0x51); +} + +void +floatSqrtMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, 4, b, 0x51); +} + +void +floatAddRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatRegOp(c, aSize, a, 4, b, 0x58); +} + +void +floatAddMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, 4, b, 0x58); +} + +void +floatSubtractRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatRegOp(c, aSize, a, 4, b, 0x5c); +} + +void +floatSubtractMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, 4, b, 0x5c); +} + +void +floatMultiplyRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatRegOp(c, aSize, a, 4, b, 0x59); +} + +void +floatMultiplyMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, 4, b, 0x59); +} + +void +floatDivideRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatRegOp(c, aSize, a, 4, b, 0x5e); +} + +void +floatDivideMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, 4, b, 0x5e); +} + +void +float2FloatRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + assert(c, supportsSSE2()); + floatRegOp(c, aSize, a, 4, b, 0x5a); +} + +void +float2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + assert(c, supportsSSE2()); + 
floatMemOp(c, aSize, a, 4, b, 0x5a); +} + +void +float2IntRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize, Assembler::Register* b) +{ + assert(c, !floatReg(b)); + floatRegOp(c, aSize, a, bSize, b, 0x2d); +} + +void +float2IntMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize, Assembler::Register* b) +{ + floatMemOp(c, aSize, a, bSize, b, 0x2d); +} + +void +int2FloatRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize, Assembler::Register* b) +{ + floatRegOp(c, bSize, a, aSize, b, 0x2a); +} + +void +int2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a, + unsigned bSize, Assembler::Register* b) +{ + floatMemOp(c, bSize, a, aSize, b, 0x2a); +} + +void +floatCompareRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + if (aSize == 8) { + opcode(c, 0x66); + } + maybeRex(c, 4, a, b); + opcode(c, 0x0f, 0x2e); + modrm(c, 0xc0, a, b); +} + +void +floatNegateRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + assert(c, floatReg(a) and floatReg(b)); + assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. + ResolvedPromise pcon(0x80000000); + Assembler::Constant con(&pcon); + if(a->low == b->low) { + Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask)); + moveCR(c, 4, &con, 4, &tmp); + maybeRex(c, 4, a, &tmp); + opcode(c, 0x0f, 0x57); + modrm(c, 0xc0, &tmp, a); + c->client->releaseTemporary(tmp.low); + } else { + moveCR(c, 4, &con, 4, b); + if(aSize == 8) opcode(c, 0x66); + maybeRex(c, 4, a, b); + opcode(c, 0x0f, 0x57); + modrm(c, 0xc0, a, b); + } +} + +void +floatAbsRR(Context* c, unsigned aSize UNUSED, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b) +{ + assert(c, floatReg(a) and floatReg(b)); + assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. 
+ ResolvedPromise pcon(0x7fffffff); + Assembler::Constant con(&pcon); + if(a->low == b->low) { + Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask)); + moveCR(c, 4, &con, 4, &tmp); + maybeRex(c, 4, a, &tmp); + opcode(c, 0x0f, 0x54); + modrm(c, 0xc0, &tmp, a); + c->client->releaseTemporary(tmp.low); + } else { + moveCR(c, 4, &con, 4, b); + maybeRex(c, 4, a, b); + opcode(c, 0x0f, 0x54); + modrm(c, 0xc0, a, b); + } +} + +void +absRR(Context* c, unsigned aSize, Assembler::Register* a, + unsigned bSize UNUSED, Assembler::Register* b UNUSED) +{ + assert(c, aSize == bSize and a->low == rax and b->low == rax); + Assembler::Register d(c->client->acquireTemporary(static_cast(1) << rdx)); + maybeRex(c, aSize, a, b); + opcode(c, 0x99); + xorRR(c, aSize, &d, aSize, a); + subtractRR(c, aSize, &d, aSize, a); + c->client->releaseTemporary(rdx); +} + void populateTables(ArchitectureContext* c) { @@ -1963,11 +2328,14 @@ populateTables(ArchitectureContext* c) uo[index(JumpIfGreaterOrEqual, C)] = CAST1(jumpIfGreaterOrEqualC); uo[index(JumpIfLess, C)] = CAST1(jumpIfLessC); uo[index(JumpIfLessOrEqual, C)] = CAST1(jumpIfLessOrEqualC); + uo[index(JumpIfUnordered, C)] = CAST1(jumpIfUnorderedC); uo[index(LongJump, C)] = CAST1(longJumpC); bo[index(Negate, R, R)] = CAST2(negateRR); + bo[index(FloatNegate, R, R)] = CAST2(floatNegateRR); + bo[index(Move, R, R)] = CAST2(moveRR); bo[index(Move, C, R)] = CAST2(moveCR); bo[index(Move, M, R)] = CAST2(moveMR); @@ -1975,6 +2343,9 @@ populateTables(ArchitectureContext* c) bo[index(Move, C, M)] = CAST2(moveCM); bo[index(Move, A, R)] = CAST2(moveAR); + bo[index(FloatSqrt, R, R)] = CAST2(floatSqrtRR); + bo[index(FloatSqrt, M, R)] = CAST2(floatSqrtMR); + bo[index(MoveZ, R, R)] = CAST2(moveZRR); bo[index(MoveZ, M, R)] = CAST2(moveZMR); @@ -1983,12 +2354,20 @@ populateTables(ArchitectureContext* c) bo[index(Compare, C, M)] = CAST2(compareCM); bo[index(Compare, R, M)] = CAST2(compareRM); + bo[index(FloatCompare, R, R)] = 
CAST2(floatCompareRR); + bo[index(Add, R, R)] = CAST2(addRR); bo[index(Add, C, R)] = CAST2(addCR); bo[index(Subtract, C, R)] = CAST2(subtractCR); bo[index(Subtract, R, R)] = CAST2(subtractRR); + bo[index(FloatAdd, R, R)] = CAST2(floatAddRR); + bo[index(FloatAdd, M, R)] = CAST2(floatAddMR); + + bo[index(FloatSubtract, R, R)] = CAST2(floatSubtractRR); + bo[index(FloatSubtract, M, R)] = CAST2(floatSubtractMR); + bo[index(And, R, R)] = CAST2(andRR); bo[index(And, C, R)] = CAST2(andCR); @@ -2003,6 +2382,12 @@ populateTables(ArchitectureContext* c) bo[index(Divide, R, R)] = CAST2(divideRR); + bo[index(FloatMultiply, R, R)] = CAST2(floatMultiplyRR); + bo[index(FloatMultiply, M, R)] = CAST2(floatMultiplyMR); + + bo[index(FloatDivide, R, R)] = CAST2(floatDivideRR); + bo[index(FloatDivide, M, R)] = CAST2(floatDivideMR); + bo[index(Remainder, R, R)] = CAST2(remainderRR); bo[index(LongCompare, C, R)] = CAST2(longCompareCR); @@ -2016,8 +2401,19 @@ populateTables(ArchitectureContext* c) bo[index(UnsignedShiftRight, R, R)] = CAST2(unsignedShiftRightRR); bo[index(UnsignedShiftRight, C, R)] = CAST2(unsignedShiftRightCR); -} + bo[index(Float2Float, R, R)] = CAST2(float2FloatRR); + bo[index(Float2Float, M, R)] = CAST2(float2FloatMR); + + bo[index(Float2Int, R, R)] = CAST2(float2IntRR); + bo[index(Float2Int, M, R)] = CAST2(float2IntMR); + + bo[index(Int2Float, R, R)] = CAST2(int2FloatRR); + bo[index(Int2Float, M, R)] = CAST2(int2FloatMR); + + bo[index(Abs, R, R)] = CAST2(absRR); + bo[index(FloatAbs, R, R)] = CAST2(floatAbsRR); +} class MyArchitecture: public Assembler::Architecture { public: MyArchitecture(System* system): c(system), referenceCount(0) { @@ -2025,7 +2421,31 @@ class MyArchitecture: public Assembler::Architecture { } virtual unsigned registerCount() { - return (BytesPerWord == 4 ? 8 : 16); + if (supportsSSE()) { + return BytesPerWord == 4 ? 24 : 32; + } else { + return BytesPerWord == 4 ? 
8 : 16; + } + } + + virtual unsigned generalRegisterCount() { + return BytesPerWord == 4 ? 8 : 16; + } + + virtual unsigned floatRegisterCount() { + if (supportsSSE()) { + return BytesPerWord == 4 ? 8 : 16; + } else { + return 0; + } + } + + virtual uint64_t generalRegisters() { + return GeneralRegisterMask; + } + + virtual uint64_t floatRegisters() { + return supportsSSE() ? FloatRegisterMask : 0; } virtual int stack() { @@ -2044,10 +2464,6 @@ class MyArchitecture: public Assembler::Architecture { return (BytesPerWord == 4 ? rdx : NoRegister); } - virtual bool condensedAddressing() { - return true; - } - virtual bool bigEndian() { return false; } @@ -2058,7 +2474,7 @@ class MyArchitecture: public Assembler::Architecture { case rsp: case rbx: return true; - + default: return false; } @@ -2171,6 +2587,10 @@ class MyArchitecture: public Assembler::Architecture { return 0; } + virtual bool supportsFloatCompare(unsigned size) { + return supportsSSE() and size <= BytesPerWord; + } + virtual void nextFrame(void** stack, void** base) { assert(&c, *static_cast(*base) != *base); @@ -2189,61 +2609,206 @@ class MyArchitecture: public Assembler::Architecture { *thunk = false; } - virtual void plan + bool checkMethodClass(Thread* t, object method, const char* value) + { + return strcmp + (reinterpret_cast + (&byteArrayBody(t, className(t, methodClass(t, method)), 0)), + value) == 0; + } + + bool checkMethodName(Thread* t, object method, const char* value) + { + return strcmp + (reinterpret_cast + (&byteArrayBody(t, methodName(t, method), 0)), + value) == 0; + } + + bool checkMethodSpec(Thread* t, object method, const char* value) + { + return strcmp + (reinterpret_cast + (&byteArrayBody(t, methodSpec(t, method), 0)), + value) == 0; + } + + virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method) + { + if(checkMethodClass(t, method, "java/lang/Math")) { + if(supportsSSE() and checkMethodName(t, method, "sqrt") and checkMethodSpec(t, method, "(D)D") and BytesPerWord 
== 8) { + return FloatSqrt; + } else if(checkMethodName(t, method, "abs")) { + if(checkMethodSpec(t, method, "(I)I") or (checkMethodSpec(t, method, "(J)J") and BytesPerWord == 8)) { + return Abs; + } else if(supportsSSE() and supportsSSE2() and checkMethodSpec(t, method, "(F)F")) { + return FloatAbs; + } + } + } + return NoBinaryOperation; + } + + virtual TernaryOperation hasTernaryIntrinsic(Thread* t UNUSED, object method UNUSED) { + return NoTernaryOperation; + } + + virtual void planSource (BinaryOperation op, unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, - unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask, - bool* thunk) + unsigned bSize, bool* thunk) { *aTypeMask = ~0; - *aRegisterMask = ~static_cast(0); - - *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); - *bRegisterMask = ~static_cast(0); + *aRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); *thunk = false; switch (op) { case Compare: *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand); - *bTypeMask = (1 << RegisterOperand); + *aRegisterMask = GeneralRegisterMask; + break; + case FloatCompare: + assert(&c, supportsSSE() && aSize <= BytesPerWord); + *aTypeMask = (1 << RegisterOperand); + *aRegisterMask = FloatRegisterMask; break; - case Negate: *aTypeMask = (1 << RegisterOperand); - *bTypeMask = (1 << RegisterOperand); *aRegisterMask = (static_cast(1) << (rdx + 32)) | (static_cast(1) << rax); - *bRegisterMask = *aRegisterMask; break; - + case Abs: + *aTypeMask = (1 << RegisterOperand); + *aRegisterMask = (static_cast(1) << rax); + break; + case FloatAbs: + *aTypeMask = (1 << RegisterOperand); + *aRegisterMask = FloatRegisterMask; + break; + case FloatNegate: + if(!supportsSSE() or aSize == 8 or bSize == 8) { //floatNegateRR does not support doubles + *thunk = true; + } else { + *aTypeMask = (1 << RegisterOperand); + *aRegisterMask = FloatRegisterMask; + } + break; + case FloatSqrt: + *aTypeMask = (1 << RegisterOperand) | (1 << 
MemoryOperand); + *aRegisterMask = FloatRegisterMask; + break; + case Float2Float: + if(!supportsSSE() or !supportsSSE2() or BytesPerWord == 4) { + *thunk = true; + } else { + *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + *aRegisterMask = FloatRegisterMask; + } + break; + case Float2Int: + if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) { + *thunk = true; + } else { + *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand); + *aRegisterMask = FloatRegisterMask; + } + break; + case Int2Float: + if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) { + *thunk = true; + } else { + *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand); + *aRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); + } + break; case Move: + *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + *aRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); if (BytesPerWord == 4) { if (aSize == 4 and bSize == 8) { - const uint32_t mask = ~((1 << rax) | (1 << rdx)); - *aRegisterMask = (static_cast(mask) << 32) | mask; - *bRegisterMask = (static_cast(1) << (rdx + 32)) - | (static_cast(1) << rax); + *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx)); + *aRegisterMask = (static_cast(mask) << 32) | mask; } else if (aSize == 1 or bSize == 1) { + *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); const uint32_t mask = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx); - *aRegisterMask = (static_cast(mask) << 32) | mask; - *bRegisterMask = (static_cast(mask) << 32) | mask; + *aRegisterMask = (static_cast(mask) << 32) | mask; } } break; - default: break; } } - virtual void plan + virtual void planDestination + (BinaryOperation op, + unsigned aSize, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask, + unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask) + { + *bTypeMask = ~0; + 
*bRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); + switch (op) { + case Compare: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = GeneralRegisterMask; + break; + case FloatCompare: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = FloatRegisterMask; + break; + + case Abs: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = (static_cast(1) << rax); + break; + + case FloatAbs: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = *aRegisterMask; + break; + + case Negate: + case FloatNegate: + case FloatSqrt: + case Float2Float: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = *aRegisterMask; + break; + case Int2Float: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = FloatRegisterMask; + break; + case Float2Int: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); + break; + case Move: + *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + *bRegisterMask = GeneralRegisterMask | (static_cast(GeneralRegisterMask) << 32); + if (BytesPerWord == 4) { + if (aSize == 4 and bSize == 8) { + *bRegisterMask = (static_cast(1) << (rdx + 32)) + | (static_cast(1) << rax); + } else if (aSize == 1 or bSize == 1) { + const uint32_t mask + = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx); + *bRegisterMask = (static_cast(mask) << 32) | mask; + } + } + break; + default: + break; + } + } + + virtual void planSource (TernaryOperation op, - unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask, + unsigned aSize, uint8_t *aTypeMask, uint64_t *aRegisterMask, unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask, - unsigned, uint8_t* cTypeMask, uint64_t* cRegisterMask, - bool* thunk) + unsigned, bool* thunk) { *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand); *aRegisterMask = ~static_cast(0); @@ -2254,21 +2819,37 @@ class MyArchitecture: public Assembler::Architecture { *thunk = false; switch (op) { + case FloatAdd: + 
case FloatSubtract: + case FloatMultiply: + case FloatDivide: + if(!supportsSSE() or aSize > BytesPerWord) { + *thunk = true; + } else { + *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); + *bTypeMask = (1 << RegisterOperand); + *aRegisterMask = FloatRegisterMask; + *bRegisterMask = FloatRegisterMask; + } + break; + case Multiply: if (BytesPerWord == 4 and aSize == 8) { - const uint32_t mask = ~((1 << rax) | (1 << rdx)); + const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx)); *aRegisterMask = (static_cast(mask) << 32) | mask; *bRegisterMask = (static_cast(1) << (rdx + 32)) | mask; + } else { + *aRegisterMask = GeneralRegisterMask; + *bRegisterMask = GeneralRegisterMask; } break; case Divide: if (BytesPerWord == 4 and aSize == 8) { - *bTypeMask = ~0; - *thunk = true; + *thunk = true; } else { *aTypeMask = (1 << RegisterOperand); - *aRegisterMask = ~((1 << rax) | (1 << rdx)); + *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx)); *bRegisterMask = 1 << rax; } break; @@ -2279,25 +2860,32 @@ class MyArchitecture: public Assembler::Architecture { *thunk = true; } else { *aTypeMask = (1 << RegisterOperand); - *aRegisterMask = ~((1 << rax) | (1 << rdx)); - *bRegisterMask = 1 << rax; + *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx)); + *bRegisterMask = 1 << rax; } break; case ShiftLeft: case ShiftRight: case UnsignedShiftRight: { - *aRegisterMask = (~static_cast(0) << 32) + *aRegisterMask = (static_cast(GeneralRegisterMask) << 32) | (static_cast(1) << rcx); - const uint32_t mask = ~(1 << rcx); + const uint32_t mask = GeneralRegisterMask & ~(1 << rcx); *bRegisterMask = (static_cast(mask) << 32) | mask; } break; default: break; } + } - *cTypeMask = *bTypeMask; + virtual void planDestination + (TernaryOperation op UNUSED, + unsigned aSize UNUSED, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask UNUSED, + unsigned bSize UNUSED, const uint8_t* bTypeMask UNUSED, const uint64_t* bRegisterMask, + unsigned 
cSize UNUSED, uint8_t* cTypeMask, uint64_t* cRegisterMask) + { + *cTypeMask = (1 << RegisterOperand); *cRegisterMask = *bRegisterMask; } From f8bbc609e8d4465f5cf9b159a6a18814b9960d5b Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:32:00 -0600 Subject: [PATCH 09/16] corrected debug messages --- src/compile.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/compile.cpp b/src/compile.cpp index 3b8637c872..07d0813751 100644 --- a/src/compile.cpp +++ b/src/compile.cpp @@ -27,11 +27,12 @@ vmCall(); namespace { -const bool DebugCompile = true; +const bool DebugCompile = false; const bool DebugNatives = false; const bool DebugCallTable = false; const bool DebugMethodTree = false; const bool DebugFrameMaps = false; +const bool DebugIntrinsics = false; const bool CheckArrayBounds = true; @@ -3091,7 +3092,9 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, if(params == 1) {//TODO: Get number of method params BinaryOperation op = t->arch->hasBinaryIntrinsic(t, target); if(op != NoBinaryOperation) { - printf("Could use binary intrinsic %i.\n", op); + if(DebugIntrinsics) { + fprintf(stderr, "Using binary intrinsic %i.\n", op); + } int opSize = methodParameterFootprint(t, target) * BytesPerWord; int resSize = resultSize(t, methodReturnCode(t, target)); Compiler::Operand* param; @@ -3111,7 +3114,9 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, } else if(params == 2) { //TODO: Get number of method params TernaryOperation op = t->arch->hasTernaryIntrinsic(t, target); if(op != NoTernaryOperation) { - printf("Could use ternary intrinsic %i.\n", op); + if(DebugIntrinsics) { + fprintf(stderr, "Could use ternary intrinsic %i.\n", op); + } //int aSize, bSize; //int resSize = resultSize(t, methodReturnCode(t, target)); compileDirectInvoke(t, frame, target); //TODO: use intrinsic From 04583ea534a30048978891121cb2df8a442c507d Mon Sep 17 00:00:00 2001 From: Josh warner Date: Thu, 6 Aug 2009 10:34:28 -0600 Subject: 
[PATCH 10/16] floating point test code --- test/AllFloats.java | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 test/AllFloats.java diff --git a/test/AllFloats.java b/test/AllFloats.java new file mode 100644 index 0000000000..8f41c4d20e --- /dev/null +++ b/test/AllFloats.java @@ -0,0 +1,77 @@ +public class AllFloats { + private static float multiplyByFive(float a) {return 5f * a;} + private static double multiplyByFive(double a) {return 5d * a;} + private static float multiply(float a, float b) {return a * b;} + private static double multiply(double a, double b) {return a * b;} + private static double multiply(float a, double b) {return a * b;} + private static float divide(float a, float b) {return a / b;} + private static double divide(double a, double b) {return a / b;} + private static double divide(float a, double b) {return a / b;} + private static float add(float a, float b) {return a + b;} + private static double add(double a, double b) {return a + b;} + private static double add(float a, double b) {return a + b;} + private static float subtract(float a, float b) {return a - b;} + private static double subtract(double a, double b) {return a - b;} + private static double subtract(float a, double b) {return a - b;} + private static float complex(float a, float b) {return (a - b) / (a * b) + (float)Math.sqrt(a);} + private static double complex(double a, double b) {return (a - b) / (a * b) + Math.sqrt(a);} + private static double complex(float a, double b) {return (a - b) / (a * b) + Math.sqrt(a);} + private static int f2i(float a) {return (int)a;} + private static long f2l(float a) {return (long)a;} + private static float i2f(int a) {return (float)a;} + private static double i2d(int a) {return (double)a;} + private static int d2i(double a) {return (int)a;} + private static long d2l(double a) {return (long)a;} + private static float l2f(long a) {return (float)a;} + private static double l2d(long a) {return 
(double)a;} + private static float negate(float a) {return -a;} + private static double negate(double a) {return -a;} + private static int abs(int a) {return Math.abs(a);} + private static float abs(float a) {return Math.abs(a);} + + private static void expect(boolean v) { + if(!v)throw new RuntimeException(); + } + + private static int last(){return 0;} + + public static void main(String[] args) { + expect(multiplyByFive(36f) == 5f * 36f); + expect(multiplyByFive(36d) == 5d * 36d); + expect(multiply(5f, 4f) == 5f*4f); + expect(multiply(5d, 4d) == 5d*4d); + expect(multiply(5f, 4d) == 5f*4d); + expect(divide(5f, 2f) == 5f/2f); + expect(divide(5d, 2d) == 5d/2d); + expect(divide(5f, 2d) == 5f/2d); + expect(add(5f, 4f) == 5f+4f); + expect(add(5d, 4d) == 5f+4d); + expect(add(5f, 4f) == 5f+4d); + expect(subtract(5f, 4f) == 5f-4f); + expect(subtract(5d, 4d) == 5f-4d); + expect(subtract(5f, 4f) == 5f-4d); + expect(complex(4f, 3f) == (4f-3f)/(4f*3f) + 2f); + expect(complex(4d, 3d) == (4d-3d)/(4d*3d) + 2d); + expect(complex(4f, 3d) == (4f-3d)/(4f*3d) + 2f); + + expect(f2i(4f) == 4); + expect(f2l(4f) == 4); + expect(i2f(4) == 4f); + expect(i2d(4) == 4d); + + expect(d2i(4d) == 4); + expect(d2l(4d) == 4); + expect(l2f(4) == 4f); + expect(l2d(4) == 4d); + + expect(negate(4f) == -4f); + expect(negate(4d) == -4d); + + expect(abs(-4) == 4); + expect(abs(12) == 12); + expect(abs(-4f) == 4f); + expect(abs(12f) == 12f); + + int unused = last(); + } +} From 9910e310cb4ce1cf95f9223d42e45266c0833dd7 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Mon, 10 Aug 2009 13:42:37 -0600 Subject: [PATCH 11/16] fixed register reserve logic in pickTarget --- src/compiler.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/compiler.cpp b/src/compiler.cpp index 52d3dbfa99..b1a4251bbb 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -1317,11 +1317,10 @@ pickTarget(Context* c, Read* read, bool intersectRead, registerPenalty = (c->floatRegisterCount > 
registerReserveCount ? 0 : Target::LowRegisterPenalty); } else { - registerPenalty = (c->availableRegisterCount > registerReserveCount - ? 0 : Target::LowRegisterPenalty); + abort(c); } } else { - registerPenalty = (c->availableRegisterCount > registerReserveCount + registerPenalty = (c->generalRegisterCount > registerReserveCount || c->floatRegisterCount > registerReserveCount ? 0 : Target::LowRegisterPenalty); } From 32167168f8c3d6a68d43c76b2e6961c068562e6b Mon Sep 17 00:00:00 2001 From: Josh warner Date: Tue, 11 Aug 2009 13:25:22 -0600 Subject: [PATCH 12/16] fixed incorrect opSize bug for 64-bit platforms --- src/compile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compile.cpp b/src/compile.cpp index 2ca1a7a8fd..9d55bdf83f 100644 --- a/src/compile.cpp +++ b/src/compile.cpp @@ -3630,7 +3630,7 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip, if (DebugIntrinsics) { fprintf(stderr, "Using binary intrinsic %i.\n", op); } - int opSize = methodParameterFootprint(t, target) * BytesPerWord; + int opSize = methodParameterFootprint(t, target) * 4; int resSize = resultSize(t, methodReturnCode(t, target)); Compiler::Operand* param; if (opSize == 4) { From cd59222f53397d6f40fb5977cbb40847d74735e9 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Tue, 11 Aug 2009 13:27:25 -0600 Subject: [PATCH 13/16] fixed propagation of result sizes --- src/compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler.cpp b/src/compiler.cpp index b1a4251bbb..0f1f37cdaa 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -3652,7 +3652,7 @@ class TranslateEvent: public Event { apply(c, type, size, value->source, source(value->high), - size, low, high); + resSize, low, high); for (Read* r = reads; r; r = r->eventNext) { popRead(c, this, r->value); From 711680a183d92dd68d6b588756418fc4dbfd9078 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Tue, 11 Aug 2009 13:29:00 -0600 Subject: [PATCH 14/16] fixed powerpc compile errors
--- src/powerpc.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/powerpc.cpp b/src/powerpc.cpp index 99e0c0be34..07a31636b9 100644 --- a/src/powerpc.cpp +++ b/src/powerpc.cpp @@ -1676,6 +1676,14 @@ class MyArchitecture: public Assembler::Architecture { return 32; } + virtual unsigned generalRegisterCount() { + return 32; + } + + virtual unsigned floatRegisterCount() { + return 0; + } + virtual int stack() { return StackRegister; } @@ -1827,23 +1835,23 @@ class MyArchitecture: public Assembler::Architecture { *stack = *static_cast(*stack); } - virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method) { + virtual BinaryOperation hasBinaryIntrinsic(Thread*, object) { return NoBinaryOperation; } - virtual TernaryOperation hasTernaryIntrinsic(Thread* t UNUSED, object method UNUSED) { + virtual TernaryOperation hasTernaryIntrinsic(Thread*, object) { return NoTernaryOperation; } - virtual bool supportsFloatCompare(unsigned size) { + virtual bool supportsFloatCompare(unsigned) { return false; } - virtual bool alwaysCondensed(BinaryOperation op) { + virtual bool alwaysCondensed(BinaryOperation) { return false; } - virtual bool alwaysCondensed(TernaryOperation op) { + virtual bool alwaysCondensed(TernaryOperation) { return false; } @@ -1889,7 +1897,7 @@ class MyArchitecture: public Assembler::Architecture { virtual void planDestination (BinaryOperation op, - unsigned, const uint8_t* aTypeMask, const uint64_t* aRegisterMask, + unsigned, const uint8_t*, const uint64_t*, unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask) { *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand); @@ -1962,9 +1970,9 @@ class MyArchitecture: public Assembler::Architecture { } virtual void planDestination - (TernaryOperation op, + (TernaryOperation, + unsigned, const uint8_t*, const uint64_t*, unsigned, const uint8_t*, const uint64_t*, - unsigned, const uint8_t* bTypeMask, const uint64_t* bRegisterMask, unsigned, uint8_t* 
cTypeMask, uint64_t* cRegisterMask) { *cTypeMask = (1 << RegisterOperand); From f29199a2851c59da8ed9af5360463250646c1c0e Mon Sep 17 00:00:00 2001 From: Josh warner Date: Tue, 11 Aug 2009 13:30:31 -0600 Subject: [PATCH 15/16] fixed several operand type errors that appeared on 64-bit platforms --- src/x86.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/x86.cpp b/src/x86.cpp index 4e3a681c76..d0db69ab3b 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -961,7 +961,7 @@ sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a, modrm(c, 0xc0, b, a); } else { opcode(c, 0x66); - maybeRex(c, aSize, a, b); + maybeRex(c, aSize, b, a); opcode(c, 0x0f, 0x6e); modrm(c, 0xc0, a, b); } @@ -2699,12 +2699,12 @@ class MyArchitecture: public Assembler::Architecture { case Float2Float: case Float2Int: case Int2Float: - return false; - case Negate: - case Abs: case FloatAbs: case FloatNegate: case FloatSqrt: + return false; + case Negate: + case Abs: default: return true; } @@ -2902,11 +2902,14 @@ class MyArchitecture: public Assembler::Architecture { break; case Negate: + *bTypeMask = (1 << RegisterOperand); + *bRegisterMask = *aRegisterMask; + break; case FloatNegate: case FloatSqrt: case Float2Float: *bTypeMask = (1 << RegisterOperand); - *bRegisterMask = *aRegisterMask; + *bRegisterMask = FloatRegisterMask; break; case Int2Float: *bTypeMask = (1 << RegisterOperand); From 78ea4d20e3cc44e812ab524d73d44d40fef64033 Mon Sep 17 00:00:00 2001 From: Josh warner Date: Tue, 11 Aug 2009 13:46:51 -0600 Subject: [PATCH 16/16] added loneMatch to improve register allocation --- src/compiler.cpp | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/compiler.cpp b/src/compiler.cpp index 0f1f37cdaa..e715f54cce 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -108,6 +108,8 @@ class Site { virtual unsigned copyCost(Context*, Site*) = 0; virtual bool match(Context*, const SiteMask&) = 0; + + virtual 
bool loneMatch(Context*, const SiteMask&) = 0; virtual void acquire(Context*, Value*) { } @@ -1429,6 +1431,10 @@ class ConstantSite: public Site { return mask.typeMask & (1 << ConstantOperand); } + virtual bool loneMatch(Context*, const SiteMask&) { + return true; + } + virtual OperandType type(Context*) { return ConstantOperand; } @@ -1501,6 +1507,10 @@ class AddressSite: public Site { return mask.typeMask & (1 << AddressOperand); } + virtual bool loneMatch(Context*, const SiteMask&) { + return false; + } + virtual OperandType type(Context*) { return AddressOperand; } @@ -1575,6 +1585,16 @@ class RegisterSite: public Site { } } + virtual bool loneMatch(Context* c UNUSED, const SiteMask& mask) { + assert(c, number != NoRegister); + + if ((mask.typeMask & (1 << RegisterOperand))) { + return ((static_cast(1) << number) == mask.registerMask); + } else { + return false; + } + } + virtual void acquire(Context* c, Value* v) { Target target; if (number != NoRegister) { @@ -1729,6 +1749,23 @@ class MemorySite: public Site { } } + virtual bool loneMatch(Context* c, const SiteMask& mask) { + assert(c, acquired); + + if (mask.typeMask & (1 << MemoryOperand)) { + if (base == c->arch->stack()) { + assert(c, index == NoRegister); + + if (mask.frameIndex == AnyFrameIndex) { + return false; + } else { + return true; + } + } + } + return false; + } + virtual void acquire(Context* c, Value* v) { increment(c, c->registerResources + base); if (index != NoRegister) { @@ -3145,7 +3182,8 @@ getTarget(Context* c, Value* value, Value* result, const SiteMask& resultMask) Site* s; Value* v; Read* r = liveNext(c, value); - if (value->source->match(c, static_cast(resultMask))) { + if (value->source->match(c, static_cast(resultMask)) and (r == 0 or + value->source->loneMatch(c, static_cast(resultMask)))) { s = value->source; v = value; if (r and not hasMoreThanOneSite(v)) {