added floating point support, split plan function

2025-01-22 12:28:11 +00:00 · 2009-08-06 10:26:22 -06:00 · 2009-08-06 10:26:22 -06:00 · a2e639a2d2
commit a2e639a2d2
parent 61bc729974
1 changed files with 639 additions and 51 deletions
--- a/src/x86.cpp
+++ b/src/x86.cpp
@ -13,10 +13,15 @@
 #include "assembler.h"
 #include "vector.h"
 #include "machine.h"
 #define CAST1(x) reinterpret_cast<UnaryOperationType>(x)
 #define CAST2(x) reinterpret_cast<BinaryOperationType>(x)
 const bool DebugSSE = false;
 const bool EnableSSE = true;
 const bool EnableSSE2 = true;
 using namespace vm;
 namespace {
@ -40,6 +45,28 @@ enum {
  r15 = 15,
 };
 enum {
  xmm0 = r15 + 1,
  xmm1,
  xmm2,
  xmm3,
  xmm4,
  xmm5,
  xmm6,
  xmm7,
  xmm8,
  xmm9,
  xmm10,
  xmm11,
  xmm12,
  xmm13,
  xmm14,
  xmm15,
 };
 const unsigned GeneralRegisterMask = BytesPerWord == 4 ? 0x000000ff : 0x0000ffff;
 const unsigned FloatRegisterMask = BytesPerWord == 4 ? 0x00ff0000 : 0xffff0000;
 const unsigned FrameHeaderSize = 2;
 inline bool
@ -399,6 +426,35 @@ padding(AlignmentPadding* p, unsigned start, unsigned offset,
  return padding;
 }
 extern "C"
 bool detectFeature(unsigned ecx, unsigned edx);
 inline bool
 supportsSSE()
 {
 	static int supported = -1;
 	if(supported == -1) {
 	  supported = EnableSSE && detectFeature(0, 0x2000000);
 	  if(DebugSSE) {
 	    fprintf(stderr, "sse %sdetected.\n", supported ? "" : "not ");
 	  }
 	}
 	return supported;	
 }
 inline bool
 supportsSSE2()
 {
 	static int supported = -1;
 	if(supported == -1) {
 	  supported = EnableSSE2 && detectFeature(0, 0x4000000);
 	  if(DebugSSE) {
 	    fprintf(stderr, "sse2 %sdetected.\n", supported ? "" : "not ");
 	  }
 	}
 	return supported;
 }
 #define REX_W 0x48
 #define REX_R 0x44
 #define REX_X 0x42
@ -503,6 +559,12 @@ inline void opcode(Context* c, uint8_t op1, uint8_t op2) {
  c->code.append(op2);
 }
 inline void opcode(Context* c, uint8_t op1, uint8_t op2, uint8_t op3) {
  c->code.append(op1);
  c->code.append(op2);
  c->code.append(op3);
 }
 void
 return_(Context* c)
 {
@ -667,6 +729,14 @@ jumpIfLessOrEqualC(Context* c, unsigned size UNUSED, Assembler::Constant* a)
  conditional(c, 0x8e, a);
 }
 void
 jumpIfUnorderedC(Context* c, unsigned size UNUSED, Assembler::Constant* a)
 {
  assert(c, size == BytesPerWord);
  conditional(c, 0x8a, a);
 }
 void
 longJumpC(Context* c, unsigned size, Assembler::Constant* a)
 {
@ -806,12 +876,60 @@ moveCR2(Context* c, UNUSED unsigned aSize, Assembler::Constant* a,
  }
 }
 inline bool floatReg(Assembler::Register* a) {
 	return a->low >= xmm0;
 }
 void
 sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a,
       unsigned bSize UNUSED, Assembler::Register* b)
 {
  if(floatReg(a) && floatReg(b)) {
  	if(aSize == 4) {
  	  opcode(c, 0xf3);
  	  maybeRex(c, 4, a, b);
  	  opcode(c, 0x0f, 0x10);
  	  modrm(c, 0xc0, b, a);
  	} else {
  	  opcode(c, 0xf2);
  	  maybeRex(c, 4, a, b);
  	  opcode(c, 0x0f, 0x10);
  	  modrm(c, 0xc0, b, a);
  	} 
  } else if(floatReg(a)) {
  	opcode(c, 0x66);
  	maybeRex(c, aSize, a, b);
  	opcode(c, 0x0f, 0x7e);
  	modrm(c, 0xc0, b, a);  	
  } else {
  	opcode(c, 0x66);
  	maybeRex(c, aSize, a, b);
  	opcode(c, 0x0f, 0x6e);
  	modrm(c, 0xc0, a, b);  	
  }
 }
 void
 sseMoveCR(Context* c, unsigned aSize, Assembler::Constant* a,
       unsigned bSize, Assembler::Register* b)
 {
  assert(c, aSize <= BytesPerWord);
  Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
  moveCR2(c, aSize, a, aSize, &tmp, 0);
  sseMoveRR(c, aSize, &tmp, bSize, b);
  c->client->releaseTemporary(tmp.low);
 }
 void
 moveCR(Context* c, unsigned aSize, Assembler::Constant* a,
       unsigned bSize, Assembler::Register* b)
 {
  if(floatReg(b)) {
  	sseMoveCR(c, aSize, a, bSize, b);
  } else {
    moveCR2(c, aSize, a, bSize, b, 0);
  }
 }
 void
 swapRR(Context* c, unsigned aSize UNUSED, Assembler::Register* a,
@ -829,6 +947,10 @@ void
 moveRR(Context* c, unsigned aSize, Assembler::Register* a,
       UNUSED unsigned bSize, Assembler::Register* b)
 {
  if(floatReg(a) or floatReg(b)) {
  	sseMoveRR(c, aSize, a, bSize, b);
  	return;
  }
  if (BytesPerWord == 4 and aSize == 8 and bSize == 8) {
    Assembler::Register ah(a->high);
@ -902,10 +1024,25 @@ moveRR(Context* c, unsigned aSize, Assembler::Register* a,
  }
 }
 void
 sseMoveMR(Context* c, unsigned aSize, Assembler::Memory* a,
       unsigned bSize UNUSED, Assembler::Register* b)
 {
  opcode(c, 0x66);
  maybeRex(c, aSize, b, a);
  opcode(c, 0x0f, 0x6e);
  modrmSibImm(c, b, a);
 }
 void
 moveMR(Context* c, unsigned aSize, Assembler::Memory* a,
       unsigned bSize, Assembler::Register* b)
 {
  if(floatReg(b)) {
  	sseMoveMR(c, aSize, a, bSize, b);
  	return;
  }
  switch (aSize) {
  case 1:
    maybeRex(c, bSize, b, a);
@ -956,12 +1093,27 @@ moveMR(Context* c, unsigned aSize, Assembler::Memory* a,
  }
 }
 void
 sseMoveRM(Context* c, unsigned aSize, Assembler::Register* a,
       UNUSED unsigned bSize, Assembler::Memory* b)
 {
  opcode(c, 0x66);
  maybeRex(c, aSize, a, b);
  opcode(c, 0x0f, 0x7e);
  modrmSibImm(c, a, b);
 }
 void
 moveRM(Context* c, unsigned aSize, Assembler::Register* a,
       unsigned bSize UNUSED, Assembler::Memory* b)
 {
  assert(c, aSize == bSize);
  if(floatReg(a)) {
  	sseMoveRM(c, aSize, a, bSize, b);
  	return;
  }
  switch (aSize) {
  case 1:
    maybeRex(c, bSize, a, b);
@ -1066,7 +1218,7 @@ moveCM(Context* c, unsigned aSize UNUSED, Assembler::Constant* a,
        modrmSibImm(c, 0, b->scale, b->index, b->base, b->offset);
        c->code.append4(a->value->value());
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, 8, a, 8, &tmp);
        moveRM(c, 8, &tmp, 8, b);
        c->client->releaseTemporary(tmp.low);
@ -1188,7 +1340,7 @@ addCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        addRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1246,7 +1398,7 @@ subtractCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        subtractRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1335,7 +1487,7 @@ andCR(Context* c, unsigned aSize, Assembler::Constant* a,
        c->code.append4(v);
      }
    } else {
-      Assembler::Register tmp(c->client->acquireTemporary());
+      Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
      moveCR(c, aSize, a, aSize, &tmp);
      andRR(c, aSize, &tmp, bSize, b);
      c->client->releaseTemporary(tmp.low);
@ -1392,7 +1544,7 @@ orCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        orRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1448,7 +1600,7 @@ xorCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        xorRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1523,7 +1675,7 @@ compareCR(Context* c, unsigned aSize, Assembler::Constant* a,
      c->code.append4(v);
    }
  } else {
-    Assembler::Register tmp(c->client->acquireTemporary());
+    Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
    moveCR(c, aSize, a, aSize, &tmp);
    compareRR(c, aSize, &tmp, bSize, b);
    c->client->releaseTemporary(tmp.low);
@ -1537,7 +1689,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a,
  assert(c, aSize == bSize);
  if (BytesPerWord == 4 and aSize == 8) {
-    const uint32_t mask = ~((1 << rax) | (1 << rdx));
+    const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
    Assembler::Register tmp(c->client->acquireTemporary(mask),
                            c->client->acquireTemporary(mask));
@ -1560,7 +1712,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        multiplyRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);      
@ -1605,7 +1757,7 @@ compareCM(Context* c, unsigned aSize, Assembler::Constant* a,
      abort(c);
    }
  } else {
-    Assembler::Register tmp(c->client->acquireTemporary());
+    Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
    moveCR(c, aSize, a, bSize, &tmp);
    compareRM(c, bSize, &tmp, bSize, b);
    c->client->releaseTemporary(tmp.low);
@ -1928,6 +2080,219 @@ unsignedShiftRightCR(Context* c, unsigned aSize UNUSED, Assembler::Constant* a,
  doShift(c, unsignedShiftRightRR, 0xe8, aSize, a, bSize, b);
 }
 inline void floatRegOp(Context* c, unsigned aSize, Assembler::Register* a,
                     unsigned bSize UNUSED, Assembler::Register* b, uint8_t op, uint8_t mod = 0xc0)
 {
  if(aSize == 4) {
    opcode(c, 0xf3);
  } else {
    opcode(c, 0xf2);
  }
  maybeRex(c, bSize, a, b);
  opcode(c, 0x0f, op);
  modrm(c, mod, a, b);
 }
 inline void floatMemOp(Context* c, unsigned aSize, Assembler::Memory* a,
                     unsigned bSize UNUSED, Assembler::Register* b, uint8_t op)
 {
  if(aSize == 4) {
    opcode(c, 0xf3);
  } else {
    opcode(c, 0xf2);
  }
  maybeRex(c, bSize, b, a);
  opcode(c, 0x0f, op);
  modrmSibImm(c, b, a);
 }
 void
 floatSqrtRR(Context* c, unsigned aSize, Assembler::Register* a,
                     unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatRegOp(c, aSize, a, 4, b, 0x51);
 }
 void
 floatSqrtMR(Context* c, unsigned aSize, Assembler::Memory* a,
                     unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, 4, b, 0x51);
 }
 void
 floatAddRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatRegOp(c, aSize, a, 4, b, 0x58);
 }
 void
 floatAddMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, 4, b, 0x58);
 }
 void
 floatSubtractRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatRegOp(c, aSize, a, 4, b, 0x5c);
 }
 void
 floatSubtractMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, 4, b, 0x5c);
 }
 void
 floatMultiplyRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatRegOp(c, aSize, a, 4, b, 0x59);
 }
 void
 floatMultiplyMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, 4, b, 0x59);
 }
 void
 floatDivideRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatRegOp(c, aSize, a, 4, b, 0x5e);
 }
 void
 floatDivideMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, 4, b, 0x5e);
 }
 void
 float2FloatRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  assert(c, supportsSSE2());
  floatRegOp(c, aSize, a, 4, b, 0x5a);
 }
 void
 float2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  assert(c, supportsSSE2());
  floatMemOp(c, aSize, a, 4, b, 0x5a);
 }
 void
 float2IntRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize, Assembler::Register* b)
 {
  assert(c, !floatReg(b));
  floatRegOp(c, aSize, a, bSize, b, 0x2d);
 }
 void
 float2IntMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize, Assembler::Register* b)
 {
  floatMemOp(c, aSize, a, bSize, b, 0x2d);
 }
 void
 int2FloatRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize, Assembler::Register* b)
 {
  floatRegOp(c, bSize, a, aSize, b, 0x2a);
 }
 void
 int2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a,
      unsigned bSize, Assembler::Register* b)
 {
  floatMemOp(c, bSize, a, aSize, b, 0x2a);
 }
 void
 floatCompareRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  if (aSize == 8) {
    opcode(c, 0x66);
  }
  maybeRex(c, 4, a, b);
  opcode(c, 0x0f, 0x2e);
  modrm(c, 0xc0, a, b);
 }
 void
 floatNegateRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  assert(c, floatReg(a) and floatReg(b));
  assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. 
  ResolvedPromise pcon(0x80000000);
  Assembler::Constant con(&pcon);
  if(a->low == b->low) {
    Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask));
    moveCR(c, 4, &con, 4, &tmp);
    maybeRex(c, 4, a, &tmp);
    opcode(c, 0x0f, 0x57);
    modrm(c, 0xc0, &tmp, a);
    c->client->releaseTemporary(tmp.low);
  } else {
    moveCR(c, 4, &con, 4, b);
    if(aSize == 8) opcode(c, 0x66);
    maybeRex(c, 4, a, b);
    opcode(c, 0x0f, 0x57);
    modrm(c, 0xc0, a, b);
  }
 }
 void
 floatAbsRR(Context* c, unsigned aSize UNUSED, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b)
 {
  assert(c, floatReg(a) and floatReg(b));
  assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. 
  ResolvedPromise pcon(0x7fffffff);
  Assembler::Constant con(&pcon);
  if(a->low == b->low) {
    Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask));
    moveCR(c, 4, &con, 4, &tmp);
    maybeRex(c, 4, a, &tmp);
    opcode(c, 0x0f, 0x54);
    modrm(c, 0xc0, &tmp, a);
    c->client->releaseTemporary(tmp.low);
  } else {
    moveCR(c, 4, &con, 4, b);
    maybeRex(c, 4, a, b);
    opcode(c, 0x0f, 0x54);
    modrm(c, 0xc0, a, b);
  }
 }
 void
 absRR(Context* c, unsigned aSize, Assembler::Register* a,
      unsigned bSize UNUSED, Assembler::Register* b UNUSED)
 {
  assert(c, aSize == bSize and a->low == rax and b->low == rax);
  Assembler::Register d(c->client->acquireTemporary(static_cast<uint64_t>(1) << rdx));
  maybeRex(c, aSize, a, b);
  opcode(c, 0x99);
  xorRR(c, aSize, &d, aSize, a);
  subtractRR(c, aSize, &d, aSize, a);
  c->client->releaseTemporary(rdx);
 }
 void
 populateTables(ArchitectureContext* c)
 {
@ -1963,11 +2328,14 @@ populateTables(ArchitectureContext* c)
  uo[index(JumpIfGreaterOrEqual, C)] = CAST1(jumpIfGreaterOrEqualC);
  uo[index(JumpIfLess, C)] = CAST1(jumpIfLessC);
  uo[index(JumpIfLessOrEqual, C)] = CAST1(jumpIfLessOrEqualC);
  uo[index(JumpIfUnordered, C)] = CAST1(jumpIfUnorderedC);
  uo[index(LongJump, C)] = CAST1(longJumpC);
  bo[index(Negate, R, R)] = CAST2(negateRR);
  bo[index(FloatNegate, R, R)] = CAST2(floatNegateRR);
  bo[index(Move, R, R)] = CAST2(moveRR);
  bo[index(Move, C, R)] = CAST2(moveCR);
  bo[index(Move, M, R)] = CAST2(moveMR);
@ -1975,6 +2343,9 @@ populateTables(ArchitectureContext* c)
  bo[index(Move, C, M)] = CAST2(moveCM);
  bo[index(Move, A, R)] = CAST2(moveAR);
  bo[index(FloatSqrt, R, R)] = CAST2(floatSqrtRR);
  bo[index(FloatSqrt, M, R)] = CAST2(floatSqrtMR);
  bo[index(MoveZ, R, R)] = CAST2(moveZRR);
  bo[index(MoveZ, M, R)] = CAST2(moveZMR);
@ -1983,12 +2354,20 @@ populateTables(ArchitectureContext* c)
  bo[index(Compare, C, M)] = CAST2(compareCM);
  bo[index(Compare, R, M)] = CAST2(compareRM);
  bo[index(FloatCompare, R, R)] = CAST2(floatCompareRR);
  bo[index(Add, R, R)] = CAST2(addRR);
  bo[index(Add, C, R)] = CAST2(addCR);
  bo[index(Subtract, C, R)] = CAST2(subtractCR);
  bo[index(Subtract, R, R)] = CAST2(subtractRR);
  bo[index(FloatAdd, R, R)] = CAST2(floatAddRR);
  bo[index(FloatAdd, M, R)] = CAST2(floatAddMR);
  bo[index(FloatSubtract, R, R)] = CAST2(floatSubtractRR);
  bo[index(FloatSubtract, M, R)] = CAST2(floatSubtractMR);
  bo[index(And, R, R)] = CAST2(andRR);
  bo[index(And, C, R)] = CAST2(andCR);
@ -2003,6 +2382,12 @@ populateTables(ArchitectureContext* c)
  bo[index(Divide, R, R)] = CAST2(divideRR);
  bo[index(FloatMultiply, R, R)] = CAST2(floatMultiplyRR);
  bo[index(FloatMultiply, M, R)] = CAST2(floatMultiplyMR);
  bo[index(FloatDivide, R, R)] = CAST2(floatDivideRR);
  bo[index(FloatDivide, M, R)] = CAST2(floatDivideMR);
  bo[index(Remainder, R, R)] = CAST2(remainderRR);
  bo[index(LongCompare, C, R)] = CAST2(longCompareCR);
@ -2016,8 +2401,19 @@ populateTables(ArchitectureContext* c)
  bo[index(UnsignedShiftRight, R, R)] = CAST2(unsignedShiftRightRR);
  bo[index(UnsignedShiftRight, C, R)] = CAST2(unsignedShiftRightCR);
 }
  bo[index(Float2Float, R, R)] = CAST2(float2FloatRR);
  bo[index(Float2Float, M, R)] = CAST2(float2FloatMR);
  bo[index(Float2Int, R, R)] = CAST2(float2IntRR);
  bo[index(Float2Int, M, R)] = CAST2(float2IntMR);
  bo[index(Int2Float, R, R)] = CAST2(int2FloatRR);
  bo[index(Int2Float, M, R)] = CAST2(int2FloatMR);
  bo[index(Abs, R, R)] = CAST2(absRR);
  bo[index(FloatAbs, R, R)] = CAST2(floatAbsRR);
 }
 class MyArchitecture: public Assembler::Architecture {
 public:
  MyArchitecture(System* system): c(system), referenceCount(0) {
@ -2025,7 +2421,31 @@ class MyArchitecture: public Assembler::Architecture {
  }
  virtual unsigned registerCount() {
-    return (BytesPerWord == 4 ? 8 : 16);
+    if (supportsSSE()) {
      return BytesPerWord == 4 ? 24 : 32;
    } else {
      return BytesPerWord == 4 ? 8 : 16;
    }
  }
  virtual unsigned generalRegisterCount() {
  	return BytesPerWord == 4 ? 8 : 16;
  }
  virtual unsigned floatRegisterCount() {
    if (supportsSSE()) {
      return BytesPerWord == 4 ? 8 : 16;
    } else {
      return 0;
    }
  }
  virtual uint64_t generalRegisters() {
  	return GeneralRegisterMask;
  }
  virtual uint64_t floatRegisters() {
  	return supportsSSE() ? FloatRegisterMask : 0;
  }
  virtual int stack() {
@ -2044,10 +2464,6 @@ class MyArchitecture: public Assembler::Architecture {
    return (BytesPerWord == 4 ? rdx : NoRegister);
  }
  virtual bool condensedAddressing() {
    return true;
  }
  virtual bool bigEndian() {
    return false;
  }
@ -2171,6 +2587,10 @@ class MyArchitecture: public Assembler::Architecture {
    return 0;
  }
  virtual bool supportsFloatCompare(unsigned size) {
    return supportsSSE() and size <= BytesPerWord;
  }
  virtual void nextFrame(void** stack, void** base) {
    assert(&c, *static_cast<void**>(*base) != *base);
@ -2189,61 +2609,206 @@ class MyArchitecture: public Assembler::Architecture {
    *thunk = false;
  }
-  virtual void plan
+  bool checkMethodClass(Thread* t, object method, const char* value)
  {
    return strcmp
      (reinterpret_cast<const char*>
       (&byteArrayBody(t, className(t, methodClass(t, method)), 0)),
       value) == 0;
  }
  bool checkMethodName(Thread* t, object method, const char* value)
  {
    return strcmp
      (reinterpret_cast<const char*>
       (&byteArrayBody(t, methodName(t, method), 0)),
       value) == 0;
  }
  bool checkMethodSpec(Thread* t, object method, const char* value)
  {
    return strcmp
      (reinterpret_cast<const char*>
       (&byteArrayBody(t, methodSpec(t, method), 0)),
       value) == 0;
  }
  virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method)
  {
    if(checkMethodClass(t, method, "java/lang/Math")) {
      if(supportsSSE() and checkMethodName(t, method, "sqrt") and checkMethodSpec(t, method, "(D)D") and BytesPerWord == 8) {
        return FloatSqrt;
      } else if(checkMethodName(t, method, "abs")) {
      	if(checkMethodSpec(t, method, "(I)I") or (checkMethodSpec(t, method, "(J)J") and BytesPerWord == 8)) {
          return Abs;
      	} else if(supportsSSE() and supportsSSE2() and checkMethodSpec(t, method, "(F)F")) {
      	  return FloatAbs;
      	}
      }
    }
    return NoBinaryOperation;
  }
  virtual TernaryOperation hasTernaryIntrinsic(Thread* t UNUSED, object method UNUSED) {
  	return NoTernaryOperation;
  }
  virtual void planSource
  (BinaryOperation op,
   unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask,
-   unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask,
+   unsigned bSize, bool* thunk)
   bool* thunk)
  {
    *aTypeMask = ~0;
-    *aRegisterMask = ~static_cast<uint64_t>(0);
+    *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
    *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
    *bRegisterMask = ~static_cast<uint64_t>(0);
    *thunk = false;
    switch (op) {
    case Compare:
      *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
-      *bTypeMask = (1 << RegisterOperand);
+      *aRegisterMask = GeneralRegisterMask;
      break;
    case FloatCompare:
      assert(&c, supportsSSE() && aSize <= BytesPerWord);
      *aTypeMask = (1 << RegisterOperand);
      *aRegisterMask = FloatRegisterMask;
      break;
    case Negate:
      *aTypeMask = (1 << RegisterOperand);
      *bTypeMask = (1 << RegisterOperand);
      *aRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
        | (static_cast<uint64_t>(1) << rax);
      *bRegisterMask = *aRegisterMask;
      break;
-
+    case Abs:
      *aTypeMask = (1 << RegisterOperand);
      *aRegisterMask = (static_cast<uint64_t>(1) << rax);
      break;
    case FloatAbs:
   	  *aTypeMask = (1 << RegisterOperand);
      *aRegisterMask = FloatRegisterMask;
      break;    
    case FloatNegate:
      if(!supportsSSE() or aSize == 8 or bSize == 8) { //floatNegateRR does not support doubles
        *thunk = true;
      } else {
   	    *aTypeMask = (1 << RegisterOperand);
        *aRegisterMask = FloatRegisterMask;
      }
      break;
    case FloatSqrt:
      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
   	  *aRegisterMask = FloatRegisterMask;
   	  break;
   	case Float2Float:
   	  if(!supportsSSE() or !supportsSSE2() or BytesPerWord == 4) {
   	    *thunk = true;
   	  } else {
        *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
   	    *aRegisterMask = FloatRegisterMask;
   	  }
   	  break;
   	case Float2Int:
   	  if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) {
   	    *thunk = true;
   	  } else {
   	    *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand);
   	    *aRegisterMask = FloatRegisterMask;
   	  }
   	  break;
   	case Int2Float:
   	  if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) {
   	    *thunk = true;
   	  } else {
   	    *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand);
   	    *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
   	  }
   	  break;
    case Move:
      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
      *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
      if (BytesPerWord == 4) {
        if (aSize == 4 and bSize == 8) {
-          const uint32_t mask = ~((1 << rax) | (1 << rdx));
+          *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
          const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;    
          *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
            | (static_cast<uint64_t>(1) << rax);        
        } else if (aSize == 1 or bSize == 1) {
          *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
          const uint32_t mask
            = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx);
          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;     
          *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;        
        }
      }
      break;
    default:
      break;
    }
  }
-  virtual void plan
+  virtual void planDestination
  (BinaryOperation op,
   unsigned aSize, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask,
   unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask)
  {
  	*bTypeMask = ~0;
  	*bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
    switch (op) {
    case Compare:
      *bTypeMask = (1 << RegisterOperand);
      *bRegisterMask = GeneralRegisterMask;
      break;
    case FloatCompare:
      *bTypeMask = (1 << RegisterOperand);
      *bRegisterMask = FloatRegisterMask;
      break;
    case Abs:
      *bTypeMask = (1 << RegisterOperand);
      *bRegisterMask = (static_cast<uint64_t>(1) << rax);
       break;
    case FloatAbs:
      *bTypeMask = (1 << RegisterOperand);
      *bRegisterMask = *aRegisterMask;
      break;
    case Negate:
    case FloatNegate:
    case FloatSqrt:
   	case Float2Float:
   	  *bTypeMask = (1 << RegisterOperand);
   	  *bRegisterMask = *aRegisterMask;
   	  break;
   	case Int2Float:
   	  *bTypeMask = (1 << RegisterOperand);
   	  *bRegisterMask = FloatRegisterMask;
   	  break;
   	case Float2Int:
   	  *bTypeMask = (1 << RegisterOperand);
   	  *bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
   	  break;
    case Move:
      *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
      *bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
      if (BytesPerWord == 4) {
        if (aSize == 4 and bSize == 8) {
          *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
            | (static_cast<uint64_t>(1) << rax);
        } else if (aSize == 1 or bSize == 1) {
          const uint32_t mask
            = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx);
          *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
        }
      }
      break;
    default:
      break;
    }
  }
  virtual void planSource
  (TernaryOperation op,
   unsigned aSize, uint8_t *aTypeMask, uint64_t *aRegisterMask,
   unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask,
-   unsigned, uint8_t* cTypeMask, uint64_t* cRegisterMask,
+   unsigned, bool* thunk)
   bool* thunk)
  {
    *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
    *aRegisterMask = ~static_cast<uint64_t>(0);
@ -2254,21 +2819,37 @@ class MyArchitecture: public Assembler::Architecture {
    *thunk = false;
    switch (op) {
   	case FloatAdd:
   	case FloatSubtract:
   	case FloatMultiply:
   	case FloatDivide:
   	  if(!supportsSSE() or aSize > BytesPerWord) {
   	  	*thunk = true;
   	  } else {
   	  	*aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
   	  	*bTypeMask = (1 << RegisterOperand);
   	    *aRegisterMask = FloatRegisterMask;
   	    *bRegisterMask = FloatRegisterMask;
   	  }
   	  break;
    case Multiply:
      if (BytesPerWord == 4 and aSize == 8) { 
-        const uint32_t mask = ~((1 << rax) | (1 << rdx));
+        const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
        *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
        *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32)) | mask;
      } else {
        *aRegisterMask = GeneralRegisterMask;
        *bRegisterMask = GeneralRegisterMask;
      }
      break;
    case Divide:
      if (BytesPerWord == 4 and aSize == 8) {
        *bTypeMask = ~0;
        *thunk = true;        			
      } else {
        *aTypeMask = (1 << RegisterOperand);
-        *aRegisterMask = ~((1 << rax) | (1 << rdx));
+        *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
        *bRegisterMask = 1 << rax;      
      }
      break;
@ -2279,7 +2860,7 @@ class MyArchitecture: public Assembler::Architecture {
        *thunk = true;
      } else {
        *aTypeMask = (1 << RegisterOperand);
-        *aRegisterMask = ~((1 << rax) | (1 << rdx));
+        *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
        *bRegisterMask = 1 << rax;
      }
      break;
@ -2287,17 +2868,24 @@ class MyArchitecture: public Assembler::Architecture {
    case ShiftLeft:
    case ShiftRight:
    case UnsignedShiftRight: {
-      *aRegisterMask = (~static_cast<uint64_t>(0) << 32)
+      *aRegisterMask = (static_cast<uint64_t>(GeneralRegisterMask) << 32)
        | (static_cast<uint64_t>(1) << rcx);
-      const uint32_t mask = ~(1 << rcx);
+      const uint32_t mask = GeneralRegisterMask & ~(1 << rcx);
      *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
    } break;
    default:
      break;
    }
  }
-    *cTypeMask = *bTypeMask;
+  virtual void planDestination
  (TernaryOperation op UNUSED,
   unsigned aSize UNUSED, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask UNUSED,
   unsigned bSize UNUSED, const uint8_t* bTypeMask UNUSED, const uint64_t* bRegisterMask,
   unsigned cSize UNUSED, uint8_t* cTypeMask, uint64_t* cRegisterMask)
  {
    *cTypeMask = (1 << RegisterOperand);
    *cRegisterMask = *bRegisterMask;
  }