added floating point support, split plan function

2025-04-07 19:34:41 +00:00 · 2009-08-06 10:26:22 -06:00 · 2009-08-06 10:26:22 -06:00 · a2e639a2d2
commit a2e639a2d2
parent 61bc729974
1 changed files with 639 additions and 51 deletions
--- a/src/x86.cpp
+++ b/src/x86.cpp
@ -13,10 +13,15 @@

 #include "assembler.h"
 #include "vector.h"
+#include "machine.h"

 #define CAST1(x) reinterpret_cast<UnaryOperationType>(x)
 #define CAST2(x) reinterpret_cast<BinaryOperationType>(x)

+const bool DebugSSE = false;
+const bool EnableSSE = true;
+const bool EnableSSE2 = true;
+
 using namespace vm;

 namespace {
@ -40,6 +45,28 @@ enum {
  r15 = 15,
 };

+enum {
+  xmm0 = r15 + 1,
+  xmm1,
+  xmm2,
+  xmm3,
+  xmm4,
+  xmm5,
+  xmm6,
+  xmm7,
+  xmm8,
+  xmm9,
+  xmm10,
+  xmm11,
+  xmm12,
+  xmm13,
+  xmm14,
+  xmm15,
+};
+
+const unsigned GeneralRegisterMask = BytesPerWord == 4 ? 0x000000ff : 0x0000ffff;
+const unsigned FloatRegisterMask = BytesPerWord == 4 ? 0x00ff0000 : 0xffff0000;
+
 const unsigned FrameHeaderSize = 2;

 inline bool
@ -399,6 +426,35 @@ padding(AlignmentPadding* p, unsigned start, unsigned offset,
  return padding;
 }

+extern "C"
+bool detectFeature(unsigned ecx, unsigned edx);
+
+inline bool
+supportsSSE()
+{
+	static int supported = -1;
+	if(supported == -1) {
+	  supported = EnableSSE && detectFeature(0, 0x2000000);
+	  if(DebugSSE) {
+	    fprintf(stderr, "sse %sdetected.\n", supported ? "" : "not ");
+	  }
+	}
+	return supported;	
+}
+
+inline bool
+supportsSSE2()
+{
+	static int supported = -1;
+	if(supported == -1) {
+	  supported = EnableSSE2 && detectFeature(0, 0x4000000);
+	  if(DebugSSE) {
+	    fprintf(stderr, "sse2 %sdetected.\n", supported ? "" : "not ");
+	  }
+	}
+	return supported;
+}
+
 #define REX_W 0x48
 #define REX_R 0x44
 #define REX_X 0x42
@ -503,6 +559,12 @@ inline void opcode(Context* c, uint8_t op1, uint8_t op2) {
  c->code.append(op2);
 }

+inline void opcode(Context* c, uint8_t op1, uint8_t op2, uint8_t op3) {
+  c->code.append(op1);
+  c->code.append(op2);
+  c->code.append(op3);
+}
+
 void
 return_(Context* c)
 {
@ -667,6 +729,14 @@ jumpIfLessOrEqualC(Context* c, unsigned size UNUSED, Assembler::Constant* a)
  conditional(c, 0x8e, a);
 }

+void
+jumpIfUnorderedC(Context* c, unsigned size UNUSED, Assembler::Constant* a)
+{
+  assert(c, size == BytesPerWord);
+
+  conditional(c, 0x8a, a);
+}
+
 void
 longJumpC(Context* c, unsigned size, Assembler::Constant* a)
 {
@ -806,11 +876,59 @@ moveCR2(Context* c, UNUSED unsigned aSize, Assembler::Constant* a,
  }
 }

+inline bool floatReg(Assembler::Register* a) {
+	return a->low >= xmm0;
+}
+
+void
+sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a,
+       unsigned bSize UNUSED, Assembler::Register* b)
+{
+  if(floatReg(a) && floatReg(b)) {
+  	if(aSize == 4) {
+  	  opcode(c, 0xf3);
+  	  maybeRex(c, 4, a, b);
+  	  opcode(c, 0x0f, 0x10);
+  	  modrm(c, 0xc0, b, a);
+  	} else {
+  	  opcode(c, 0xf2);
+  	  maybeRex(c, 4, a, b);
+  	  opcode(c, 0x0f, 0x10);
+  	  modrm(c, 0xc0, b, a);
+  	} 
+  } else if(floatReg(a)) {
+  	opcode(c, 0x66);
+  	maybeRex(c, aSize, a, b);
+  	opcode(c, 0x0f, 0x7e);
+  	modrm(c, 0xc0, b, a);  	
+  } else {
+  	opcode(c, 0x66);
+  	maybeRex(c, aSize, a, b);
+  	opcode(c, 0x0f, 0x6e);
+  	modrm(c, 0xc0, a, b);  	
+  }
+}
+
+void
+sseMoveCR(Context* c, unsigned aSize, Assembler::Constant* a,
+       unsigned bSize, Assembler::Register* b)
+{
+  assert(c, aSize <= BytesPerWord);
+  Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
+  moveCR2(c, aSize, a, aSize, &tmp, 0);
+  sseMoveRR(c, aSize, &tmp, bSize, b);
+  c->client->releaseTemporary(tmp.low);
+}
+
 void
 moveCR(Context* c, unsigned aSize, Assembler::Constant* a,
       unsigned bSize, Assembler::Register* b)
 {
-  moveCR2(c, aSize, a, bSize, b, 0);
+  if(floatReg(b)) {
+  	sseMoveCR(c, aSize, a, bSize, b);
+  } else {
+    moveCR2(c, aSize, a, bSize, b, 0);
+  }
 }

 void
@ -829,7 +947,11 @@ void
 moveRR(Context* c, unsigned aSize, Assembler::Register* a,
       UNUSED unsigned bSize, Assembler::Register* b)
 {
-	
+  if(floatReg(a) or floatReg(b)) {
+  	sseMoveRR(c, aSize, a, bSize, b);
+  	return;
+  }
+  
  if (BytesPerWord == 4 and aSize == 8 and bSize == 8) {
    Assembler::Register ah(a->high);
    Assembler::Register bh(b->high);
@ -902,10 +1024,25 @@ moveRR(Context* c, unsigned aSize, Assembler::Register* a,
  }
 }

+void
+sseMoveMR(Context* c, unsigned aSize, Assembler::Memory* a,
+       unsigned bSize UNUSED, Assembler::Register* b)
+{
+  opcode(c, 0x66);
+  maybeRex(c, aSize, b, a);
+  opcode(c, 0x0f, 0x6e);
+  modrmSibImm(c, b, a);
+}
+
 void
 moveMR(Context* c, unsigned aSize, Assembler::Memory* a,
       unsigned bSize, Assembler::Register* b)
 {
+  if(floatReg(b)) {
+  	sseMoveMR(c, aSize, a, bSize, b);
+  	return;
+  }
+  
  switch (aSize) {
  case 1:
    maybeRex(c, bSize, b, a);
@ -956,12 +1093,27 @@ moveMR(Context* c, unsigned aSize, Assembler::Memory* a,
  }
 }

+void
+sseMoveRM(Context* c, unsigned aSize, Assembler::Register* a,
+       UNUSED unsigned bSize, Assembler::Memory* b)
+{
+  opcode(c, 0x66);
+  maybeRex(c, aSize, a, b);
+  opcode(c, 0x0f, 0x7e);
+  modrmSibImm(c, a, b);
+}
+
 void
 moveRM(Context* c, unsigned aSize, Assembler::Register* a,
       unsigned bSize UNUSED, Assembler::Memory* b)
 {
  assert(c, aSize == bSize);
  
+  if(floatReg(a)) {
+  	sseMoveRM(c, aSize, a, bSize, b);
+  	return;
+  }
+  
  switch (aSize) {
  case 1:
    maybeRex(c, bSize, a, b);
@ -1066,7 +1218,7 @@ moveCM(Context* c, unsigned aSize UNUSED, Assembler::Constant* a,
        modrmSibImm(c, 0, b->scale, b->index, b->base, b->offset);
        c->code.append4(a->value->value());
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, 8, a, 8, &tmp);
        moveRM(c, 8, &tmp, 8, b);
        c->client->releaseTemporary(tmp.low);
@ -1188,7 +1340,7 @@ addCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        addRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1246,7 +1398,7 @@ subtractCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        subtractRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1335,7 +1487,7 @@ andCR(Context* c, unsigned aSize, Assembler::Constant* a,
        c->code.append4(v);
      }
    } else {
-      Assembler::Register tmp(c->client->acquireTemporary());
+      Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
      moveCR(c, aSize, a, aSize, &tmp);
      andRR(c, aSize, &tmp, bSize, b);
      c->client->releaseTemporary(tmp.low);
@ -1392,7 +1544,7 @@ orCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        orRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1448,7 +1600,7 @@ xorCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        xorRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);
@ -1523,7 +1675,7 @@ compareCR(Context* c, unsigned aSize, Assembler::Constant* a,
      c->code.append4(v);
    }
  } else {
-    Assembler::Register tmp(c->client->acquireTemporary());
+    Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
    moveCR(c, aSize, a, aSize, &tmp);
    compareRR(c, aSize, &tmp, bSize, b);
    c->client->releaseTemporary(tmp.low);
@ -1537,7 +1689,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a,
  assert(c, aSize == bSize);

  if (BytesPerWord == 4 and aSize == 8) {
-    const uint32_t mask = ~((1 << rax) | (1 << rdx));
+    const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
    Assembler::Register tmp(c->client->acquireTemporary(mask),
                            c->client->acquireTemporary(mask));

@ -1560,7 +1712,7 @@ multiplyCR(Context* c, unsigned aSize, Assembler::Constant* a,
          c->code.append4(v);        
        }
      } else {
-        Assembler::Register tmp(c->client->acquireTemporary());
+        Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
        moveCR(c, aSize, a, aSize, &tmp);
        multiplyRR(c, aSize, &tmp, bSize, b);
        c->client->releaseTemporary(tmp.low);      
@ -1605,7 +1757,7 @@ compareCM(Context* c, unsigned aSize, Assembler::Constant* a,
      abort(c);
    }
  } else {
-    Assembler::Register tmp(c->client->acquireTemporary());
+    Assembler::Register tmp(c->client->acquireTemporary(GeneralRegisterMask));
    moveCR(c, aSize, a, bSize, &tmp);
    compareRM(c, bSize, &tmp, bSize, b);
    c->client->releaseTemporary(tmp.low);
@ -1928,6 +2080,219 @@ unsignedShiftRightCR(Context* c, unsigned aSize UNUSED, Assembler::Constant* a,
  doShift(c, unsignedShiftRightRR, 0xe8, aSize, a, bSize, b);
 }

+inline void floatRegOp(Context* c, unsigned aSize, Assembler::Register* a,
+                     unsigned bSize UNUSED, Assembler::Register* b, uint8_t op, uint8_t mod = 0xc0)
+{
+  if(aSize == 4) {
+    opcode(c, 0xf3);
+  } else {
+    opcode(c, 0xf2);
+  }
+  maybeRex(c, bSize, a, b);
+  opcode(c, 0x0f, op);
+  modrm(c, mod, a, b);
+}
+
+inline void floatMemOp(Context* c, unsigned aSize, Assembler::Memory* a,
+                     unsigned bSize UNUSED, Assembler::Register* b, uint8_t op)
+{
+  if(aSize == 4) {
+    opcode(c, 0xf3);
+  } else {
+    opcode(c, 0xf2);
+  }
+  maybeRex(c, bSize, b, a);
+  opcode(c, 0x0f, op);
+  modrmSibImm(c, b, a);
+}
+
+void
+floatSqrtRR(Context* c, unsigned aSize, Assembler::Register* a,
+                     unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatRegOp(c, aSize, a, 4, b, 0x51);
+}
+
+void
+floatSqrtMR(Context* c, unsigned aSize, Assembler::Memory* a,
+                     unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, 4, b, 0x51);
+}
+
+void
+floatAddRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatRegOp(c, aSize, a, 4, b, 0x58);
+}
+
+void
+floatAddMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, 4, b, 0x58);
+}
+
+void
+floatSubtractRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatRegOp(c, aSize, a, 4, b, 0x5c);
+}
+
+void
+floatSubtractMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, 4, b, 0x5c);
+}
+
+void
+floatMultiplyRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatRegOp(c, aSize, a, 4, b, 0x59);
+}
+
+void
+floatMultiplyMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, 4, b, 0x59);
+}
+
+void
+floatDivideRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatRegOp(c, aSize, a, 4, b, 0x5e);
+}
+
+void
+floatDivideMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, 4, b, 0x5e);
+}
+
+void
+float2FloatRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  assert(c, supportsSSE2());
+  floatRegOp(c, aSize, a, 4, b, 0x5a);
+}
+
+void
+float2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  assert(c, supportsSSE2());
+  floatMemOp(c, aSize, a, 4, b, 0x5a);
+}
+
+void
+float2IntRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize, Assembler::Register* b)
+{
+  assert(c, !floatReg(b));
+  floatRegOp(c, aSize, a, bSize, b, 0x2d);
+}
+
+void
+float2IntMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize, Assembler::Register* b)
+{
+  floatMemOp(c, aSize, a, bSize, b, 0x2d);
+}
+
+void
+int2FloatRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize, Assembler::Register* b)
+{
+  floatRegOp(c, bSize, a, aSize, b, 0x2a);
+}
+
+void
+int2FloatMR(Context* c, unsigned aSize, Assembler::Memory* a,
+      unsigned bSize, Assembler::Register* b)
+{
+  floatMemOp(c, bSize, a, aSize, b, 0x2a);
+}
+
+void
+floatCompareRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  if (aSize == 8) {
+    opcode(c, 0x66);
+  }
+  maybeRex(c, 4, a, b);
+  opcode(c, 0x0f, 0x2e);
+  modrm(c, 0xc0, a, b);
+}
+
+void
+floatNegateRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  assert(c, floatReg(a) and floatReg(b));
+  assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. 
+  ResolvedPromise pcon(0x80000000);
+  Assembler::Constant con(&pcon);
+  if(a->low == b->low) {
+    Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask));
+    moveCR(c, 4, &con, 4, &tmp);
+    maybeRex(c, 4, a, &tmp);
+    opcode(c, 0x0f, 0x57);
+    modrm(c, 0xc0, &tmp, a);
+    c->client->releaseTemporary(tmp.low);
+  } else {
+    moveCR(c, 4, &con, 4, b);
+    if(aSize == 8) opcode(c, 0x66);
+    maybeRex(c, 4, a, b);
+    opcode(c, 0x0f, 0x57);
+    modrm(c, 0xc0, a, b);
+  }
+}
+
+void
+floatAbsRR(Context* c, unsigned aSize UNUSED, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b)
+{
+  assert(c, floatReg(a) and floatReg(b));
+  assert(c, aSize == 4); //unlike most of the other floating point code, this does NOT support doubles. 
+  ResolvedPromise pcon(0x7fffffff);
+  Assembler::Constant con(&pcon);
+  if(a->low == b->low) {
+    Assembler::Register tmp(c->client->acquireTemporary(FloatRegisterMask));
+    moveCR(c, 4, &con, 4, &tmp);
+    maybeRex(c, 4, a, &tmp);
+    opcode(c, 0x0f, 0x54);
+    modrm(c, 0xc0, &tmp, a);
+    c->client->releaseTemporary(tmp.low);
+  } else {
+    moveCR(c, 4, &con, 4, b);
+    maybeRex(c, 4, a, b);
+    opcode(c, 0x0f, 0x54);
+    modrm(c, 0xc0, a, b);
+  }
+}
+
+void
+absRR(Context* c, unsigned aSize, Assembler::Register* a,
+      unsigned bSize UNUSED, Assembler::Register* b UNUSED)
+{
+  assert(c, aSize == bSize and a->low == rax and b->low == rax);
+  Assembler::Register d(c->client->acquireTemporary(static_cast<uint64_t>(1) << rdx));
+  maybeRex(c, aSize, a, b);
+  opcode(c, 0x99);
+  xorRR(c, aSize, &d, aSize, a);
+  subtractRR(c, aSize, &d, aSize, a);
+  c->client->releaseTemporary(rdx);
+}
+
 void
 populateTables(ArchitectureContext* c)
 {
@ -1963,11 +2328,14 @@ populateTables(ArchitectureContext* c)
  uo[index(JumpIfGreaterOrEqual, C)] = CAST1(jumpIfGreaterOrEqualC);
  uo[index(JumpIfLess, C)] = CAST1(jumpIfLessC);
  uo[index(JumpIfLessOrEqual, C)] = CAST1(jumpIfLessOrEqualC);
+  uo[index(JumpIfUnordered, C)] = CAST1(jumpIfUnorderedC);

  uo[index(LongJump, C)] = CAST1(longJumpC);

  bo[index(Negate, R, R)] = CAST2(negateRR);

+  bo[index(FloatNegate, R, R)] = CAST2(floatNegateRR);
+
  bo[index(Move, R, R)] = CAST2(moveRR);
  bo[index(Move, C, R)] = CAST2(moveCR);
  bo[index(Move, M, R)] = CAST2(moveMR);
@ -1975,6 +2343,9 @@ populateTables(ArchitectureContext* c)
  bo[index(Move, C, M)] = CAST2(moveCM);
  bo[index(Move, A, R)] = CAST2(moveAR);

+  bo[index(FloatSqrt, R, R)] = CAST2(floatSqrtRR);
+  bo[index(FloatSqrt, M, R)] = CAST2(floatSqrtMR);
+
  bo[index(MoveZ, R, R)] = CAST2(moveZRR);
  bo[index(MoveZ, M, R)] = CAST2(moveZMR);

@ -1983,12 +2354,20 @@ populateTables(ArchitectureContext* c)
  bo[index(Compare, C, M)] = CAST2(compareCM);
  bo[index(Compare, R, M)] = CAST2(compareRM);

+  bo[index(FloatCompare, R, R)] = CAST2(floatCompareRR);
+
  bo[index(Add, R, R)] = CAST2(addRR);
  bo[index(Add, C, R)] = CAST2(addCR);

  bo[index(Subtract, C, R)] = CAST2(subtractCR);
  bo[index(Subtract, R, R)] = CAST2(subtractRR);

+  bo[index(FloatAdd, R, R)] = CAST2(floatAddRR);
+  bo[index(FloatAdd, M, R)] = CAST2(floatAddMR);
+
+  bo[index(FloatSubtract, R, R)] = CAST2(floatSubtractRR);
+  bo[index(FloatSubtract, M, R)] = CAST2(floatSubtractMR);
+
  bo[index(And, R, R)] = CAST2(andRR);
  bo[index(And, C, R)] = CAST2(andCR);

@ -2003,6 +2382,12 @@ populateTables(ArchitectureContext* c)

  bo[index(Divide, R, R)] = CAST2(divideRR);

+  bo[index(FloatMultiply, R, R)] = CAST2(floatMultiplyRR);
+  bo[index(FloatMultiply, M, R)] = CAST2(floatMultiplyMR);
+
+  bo[index(FloatDivide, R, R)] = CAST2(floatDivideRR);
+  bo[index(FloatDivide, M, R)] = CAST2(floatDivideMR);
+
  bo[index(Remainder, R, R)] = CAST2(remainderRR);

  bo[index(LongCompare, C, R)] = CAST2(longCompareCR);
@ -2016,8 +2401,19 @@ populateTables(ArchitectureContext* c)

  bo[index(UnsignedShiftRight, R, R)] = CAST2(unsignedShiftRightRR);
  bo[index(UnsignedShiftRight, C, R)] = CAST2(unsignedShiftRightCR);
-}

+  bo[index(Float2Float, R, R)] = CAST2(float2FloatRR);
+  bo[index(Float2Float, M, R)] = CAST2(float2FloatMR);
+
+  bo[index(Float2Int, R, R)] = CAST2(float2IntRR);
+  bo[index(Float2Int, M, R)] = CAST2(float2IntMR);
+
+  bo[index(Int2Float, R, R)] = CAST2(int2FloatRR);
+  bo[index(Int2Float, M, R)] = CAST2(int2FloatMR);
+
+  bo[index(Abs, R, R)] = CAST2(absRR);
+  bo[index(FloatAbs, R, R)] = CAST2(floatAbsRR);
+}
 class MyArchitecture: public Assembler::Architecture {
 public:
  MyArchitecture(System* system): c(system), referenceCount(0) {
@ -2025,7 +2421,31 @@ class MyArchitecture: public Assembler::Architecture {
  }

  virtual unsigned registerCount() {
-    return (BytesPerWord == 4 ? 8 : 16);
+    if (supportsSSE()) {
+      return BytesPerWord == 4 ? 24 : 32;
+    } else {
+      return BytesPerWord == 4 ? 8 : 16;
+    }
+  }
+  
+  virtual unsigned generalRegisterCount() {
+  	return BytesPerWord == 4 ? 8 : 16;
+  }
+  
+  virtual unsigned floatRegisterCount() {
+    if (supportsSSE()) {
+      return BytesPerWord == 4 ? 8 : 16;
+    } else {
+      return 0;
+    }
+  }
+  
+  virtual uint64_t generalRegisters() {
+  	return GeneralRegisterMask;
+  }
+  
+  virtual uint64_t floatRegisters() {
+  	return supportsSSE() ? FloatRegisterMask : 0;
  }

  virtual int stack() {
@ -2044,10 +2464,6 @@ class MyArchitecture: public Assembler::Architecture {
    return (BytesPerWord == 4 ? rdx : NoRegister);
  }

-  virtual bool condensedAddressing() {
-    return true;
-  }
-
  virtual bool bigEndian() {
    return false;
  }
@ -2058,7 +2474,7 @@ class MyArchitecture: public Assembler::Architecture {
    case rsp:
    case rbx:
      return true;
-
+   	  
    default:
      return false;
    }
@ -2171,6 +2587,10 @@ class MyArchitecture: public Assembler::Architecture {
    return 0;
  }

+  virtual bool supportsFloatCompare(unsigned size) {
+    return supportsSSE() and size <= BytesPerWord;
+  }
+
  virtual void nextFrame(void** stack, void** base) {
    assert(&c, *static_cast<void**>(*base) != *base);

@ -2189,61 +2609,206 @@ class MyArchitecture: public Assembler::Architecture {
    *thunk = false;
  }

-  virtual void plan
+  bool checkMethodClass(Thread* t, object method, const char* value)
+  {
+    return strcmp
+      (reinterpret_cast<const char*>
+       (&byteArrayBody(t, className(t, methodClass(t, method)), 0)),
+       value) == 0;
+  }
+
+  bool checkMethodName(Thread* t, object method, const char* value)
+  {
+    return strcmp
+      (reinterpret_cast<const char*>
+       (&byteArrayBody(t, methodName(t, method), 0)),
+       value) == 0;
+  }
+
+  bool checkMethodSpec(Thread* t, object method, const char* value)
+  {
+    return strcmp
+      (reinterpret_cast<const char*>
+       (&byteArrayBody(t, methodSpec(t, method), 0)),
+       value) == 0;
+  }
+
+  virtual BinaryOperation hasBinaryIntrinsic(Thread* t, object method)
+  {
+    if(checkMethodClass(t, method, "java/lang/Math")) {
+      if(supportsSSE() and checkMethodName(t, method, "sqrt") and checkMethodSpec(t, method, "(D)D") and BytesPerWord == 8) {
+        return FloatSqrt;
+      } else if(checkMethodName(t, method, "abs")) {
+      	if(checkMethodSpec(t, method, "(I)I") or (checkMethodSpec(t, method, "(J)J") and BytesPerWord == 8)) {
+          return Abs;
+      	} else if(supportsSSE() and supportsSSE2() and checkMethodSpec(t, method, "(F)F")) {
+      	  return FloatAbs;
+      	}
+      }
+    }
+    return NoBinaryOperation;
+  }
+
+  virtual TernaryOperation hasTernaryIntrinsic(Thread* t UNUSED, object method UNUSED) {
+  	return NoTernaryOperation;
+  }
+
+  virtual void planSource
  (BinaryOperation op,
   unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask,
-   unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask,
-   bool* thunk)
+   unsigned bSize, bool* thunk)
  {
    *aTypeMask = ~0;
-    *aRegisterMask = ~static_cast<uint64_t>(0);
-
-    *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
-    *bRegisterMask = ~static_cast<uint64_t>(0);
+    *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);

    *thunk = false;

    switch (op) {
    case Compare:
      *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
-      *bTypeMask = (1 << RegisterOperand);
+      *aRegisterMask = GeneralRegisterMask;
+      break;
+    case FloatCompare:
+      assert(&c, supportsSSE() && aSize <= BytesPerWord);
+      *aTypeMask = (1 << RegisterOperand);
+      *aRegisterMask = FloatRegisterMask;
      break;
-
    case Negate:
      *aTypeMask = (1 << RegisterOperand);
-      *bTypeMask = (1 << RegisterOperand);
      *aRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
        | (static_cast<uint64_t>(1) << rax);
-      *bRegisterMask = *aRegisterMask;
      break;
-
+    case Abs:
+      *aTypeMask = (1 << RegisterOperand);
+      *aRegisterMask = (static_cast<uint64_t>(1) << rax);
+      break;
+    case FloatAbs:
+   	  *aTypeMask = (1 << RegisterOperand);
+      *aRegisterMask = FloatRegisterMask;
+      break;    
+    case FloatNegate:
+      if(!supportsSSE() or aSize == 8 or bSize == 8) { //floatNegateRR does not support doubles
+        *thunk = true;
+      } else {
+   	    *aTypeMask = (1 << RegisterOperand);
+        *aRegisterMask = FloatRegisterMask;
+      }
+      break;
+    case FloatSqrt:
+      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+   	  *aRegisterMask = FloatRegisterMask;
+   	  break;
+   	case Float2Float:
+   	  if(!supportsSSE() or !supportsSSE2() or BytesPerWord == 4) {
+   	    *thunk = true;
+   	  } else {
+        *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+   	    *aRegisterMask = FloatRegisterMask;
+   	  }
+   	  break;
+   	case Float2Int:
+   	  if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) {
+   	    *thunk = true;
+   	  } else {
+   	    *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand);
+   	    *aRegisterMask = FloatRegisterMask;
+   	  }
+   	  break;
+   	case Int2Float:
+   	  if(!supportsSSE() or aSize > BytesPerWord or bSize > BytesPerWord) {
+   	    *thunk = true;
+   	  } else {
+   	    *aTypeMask = (1 << RegisterOperand);// | (1 << MemoryOperand);
+   	    *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+   	  }
+   	  break;
    case Move:
+      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+      *aRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
      if (BytesPerWord == 4) {
        if (aSize == 4 and bSize == 8) {
-          const uint32_t mask = ~((1 << rax) | (1 << rdx));
-          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
-          *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
-            | (static_cast<uint64_t>(1) << rax);        
+          *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+          const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
+          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;    
        } else if (aSize == 1 or bSize == 1) {
+          *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
          const uint32_t mask
            = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx);
-          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
-          *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;        
+          *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;     
        }
      }
      break;
-
    default:
      break;
    }
  }

-  virtual void plan
+  virtual void planDestination
+  (BinaryOperation op,
+   unsigned aSize, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask,
+   unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask)
+  {
+  	*bTypeMask = ~0;
+  	*bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+    switch (op) {
+    case Compare:
+      *bTypeMask = (1 << RegisterOperand);
+      *bRegisterMask = GeneralRegisterMask;
+      break;
+    case FloatCompare:
+      *bTypeMask = (1 << RegisterOperand);
+      *bRegisterMask = FloatRegisterMask;
+      break;
+
+    case Abs:
+      *bTypeMask = (1 << RegisterOperand);
+      *bRegisterMask = (static_cast<uint64_t>(1) << rax);
+       break;
+
+    case FloatAbs:
+      *bTypeMask = (1 << RegisterOperand);
+      *bRegisterMask = *aRegisterMask;
+      break;
+
+    case Negate:
+    case FloatNegate:
+    case FloatSqrt:
+   	case Float2Float:
+   	  *bTypeMask = (1 << RegisterOperand);
+   	  *bRegisterMask = *aRegisterMask;
+   	  break;
+   	case Int2Float:
+   	  *bTypeMask = (1 << RegisterOperand);
+   	  *bRegisterMask = FloatRegisterMask;
+   	  break;
+   	case Float2Int:
+   	  *bTypeMask = (1 << RegisterOperand);
+   	  *bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+   	  break;
+    case Move:
+      *bTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+      *bRegisterMask = GeneralRegisterMask | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+      if (BytesPerWord == 4) {
+        if (aSize == 4 and bSize == 8) {
+          *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32))
+            | (static_cast<uint64_t>(1) << rax);
+        } else if (aSize == 1 or bSize == 1) {
+          const uint32_t mask
+            = (1 << rax) | (1 << rcx) | (1 << rdx) | (1 << rbx);
+          *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
+        }
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  virtual void planSource
  (TernaryOperation op,
-   unsigned aSize, uint8_t* aTypeMask, uint64_t* aRegisterMask,
+   unsigned aSize, uint8_t *aTypeMask, uint64_t *aRegisterMask,
   unsigned, uint8_t* bTypeMask, uint64_t* bRegisterMask,
-   unsigned, uint8_t* cTypeMask, uint64_t* cRegisterMask,
-   bool* thunk)
+   unsigned, bool* thunk)
  {
    *aTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
    *aRegisterMask = ~static_cast<uint64_t>(0);
@ -2254,21 +2819,37 @@ class MyArchitecture: public Assembler::Architecture {
    *thunk = false;

    switch (op) {
+   	case FloatAdd:
+   	case FloatSubtract:
+   	case FloatMultiply:
+   	case FloatDivide:
+   	  if(!supportsSSE() or aSize > BytesPerWord) {
+   	  	*thunk = true;
+   	  } else {
+   	  	*aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+   	  	*bTypeMask = (1 << RegisterOperand);
+   	    *aRegisterMask = FloatRegisterMask;
+   	    *bRegisterMask = FloatRegisterMask;
+   	  }
+   	  break;
+   	  
    case Multiply:
      if (BytesPerWord == 4 and aSize == 8) { 
-        const uint32_t mask = ~((1 << rax) | (1 << rdx));
+        const uint32_t mask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
        *aRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
        *bRegisterMask = (static_cast<uint64_t>(1) << (rdx + 32)) | mask;
+      } else {
+        *aRegisterMask = GeneralRegisterMask;
+        *bRegisterMask = GeneralRegisterMask;
      }
      break;

    case Divide:
      if (BytesPerWord == 4 and aSize == 8) {
-        *bTypeMask = ~0;
-        *thunk = true;        
+        *thunk = true;        			
      } else {
        *aTypeMask = (1 << RegisterOperand);
-        *aRegisterMask = ~((1 << rax) | (1 << rdx));
+        *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
        *bRegisterMask = 1 << rax;      
      }
      break;
@ -2279,25 +2860,32 @@ class MyArchitecture: public Assembler::Architecture {
        *thunk = true;
      } else {
        *aTypeMask = (1 << RegisterOperand);
-        *aRegisterMask = ~((1 << rax) | (1 << rdx));
-        *bRegisterMask = 1 << rax;      
+        *aRegisterMask = GeneralRegisterMask & ~((1 << rax) | (1 << rdx));
+        *bRegisterMask = 1 << rax;
      }
      break;

    case ShiftLeft:
    case ShiftRight:
    case UnsignedShiftRight: {
-      *aRegisterMask = (~static_cast<uint64_t>(0) << 32)
+      *aRegisterMask = (static_cast<uint64_t>(GeneralRegisterMask) << 32)
        | (static_cast<uint64_t>(1) << rcx);
-      const uint32_t mask = ~(1 << rcx);
+      const uint32_t mask = GeneralRegisterMask & ~(1 << rcx);
      *bRegisterMask = (static_cast<uint64_t>(mask) << 32) | mask;
    } break;

    default:
      break;
    }
+  }

-    *cTypeMask = *bTypeMask;
+  virtual void planDestination
+  (TernaryOperation op UNUSED,
+   unsigned aSize UNUSED, const uint8_t* aTypeMask UNUSED, const uint64_t* aRegisterMask UNUSED,
+   unsigned bSize UNUSED, const uint8_t* bTypeMask UNUSED, const uint64_t* bRegisterMask,
+   unsigned cSize UNUSED, uint8_t* cTypeMask, uint64_t* cRegisterMask)
+  {
+    *cTypeMask = (1 << RegisterOperand);
    *cRegisterMask = *bRegisterMask;
  }