Mirror of https://github.com/corda/corda.git (synced 2025-01-07 13:38:47 +00:00)

Commit 1f98cc633d: Merge branch 'master' of oss.readytalk.com:/var/local/git/avian
@@ -361,10 +361,9 @@ class Assembler {
      unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask) = 0;

   virtual void planMove
-  (unsigned size,
-   uint8_t srcTypeMask, uint64_t srcRegisterMask,
-   uint8_t dstTypeMask, uint64_t dstRegisterMask,
-   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask) = 0;
+  (unsigned size, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
+   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
+   uint8_t dstTypeMask, uint64_t dstRegisterMask) = 0;

   virtual void planSource
   (TernaryOperation op,
@@ -235,7 +235,7 @@ methodForIp(MyThread* t, void* ip)
   // we must use a version of the method tree at least as recent as the
   // compiled form of the method containing the specified address (see
   // compile(MyThread*, Allocator*, BootContext*, object)):
-  memoryBarrier();
+  loadMemoryBarrier();

   return treeQuery(t, methodTree(t), reinterpret_cast<intptr_t>(ip),
                    methodTreeSentinal(t), compareIpToMethodBounds);
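The loadMemoryBarrier here pairs with the storeStoreMemoryBarrier used when a compiled method is published (see the resolveNative and compile hunks below). A minimal sketch of the same publish/consume protocol in standard C++ (std::atomic stands in for Avian's raw barriers; the names are illustrative, not Avian's API):

    #include <atomic>

    struct Method { void* compiledCode; };

    std::atomic<Method*> latestMethod{nullptr};

    void publish(Method* m) {
      // writer: initialize the object fully, then release-store the pointer
      // (release is roughly storeStoreMemoryBarrier before the pointer store)
      latestMethod.store(m, std::memory_order_release);
    }

    Method* lookup() {
      // reader: acquire-load is roughly loadMemoryBarrier before the
      // dereference, so the reader sees a fully-initialized Method
      return latestMethod.load(std::memory_order_acquire);
    }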
@@ -785,6 +785,10 @@ class Context {
   {
     if (size == 8) {
       switch(op) {
+      case Absolute:
+        assert(t, resultSize == 8);
+        return local::getThunk(t, absoluteLongThunk);
+
       case FloatNegate:
         assert(t, resultSize == 8);
         return local::getThunk(t, negateDoubleThunk);
@@ -819,12 +823,16 @@ class Context {
     assert(t, size == 4);

     switch(op) {
+    case Absolute:
+      assert(t, resultSize == 4);
+      return local::getThunk(t, absoluteIntThunk);
+
     case FloatNegate:
       assert(t, size == 4);
       assert(t, resultSize == 4);
       return local::getThunk(t, negateFloatThunk);

     case FloatAbsolute:
       assert(t, size == 4);
       assert(t, resultSize == 4);
       return local::getThunk(t, absoluteFloatThunk);

     case Float2Float:
@@ -2160,6 +2168,18 @@ absoluteFloat(uint32_t a)
   return floatToBits(fabsf(bitsToFloat(a)));
 }

+int64_t
+absoluteLong(int64_t a)
+{
+  return a > 0 ? a : -a;
+}
+
+int64_t
+absoluteInt(int32_t a)
+{
+  return a > 0 ? a : -a;
+}
+
 int64_t
 divideLong(int64_t b, int64_t a)
 {
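These thunks back the new Absolute opcode on code paths without native support. Note the edge case they inherit from two's complement, which matches Java's Math.abs contract; a standalone check (not part of the patch):

    #include <cassert>
    #include <cstdint>

    static int64_t absoluteLong(int64_t a) { return a > 0 ? a : -a; }

    int main() {
      assert(absoluteLong(-42) == 42);
      assert(absoluteLong(0) == 0);
      // Like Java's Math.abs(Long.MIN_VALUE), INT64_MIN would map to itself,
      // since its negation overflows (and is formally undefined in C++).
      return 0;
    }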
@@ -2934,7 +2954,7 @@ bool
 intrinsic(MyThread* t, Frame* frame, object target)
 {
 #define MATCH(name, constant) \
-  (byteArrayLength(t, name) - 1 == sizeof(constant) \
+  (byteArrayLength(t, name) == sizeof(constant) \
    and strcmp(reinterpret_cast<char*>(&byteArrayBody(t, name, 0)), \
               constant) == 0)
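The direction of this fix suggests the name byte arrays store the trailing NUL (they must, since MATCH calls strcmp on them), so their length equals sizeof of the literal, which also counts the NUL; the old "- 1" form therefore compared, say, 14 against 15 and never matched, silently disabling the intrinsics. A standalone check of that arithmetic (an assumption-flagged sketch, not Avian code):

    #include <cassert>
    #include <cstring>

    int main() {
      const char constant[] = "java/lang/Math";
      // length as a NUL-terminated byte array, as strcmp requires:
      unsigned byteArrayLength = std::strlen(constant) + 1;   // 15
      assert(byteArrayLength == sizeof(constant));            // fixed test holds
      assert(byteArrayLength - 1 != sizeof(constant));        // old test never held
      return 0;
    }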
@@ -3040,6 +3060,7 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip,
        (4, 4, c->memory
         (array, Compiler::FloatType, ArrayBody, index, 4), BytesPerWord));
      break;

    case iaload:
      frame->pushInt
        (c->load
@@ -5621,6 +5642,7 @@ compile(MyThread* t, Allocator* allocator, Context* context)
       frame.set(--index, Frame::Long);
       c->initLocal(2, index, Compiler::IntegerType);
       break;

     case 'D':
       frame.set(--index, Frame::Long);
       frame.set(--index, Frame::Long);
@@ -5876,7 +5898,7 @@ resolveNative(MyThread* t, object method)
   // methodCompiled, since we don't want them using the slow calling
   // convention on a function that expects the fast calling
   // convention:
-  memoryBarrier();
+  storeStoreMemoryBarrier();

   methodCompiled(t, method) = reinterpret_cast<uintptr_t>(function);
 }
@@ -7467,7 +7489,7 @@ findCallNode(MyThread* t, void* address)
   // we must use a version of the call table at least as recent as the
   // compiled form of the method containing the specified address (see
   // compile(MyThread*, Allocator*, BootContext*, object)):
-  memoryBarrier();
+  loadMemoryBarrier();

   MyProcessor* p = processor(t);
   object table = p->callTable;
@@ -8227,7 +8249,7 @@ compile(MyThread* t, Allocator* allocator, BootContext* bootContext,
      reinterpret_cast<intptr_t>(compiled), clone, methodTreeSentinal(t),
      compareIpToMethodBounds);

-  memoryBarrier();
+  storeStoreMemoryBarrier();

   methodCompiled(t, method) = reinterpret_cast<intptr_t>(compiled);
src/compiler.cpp: 886 lines changed (diff suppressed because it is too large)
@@ -76,7 +76,10 @@ markBitAtomic(uintptr_t* map, unsigned i)
 {
   uintptr_t* p = map + wordOf(i);
   uintptr_t v = static_cast<uintptr_t>(1) << bitOf(i);
-  while (not atomicCompareAndSwap(p, *p, *p | v)) { }
+  for (uintptr_t old = *p;
+       not atomicCompareAndSwap(p, old, old | v);
+       old = *p)
+  { }
 }
 #endif // USE_ATOMIC_OPERATIONS
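The old loop read *p twice per attempt: once for the expected value and once to build the new value. If another thread set a different bit between those two reads, a successful swap could write back the stale second read and erase that thread's bit. Snapshotting once per attempt closes the window. A condensed sketch with standard atomics (C++20 std::atomic_ref stands in for Avian's atomicCompareAndSwap):

    #include <atomic>
    #include <cstdint>

    static bool atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_) {
      return std::atomic_ref<uintptr_t>(*p).compare_exchange_strong(old, new_);
    }

    void markBitAtomic(uintptr_t* p, uintptr_t v) {
      // racy form: atomicCompareAndSwap(p, *p, *p | v) -- two independent reads
      for (uintptr_t old = *p;
           !atomicCompareAndSwap(p, old, old | v);  // expected and desired share
           old = *p)                                // one snapshot per attempt
      { }
    }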
@@ -2173,10 +2173,7 @@ JNI_CreateJavaVM(Machine** m, Thread** t, void* args)
   System* s = makeSystem(crashDumpDirectory);
   Heap* h = makeHeap(s, heapLimit);
   Finder* f = makeFinder(s, RUNTIME_ARRAY_BODY(classpathBuffer), bootLibrary);
-  Processor* p = makeProcessor(s, h, false); // change back to true
-                                             // once use of SSE is
-                                             // fixed on 32-bit
-                                             // systems
+  Processor* p = makeProcessor(s, h, true);

   const char** properties = static_cast<const char**>
     (h->allocate(sizeof(const char*) * propertyCount));
src/machine.cpp: 117 lines changed
@@ -14,6 +14,7 @@
 #include "stream.h"
 #include "constants.h"
 #include "processor.h"
+#include "arch.h"

 using namespace vm;
@@ -21,6 +22,17 @@ namespace {

 const unsigned NoByte = 0xFFFF;

+#ifdef USE_ATOMIC_OPERATIONS
+void
+atomicIncrement(uint32_t* p, int v)
+{
+  for (uint32_t old = *p;
+       not atomicCompareAndSwap32(p, old, old + v);
+       old = *p)
+  { }
+}
+#endif
+
 bool
 find(Thread* t, Thread* o)
 {
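atomicIncrement is a hand-rolled fetch-and-add built from the only primitive available here, compare-and-swap. In standard C++ the same effect is (a sketch):

    #include <atomic>
    #include <cstdint>

    void atomicIncrement(std::atomic<uint32_t>* p, int v) {
      // negative v wraps mod 2^32, exactly like "old + v" in the CAS loop
      p->fetch_add(static_cast<uint32_t>(v));
    }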
@@ -2319,20 +2331,33 @@ enter(Thread* t, Thread::State s)
     return;
   }

+#ifdef USE_ATOMIC_OPERATIONS
+# define INCREMENT atomicIncrement
+# define ACQUIRE_LOCK ACQUIRE_RAW(t, t->m->stateLock)
+# define STORE_LOAD_MEMORY_BARRIER storeLoadMemoryBarrier()
+#else
+# define INCREMENT(pointer, value) *(pointer) += value;
+# define ACQUIRE_LOCK
+# define STORE_LOAD_MEMORY_BARRIER
+
   ACQUIRE_RAW(t, t->m->stateLock);
+#endif // not USE_ATOMIC_OPERATIONS

   switch (s) {
   case Thread::ExclusiveState: {
+    ACQUIRE_LOCK;
+
     while (t->m->exclusive) {
       // another thread got here first.
       ENTER(t, Thread::IdleState);
       t->m->stateLock->wait(t->systemThread, 0);
     }

     switch (t->state) {
     case Thread::ActiveState: break;

     case Thread::IdleState: {
-      ++ t->m->activeCount;
+      INCREMENT(&(t->m->activeCount), 1);
     } break;

     default: abort(t);
@@ -2340,14 +2365,35 @@ enter(Thread* t, Thread::State s)

     t->state = Thread::ExclusiveState;
     t->m->exclusive = t;

+    STORE_LOAD_MEMORY_BARRIER;
+
     while (t->m->activeCount > 1) {
       t->m->stateLock->wait(t->systemThread, 0);
     }
   } break;

   case Thread::IdleState:
+    if (t->state == Thread::ActiveState) {
+      // fast path
+      assert(t, t->m->activeCount > 0);
+      INCREMENT(&(t->m->activeCount), -1);
+
+      t->state = s;
+
+      if (t->m->exclusive) {
+        ACQUIRE_LOCK;
+
+        t->m->stateLock->notifyAll(t->systemThread);
+      }
+
+      break;
+    } else {
+      // fall through to slow path
+    }
+
   case Thread::ZombieState: {
+    ACQUIRE_LOCK;
+
     switch (t->state) {
     case Thread::ExclusiveState: {
       assert(t, t->m->exclusive == t);
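The shape of the new fast path, condensed with standard C++ stand-ins for Machine, ACQUIRE_LOCK, and the field names (a sketch, not Avian's code): the common Active-to-Idle transition becomes one lock-free decrement, and stateLock is only taken when an exclusive thread may be blocked waiting for activeCount to drain.

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    struct Machine {
      std::atomic<unsigned> activeCount{0};
      std::atomic<bool> exclusivePending{false};
      std::mutex stateLock;
      std::condition_variable stateCond;
    };

    void enterIdleFromActive(Machine& m) {
      m.activeCount.fetch_sub(1);          // lock-free fast path
      if (m.exclusivePending.load()) {     // slow path: wake the waiter
        std::lock_guard<std::mutex> g(m.stateLock);
        m.stateCond.notify_all();
      }
    }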
@@ -2360,7 +2406,7 @@ enter(Thread* t, Thread::State s)
     }

     assert(t, t->m->activeCount > 0);
-    -- t->m->activeCount;
+    INCREMENT(&(t->m->activeCount), -1);

     if (s == Thread::ZombieState) {
       assert(t, t->m->liveCount > 0);
@@ -2375,35 +2421,54 @@ enter(Thread* t, Thread::State s)
       t->m->stateLock->notifyAll(t->systemThread);
     } break;

-  case Thread::ActiveState: {
-    switch (t->state) {
-    case Thread::ExclusiveState: {
-      assert(t, t->m->exclusive == t);
-
-      t->state = s;
-      t->m->exclusive = 0;
-
-      t->m->stateLock->notifyAll(t->systemThread);
-    } break;
-
-    case Thread::NoState:
-    case Thread::IdleState: {
-      while (t->m->exclusive) {
-        t->m->stateLock->wait(t->systemThread, 0);
-      }
-
-      ++ t->m->activeCount;
-      if (t->state == Thread::NoState) {
-        ++ t->m->liveCount;
-      }
-      t->state = s;
-    } break;
-
-    default: abort(t);
-    }
-  } break;
+  case Thread::ActiveState:
+    if (t->state == Thread::IdleState and t->m->exclusive == 0) {
+      // fast path
+      INCREMENT(&(t->m->activeCount), 1);
+
+      t->state = s;
+
+      if (t->m->exclusive) {
+        // another thread has entered the exclusive state, so we
+        // return to idle and use the slow path to become active
+        enter(t, Thread::IdleState);
+      } else {
+        break;
+      }
+    }
+
+    { ACQUIRE_LOCK;
+
+      switch (t->state) {
+      case Thread::ExclusiveState: {
+        assert(t, t->m->exclusive == t);
+
+        t->state = s;
+        t->m->exclusive = 0;
+
+        t->m->stateLock->notifyAll(t->systemThread);
+      } break;
+
+      case Thread::NoState:
+      case Thread::IdleState: {
+        while (t->m->exclusive) {
+          t->m->stateLock->wait(t->systemThread, 0);
+        }
+
+        INCREMENT(&(t->m->activeCount), 1);
+        if (t->state == Thread::NoState) {
+          ++ t->m->liveCount;
+        }
+        t->state = s;
+      } break;

+      default: abort(t);
+      }
+    } break;

   case Thread::ExitState: {
+    ACQUIRE_LOCK;
+
     switch (t->state) {
     case Thread::ExclusiveState: {
       assert(t, t->m->exclusive == t);
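The Idle-to-Active fast path is optimistic: it publishes the increment first and re-checks exclusive afterwards, backing out through the Idle transition if an exclusive request raced in. The STORE_LOAD barrier in the exclusive path above is what keeps both sides from taking their fast paths simultaneously. A condensed sketch with stand-in names (the default seq_cst atomics supply the StoreLoad ordering):

    #include <atomic>

    struct Machine {
      std::atomic<unsigned> activeCount{0};
      std::atomic<bool> exclusivePending{false};
    };

    bool tryEnterActiveFast(Machine& m) {
      if (m.exclusivePending.load()) return false;  // go straight to slow path
      m.activeCount.fetch_add(1);                   // optimistic publish
      if (m.exclusivePending.load()) {              // re-check after publishing
        m.activeCount.fetch_sub(1);                 // back out (enter Idle)
        return false;                               // retry via slow path
      }
      return true;
    }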
@@ -2418,7 +2483,7 @@ enter(Thread* t, Thread::State s)
     }

     assert(t, t->m->activeCount > 0);
-    -- t->m->activeCount;
+    INCREMENT(&(t->m->activeCount), -1);

     t->state = s;
@@ -2065,6 +2065,9 @@ class MyArchitecture: public Assembler::Architecture {
       *aTypeMask = (1 << RegisterOperand);
       break;

+    case Absolute:
+    case FloatAbsolute:
+    case FloatSquareRoot:
     case FloatNegate:
     case Float2Float:
     case Float2Int:
@@ -2096,19 +2099,20 @@ class MyArchitecture: public Assembler::Architecture {
   }

   virtual void planMove
-  (unsigned,
-   uint8_t srcTypeMask, uint64_t srcRegisterMask,
-   uint8_t dstTypeMask, uint64_t,
-   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask)
+  (unsigned, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
+   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
+   uint8_t dstTypeMask, uint64_t)
   {
-    *tmpTypeMask = srcTypeMask;
-    *tmpRegisterMask = srcRegisterMask;
-
-    if ((dstTypeMask & (1 << MemoryOperand))
-        and (srcTypeMask & ((1 << MemoryOperand) | 1 << AddressOperand)))
-    {
-      // can't move directly from memory to memory
-      *tmpTypeMask = (1 << RegisterOperand);
+    *srcTypeMask = ~0;
+    *srcRegisterMask = ~static_cast<uint64_t>(0);
+
+    *tmpTypeMask = 0;
+    *tmpRegisterMask = 0;
+
+    if (dstTypeMask & (1 << MemoryOperand)) {
+      // can't move directly from memory or constant to memory
+      *srcTypeMask = 1 << RegisterOperand;
+      *tmpTypeMask = 1 << RegisterOperand;
       *tmpRegisterMask = ~static_cast<uint64_t>(0);
     }
   }
@@ -92,7 +92,7 @@ syncInstructionCache(const void* start, unsigned size)

 #ifdef USE_ATOMIC_OPERATIONS
 inline bool
-atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+atomicCompareAndSwap32(uint32_t* p, uint32_t old, uint32_t new_)
 {
 #if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
   return __sync_bool_compare_and_swap(p, old, new_);
@@ -118,6 +118,12 @@ atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
   return result;
 #endif // not GCC >= 4.1
 }
+
+inline bool
+atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+{
+  return atomicCompareAndSwap32(reinterpret_cast<uint32_t*>(p), old, new_);
+}
 #endif // USE_ATOMIC_OPERATIONS

 inline uint64_t
@@ -22,6 +22,8 @@ THUNK(divideFloat)
 THUNK(moduloFloat)
 THUNK(negateFloat)
 THUNK(absoluteFloat)
+THUNK(absoluteLong)
+THUNK(absoluteInt)
 THUNK(divideLong)
 THUNK(divideInt)
 THUNK(moduloLong)
src/x86.cpp: 159 lines changed
@@ -113,23 +113,7 @@ class MyBlock: public Assembler::Block {
   unsigned size;
 };

-class Context {
- public:
-  Context(System* s, Allocator* a, Zone* zone):
-    s(s), zone(zone), client(0), code(s, a, 1024), tasks(0), result(0),
-    firstBlock(new (zone->allocate(sizeof(MyBlock))) MyBlock(0)),
-    lastBlock(firstBlock)
-  { }
-
-  System* s;
-  Zone* zone;
-  Assembler::Client* client;
-  Vector code;
-  Task* tasks;
-  uint8_t* result;
-  MyBlock* firstBlock;
-  MyBlock* lastBlock;
-};
+class Context;

 typedef void (*OperationType)(Context*);
@@ -163,6 +147,25 @@ class ArchitectureContext {
    * OperandTypeCount];
 };

+class Context {
+ public:
+  Context(System* s, Allocator* a, Zone* zone, ArchitectureContext* ac):
+    s(s), zone(zone), client(0), code(s, a, 1024), tasks(0), result(0),
+    firstBlock(new (zone->allocate(sizeof(MyBlock))) MyBlock(0)),
+    lastBlock(firstBlock), ac(ac)
+  { }
+
+  System* s;
+  Zone* zone;
+  Assembler::Client* client;
+  Vector code;
+  Task* tasks;
+  uint8_t* result;
+  MyBlock* firstBlock;
+  MyBlock* lastBlock;
+  ArchitectureContext* ac;
+};
+
 void NO_RETURN
 abort(Context* c)
 {
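The reshuffle is the usual forward-declaration dance: the OperationType typedef only needs Context as an incomplete type, and the full definition now follows ArchitectureContext so it can hold the ac pointer that useSSE consults at code-emission time. Skeleton of the pattern:

    class Context;                            // incomplete type: fine for pointers
    typedef void (*OperationType)(Context*);  // can be declared already

    class ArchitectureContext { /* ... */ };

    class Context {
     public:
      ArchitectureContext* ac;                // now expressible
    };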
@@ -620,6 +623,27 @@ void
 ignore(Context*)
 { }

+void
+storeLoadBarrier(Context* c)
+{
+  if (useSSE(c->ac)) {
+    // mfence:
+    c->code.append(0x0f);
+    c->code.append(0xae);
+    c->code.append(0xf0);
+  } else {
+    // lock addq $0x0,(%rsp):
+    c->code.append(0xf0);
+    if (BytesPerWord == 8) {
+      c->code.append(0x48);
+    }
+    c->code.append(0x83);
+    c->code.append(0x04);
+    c->code.append(0x24);
+    c->code.append(0x00);
+  }
+}
+
 void
 unconditional(Context* c, unsigned jump, Assembler::Constant* a)
 {
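The two byte sequences decode as mfence (0f ae f0) and lock add{l,q} $0, (%esp/%rsp) (f0 [48] 83 04 24 00). mfence is an SSE2 instruction, hence the useSSE guard; on older CPUs any lock-prefixed read-modify-write acts as a full fence, and a dummy add to the stack slot is the traditional target. Compiler-level equivalents for reference (GCC/Clang inline asm; a sketch, not the emitted code path):

    inline void storeLoadBarrierSSE2() {
      __asm__ __volatile__("mfence" : : : "memory");             // 0f ae f0
    }

    inline void storeLoadBarrierLegacy() {
    #if defined(__x86_64__)
      __asm__ __volatile__("lock; addq $0,(%%rsp)" : : : "memory");
    #else
      __asm__ __volatile__("lock; addl $0,(%%esp)" : : : "memory");
    #endif
    }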
@@ -946,17 +970,20 @@ void
 sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a,
           unsigned bSize UNUSED, Assembler::Register* b)
 {
+  assert(c, aSize >= 4);
+  assert(c, aSize == bSize);
+
   if (floatReg(a) and floatReg(b)) {
     if (aSize == 4) {
       opcode(c, 0xf3);
       maybeRex(c, 4, a, b);
       opcode(c, 0x0f, 0x10);
-      modrm(c, 0xc0, b, a);
+      modrm(c, 0xc0, a, b);
     } else {
       opcode(c, 0xf2);
-      maybeRex(c, 4, a, b);
+      maybeRex(c, 8, a, b);
       opcode(c, 0x0f, 0x10);
-      modrm(c, 0xc0, b, a);
+      modrm(c, 0xc0, a, b);
     }
   } else if (floatReg(a)) {
     opcode(c, 0x66);
@@ -1090,6 +1117,8 @@ void
 sseMoveMR(Context* c, unsigned aSize, Assembler::Memory* a,
           unsigned bSize UNUSED, Assembler::Register* b)
 {
+  assert(c, aSize >= 4);
+
   if (BytesPerWord == 4 and aSize == 8) {
     opcode(c, 0xf3);
     opcode(c, 0x0f, 0x7e);
@@ -1165,6 +1194,7 @@ void
 sseMoveRM(Context* c, unsigned aSize, Assembler::Register* a,
           UNUSED unsigned bSize, Assembler::Memory* b)
 {
   assert(c, aSize >= 4);
+  assert(c, aSize == bSize);

   if (BytesPerWord == 4 and aSize == 8) {
|
||||
zo[Return] = return_;
|
||||
zo[LoadBarrier] = ignore;
|
||||
zo[StoreStoreBarrier] = ignore;
|
||||
zo[StoreLoadBarrier] = ignore;
|
||||
zo[StoreLoadBarrier] = storeLoadBarrier;
|
||||
|
||||
uo[index(c, Call, C)] = CAST1(callC);
|
||||
uo[index(c, Call, R)] = CAST1(callR);
|
||||
@ -2889,9 +2919,13 @@ class MyArchitecture: public Assembler::Architecture {
|
||||
break;
|
||||
|
||||
case FloatAbsolute:
|
||||
*aTypeMask = (1 << RegisterOperand);
|
||||
*aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
|
||||
| FloatRegisterMask;
|
||||
if (useSSE(&c)) {
|
||||
*aTypeMask = (1 << RegisterOperand);
|
||||
*aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
|
||||
| FloatRegisterMask;
|
||||
} else {
|
||||
*thunk = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case FloatNegate:
|
||||
@ -2905,9 +2939,13 @@ class MyArchitecture: public Assembler::Architecture {
|
||||
break;
|
||||
|
||||
case FloatSquareRoot:
|
||||
*aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
|
||||
*aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
|
||||
| FloatRegisterMask;
|
||||
if (useSSE(&c)) {
|
||||
*aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
|
||||
*aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
|
||||
| FloatRegisterMask;
|
||||
} else {
|
||||
*thunk = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case Float2Float:
|
||||
@ -2921,7 +2959,7 @@ class MyArchitecture: public Assembler::Architecture {
|
||||
break;
|
||||
|
||||
case Float2Int:
|
||||
if (useSSE(&c) and (bSize <= BytesPerWord)) {
|
||||
if (useSSE(&c) and bSize <= BytesPerWord) {
|
||||
*aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
|
||||
*aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
|
||||
| FloatRegisterMask;
|
||||
@@ -2931,7 +2969,7 @@ class MyArchitecture: public Assembler::Architecture {
       break;

     case Int2Float:
-      if (useSSE(&c) and (aSize <= BytesPerWord)) {
+      if (useSSE(&c) and aSize <= BytesPerWord) {
         *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
         *aRegisterMask = GeneralRegisterMask
           | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
@@ -2941,9 +2979,8 @@ class MyArchitecture: public Assembler::Architecture {
       break;

     case Move:
-      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
-      *aRegisterMask = GeneralRegisterMask
-        | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+      *aTypeMask = ~0;
+      *aRegisterMask = ~static_cast<uint64_t>(0);

       if (BytesPerWord == 4) {
         if (aSize == 4 and bSize == 8) {
@@ -3039,38 +3076,46 @@ class MyArchitecture: public Assembler::Architecture {
   }

   virtual void planMove
-  (unsigned size,
-   uint8_t srcTypeMask, uint64_t srcRegisterMask,
-   uint8_t dstTypeMask, uint64_t dstRegisterMask,
-   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask)
+  (unsigned size, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
+   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
+   uint8_t dstTypeMask, uint64_t dstRegisterMask)
   {
-    *tmpTypeMask = srcTypeMask;
-    *tmpRegisterMask = srcRegisterMask;
-
-    if ((dstTypeMask & (1 << MemoryOperand))
-        and (srcTypeMask & ((1 << MemoryOperand) | 1 << AddressOperand)))
-    {
-      // can't move directly from memory to memory
-      *tmpTypeMask = (1 << RegisterOperand);
-      *tmpRegisterMask = GeneralRegisterMask
-        | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
-    } else if (dstTypeMask & (1 << RegisterOperand)) {
-      if (srcTypeMask & (1 << RegisterOperand)) {
-        if (size != BytesPerWord
-            and (((dstRegisterMask & FloatRegisterMask) == 0)
-                 xor ((srcRegisterMask & FloatRegisterMask) == 0)))
-        {
-          // can't move directly from FPR to GPR or vice-versa for
-          // values larger than the GPR size
-          *tmpTypeMask = (1 << MemoryOperand);
-          *tmpRegisterMask = 0;
-        }
-      } else if ((dstRegisterMask & FloatRegisterMask)
-                 and (srcTypeMask & (1 << ConstantOperand)))
-      {
-        // can't move directly from constant to FPR
-        *tmpTypeMask = (1 << MemoryOperand);
-        *tmpRegisterMask = 0;
-      }
-    }
-  }
+    *srcTypeMask = ~0;
+    *srcRegisterMask = ~static_cast<uint64_t>(0);
+
+    *tmpTypeMask = 0;
+    *tmpRegisterMask = 0;
+
+    if (dstTypeMask & (1 << MemoryOperand)) {
+      // can't move directly from memory to memory
+      *srcTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
+      *tmpTypeMask = 1 << RegisterOperand;
+      *tmpRegisterMask = GeneralRegisterMask
+        | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+    } else if (dstTypeMask & (1 << RegisterOperand)) {
+      if (size > BytesPerWord) {
+        // can't move directly from FPR to GPR or vice-versa for
+        // values larger than the GPR size
+        if (dstRegisterMask & FloatRegisterMask) {
+          *srcRegisterMask = FloatRegisterMask
+            | (static_cast<uint64_t>(FloatRegisterMask) << 32);
+          *tmpTypeMask = 1 << MemoryOperand;
+        } else if (dstRegisterMask & GeneralRegisterMask) {
+          *srcRegisterMask = GeneralRegisterMask
+            | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+          *tmpTypeMask = 1 << MemoryOperand;
+        }
+      }
+
+      if (dstRegisterMask & FloatRegisterMask) {
+        // can't move directly from constant to FPR
+        *srcTypeMask &= ~(1 << ConstantOperand);
+        if (size > BytesPerWord) {
+          *tmpTypeMask = 1 << MemoryOperand;
+        } else {
+          *tmpTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
+          *tmpRegisterMask = GeneralRegisterMask
+            | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+        }
+      }
+    }
+  }
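Reading the mask convention, as far as this diff shows it: type masks are bitsets over the operand kinds, register masks pack the two register files into one uint64_t, and ~0 means anything is acceptable. The hook now narrows the source masks and proposes a temporary instead of receiving fixed source masks from the caller; for example a move to memory rules out a memory source (x86 has no mem-to-mem mov) and requests a GPR temporary. A toy version of that one branch (the enum order is assumed for illustration, not taken from the source):

    #include <cassert>
    #include <cstdint>

    enum { ConstantOperand, AddressOperand, RegisterOperand, MemoryOperand };

    int main() {
      uint8_t srcTypeMask = 0xFF;                 // start permissive, like ~0
      uint8_t tmpTypeMask = 0;                    // default: no temporary needed
      uint8_t dstTypeMask = 1 << MemoryOperand;   // storing to memory

      if (dstTypeMask & (1 << MemoryOperand)) {   // same shape as planMove above
        srcTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
        tmpTypeMask = 1 << RegisterOperand;       // stage the value in a GPR
      }

      assert(!(srcTypeMask & (1 << MemoryOperand)));  // memory source ruled out
      return 0;
    }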
@@ -3211,7 +3256,7 @@ class MyArchitecture: public Assembler::Architecture {
 class MyAssembler: public Assembler {
  public:
   MyAssembler(System* s, Allocator* a, Zone* zone, MyArchitecture* arch):
-    c(s, a, zone), arch_(arch)
+    c(s, a, zone, &(arch->c)), arch_(arch)
   { }

   virtual void setClient(Client* client) {
src/x86.h: 44 lines changed
@@ -159,15 +159,17 @@ memoryBarrier()
 {
 #ifdef _MSC_VER
   MemoryBarrier();
-#else
-  __asm__ __volatile__("": : :"memory");
-#endif
+#elif defined ARCH_x86_32
+  __asm__ __volatile__("lock; addl $0,0(%%esp)": : :"memory");
+#elif defined ARCH_x86_64
+  __asm__ __volatile__("mfence": : :"memory");
+#endif // ARCH_x86_64
 }

 inline void
 storeStoreMemoryBarrier()
 {
-  memoryBarrier();
+  __asm__ __volatile__("": : :"memory");
 }

 inline void
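Context for these changes: on x86's TSO memory model, load-load and store-store ordering comes for free, so those barriers only need to defeat compiler reordering (the empty asm with a memory clobber), while a StoreLoad fence needs a real instruction: mfence where SSE2 exists, or a lock-prefixed add to the stack otherwise. The same contract expressed with C++11 fences (a sketch):

    #include <atomic>

    inline void loadMemoryBarrier() {        // compiler-only on x86
      std::atomic_signal_fence(std::memory_order_acquire);
    }

    inline void storeStoreMemoryBarrier() {  // compiler-only on x86
      std::atomic_signal_fence(std::memory_order_release);
    }

    inline void storeLoadMemoryBarrier() {   // needs a real fence (mfence)
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }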
@@ -179,28 +181,24 @@ storeLoadMemoryBarrier()
 inline void
 loadMemoryBarrier()
 {
-  memoryBarrier();
+  __asm__ __volatile__("": : :"memory");
 }

 inline void
 syncInstructionCache(const void*, unsigned)
 {
-  // ignore
+  __asm__ __volatile__("": : :"memory");
 }

 #ifdef USE_ATOMIC_OPERATIONS
 inline bool
-atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+atomicCompareAndSwap32(uint32_t* p, uint32_t old, uint32_t new_)
 {
 #ifdef _MSC_VER
-# ifdef ARCH_x86_32
   InterlockedCompareExchange(p, new_, old);
-# elif defined ARCH_x86_64
-  InterlockedCompareExchange64(p, new_, old);
-# endif // ARCH_x86_64
 #elif (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
   return __sync_bool_compare_and_swap(p, old, new_);
-#elif defined ARCH_x86_32
+#else
   uint8_t result;

   __asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1"
@@ -209,7 +207,17 @@ atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
                        : "memory");

   return result != 0;
-#elif defined ARCH_x86_64
+#endif
+}
+
+inline bool
+atomicCompareAndSwap64(uint64_t* p, uint64_t old, uint64_t new_)
+{
+#ifdef _MSC_VER
+  InterlockedCompareExchange64(p, new_, old);
+#elif (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
+  return __sync_bool_compare_and_swap(p, old, new_);
+#else
   uint8_t result;

   __asm__ __volatile__("lock; cmpxchgq %2, %0; setz %1"
@@ -218,6 +226,16 @@ atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
                        : "memory");

   return result != 0;
 #endif
 }
+
+inline bool
+atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+{
+#ifdef ARCH_x86_32
+  return atomicCompareAndSwap32(reinterpret_cast<uint32_t*>(p), old, new_);
+#elif defined ARCH_x86_64
+  return atomicCompareAndSwap64(reinterpret_cast<uint64_t*>(p), old, new_);
+#endif // ARCH_x86_64
+}
 #endif // USE_ATOMIC_OPERATIONS
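One thing to flag in the _MSC_VER branches above: InterlockedCompareExchange returns the value initially at *p, and the function falls off the end without converting that result into the bool the callers test. Presumably the intent is something like this (a sketch, untested against the MSVC headers):

    #ifdef _MSC_VER
    #include <windows.h>

    inline bool atomicCompareAndSwap32(uint32_t* p, uint32_t old, uint32_t new_) {
      // the swap succeeded iff the initial value was the expected one
      return InterlockedCompareExchange(
          reinterpret_cast<volatile LONG*>(p), new_, old) == static_cast<LONG>(old);
    }
    #endif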
@@ -19,6 +19,8 @@ public class AllFloats {
   private static float complex(float a, float b) {return (a - b) / (a * b) + (float)Math.sqrt(a);}
   private static double complex(double a, double b) {return (a - b) / (a * b) + Math.sqrt(a);}
   private static double complex(float a, double b) {return (a - b) / (a * b) + Math.sqrt(a);}
+  private static double sqrt(double a) {return Math.sqrt(a);}
+  private static float complexNoIntrinsic(float a, float b) {return (a - b) / (a * b) + (float)sqrt(a);}
   private static int f2i(float a) {return (int)a;}
   private static long f2l(float a) {return (long)a;}
   private static float i2f(int a) {return (float)a;}
@@ -59,6 +61,7 @@ public class AllFloats {
     expect(complex(4f, 3f) == (4f-3f)/(4f*3f) + 2f);
     expect(complex(4d, 3d) == (4d-3d)/(4d*3d) + 2d);
     expect(complex(4f, 3d) == (4f-3d)/(4f*3d) + 2f);
+    expect(complexNoIntrinsic(4f, 3f) == (4f-3f)/(4f*3f) + 2f);

     expect(f2i(4f) == 4);
     expect(f2l(4f) == 4);
@@ -19,6 +19,20 @@ public class Floats {
     return a - b;
   }

+  private double field = 100d;
+
+  private static int doubleToInt(Floats f) {
+    return (int) f.field;
+  }
+
+  private static void multiplyAndStore(double a, double b, Floats f) {
+    f.field = a * b;
+  }
+
+  private static double loadAndMultiply(double a, Floats f) {
+    return f.field * a;
+  }
+
   public static void main(String[] args) {
     expect(multiply(0.5d, 0.5d) == 0.25d);
     expect(multiply(0.5f, 0.5f) == 0.25f);
@@ -50,10 +64,35 @@ public class Floats {
       expect(((int) d) == 1);
     }

+    { double d = 12345d;
+      expect(((int) d) == 12345);
+    }
+
+    expect(doubleToInt(new Floats()) == 100);
+
+    { Floats f = new Floats();
+      f.field = 32.0d;
+      expect(loadAndMultiply(2.0d, f) == 64.0d);
+    }
+
+    { Floats f = new Floats();
+      f.field = 32.0d;
+      expect(multiply(2.0d, f.field) == 64.0d);
+    }
+
+    { Floats f = new Floats();
+      multiplyAndStore(32.0d, 0.5d, f);
+      expect(f.field == 16.0d);
+    }
+
     { float f = 1f;
       expect(((int) f) == 1);
     }

+    { float f = 1f;
+      expect(((long) f) == 1);
+    }
+
     expect(Math.round(0.4f) == 0);
     expect(Math.round(0.5f) == 1);
     expect(Math.round(1.0f) == 1);
@@ -73,5 +112,20 @@ public class Floats {
       double d = (double) z;
       expect(d == 6553311036568663.0);
     }
+
+    { long z = 12345L;
+      float f = (float) z;
+      expect(f == 12345.0);
+    }
+
+    { int z = 12345;
+      float f = (float) z;
+      expect(f == 12345.0);
+    }
+
+    { int z = 12345;
+      double d = (double) z;
+      expect(d == 12345.0);
+    }
   }
 }