Merge branch 'wip' of oss.readytalk.com:/var/local/git/avian into wip
Commit 69c9bf5ff9
makefile (4 changed lines)
@@ -101,7 +101,7 @@ warnings = -Wall -Wextra -Werror -Wunused-parameter -Winit-self \
 common-cflags = $(warnings) -fno-rtti -fno-exceptions -fno-omit-frame-pointer \
 	"-I$(JAVA_HOME)/include" -idirafter $(src) -I$(native-build) \
 	-D__STDC_LIMIT_MACROS -D_JNI_IMPLEMENTATION_ -DAVIAN_VERSION=\"$(version)\" \
-	$(gnu-cflags)
+	-DUSE_ATOMIC_OPERATIONS $(gnu-cflags)
 
 build-cflags = $(common-cflags) -fPIC -fvisibility=hidden \
 	"-I$(JAVA_HOME)/include/linux" -I$(src) -pthread
@@ -240,10 +240,12 @@ ifeq ($(mode),small)
 	cflags += -Os -g3 -DNDEBUG
 endif
 
+ifneq ($(platform),darwin)
 ifeq ($(arch),i386)
 # this is necessary to support __sync_bool_compare_and_swap:
 	cflags += -march=i486
 endif
+endif
 
 output = -o $(1)
 as := $(cc)
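Note: -DUSE_ATOMIC_OPERATIONS switches the VM onto lock-free paths built on GCC's __sync intrinsics, and -march=i486 is needed because plain i386 has no cmpxchg instruction behind __sync_bool_compare_and_swap. A minimal standalone sketch (not part of this commit) of the kind of retry loop the define enables:

// sketch.cpp -- illustrative only; build with: g++ -march=i486 -DUSE_ATOMIC_OPERATIONS sketch.cpp
#include <stdint.h>
#include <stdio.h>

static uint32_t counter = 0;

void increment(uint32_t* p, uint32_t v)
{
#ifdef USE_ATOMIC_OPERATIONS
  // retry until no other thread modified *p between the read and the swap
  for (uint32_t old = *p; not __sync_bool_compare_and_swap(p, old, old + v); old = *p) { }
#else
  *p += v; // non-atomic fallback, as used when the define is absent
#endif
}

int main()
{
  increment(&counter, 1);
  printf("%u\n", counter);
  return 0;
}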
@@ -361,10 +361,9 @@ class Assembler {
    unsigned bSize, uint8_t* bTypeMask, uint64_t* bRegisterMask) = 0;
 
   virtual void planMove
-  (unsigned size,
-   uint8_t srcTypeMask, uint64_t srcRegisterMask,
-   uint8_t dstTypeMask, uint64_t dstRegisterMask,
-   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask) = 0;
+  (unsigned size, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
+   uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
+   uint8_t dstTypeMask, uint64_t dstRegisterMask) = 0;
 
   virtual void planSource
   (TernaryOperation op,
@@ -14,7 +14,11 @@
 #include "string.h"
 
 #include "sys/stat.h"
+#ifdef WIN32
+#include <windows.h>
+#else
 #include "sys/mman.h"
+#endif
 #include "fcntl.h"
 #include "unistd.h"
 
@@ -153,8 +157,29 @@ main(int argc, const char** argv)
     struct stat s;
     int r = fstat(fd, &s);
     if (r != -1) {
+#ifdef WIN32
+      HANDLE fm;
+      HANDLE h = (HANDLE) _get_osfhandle (fd);
+
+      fm = CreateFileMapping(
+          h,
+          NULL,
+          PAGE_READONLY,
+          0,
+          0,
+          NULL);
+      data = static_cast<uint8_t*>(MapViewOfFile(
+          fm,
+          FILE_MAP_READ,
+          0,
+          0,
+          s.st_size));
+
+      CloseHandle(fm);
+#else
       data = static_cast<uint8_t*>
         (mmap(0, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0));
+#endif
       size = s.st_size;
     }
     close(fd);
@@ -174,7 +199,11 @@ main(int argc, const char** argv)
       fprintf(stderr, "unable to open %s\n", argv[2]);
     }
 
+#ifdef WIN32
+    UnmapViewOfFile(data);
+#else
     munmap(data, size);
+#endif
   } else {
     perror(argv[0]);
   }
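The new Windows branch above maps the input file read-only with CreateFileMapping/MapViewOfFile where the POSIX branch uses mmap. A condensed, hedged sketch of that portable pattern (function name and error handling are illustrative, not from the commit):

#include <stdint.h>
#include <stddef.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef WIN32
#  include <windows.h>
#  include <io.h>
#else
#  include <sys/mman.h>
#endif

// Map an open file descriptor read-only; returns a pointer to the bytes or 0 on failure.
uint8_t* mapReadOnly(int fd, size_t size)
{
#ifdef WIN32
  HANDLE h = (HANDLE) _get_osfhandle(fd);
  HANDLE fm = CreateFileMapping(h, NULL, PAGE_READONLY, 0, 0, NULL);
  if (fm == NULL) return 0;
  uint8_t* data = static_cast<uint8_t*>
    (MapViewOfFile(fm, FILE_MAP_READ, 0, 0, size));
  CloseHandle(fm); // the view keeps the underlying mapping alive
  return data;
#else
  void* data = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
  return data == MAP_FAILED ? 0 : static_cast<uint8_t*>(data);
#endif
}

Unmapping mirrors the same split: UnmapViewOfFile(data) on Windows, munmap(data, size) elsewhere.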
@@ -325,14 +325,6 @@ markBit(uintptr_t* map, unsigned i)
   map[wordOf(i)] |= static_cast<uintptr_t>(1) << bitOf(i);
 }
 
-inline void
-markBitAtomic(uintptr_t* map, unsigned i)
-{
-  uintptr_t* p = map + wordOf(i);
-  uintptr_t v = static_cast<uintptr_t>(1) << bitOf(i);
-  while (not __sync_bool_compare_and_swap(p, *p, *p | v)) { }
-}
-
 inline void
 clearBit(uintptr_t* map, unsigned i)
 {
@@ -2934,7 +2934,7 @@ bool
 intrinsic(MyThread* t, Frame* frame, object target)
 {
 #define MATCH(name, constant) \
-  (byteArrayLength(t, name) - 1 == sizeof(constant) \
+  (byteArrayLength(t, name) == sizeof(constant) \
    and strcmp(reinterpret_cast<char*>(&byteArrayBody(t, name, 0)), \
               constant) == 0)
@@ -3040,6 +3040,7 @@ compile(MyThread* t, Frame* initialFrame, unsigned ip,
        (4, 4, c->memory
         (array, Compiler::FloatType, ArrayBody, index, 4), BytesPerWord));
       break;
 
     case iaload:
       frame->pushInt
        (c->load
@@ -5621,6 +5622,7 @@ compile(MyThread* t, Allocator* allocator, Context* context)
       frame.set(--index, Frame::Long);
       c->initLocal(2, index, Compiler::IntegerType);
       break;
 
     case 'D':
       frame.set(--index, Frame::Long);
       frame.set(--index, Frame::Long);
@@ -7260,7 +7262,7 @@ class MyProcessor: public Processor {
   class Visitor: public System::ThreadVisitor {
    public:
     Visitor(MyThread* t, MyProcessor* p, MyThread* target):
-      t(t), p(p), target(target)
+      t(t), p(p), target(target), trace(0)
     { }
 
     virtual void visit(void* ip, void* base, void* stack) {
src/compiler.cpp (792 changed lines; diff suppressed because it is too large)

src/heap.cpp (30 changed lines)
@@ -11,6 +11,7 @@
 #include "heap.h"
 #include "system.h"
 #include "common.h"
+#include "arch.h"
 
 using namespace vm;
 
@@ -69,6 +70,19 @@ System* system(Context*);
 void* tryAllocate(Context* c, unsigned size);
 void free(Context* c, const void* p, unsigned size);
 
+#ifdef USE_ATOMIC_OPERATIONS
+inline void
+markBitAtomic(uintptr_t* map, unsigned i)
+{
+  uintptr_t* p = map + wordOf(i);
+  uintptr_t v = static_cast<uintptr_t>(1) << bitOf(i);
+  for (uintptr_t old = *p;
+       not atomicCompareAndSwap(p, old, old | v);
+       old = *p)
+  { }
+}
+#endif // USE_ATOMIC_OPERATIONS
+
 inline void*
 get(void* o, unsigned offsetInWords)
 {
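The markBitAtomic added here re-reads the word and retries atomicCompareAndSwap until the OR sticks; unlike the version removed from the bitset helpers, it never passes a value it has not just read. The same shape, expressed with std::atomic purely for illustration (not the VM's types):

#include <atomic>
#include <cstdint>

inline void markBitAtomicSketch(std::atomic<uintptr_t>* map, unsigned i)
{
  const unsigned bitsPerWord = sizeof(uintptr_t) * 8;
  std::atomic<uintptr_t>& word = map[i / bitsPerWord];
  const uintptr_t bit = static_cast<uintptr_t>(1) << (i % bitsPerWord);

  uintptr_t old = word.load();
  // compare_exchange_weak reloads 'old' on failure, so the loop converges
  while (not word.compare_exchange_weak(old, old | bit)) { }
}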
@@ -303,12 +317,14 @@ class Segment {
       if (child) child->set(p, v);
     }
 
+#ifdef USE_ATOMIC_OPERATIONS
     void markAtomic(void* p) {
       assert(segment->context, bitsPerRecord == 1);
       markBitAtomic(data, indexOf(p));
       assert(segment->context, getBit(data, indexOf(p)));
       if (child) child->markAtomic(p);
     }
+#endif
 
     unsigned get(void* p) {
       return getBits(data, bitsPerRecord, indexOf(p));
@@ -1020,7 +1036,9 @@ void
 markDirty(Context* c, Fixie* f)
 {
   if (not f->dirty) {
+#ifdef USE_ATOMIC_OPERATIONS
     ACQUIRE(c->lock);
+#endif
 
     if (not f->dirty) {
       f->dirty = true;
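markDirty now re-tests f->dirty after taking the heap lock, the usual double-checked pattern, so the common already-dirty case pays nothing. A minimal self-contained sketch of that shape (std::mutex and Item stand in for the VM's lock and Fixie; illustrative only):

#include <mutex>

struct Item { bool dirty = false; Item* next = nullptr; };

void markDirtySketch(std::mutex& lock, Item* f, Item*& dirtyList)
{
  if (not f->dirty) {                    // cheap unsynchronized check
    std::lock_guard<std::mutex> g(lock); // lock only when it looks clean
    if (not f->dirty) {                  // re-check under the lock
      f->dirty = true;
      f->next = dirtyList;               // e.g. queue it for the collector
      dirtyList = f;
    }
  }
}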
@@ -1816,6 +1834,10 @@ class MyHeap: public Heap {
 
   virtual void mark(void* p, unsigned offset, unsigned count) {
     if (needsMark(p)) {
+#ifndef USE_ATOMIC_OPERATIONS
       ACQUIRE(c.lock);
+#endif
 
       if (c.client->isFixed(p)) {
         Fixie* f = fixie(p);
         assert(&c, offset == 0 or f->hasMask);
@@ -1830,7 +1852,11 @@ class MyHeap: public Heap {
           }
 
           dirty = true;
+#ifdef USE_ATOMIC_OPERATIONS
+          markBitAtomic(f->mask(), offset + i);
+#else
           markBit(f->mask(), offset + i);
+#endif
           assert(&c, getBit(f->mask(), offset + i));
         }
       }
@@ -1848,7 +1874,11 @@ class MyHeap: public Heap {
       for (unsigned i = 0; i < count; ++i) {
         void** target = static_cast<void**>(p) + offset + i;
         if (targetNeedsMark(mask(*target))) {
+#ifdef USE_ATOMIC_OPERATIONS
+          map->markAtomic(target);
+#else
           map->set(target);
+#endif
         }
       }
     }
@@ -2173,10 +2173,7 @@ JNI_CreateJavaVM(Machine** m, Thread** t, void* args)
   System* s = makeSystem(crashDumpDirectory);
   Heap* h = makeHeap(s, heapLimit);
   Finder* f = makeFinder(s, RUNTIME_ARRAY_BODY(classpathBuffer), bootLibrary);
-  Processor* p = makeProcessor(s, h, false); // change back to true
-                                             // once use of SSE is
-                                             // fixed on 32-bit
-                                             // systems
+  Processor* p = makeProcessor(s, h, true);
 
   const char** properties = static_cast<const char**>
     (h->allocate(sizeof(const char*) * propertyCount));
src/machine.cpp (117 changed lines)
@@ -14,6 +14,7 @@
 #include "stream.h"
 #include "constants.h"
 #include "processor.h"
+#include "arch.h"
 
 using namespace vm;
 
@@ -21,6 +22,17 @@ namespace {
 
 const unsigned NoByte = 0xFFFF;
 
+#ifdef USE_ATOMIC_OPERATIONS
+void
+atomicIncrement(uint32_t* p, int v)
+{
+  for (uint32_t old = *p;
+       not atomicCompareAndSwap32(p, old, old + v);
+       old = *p)
+  { }
+}
+#endif
+
 bool
 find(Thread* t, Thread* o)
 {
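atomicIncrement builds an atomic add out of atomicCompareAndSwap32. On compilers that provide the __sync builtins the same effect is available directly; a sketch for comparison (illustrative, not the VM's code):

#include <stdint.h>

void incrementViaCas(uint32_t* p, int v)
{
  // retry with a freshly read value, exactly as machine.cpp does
  for (uint32_t old = *p;
       not __sync_bool_compare_and_swap(p, old, old + v);
       old = *p)
  { }
}

void incrementViaFetchAdd(uint32_t* p, int v)
{
  __sync_fetch_and_add(p, v); // typically a single lock xadd on x86
}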
@@ -2319,20 +2331,33 @@ enter(Thread* t, Thread::State s)
     return;
   }
 
+#ifdef USE_ATOMIC_OPERATIONS
+# define INCREMENT atomicIncrement
+# define ACQUIRE_LOCK ACQUIRE_RAW(t, t->m->stateLock)
+# define BARRIER memoryBarrier()
+#else
+# define INCREMENT(pointer, value) *(pointer) += value;
+# define ACQUIRE_LOCK
+# define BARRIER
+
   ACQUIRE_RAW(t, t->m->stateLock);
+#endif // not USE_ATOMIC_OPERATIONS
 
   switch (s) {
   case Thread::ExclusiveState: {
+    ACQUIRE_LOCK;
+
     while (t->m->exclusive) {
       // another thread got here first.
       ENTER(t, Thread::IdleState);
       t->m->stateLock->wait(t->systemThread, 0);
     }
 
     switch (t->state) {
     case Thread::ActiveState: break;
 
     case Thread::IdleState: {
-      ++ t->m->activeCount;
+      INCREMENT(&(t->m->activeCount), 1);
     } break;
 
     default: abort(t);
@@ -2340,14 +2365,35 @@ enter(Thread* t, Thread::State s)
     t->state = Thread::ExclusiveState;
     t->m->exclusive = t;
 
+    BARRIER;
+
     while (t->m->activeCount > 1) {
       t->m->stateLock->wait(t->systemThread, 0);
     }
   } break;
 
   case Thread::IdleState:
+    if (t->state == Thread::ActiveState) {
+      // fast path
+      assert(t, t->m->activeCount > 0);
+      INCREMENT(&(t->m->activeCount), -1);
+
+      t->state = s;
+
+      if (t->m->exclusive) {
+        ACQUIRE_LOCK;
+
+        t->m->stateLock->notifyAll(t->systemThread);
+      }
+      break;
+    } else {
+      // fall through to slow path
+    }
+
   case Thread::ZombieState: {
+    ACQUIRE_LOCK;
+
     switch (t->state) {
     case Thread::ExclusiveState: {
       assert(t, t->m->exclusive == t);
@@ -2360,7 +2406,7 @@ enter(Thread* t, Thread::State s)
     }
 
     assert(t, t->m->activeCount > 0);
-    -- t->m->activeCount;
+    INCREMENT(&(t->m->activeCount), -1);
 
     if (s == Thread::ZombieState) {
       assert(t, t->m->liveCount > 0);
@@ -2375,35 +2421,54 @@ enter(Thread* t, Thread::State s)
       t->m->stateLock->notifyAll(t->systemThread);
     } break;
 
   case Thread::ActiveState: {
     switch (t->state) {
     case Thread::ExclusiveState: {
       assert(t, t->m->exclusive == t);
   case Thread::ActiveState:
     if (t->state == Thread::IdleState and t->m->exclusive == 0) {
       // fast path
       INCREMENT(&(t->m->activeCount), 1);
 
       t->state = s;
       t->m->exclusive = 0;
 
       t->m->stateLock->notifyAll(t->systemThread);
     } break;
 
     case Thread::NoState:
     case Thread::IdleState: {
       while (t->m->exclusive) {
         t->m->stateLock->wait(t->systemThread, 0);
       if (t->m->exclusive) {
         // another thread has entered the exclusive state, so we
         // return to idle and use the slow path to become active
         enter(t, Thread::IdleState);
       } else {
         break;
       }
 
       ++ t->m->activeCount;
       if (t->state == Thread::NoState) {
         ++ t->m->liveCount;
       }
       t->state = s;
     } break;
 
     default: abort(t);
     }
   } break;
 
     { ACQUIRE_LOCK;
 
       switch (t->state) {
       case Thread::ExclusiveState: {
         assert(t, t->m->exclusive == t);
 
         t->state = s;
         t->m->exclusive = 0;
 
         t->m->stateLock->notifyAll(t->systemThread);
       } break;
 
       case Thread::NoState:
       case Thread::IdleState: {
         while (t->m->exclusive) {
           t->m->stateLock->wait(t->systemThread, 0);
         }
 
         INCREMENT(&(t->m->activeCount), 1);
         if (t->state == Thread::NoState) {
           ++ t->m->liveCount;
         }
         t->state = s;
       } break;
 
       default: abort(t);
       }
     } break;
 
   case Thread::ExitState: {
     ACQUIRE_LOCK;
 
     switch (t->state) {
     case Thread::ExclusiveState: {
       assert(t, t->m->exclusive == t);
@@ -2418,7 +2483,7 @@ enter(Thread* t, Thread::State s)
     }
 
     assert(t, t->m->activeCount > 0);
-    -- t->m->activeCount;
+    INCREMENT(&(t->m->activeCount), -1);
 
     t->state = s;
 
@@ -2096,19 +2096,20 @@ class MyArchitecture: public Assembler::Architecture {
   }
 
   virtual void planMove
   (unsigned,
    uint8_t srcTypeMask, uint64_t srcRegisterMask,
    uint8_t dstTypeMask, uint64_t,
    uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask)
   (unsigned size, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
    uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
    uint8_t dstTypeMask, uint64_t dstRegisterMask)
   {
     *tmpTypeMask = srcTypeMask;
     *tmpRegisterMask = srcRegisterMask;
     *srcTypeMask = ~0;
     *srcRegisterMask = ~static_cast<uint64_t>(0);
 
     if ((dstTypeMask & (1 << MemoryOperand))
         and (srcTypeMask & ((1 << MemoryOperand) | 1 << AddressOperand)))
     {
       // can't move directly from memory to memory
       *tmpTypeMask = (1 << RegisterOperand);
     *tmpTypeMask = 0;
     *tmpRegisterMask = 0;
 
     if (dstTypeMask & (1 << MemoryOperand)) {
       // can't move directly from memory or constant to memory
       *srcTypeMask = 1 << RegisterOperand;
       *tmpTypeMask = 1 << RegisterOperand;
       *tmpRegisterMask = ~static_cast<uint64_t>(0);
     }
   }
@@ -90,6 +90,42 @@ syncInstructionCache(const void* start, unsigned size)
   __asm__ __volatile__("isync");
 }
 
+#ifdef USE_ATOMIC_OPERATIONS
+inline bool
+atomicCompareAndSwap32(uint32_t* p, uint32_t old, uint32_t new_)
+{
+#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
+  return __sync_bool_compare_and_swap(p, old, new_);
+#else // not GCC >= 4.1
+  bool result;
+
+  __asm__ __volatile__("  sync\n"
+                       "1:\n"
+                       "  lwarx %0,0,%2\n"
+                       "  cmpw %0,%3\n"
+                       "  bne- 2f\n"
+                       "  stwcx. %4,0,%2\n"
+                       "  bne- 1b\n"
+                       "  isync \n"
+                       "2:\n"
+                       "  xor %0,%0,%3\n"
+                       "  cntlzw %0,%0\n"
+                       "  srwi %0,%0,5\n"
+                       : "=&r"(result), "+m"(*p)
+                       : "r"(p), "r"(old), "r"(new_)
+                       : "cc", "memory");
+
+  return result;
+#endif // not GCC >= 4.1
+}
+
+inline bool
+atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+{
+  return atomicCompareAndSwap32(p, old, new_);
+}
+#endif // USE_ATOMIC_OPERATIONS
+
 inline uint64_t
 dynamicCall(void* function, uintptr_t* arguments, uint8_t* argumentTypes,
             unsigned argumentCount, unsigned argumentsSize,
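The PowerPC atomicCompareAndSwap32 above retries a lwarx/stwcx. sequence (or defers to the GCC builtin on GCC >= 4.1). For illustration, a tiny test-and-set spinlock built on such a CAS primitive; a sketch only, not code from this commit, with __sync_bool_compare_and_swap standing in for the VM's function:

#include <stdint.h>

inline bool cas32(uint32_t* p, uint32_t old, uint32_t new_)
{
  return __sync_bool_compare_and_swap(p, old, new_);
}

struct SpinLock {
  uint32_t held;

  SpinLock(): held(0) { }

  void acquire() {
    // spin until we win the 0 -> 1 transition
    while (not cas32(&held, 0, 1)) { }
  }

  void release() {
    __sync_synchronize(); // full barrier before publishing the unlock
    held = 0;
  }
};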
@@ -616,26 +616,32 @@ class MySystem: public System {
 
     ACQUIRE(this, mutex);
 
     bool success = false;
     int rv = SuspendThread(target->thread);
     expect(this, rv != -1);
     if (rv != -1) {
       CONTEXT context;
       memset(&context, 0, sizeof(CONTEXT));
       context.ContextFlags = CONTEXT_CONTROL;
       rv = GetThreadContext(target->thread, &context);
 
     CONTEXT context;
     rv = GetThreadContext(target->thread, &context);
     expect(this, rv);
       if (rv) {
 #ifdef ARCH_x86_32
     visitor->visit(reinterpret_cast<void*>(context.Eip),
                    reinterpret_cast<void*>(context.Ebp),
                    reinterpret_cast<void*>(context.Esp));
         visitor->visit(reinterpret_cast<void*>(context.Eip),
                        reinterpret_cast<void*>(context.Ebp),
                        reinterpret_cast<void*>(context.Esp));
 #elif defined ARCH_x86_64
     visitor->visit(reinterpret_cast<void*>(context.Rip),
                    reinterpret_cast<void*>(context.Rbp),
                    reinterpret_cast<void*>(context.Rsp));
         visitor->visit(reinterpret_cast<void*>(context.Rip),
                        reinterpret_cast<void*>(context.Rbp),
                        reinterpret_cast<void*>(context.Rsp));
 #endif
         success = true;
       }
 
     rv = ResumeThread(target->thread);
     expect(this, rv != -1);
       rv = ResumeThread(target->thread);
       expect(this, rv != -1);
     }
 
     return 0;
     return (success ? 0 : 1);
   }
 
   virtual uint64_t call(void* function, uintptr_t* arguments, uint8_t* types,
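The reworked visit() checks the results of SuspendThread and GetThreadContext and zeroes the CONTEXT with ContextFlags set before use. A condensed sketch of that Win32 call sequence (a free function for illustration; not the VM's class):

#ifdef WIN32
#include <windows.h>
#include <string.h>

// Suspend a thread, capture its instruction and stack pointers, resume it.
bool sampleThread(HANDLE thread, void** ip, void** stack)
{
  if (SuspendThread(thread) == (DWORD) -1) return false;

  CONTEXT context;
  memset(&context, 0, sizeof(CONTEXT));
  context.ContextFlags = CONTEXT_CONTROL; // request Eip/Esp (or Rip/Rsp)

  bool success = false;
  if (GetThreadContext(thread, &context)) {
#ifdef _WIN64
    *ip = reinterpret_cast<void*>(context.Rip);
    *stack = reinterpret_cast<void*>(context.Rsp);
#else
    *ip = reinterpret_cast<void*>(context.Eip);
    *stack = reinterpret_cast<void*>(context.Esp);
#endif
    success = true;
  }

  ResumeThread(thread);
  return success;
}
#endif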
src/x86.cpp (72 changed lines)
@@ -946,6 +946,9 @@ void
 sseMoveRR(Context* c, unsigned aSize, Assembler::Register* a,
           unsigned bSize UNUSED, Assembler::Register* b)
 {
+  assert(c, aSize >= 4);
+  assert(c, aSize == bSize);
+
   if (floatReg(a) and floatReg(b)) {
     if (aSize == 4) {
       opcode(c, 0xf3);
@@ -1090,6 +1093,9 @@ void
 sseMoveMR(Context* c, unsigned aSize, Assembler::Memory* a,
           unsigned bSize UNUSED, Assembler::Register* b)
 {
+  assert(c, aSize >= 4);
+  assert(c, aSize == bSize);
+
   if (BytesPerWord == 4 and aSize == 8) {
     opcode(c, 0xf3);
     opcode(c, 0x0f, 0x7e);
@@ -1165,6 +1171,7 @@ void
 sseMoveRM(Context* c, unsigned aSize, Assembler::Register* a,
           UNUSED unsigned bSize, Assembler::Memory* b)
 {
   assert(c, aSize >= 4);
+  assert(c, aSize == bSize);
 
   if (BytesPerWord == 4 and aSize == 8) {
@@ -2921,7 +2928,7 @@ class MyArchitecture: public Assembler::Architecture {
       break;
 
     case Float2Int:
-      if (useSSE(&c) and (bSize <= BytesPerWord)) {
+      if (useSSE(&c) and bSize <= BytesPerWord) {
        *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
        *aRegisterMask = (static_cast<uint64_t>(FloatRegisterMask) << 32)
          | FloatRegisterMask;
@@ -2931,7 +2938,7 @@ class MyArchitecture: public Assembler::Architecture {
       break;
 
     case Int2Float:
-      if (useSSE(&c) and (aSize <= BytesPerWord)) {
+      if (useSSE(&c) and aSize <= BytesPerWord) {
        *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
        *aRegisterMask = GeneralRegisterMask
          | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
@@ -2941,9 +2948,8 @@ class MyArchitecture: public Assembler::Architecture {
       break;
 
     case Move:
-      *aTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
-      *aRegisterMask = GeneralRegisterMask
-        | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
+      *aTypeMask = ~0;
+      *aRegisterMask = ~static_cast<uint64_t>(0);
 
       if (BytesPerWord == 4) {
         if (aSize == 4 and bSize == 8) {
@@ -3039,38 +3045,46 @@ class MyArchitecture: public Assembler::Architecture {
   }
 
   virtual void planMove
   (unsigned size,
    uint8_t srcTypeMask, uint64_t srcRegisterMask,
    uint8_t dstTypeMask, uint64_t dstRegisterMask,
    uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask)
   (unsigned size, uint8_t* srcTypeMask, uint64_t* srcRegisterMask,
    uint8_t* tmpTypeMask, uint64_t* tmpRegisterMask,
    uint8_t dstTypeMask, uint64_t dstRegisterMask)
   {
     *tmpTypeMask = srcTypeMask;
     *tmpRegisterMask = srcRegisterMask;
     *srcTypeMask = ~0;
     *srcRegisterMask = ~static_cast<uint64_t>(0);
 
     if ((dstTypeMask & (1 << MemoryOperand))
         and (srcTypeMask & ((1 << MemoryOperand) | 1 << AddressOperand)))
     {
       *tmpTypeMask = 0;
       *tmpRegisterMask = 0;
 
       if (dstTypeMask & (1 << MemoryOperand)) {
         // can't move directly from memory to memory
         *tmpTypeMask = (1 << RegisterOperand);
         *srcTypeMask = (1 << RegisterOperand) | (1 << ConstantOperand);
         *tmpTypeMask = 1 << RegisterOperand;
         *tmpRegisterMask = GeneralRegisterMask
           | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
       } else if (dstTypeMask & (1 << RegisterOperand)) {
         if (srcTypeMask & (1 << RegisterOperand)) {
           if (size != BytesPerWord
               and (((dstRegisterMask & FloatRegisterMask) == 0)
                    xor ((srcRegisterMask & FloatRegisterMask) == 0)))
           {
             // can't move directly from FPR to GPR or vice-versa for
             // values larger than the GPR size
             *tmpTypeMask = (1 << MemoryOperand);
             *tmpRegisterMask = 0;
           if (size > BytesPerWord) {
             // can't move directly from FPR to GPR or vice-versa for
             // values larger than the GPR size
             if (dstRegisterMask & FloatRegisterMask) {
               *srcRegisterMask = FloatRegisterMask
                 | (static_cast<uint64_t>(FloatRegisterMask) << 32);
               *tmpTypeMask = 1 << MemoryOperand;
             } else if (dstRegisterMask & GeneralRegisterMask) {
               *srcRegisterMask = GeneralRegisterMask
                 | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
               *tmpTypeMask = 1 << MemoryOperand;
             }
           } else if ((dstRegisterMask & FloatRegisterMask)
                      and (srcTypeMask & (1 << ConstantOperand)))
           {
           }
           if (dstRegisterMask & FloatRegisterMask) {
             // can't move directly from constant to FPR
             *tmpTypeMask = (1 << MemoryOperand);
             *tmpRegisterMask = 0;
             *srcTypeMask &= ~(1 << ConstantOperand);
             if (size > BytesPerWord) {
               *tmpTypeMask = 1 << MemoryOperand;
             } else {
               *tmpTypeMask = (1 << RegisterOperand) | (1 << MemoryOperand);
               *tmpRegisterMask = GeneralRegisterMask
                 | (static_cast<uint64_t>(GeneralRegisterMask) << 32);
             }
           }
         }
       }
     }
src/x86.h (58 changed lines)
@@ -159,9 +159,11 @@ memoryBarrier()
 {
 #ifdef _MSC_VER
   MemoryBarrier();
-#else
-  __asm__ __volatile__("": : :"memory");
-#endif
+#elif defined ARCH_x86_32
+  __asm__ __volatile__("lock; addl $0,0(%%esp)": : :"memory");
+#elif defined ARCH_x86_64
+  __asm__ __volatile__("mfence": : :"memory");
+#endif // ARCH_x86_64
 }
 
 inline void
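memoryBarrier() now emits a real fence (a lock-prefixed add on x86-32, mfence on x86-64) rather than only a compiler barrier; enter()'s fast path depends on the store to m->exclusive being visible before activeCount is read. The same ordering, written with C++11 fences purely as a sketch (the VM uses the inline assembly above):

#include <atomic>

std::atomic<int> exclusive(0);
std::atomic<int> activeCount(1);

bool tryBecomeExclusive()
{
  exclusive.store(1, std::memory_order_relaxed);
  // full fence: publish the store above before the load below,
  // which is what the lock-prefixed add / mfence guarantees on x86
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return activeCount.load(std::memory_order_relaxed) == 1;
}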
@@ -188,6 +190,56 @@ syncInstructionCache(const void*, unsigned)
   // ignore
 }
 
+#ifdef USE_ATOMIC_OPERATIONS
+inline bool
+atomicCompareAndSwap32(uint32_t* p, uint32_t old, uint32_t new_)
+{
+#ifdef _MSC_VER
+  InterlockedCompareExchange(p, new_, old);
+#elif (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
+  return __sync_bool_compare_and_swap(p, old, new_);
+#else
+  uint8_t result;
+
+  __asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1"
+                       : "=m"(*p), "=q"(result)
+                       : "r"(new_), "a"(old), "m"(*p)
+                       : "memory");
+
+  return result != 0;
+#endif
+}
+
+inline bool
+atomicCompareAndSwap64(uint64_t* p, uint64_t old, uint64_t new_)
+{
+#ifdef _MSC_VER
+  InterlockedCompareExchange64(p, new_, old);
+#elif (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 1)
+  return __sync_bool_compare_and_swap(p, old, new_);
+#else
+  uint8_t result;
+
+  __asm__ __volatile__("lock; cmpxchgq %2, %0; setz %1"
+                       : "=m"(*p), "=q"(result)
+                       : "r"(new_), "a"(old), "m"(*p)
+                       : "memory");
+
+  return result != 0;
+#endif
+}
+
+inline bool
+atomicCompareAndSwap(uintptr_t* p, uintptr_t old, uintptr_t new_)
+{
+#ifdef ARCH_x86_32
+  return atomicCompareAndSwap32(p, old, new_);
+#elif defined ARCH_x86_64
+  return atomicCompareAndSwap64(p, old, new_);
+#endif // ARCH_x86_64
+}
+#endif // USE_ATOMIC_OPERATIONS
+
 } // namespace vm
 
 #endif//X86_H
@@ -19,6 +19,8 @@ public class AllFloats {
   private static float complex(float a, float b) {return (a - b) / (a * b) + (float)Math.sqrt(a);}
   private static double complex(double a, double b) {return (a - b) / (a * b) + Math.sqrt(a);}
   private static double complex(float a, double b) {return (a - b) / (a * b) + Math.sqrt(a);}
+  private static double sqrt(double a) {return Math.sqrt(a);}
+  private static float complexNoIntrinsic(float a, float b) {return (a - b) / (a * b) + (float)sqrt(a);}
   private static int f2i(float a) {return (int)a;}
   private static long f2l(float a) {return (long)a;}
   private static float i2f(int a) {return (float)a;}
@@ -59,6 +61,7 @@ public class AllFloats {
     expect(complex(4f, 3f) == (4f-3f)/(4f*3f) + 2f);
     expect(complex(4d, 3d) == (4d-3d)/(4d*3d) + 2d);
     expect(complex(4f, 3d) == (4f-3d)/(4f*3d) + 2f);
+    expect(complexNoIntrinsic(4f, 3f) == (4f-3f)/(4f*3f) + 2f);
 
     expect(f2i(4f) == 4);
     expect(f2l(4f) == 4);
@@ -19,6 +19,20 @@ public class Floats {
     return a - b;
   }
 
+  private double field = 100d;
+
+  private static int doubleToInt(Floats f) {
+    return (int) f.field;
+  }
+
+  private static void multiplyAndStore(double a, double b, Floats f) {
+    f.field = a * b;
+  }
+
+  private static double loadAndMultiply(double a, Floats f) {
+    return f.field * a;
+  }
+
   public static void main(String[] args) {
     expect(multiply(0.5d, 0.5d) == 0.25d);
     expect(multiply(0.5f, 0.5f) == 0.25f);
@@ -50,10 +64,35 @@ public class Floats {
       expect(((int) d) == 1);
     }
 
+    { double d = 12345d;
+      expect(((int) d) == 12345);
+    }
+
+    expect(doubleToInt(new Floats()) == 100);
+
+    { Floats f = new Floats();
+      f.field = 32.0d;
+      expect(loadAndMultiply(2.0d, f) == 64.0d);
+    }
+
+    { Floats f = new Floats();
+      f.field = 32.0d;
+      expect(multiply(2.0d, f.field) == 64.0d);
+    }
+
+    { Floats f = new Floats();
+      multiplyAndStore(32.0d, 0.5d, f);
+      expect(f.field == 16.0d);
+    }
+
     { float f = 1f;
       expect(((int) f) == 1);
     }
 
+    { float f = 1f;
+      expect(((long) f) == 1);
+    }
+
     expect(Math.round(0.4f) == 0);
     expect(Math.round(0.5f) == 1);
     expect(Math.round(1.0f) == 1);
@@ -73,5 +112,20 @@ public class Floats {
       double d = (double) z;
       expect(d == 6553311036568663.0);
     }
+
+    { long z = 12345L;
+      float f = (float) z;
+      expect(f == 12345.0);
+    }
+
+    { int z = 12345;
+      float f = (float) z;
+      expect(f == 12345.0);
+    }
+
+    { int z = 12345;
+      double d = (double) z;
+      expect(d == 12345.0);
+    }
   }
 }