[Pcre-svn] [1201] code/trunk/sljit: JIT compiler update.

Kezdőlap
Üzenet törlése
Szerző: Subversion repository
Dátum:  
Címzett: pcre-svn
Tárgy: [Pcre-svn] [1201] code/trunk/sljit: JIT compiler update.
Revision: 1201
          http://vcs.pcre.org/viewvc?view=rev&revision=1201
Author:   zherczeg
Date:     2012-11-04 06:11:18 +0000 (Sun, 04 Nov 2012)


Log Message:
-----------
JIT compiler update.

Modified Paths:
--------------
    code/trunk/sljit/sljitLir.c
    code/trunk/sljit/sljitNativeX86_32.c
    code/trunk/sljit/sljitNativeX86_64.c
    code/trunk/sljit/sljitNativeX86_common.c


Modified: code/trunk/sljit/sljitLir.c
===================================================================
--- code/trunk/sljit/sljitLir.c    2012-11-03 19:21:41 UTC (rev 1200)
+++ code/trunk/sljit/sljitLir.c    2012-11-04 06:11:18 UTC (rev 1201)
@@ -198,7 +198,7 @@
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 #define SLJIT_HAS_FIXED_LOCALS_OFFSET 1
 #ifdef _WIN64
-#define FIXED_LOCALS_OFFSET (4 * sizeof(sljit_sw))
+#define FIXED_LOCALS_OFFSET ((4 + 2) * sizeof(sljit_sw))
 #else
 #define FIXED_LOCALS_OFFSET (sizeof(sljit_sw))
 #endif
@@ -1115,7 +1115,7 @@
     SLJIT_UNUSED_ARG(src2);
     SLJIT_UNUSED_ARG(src2w);


-    SLJIT_ASSERT(!(type & ~(0xff | SLJIT_INT_OP | SLJIT_REWRITABLE_JUMP)));
+    SLJIT_ASSERT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_INT_OP)));
     SLJIT_ASSERT((type & 0xff) >= SLJIT_C_EQUAL && (type & 0xff) <= SLJIT_C_SIG_LESS_EQUAL);
 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
     FUNCTION_CHECK_SRC(src1, src1w);
@@ -1170,6 +1170,13 @@
     SLJIT_UNUSED_ARG(src);
     SLJIT_UNUSED_ARG(srcw);


+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
+    if (SLJIT_UNLIKELY(compiler->skip_checks)) {
+        compiler->skip_checks = 0;
+        return;
+    }
+#endif
+
     SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL3);
 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
     FUNCTION_CHECK_SRC(src, srcw);


Modified: code/trunk/sljit/sljitNativeX86_32.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_32.c    2012-11-03 19:21:41 UTC (rev 1200)
+++ code/trunk/sljit/sljitNativeX86_32.c    2012-11-04 06:11:18 UTC (rev 1201)
@@ -28,28 +28,28 @@


 static sljit_si emit_do_imm(struct sljit_compiler *compiler, sljit_ub opcode, sljit_sw imm)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 1 + sizeof(sljit_sw));
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + sizeof(sljit_sw));
+    FAIL_IF(!inst);
     INC_SIZE(1 + sizeof(sljit_sw));
-    *buf++ = opcode;
-    *(sljit_sw*)buf = imm;
+    *inst++ = opcode;
+    *(sljit_sw*)inst = imm;
     return SLJIT_SUCCESS;
 }


 static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type)
 {
     if (type == SLJIT_JUMP) {
-        *code_ptr++ = 0xe9;
+        *code_ptr++ = JMP_i32;
         jump->addr++;
     }
     else if (type >= SLJIT_FAST_CALL) {
-        *code_ptr++ = 0xe8;
+        *code_ptr++ = CALL_i32;
         jump->addr++;
     }
     else {
-        *code_ptr++ = 0x0f;
+        *code_ptr++ = GROUP_0F;
         *code_ptr++ = get_jump_code(type);
         jump->addr += 2;
     }
@@ -67,7 +67,7 @@
 {
     sljit_si size;
     sljit_si locals_offset;
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_enter(compiler, args, temporaries, saveds, local_size);
@@ -85,15 +85,15 @@
 #else
     size = 1 + (saveds <= 3 ? saveds : 3) + (args > 0 ? (2 + args * 3) : 0);
 #endif
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+    FAIL_IF(!inst);


     INC_SIZE(size);
     PUSH_REG(reg_map[TMP_REGISTER]);
 #if !(defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (args > 0) {
-        *buf++ = 0x8b;
-        *buf++ = 0xc4 | (reg_map[TMP_REGISTER] << 3);
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (reg_map[TMP_REGISTER] << 3) | 0x4 /* esp */;
     }
 #endif
     if (saveds > 2)
@@ -105,34 +105,34 @@


 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (args > 0) {
-        *buf++ = 0x8b;
-        *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[SLJIT_TEMPORARY_REG3];
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[SLJIT_TEMPORARY_REG3];
     }
     if (args > 1) {
-        *buf++ = 0x8b;
-        *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[SLJIT_TEMPORARY_REG2];
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[SLJIT_TEMPORARY_REG2];
     }
     if (args > 2) {
-        *buf++ = 0x8b;
-        *buf++ = 0x44 | (reg_map[SLJIT_SAVED_REG3] << 3);
-        *buf++ = 0x24;
-        *buf++ = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG3] << 3) | 0x4 /* esp */;
+        *inst++ = 0x24;
+        *inst++ = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
     }
 #else
     if (args > 0) {
-        *buf++ = 0x8b;
-        *buf++ = 0x40 | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[TMP_REGISTER];
-        *buf++ = sizeof(sljit_sw) * 2;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = sizeof(sljit_sw) * 2;
     }
     if (args > 1) {
-        *buf++ = 0x8b;
-        *buf++ = 0x40 | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[TMP_REGISTER];
-        *buf++ = sizeof(sljit_sw) * 3;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = sizeof(sljit_sw) * 3;
     }
     if (args > 2) {
-        *buf++ = 0x8b;
-        *buf++ = 0x40 | (reg_map[SLJIT_SAVED_REG3] << 3) | reg_map[TMP_REGISTER];
-        *buf++ = sizeof(sljit_sw) * 4;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG3] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = sizeof(sljit_sw) * 4;
     }
 #endif


@@ -148,14 +148,14 @@

 #ifdef _WIN32
     if (local_size > 1024) {
-        FAIL_IF(emit_do_imm(compiler, 0xb8 + reg_map[SLJIT_TEMPORARY_REG1], local_size));
+        FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_TEMPORARY_REG1], local_size));
         FAIL_IF(sljit_emit_ijump(compiler, SLJIT_CALL1, SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
     }
 #endif


     compiler->local_size = local_size;
     SLJIT_ASSERT(local_size > 0);
-    return emit_non_cum_binary(compiler, 0x2b, 0x29, 0x5 << 3, 0x2d,
+    return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
         SLJIT_LOCALS_REG, 0, SLJIT_LOCALS_REG, 0, SLJIT_IMM, local_size);


     return SLJIT_SUCCESS;
@@ -189,7 +189,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw)
 {
     sljit_si size;
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_return(compiler, op, src, srcw);
@@ -199,7 +199,7 @@
     FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));


     SLJIT_ASSERT(compiler->local_size > 0);
-    FAIL_IF(emit_cum_binary(compiler, 0x03, 0x01, 0x0 << 3, 0x05,
+    FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
         SLJIT_LOCALS_REG, 0, SLJIT_LOCALS_REG, 0, SLJIT_IMM, compiler->local_size));


     size = 2 + (compiler->saveds <= 3 ? compiler->saveds : 3);
@@ -210,8 +210,8 @@
     if (compiler->args > 0)
         size += 2;
 #endif
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+    FAIL_IF(!inst);


     INC_SIZE(size);


@@ -224,12 +224,12 @@
     POP_REG(reg_map[TMP_REGISTER]);
 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (compiler->args > 2)
-        RETN(sizeof(sljit_sw));
+        RET_I16(sizeof(sljit_sw));
     else
         RET();
 #else
     if (compiler->args > 0)
-        RETN(compiler->args * sizeof(sljit_sw));
+        RET_I16(compiler->args * sizeof(sljit_sw));
     else
         RET();
 #endif
@@ -248,7 +248,7 @@
     /* The general operand (not immediate). */
     sljit_si b, sljit_sw immb)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     sljit_ub *buf_ptr;
     sljit_si flags = size & ~0xf;
     sljit_si inst_size;
@@ -322,26 +322,26 @@
     else
         SLJIT_ASSERT(!(flags & EX86_SHIFT_INS) || a == SLJIT_PREF_SHIFT_REG);


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + inst_size);
-    PTR_FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + inst_size);
+    PTR_FAIL_IF(!inst);


     /* Encoding the byte. */
     INC_SIZE(inst_size);
 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
     if (flags & EX86_PREF_F2)
-        *buf++ = 0xf2;
+        *inst++ = 0xf2;
     if (flags & EX86_PREF_F3)
-        *buf++ = 0xf3;
+        *inst++ = 0xf3;
 #endif
     if (flags & EX86_PREF_66)
-        *buf++ = 0x66;
+        *inst++ = 0x66;


-    buf_ptr = buf + size;
+    buf_ptr = inst + size;


     /* Encode mod/rm byte. */
     if (!(flags & EX86_SHIFT_INS)) {
         if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
-            *buf = (flags & EX86_BYTE_ARG) ? 0x83 : 0x81;
+            *inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;


         if ((a & SLJIT_IMM) || (a == 0))
             *buf_ptr = 0;
@@ -358,19 +358,19 @@
     else {
         if (a & SLJIT_IMM) {
             if (imma == 1)
-                *buf = 0xd1;
+                *inst = GROUP_SHIFT_1;
             else
-                *buf = 0xc1;
+                *inst = GROUP_SHIFT_N;
         } else
-            *buf = 0xd3;
+            *inst = GROUP_SHIFT_CL;
         *buf_ptr = 0;
     }


     if (!(b & SLJIT_MEM))
 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
-        *buf_ptr++ |= 0xc0 + ((!(flags & EX86_SSE2)) ? reg_map[b] : b);
+        *buf_ptr++ |= MOD_REG + ((!(flags & EX86_SSE2)) ? reg_map[b] : b);
 #else
-        *buf_ptr++ |= 0xc0 + reg_map[b];
+        *buf_ptr++ |= MOD_REG + reg_map[b];
 #endif
     else if ((b & 0x0f) != SLJIT_UNUSED) {
         if ((b & 0xf0) == SLJIT_UNUSED || (b & 0xf0) == (SLJIT_LOCALS_REG << 4)) {
@@ -417,7 +417,7 @@
             *(sljit_sw*)buf_ptr = imma;
     }


-    return !(flags & EX86_SHIFT_INS) ? buf : (buf + 1);
+    return !(flags & EX86_SHIFT_INS) ? inst : (inst + 1);
 }


/* --------------------------------------------------------------------- */
@@ -426,20 +426,20 @@

 static SLJIT_INLINE sljit_si call_with_args(struct sljit_compiler *compiler, sljit_si type)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-    buf = (sljit_ub*)ensure_buf(compiler, type >= SLJIT_CALL3 ? 1 + 2 + 1 : 1 + 2);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, type >= SLJIT_CALL3 ? 1 + 2 + 1 : 1 + 2);
+    FAIL_IF(!inst);
     INC_SIZE(type >= SLJIT_CALL3 ? 2 + 1 : 2);


     if (type >= SLJIT_CALL3)
         PUSH_REG(reg_map[SLJIT_TEMPORARY_REG3]);
-    *buf++ = 0x8b;
-    *buf++ = 0xc0 | (reg_map[SLJIT_TEMPORARY_REG3] << 3) | reg_map[SLJIT_TEMPORARY_REG1];
+    *inst++ = MOV_r_rm;
+    *inst++ = MOD_REG | (reg_map[SLJIT_TEMPORARY_REG3] << 3) | reg_map[SLJIT_TEMPORARY_REG1];
 #else
-    buf = (sljit_ub*)ensure_buf(compiler, type - SLJIT_CALL0 + 1);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, type - SLJIT_CALL0 + 1);
+    FAIL_IF(!inst);
     INC_SIZE(type - SLJIT_CALL0);
     if (type >= SLJIT_CALL3)
         PUSH_REG(reg_map[SLJIT_TEMPORARY_REG3]);
@@ -452,7 +452,7 @@


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_fast_enter(compiler, dst, dstw);
@@ -461,23 +461,23 @@
     CHECK_EXTRA_REGS(dst, dstw, (void)0);


     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);


         INC_SIZE(1);
         POP_REG(reg_map[dst]);
         return SLJIT_SUCCESS;
     }
     else if (dst & SLJIT_MEM) {
-        buf = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
-        FAIL_IF(!buf);
-        *buf++ = 0x8f;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst++ = POP_rm;
         return SLJIT_SUCCESS;
     }


     /* For UNUSED dst. Uncommon, but possible. */
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+    FAIL_IF(!inst);


     INC_SIZE(1);
     POP_REG(reg_map[TMP_REGISTER]);
@@ -486,7 +486,7 @@


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_fast_return(compiler, src, srcw);
@@ -495,31 +495,31 @@
     CHECK_EXTRA_REGS(src, srcw, (void)0);


     if (src >= SLJIT_TEMPORARY_REG1 && src <= SLJIT_NO_REGISTERS) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
+        FAIL_IF(!inst);


         INC_SIZE(1 + 1);
         PUSH_REG(reg_map[src]);
     }
     else if (src & SLJIT_MEM) {
-        buf = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
-        FAIL_IF(!buf);
-        *buf++ = 0xff;
-        *buf |= 6 << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_FF;
+        *inst |= PUSH_rm;


-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);
         INC_SIZE(1);
     }
     else {
         /* SLJIT_IMM. */
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 5 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5 + 1);
+        FAIL_IF(!inst);


         INC_SIZE(5 + 1);
-        *buf++ = 0x68;
-        *(sljit_sw*)buf = srcw;
-        buf += sizeof(sljit_sw);
+        *inst++ = PUSH_i32;
+        *(sljit_sw*)inst = srcw;
+        inst += sizeof(sljit_sw);
     }


     RET();


Modified: code/trunk/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_64.c    2012-11-03 19:21:41 UTC (rev 1200)
+++ code/trunk/sljit/sljitNativeX86_64.c    2012-11-04 06:11:18 UTC (rev 1201)
@@ -28,27 +28,28 @@


 static sljit_si emit_load_imm64(struct sljit_compiler *compiler, sljit_si reg, sljit_sw imm)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 2 + sizeof(sljit_sw));
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + sizeof(sljit_sw));
+    FAIL_IF(!inst);
     INC_SIZE(2 + sizeof(sljit_sw));
-    *buf++ = REX_W | ((reg_map[reg] <= 7) ? 0 : REX_B);
-    *buf++ = 0xb8 + (reg_map[reg] & 0x7);
-    *(sljit_sw*)buf = imm;
+    *inst++ = REX_W | ((reg_map[reg] <= 7) ? 0 : REX_B);
+    *inst++ = MOV_r_i32 + (reg_map[reg] & 0x7);
+    *(sljit_sw*)inst = imm;
     return SLJIT_SUCCESS;
 }


 static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type)
 {
     if (type < SLJIT_JUMP) {
+        /* Invert type. */
         *code_ptr++ = get_jump_code(type ^ 0x1) - 0x10;
         *code_ptr++ = 10 + 3;
     }


     SLJIT_COMPILE_ASSERT(reg_map[TMP_REG3] == 9, tmp3_is_9_first);
     *code_ptr++ = REX_W | REX_B;
-    *code_ptr++ = 0xb8 + 1;
+    *code_ptr++ = MOV_r_i32 + 1;
     jump->addr = (sljit_uw)code_ptr;


     if (jump->flags & JUMP_LABEL)
@@ -58,8 +59,8 @@


     code_ptr += sizeof(sljit_sw);
     *code_ptr++ = REX_B;
-    *code_ptr++ = 0xff;
-    *code_ptr++ = (type >= SLJIT_FAST_CALL) ? 0xd1 /* call */ : 0xe1 /* jmp */;
+    *code_ptr++ = GROUP_FF;
+    *code_ptr++ = (type >= SLJIT_FAST_CALL) ? (MOD_REG | CALL_rm | 1) : (MOD_REG | JMP_rm | 1);


     return code_ptr;
 }
@@ -69,18 +70,18 @@
     sljit_sw delta = addr - ((sljit_sw)code_ptr + 1 + sizeof(sljit_si));


     if (delta <= SLJIT_W(0x7fffffff) && delta >= SLJIT_W(-0x80000000)) {
-        *code_ptr++ = (type == 2) ? 0xe8 /* call */ : 0xe9 /* jmp */;
+        *code_ptr++ = (type == 2) ? CALL_i32 : JMP_i32;
         *(sljit_sw*)code_ptr = delta;
     }
     else {
         SLJIT_COMPILE_ASSERT(reg_map[TMP_REG3] == 9, tmp3_is_9_second);
         *code_ptr++ = REX_W | REX_B;
-        *code_ptr++ = 0xb8 + 1;
+        *code_ptr++ = MOV_r_i32 + 1;
         *(sljit_sw*)code_ptr = addr;
         code_ptr += sizeof(sljit_sw);
         *code_ptr++ = REX_B;
-        *code_ptr++ = 0xff;
-        *code_ptr++ = (type == 2) ? 0xd1 /* call */ : 0xe1 /* jmp */;
+        *code_ptr++ = GROUP_FF;
+        *code_ptr++ = (type == 2) ? (MOD_REG | CALL_rm | 1) : (MOD_REG | JMP_rm | 1);
     }


     return code_ptr;
@@ -89,7 +90,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compiler, sljit_si args, sljit_si temporaries, sljit_si saveds, sljit_si local_size)
 {
     sljit_si size, pushed_size;
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_enter(compiler, args, temporaries, saveds, local_size);
@@ -117,24 +118,24 @@
 #endif
     size += args * 3;
     if (size > 0) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+        FAIL_IF(!inst);


         INC_SIZE(size);
         if (saveds >= 5) {
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_EREG2] >= 8, saved_ereg2_is_hireg);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
             PUSH_REG(reg_lmap[SLJIT_SAVED_EREG2]);
         }
         if (saveds >= 4) {
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_EREG1] >= 8, saved_ereg1_is_hireg);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
             PUSH_REG(reg_lmap[SLJIT_SAVED_EREG1]);
         }
         if (saveds >= 3) {
 #ifndef _WIN64
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_REG3] >= 8, saved_reg3_is_hireg);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
 #else
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_REG3] < 8, saved_reg3_is_loreg);
 #endif
@@ -143,7 +144,7 @@
         if (saveds >= 2) {
 #ifndef _WIN64
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_REG2] >= 8, saved_reg2_is_hireg);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
 #else
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SAVED_REG2] < 8, saved_reg2_is_loreg);
 #endif
@@ -156,42 +157,42 @@
 #ifdef _WIN64
         if (temporaries >= 5) {
             SLJIT_COMPILE_ASSERT(reg_map[SLJIT_TEMPORARY_EREG2] >= 8, temporary_ereg2_is_hireg);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
             PUSH_REG(reg_lmap[SLJIT_TEMPORARY_EREG2]);
         }
 #endif


 #ifndef _WIN64
         if (args > 0) {
-            *buf++ = REX_W;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG1] << 3) | 0x7;
+            *inst++ = REX_W;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG1] << 3) | 0x7 /* rdi */;
         }
         if (args > 1) {
-            *buf++ = REX_W | REX_R;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_lmap[SLJIT_SAVED_REG2] << 3) | 0x6;
+            *inst++ = REX_W | REX_R;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_lmap[SLJIT_SAVED_REG2] << 3) | 0x6 /* rsi */;
         }
         if (args > 2) {
-            *buf++ = REX_W | REX_R;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_lmap[SLJIT_SAVED_REG3] << 3) | 0x2;
+            *inst++ = REX_W | REX_R;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_lmap[SLJIT_SAVED_REG3] << 3) | 0x2 /* rdx */;
         }
 #else
         if (args > 0) {
-            *buf++ = REX_W;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG1] << 3) | 0x1;
+            *inst++ = REX_W;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG1] << 3) | 0x1 /* rcx */;
         }
         if (args > 1) {
-            *buf++ = REX_W;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG2] << 3) | 0x2;
+            *inst++ = REX_W;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG2] << 3) | 0x2 /* rdx */;
         }
         if (args > 2) {
-            *buf++ = REX_W | REX_B;
-            *buf++ = 0x8b;
-            *buf++ = 0xc0 | (reg_map[SLJIT_SAVED_REG3] << 3) | 0x0;
+            *inst++ = REX_W | REX_B;
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_map[SLJIT_SAVED_REG3] << 3) | 0x0 /* r8 */;
         }
 #endif
     }
@@ -201,45 +202,61 @@
 #ifdef _WIN64
     if (local_size > 1024) {
         /* Allocate stack for the callback, which grows the stack. */
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-        FAIL_IF(!buf);
-        INC_SIZE(4);
-        *buf++ = REX_W;
-        *buf++ = 0x83;
-        *buf++ = 0xc0 | (5 << 3) | 4;
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + (3 + sizeof(sljit_si)));
+        FAIL_IF(!inst);
+        INC_SIZE(4 + (3 + sizeof(sljit_si)));
+        *inst++ = REX_W;
+        *inst++ = GROUP_BINARY_83;
+        *inst++ = MOD_REG | SUB | 4;
         /* Pushed size must be divisible by 8. */
         SLJIT_ASSERT(!(pushed_size & 0x7));
         if (pushed_size & 0x8) {
-            *buf++ = 5 * sizeof(sljit_sw);
+            *inst++ = 5 * sizeof(sljit_sw);
             local_size -= 5 * sizeof(sljit_sw);
         } else {
-            *buf++ = 4 * sizeof(sljit_sw);
+            *inst++ = 4 * sizeof(sljit_sw);
             local_size -= 4 * sizeof(sljit_sw);
         }
-        FAIL_IF(emit_load_imm64(compiler, SLJIT_TEMPORARY_REG1, local_size));
+        /* Second instruction */
+        SLJIT_COMPILE_ASSERT(reg_map[SLJIT_TEMPORARY_REG1] < 8, temporary_reg1_is_loreg);
+        *inst++ = REX_W;
+        *inst++ = MOV_rm_i32;
+        *inst++ = MOD_REG | reg_lmap[SLJIT_TEMPORARY_REG1];
+        *(sljit_si*)inst = local_size;
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
+        compiler->skip_checks = 1;
+#endif
         FAIL_IF(sljit_emit_ijump(compiler, SLJIT_CALL1, SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
     }
 #endif
     SLJIT_ASSERT(local_size > 0);
     if (local_size <= 127) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+        FAIL_IF(!inst);
         INC_SIZE(4);
-        *buf++ = REX_W;
-        *buf++ = 0x83;
-        *buf++ = 0xc0 | (5 << 3) | 4;
-        *buf++ = local_size;
+        *inst++ = REX_W;
+        *inst++ = GROUP_BINARY_83;
+        *inst++ = MOD_REG | SUB | 4;
+        *inst++ = local_size;
     }
     else {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 7);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 7);
+        FAIL_IF(!inst);
         INC_SIZE(7);
-        *buf++ = REX_W;
-        *buf++ = 0x81;
-        *buf++ = 0xc0 | (5 << 3) | 4;
-        *(sljit_si*)buf = local_size;
-        buf += sizeof(sljit_si);
+        *inst++ = REX_W;
+        *inst++ = GROUP_BINARY_81;
+        *inst++ = MOD_REG | SUB | 4;
+        *(sljit_si*)inst = local_size;
+        inst += sizeof(sljit_si);
     }
+#ifdef _WIN64
+    /* Save xmm6 with MOVAPS instruction. */
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
+    FAIL_IF(!inst);
+    INC_SIZE(5);
+    *inst++ = GROUP_0F;
+    *(sljit_si*)inst = 0x20247429;
+#endif


     return SLJIT_SUCCESS;
 }
@@ -269,7 +286,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw)
 {
     sljit_si size;
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_return(compiler, op, src, srcw);
@@ -277,24 +294,32 @@
     compiler->flags_saved = 0;
     FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));


+#ifdef _WIN64
+    /* Restore xmm6 with MOVAPS instruction. */
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
+    FAIL_IF(!inst);
+    INC_SIZE(5);
+    *inst++ = GROUP_0F;
+    *(sljit_si*)inst = 0x20247428;
+#endif
     SLJIT_ASSERT(compiler->local_size > 0);
     if (compiler->local_size <= 127) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+        FAIL_IF(!inst);
         INC_SIZE(4);
-        *buf++ = REX_W;
-        *buf++ = 0x83;
-        *buf++ = 0xc0 | (0 << 3) | 4;
-        *buf = compiler->local_size;
+        *inst++ = REX_W;
+        *inst++ = GROUP_BINARY_83;
+        *inst++ = MOD_REG | ADD | 4;
+        *inst = compiler->local_size;
     }
     else {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 7);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 7);
+        FAIL_IF(!inst);
         INC_SIZE(7);
-        *buf++ = REX_W;
-        *buf++ = 0x81;
-        *buf++ = 0xc0 | (0 << 3) | 4;
-        *(sljit_si*)buf = compiler->local_size;
+        *inst++ = REX_W;
+        *inst++ = GROUP_BINARY_81;
+        *inst++ = MOD_REG | ADD | 4;
+        *(sljit_si*)inst = compiler->local_size;
     }


     size = 1 + compiler->saveds;
@@ -307,14 +332,14 @@
     if (compiler->temporaries >= 5)
         size += (5 - 4) * 2;
 #endif
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+    FAIL_IF(!inst);


     INC_SIZE(size);


 #ifdef _WIN64
     if (compiler->temporaries >= 5) {
-        *buf++ = REX_B;
+        *inst++ = REX_B;
         POP_REG(reg_lmap[SLJIT_TEMPORARY_EREG2]);
     }
 #endif
@@ -322,22 +347,22 @@
         POP_REG(reg_map[SLJIT_SAVED_REG1]);
     if (compiler->saveds >= 2) {
 #ifndef _WIN64
-        *buf++ = REX_B;
+        *inst++ = REX_B;
 #endif
         POP_REG(reg_lmap[SLJIT_SAVED_REG2]);
     }
     if (compiler->saveds >= 3) {
 #ifndef _WIN64
-        *buf++ = REX_B;
+        *inst++ = REX_B;
 #endif
         POP_REG(reg_lmap[SLJIT_SAVED_REG3]);
     }
     if (compiler->saveds >= 4) {
-        *buf++ = REX_B;
+        *inst++ = REX_B;
         POP_REG(reg_lmap[SLJIT_SAVED_EREG1]);
     }
     if (compiler->saveds >= 5) {
-        *buf++ = REX_B;
+        *inst++ = REX_B;
         POP_REG(reg_lmap[SLJIT_SAVED_EREG2]);
     }


@@ -351,23 +376,16 @@

 static sljit_si emit_do_imm32(struct sljit_compiler *compiler, sljit_ub rex, sljit_ub opcode, sljit_sw imm)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
+    sljit_si length = 1 + (rex ? 1 : 0) + sizeof(sljit_si);


-    if (rex != 0) {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 2 + sizeof(sljit_si));
-        FAIL_IF(!buf);
-        INC_SIZE(2 + sizeof(sljit_si));
-        *buf++ = rex;
-        *buf++ = opcode;
-        *(sljit_si*)buf = imm;
-    }
-    else {
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1 + sizeof(sljit_si));
-        FAIL_IF(!buf);
-        INC_SIZE(1 + sizeof(sljit_si));
-        *buf++ = opcode;
-        *(sljit_si*)buf = imm;
-    }
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + length);
+    FAIL_IF(!inst);
+    INC_SIZE(length);
+    if (rex)
+        *inst++ = rex;
+    *inst++ = opcode;
+    *(sljit_si*)inst = imm;
     return SLJIT_SUCCESS;
 }


@@ -377,7 +395,7 @@
     /* The general operand (not immediate). */
     sljit_si b, sljit_sw immb)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     sljit_ub *buf_ptr;
     sljit_ub rex = 0;
     sljit_si flags = size & ~0xf;
@@ -494,27 +512,27 @@
     if (rex)
         inst_size++;


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + inst_size);
-    PTR_FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + inst_size);
+    PTR_FAIL_IF(!inst);


     /* Encoding the byte. */
     INC_SIZE(inst_size);
 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
     if (flags & EX86_PREF_F2)
-        *buf++ = 0xf2;
+        *inst++ = 0xf2;
     if (flags & EX86_PREF_F3)
-        *buf++ = 0xf3;
+        *inst++ = 0xf3;
 #endif
     if (flags & EX86_PREF_66)
-        *buf++ = 0x66;
+        *inst++ = 0x66;
     if (rex)
-        *buf++ = rex;
-    buf_ptr = buf + size;
+        *inst++ = rex;
+    buf_ptr = inst + size;


     /* Encode mod/rm byte. */
     if (!(flags & EX86_SHIFT_INS)) {
         if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
-            *buf = (flags & EX86_BYTE_ARG) ? 0x83 : 0x81;
+            *inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;


         if ((a & SLJIT_IMM) || (a == 0))
             *buf_ptr = 0;
@@ -531,19 +549,19 @@
     else {
         if (a & SLJIT_IMM) {
             if (imma == 1)
-                *buf = 0xd1;
+                *inst = GROUP_SHIFT_1;
             else
-                *buf = 0xc1;
+                *inst = GROUP_SHIFT_N;
         } else
-            *buf = 0xd3;
+            *inst = GROUP_SHIFT_CL;
         *buf_ptr = 0;
     }


     if (!(b & SLJIT_MEM))
 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
-        *buf_ptr++ |= 0xc0 + ((!(flags & EX86_SSE2)) ? reg_lmap[b] : b);
+        *buf_ptr++ |= MOD_REG + ((!(flags & EX86_SSE2)) ? reg_lmap[b] : b);
 #else
-        *buf_ptr++ |= 0xc0 + reg_lmap[b];
+        *buf_ptr++ |= MOD_REG + reg_lmap[b];
 #endif
     else if ((b & 0x0f) != SLJIT_UNUSED) {
         if ((b & 0xf0) == SLJIT_UNUSED || (b & 0xf0) == (SLJIT_LOCALS_REG << 4)) {
@@ -591,7 +609,7 @@
             *(sljit_si*)buf_ptr = imma;
     }


-    return !(flags & EX86_SHIFT_INS) ? buf : (buf + 1);
+    return !(flags & EX86_SHIFT_INS) ? inst : (inst + 1);
 }


/* --------------------------------------------------------------------- */
@@ -600,43 +618,43 @@

 static SLJIT_INLINE sljit_si call_with_args(struct sljit_compiler *compiler, sljit_si type)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


 #ifndef _WIN64
     SLJIT_COMPILE_ASSERT(reg_map[SLJIT_TEMPORARY_REG2] == 6 && reg_map[SLJIT_TEMPORARY_REG1] < 8 && reg_map[SLJIT_TEMPORARY_REG3] < 8, args_registers);


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
+    FAIL_IF(!inst);
     INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
     if (type >= SLJIT_CALL3) {
-        *buf++ = REX_W;
-        *buf++ = 0x8b;
-        *buf++ = 0xc0 | (0x2 << 3) | reg_lmap[SLJIT_TEMPORARY_REG3];
+        *inst++ = REX_W;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (0x2 /* rdx */ << 3) | reg_lmap[SLJIT_TEMPORARY_REG3];
     }
-    *buf++ = REX_W;
-    *buf++ = 0x8b;
-    *buf++ = 0xc0 | (0x7 << 3) | reg_lmap[SLJIT_TEMPORARY_REG1];
+    *inst++ = REX_W;
+    *inst++ = MOV_r_rm;
+    *inst++ = MOD_REG | (0x7 /* rdi */ << 3) | reg_lmap[SLJIT_TEMPORARY_REG1];
 #else
     SLJIT_COMPILE_ASSERT(reg_map[SLJIT_TEMPORARY_REG2] == 2 && reg_map[SLJIT_TEMPORARY_REG1] < 8 && reg_map[SLJIT_TEMPORARY_REG3] < 8, args_registers);


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
+    FAIL_IF(!inst);
     INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
     if (type >= SLJIT_CALL3) {
-        *buf++ = REX_W | REX_R;
-        *buf++ = 0x8b;
-        *buf++ = 0xc0 | (0x0 << 3) | reg_lmap[SLJIT_TEMPORARY_REG3];
+        *inst++ = REX_W | REX_R;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (0x0 /* r8 */ << 3) | reg_lmap[SLJIT_TEMPORARY_REG3];
     }
-    *buf++ = REX_W;
-    *buf++ = 0x8b;
-    *buf++ = 0xc0 | (0x1 << 3) | reg_lmap[SLJIT_TEMPORARY_REG1];
+    *inst++ = REX_W;
+    *inst++ = MOV_r_rm;
+    *inst++ = MOD_REG | (0x1 /* rcx */ << 3) | reg_lmap[SLJIT_TEMPORARY_REG1];
 #endif
     return SLJIT_SUCCESS;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_fast_enter(compiler, dst, dstw);
@@ -648,34 +666,34 @@


     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER) {
         if (reg_map[dst] < 8) {
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+            FAIL_IF(!inst);


             INC_SIZE(1);
             POP_REG(reg_lmap[dst]);
         }
         else {
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 2);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
+            FAIL_IF(!inst);


             INC_SIZE(2);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
             POP_REG(reg_lmap[dst]);
         }
     }
     else if (dst & SLJIT_MEM) {
         /* REX_W is not necessary (src is not immediate). */
         compiler->mode32 = 1;
-        buf = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
-        FAIL_IF(!buf);
-        *buf++ = 0x8f;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst++ = POP_rm;
     }
     return SLJIT_SUCCESS;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_fast_return(compiler, src, srcw);
@@ -688,43 +706,43 @@


     if (src >= SLJIT_TEMPORARY_REG1 && src <= TMP_REGISTER) {
         if (reg_map[src] < 8) {
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
+            FAIL_IF(!inst);


             INC_SIZE(1 + 1);
             PUSH_REG(reg_lmap[src]);
         }
         else {
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 1);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 1);
+            FAIL_IF(!inst);


             INC_SIZE(2 + 1);
-            *buf++ = REX_B;
+            *inst++ = REX_B;
             PUSH_REG(reg_lmap[src]);
         }
     }
     else if (src & SLJIT_MEM) {
         /* REX_W is not necessary (src is not immediate). */
         compiler->mode32 = 1;
-        buf = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
-        FAIL_IF(!buf);
-        *buf++ = 0xff;
-        *buf |= 6 << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_FF;
+        *inst |= PUSH_rm;


-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);
         INC_SIZE(1);
     }
     else {
         SLJIT_ASSERT(IS_HALFWORD(srcw));
         /* SLJIT_IMM. */
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 5 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5 + 1);
+        FAIL_IF(!inst);


         INC_SIZE(5 + 1);
-        *buf++ = 0x68;
-        *(sljit_si*)buf = srcw;
-        buf += sizeof(sljit_si);
+        *inst++ = PUSH_i32;
+        *(sljit_si*)inst = srcw;
+        inst += sizeof(sljit_si);
     }


     RET();
@@ -740,7 +758,7 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r;


     compiler->mode32 = 0;
@@ -751,17 +769,17 @@
     if (src & SLJIT_IMM) {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
             if (sign || ((sljit_uw)srcw <= 0x7fffffff)) {
-                code = emit_x86_instruction(compiler, 1, SLJIT_IMM, (sljit_sw)(sljit_si)srcw, dst, dstw);
-                FAIL_IF(!code);
-                *code = 0xc7;
+                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, (sljit_sw)(sljit_si)srcw, dst, dstw);
+                FAIL_IF(!inst);
+                *inst = MOV_rm_i32;
                 return SLJIT_SUCCESS;
             }
             return emit_load_imm64(compiler, dst, srcw);
         }
         compiler->mode32 = 1;
-        code = emit_x86_instruction(compiler, 1, SLJIT_IMM, (sljit_sw)(sljit_si)srcw, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0xc7;
+        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, (sljit_sw)(sljit_si)srcw, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_i32;
         compiler->mode32 = 0;
         return SLJIT_SUCCESS;
     }
@@ -772,9 +790,9 @@
         dst_r = src;
     else {
         if (sign) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src, srcw);
-            FAIL_IF(!code);
-            *code++ = 0x63;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src, srcw);
+            FAIL_IF(!inst);
+            *inst++ = MOVSXD_r_rm;
         } else {
             compiler->mode32 = 1;
             FAIL_IF(emit_mov(compiler, dst_r, 0, src, srcw));
@@ -784,9 +802,9 @@


     if (dst & SLJIT_MEM) {
         compiler->mode32 = 1;
-        code = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0x89;
+        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_r;
         compiler->mode32 = 0;
     }



Modified: code/trunk/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_common.c    2012-11-03 19:21:41 UTC (rev 1200)
+++ code/trunk/sljit/sljitNativeX86_common.c    2012-11-04 06:11:18 UTC (rev 1201)
@@ -144,68 +144,247 @@
 #define EX86_PREF_F3        0x2000
 #endif


-#define INC_SIZE(s)            (*buf++ = (s), compiler->size += (s))
-#define INC_CSIZE(s)            (*code++ = (s), compiler->size += (s))
+/* --------------------------------------------------------------------- */
+/*  Instruction forms                                                    */
+/* --------------------------------------------------------------------- */


-#define PUSH_REG(r)            (*buf++ = (0x50 + (r)))
-#define POP_REG(r)            (*buf++ = (0x58 + (r)))
-#define RET()                (*buf++ = (0xc3))
-#define RETN(n)                (*buf++ = (0xc2), *buf++ = n, *buf++ = 0)
+#define ADD        (/* BINARY */ 0 << 3)
+#define ADD_EAX_i32    0x05
+#define ADD_r_rm    0x03
+#define ADD_rm_r    0x01
+#define ADDSD_x_xm    0x58
+#define ADC        (/* BINARY */ 2 << 3)
+#define ADC_EAX_i32    0x15
+#define ADC_r_rm    0x13
+#define ADC_rm_r    0x11
+#define AND        (/* BINARY */ 4 << 3)
+#define AND_EAX_i32    0x25
+#define AND_r_rm    0x23
+#define AND_rm_r    0x21
+#define ANDPD_x_xm    0x54
+#define BSR_r_rm    (/* GROUP_0F */ 0xbd)
+#define CALL_i32    0xe8
+#define CALL_rm        (/* GROUP_FF */ 2 << 3)
+#define CDQ        0x99
+#define CMOVNE_r_rm    (/* GROUP_0F */ 0x45)
+#define CMP        (/* BINARY */ 7 << 3)
+#define CMP_EAX_i32    0x3d
+#define CMP_r_rm    0x3b
+#define CMP_rm_r    0x39
+#define DIV        (/* GROUP_F7 */ 6 << 3)
+#define DIVSD_x_xm    0x5e
+#define INT3        0xcc
+#define IDIV        (/* GROUP_F7 */ 7 << 3)
+#define IMUL        (/* GROUP_F7 */ 5 << 3)
+#define IMUL_r_rm    (/* GROUP_0F */ 0xaf)
+#define IMUL_r_rm_i8    0x6b
+#define IMUL_r_rm_i32    0x69
+#define JE_i8        0x74
+#define JMP_i8        0xeb
+#define JMP_i32        0xe9
+#define JMP_rm        (/* GROUP_FF */ 4 << 3)
+#define LEA_r_m        0x8d
+#define MOV_r_rm    0x8b
+#define MOV_r_i32    0xb8
+#define MOV_rm_r    0x89
+#define MOV_rm_i32    0xc7
+#define MOV_rm8_i8    0xc6
+#define MOV_rm8_r8    0x88
+#define MOVSD_x_xm    0x10
+#define MOVSD_xm_x    0x11
+#define MOVSXD_r_rm    0x63
+#define MOVSX_r_rm8    (/* GROUP_0F */ 0xbe)
+#define MOVSX_r_rm16    (/* GROUP_0F */ 0xbf)
+#define MOVZX_r_rm8    (/* GROUP_0F */ 0xb6)
+#define MOVZX_r_rm16    (/* GROUP_0F */ 0xb7)
+#define MUL        (/* GROUP_F7 */ 4 << 3)
+#define MULSD_x_xm    0x59
+#define NEG_rm        (/* GROUP_F7 */ 3 << 3)
+#define NOP        0x90
+#define NOT_rm        (/* GROUP_F7 */ 2 << 3)
+#define OR        (/* BINARY */ 1 << 3)
+#define OR_r_rm        0x0b
+#define OR_EAX_i32    0x0d
+#define OR_rm_r        0x09
+#define POP_r        0x58
+#define POP_rm        0x8f
+#define POPF        0x9d
+#define PUSH_i32    0x68
+#define PUSH_r        0x50
+#define PUSH_rm        (/* GROUP_FF */ 6 << 3)
+#define PUSHF        0x9c
+#define RET_near    0xc3
+#define RET_i16        0xc2
+#define SBB        (/* BINARY */ 3 << 3)
+#define SBB_EAX_i32    0x1d
+#define SBB_r_rm    0x1b
+#define SBB_rm_r    0x19
+#define SAR        (/* SHIFT */ 7 << 3)
+#define SHL        (/* SHIFT */ 4 << 3)
+#define SHR        (/* SHIFT */ 5 << 3)
+#define SUB        (/* BINARY */ 5 << 3)
+#define SUB_EAX_i32    0x2d
+#define SUB_r_rm    0x2b
+#define SUB_rm_r    0x29
+#define SUBSD_x_xm    0x5c
+#define TEST_EAX_i32    0xa9
+#define TEST_rm_r    0x85
+#define UCOMISD_x_xm    0x2e
+#define XCHG_EAX_r    0x90
+#define XCHG_r_rm    0x87
+#define XOR        (/* BINARY */ 6 << 3)
+#define XOR_EAX_i32    0x35
+#define XOR_r_rm    0x33
+#define XOR_rm_r    0x31
+#define XORPD_x_xm    0x57
+
+#define GROUP_0F    0x0f
+#define GROUP_F7    0xf7
+#define GROUP_FF    0xff
+#define GROUP_BINARY_81    0x81
+#define GROUP_BINARY_83    0x83
+#define GROUP_SHIFT_1    0xd1
+#define GROUP_SHIFT_N    0xc1
+#define GROUP_SHIFT_CL    0xd3
+
+#define MOD_REG        0xc0
+#define MOD_DISP8    0x40
+
+#define INC_SIZE(s)            (*inst++ = (s), compiler->size += (s))
+
+#define PUSH_REG(r)            (*inst++ = (PUSH_r + (r)))
+#define POP_REG(r)            (*inst++ = (POP_r + (r)))
+#define RET()                (*inst++ = (RET_near))
+#define RET_I16(n)            (*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
 /* r32, r/m32 */
-#define MOV_RM(mod, reg, rm)        (*buf++ = (0x8b), *buf++ = (mod) << 6 | (reg) << 3 | (rm))
+#define MOV_RM(mod, reg, rm)        (*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))


+/* Multithreading does not affect these static variables, since they store
+   built-in CPU features. Therefore they can be safely overwritten by different
+   threads if they detect the CPU features at the same time. */
+#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
+static sljit_ui cpu_has_sse2 = -1;
+#endif
+static sljit_ui cpu_has_cmov = -1;
+
+#if defined(_MSC_VER) && (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+#include <intrin.h>
+#endif
+
+static void get_cpu_features()
+{
+    sljit_ui features;
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+
+#ifdef __GNUC__
+    /* AT&T syntax. */
+    asm (
+        "pushl %%ebx\n"
+        "movl $0x1, %%eax\n"
+        "cpuid\n"
+        "popl %%ebx\n"
+        "movl %%edx, %0\n"
+        : "=g" (features)
+        :
+        : "%eax", "%ecx", "%edx"
+    );
+#elif defined(_MSC_VER) || defined(__BORLANDC__)
+    /* Intel syntax. */
+    __asm {
+        mov eax, 1
+        push ebx
+        cpuid
+        pop ebx
+        mov features, edx
+    }
+#else
+    #error "SLJIT_DETECT_SSE2 is not implemented for this C compiler"
+#endif
+
+#else /* SLJIT_CONFIG_X86_32 */
+
+#ifdef __GNUC__
+    /* AT&T syntax. */
+    asm (
+        "pushq %%rbx\n"
+        "movl $0x1, %%eax\n"
+        "cpuid\n"
+        "popq %%rbx\n"
+        "movl %%edx, %0\n"
+        : "=g" (features)
+        :
+        : "%rax", "%rcx", "%rdx"
+    );
+#elif defined(_MSC_VER)
+    int CPUInfo[4];
+
+    __cpuid(CPUInfo, 1);
+    features = (sljit_ui)CPUInfo[3];
+#else
+    #error "SLJIT_DETECT_SSE2 is not implemented for this C compiler"
+#endif
+
+#endif /* SLJIT_CONFIG_X86_32 */
+
+#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
+    cpu_has_sse2 = (features >> 26) & 0x1;
+#endif
+    cpu_has_cmov = (features >> 15) & 0x1;
+}
+
 static sljit_ub get_jump_code(sljit_si type)
 {
     switch (type) {
     case SLJIT_C_EQUAL:
     case SLJIT_C_FLOAT_EQUAL:
-        return 0x84;
+        return 0x84 /* je */;


     case SLJIT_C_NOT_EQUAL:
     case SLJIT_C_FLOAT_NOT_EQUAL:
-        return 0x85;
+        return 0x85 /* jne */;


     case SLJIT_C_LESS:
     case SLJIT_C_FLOAT_LESS:
-        return 0x82;
+        return 0x82 /* jc */;


     case SLJIT_C_GREATER_EQUAL:
     case SLJIT_C_FLOAT_GREATER_EQUAL:
-        return 0x83;
+        return 0x83 /* jae */;


     case SLJIT_C_GREATER:
     case SLJIT_C_FLOAT_GREATER:
-        return 0x87;
+        return 0x87 /* jnbe */;


     case SLJIT_C_LESS_EQUAL:
     case SLJIT_C_FLOAT_LESS_EQUAL:
-        return 0x86;
+        return 0x86 /* jbe */;


     case SLJIT_C_SIG_LESS:
-        return 0x8c;
+        return 0x8c /* jl */;


     case SLJIT_C_SIG_GREATER_EQUAL:
-        return 0x8d;
+        return 0x8d /* jnl */;


     case SLJIT_C_SIG_GREATER:
-        return 0x8f;
+        return 0x8f /* jnle */;


     case SLJIT_C_SIG_LESS_EQUAL:
-        return 0x8e;
+        return 0x8e /* jle */;


     case SLJIT_C_OVERFLOW:
     case SLJIT_C_MUL_OVERFLOW:
-        return 0x80;
+        return 0x80 /* jo */;


     case SLJIT_C_NOT_OVERFLOW:
     case SLJIT_C_MUL_NOT_OVERFLOW:
-        return 0x81;
+        return 0x81 /* jno */;


     case SLJIT_C_FLOAT_UNORDERED:
-        return 0x8a;
+        return 0x8a /* jp */;


     case SLJIT_C_FLOAT_ORDERED:
-        return 0x8b;
+        return 0x8b /* jpo */;
     }
     return 0;
 }
@@ -234,14 +413,14 @@


     if (type == SLJIT_JUMP) {
         if (short_jump)
-            *code_ptr++ = 0xeb;
+            *code_ptr++ = JMP_i8;
         else
-            *code_ptr++ = 0xe9;
+            *code_ptr++ = JMP_i32;
         jump->addr++;
     }
     else if (type >= SLJIT_FAST_CALL) {
         short_jump = 0;
-        *code_ptr++ = 0xe8;
+        *code_ptr++ = CALL_i32;
         jump->addr++;
     }
     else if (short_jump) {
@@ -249,7 +428,7 @@
         jump->addr++;
     }
     else {
-        *code_ptr++ = 0x0f;
+        *code_ptr++ = GROUP_0F;
         *code_ptr++ = get_jump_code(type);
         jump->addr += 2;
     }
@@ -326,7 +505,7 @@
                 }
                 else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-                    *code_ptr++ = (*buf_ptr == 2) ? 0xe8 /* call */ : 0xe9 /* jmp */;
+                    *code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
                     buf_ptr++;
                     *(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
                     code_ptr += sizeof(sljit_sw);
@@ -407,47 +586,47 @@


 static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 5);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
+    FAIL_IF(!inst);
     INC_SIZE(5);
 #else
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 6);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
+    FAIL_IF(!inst);
     INC_SIZE(6);
-    *buf++ = REX_W;
+    *inst++ = REX_W;
 #endif
-    *buf++ = 0x8d; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
-    *buf++ = 0x64;
-    *buf++ = 0x24;
-    *buf++ = (sljit_ub)sizeof(sljit_sw);
-    *buf++ = 0x9c; /* pushfd / pushfq */
+    *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
+    *inst++ = 0x64;
+    *inst++ = 0x24;
+    *inst++ = (sljit_ub)sizeof(sljit_sw);
+    *inst++ = PUSHF;
     compiler->flags_saved = 1;
     return SLJIT_SUCCESS;
 }


 static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 5);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
+    FAIL_IF(!inst);
     INC_SIZE(5);
-    *buf++ = 0x9d; /* popfd */
+    *inst++ = POPF;
 #else
-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 6);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
+    FAIL_IF(!inst);
     INC_SIZE(6);
-    *buf++ = 0x9d; /* popfq */
-    *buf++ = REX_W;
+    *inst++ = POPF;
+    *inst++ = REX_W;
 #endif
-    *buf++ = 0x8d; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
-    *buf++ = 0x64;
-    *buf++ = 0x24;
-    *buf++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
+    *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
+    *inst++ = 0x64;
+    *inst++ = 0x24;
+    *inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
     compiler->flags_saved = keep_flags;
     return SLJIT_SUCCESS;
 }
@@ -478,64 +657,64 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
         /* No destination, doesn't need to setup flags. */
         if (src & SLJIT_MEM) {
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
-            FAIL_IF(!code);
-            *code = 0x8b;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
+            FAIL_IF(!inst);
+            *inst = MOV_r_rm;
         }
         return SLJIT_SUCCESS;
     }
     if (src >= SLJIT_TEMPORARY_REG1 && src <= TMP_REGISTER) {
-        code = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0x89;
+        inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_r;
         return SLJIT_SUCCESS;
     }
     if (src & SLJIT_IMM) {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-            return emit_do_imm(compiler, 0xb8 + reg_map[dst], srcw);
+            return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
             if (!compiler->mode32) {
                 if (NOT_HALFWORD(srcw))
                     return emit_load_imm64(compiler, dst, srcw);
             }
             else
-                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, 0xb8 + reg_lmap[dst], srcw);
+                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
 #endif
         }
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
             FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
-            code = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = 0x89;
+            inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = MOV_rm_r;
             return SLJIT_SUCCESS;
         }
 #endif
-        code = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0xc7;
+        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_i32;
         return SLJIT_SUCCESS;
     }
     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER) {
-        code = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
-        FAIL_IF(!code);
-        *code = 0x8b;
+        inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst = MOV_r_rm;
         return SLJIT_SUCCESS;
     }


     /* Memory to memory move. Requires two instruction. */
-    code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
-    FAIL_IF(!code);
-    *code = 0x8b;
-    code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
-    FAIL_IF(!code);
-    *code = 0x89;
+    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
+    FAIL_IF(!inst);
+    *inst = MOV_r_rm;
+    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+    FAIL_IF(!inst);
+    *inst = MOV_rm_r;
     return SLJIT_SUCCESS;
 }


@@ -544,7 +723,7 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     sljit_si size;
 #endif
@@ -554,16 +733,16 @@


     switch (GET_OPCODE(op)) {
     case SLJIT_BREAKPOINT:
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);
         INC_SIZE(1);
-        *buf = 0xcc;
+        *inst = INT3;
         break;
     case SLJIT_NOP:
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);
         INC_SIZE(1);
-        *buf = 0x90;
+        *inst = NOP;
         break;
     case SLJIT_UMUL:
     case SLJIT_SMUL:
@@ -591,12 +770,12 @@
         if (op == SLJIT_UDIV) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
             EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_TEMPORARY_REG2, 0);
-            buf = emit_x86_instruction(compiler, 1, SLJIT_TEMPORARY_REG2, 0, SLJIT_TEMPORARY_REG2, 0);
+            inst = emit_x86_instruction(compiler, 1, SLJIT_TEMPORARY_REG2, 0, SLJIT_TEMPORARY_REG2, 0);
 #else
-            buf = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
 #endif
-            FAIL_IF(!buf);
-            *buf = 0x33;
+            FAIL_IF(!inst);
+            *inst = XOR_r_rm;
         }


         if (op == SLJIT_SDIV) {
@@ -604,69 +783,68 @@
             EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_TEMPORARY_REG2, 0);
 #endif


-            /* CDQ instruction */
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+            FAIL_IF(!inst);
             INC_SIZE(1);
-            *buf = 0x99;
+            *inst = CDQ;
 #else
             if (compiler->mode32) {
-                buf = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-                FAIL_IF(!buf);
+                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+                FAIL_IF(!inst);
                 INC_SIZE(1);
-                *buf = 0x99;
+                *inst = CDQ;
             } else {
-                buf = (sljit_ub*)ensure_buf(compiler, 1 + 2);
-                FAIL_IF(!buf);
+                inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
+                FAIL_IF(!inst);
                 INC_SIZE(2);
-                *buf++ = REX_W;
-                *buf = 0x99;
+                *inst++ = REX_W;
+                *inst = CDQ;
             }
 #endif
         }


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + 2);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
+        FAIL_IF(!inst);
         INC_SIZE(2);
-        *buf++ = 0xf7;
-        *buf = 0xc0 | ((op >= SLJIT_UDIV) ? reg_map[TMP_REGISTER] : reg_map[SLJIT_TEMPORARY_REG2]);
+        *inst++ = GROUP_F7;
+        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REGISTER] : reg_map[SLJIT_TEMPORARY_REG2]);
 #else
 #ifdef _WIN64
         size = (!compiler->mode32 || op >= SLJIT_UDIV) ? 3 : 2;
 #else
         size = (!compiler->mode32) ? 3 : 2;
 #endif
-        buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-        FAIL_IF(!buf);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+        FAIL_IF(!inst);
         INC_SIZE(size);
 #ifdef _WIN64
         if (!compiler->mode32)
-            *buf++ = REX_W | ((op >= SLJIT_UDIV) ? REX_B : 0);
+            *inst++ = REX_W | ((op >= SLJIT_UDIV) ? REX_B : 0);
         else if (op >= SLJIT_UDIV)
-            *buf++ = REX_B;
-        *buf++ = 0xf7;
-        *buf = 0xc0 | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REGISTER] : reg_lmap[SLJIT_TEMPORARY_REG2]);
+            *inst++ = REX_B;
+        *inst++ = GROUP_F7;
+        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REGISTER] : reg_lmap[SLJIT_TEMPORARY_REG2]);
 #else
         if (!compiler->mode32)
-            *buf++ = REX_W;
-        *buf++ = 0xf7;
-        *buf = 0xc0 | reg_map[SLJIT_TEMPORARY_REG2];
+            *inst++ = REX_W;
+        *inst++ = GROUP_F7;
+        *inst = MOD_REG | reg_map[SLJIT_TEMPORARY_REG2];
 #endif
 #endif
         switch (op) {
         case SLJIT_UMUL:
-            *buf |= 4 << 3;
+            *inst |= MUL;
             break;
         case SLJIT_SMUL:
-            *buf |= 5 << 3;
+            *inst |= IMUL;
             break;
         case SLJIT_UDIV:
-            *buf |= 6 << 3;
+            *inst |= DIV;
             break;
         case SLJIT_SDIV:
-            *buf |= 7 << 3;
+            *inst |= IDIV;
             break;
         }
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
@@ -680,17 +858,17 @@


 #define ENCODE_PREFIX(prefix) \
     do { \
-        code = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
-        FAIL_IF(!code); \
-        INC_CSIZE(1); \
-        *code = (prefix); \
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
+        FAIL_IF(!inst); \
+        INC_SIZE(1); \
+        *inst = (prefix); \
     } while (0)


 static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     sljit_si work_r;
@@ -706,14 +884,17 @@
     if (src & SLJIT_IMM) {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-            return emit_do_imm(compiler, 0xb8 + reg_map[dst], srcw);
+            return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
-            return emit_load_imm64(compiler, dst, srcw);
+            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
+            FAIL_IF(!inst);
+            *inst = MOV_rm_i32;
+            return SLJIT_SUCCESS;
 #endif
         }
-        code = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0xc6;
+        inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm8_i8;
         return SLJIT_SUCCESS;
     }


@@ -737,29 +918,28 @@
         if (reg_map[dst] < 4) {
             if (dst != src)
                 EMIT_MOV(compiler, dst, 0, src, 0);
-            code = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
-            FAIL_IF(!code);
-            *code++ = 0x0f;
-            *code = sign ? 0xbe : 0xb6;
+            inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
+            FAIL_IF(!inst);
+            *inst++ = GROUP_0F;
+            *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
         }
         else {
             if (dst != src)
                 EMIT_MOV(compiler, dst, 0, src, 0);
             if (sign) {
                 /* shl reg, 24 */
-                code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-                FAIL_IF(!code);
-                *code |= 0x4 << 3;
-                code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-                FAIL_IF(!code);
-                /* shr/sar reg, 24 */
-                *code |= 0x7 << 3;
+                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
+                FAIL_IF(!inst);
+                *inst |= SHL;
+                /* sar reg, 24 */
+                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
+                FAIL_IF(!inst);
+                *inst |= SAR;
             }
             else {
-                /* and dst, 0xff */
-                code = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 255, dst, 0);
-                FAIL_IF(!code);
-                *(code + 1) |= 0x4 << 3;
+                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
+                FAIL_IF(!inst);
+                *(inst + 1) |= AND;
             }
         }
         return SLJIT_SUCCESS;
@@ -767,10 +947,10 @@
 #endif
     else {
         /* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
-        code = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-        FAIL_IF(!code);
-        *code++ = 0x0f;
-        *code = sign ? 0xbe : 0xb6;
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
     }


     if (dst & SLJIT_MEM) {
@@ -793,36 +973,36 @@
             }


             if (work_r == SLJIT_TEMPORARY_REG1) {
-                ENCODE_PREFIX(0x90 + reg_map[TMP_REGISTER]);
+                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REGISTER]);
             }
             else {
-                code = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
-                FAIL_IF(!code);
-                *code = 0x87;
+                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
+                FAIL_IF(!inst);
+                *inst = XCHG_r_rm;
             }


-            code = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = 0x88;
+            inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = MOV_rm8_r8;


             if (work_r == SLJIT_TEMPORARY_REG1) {
-                ENCODE_PREFIX(0x90 + reg_map[TMP_REGISTER]);
+                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REGISTER]);
             }
             else {
-                code = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
-                FAIL_IF(!code);
-                *code = 0x87;
+                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
+                FAIL_IF(!inst);
+                *inst = XCHG_r_rm;
             }
         }
         else {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = 0x88;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = MOV_rm8_r8;
         }
 #else
-        code = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0x88;
+        inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm8_r8;
 #endif
     }


@@ -833,7 +1013,7 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -846,14 +1026,17 @@
     if (src & SLJIT_IMM) {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-            return emit_do_imm(compiler, 0xb8 + reg_map[dst], srcw);
+            return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
-            return emit_load_imm64(compiler, dst, srcw);
+            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
+            FAIL_IF(!inst);
+            *inst = MOV_rm_i32;
+            return SLJIT_SUCCESS;
 #endif
         }
-        code = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0xc7;
+        inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_i32;
         return SLJIT_SUCCESS;
     }


@@ -862,56 +1045,56 @@
     if ((dst & SLJIT_MEM) && (src >= SLJIT_TEMPORARY_REG1 && src <= SLJIT_NO_REGISTERS))
         dst_r = src;
     else {
-        code = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-        FAIL_IF(!code);
-        *code++ = 0x0f;
-        *code = sign ? 0xbf : 0xb7;
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
     }


     if (dst & SLJIT_MEM) {
-        code = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0x89;
+        inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = MOV_rm_r;
     }


     return SLJIT_SUCCESS;
 }


-static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_si un_index,
+static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        code = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= (un_index) << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= opcode;
         return SLJIT_SUCCESS;
     }
     if (dst == src && dstw == srcw) {
         /* Same input and output */
-        code = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= (un_index) << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= opcode;
         return SLJIT_SUCCESS;
     }
     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
-        code = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= (un_index) << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= opcode;
         return SLJIT_SUCCESS;
     }
     EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-    code = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
-    FAIL_IF(!code);
-    *code++ = 0xf7;
-    *code |= (un_index) << 3;
+    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_F7;
+    *inst |= opcode;
     EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
     return SLJIT_SUCCESS;
 }
@@ -920,38 +1103,38 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        code = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= 0x2 << 3;
-        code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code = 0x0b;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= NOT_rm;
+        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst = OR_r_rm;
         return SLJIT_SUCCESS;
     }
     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
-        code = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= 0x2 << 3;
-        code = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
-        FAIL_IF(!code);
-        *code = 0x0b;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= NOT_rm;
+        inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
+        FAIL_IF(!inst);
+        *inst = OR_r_rm;
         return SLJIT_SUCCESS;
     }
     EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-    code = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
-    FAIL_IF(!code);
-    *code++ = 0xf7;
-    *code |= 0x2 << 3;
-    code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
-    FAIL_IF(!code);
-    *code = 0x0b;
+    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_F7;
+    *inst |= NOT_rm;
+    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+    FAIL_IF(!inst);
+    *inst = OR_r_rm;
     EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
     return SLJIT_SUCCESS;
 }
@@ -960,37 +1143,37 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r;


     SLJIT_UNUSED_ARG(op_flags);
     if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
         /* Just set the zero flag. */
         EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        code = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code++ = 0xf7;
-        *code |= 0x2 << 3;
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_F7;
+        *inst |= NOT_rm;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REGISTER, 0);
 #else
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REGISTER, 0);
 #endif
-        FAIL_IF(!code);
-        *code |= 0x5 << 3;
+        FAIL_IF(!inst);
+        *inst |= SHR;
         return SLJIT_SUCCESS;
     }


     if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
+        EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_IMM, srcw);
         src = TMP_REGISTER;
         srcw = 0;
     }


-    code = emit_x86_instruction(compiler, 2, TMP_REGISTER, 0, src, srcw);
-    FAIL_IF(!code);
-    *code++ = 0x0f;
-    *code = 0xbd;
+    inst = emit_x86_instruction(compiler, 2, TMP_REGISTER, 0, src, srcw);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_0F;
+    *inst = BSR_r_rm;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     if (dst >= SLJIT_TEMPORARY_REG1 && dst <= TMP_REGISTER)
@@ -1013,24 +1196,50 @@
     compiler->mode32 = op_flags & SLJIT_INT_OP;
 #endif


-    code = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REGISTER, 0);
-    FAIL_IF(!code);
-    *code++ = 0x0f;
-    *code = 0x45;
+    if (cpu_has_cmov == -1)
+        get_cpu_features();


+    if (cpu_has_cmov) {
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = CMOVNE_r_rm;
+    } else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    code = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+        FAIL_IF(!inst);
+        INC_SIZE(4);
+
+        *inst++ = JE_i8;
+        *inst++ = 2;
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REGISTER];
 #else
-    code = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
+        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
+        FAIL_IF(!inst);
+        INC_SIZE(5);
+
+        *inst++ = JE_i8;
+        *inst++ = 3;
+        *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REGISTER] >= 8 ? REX_B : 0);
+        *inst++ = MOV_r_rm;
+        *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REGISTER];
 #endif
-    FAIL_IF(!code);
-    *(code + 1) |= 0x6 << 3;
+    }


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+    inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+#else
+    inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
+#endif
+    FAIL_IF(!inst);
+    *(inst + 1) |= XOR;
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     if (dst & SLJIT_MEM) {
-        code = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
-        FAIL_IF(!code);
-        *code = 0x87;
+        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
+        FAIL_IF(!inst);
+        *inst = XCHG_r_rm;
     }
 #else
     if (dst & SLJIT_MEM)
@@ -1043,7 +1252,7 @@
     sljit_si dst, sljit_sw dstw,
     sljit_si src, sljit_sw srcw)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si update = 0;
     sljit_si op_flags = GET_ALL_FLAGS(op);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -1123,9 +1332,9 @@
         }


         if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & 0xf) && (srcw != 0 || (src & 0xf0) != 0)) {
-            code = emit_x86_instruction(compiler, 1, src & 0xf, 0, src, srcw);
-            FAIL_IF(!code);
-            *code = 0x8d;
+            inst = emit_x86_instruction(compiler, 1, src & 0xf, 0, src, srcw);
+            FAIL_IF(!inst);
+            *inst = LEA_r_m;
             src &= SLJIT_MEM | 0xf;
             srcw = 0;
         }
@@ -1174,9 +1383,9 @@
 #endif


         if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & 0xf) && (dstw != 0 || (dst & 0xf0) != 0)) {
-            code = emit_x86_instruction(compiler, 1, dst & 0xf, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = 0x8d;
+            inst = emit_x86_instruction(compiler, 1, dst & 0xf, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = LEA_r_m;
         }
         return SLJIT_SUCCESS;
     }
@@ -1188,12 +1397,12 @@
     case SLJIT_NOT:
         if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
             return emit_not_with_flags(compiler, dst, dstw, src, srcw);
-        return emit_unary(compiler, 0x2, dst, dstw, src, srcw);
+        return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);


     case SLJIT_NEG:
         if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
             FAIL_IF(emit_save_flags(compiler));
-        return emit_unary(compiler, 0x3, dst, dstw, src, srcw);
+        return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);


     case SLJIT_CLZ:
         if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
@@ -1210,31 +1419,31 @@


#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

-#define BINARY_IMM(_op_imm_, _op_mr_, immw, arg, argw) \
+#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
     if (IS_HALFWORD(immw) || compiler->mode32) { \
-        code = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
-        FAIL_IF(!code); \
-        *(code + 1) |= (_op_imm_); \
+        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
+        FAIL_IF(!inst); \
+        *(inst + 1) |= (op_imm); \
     } \
     else { \
         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
-        code = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
-        FAIL_IF(!code); \
-        *code = (_op_mr_); \
+        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
+        FAIL_IF(!inst); \
+        *inst = (op_mr); \
     }


-#define BINARY_EAX_IMM(_op_eax_imm_, immw) \
-    FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (_op_eax_imm_), immw))
+#define BINARY_EAX_IMM(op_eax_imm, immw) \
+    FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))


#else

-#define BINARY_IMM(_op_imm_, _op_mr_, immw, arg, argw) \
-    code = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
-    FAIL_IF(!code); \
-    *(code + 1) |= (_op_imm_);
+#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
+    inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
+    FAIL_IF(!inst); \
+    *(inst + 1) |= (op_imm);


-#define BINARY_EAX_IMM(_op_eax_imm_, immw) \
-    FAIL_IF(emit_do_imm(compiler, (_op_eax_imm_), immw))
+#define BINARY_EAX_IMM(op_eax_imm, immw) \
+    FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))


#endif

@@ -1244,7 +1453,7 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
@@ -1252,9 +1461,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         return SLJIT_SUCCESS;
     }
@@ -1273,21 +1482,21 @@
             }
         }
         else if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
-            code = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         else if (src2 >= SLJIT_TEMPORARY_REG1 && src2 <= TMP_REGISTER) {
             /* Special exception for sljit_emit_cond_value. */
-            code = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         else {
             EMIT_MOV(compiler, TMP_REGISTER, 0, src2, src2w);
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         return SLJIT_SUCCESS;
     }
@@ -1307,20 +1516,20 @@
             }
         }
         else if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
-            code = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         else if (src1 >= SLJIT_TEMPORARY_REG1 && src1 <= SLJIT_NO_REGISTERS) {
-            code = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         else {
             EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         return SLJIT_SUCCESS;
     }
@@ -1332,9 +1541,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
     }
     else {
@@ -1344,9 +1553,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
     }
@@ -1360,7 +1569,7 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
@@ -1368,9 +1577,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         return SLJIT_SUCCESS;
     }
@@ -1389,20 +1598,20 @@
             }
         }
         else if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
-            code = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         else if (src2 >= SLJIT_TEMPORARY_REG1 && src2 <= SLJIT_NO_REGISTERS) {
-            code = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         else {
             EMIT_MOV(compiler, TMP_REGISTER, 0, src2, src2w);
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
-            FAIL_IF(!code);
-            *code = op_mr;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            FAIL_IF(!inst);
+            *inst = op_mr;
         }
         return SLJIT_SUCCESS;
     }
@@ -1414,9 +1623,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
     }
     else {
@@ -1426,9 +1635,9 @@
             BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = op_rm;
+            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = op_rm;
         }
         EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
     }
@@ -1441,23 +1650,23 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r;


     dst_r = (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) ? dst : TMP_REGISTER;


     /* Register destination. */
     if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
-        code = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
-        FAIL_IF(!code);
-        *code++ = 0x0f;
-        *code = 0xaf;
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = IMUL_r_rm;
     }
     else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
-        code = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
-        FAIL_IF(!code);
-        *code++ = 0x0f;
-        *code = 0xaf;
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = IMUL_r_rm;
     }
     else if (src1 & SLJIT_IMM) {
         if (src2 & SLJIT_IMM) {
@@ -1467,42 +1676,42 @@
         }


         if (src1w <= 127 && src1w >= -128) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = 0x6b;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-            FAIL_IF(!code);
-            INC_CSIZE(1);
-            *code = (sljit_sb)src1w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i8;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+            FAIL_IF(!inst);
+            INC_SIZE(1);
+            *inst = (sljit_sb)src1w;
         }
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
         else {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = 0x69;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-            FAIL_IF(!code);
-            INC_CSIZE(4);
-            *(sljit_sw*)code = src1w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i32;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+            FAIL_IF(!inst);
+            INC_SIZE(4);
+            *(sljit_sw*)inst = src1w;
         }
 #else
         else if (IS_HALFWORD(src1w)) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = 0x69;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-            FAIL_IF(!code);
-            INC_CSIZE(4);
-            *(sljit_si*)code = (sljit_si)src1w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i32;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+            FAIL_IF(!inst);
+            INC_SIZE(4);
+            *(sljit_si*)inst = (sljit_si)src1w;
         }
         else {
             EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
             if (dst_r != src2)
                 EMIT_MOV(compiler, dst_r, 0, src2, src2w);
-            code = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
-            FAIL_IF(!code);
-            *code++ = 0x0f;
-            *code = 0xaf;
+            inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
+            FAIL_IF(!inst);
+            *inst++ = GROUP_0F;
+            *inst = IMUL_r_rm;
         }
 #endif
     }
@@ -1510,42 +1719,42 @@
         /* Note: src1 is NOT immediate. */


         if (src2w <= 127 && src2w >= -128) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
-            FAIL_IF(!code);
-            *code = 0x6b;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 1);
-            FAIL_IF(!code);
-            INC_CSIZE(1);
-            *code = (sljit_sb)src2w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i8;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
+            FAIL_IF(!inst);
+            INC_SIZE(1);
+            *inst = (sljit_sb)src2w;
         }
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
         else {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
-            FAIL_IF(!code);
-            *code = 0x69;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-            FAIL_IF(!code);
-            INC_CSIZE(4);
-            *(sljit_sw*)code = src2w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i32;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+            FAIL_IF(!inst);
+            INC_SIZE(4);
+            *(sljit_sw*)inst = src2w;
         }
 #else
         else if (IS_HALFWORD(src2w)) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
-            FAIL_IF(!code);
-            *code = 0x69;
-            code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
-            FAIL_IF(!code);
-            INC_CSIZE(4);
-            *(sljit_si*)code = (sljit_si)src2w;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
+            FAIL_IF(!inst);
+            *inst = IMUL_r_rm_i32;
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
+            FAIL_IF(!inst);
+            INC_SIZE(4);
+            *(sljit_si*)inst = (sljit_si)src2w;
         }
         else {
             EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
             if (dst_r != src1)
                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
-            code = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
-            FAIL_IF(!code);
-            *code++ = 0x0f;
-            *code = 0xaf;
+            inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
+            FAIL_IF(!inst);
+            *inst++ = GROUP_0F;
+            *inst = IMUL_r_rm;
         }
 #endif
     }
@@ -1554,10 +1763,10 @@
         if (ADDRESSING_DEPENDS_ON(src2, dst_r))
             dst_r = TMP_REGISTER;
         EMIT_MOV(compiler, dst_r, 0, src1, src1w);
-        code = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
-        FAIL_IF(!code);
-        *code++ = 0x0f;
-        *code = 0xaf;
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_0F;
+        *inst = IMUL_r_rm;
     }


     if (dst_r == TMP_REGISTER)
@@ -1571,7 +1780,7 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;
     sljit_si dst_r, done = 0;


     /* These cases better be left to handled by normal way. */
@@ -1584,33 +1793,33 @@


     if (src1 >= SLJIT_TEMPORARY_REG1 && src1 <= SLJIT_NO_REGISTERS) {
         if ((src2 >= SLJIT_TEMPORARY_REG1 && src2 <= SLJIT_NO_REGISTERS) || src2 == TMP_REGISTER) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
-            FAIL_IF(!code);
-            *code = 0x8d;
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
+            FAIL_IF(!inst);
+            *inst = LEA_r_m;
             done = 1;
         }
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
 #else
         if (src2 & SLJIT_IMM) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
 #endif
-            FAIL_IF(!code);
-            *code = 0x8d;
+            FAIL_IF(!inst);
+            *inst = LEA_r_m;
             done = 1;
         }
     }
     else if (src2 >= SLJIT_TEMPORARY_REG1 && src2 <= SLJIT_NO_REGISTERS) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
 #else
         if (src1 & SLJIT_IMM) {
-            code = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
+            inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
 #endif
-            FAIL_IF(!code);
-            *code = 0x8d;
+            FAIL_IF(!inst);
+            *inst = LEA_r_m;
             done = 1;
         }
     }
@@ -1627,33 +1836,33 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     if (src1 == SLJIT_TEMPORARY_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
     if (src1 == SLJIT_TEMPORARY_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
 #endif
-        BINARY_EAX_IMM(0x3d, src2w);
+        BINARY_EAX_IMM(CMP_EAX_i32, src2w);
         return SLJIT_SUCCESS;
     }


     if (src1 >= SLJIT_TEMPORARY_REG1 && src1 <= SLJIT_NO_REGISTERS) {
         if (src2 & SLJIT_IMM) {
-            BINARY_IMM(0x7 << 3, 0x39, src2w, src1, 0);
+            BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
         }
         else {
-            code = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = 0x3b;
+            inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = CMP_r_rm;
         }
         return SLJIT_SUCCESS;
     }


     if (src2 >= SLJIT_TEMPORARY_REG1 && src2 <= SLJIT_NO_REGISTERS && !(src1 & SLJIT_IMM)) {
-        code = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
-        FAIL_IF(!code);
-        *code = 0x39;
+        inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
+        FAIL_IF(!inst);
+        *inst = CMP_rm_r;
         return SLJIT_SUCCESS;
     }


@@ -1663,13 +1872,13 @@
             src1 = TMP_REGISTER;
             src1w = 0;
         }
-        BINARY_IMM(0x7 << 3, 0x39, src2w, src1, src1w);
+        BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
     }
     else {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-        code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-        FAIL_IF(!code);
-        *code = 0x3b;
+        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+        FAIL_IF(!inst);
+        *inst = CMP_r_rm;
     }
     return SLJIT_SUCCESS;
 }
@@ -1678,14 +1887,14 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     if (src1 == SLJIT_TEMPORARY_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
     if (src1 == SLJIT_TEMPORARY_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
 #endif
-        BINARY_EAX_IMM(0xa9, src2w);
+        BINARY_EAX_IMM(TEST_EAX_i32, src2w);
         return SLJIT_SUCCESS;
     }


@@ -1694,7 +1903,7 @@
 #else
     if (src2 == SLJIT_TEMPORARY_REG1 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
 #endif
-        BINARY_EAX_IMM(0xa9, src1w);
+        BINARY_EAX_IMM(TEST_EAX_i32, src1w);
         return SLJIT_SUCCESS;
     }


@@ -1702,26 +1911,26 @@
         if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
             if (IS_HALFWORD(src2w) || compiler->mode32) {
-                code = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
-                FAIL_IF(!code);
-                *code = 0xf7;
+                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
+                FAIL_IF(!inst);
+                *inst = GROUP_F7;
             }
             else {
                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
-                code = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
-                FAIL_IF(!code);
-                *code = 0x85;
+                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
+                FAIL_IF(!inst);
+                *inst = TEST_rm_r;
             }
 #else
-            code = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
-            FAIL_IF(!code);
-            *code = 0xf7;
+            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
+            FAIL_IF(!inst);
+            *inst = GROUP_F7;
 #endif
         }
         else {
-            code = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
-            FAIL_IF(!code);
-            *code = 0x85;
+            inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
+            FAIL_IF(!inst);
+            *inst = TEST_rm_r;
         }
         return SLJIT_SUCCESS;
     }
@@ -1730,26 +1939,26 @@
         if (src1 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
             if (IS_HALFWORD(src1w) || compiler->mode32) {
-                code = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
-                FAIL_IF(!code);
-                *code = 0xf7;
+                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
+                FAIL_IF(!inst);
+                *inst = GROUP_F7;
             }
             else {
                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
-                code = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
-                FAIL_IF(!code);
-                *code = 0x85;
+                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
+                FAIL_IF(!inst);
+                *inst = TEST_rm_r;
             }
 #else
-            code = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
-            FAIL_IF(!code);
-            *code = 0xf7;
+            inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
+            FAIL_IF(!inst);
+            *inst = GROUP_F7;
 #endif
         }
         else {
-            code = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
-            FAIL_IF(!code);
-            *code = 0x85;
+            inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
+            FAIL_IF(!inst);
+            *inst = TEST_rm_r;
         }
         return SLJIT_SUCCESS;
     }
@@ -1758,26 +1967,26 @@
     if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if (IS_HALFWORD(src2w) || compiler->mode32) {
-            code = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
-            FAIL_IF(!code);
-            *code = 0xf7;
+            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
+            FAIL_IF(!inst);
+            *inst = GROUP_F7;
         }
         else {
             FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
-            code = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REGISTER, 0);
-            FAIL_IF(!code);
-            *code = 0x85;
+            inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REGISTER, 0);
+            FAIL_IF(!inst);
+            *inst = TEST_rm_r;
         }
 #else
-        code = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code = 0xf7;
+        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst = GROUP_F7;
 #endif
     }
     else {
-        code = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
-        FAIL_IF(!code);
-        *code = 0x85;
+        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+        FAIL_IF(!inst);
+        *inst = TEST_rm_r;
     }
     return SLJIT_SUCCESS;
 }
@@ -1788,42 +1997,42 @@
     sljit_si src1, sljit_sw src1w,
     sljit_si src2, sljit_sw src2w)
 {
-    sljit_ub* code;
+    sljit_ub* inst;


     if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
         if (dst == src1 && dstw == src1w) {
-            code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
-            FAIL_IF(!code);
-            *code |= mode;
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
+            FAIL_IF(!inst);
+            *inst |= mode;
             return SLJIT_SUCCESS;
         }
         if (dst == SLJIT_UNUSED) {
             EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
-            FAIL_IF(!code);
-            *code |= mode;
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
+            FAIL_IF(!inst);
+            *inst |= mode;
             return SLJIT_SUCCESS;
         }
         if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
             EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
-            FAIL_IF(!code);
-            *code |= mode;
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+            FAIL_IF(!inst);
+            *inst |= mode;
             EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
             return SLJIT_SUCCESS;
         }
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) {
             EMIT_MOV(compiler, dst, 0, src1, src1w);
-            code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
-            FAIL_IF(!code);
-            *code |= mode;
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
+            FAIL_IF(!inst);
+            *inst |= mode;
             return SLJIT_SUCCESS;
         }


         EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code |= mode;
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst |= mode;
         EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
         return SLJIT_SUCCESS;
     }
@@ -1831,9 +2040,9 @@
     if (dst == SLJIT_PREF_SHIFT_REG) {
         EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code |= mode;
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
     }
     else if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
@@ -1841,9 +2050,9 @@
             EMIT_MOV(compiler, dst, 0, src1, src1w);
         EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_PREF_SHIFT_REG, 0);
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
-        FAIL_IF(!code);
-        *code |= mode;
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
+        FAIL_IF(!inst);
+        *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
     }
     else {
@@ -1857,9 +2066,9 @@
         EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
 #endif
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-        code = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
-        FAIL_IF(!code);
-        *code |= mode;
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        FAIL_IF(!inst);
+        *inst |= mode;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
 #else
@@ -1889,7 +2098,7 @@
         if (!set_flags)
             return emit_mov(compiler, dst, dstw, src1, src1w);
         /* OR dst, src, 0 */
-        return emit_cum_binary(compiler, 0x0b, 0x09, 0x1 << 3, 0x0d,
+        return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
             dst, dstw, src1, src1w, SLJIT_IMM, 0);
     }


@@ -1941,7 +2150,7 @@
             compiler->flags_saved = 0;
         if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
             FAIL_IF(emit_save_flags(compiler));
-        return emit_cum_binary(compiler, 0x03, 0x01, 0x0 << 3, 0x05,
+        return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_ADDC:
         if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
@@ -1950,7 +2159,7 @@
             FAIL_IF(emit_save_flags(compiler));
         if (SLJIT_UNLIKELY(GET_FLAGS(op)))
             compiler->flags_saved = 0;
-        return emit_cum_binary(compiler, 0x13, 0x11, 0x2 << 3, 0x15,
+        return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SUB:
         if (!GET_FLAGS(op)) {
@@ -1963,7 +2172,7 @@
             FAIL_IF(emit_save_flags(compiler));
         if (dst == SLJIT_UNUSED)
             return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
-        return emit_non_cum_binary(compiler, 0x2b, 0x29, 0x5 << 3, 0x2d,
+        return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SUBC:
         if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
@@ -1972,29 +2181,29 @@
             FAIL_IF(emit_save_flags(compiler));
         if (SLJIT_UNLIKELY(GET_FLAGS(op)))
             compiler->flags_saved = 0;
-        return emit_non_cum_binary(compiler, 0x1b, 0x19, 0x3 << 3, 0x1d,
+        return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_MUL:
         return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_AND:
         if (dst == SLJIT_UNUSED)
             return emit_test_binary(compiler, src1, src1w, src2, src2w);
-        return emit_cum_binary(compiler, 0x23, 0x21, 0x4 << 3, 0x25,
+        return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_OR:
-        return emit_cum_binary(compiler, 0x0b, 0x09, 0x1 << 3, 0x0d,
+        return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_XOR:
-        return emit_cum_binary(compiler, 0x33, 0x31, 0x6 << 3, 0x35,
+        return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SHL:
-        return emit_shift_with_flags(compiler, 0x4 << 3, GET_FLAGS(op),
+        return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_LSHR:
-        return emit_shift_with_flags(compiler, 0x5 << 3, GET_FLAGS(op),
+        return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_ASHR:
-        return emit_shift_with_flags(compiler, 0x7 << 3, GET_FLAGS(op),
+        return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
             dst, dstw, src1, src1w, src2, src2w);
     }


@@ -2015,16 +2224,16 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
     void *instruction, sljit_si size)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


     CHECK_ERROR();
     check_sljit_emit_op_custom(compiler, instruction, size);
     SLJIT_ASSERT(size > 0 && size < 16);


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + size);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
+    FAIL_IF(!inst);
     INC_SIZE(size);
-    SLJIT_MEMMOVE(buf, instruction, size);
+    SLJIT_MEMMOVE(inst, instruction, size);
     return SLJIT_SUCCESS;
 }


@@ -2057,42 +2266,13 @@
 {
 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-    static sljit_si sse2_available = -1;
-    sljit_si features;
-
-    if (sse2_available != -1)
-        return sse2_available;
-
-#ifdef __GNUC__
-    /* AT&T syntax. */
-    asm (
-        "pushl %%ebx\n"
-        "movl $0x1, %%eax\n"
-        "cpuid\n"
-        "popl %%ebx\n"
-        "movl %%edx, %0\n"
-        : "=g" (features)
-        :
-        : "%eax", "%ecx", "%edx"
-    );
-#elif defined(_MSC_VER) || defined(__BORLANDC__)
-    /* Intel syntax. */
-    __asm {
-        mov eax, 1
-        push ebx
-        cpuid
-        pop ebx
-        mov features, edx
-    }
-#else
-    #error "SLJIT_DETECT_SSE2 is not implemented for this C compiler"
-#endif
-    sse2_available = (features >> 26) & 0x1;
-    return sse2_available;
-#else
+    if (cpu_has_sse2 == -1)
+        get_cpu_features();
+    return cpu_has_sse2;
+#else /* SLJIT_DETECT_SSE2 */
     return 1;
-#endif
-#else
+#endif /* SLJIT_DETECT_SSE2 */
+#else /* SLJIT_SSE2 */
     return 0;
 #endif
 }
@@ -2102,37 +2282,37 @@
 static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
     sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


-    buf = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
-    FAIL_IF(!buf);
-    *buf++ = 0x0f;
-    *buf = opcode;
+    inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_0F;
+    *inst = opcode;
     return SLJIT_SUCCESS;
 }


 static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
     sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;


-    buf = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
-    FAIL_IF(!buf);
-    *buf++ = 0x0f;
-    *buf = opcode;
+    inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_0F;
+    *inst = opcode;
     return SLJIT_SUCCESS;
 }


 static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
     sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
 {
-    return emit_sse2(compiler, 0x10, single, dst, src, srcw);
+    return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
 }


 static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
     sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
 {
-    return emit_sse2(compiler, 0x11, single, src, dst, dstw);
+    return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
@@ -2156,7 +2336,7 @@
             dst_r = TMP_FREG;
             FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, dst, dstw));
         }
-        return emit_sse2_logic(compiler, 0x2e, !(op & SLJIT_SINGLE_OP), dst_r, src, srcw);
+        return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), dst_r, src, srcw);
     }


     if (op == SLJIT_MOVD) {
@@ -2180,11 +2360,11 @@


     switch (GET_OPCODE(op)) {
     case SLJIT_NEGD:
-        FAIL_IF(emit_sse2_logic(compiler, 0x57, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
+        FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
         break;


     case SLJIT_ABSD:
-        FAIL_IF(emit_sse2_logic(compiler, 0x54, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
+        FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
         break;
     }


@@ -2230,19 +2410,19 @@

     switch (GET_OPCODE(op)) {
     case SLJIT_ADDD:
-        FAIL_IF(emit_sse2(compiler, 0x58, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
+        FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
         break;


     case SLJIT_SUBD:
-        FAIL_IF(emit_sse2(compiler, 0x5c, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
+        FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
         break;


     case SLJIT_MULD:
-        FAIL_IF(emit_sse2(compiler, 0x59, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
+        FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
         break;


     case SLJIT_DIVD:
-        FAIL_IF(emit_sse2(compiler, 0x5e, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
+        FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
         break;
     }


@@ -2284,7 +2464,7 @@

 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     struct sljit_label *label;


     CHECK_ERROR_PTR();
@@ -2302,18 +2482,18 @@
     PTR_FAIL_IF(!label);
     set_label(label, compiler);


-    buf = (sljit_ub*)ensure_buf(compiler, 2);
-    PTR_FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 2);
+    PTR_FAIL_IF(!inst);


-    *buf++ = 0;
-    *buf++ = 0;
+    *inst++ = 0;
+    *inst++ = 0;


     return label;
 }


 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     struct sljit_jump *jump;


     CHECK_ERROR_PTR();
@@ -2340,17 +2520,17 @@
     compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
 #endif


-    buf = (sljit_ub*)ensure_buf(compiler, 2);
-    PTR_FAIL_IF_NULL(buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 2);
+    PTR_FAIL_IF_NULL(inst);


-    *buf++ = 0;
-    *buf++ = type + 4;
+    *inst++ = 0;
+    *inst++ = type + 4;
     return jump;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
 {
-    sljit_ub *code;
+    sljit_ub *inst;
     struct sljit_jump *jump;


     CHECK_ERROR();
@@ -2401,28 +2581,28 @@
         compiler->size += 10 + 3;
 #endif


-        code = (sljit_ub*)ensure_buf(compiler, 2);
-        FAIL_IF_NULL(code);
+        inst = (sljit_ub*)ensure_buf(compiler, 2);
+        FAIL_IF_NULL(inst);


-        *code++ = 0;
-        *code++ = type + 4;
+        *inst++ = 0;
+        *inst++ = type + 4;
     }
     else {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         /* REX_W is not necessary (src is not immediate). */
         compiler->mode32 = 1;
 #endif
-        code = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
-        FAIL_IF(!code);
-        *code++ = 0xff;
-        *code |= (type >= SLJIT_FAST_CALL) ? (2 << 3) : (4 << 3);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
+        FAIL_IF(!inst);
+        *inst++ = GROUP_FF;
+        *inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
     }
     return SLJIT_SUCCESS;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_cond_value(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si type)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     sljit_ub cond_set = 0;
     sljit_si dst_save = dst;
     sljit_sw dstw_save = dstw;
@@ -2444,84 +2624,84 @@
     switch (type) {
     case SLJIT_C_EQUAL:
     case SLJIT_C_FLOAT_EQUAL:
-        cond_set = 0x94;
+        cond_set = 0x94 /* sete */;
         break;


     case SLJIT_C_NOT_EQUAL:
     case SLJIT_C_FLOAT_NOT_EQUAL:
-        cond_set = 0x95;
+        cond_set = 0x95 /* setne */;
         break;


     case SLJIT_C_LESS:
     case SLJIT_C_FLOAT_LESS:
-        cond_set = 0x92;
+        cond_set = 0x92 /* setnae */;
         break;


     case SLJIT_C_GREATER_EQUAL:
     case SLJIT_C_FLOAT_GREATER_EQUAL:
-        cond_set = 0x93;
+        cond_set = 0x93 /* setnb */;
         break;


     case SLJIT_C_GREATER:
     case SLJIT_C_FLOAT_GREATER:
-        cond_set = 0x97;
+        cond_set = 0x97 /* seta */;
         break;


     case SLJIT_C_LESS_EQUAL:
     case SLJIT_C_FLOAT_LESS_EQUAL:
-        cond_set = 0x96;
+        cond_set = 0x96 /* setbe */;
         break;


     case SLJIT_C_SIG_LESS:
-        cond_set = 0x9c;
+        cond_set = 0x9c /* setnge */;
         break;


     case SLJIT_C_SIG_GREATER_EQUAL:
-        cond_set = 0x9d;
+        cond_set = 0x9d /* setnl */;
         break;


     case SLJIT_C_SIG_GREATER:
-        cond_set = 0x9f;
+        cond_set = 0x9f /* setg */;
         break;


     case SLJIT_C_SIG_LESS_EQUAL:
-        cond_set = 0x9e;
+        cond_set = 0x9e /* setle */;
         break;


     case SLJIT_C_OVERFLOW:
     case SLJIT_C_MUL_OVERFLOW:
-        cond_set = 0x90;
+        cond_set = 0x90 /* seto */;
         break;


     case SLJIT_C_NOT_OVERFLOW:
     case SLJIT_C_MUL_NOT_OVERFLOW:
-        cond_set = 0x91;
+        cond_set = 0x91 /* setno */;
         break;


     case SLJIT_C_FLOAT_UNORDERED:
-        cond_set = 0x9a;
+        cond_set = 0x9a /* setp */;
         break;


     case SLJIT_C_FLOAT_ORDERED:
-        cond_set = 0x9b;
+        cond_set = 0x9b /* setpo */;
         break;
     }


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     reg = (op == SLJIT_MOV && dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_NO_REGISTERS) ? dst : TMP_REGISTER;


-    buf = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
-    FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
+    FAIL_IF(!inst);
     INC_SIZE(4 + 4);
     /* Set low register to conditional flag. */
-    *buf++ = (reg_map[reg] <= 7) ? 0x40 : REX_B;
-    *buf++ = 0x0f;
-    *buf++ = cond_set;
-    *buf++ = 0xC0 | reg_lmap[reg];
-    *buf++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
-    *buf++ = 0x0f;
-    *buf++ = 0xb6;
-    *buf = 0xC0 | (reg_lmap[reg] << 3) | reg_lmap[reg];
+    *inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
+    *inst++ = GROUP_0F;
+    *inst++ = cond_set;
+    *inst++ = MOD_REG | reg_lmap[reg];
+    *inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
+    *inst++ = GROUP_0F;
+    *inst++ = MOVZX_r_rm8;
+    *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];


     if (reg == TMP_REGISTER) {
         if (op == SLJIT_MOV) {
@@ -2538,35 +2718,35 @@
 #else
     if (op == SLJIT_MOV) {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_TEMPORARY_REG3) {
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
+            FAIL_IF(!inst);
             INC_SIZE(3 + 3);
             /* Set low byte to conditional flag. */
-            *buf++ = 0x0f;
-            *buf++ = cond_set;
-            *buf++ = 0xC0 | reg_map[dst];
+            *inst++ = GROUP_0F;
+            *inst++ = cond_set;
+            *inst++ = MOD_REG | reg_map[dst];


-            *buf++ = 0x0f;
-            *buf++ = 0xb6;
-            *buf = 0xC0 | (reg_map[dst] << 3) | reg_map[dst];
+            *inst++ = GROUP_0F;
+            *inst++ = MOVZX_r_rm8;
+            *inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
         }
         else {
             EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_TEMPORARY_REG1, 0);


-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
+            FAIL_IF(!inst);
             INC_SIZE(3 + 3);
             /* Set al to conditional flag. */
-            *buf++ = 0x0f;
-            *buf++ = cond_set;
-            *buf++ = 0xC0;
+            *inst++ = GROUP_0F;
+            *inst++ = cond_set;
+            *inst++ = MOD_REG;


-            *buf++ = 0x0f;
-            *buf++ = 0xb6;
+            *inst++ = GROUP_0F;
+            *inst++ = MOVZX_r_rm8;
             if (dst >= SLJIT_SAVED_REG1 && dst <= SLJIT_NO_REGISTERS)
-                *buf = 0xC0 | (reg_map[dst] << 3);
+                *inst = MOD_REG | (reg_map[dst] << 3);
             else {
-                *buf = 0xC0;
+                *inst = MOD_REG;
                 EMIT_MOV(compiler, dst, dstw, SLJIT_TEMPORARY_REG1, 0);
             }


@@ -2576,30 +2756,30 @@
     else {
         if (dst >= SLJIT_TEMPORARY_REG1 && dst <= SLJIT_TEMPORARY_REG3) {
             EMIT_MOV(compiler, TMP_REGISTER, 0, dst, 0);
-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 3);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
+            FAIL_IF(!inst);
             INC_SIZE(3);


-            *buf++ = 0x0f;
-            *buf++ = cond_set;
-            *buf++ = 0xC0 | reg_map[dst];
+            *inst++ = GROUP_0F;
+            *inst++ = cond_set;
+            *inst++ = MOD_REG | reg_map[dst];
         }
         else {
             EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_TEMPORARY_REG1, 0);


-            buf = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3 + 1);
-            FAIL_IF(!buf);
+            inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3 + 1);
+            FAIL_IF(!inst);
             INC_SIZE(3 + 3 + 1);
             /* Set al to conditional flag. */
-            *buf++ = 0x0f;
-            *buf++ = cond_set;
-            *buf++ = 0xC0;
+            *inst++ = GROUP_0F;
+            *inst++ = cond_set;
+            *inst++ = MOD_REG;


-            *buf++ = 0x0f;
-            *buf++ = 0xb6;
-            *buf++ = 0xC0;
+            *inst++ = GROUP_0F;
+            *inst++ = MOVZX_r_rm8;
+            *inst++ = MOD_REG;


-            *buf++ = 0x90 + reg_map[TMP_REGISTER];
+            *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
         }
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
         compiler->skip_checks = 1;
@@ -2644,7 +2824,7 @@


 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
 {
-    sljit_ub *buf;
+    sljit_ub *inst;
     struct sljit_const *const_;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     sljit_si reg;
@@ -2674,11 +2854,11 @@
         return NULL;
 #endif


-    buf = (sljit_ub*)ensure_buf(compiler, 2);
-    PTR_FAIL_IF(!buf);
+    inst = (sljit_ub*)ensure_buf(compiler, 2);
+    PTR_FAIL_IF(!inst);


-    *buf++ = 0;
-    *buf++ = 1;
+    *inst++ = 0;
+    *inst++ = 1;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     if (reg == TMP_REGISTER && dst != SLJIT_UNUSED)