[Pcre-svn] [768] code/trunk/src: JIT compiler update.

Revision: 768
          http://www.exim.org/viewvc/pcre2?view=rev&revision=768
Author:   zherczeg
Date:     2017-05-07 08:10:16 +0100 (Sun, 07 May 2017)
Log Message:
-----------
JIT compiler update.


Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/sljit/sljitConfigInternal.h
    code/trunk/src/sljit/sljitLir.c
    code/trunk/src/sljit/sljitLir.h
    code/trunk/src/sljit/sljitNativeARM_32.c
    code/trunk/src/sljit/sljitNativeARM_64.c
    code/trunk/src/sljit/sljitNativeARM_T2_32.c
    code/trunk/src/sljit/sljitNativeMIPS_32.c
    code/trunk/src/sljit/sljitNativeMIPS_64.c
    code/trunk/src/sljit/sljitNativeMIPS_common.c
    code/trunk/src/sljit/sljitNativePPC_common.c
    code/trunk/src/sljit/sljitNativeSPARC_common.c
    code/trunk/src/sljit/sljitNativeTILEGX_64.c
    code/trunk/src/sljit/sljitNativeX86_64.c
    code/trunk/src/sljit/sljitNativeX86_common.c


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/pcre2_jit_compile.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -593,8 +593,8 @@
   sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w))
 #define CMPTO(type, src1, src1w, src2, src2w, label) \
   sljit_set_label(sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w)), (label))
-#define OP_FLAGS(op, dst, dstw, src, srcw, type) \
-  sljit_emit_op_flags(compiler, (op), (dst), (dstw), (src), (srcw), (type))
+#define OP_FLAGS(op, dst, dstw, type) \
+  sljit_emit_op_flags(compiler, (op), (dst), (dstw), (type))
 #define GET_LOCAL_BASE(dst, dstw, offset) \
   sljit_get_local_base(compiler, (dst), (dstw), (offset))


@@ -3395,7 +3395,7 @@
/* Skip low surrogate if necessary. */
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
- OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
return;
@@ -3512,7 +3512,7 @@

 JUMPHERE(jump);
 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
-OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_NOT_ZERO);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
 /* This code runs only in 8 bit mode. No need to shift the value. */
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
@@ -3702,7 +3702,7 @@
   end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
   OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, common->newline & 0xff);
-  OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+  OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
   OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
 #endif
@@ -3740,7 +3740,7 @@
   singlechar = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
   OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
   OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
-  OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+  OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
   OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
   JUMPHERE(singlechar);
@@ -4914,9 +4914,9 @@
   else
     {
     OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char1);
-    OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+    OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
     OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char2);
-    OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+    OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);
     found = JUMP(SLJIT_NOT_ZERO);
     }
   }
@@ -5247,7 +5247,7 @@


OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_GREATER_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER_EQUAL);
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCHAR_SHIFT);
#endif
@@ -5292,7 +5292,7 @@
notfoundnl = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
- OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
#endif
@@ -5369,7 +5369,7 @@
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
- OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
@@ -5533,10 +5533,10 @@
add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
JUMPHERE(jump);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP2, 0);
}
@@ -5577,10 +5577,10 @@
add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+ OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
JUMPHERE(jump);
}
else
@@ -5773,7 +5773,7 @@

OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
-OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -5780,7 +5780,7 @@
if (common->utf)
{
#endif
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a);
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -5787,7 +5787,7 @@
}
#endif
#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */
-OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

@@ -5799,9 +5799,9 @@
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);

OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x09);
-OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
-OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0);
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -5808,24 +5808,24 @@
if (common->utf)
{
#endif
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2000);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x200A - 0x2000);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x202f - 0x2000);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x205f - 0x2000);
- OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x3000 - 0x2000);
#if PCRE2_CODE_UNIT_WIDTH == 8
}
#endif
#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */
-OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);

sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
@@ -5839,7 +5839,7 @@

OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a);
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
-OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -5846,7 +5846,7 @@
if (common->utf)
{
#endif
- OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+ OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a);
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -5853,7 +5853,7 @@
}
#endif
#endif /* SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == [16|32] */
-OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);

 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }
@@ -6441,13 +6441,13 @@
     if (numberofcmps < 3 && (*cc == XCL_SINGLE || *cc == XCL_RANGE))
       {
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset));
-      OP_FLAGS(numberofcmps == 0 ? SLJIT_MOV : SLJIT_OR, TMP2, 0, numberofcmps == 0 ? SLJIT_UNUSED : TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(numberofcmps == 0 ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
       numberofcmps++;
       }
     else if (numberofcmps > 0)
       {
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset));
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       numberofcmps = 0;
       }
@@ -6467,13 +6467,13 @@
     if (numberofcmps < 3 && (*cc == XCL_SINGLE || *cc == XCL_RANGE))
       {
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset));
-      OP_FLAGS(numberofcmps == 0 ? SLJIT_MOV : SLJIT_OR, TMP2, 0, numberofcmps == 0 ? SLJIT_UNUSED : TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(numberofcmps == 0 ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
       numberofcmps++;
       }
     else if (numberofcmps > 0)
       {
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(c - charoffset));
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       numberofcmps = 0;
       }
@@ -6499,11 +6499,11 @@


       case PT_LAMP:
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lu - typeoffset);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Ll - typeoffset);
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lt - typeoffset);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;


@@ -6526,32 +6526,32 @@
       case PT_PXSPACE:
       SET_CHAR_OFFSET(9);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd - 0x9);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x9);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       SET_TYPE_OFFSET(ucp_Zl);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Zl);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;


       case PT_WORD:
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_UNDERSCORE - charoffset));
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
       /* Fall through. */


       case PT_ALNUM:
       SET_TYPE_OFFSET(ucp_Ll);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
-      OP_FLAGS((*cc == PT_ALNUM) ? SLJIT_MOV : SLJIT_OR, TMP2, 0, (*cc == PT_ALNUM) ? SLJIT_UNUSED : TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS((*cc == PT_ALNUM) ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
       SET_TYPE_OFFSET(ucp_Nd);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_No - ucp_Nd);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;


@@ -6574,7 +6574,7 @@
           OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, other_cases[1] ^ other_cases[0]);
           }
         OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, other_cases[1]);
-        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
         other_cases += 2;
         }
       else if (is_powerof2(other_cases[2] ^ other_cases[1]))
@@ -6587,10 +6587,10 @@
           OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, other_cases[1] ^ other_cases[0]);
           }
         OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, other_cases[2]);
-        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);


         OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[0] - charoffset));
-        OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+        OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);


         other_cases += 3;
         }
@@ -6597,13 +6597,13 @@
       else
         {
         OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++ - charoffset));
-        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+        OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
         }


       while (*other_cases != NOTACHAR)
         {
         OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++ - charoffset));
-        OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+        OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
         }
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;
@@ -6610,18 +6610,18 @@


       case PT_UCNC:
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset));
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset));
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset));
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       SET_CHAR_OFFSET(0xa0);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset));
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
       SET_CHAR_OFFSET(0);
       OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xe000 - 0);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_GREATER_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;


@@ -6629,7 +6629,7 @@
       /* C and Z groups are the farthest two groups. */
       SET_TYPE_OFFSET(ucp_Ll);
       OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_GREATER);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER);


       jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll);


@@ -6636,13 +6636,13 @@
       /* In case of ucp_Cf, we overwrite the result. */
       SET_CHAR_OFFSET(0x2066);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066);
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       JUMPHERE(jump);
       jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
@@ -6652,10 +6652,10 @@
       /* C and Z groups are the farthest two groups. */
       SET_TYPE_OFFSET(ucp_Ll);
       OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_GREATER);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Ll);
-      OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_NOT_EQUAL);
+      OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_NOT_EQUAL);


       jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll);


@@ -6662,10 +6662,10 @@
       /* In case of ucp_Cf, we overwrite the result. */
       SET_CHAR_OFFSET(0x2066);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);


       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
-      OP_FLAGS(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+      OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);


       JUMPHERE(jump);
       jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
@@ -6674,15 +6674,15 @@
       case PT_PXPUNCT:
       SET_TYPE_OFFSET(ucp_Sc);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_So - ucp_Sc);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);


       SET_CHAR_OFFSET(0);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f);
-      OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);


       SET_TYPE_OFFSET(ucp_Pc);
       OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, typereg, 0, SLJIT_IMM, ucp_Ps - ucp_Pc);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
       jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
       break;


@@ -6750,9 +6750,9 @@
       {
       jump[1] = CMP(SLJIT_EQUAL, TMP2, 0, STR_END, 0);
       OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0);
-      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS);
+      OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS);
       OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
-      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_NOT_EQUAL);
+      OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_EQUAL);
       add_jump(compiler, backtracks, JUMP(SLJIT_NOT_EQUAL));
       check_partial(common, TRUE);
       add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
@@ -7028,7 +7028,7 @@
     jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
     OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
     OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
-    OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+    OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
     OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
 #endif


Modified: code/trunk/src/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/src/sljit/sljitConfigInternal.h    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitConfigInternal.h    2017-05-07 07:10:16 UTC (rev 768)
@@ -567,11 +567,11 @@
 #elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)


#ifndef _WIN64
-#define SLJIT_NUMBER_OF_REGISTERS 12
+#define SLJIT_NUMBER_OF_REGISTERS 13
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 6
#define SLJIT_LOCALS_OFFSET_BASE 0
#else
-#define SLJIT_NUMBER_OF_REGISTERS 12
+#define SLJIT_NUMBER_OF_REGISTERS 13
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
#define SLJIT_LOCALS_OFFSET_BASE (compiler->locals_offset)
#endif /* _WIN64 */

Modified: code/trunk/src/sljit/sljitLir.c
===================================================================
--- code/trunk/src/sljit/sljitLir.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitLir.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -697,12 +697,12 @@
             CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
             CHECK_ARGUMENT(!((i) & ~0x3)); \
         } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
+        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
     }


-#define FUNCTION_CHECK_DST(p, i) \
+#define FUNCTION_CHECK_DST(p, i, unused) \
     CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1); \
-    if (FUNCTION_CHECK_IS_REG_OR_UNUSED(p)) \
+    if (FUNCTION_CHECK_IS_REG(p) || ((unused) && (p) == SLJIT_UNUSED)) \
         CHECK_ARGUMENT((i) == 0); \
     else if ((p) == (SLJIT_MEM1(SLJIT_SP))) \
         CHECK_ARGUMENT((i) >= 0 && (i) < compiler->logical_local_size); \
@@ -716,7 +716,7 @@
             CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
             CHECK_ARGUMENT(!((i) & ~0x3)); \
         } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
+        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
     }


 #define FUNCTION_FCHECK(p, i) \
@@ -736,7 +736,7 @@
             CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
             CHECK_ARGUMENT(((p) & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SP) && !(i & ~0x3)); \
         } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
+        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
     }


 #endif /* SLJIT_ARGUMENT_CHECKS */
@@ -977,7 +977,7 @@
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    FUNCTION_CHECK_DST(dst, dstw);
+    FUNCTION_CHECK_DST(dst, dstw, 0);
     compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1047,8 +1047,7 @@
         break;
     case SLJIT_NEG:
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
-            || GET_FLAG_TYPE(op) == SLJIT_OVERFLOW
-            || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW);
+            || GET_FLAG_TYPE(op) == SLJIT_OVERFLOW);
         break;
     case SLJIT_MOV:
     case SLJIT_MOV_U32:
@@ -1065,8 +1064,8 @@
         break;
     }


+    FUNCTION_CHECK_DST(dst, dstw, 1);
     FUNCTION_CHECK_SRC(src, srcw);
-    FUNCTION_CHECK_DST(dst, dstw);


     if (GET_OPCODE(op) >= SLJIT_NOT)
         compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_I32_OP | SLJIT_SET_Z));
@@ -1132,18 +1131,16 @@
     case SLJIT_MUL:
         CHECK_ARGUMENT(!(op & SLJIT_SET_Z));
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
-            || GET_FLAG_TYPE(op) == SLJIT_MUL_OVERFLOW
-            || GET_FLAG_TYPE(op) == SLJIT_MUL_NOT_OVERFLOW);
+            || GET_FLAG_TYPE(op) == SLJIT_MUL_OVERFLOW);
         break;
     case SLJIT_ADD:
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
             || GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)
-            || GET_FLAG_TYPE(op) == SLJIT_OVERFLOW
-            || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW);
+            || GET_FLAG_TYPE(op) == SLJIT_OVERFLOW);
         break;
     case SLJIT_SUB:
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
-            || (GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_NOT_OVERFLOW)
+            || (GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_OVERFLOW)
             || GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY));
         break;
     case SLJIT_ADDC:
@@ -1158,9 +1155,9 @@
         break;
     }


+    FUNCTION_CHECK_DST(dst, dstw, 1);
     FUNCTION_CHECK_SRC(src1, src1w);
     FUNCTION_CHECK_SRC(src2, src2w);
-    FUNCTION_CHECK_DST(dst, dstw);
     compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_I32_OP | SLJIT_SET_Z));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1317,7 +1314,7 @@
     CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_CONV_SW_FROM_F64 && GET_OPCODE(op) <= SLJIT_CONV_S32_FROM_F64);
     CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
     FUNCTION_FCHECK(src, srcw);
-    FUNCTION_CHECK_DST(dst, dstw);
+    FUNCTION_CHECK_DST(dst, dstw, 0);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1428,7 +1425,9 @@
         if ((type & 0xff) <= SLJIT_NOT_ZERO)
             CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
         else
-            CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff));
+            CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
+                || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+                || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));
         CHECK_ARGUMENT((type & SLJIT_I32_OP) == (compiler->last_flags & SLJIT_I32_OP));
     }
 #endif
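
In practice the relaxed check above lets a jump test the inverted form of the last recorded variable flag: since SLJIT_SET_NOT_OVERFLOW is removed elsewhere in this commit, code sets SLJIT_SET_OVERFLOW and may then branch on either SLJIT_OVERFLOW or SLJIT_NOT_OVERFLOW. A minimal sketch, with hypothetical operands:

    struct sljit_jump *no_ovf;

    sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_OVERFLOW,
        SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R1, 0);
    /* Allowed after this change: the inverted type of the recorded flag. */
    no_ovf = sljit_emit_jump(compiler, SLJIT_NOT_OVERFLOW);
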
@@ -1517,7 +1516,6 @@


 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
@@ -1524,7 +1522,7 @@
     CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
     CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
     CHECK_ARGUMENT((type & 0xff) != GET_FLAG_TYPE(SLJIT_SET_CARRY) && (type & 0xff) != (GET_FLAG_TYPE(SLJIT_SET_CARRY) + 1));
-    CHECK_ARGUMENT(op == SLJIT_MOV || GET_OPCODE(op) == SLJIT_MOV_U32 || GET_OPCODE(op) == SLJIT_MOV_S32
+    CHECK_ARGUMENT(op == SLJIT_MOV || op == SLJIT_MOV32
         || (GET_OPCODE(op) >= SLJIT_AND && GET_OPCODE(op) <= SLJIT_XOR));
     CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK));


@@ -1531,15 +1529,14 @@
     if ((type & 0xff) <= SLJIT_NOT_ZERO)
         CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
     else
-        CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff));
+        CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
+            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+            || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));


-    if (GET_OPCODE(op) < SLJIT_ADD) {
-        CHECK_ARGUMENT(src == SLJIT_UNUSED && srcw == 0);
-    } else {
-        CHECK_ARGUMENT(src == dst && srcw == dstw);
+    FUNCTION_CHECK_DST(dst, dstw, 0);
+
+    if (GET_OPCODE(op) >= SLJIT_ADD)
         compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_I32_OP | SLJIT_SET_Z));
-    }
-    FUNCTION_CHECK_DST(dst, dstw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1548,10 +1545,6 @@
             GET_OPCODE(op) < SLJIT_OP2_BASE ? "mov" : op2_names[GET_OPCODE(op) - SLJIT_OP2_BASE],
             GET_OPCODE(op) < SLJIT_OP2_BASE ? op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE] : ((op & SLJIT_I32_OP) ? "32" : ""));
         sljit_verbose_param(compiler, dst, dstw);
-        if (src != SLJIT_UNUSED) {
-            fprintf(compiler->verbose, ", ");
-            sljit_verbose_param(compiler, src, srcw);
-        }
         fprintf(compiler->verbose, ", %s%s\n", jump_names[type & 0xff], JUMP_POSTFIX(type));
     }
 #endif
@@ -1573,7 +1566,9 @@
     if ((type & 0xff) <= SLJIT_NOT_ZERO)
         CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
     else
-        CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff));
+        CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
+            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+            || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1594,7 +1589,7 @@
     SLJIT_UNUSED_ARG(offset);


 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    FUNCTION_CHECK_DST(dst, dstw);
+    FUNCTION_CHECK_DST(dst, dstw, 0);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1611,7 +1606,7 @@
     SLJIT_UNUSED_ARG(init_value);


 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    FUNCTION_CHECK_DST(dst, dstw);
+    FUNCTION_CHECK_DST(dst, dstw, 0);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -2166,7 +2161,6 @@


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
     SLJIT_UNUSED_ARG(compiler);
@@ -2173,8 +2167,6 @@
     SLJIT_UNUSED_ARG(op);
     SLJIT_UNUSED_ARG(dst);
     SLJIT_UNUSED_ARG(dstw);
-    SLJIT_UNUSED_ARG(src);
-    SLJIT_UNUSED_ARG(srcw);
     SLJIT_UNUSED_ARG(type);
     SLJIT_UNREACHABLE();
     return SLJIT_ERR_UNSUPPORTED;


Modified: code/trunk/src/sljit/sljitLir.h
===================================================================
--- code/trunk/src/sljit/sljitLir.h    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitLir.h    2017-05-07 07:10:16 UTC (rev 768)
@@ -120,8 +120,8 @@
   If an architecture provides two scratch and three saved registers,
   its scratch and saved register sets are the following:


-     R0   |  [S4]  |   R0 and S4 represent the same physical register
-     R1   |  [S3]  |   R1 and S3 represent the same physical register
+     R0   |        |   R0 is always a scratch register
+     R1   |        |   R1 is always a scratch register
     [R2]  |   S2   |   R2 and S2 represent the same physical register
     [R3]  |   S1   |   R3 and S1 represent the same physical register
     [R4]  |   S0   |   R4 and S0 represent the same physical register
@@ -129,38 +129,35 @@
   Note: SLJIT_NUMBER_OF_SCRATCH_REGISTERS would be 2 and
         SLJIT_NUMBER_OF_SAVED_REGISTERS would be 3 for this architecture.


-  Note: On all supported architectures SLJIT_NUMBER_OF_REGISTERS >= 10
-        and SLJIT_NUMBER_OF_SAVED_REGISTERS >= 5. However, 4 registers
+  Note: On all supported architectures SLJIT_NUMBER_OF_REGISTERS >= 12
+        and SLJIT_NUMBER_OF_SAVED_REGISTERS >= 6. However, 6 registers
         are virtual on x86-32. See below.


- The purpose of this definition is convenience. Although a register
- is either scratch register or saved register, SLJIT allows accessing
- them from the other set. For example, four registers can be used as
- scratch registers and the fifth one as saved register on the architecture
- above. Of course the last two scratch registers (R2 and R3) from this
- four will be saved on the stack, because they are defined as saved
- registers in the application binary interface. Still R2 and R3 can be
- used for referencing to these registers instead of S2 and S1, which
- makes easier to write platform independent code. Scratch registers
- can be saved registers in a similar way, but these extra saved
- registers will not be preserved across function calls! Hence the
- application must save them on those platforms, where the number of
- saved registers is too low. This can be done by copy them onto
- the stack and restore them after a function call.
+ The purpose of this definition is convenience: saved registers can
+ be used as extra scratch registers. For example, four registers can
+ be specified as scratch registers and the fifth one as a saved register
+ on the CPU above, and any user code which requires four scratch
+ registers can run unmodified. The SLJIT compiler automatically saves
+ the contents of the two extra scratch registers on the stack. Scratch
+ registers can also be preserved by saving their values on the stack,
+ but this needs to be done manually.

   Note: To emphasize that registers assigned to R2-R4 are saved
-        registers, they are enclosed by square brackets. S3-S4
-        are marked in a similar way.
+        registers, they are enclosed by square brackets.


   Note: sljit_emit_enter and sljit_set_context defines whether a register
         is S or R register. E.g: when 3 scratches and 1 saved is mapped
         by sljit_emit_enter, the allowed register set will be: R0-R2 and
        S0. Although S2 is mapped to the same position as R2, it is not
-        available in the current configuration. Furthermore the R3 (S1)
-        register does not available as well.
+        available in the current configuration. Furthermore the S1 register
+        is not available at all.
 */


-/* When SLJIT_UNUSED is specified as destination, the result is discarded. */
+/* When SLJIT_UNUSED is specified as the destination of sljit_emit_op1
+   and sljit_emit_op2 operations, the result is discarded. If no status
+   flags are set, no instructions are emitted for these operations. Data
+   prefetch is a special exception; see the SLJIT_MOV operation. Other SLJIT
+   operations do not support SLJIT_UNUSED as a destination operand. */
 #define SLJIT_UNUSED        0


/* Scratch registers. */
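
As an illustration of the convention described in the comment above, a minimal sketch, assuming the usual sljit_emit_enter() prototype from this header (options, args, scratches, saveds, fscratches, fsaveds, local_size):

    /* Request four scratch and one saved register; on the example CPU
       with two hardware scratch registers, R2 and R3 map to saved
       registers and are preserved by the compiler automatically. */
    sljit_emit_enter(compiler, 0, 1, 4, 1, 0, 0, 0);
    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R3, 0, SLJIT_S0, 0);
    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0,
        SLJIT_R3, 0, SLJIT_IMM, 1);
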
@@ -489,21 +486,27 @@
*/
static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }

-/* Returns with non-zero if the passed SLJIT_HAS_* feature is available.
+/* Returns non-zero if the feature or limitation type passed as its
+ argument is present on the current CPU.

- Some features (e.g. floating point operations) require CPU support
- while other (e.g. move with update) is emulated if not available.
- However it might be worth to generate a special code path even in
- the latter case in certain cases. */
+ Some features (e.g. floating point operations) require hardware (CPU)
+ support while others (e.g. move with update) are emulated if not available.
+ However, even if a feature is emulated, a specialized code path can still
+ be faster than the emulation. Some limitations are emulated as well, so
+ their general case is supported, but at an extra performance cost.

 /* [Not emulated] Floating-point support is available. */
 #define SLJIT_HAS_FPU            0
+/* [Limitation] Some registers are virtual registers. */
+#define SLJIT_HAS_VIRTUAL_REGISTERS    1
 /* [Emulated] Some forms of move with pre update is supported. */
-#define SLJIT_HAS_PRE_UPDATE        1
+#define SLJIT_HAS_PRE_UPDATE        2
 /* [Emulated] Count leading zero is supported. */
-#define SLJIT_HAS_CLZ            2
+#define SLJIT_HAS_CLZ            3
 /* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_CMOV            3
+#define SLJIT_HAS_CMOV            4
+/* [Limitation] [Emulated] Shifting with register is limited to SLJIT_PREF_SHIFT_REG. */
+#define SLJIT_HAS_PREF_SHIFT_REG    5


 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 /* [Not emulated] SSE2 support is available on x86. */
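
A short sketch of how these feature queries are typically used, assuming the sljit_has_cpu_feature() entry point that this comment block documents (registers are illustrative):

    if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
        /* Native conditional move: a single instruction. */
        sljit_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_R1, 0);
    else {
        /* Emulated path: branch around a plain move. */
        struct sljit_jump *skip = sljit_emit_jump(compiler, SLJIT_NOT_EQUAL);
        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_R1, 0);
        sljit_set_label(skip, sljit_emit_label(compiler));
    }
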
@@ -723,8 +726,8 @@
    Example: SLJIT_ADD can set the Z, OVERFLOW, CARRY flags hence


      sljit_op2(..., SLJIT_ADD, ...)
-       Both the zero and variable flags are undefined so their
-       they hold a random value after the operation is completed.
+       Both the zero and variable flags are undefined so they can
+       have any value after the operation is completed.


      sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z, ...)
        Sets the zero flag if the result is zero, clears it otherwise.
@@ -734,10 +737,6 @@
        Sets the variable flag if an integer overflow occurs, clears
        it otherwise. The zero flag is undefined.


-     sljit_op2(..., SLJIT_ADD | SLJIT_SET_NOT_OVERFLOW, ...)
-       Sets the variable flag if an integer overflow does NOT occur,
-       clears it otherwise. The zero flag is undefined.
-
      sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z | SLJIT_SET_CARRY, ...)
        Sets the zero flag if the result is zero, clears it otherwise.
        Sets the variable flag if unsigned overflow (carry) occurs,
@@ -862,6 +861,15 @@


    sljit_emit_op1(..., SLJIT_MOVU_U8,
        SLJIT_MEM2(SLJIT_R0, SLJIT_R2), 0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
+
+   If the destination of a MOV without update instruction is SLJIT_UNUSED
+   and the source operand is a memory address, the compiler emits a prefetch
+   instruction if the current CPU supports one. Larger data sizes bring the
+   data closer to the core: a word-sized MOV loads the data into a higher
+   cache level than a byte-sized one. Otherwise the type does not affect
+   the prefetch instruction. Furthermore, a prefetch instruction never
+   fails, so it can be used to prefetch data from an address and check
+   whether that address is NULL afterwards.
 */


 /* Flags: - (does not modify flags) */
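
A minimal sketch of the two SLJIT_UNUSED destination cases described above, with hypothetical operands:

    /* Compare-style op2: the result is discarded and only the zero
       flag is recorded. */
    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0,
        SLJIT_R0, 0, SLJIT_IMM, 0xd800);
    /* MOV from memory into SLJIT_UNUSED without flags: emitted as a
       data prefetch when the CPU supports one, otherwise nothing. */
    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_UNUSED, 0,
        SLJIT_MEM1(SLJIT_R1), 0);
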
@@ -1090,7 +1098,6 @@
 #define SLJIT_SET_OVERFLOW        SLJIT_SET(SLJIT_OVERFLOW)
 #define SLJIT_NOT_OVERFLOW        11
 #define SLJIT_NOT_OVERFLOW32        (SLJIT_NOT_OVERFLOW | SLJIT_I32_OP)
-#define SLJIT_SET_NOT_OVERFLOW        SLJIT_SET(SLJIT_NOT_OVERFLOW)


 #define SLJIT_MUL_OVERFLOW        12
 #define SLJIT_MUL_OVERFLOW32        (SLJIT_MUL_OVERFLOW | SLJIT_I32_OP)
@@ -1097,7 +1104,6 @@
 #define SLJIT_SET_MUL_OVERFLOW        SLJIT_SET(SLJIT_MUL_OVERFLOW)
 #define SLJIT_MUL_NOT_OVERFLOW        13
 #define SLJIT_MUL_NOT_OVERFLOW32    (SLJIT_MUL_NOT_OVERFLOW | SLJIT_I32_OP)
-#define SLJIT_SET_MUL_NOT_OVERFLOW    SLJIT_SET(SLJIT_MUL_NOT_OVERFLOW)


 /* There is no SLJIT_CARRY or SLJIT_NOT_CARRY. */
 #define SLJIT_SET_CARRY            SLJIT_SET(14)
@@ -1194,19 +1200,15 @@
    represented by the type is 1, if the condition represented by the type
    is fulfilled, and 0 otherwise.


-   If op == SLJIT_MOV, SLJIT_MOV_S32, SLJIT_MOV_U32:
+   If op == SLJIT_MOV, SLJIT_MOV32:
      Set dst to the value represented by the type (0 or 1).
-     Src must be SLJIT_UNUSED, and srcw must be 0
      Flags: - (does not modify flags)
    If op == SLJIT_OR, op == SLJIT_AND, op == SLJIT_XOR
-     Performs the binary operation using src as the first, and the value
-     represented by type as the second argument.
-     Important note: only dst=src and dstw=srcw is supported at the moment!
-     Flags: Z (may destroy flags)
-   Note: sljit_emit_op_flags does nothing, if dst is SLJIT_UNUSED (regardless of op). */
+     Performs the binary operation using dst as the first, and the value
+     represented by type as the second argument. Result is written into dst.
+     Flags: Z (may destroy flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type);
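
A sketch of the updated contract, mirroring the call sites changed in pcre2_jit_compile.c above: with SLJIT_MOV the flag value is written to dst; with SLJIT_OR the current value of dst is the first operand, so several conditions can be accumulated without the removed src/srcw pair (operands illustrative):

    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0,
        SLJIT_R1, 0, SLJIT_IMM, 0x09);
    sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_EQUAL);
    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0,
        SLJIT_R1, 0, SLJIT_IMM, 0x20);
    sljit_emit_op_flags(compiler, SLJIT_OR | SLJIT_SET_Z, SLJIT_R0, 0,
        SLJIT_EQUAL);
    /* R0 is now 1 if R1 was 0x09 or 0x20, 0 otherwise; SLJIT_SET_Z
       records whether the accumulated result is zero. */
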


/* Emit a conditional mov instruction which moves source to destination,

Modified: code/trunk/src/sljit/sljitNativeARM_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_32.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeARM_32.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -843,6 +843,7 @@
 #define WORD_DATA    0x00
 #define BYTE_DATA    0x01
 #define HALF_DATA    0x02
+#define PRELOAD_DATA    0x03
 #define SIGNED_DATA    0x04
 #define LOAD_DATA    0x08


@@ -871,7 +872,7 @@
/* l u w */ 0xe5100000 /* ldr */,
/* l u b */ 0xe5500000 /* ldrb */,
/* l u h */ 0xe11000b0 /* ldrh */,
-/* l u N */ 0x00000000 /* not allowed */,
+/* l u p */ 0xf5500000 /* preload data */,
/* l s w */ 0xe5100000 /* ldr */,
/* l s b */ 0xe11000d0 /* ldrsb */,
/* l s h */ 0xe11000f0 /* ldrsh */,
@@ -879,7 +880,7 @@
};

 #define EMIT_DATA_TRANSFER(type, add, wb, target_reg, base_reg, arg) \
-    (data_transfer_insts[(type) & 0xf] | ((add) << 23) | ((wb) << (21 - 4)) | (reg_map[target_reg] << 12) | (reg_map[base_reg] << 16) | (arg))
+    (data_transfer_insts[(type) & 0xf] | ((add) << 23) | ((wb) << (21 - 4)) | RD(target_reg) | RN(base_reg) | (arg))


 /* Normal ldr/str instruction.
    Type2: ldrsb, ldrh, ldrsh */
@@ -1344,8 +1345,16 @@


     if ((arg & REG_MASK) == SLJIT_UNUSED) {
         /* Write back is not used. */
-        FAIL_IF(load_immediate(compiler, tmp_reg, argw));
-        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, tmp_reg, is_type1_transfer ? 0 : TYPE2_TRANSFER_IMM(0)));
+        if (is_type1_transfer) {
+            FAIL_IF(load_immediate(compiler, tmp_reg, argw & ~0xfff));
+            argw &= 0xfff;
+        }
+        else {
+            FAIL_IF(load_immediate(compiler, tmp_reg, argw & ~0xff));
+            argw &= 0xff;
+        }
+
+        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, tmp_reg, is_type1_transfer ? argw : TYPE2_TRANSFER_IMM(argw)));
     }


     if (arg & OFFS_REG_MASK) {
@@ -1660,6 +1669,14 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+            return emit_op_mem(compiler, PRELOAD_DATA | LOAD_DATA, TMP_PC, src, srcw, TMP_REG1);
+#endif
+        return SLJIT_SUCCESS;
+    }
+
     switch (GET_OPCODE(op)) {
     case SLJIT_MOV:
     case SLJIT_MOV_U32:
@@ -1725,6 +1742,9 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
     case SLJIT_ADDC:
@@ -1845,9 +1865,6 @@


     FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VCVT_S32_F32, op & SLJIT_F32_OP, TMP_FREG1, src, 0)));


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, VMOV | (1 << 20) | RD(dst) | (TMP_FREG1 << 16));


@@ -2015,10 +2032,6 @@

     SLJIT_ASSERT(reg_map[TMP_REG1] == 14);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, RM(TMP_REG1)));


@@ -2199,51 +2212,43 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
-    sljit_s32 dst_reg, flags = GET_ALL_FLAGS(op);
+    sljit_s32 dst_r, flags = GET_ALL_FLAGS(op);
     sljit_uw cc, ins;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);
-    ADJUST_LOCAL_OFFSET(src, srcw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     op = GET_OPCODE(op);
     cc = get_cc(type & 0xff);
-    dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


     if (op < SLJIT_ADD) {
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, SLJIT_UNUSED, SRC2_IMM | 0)));
-        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, SLJIT_UNUSED, SRC2_IMM | 1) & ~COND_MASK) | cc));
-        return (dst_reg == TMP_REG2) ? emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw, TMP_REG1) : SLJIT_SUCCESS;
+        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_r, SLJIT_UNUSED, SRC2_IMM | 0)));
+        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_r, SLJIT_UNUSED, SRC2_IMM | 1) & ~COND_MASK) | cc));
+        if (dst & SLJIT_MEM)
+            return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
+        return SLJIT_SUCCESS;
     }


     ins = (op == SLJIT_AND ? AND_DP : (op == SLJIT_OR ? ORR_DP : EOR_DP));
-    if ((op == SLJIT_OR || op == SLJIT_XOR) && FAST_IS_REG(dst) && dst == src) {
-        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst, dst, SRC2_IMM | 1) & ~COND_MASK) | cc));
-        /* The condition must always be set, even if the ORR/EOR is not executed above. */
-        return (flags & SLJIT_SET_Z) ? push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG1, SLJIT_UNUSED, RM(dst))) : SLJIT_SUCCESS;
-    }


-    if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
-        src = TMP_REG1;
-    } else if (src & SLJIT_IMM) {
-        FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
-        src = TMP_REG1;
-    }
+    if (dst & SLJIT_MEM)
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, dst, dstw, TMP_REG2));


-    FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_reg, src, SRC2_IMM | 1) & ~COND_MASK) | cc));
-    FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_reg, src, SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));
-    if (dst_reg == TMP_REG2)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw, TMP_REG1));
+    FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, dst_r, SRC2_IMM | 1) & ~COND_MASK) | cc));


-    return (flags & SLJIT_SET_Z) ? push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG2, SLJIT_UNUSED, RM(dst_reg))) : SLJIT_SUCCESS;
+    if (op == SLJIT_AND)
+        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, dst_r, SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));
+
+    if (dst & SLJIT_MEM)
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2));
+
+    if (flags & SLJIT_SET_Z)
+        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG2, SLJIT_UNUSED, RM(dst_r)));
+    return SLJIT_SUCCESS;
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,

Modified: code/trunk/src/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_64.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeARM_64.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -890,6 +890,10 @@
     }


     arg &= REG_MASK;
+
+    if (arg == SLJIT_UNUSED)
+        return 0;
+
     if (argw >= 0 && (argw >> shift) <= 0xfff && (argw & ((1 << shift) - 1)) == 0) {
         if (SLJIT_UNLIKELY(flags & ARG_TEST))
             return 1;
@@ -950,7 +954,7 @@
         next_argw = 0;
     }


-    tmp_r = (flags & STORE) ? TMP_REG3 : reg;
+    tmp_r = ((flags & STORE) || (flags == (WORD_SIZE | SIGNED))) ? TMP_REG3 : reg;


     if (SLJIT_UNLIKELY((flags & UPDATE) && (arg & REG_MASK))) {
         /* Update only applies if a base register exists. */
@@ -1021,16 +1025,16 @@
         }
     }


-    if (argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
-        FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | RN(arg & REG_MASK) | ((argw >> 12) << 10)));
+    diff = argw - next_argw;
+    next_arg = (arg & REG_MASK) && (arg == next_arg) && diff <= 0xfff && diff >= -0xfff && diff != 0;
+    arg &= REG_MASK;
+
+    if (arg != SLJIT_UNUSED && argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
+        FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | RN(arg) | ((argw >> 12) << 10)));
         return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30)
             | RT(reg) | RN(tmp_r) | ((argw & 0xfff) << (10 - shift)));
     }


-    diff = argw - next_argw;
-    next_arg = (arg & REG_MASK) && (arg == next_arg) && diff <= 0xfff && diff >= -0xfff && diff != 0;
-    arg &= REG_MASK;
-
     if (arg && compiler->cache_arg == SLJIT_MEM) {
         if (compiler->cache_argw == argw)
             return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
@@ -1313,6 +1317,23 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM)) {
+            SLJIT_ASSERT(reg_map[1] == 0 && reg_map[3] == 2 && reg_map[5] == 4);
+
+            if (op >= SLJIT_MOV_U8 && op <= SLJIT_MOV_S8)
+                dst = 5;
+            else if (op >= SLJIT_MOV_U16 && op <= SLJIT_MOV_S16)
+                dst = 3;
+            else
+                dst = 1;
+
+            /* Signed word sized load is the prefetch instruction. */
+            return emit_op_mem(compiler, WORD_SIZE | SIGNED, dst, src, srcw);
+        }
+        return SLJIT_SUCCESS;
+    }
+
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     op = GET_OPCODE(op);
@@ -1466,6 +1487,9 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
     flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
     mem_flags = WORD_SIZE;
@@ -1617,7 +1641,7 @@
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
 {
-    sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
     sljit_ins inv_bits = (op & SLJIT_F32_OP) ? (1 << 22) : 0;


     if (GET_OPCODE(op) == SLJIT_CONV_S32_FROM_F64)
@@ -1630,7 +1654,7 @@


     FAIL_IF(push_inst(compiler, (FCVTZS ^ inv_bits) | RD(dst_r) | VN(src)));


-    if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
+    if (dst & SLJIT_MEM)
         return emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_S32_FROM_F64) ? INT_SIZE : WORD_SIZE) | STORE, TMP_REG1, dst, dstw);
     return SLJIT_SUCCESS;
 }
@@ -1788,10 +1812,6 @@
     CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(TMP_LR));


@@ -1979,20 +1999,15 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
-    sljit_s32 dst_r, flags, mem_flags;
+    sljit_s32 dst_r, src_r, flags, mem_flags;
     sljit_ins cc;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);
-    ADJUST_LOCAL_OFFSET(src, srcw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     cc = get_cc(type & 0xff);
     dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


@@ -2012,19 +2027,19 @@
         mem_flags = INT_SIZE;
     }


-    if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, src, srcw, dst, dstw));
-        src = TMP_REG1;
-        srcw = 0;
-    } else if (src & SLJIT_IMM)
-        flags |= ARG1_IMM;
+    src_r = dst;


+    if (dst & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, dst, dstw, dst, dstw));
+        src_r = TMP_REG1;
+    }
+
     FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(TMP_ZERO)));
-    emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src, TMP_REG2);
+    emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src_r, TMP_REG2);


-    if (dst_r != TMP_REG1)
-        return SLJIT_SUCCESS;
-    return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
+    if (dst & SLJIT_MEM)
+        return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
+    return SLJIT_SUCCESS;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
@@ -2064,7 +2079,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
     PTR_FAIL_IF(emit_imm64_const(compiler, dst_r, init_value));


     if (dst & SLJIT_MEM)


Modified: code/trunk/src/sljit/sljitNativeARM_T2_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_T2_32.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeARM_T2_32.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -838,6 +838,7 @@
 #define WORD_SIZE    0x00
 #define BYTE_SIZE    0x04
 #define HALF_SIZE    0x08
+#define PRELOAD        0x0c


 #define UPDATE        0x10


@@ -895,7 +896,7 @@

 #define MEM_IMM8    0xc00
 #define MEM_IMM12    0x800000
-static const sljit_ins sljit_mem32[12] = {
+static const sljit_ins sljit_mem32[13] = {
 /* w u l */ 0xf8500000 /* ldr.w */,
 /* w u s */ 0xf8400000 /* str.w */,
 /* w s l */ 0xf8500000 /* ldr.w */,
@@ -910,6 +911,8 @@
 /* h u s */ 0xf8200000 /* strsh.w */,
 /* h s l */ 0xf9300000 /* ldrsh.w */,
 /* h s s */ 0xf8200000 /* strsh.w */,
+
+/* p u l */ 0xf8100000 /* pld */,
 };
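
The Thumb-2 backend folds prefetching into its existing load/store machinery: PRELOAD is a fourth size class and its single table row is the PLD opcode, which shares LDR's addressing forms. The Rt field comes from the register argument, so callers pass TMP_PC (register index 15) to produce the architecturally required Rt=0b1111. A self-contained check of the composed encoding -- assuming MEM_IMM12 == 0x800000 and RT4(r) == r << 12, as used elsewhere in this file:

    #include <stdio.h>
    int main(void)
    {
        /* base pld opcode | 12-bit-immediate form | Rt = 15 */
        unsigned ins = 0xf8100000u | 0x800000u | (15u << 12);
        printf("0x%08x\n", ins);   /* 0xf890f000 == pld [Rn, #imm12] */
        return 0;
    }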


 /* Helper function. Dst should be reg + value, using at most 1 instruction; flags are not set. */
@@ -946,6 +949,12 @@
     arg &= ~SLJIT_MEM;


     if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
+        tmp = get_imm(argw & ~0xfff);
+        if (tmp != INVALID_IMM) {
+            FAIL_IF(push_inst32(compiler, MOV_WI | RD4(tmp_reg) | tmp));
+            return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(tmp_reg) | (argw & 0xfff));
+        }
+
         FAIL_IF(load_immediate(compiler, tmp_reg, argw));
         if (IS_2_LO_REGS(reg, tmp_reg) && sljit_mem16_imm5[flags])
             return push_inst16(compiler, sljit_mem16_imm5[flags] | RD3(reg) | RN3(tmp_reg));
@@ -1270,6 +1279,13 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+        /* Since TMP_PC has index 15, IS_2_LO_REGS and IS_3_LO_REGS checks always fail. */
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+            return emit_op_mem(compiler, PRELOAD, TMP_PC, src, srcw, TMP_REG1);
+        return SLJIT_SUCCESS;
+    }
+
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     op = GET_OPCODE(op);
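
The new fast path turns a MOV-family op1 with an unused destination, no flag request, and a memory source into a cache preload, and into a no-op otherwise. From the frontend this is presumably requested as in the sketch below (register and offset are illustrative):

    /* Hint that the byte at SLJIT_R0 + 0 will be needed soon.  On
       this backend it becomes a pld; backends without a prefetch
       instruction simply drop it. */
    sljit_emit_op1(compiler, SLJIT_MOV_U8,
        SLJIT_UNUSED, 0,
        SLJIT_MEM1(SLJIT_R0), 0);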
@@ -1388,6 +1404,9 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     dst_reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
     flags = HAS_FLAGS(op) ? SET_FLAGS : 0;


@@ -1507,9 +1526,6 @@

     FAIL_IF(push_inst32(compiler, VCVT_S32_F32 | (op & SLJIT_F32_OP) | DD4(TMP_FREG1) | DM4(src)));


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst32(compiler, VMOV | (1 << 20) | RT4(dst) | DN4(TMP_FREG1));


@@ -1669,10 +1685,6 @@

     SLJIT_ASSERT(reg_map[TMP_REG2] == 14);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst16(compiler, MOV | SET_REGS44(dst, TMP_REG2));


@@ -1836,20 +1848,15 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
     sljit_s32 dst_r, flags = GET_ALL_FLAGS(op);
-    sljit_ins cc, ins;
+    sljit_ins cc;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);
-    ADJUST_LOCAL_OFFSET(src, srcw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     op = GET_OPCODE(op);
     cc = get_cc(type & 0xff);
     dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
@@ -1864,56 +1871,34 @@
             FAIL_IF(push_inst16(compiler, MOVSI | RDN3(dst_r) | 1));
             FAIL_IF(push_inst16(compiler, MOVSI | RDN3(dst_r) | 0));
         }
-        if (dst_r != TMP_REG1)
+        if (!(dst & SLJIT_MEM))
             return SLJIT_SUCCESS;
         return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG1, dst, dstw, TMP_REG2);
     }


-    ins = (op == SLJIT_AND ? ANDI : (op == SLJIT_OR ? ORRI : EORI));
+    if (dst & SLJIT_MEM)
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, dst, dstw, TMP_REG2));


-    if ((op == SLJIT_OR || op == SLJIT_XOR) && FAST_IS_REG(dst) && dst == src) {
-        /* Does not change the other bits. */
-        FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
-        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst) | 1));
-        if (flags & SLJIT_SET_Z) {
-            /* The condition must always be set, even if the ORRI/EORI is not executed above. */
-            if (reg_map[dst] <= 7)
-                return push_inst16(compiler, MOVS | RD3(TMP_REG1) | RN3(dst));
-            return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(TMP_REG1) | RM4(dst));
-        }
-        return SLJIT_SUCCESS;
-    }
-
-    if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, src, srcw, TMP_REG2));
-        src = TMP_REG2;
-        srcw = 0;
-    } else if (src & SLJIT_IMM) {
-        FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));
-        src = TMP_REG2;
-        srcw = 0;
-    }
-
-    if (op == SLJIT_AND || src != dst_r) {
+    if (op == SLJIT_AND) {
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | (((cc & 0x1) ^ 0x1) << 3) | 0x4));
-        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 1));
-        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 0));
+        FAIL_IF(push_inst32(compiler, ANDI | RN4(dst_r) | RD4(dst_r) | 1));
+        FAIL_IF(push_inst32(compiler, ANDI | RN4(dst_r) | RD4(dst_r) | 0));
     }
     else {
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
-        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 1));
+        FAIL_IF(push_inst32(compiler, ((op == SLJIT_OR) ? ORRI : EORI) | RN4(dst_r) | RD4(dst_r) | 1));
     }


-    if (dst_r == TMP_REG1)
+    if (dst & SLJIT_MEM)
         FAIL_IF(emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG1, dst, dstw, TMP_REG2));


-    if (flags & SLJIT_SET_Z) {
-        /* The condition must always be set, even if the ORR/EORI is not executed above. */
-        if (reg_map[dst_r] <= 7)
-            return push_inst16(compiler, MOVS | RD3(TMP_REG1) | RN3(dst_r));
-        return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(TMP_REG1) | RM4(dst_r));
-    }
-    return SLJIT_SUCCESS;
+    if (!(flags & SLJIT_SET_Z))
+        return SLJIT_SUCCESS;
+
+    /* The condition must always be set, even if the ORR/EORI is not executed above. */
+    if (reg_map[dst_r] <= 7)
+        return push_inst16(compiler, MOVS | RD3(TMP_REG1) | RN3(dst_r));
+    return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(TMP_REG1) | RM4(dst_r));
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
@@ -1977,7 +1962,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
     PTR_FAIL_IF(emit_imm32_const(compiler, dst_r, init_value));


     if (dst & SLJIT_MEM)


Modified: code/trunk/src/sljit/sljitNativeMIPS_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_32.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeMIPS_32.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -153,7 +153,7 @@
         return SLJIT_SUCCESS;


     case SLJIT_ADD:
-        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW;
+        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
         is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);


         if (flags & SRC2_IMM) {
@@ -295,7 +295,7 @@
             return SLJIT_SUCCESS;
         }


-        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW;
+        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
         is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);


         if (flags & SRC2_IMM) {
@@ -367,7 +367,7 @@
     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & SRC2_IMM));


-        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW && GET_FLAG_TYPE(op) != SLJIT_MUL_NOT_OVERFLOW) {
+        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW) {
 #if (defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
             return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
 #else
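
Here and in the 64-bit twin below, emit_single_op stops testing for the NOT_OVERFLOW variants. The apparent rationale -- an assumption, not stated in the diff -- is that after this revision the flag producer is only ever asked for the base type, and the inverted sense is resolved by the consumer (a jump or op_flags), so SLJIT_NOT_OVERFLOW can no longer reach these switches. Under that assumption, "branch if no overflow" would be written as:

    /* Sketch; assumes the SLJIT_SET() flag-request macro of this
       sljitLir.h revision. */
    sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET(SLJIT_OVERFLOW),
        SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R1, 0);
    struct sljit_jump *no_ovf =
        sljit_emit_jump(compiler, SLJIT_NOT_OVERFLOW);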


Modified: code/trunk/src/sljit/sljitNativeMIPS_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_64.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeMIPS_64.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -244,7 +244,7 @@
         return SLJIT_SUCCESS;


     case SLJIT_ADD:
-        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW;
+        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
         is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);


         if (flags & SRC2_IMM) {
@@ -386,7 +386,7 @@
             return SLJIT_SUCCESS;
         }


-        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW;
+        is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
         is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);


         if (flags & SRC2_IMM) {
@@ -458,7 +458,7 @@
     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & SRC2_IMM));


-        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW && GET_FLAG_TYPE(op) != SLJIT_MUL_NOT_OVERFLOW) {
+        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW) {
 #if (defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
             if (op & SLJIT_I32_OP)
                 return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));


Modified: code/trunk/src/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_common.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeMIPS_common.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -178,6 +178,8 @@
 #define MOVT        (HI(0) | (1 << 16) | LO(1))
 #define MOVZ        (HI(0) | LO(10))
 #define MUL        (HI(28) | LO(2))
+#define PREF        (HI(51))
+#define PREFX        (HI(19) | LO(15))
 #define SEB        (HI(31) | (16 << 6) | LO(32))
 #define SEH        (HI(31) | (24 << 6) | LO(32))
 #endif
@@ -920,10 +922,8 @@
     }


     if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32 && !(src2 & SLJIT_MEM))
-            return SLJIT_SUCCESS;
-        if (HAS_FLAGS(op))
-            flags |= UNUSED_DEST;
+        SLJIT_ASSERT(HAS_FLAGS(op));
+        flags |= UNUSED_DEST;
     }
     else if (FAST_IS_REG(dst)) {
         dst_r = dst;
@@ -1085,6 +1085,29 @@
     return SLJIT_SUCCESS;
 }


+#if (defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
+static sljit_s32 emit_prefetch(struct sljit_compiler *compiler,
+        sljit_s32 src, sljit_sw srcw)
+{
+    if (!(src & OFFS_REG_MASK)) {
+        if (srcw <= SIMM_MAX && srcw >= SIMM_MIN)
+            return push_inst(compiler, PREF | S(src & REG_MASK) | IMM(srcw), MOVABLE_INS);
+
+        FAIL_IF(load_immediate(compiler, DR(TMP_REG1), srcw));
+        return push_inst(compiler, PREFX | S(src & REG_MASK) | T(TMP_REG1), MOVABLE_INS);
+    }
+
+    srcw &= 0x3;
+
+    if (SLJIT_UNLIKELY(srcw != 0)) {
+        FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(src)) | D(TMP_REG1) | SH_IMM(srcw), DR(TMP_REG1)));
+        return push_inst(compiler, PREFX | S(src & REG_MASK) | T(TMP_REG1), MOVABLE_INS);
+    }
+
+    return push_inst(compiler, PREFX | S(src & REG_MASK) | T(OFFS_REG(src)), MOVABLE_INS);
+}
+#endif
+
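
PREF takes a signed 16-bit displacement and PREFX an index register; the hint field (the rt slot) is left at zero in both, i.e. a plain prefetch-for-load. That gives the helper four shapes, modeled below (the explicit 16-bit bounds stand in for SIMM_MIN/SIMM_MAX):

    #include <stdint.h>
    enum pref_form { PREF_IMM16, PREFX_IMM_IN_REG,
                     PREFX_SHIFTED_INDEX, PREFX_INDEX };
    static enum pref_form prefetch_form(int has_index, int64_t srcw)
    {
        if (!has_index)                      /* SLJIT_MEM1 */
            return (srcw >= -32768 && srcw <= 32767)
                ? PREF_IMM16                 /* pref 0, srcw(base)     */
                : PREFX_IMM_IN_REG;          /* li t; prefx 0, t(base) */
        return (srcw & 0x3)                  /* SLJIT_MEM2 scale */
            ? PREFX_SHIFTED_INDEX            /* sll t, idx, sh; prefx  */
            : PREFX_INDEX;                   /* prefx 0, idx(base)     */
    }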
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
@@ -1100,6 +1123,14 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+#if (defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+            return emit_prefetch(compiler, src, srcw);
+#endif
+        return SLJIT_SUCCESS;
+    }
+
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
     if ((op & SLJIT_I32_OP) && GET_OPCODE(op) >= SLJIT_NOT) {
         flags |= INT_DATA | SIGNED_DATA;
@@ -1203,6 +1234,9 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
     if (op & SLJIT_I32_OP) {
         flags |= INT_DATA | SIGNED_DATA;
@@ -1301,9 +1335,6 @@


     FAIL_IF(push_inst(compiler, (TRUNC_W_S ^ (flags >> 19)) | FMT(op) | FS(src) | FD(TMP_FREG1), MOVABLE_INS));


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, MFC1 | flags | T(dst) | FS(TMP_FREG1), MOVABLE_INS);


@@ -1538,10 +1569,6 @@
     CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, ADDU_W | SA(RETURN_ADDR_REG) | TA(0) | D(dst), DR(dst));


@@ -1903,50 +1930,43 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
-    sljit_s32 sugg_dst_ar, dst_ar;
-    sljit_s32 flags = GET_ALL_FLAGS(op);
+    sljit_s32 src_ar, dst_ar;
+    sljit_s32 saved_op = op;
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#    define mem_type WORD_DATA
+    sljit_s32 mem_type = WORD_DATA;
 #else
     sljit_s32 mem_type = (op & SLJIT_I32_OP) ? (INT_DATA | SIGNED_DATA) : WORD_DATA;
 #endif


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     op = GET_OPCODE(op);
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-    if (op == SLJIT_MOV_S32 || op == SLJIT_MOV_U32)
+    if (op == SLJIT_MOV_S32)
         mem_type = INT_DATA | SIGNED_DATA;
 #endif
-    sugg_dst_ar = DR((op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2);
+    dst_ar = DR((op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2);


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
-    if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
-        ADJUST_LOCAL_OFFSET(src, srcw);
-        FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, DR(TMP_REG1), src, srcw, dst, dstw));
-        src = TMP_REG1;
-        srcw = 0;
-    }


+    if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
+        FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, DR(TMP_REG1), dst, dstw, dst, dstw));
+
     switch (type & 0xff) {
     case SLJIT_EQUAL:
     case SLJIT_NOT_EQUAL:
-        FAIL_IF(push_inst(compiler, SLTIU | SA(EQUAL_FLAG) | TA(sugg_dst_ar) | IMM(1), sugg_dst_ar));
-        dst_ar = sugg_dst_ar;
+        FAIL_IF(push_inst(compiler, SLTIU | SA(EQUAL_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
+        src_ar = dst_ar;
         break;
     case SLJIT_MUL_OVERFLOW:
     case SLJIT_MUL_NOT_OVERFLOW:
-        FAIL_IF(push_inst(compiler, SLTIU | SA(OTHER_FLAG) | TA(sugg_dst_ar) | IMM(1), sugg_dst_ar));
-        dst_ar = sugg_dst_ar;
+        FAIL_IF(push_inst(compiler, SLTIU | SA(OTHER_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
+        src_ar = dst_ar;
         type ^= 0x1; /* Flip type bit for the XORI below. */
         break;
     case SLJIT_GREATER_F64:
@@ -1958,38 +1978,40 @@
     case SLJIT_GREATER_EQUAL_F64:
     case SLJIT_UNORDERED_F64:
     case SLJIT_ORDERED_F64:
-        FAIL_IF(push_inst(compiler, CFC1 | TA(sugg_dst_ar) | DA(FCSR_REG), sugg_dst_ar));
-        FAIL_IF(push_inst(compiler, SRL | TA(sugg_dst_ar) | DA(sugg_dst_ar) | SH_IMM(23), sugg_dst_ar));
-        FAIL_IF(push_inst(compiler, ANDI | SA(sugg_dst_ar) | TA(sugg_dst_ar) | IMM(1), sugg_dst_ar));
-        dst_ar = sugg_dst_ar;
+        FAIL_IF(push_inst(compiler, CFC1 | TA(dst_ar) | DA(FCSR_REG), dst_ar));
+        FAIL_IF(push_inst(compiler, SRL | TA(dst_ar) | DA(dst_ar) | SH_IMM(23), dst_ar));
+        FAIL_IF(push_inst(compiler, ANDI | SA(dst_ar) | TA(dst_ar) | IMM(1), dst_ar));
+        src_ar = dst_ar;
         break;


     default:
-        dst_ar = OTHER_FLAG;
+        src_ar = OTHER_FLAG;
         break;
     }


     if (type & 0x1) {
-        FAIL_IF(push_inst(compiler, XORI | SA(dst_ar) | TA(sugg_dst_ar) | IMM(1), sugg_dst_ar));
-        dst_ar = sugg_dst_ar;
+        FAIL_IF(push_inst(compiler, XORI | SA(src_ar) | TA(dst_ar) | IMM(1), dst_ar));
+        src_ar = dst_ar;
     }


-    if (op >= SLJIT_ADD) {
-        if (DR(TMP_REG2) != dst_ar)
-            FAIL_IF(push_inst(compiler, ADDU_W | SA(dst_ar) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
-        return emit_op(compiler, op | flags, mem_type | CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
+    if (op < SLJIT_ADD) {
+        if (dst & SLJIT_MEM)
+            return emit_op_mem(compiler, mem_type, src_ar, dst, dstw);
+
+        if (src_ar != dst_ar)
+            return push_inst(compiler, ADDU_W | SA(src_ar) | TA(0) | DA(dst_ar), dst_ar);
+        return SLJIT_SUCCESS;
     }


-    if (dst & SLJIT_MEM)
-        return emit_op_mem(compiler, mem_type, dst_ar, dst, dstw);
+    /* OTHER_FLAG cannot be specified as src2 argument at the moment. */
+    if (DR(TMP_REG2) != src_ar)
+        FAIL_IF(push_inst(compiler, ADDU_W | SA(src_ar) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));


-    if (sugg_dst_ar != dst_ar)
-        return push_inst(compiler, ADDU_W | SA(dst_ar) | TA(0) | DA(sugg_dst_ar), sugg_dst_ar);
-    return SLJIT_SUCCESS;
+    mem_type |= CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE;


-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#    undef mem_type
-#endif
+    if (dst & SLJIT_MEM)
+        return emit_op(compiler, saved_op, mem_type, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
+    return emit_op(compiler, saved_op, mem_type, dst, dstw, dst, dstw, TMP_REG2, 0);
 }
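
Two things happen in this rewrite: the confusing sugg_dst_ar/dst_ar pair becomes dst_ar/src_ar, and, as in the other backends, the removed src operand is replaced by reloading dst itself (into TMP_REG1) when an accumulating op needs its old value; TMP_REG2 stages the flag because, per the new comment, OTHER_FLAG cannot yet be a src2 argument. The flag-to-boolean step is the classic MIPS idiom, in plain C:

    /* sltiu r, f, 1 computes (f == 0); xori by the type's low bit
       flips the sense, e.g. EQUAL -> NOT_EQUAL. */
    static unsigned flag_to_bool(unsigned long f, unsigned invert)
    {
        unsigned r = (f < 1);   /* sltiu r, f, 1      */
        return r ^ invert;      /* xori  r, r, invert */
    }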


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
@@ -2078,7 +2100,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;
+    reg = FAST_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));



Modified: code/trunk/src/sljit/sljitNativePPC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativePPC_common.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativePPC_common.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -154,6 +154,7 @@
 #define CMPL        (HI(31) | LO(32))
 #define CMPLI        (HI(10))
 #define CROR        (HI(19) | LO(449))
+#define DCBT        (HI(31) | LO(278))
 #define DIVD        (HI(31) | LO(489))
 #define DIVDU        (HI(31) | LO(457))
 #define DIVW        (HI(31) | LO(491))
@@ -1169,8 +1170,6 @@


     /* Destination check. */
     if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32 && !(src2 & SLJIT_MEM))
-            return SLJIT_SUCCESS;
         dst_r = TMP_REG2;
     }
     else if (FAST_IS_REG(dst)) {
@@ -1323,6 +1322,31 @@
     return SLJIT_SUCCESS;
 }


+static sljit_s32 emit_prefetch(struct sljit_compiler *compiler,
+        sljit_s32 src, sljit_sw srcw)
+{
+    if (!(src & OFFS_REG_MASK)) {
+        if (srcw == 0 && (src & REG_MASK) != SLJIT_UNUSED)
+            return push_inst(compiler, DCBT | A(0) | B(src & REG_MASK));
+
+        FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
+        /* Works with SLJIT_MEM0() case as well. */
+        return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+    }
+
+    srcw &= 0x3;
+
+    if (srcw == 0)
+        return push_inst(compiler, DCBT | A(src & REG_MASK) | B(OFFS_REG(src)));
+
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+    FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(src)) | A(TMP_REG1) | (srcw << 11) | ((31 - srcw) << 1)));
+#else
+    FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, OFFS_REG(src), srcw, 63 - srcw, 1)));
+#endif
+    return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
+}
+
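
dcbt takes only RA/RB register operands, so an immediate displacement is first materialized in TMP_REG1 and a scaled SLJIT_MEM2 index is pre-shifted. The 32-bit shift uses rlwinm rA,rS,sh,0,31-sh -- the canonical slwi idiom: rotate left by sh, keep PPC bits 0 through 31-sh. In C, for the sh values 1..3 that can occur here:

    static unsigned slwi_model(unsigned rs, unsigned sh) /* sh in 1..31 */
    {
        unsigned rot  = (rs << sh) | (rs >> (32 - sh)); /* rotate left */
        unsigned mask = 0xffffffffu << sh;   /* PPC bits 0..31-sh */
        return rot & mask;                   /* == rs << sh */
    }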
 #define EMIT_MOV(type, type_flags, type_cast) \
     emit_op(compiler, (src & SLJIT_IMM) ? SLJIT_MOV : type, flags | (type_flags), dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? type_cast srcw : srcw)


@@ -1338,11 +1362,18 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+            return emit_prefetch(compiler, src, srcw);
+
+        return SLJIT_SUCCESS;
+    }
+
     op = GET_OPCODE(op);
     if ((src & SLJIT_IMM) && srcw == 0)
         src = TMP_ZERO;


-    if (GET_FLAG_TYPE(op_flags) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op_flags) == SLJIT_NOT_OVERFLOW)
+    if (GET_FLAG_TYPE(op_flags) == SLJIT_OVERFLOW)
         FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));


     if (op_flags & SLJIT_I32_OP) {
@@ -1496,6 +1527,9 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     if ((src1 & SLJIT_IMM) && src1w == 0)
         src1 = TMP_ZERO;
     if ((src2 & SLJIT_IMM) && src2w == 0)
@@ -1513,7 +1547,7 @@
             flags |= ALT_SIGN_EXT;
     }
 #endif
-    if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW)
+    if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
         FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));
     if (src2 == TMP_REG2)
         flags |= ALT_KEEP_CACHE;
@@ -1520,7 +1554,7 @@


     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
-        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW)
+        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
             return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM1, dst, dstw, src1, src1w, src2, src2w);


         if (!HAS_FLAGS(op) && ((src1 | src2) & SLJIT_IMM)) {
@@ -1582,7 +1616,7 @@
             return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM1 | ALT_FORM3, dst, dstw, src1, src1w, src2, src2w);
         }


-        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW || GET_FLAG_TYPE(op) == SLJIT_NOT_OVERFLOW)
+        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
             return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2, dst, dstw, src1, src1w, src2, src2w);


         if (!HAS_FLAGS(op) && ((src1 | src2) & SLJIT_IMM)) {
@@ -1751,9 +1785,6 @@
     op = GET_OPCODE(op);
     FAIL_IF(push_inst(compiler, (op == SLJIT_CONV_S32_FROM_F64 ? FCTIWZ : FCTIDZ) | FD(TMP_FREG1) | FB(src)));


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (op == SLJIT_CONV_SW_FROM_F64) {
         if (FAST_IS_REG(dst)) {
             FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, 0, 0));
@@ -1761,12 +1792,8 @@
         }
         return emit_op_mem2(compiler, DOUBLE_DATA, TMP_FREG1, dst, dstw, 0, 0);
     }
-
 #else
     FAIL_IF(push_inst(compiler, FCTIWZ | FD(TMP_FREG1) | FB(src)));
-
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
 #endif


     if (FAST_IS_REG(dst)) {
@@ -2043,10 +2070,6 @@
     CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, MFLR | D(dst));


@@ -2231,28 +2254,23 @@
     return push_inst(compiler, BCCTR | (20 << 21) | (type >= SLJIT_FAST_CALL ? 1 : 0));
 }


-/* Get a bit from CR, all other bits are zeroed. */
-#define GET_CR_BIT(bit, dst) \
-    FAIL_IF(push_inst(compiler, RLWINM | S(dst) | A(dst) | ((1 + (bit)) << 11) | (31 << 6) | (31 << 1)));
-
-#define INVERT_BIT(dst) \
-    FAIL_IF(push_inst(compiler, XORI | S(dst) | A(dst) | 0x1));
-
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
-    sljit_s32 reg, input_flags;
-    sljit_s32 flags = GET_ALL_FLAGS(op);
-    sljit_sw original_dstw = dstw;
+    sljit_s32 reg, input_flags, cr_bit, invert;
+    sljit_s32 saved_op = op;
+    sljit_sw saved_dstw = dstw;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    input_flags = (op & SLJIT_I32_OP) ? INT_DATA : WORD_DATA;
+#else
+    input_flags = WORD_DATA;
+#endif


     op = GET_OPCODE(op);
     reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;
@@ -2259,97 +2277,89 @@


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
-    if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
-        ADJUST_LOCAL_OFFSET(src, srcw);
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-        input_flags = (flags & SLJIT_I32_OP) ? INT_DATA : WORD_DATA;
-#else
-        input_flags = WORD_DATA;
-#endif
-        FAIL_IF(emit_op_mem2(compiler, input_flags | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
-        src = TMP_REG1;
-        srcw = 0;
-    }


-    FAIL_IF(push_inst(compiler, MFCR | D(reg)));
+    if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
+        FAIL_IF(emit_op_mem2(compiler, input_flags | LOAD_DATA, TMP_REG1, dst, dstw, dst, dstw));


+    invert = 0;
+
     switch (type & 0xff) {
-    case SLJIT_EQUAL:
-        GET_CR_BIT(2, reg);
-        break;
-
-    case SLJIT_NOT_EQUAL:
-        GET_CR_BIT(2, reg);
-        INVERT_BIT(reg);
-        break;
-
     case SLJIT_LESS:
     case SLJIT_SIG_LESS:
-        GET_CR_BIT(0, reg);
+        cr_bit = 0;
         break;


     case SLJIT_GREATER_EQUAL:
     case SLJIT_SIG_GREATER_EQUAL:
-        GET_CR_BIT(0, reg);
-        INVERT_BIT(reg);
+        cr_bit = 0;
+        invert = 1;
         break;


     case SLJIT_GREATER:
     case SLJIT_SIG_GREATER:
-        GET_CR_BIT(1, reg);
+        cr_bit = 1;
         break;


     case SLJIT_LESS_EQUAL:
     case SLJIT_SIG_LESS_EQUAL:
-        GET_CR_BIT(1, reg);
-        INVERT_BIT(reg);
+        cr_bit = 1;
+        invert = 1;
         break;


+    case SLJIT_EQUAL:
+        cr_bit = 2;
+        break;
+
+    case SLJIT_NOT_EQUAL:
+        cr_bit = 2;
+        invert = 1;
+        break;
+
+    case SLJIT_OVERFLOW:
+    case SLJIT_MUL_OVERFLOW:
+        cr_bit = 3;
+        break;
+
+    case SLJIT_NOT_OVERFLOW:
+    case SLJIT_MUL_NOT_OVERFLOW:
+        cr_bit = 3;
+        invert = 1;
+        break;
+
     case SLJIT_LESS_F64:
-        GET_CR_BIT(4 + 0, reg);
+        cr_bit = 4 + 0;
         break;


     case SLJIT_GREATER_EQUAL_F64:
-        GET_CR_BIT(4 + 0, reg);
-        INVERT_BIT(reg);
+        cr_bit = 4 + 0;
+        invert = 1;
         break;


     case SLJIT_GREATER_F64:
-        GET_CR_BIT(4 + 1, reg);
+        cr_bit = 4 + 1;
         break;


     case SLJIT_LESS_EQUAL_F64:
-        GET_CR_BIT(4 + 1, reg);
-        INVERT_BIT(reg);
+        cr_bit = 4 + 1;
+        invert = 1;
         break;


-    case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
-        GET_CR_BIT(3, reg);
-        break;
-
-    case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
-        GET_CR_BIT(3, reg);
-        INVERT_BIT(reg);
-        break;
-
     case SLJIT_EQUAL_F64:
-        GET_CR_BIT(4 + 2, reg);
+        cr_bit = 4 + 2;
         break;


     case SLJIT_NOT_EQUAL_F64:
-        GET_CR_BIT(4 + 2, reg);
-        INVERT_BIT(reg);
+        cr_bit = 4 + 2;
+        invert = 1;
         break;


     case SLJIT_UNORDERED_F64:
-        GET_CR_BIT(4 + 3, reg);
+        cr_bit = 4 + 3;
         break;


     case SLJIT_ORDERED_F64:
-        GET_CR_BIT(4 + 3, reg);
-        INVERT_BIT(reg);
+        cr_bit = 4 + 3;
+        invert = 1;
         break;


     default:
@@ -2357,21 +2367,16 @@
         break;
     }


+    FAIL_IF(push_inst(compiler, MFCR | D(reg)));
+    FAIL_IF(push_inst(compiler, RLWINM | S(reg) | A(reg) | ((1 + (cr_bit)) << 11) | (31 << 6) | (31 << 1)));
+
+    if (invert)
+        FAIL_IF(push_inst(compiler, XORI | S(reg) | A(reg) | 0x1));
+
     if (op < SLJIT_ADD) {
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-        if (op == SLJIT_MOV)
-            input_flags = WORD_DATA;
-        else {
-            op = SLJIT_MOV_U32;
-            input_flags = INT_DATA;
-        }
-#else
-        op = SLJIT_MOV;
-        input_flags = WORD_DATA;
-#endif
-        if (reg != TMP_REG2)
+        if (!(dst & SLJIT_MEM))
             return SLJIT_SUCCESS;
-        return emit_op(compiler, op, input_flags, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
+        return emit_op_mem2(compiler, input_flags, reg, dst, dstw, reg, 0);
     }


 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
@@ -2378,7 +2383,9 @@
         || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     compiler->skip_checks = 1;
 #endif
-    return sljit_emit_op2(compiler, op | flags, dst, original_dstw, src, srcw, TMP_REG2, 0);
+    if (dst & SLJIT_MEM)
+        return sljit_emit_op2(compiler, saved_op, dst, saved_dstw, TMP_REG1, 0, TMP_REG2, 0);
+    return sljit_emit_op2(compiler, saved_op, dst, 0, dst, 0, TMP_REG2, 0);
 }
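
The GET_CR_BIT/INVERT_BIT macro pair gives way to a (cr_bit, invert) table with one shared MFCR / RLWINM / optional XORI tail, so each condition still costs two or three instructions but the selection reads as data. The RLWINM form -- shift amount cr_bit+1, MB=ME=31 -- rotates the wanted CR bit (PPC numbers bits from the MSB) into the least-significant position and masks everything else:

    /* Extract CR bit n, as rlwinm r,r,n+1,31,31 does (n in 0..30;
       only 0..7 are used above). */
    static unsigned get_cr_bit_model(unsigned cr, unsigned n)
    {
        unsigned rot = (cr << (n + 1)) | (cr >> (31 - n));
        return rot & 1;      /* MB = ME = 31 keeps just the LSB */
    }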


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
@@ -2404,7 +2411,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;
+    reg = FAST_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));



Modified: code/trunk/src/sljit/sljitNativeSPARC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeSPARC_common.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeSPARC_common.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -683,18 +683,16 @@
         compiler->cache_argw = 0;
     }


-    if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32 && !(src2 & SLJIT_MEM))
-            return SLJIT_SUCCESS;
+    if (dst != SLJIT_UNUSED) {
+        if (FAST_IS_REG(dst)) {
+            dst_r = dst;
+            flags |= REG_DEST;
+            if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+                sugg_src2_r = dst_r;
+        }
+        else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1, dst, dstw))
+            flags |= SLOW_DEST;
     }
-    else if (FAST_IS_REG(dst)) {
-        dst_r = dst;
-        flags |= REG_DEST;
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
-            sugg_src2_r = dst_r;
-    }
-    else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1, dst, dstw))
-        flags |= SLOW_DEST;


     if (flags & IMM_OP) {
         if ((src2 & SLJIT_IMM) && src2w) {
@@ -850,6 +848,9 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     op = GET_OPCODE(op);
     switch (op) {
     case SLJIT_MOV:
@@ -920,6 +921,9 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     op = GET_OPCODE(op);
     switch (op) {
     case SLJIT_ADD:
@@ -991,9 +995,6 @@


     FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOI, FDTOI) | DA(TMP_FREG1) | S2A(src), MOVABLE_INS));


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst)) {
         FAIL_IF(emit_op_mem2(compiler, SINGLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
         return emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET);
@@ -1207,10 +1208,6 @@
     CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    /* For UNUSED dst. Uncommon, but possible. */
-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     if (FAST_IS_REG(dst))
         return push_inst(compiler, OR | D(dst) | S1(0) | S2(TMP_LINK), DR(dst));


@@ -1394,18 +1391,14 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
     sljit_s32 reg, flags = HAS_FLAGS(op) ? SET_FLAGS : 0;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
 #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
     op = GET_OPCODE(op);
     reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;
@@ -1412,13 +1405,10 @@


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
-    if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
-        ADJUST_LOCAL_OFFSET(src, srcw);
-        FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
-        src = TMP_REG1;
-        srcw = 0;
-    }


+    if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
+        FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, dst, dstw, dst, dstw));
+
     type &= 0xff;
     if (type < SLJIT_EQUAL_F64)
         FAIL_IF(push_inst(compiler, BICC | get_cc(type) | 3, UNMOVABLE_INS));
@@ -1428,10 +1418,17 @@
     FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(1), UNMOVABLE_INS));
     FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(0), UNMOVABLE_INS));


-    if (op >= SLJIT_ADD)
-        return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
+    if (op >= SLJIT_ADD) {
+        flags |= CUMULATIVE_OP | IMM_OP | ALT_KEEP_CACHE;
+        if (dst & SLJIT_MEM)
+            return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
+        return emit_op(compiler, op, flags, dst, 0, dst, 0, TMP_REG2, 0);
+    }


-    return (reg == TMP_REG2) ? emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw) : SLJIT_SUCCESS;
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;
+
+    return emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw);
 #else
 #error "Implementation required"
 #endif
@@ -1464,7 +1461,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;
+    reg = FAST_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));



Modified: code/trunk/src/sljit/sljitNativeTILEGX_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeTILEGX_64.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeTILEGX_64.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -2092,9 +2092,6 @@
     CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     op = GET_OPCODE(op);
     if (op == SLJIT_MOV_S32 || op == SLJIT_MOV_U32)
         mem_type = INT_DATA | SIGNED_DATA;


Modified: code/trunk/src/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_64.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeX86_64.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -47,9 +47,8 @@
         *code_ptr++ = 10 + 3;
     }


-    SLJIT_ASSERT(reg_map[TMP_REG3] == 9);
-    *code_ptr++ = REX_W | REX_B;
-    *code_ptr++ = MOV_r_i32 + 1;
+    *code_ptr++ = REX_W | ((reg_map[TMP_REG2] <= 7) ? 0 : REX_B);
+    *code_ptr++ = MOV_r_i32 | reg_lmap[TMP_REG2];
     jump->addr = (sljit_uw)code_ptr;


     if (jump->flags & JUMP_LABEL)
@@ -58,9 +57,10 @@
         sljit_unaligned_store_sw(code_ptr, jump->u.target);


     code_ptr += sizeof(sljit_sw);
-    *code_ptr++ = REX_B;
+    if (reg_map[TMP_REG2] >= 8)
+        *code_ptr++ = REX_B;
     *code_ptr++ = GROUP_FF;
-    *code_ptr++ = (type >= SLJIT_FAST_CALL) ? (MOD_REG | CALL_rm | 1) : (MOD_REG | JMP_rm | 1);
+    *code_ptr++ = MOD_REG | (type >= SLJIT_FAST_CALL ? CALL_rm : JMP_rm) | reg_lmap[TMP_REG2];


     return code_ptr;
 }
@@ -380,12 +380,12 @@
     if (b & SLJIT_MEM) {
         if (!(b & OFFS_REG_MASK)) {
             if (NOT_HALFWORD(immb)) {
-                PTR_FAIL_IF(emit_load_imm64(compiler, TMP_REG3, immb));
+                PTR_FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immb));
                 immb = 0;
                 if (b & REG_MASK)
-                    b |= TO_OFFS_REG(TMP_REG3);
+                    b |= TO_OFFS_REG(TMP_REG2);
                 else
-                    b |= TMP_REG3;
+                    b |= TMP_REG2;
             }
             else if (reg_lmap[b & REG_MASK] == 4)
                 b |= TO_OFFS_REG(SLJIT_SP);
@@ -545,17 +545,19 @@
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */


-static SLJIT_INLINE sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 type)
+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 type)
 {
     sljit_u8 *inst;


+    /* After any change update IS_REG_CHANGED_BY_CALL as well. */
 #ifndef _WIN64
-    SLJIT_ASSERT(reg_map[SLJIT_R1] == 6 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8);
+    SLJIT_ASSERT(reg_map[SLJIT_R1] == 6 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8 && reg_map[TMP_REG1] == 2);


     inst = (sljit_u8*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
     FAIL_IF(!inst);
     INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
     if (type >= SLJIT_CALL3) {
+        /* Move third argument to TMP_REG1. */
         *inst++ = REX_W;
         *inst++ = MOV_r_rm;
         *inst++ = MOD_REG | (0x2 /* rdx */ << 3) | reg_lmap[SLJIT_R2];
@@ -564,12 +566,13 @@
     *inst++ = MOV_r_rm;
     *inst++ = MOD_REG | (0x7 /* rdi */ << 3) | reg_lmap[SLJIT_R0];
 #else
-    SLJIT_ASSERT(reg_map[SLJIT_R1] == 2 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8);
+    SLJIT_ASSERT(reg_map[SLJIT_R1] == 2 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8 && reg_map[TMP_REG1] == 8);


     inst = (sljit_u8*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
     FAIL_IF(!inst);
     INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
     if (type >= SLJIT_CALL3) {
+        /* Move third argument to TMP_REG1. */
         *inst++ = REX_W | REX_R;
         *inst++ = MOV_r_rm;
         *inst++ = MOD_REG | (0x0 /* r8 */ << 3) | reg_lmap[SLJIT_R2];


Modified: code/trunk/src/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_common.c    2017-05-06 16:56:07 UTC (rev 767)
+++ code/trunk/src/sljit/sljitNativeX86_common.c    2017-05-07 07:10:16 UTC (rev 768)
@@ -85,28 +85,27 @@
 /* Last register + 1. */
 #define TMP_REG1    (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
-#define TMP_REG3    (SLJIT_NUMBER_OF_REGISTERS + 4)


 /* Note: r12 & 0x7 == 0b100, which is decoded as SIB byte present
    Note: avoid using r12 and r13 for memory addressing
-   therefore r12 is better for SAVED_EREG than SAVED_REG. */
+   therefore r12 is better used as a higher saved register. */
 #ifndef _WIN64
-/* 1st passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
+/* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
+    0, 0, 6, 1, 7, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
 };
 /* low-map. reg_map & 0x7. */
-static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
+static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
+    0, 0, 6, 1, 7, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
 };
 #else
-/* 1st passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
+/* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
+    0, 0, 2, 1, 10, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 8, 9
 };
 /* low-map. reg_map & 0x7. */
-static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
+static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
+    0, 0, 2, 1, 2,  3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 0, 1
 };
 #endif
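
With TMP_REG3 gone, both tables shrink from +5 to +4 entries and are reshuffled so that r9 backs TMP_REG2 on both ABIs. One invariant worth spelling out: reg_lmap must equal reg_map & 0x7 entry-for-entry, since it supplies the three ModRM register bits while REX.B/REX.R carries the fourth. A self-contained check against the new non-Windows tables:

    #include <assert.h>
    static const unsigned char map_unix[17] =
        { 0, 0, 6, 1, 7, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9 };
    static const unsigned char lmap_unix[17] =
        { 0, 0, 6, 1, 7, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1 };
    int main(void)
    {
        int i;
        for (i = 0; i < 17; i++)
            assert((map_unix[i] & 0x7) == lmap_unix[i]);
        return 0;
    }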


@@ -169,7 +168,7 @@
 #define CALL_i32    0xe8
 #define CALL_rm        (/* GROUP_FF */ 2 << 3)
 #define CDQ        0x99
-#define CMOVNE_r_rm    (/* GROUP_0F */ 0x45)
+#define CMOVE_r_rm    (/* GROUP_0F */ 0x44)
 #define CMP        (/* BINARY */ 7 << 3)
 #define CMP_EAX_i32    0x3d
 #define CMP_r_rm    0x3b
@@ -217,6 +216,7 @@
 #define POP_r        0x58
 #define POP_rm        0x8f
 #define POPF        0x9d
+#define PREFETCH    0x18
 #define PUSH_i32    0x68
 #define PUSH_r        0x50
 #define PUSH_rm        (/* GROUP_FF */ 6 << 3)
@@ -602,14 +602,20 @@
         return 1;
 #endif /* SLJIT_DETECT_SSE2 */


-    case SLJIT_HAS_CLZ:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+    case SLJIT_HAS_VIRTUAL_REGISTERS:
         return 1;
+#endif


+    case SLJIT_HAS_CLZ:
     case SLJIT_HAS_CMOV:
         if (cpu_has_cmov == -1)
             get_cpu_features();
         return cpu_has_cmov;


+    case SLJIT_HAS_PREF_SHIFT_REG:
+        return 1;
+
     case SLJIT_HAS_SSE2:
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
         if (cpu_has_sse2 == -1)
@@ -676,15 +682,8 @@
 {
     sljit_u8* inst;


-    if (dst == SLJIT_UNUSED) {
-        /* No destination, doesn't need to setup flags. */
-        if (src & SLJIT_MEM) {
-            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
-            FAIL_IF(!inst);
-            *inst = MOV_r_rm;
-        }
-        return SLJIT_SUCCESS;
-    }
+    SLJIT_ASSERT(dst != SLJIT_UNUSED);
+
     if (FAST_IS_REG(src)) {
         inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
         FAIL_IF(!inst);
@@ -706,8 +705,10 @@
         }
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
-            FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
-            inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
+            /* Immediate to memory move. Only SLJIT_MOV operation copies
+               an immediate directly into memory so TMP_REG1 can be used. */
+            FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
             FAIL_IF(!inst);
             *inst = MOV_rm_r;
             return SLJIT_SUCCESS;
@@ -725,7 +726,8 @@
         return SLJIT_SUCCESS;
     }


-    /* Memory to memory move. Requires two instruction. */
+    /* Memory to memory move. Only SLJIT_MOV operation copies
+       data from memory to memory so TMP_REG1 can be used. */
     inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
     FAIL_IF(!inst);
     *inst = MOV_r_rm;
@@ -898,9 +900,6 @@
     compiler->mode32 = 0;
 #endif


-    if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
-        return SLJIT_SUCCESS; /* Empty instruction. */
-
     if (src & SLJIT_IMM) {
         if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -1029,6 +1028,30 @@
     return SLJIT_SUCCESS;
 }


+static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 src, sljit_sw srcw)
+{
+    sljit_u8* inst;
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+    compiler->mode32 = 1;
+#endif
+
+    inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
+    FAIL_IF(!inst);
+    *inst++ = GROUP_0F;
+    *inst++ = PREFETCH;
+
+    if (op >= SLJIT_MOV_U8 && op <= SLJIT_MOV_S8)
+        *inst |= (3 << 3);
+    else if (op >= SLJIT_MOV_U16 && op <= SLJIT_MOV_S16)
+        *inst |= (2 << 3);
+    else
+        *inst |= (1 << 3);
+
+    return SLJIT_SUCCESS;
+}
+
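
0F 18 is the prefetch group on x86; the ModRM reg field selects the hint: /1 prefetcht0, /2 prefetcht1, /3 prefetcht2 (/0, prefetchnta, is unused here). Mapping byte moves to t2, halfword moves to t1 and everything else to t0 looks like a deliberate sljit convention rather than anything architectural. The bytes for, say, prefetcht0 [rax]:

    /* mod=00, reg=/1, rm=rax(000) -> ModRM 0x08 */
    static const unsigned char prefetcht0_rax[3] = { 0x0f, 0x18, 0x08 };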
 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
@@ -1040,9 +1063,6 @@
     compiler->mode32 = 0;
 #endif


-    if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
-        return SLJIT_SUCCESS; /* Empty instruction. */
-
     if (src & SLJIT_IMM) {
         if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -1086,14 +1106,6 @@
 {
     sljit_u8* inst;


-    if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
-        FAIL_IF(!inst);
-        *inst++ = GROUP_F7;
-        *inst |= opcode;
-        return SLJIT_SUCCESS;
-    }
     if (dst == src && dstw == srcw) {
         /* Same input and output */
         inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
@@ -1102,14 +1114,19 @@
         *inst |= opcode;
         return SLJIT_SUCCESS;
     }
+
+    if (dst == SLJIT_UNUSED)
+        dst = TMP_REG1;
+
     if (FAST_IS_REG(dst)) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_F7;
         *inst |= opcode;
         return SLJIT_SUCCESS;
     }
+
     EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
     inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
     FAIL_IF(!inst);
@@ -1125,20 +1142,12 @@
 {
     sljit_u8* inst;


-    if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
-        FAIL_IF(!inst);
-        *inst++ = GROUP_F7;
-        *inst |= NOT_rm;
-        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
-        FAIL_IF(!inst);
-        *inst = OR_r_rm;
-        return SLJIT_SUCCESS;
-    }
+    if (dst == SLJIT_UNUSED)
+        dst = TMP_REG1;
+
     if (FAST_IS_REG(dst)) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_F7;
         *inst |= NOT_rm;
@@ -1147,6 +1156,7 @@
         *inst = OR_r_rm;
         return SLJIT_SUCCESS;
     }
+
     EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
     inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
     FAIL_IF(!inst);
@@ -1159,6 +1169,10 @@
     return SLJIT_SUCCESS;
 }


+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+static const sljit_sw emit_clz_arg = 32 + 31;
+#endif
+
 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
@@ -1167,8 +1181,6 @@
     sljit_s32 dst_r;


     SLJIT_UNUSED_ARG(op_flags);
-    if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
-        return SLJIT_SUCCESS;


     if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
         EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
@@ -1176,81 +1188,53 @@
         srcw = 0;
     }


-    inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
+    if (cpu_has_cmov == -1)
+        get_cpu_features();
+
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+
+    inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
     FAIL_IF(!inst);
     *inst++ = GROUP_0F;
     *inst = BSR_r_rm;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    if (FAST_IS_REG(dst))
-        dst_r = dst;
-    else {
-        /* Find an unused temporary register. */
-        if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
-            dst_r = SLJIT_R0;
-        else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
-            dst_r = SLJIT_R1;
+    if (cpu_has_cmov) {
+        if (dst_r != TMP_REG1) {
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
+            inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
+        }
         else
-            dst_r = SLJIT_R2;
-        EMIT_MOV(compiler, dst, dstw, dst_r, 0);
-    }
-    EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
-#else
-    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
-    compiler->mode32 = 0;
-    EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
-    compiler->mode32 = op_flags & SLJIT_I32_OP;
-#endif
+            inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);


-    if (cpu_has_cmov == -1)
-        get_cpu_features();
-
-    if (cpu_has_cmov) {
-        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_0F;
-        *inst = CMOVNE_r_rm;
-    } else {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
-        FAIL_IF(!inst);
-        INC_SIZE(4);
+        *inst = CMOVE_r_rm;
+    }
+    else
+        FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));


-        *inst++ = JE_i8;
-        *inst++ = 2;
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
+    inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
 #else
-        inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
+    if (cpu_has_cmov) {
+        EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31));
+
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
         FAIL_IF(!inst);
-        INC_SIZE(5);
-
-        *inst++ = JE_i8;
-        *inst++ = 3;
-        *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
-#endif
+        *inst++ = GROUP_0F;
+        *inst = CMOVE_r_rm;
     }
+    else
+        FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31)));


-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
-#else
     inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
 #endif
+
     FAIL_IF(!inst);
     *(inst + 1) |= XOR;


-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    if (dst & SLJIT_MEM) {
-        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
-        FAIL_IF(!inst);
-        *inst = XCHG_r_rm;
-    }
-#else
     if (dst & SLJIT_MEM)
-        EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
-#endif
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     return SLJIT_SUCCESS;
 }
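
The rewrite drops the hand-rolled JE-skip sequences in favor of CMOVE (note CMOVNE_r_rm becoming CMOVE_r_rm above) or the new sljit_emit_cmov_generic fallback; on 32-bit, when dst_r is TMP_REG1 and no register is free for the constant, 32 + 31 is cmov'd straight from memory through emit_clz_arg. The arithmetic: for x != 0, BSR yields the index of the highest set bit, and since that index is at most 31, 31 - bsr(x) equals bsr(x) ^ 31; for x == 0, BSR sets ZF and leaves the destination undefined, so 32 + 31 is substituted and the final XOR turns it into 32. A C model:

    static int clz32_model(unsigned x)
    {
        int b = 32 + 31;              /* the value CMOVE substitutes  */
        if (x != 0) {                 /* bsr: index of highest set bit */
            b = 31;
            while (!(x & 0x80000000u)) { x <<= 1; b--; }
        }
        return b ^ 31;                /* 0..31 for x != 0, 32 for x == 0 */
    }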


@@ -1278,7 +1262,14 @@
     compiler->mode32 = op_flags & SLJIT_I32_OP;
 #endif


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+        if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+            return emit_prefetch(compiler, op, src, srcw);
+        return SLJIT_SUCCESS;
+    }
+
     op = GET_OPCODE(op);
+
     if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         compiler->mode32 = 0;
@@ -1432,8 +1423,8 @@
         *(inst + 1) |= (op_imm); \
     } \
     else { \
-        FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
-        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
+        FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
+        inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
         FAIL_IF(!inst); \
         *inst = (op_mr); \
     }
@@ -1659,7 +1650,7 @@
     sljit_u8* inst;
     sljit_s32 dst_r;


-    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     /* Register destination. */
     if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
@@ -1711,9 +1702,9 @@
             sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
         }
         else {
-            EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
             if (dst_r != src2)
                 EMIT_MOV(compiler, dst_r, 0, src2, src2w);
+            FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
             inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
             FAIL_IF(!inst);
             *inst++ = GROUP_0F;
@@ -1754,9 +1745,9 @@
             sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
         }
         else {
-            EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
             if (dst_r != src1)
                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
+            FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
             inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
             FAIL_IF(!inst);
             *inst++ = GROUP_0F;
@@ -1775,7 +1766,7 @@
         *inst = IMUL_r_rm;
     }


-    if (dst_r == TMP_REG1)
+    if (dst & SLJIT_MEM)
         EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);


     return SLJIT_SUCCESS;
@@ -1922,8 +1913,8 @@
                 *inst = GROUP_F7;
             }
             else {
-                FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
-                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
+                FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src2w));
+                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src1, src1w);
                 FAIL_IF(!inst);
                 *inst = TEST_rm_r;
             }
@@ -1951,8 +1942,8 @@
                 *inst = GROUP_F7;
             }
             else {
-                FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
-                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
+                FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
+                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                 FAIL_IF(!inst);
                 *inst = TEST_rm_r;
             }
@@ -2066,22 +2057,27 @@
     else {
         /* This case is complex since ecx itself may be used for
            addressing, and this case must be supported as well. */
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-        EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
-#else
         EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
-#endif
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst |= mode;
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
+#else
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG2, 0, src2, src2w);
+        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
+        FAIL_IF(!inst);
+        *inst = XCHG_r_rm;
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+        FAIL_IF(!inst);
+        *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
-#else
-        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 #endif
-        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     }


     return SLJIT_SUCCESS;
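
This is the corner where the shift count is not already in ECX while ECX itself may be live in dst's addressing. The 32-bit path keeps the old spill through the stack; the 64-bit path now parks RCX with a single XCHG against TMP_REG2 instead of a pair of MOVs. The register traffic of the 64-bit path, modeled in C (left shift chosen for concreteness):

    static void shift_shuffle_model(unsigned long *rcx,
        unsigned long *tmp1, unsigned long *tmp2,
        unsigned long src1, unsigned long count)
    {
        unsigned long t;
        *tmp1 = src1;                        /* mov  tmp1, src1 */
        *tmp2 = count;                       /* mov  tmp2, src2 */
        t = *rcx; *rcx = *tmp2; *tmp2 = t;   /* xchg tmp2, rcx  */
        *tmp1 <<= (*rcx & 63);               /* shl  tmp1, cl   */
        *rcx = *tmp2;                        /* mov  rcx, tmp2  */
        /* finally: mov dst, tmp1 */
    }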
@@ -2140,6 +2136,9 @@
     compiler->mode32 = op & SLJIT_I32_OP;
 #endif


+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
         if (!HAS_FLAGS(op)) {
@@ -2226,17 +2225,19 @@
 /*  Floating point operators                                             */
 /* --------------------------------------------------------------------- */


-/* Alignment + 2 * 16 bytes. */
-static sljit_s32 sse2_data[3 + (4 + 4) * 2];
+/* Alignment(3) + 4 * 16 bytes. */
+static sljit_s32 sse2_data[3 + (4 * 4)];
static sljit_s32 *sse2_buffer;

 static void init_compiler(void)
 {
+    /* Align to 16 bytes. */
     sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
-    /* Single precision constants. */
+
+    /* Single precision constants (each constant is 16 bytes long). */
     sse2_buffer[0] = 0x80000000;
     sse2_buffer[4] = 0x7fffffff;
-    /* Double precision constants. */
+    /* Double precision constants (each constant is 16 bytes long). */
     sse2_buffer[8] = 0;
     sse2_buffer[9] = 0x80000000;
     sse2_buffer[12] = 0xffffffff;
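The constant pool above grows from two to four 16-byte slots and gains explicit comments. The alignment idiom is worth spelling out: the array is over-allocated by three 32-bit words (12 bytes) so that rounding the base address up to a 16-byte boundary always lands inside the buffer. A standalone sketch of the same trick; names are illustrative:

    #include <stdint.h>

    static int32_t data[3 + 4 * 4];   /* 12 bytes slack + 4 x 16-byte slots */
    static int32_t *buffer;

    static void init_example(void)
    {
        /* Round up to the next 16-byte boundary inside data[]. */
        buffer = (int32_t *)(((uintptr_t)data + 15) & ~(uintptr_t)0xf);
        buffer[0] = (int32_t)0x80000000;  /* slot 0: float sign-bit mask  */
        buffer[4] = 0x7fffffff;           /* slot 1 (+16B): float abs mask */
    }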
@@ -2283,7 +2284,7 @@
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
 {
-    sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
     sljit_u8 *inst;

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -2296,7 +2297,7 @@
     *inst++ = GROUP_0F;
     *inst = CVTTSD2SI_r_xm;

-    if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
+    if (dst & SLJIT_MEM)
         return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
     return SLJIT_SUCCESS;
 }
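For reference, CVTTSD2SI is the truncating form of the conversion, so the result matches C cast semantics (round toward zero) rather than the current SSE rounding mode; the final move again happens only for memory destinations. A trivial standalone check of the truncation rule:

    #include <assert.h>

    int main(void)
    {
        assert((long)-2.7 == -2);  /* truncates toward zero, not floor */
        assert((long) 2.7 ==  2);
        return 0;
    }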
@@ -2388,7 +2389,7 @@
         return SLJIT_SUCCESS;
     }

-    if (SLOW_IS_REG(dst)) {
+    if (FAST_IS_REG(dst)) {
         dst_r = dst;
         if (dst != src)
             FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
@@ -2533,6 +2534,14 @@
     return jump;
 }

+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+#ifndef _WIN64
+#define IS_REG_CHANGED_BY_CALL(src, type) ((src) == SLJIT_R3)
+#else
+#define IS_REG_CHANGED_BY_CALL(src, type) ((src) == SLJIT_R2)
+#endif
+#endif
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     sljit_u8 *inst;
@@ -2554,11 +2563,10 @@
         if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
             srcw += sizeof(sljit_sw);
 #endif
-#endif
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
-        if (src == SLJIT_R2) {
-            EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
-            src = TMP_REG1;
+#else
+        if ((src & SLJIT_MEM) || IS_REG_CHANGED_BY_CALL(src, type)) {
+            EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
+            src = TMP_REG2;
         }
 #endif
         FAIL_IF(call_with_args(compiler, type));
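The rewritten branch above generalizes the old Win64-only special case: before an indirect call, call_with_args() shuffles the sljit argument registers into ABI positions, which clobbers one register (SLJIT_R3 on SysV, SLJIT_R2 on Win64, per the new IS_REG_CHANGED_BY_CALL macro) and can invalidate memory operands addressed through moving registers, so such a call target is first parked in TMP_REG2. Hypothetical user code that would trigger the copy on a SysV target:

    /* The call target lives in SLJIT_R3, which the argument shuffling
       for a two-argument call would overwrite; the backend copies it
       to TMP_REG2 before emitting the call. */
    sljit_emit_ijump(compiler, SLJIT_CALL2, SLJIT_R3, 0);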
@@ -2598,7 +2606,6 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
-    sljit_s32 src, sljit_sw srcw,
     sljit_s32 type)
 {
     sljit_u8 *inst;
@@ -2611,12 +2618,8 @@
     sljit_sw dstw_save = dstw;

     CHECK_ERROR();
-    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
-    SLJIT_UNUSED_ARG(srcw);
+    CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));

-    if (dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
-
     ADJUST_LOCAL_OFFSET(dst, dstw);
     CHECK_EXTRA_REGS(dst, dstw, (void)0);
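This hunk applies the two-operand sljit_emit_op_flags signature introduced at the top of the patch and drops the early SLJIT_UNUSED return. A usage sketch of the narrowed interface; register choices are illustrative:

    /* Compare R0 against 42, then materialize the Z flag as 0/1 in R1.
       The removed src/srcw pair is gone: OR-type ops now combine the
       flag with dst itself instead of a separate source operand. */
    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0,
        SLJIT_R0, 0, SLJIT_IMM, 42);
    sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_EQUAL);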

@@ -2625,7 +2628,7 @@
     cond_set = get_jump_code(type) + 0x10;

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
+    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
         FAIL_IF(!inst);
         INC_SIZE(4 + 3);
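A note on the cond_set computation visible above: the two-byte conditional jumps occupy opcodes 0F 80 through 0F 8F, and the matching SETcc instructions sit exactly 0x10 higher at 0F 90 through 0F 9F, which is why get_jump_code(type) + 0x10 yields the right SETcc byte. An illustrative check with hypothetical opcode names:

    #include <assert.h>

    #define JE_rel32 0x84   /* 0F 84: je  rel32 */
    #define SETE_rm8 0x94   /* 0F 94: sete r/m8 */

    int main(void)
    {
        assert(JE_rel32 + 0x10 == SETE_rm8);
        return 0;
    }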
@@ -2640,7 +2643,7 @@
         return SLJIT_SUCCESS;
     }

-    reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
+    reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;

     inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
     FAIL_IF(!inst);
@@ -2663,6 +2666,7 @@
         compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
         return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
     }
+
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
         || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     compiler->skip_checks = 1;
@@ -2724,7 +2728,7 @@
         return SLJIT_SUCCESS;
     }

-    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
+    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
         SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);

         if (dst != SLJIT_R0) {
@@ -2876,14 +2880,11 @@

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     compiler->mode32 = 0;
-    reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    reg = FAST_IS_REG(dst) ? dst : TMP_REG1;

     if (emit_load_imm64(compiler, reg, init_value))
         return NULL;
 #else
-    if (dst == SLJIT_UNUSED)
-        dst = TMP_REG1;
-
     if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
         return NULL;
 #endif
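Finally, the constant hunk: on x86-64 the initial value now always goes through emit_load_imm64 into a register (dst itself when it is a register, TMP_REG1 otherwise), which keeps the full 64-bit immediate patchable in place, while on 32-bit a plain emit_mov suffices and the old SLJIT_UNUSED fallback is gone. A hypothetical call site, assuming the public sljit_emit_const entry point:

    /* Plant a patchable constant load into R0; the returned handle can
       later be used to rewrite the embedded immediate in the generated
       code. */
    struct sljit_const *const_handle =
        sljit_emit_const(compiler, SLJIT_R0, 0, 0);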