[Pcre-svn] [1453] code/trunk: JIT compiler update.

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [1453] code/trunk: JIT compiler update.
Revision: 1453
          http://vcs.pcre.org/viewvc?view=rev&revision=1453
Author:   zherczeg
Date:     2014-01-30 06:10:21 +0000 (Thu, 30 Jan 2014)


Log Message:
-----------
JIT compiler update.

Modified Paths:
--------------
    code/trunk/Makefile.am
    code/trunk/sljit/sljitConfig.h
    code/trunk/sljit/sljitConfigInternal.h
    code/trunk/sljit/sljitLir.c
    code/trunk/sljit/sljitLir.h
    code/trunk/sljit/sljitNativeARM_Thumb2.c
    code/trunk/sljit/sljitNativeARM_v5.c
    code/trunk/sljit/sljitNativeMIPS_common.c
    code/trunk/sljit/sljitNativePPC_32.c
    code/trunk/sljit/sljitNativePPC_64.c
    code/trunk/sljit/sljitNativePPC_common.c
    code/trunk/sljit/sljitNativeSPARC_common.c
    code/trunk/sljit/sljitNativeTILEGX.c
    code/trunk/sljit/sljitNativeX86_32.c
    code/trunk/sljit/sljitNativeX86_64.c
    code/trunk/sljit/sljitNativeX86_common.c


Added Paths:
-----------
    code/trunk/sljit/sljitNativeARM_64.c


Modified: code/trunk/Makefile.am
===================================================================
--- code/trunk/Makefile.am    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/Makefile.am    2014-01-30 06:10:21 UTC (rev 1453)
@@ -350,6 +350,7 @@
   sljit/sljitExecAllocator.c \
   sljit/sljitLir.c \
   sljit/sljitLir.h \
+  sljit/sljitNativeARM_64.c \
   sljit/sljitNativeARM_Thumb2.c \
   sljit/sljitNativeARM_v5.c \
   sljit/sljitNativeMIPS_32.c \


Modified: code/trunk/sljit/sljitConfig.h
===================================================================
--- code/trunk/sljit/sljitConfig.h    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitConfig.h    2014-01-30 06:10:21 UTC (rev 1453)
@@ -44,6 +44,7 @@
 /* #define SLJIT_CONFIG_ARM_V5 1 */
 /* #define SLJIT_CONFIG_ARM_V7 1 */
 /* #define SLJIT_CONFIG_ARM_THUMB2 1 */
+/* #define SLJIT_CONFIG_ARM_64 1 */
 /* #define SLJIT_CONFIG_PPC_32 1 */
 /* #define SLJIT_CONFIG_PPC_64 1 */
 /* #define SLJIT_CONFIG_MIPS_32 1 */


Modified: code/trunk/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/sljit/sljitConfigInternal.h    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitConfigInternal.h    2014-01-30 06:10:21 UTC (rev 1453)
@@ -59,6 +59,7 @@
     || (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5) \
     || (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) \
     || (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2) \
+    || (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
     || (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) \
     || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
     || (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
@@ -75,6 +76,7 @@
     + (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5) \
     + (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) \
     + (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2) \
+    + (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
     + (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) \
     + (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
     + (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX) \
@@ -102,6 +104,8 @@
 #else
 #define SLJIT_CONFIG_ARM_V5 1
 #endif
+#elif defined (__aarch64__)
+#define SLJIT_CONFIG_ARM_64 1
 #elif defined(__ppc64__) || defined(__powerpc64__) || defined(_ARCH_PPC64) || (defined(_POWER) && defined(__64BIT__))
 #define SLJIT_CONFIG_PPC_64 1
 #elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(_ARCH_PWR) || defined(_ARCH_PWR2) || defined(_POWER)
@@ -275,6 +279,7 @@
 typedef unsigned long int sljit_uw;
 typedef long int sljit_sw;
 #elif !(defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) \
+    && !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
     && !(defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
     && !(defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX)
 #define SLJIT_32BIT_ARCHITECTURE 1
@@ -391,7 +396,7 @@
 #endif


 #ifndef SLJIT_INDIRECT_CALL
-#if ((defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) && (defined SLJIT_BIG_ENDIAN && SLJIT_BIG_ENDIAN))  \
+#if ((defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) && (defined SLJIT_BIG_ENDIAN && SLJIT_BIG_ENDIAN)) \
     || ((defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) && defined _AIX)
 /* It seems certain ppc compilers use an indirect addressing for functions
    which makes things complicated. */
@@ -429,6 +434,7 @@
     || (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) \
     || (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) \
     || (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2) \
+    || (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
     || (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) \
     || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #define SLJIT_UNALIGNED 1


Modified: code/trunk/sljit/sljitLir.c
===================================================================
--- code/trunk/sljit/sljitLir.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitLir.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -102,6 +102,16 @@
 #define ABUF_SIZE    4096
 #endif


+/* Parameter parsing. */
+#define REG_MASK        0x3f
+#define OFFS_REG(reg)        (((reg) >> 8) & REG_MASK)
+#define OFFS_REG_MASK        (REG_MASK << 8)
+#define TO_OFFS_REG(reg)    ((reg) << 8)
+/* When reg cannot be unused. */
+#define FAST_IS_REG(reg)    ((reg) <= REG_MASK)
+/* When reg can be unused. */
+#define SLOW_IS_REG(reg)    ((reg) > 0 && (reg) <= REG_MASK)
+
 /* Jump flags. */
 #define JUMP_LABEL    0x1
 #define JUMP_ADDR    0x2
@@ -127,27 +137,34 @@
 #if (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2)
 #    define IS_COND        0x04
 #    define IS_BL        0x08
-    /* cannot be encoded as branch */
-#    define B_TYPE0        0x00
     /* conditional + imm8 */
-#    define B_TYPE1        0x10
+#    define PATCH_TYPE1    0x10
     /* conditional + imm20 */
-#    define B_TYPE2        0x20
+#    define PATCH_TYPE2    0x20
     /* IT + imm24 */
-#    define B_TYPE3        0x30
+#    define PATCH_TYPE3    0x30
     /* imm11 */
-#    define B_TYPE4        0x40
+#    define PATCH_TYPE4    0x40
     /* imm24 */
-#    define B_TYPE5        0x50
+#    define PATCH_TYPE5    0x50
     /* BL + imm24 */
-#    define BL_TYPE6    0x60
+#    define PATCH_BL        0x60
     /* 0xf00 cc code for branches */
 #endif


+#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
+#    define IS_COND        0x04
+#    define IS_CBZ        0x08
+#    define IS_BL        0x10
+#    define PATCH_B        0x20
+#    define PATCH_COND    0x40
+#endif
+
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-#    define UNCOND_B    0x04
+#    define COND_B        0x04
 #    define PATCH_B        0x08
 #    define ABSOLUTE_B    0x10
+#    define REMOVE_COND    0x20
 #endif


 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -167,15 +184,15 @@
     /* no destination (i.e: store) */
 #    define UNMOVABLE_INS    32
     /* FPU status register */
-#    define FCSR_FCC    33
+#    define FCSR_FCC        33
 #endif


 #if (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX)
-#    define IS_JAL           0x04
-#    define IS_COND          0x08
+#    define IS_JAL        0x04
+#    define IS_COND        0x08


-#    define PATCH_B          0x10
-#    define PATCH_J          0x20
+#    define PATCH_B        0x10
+#    define PATCH_J        0x20
 #endif


#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
@@ -215,6 +232,10 @@
#endif
#endif

+#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
+#define SLJIT_HAS_VARIABLE_LOCALS_OFFSET 1
+#endif
+
#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
#define SLJIT_HAS_FIXED_LOCALS_OFFSET 1
#ifdef _AIX
@@ -534,7 +555,7 @@
}

 #define ADDRESSING_DEPENDS_ON(exp, reg) \
-    (((exp) & SLJIT_MEM) && (((exp) & 0xf) == reg || (((exp) >> 4) & 0xf) == reg))
+    (((exp) & SLJIT_MEM) && (((exp) & REG_MASK) == reg || OFFS_REG(exp) == reg))


 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
 #define FUNCTION_CHECK_OP() \
@@ -600,12 +621,12 @@
     else if ((p) == (SLJIT_MEM1(SLJIT_LOCALS_REG))) \
         SLJIT_ASSERT((i) >= 0 && (i) < compiler->logical_local_size); \
     else if ((p) & SLJIT_MEM) { \
-        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & 0xf)); \
-        if ((p) & 0xf0) { \
-            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(((p) >> 4) & 0xf)); \
+        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & REG_MASK)); \
+        if ((p) & OFFS_REG_MASK) { \
+            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
             SLJIT_ASSERT(!((i) & ~0x3)); \
         } \
-        SLJIT_ASSERT(((p) >> 9) == 0); \
+        SLJIT_ASSERT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
     } \
     else \
         SLJIT_ASSERT_STOP();
@@ -617,12 +638,12 @@
     else if ((p) == (SLJIT_MEM1(SLJIT_LOCALS_REG))) \
         SLJIT_ASSERT((i) >= 0 && (i) < compiler->logical_local_size); \
     else if ((p) & SLJIT_MEM) { \
-        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & 0xf)); \
-        if ((p) & 0xf0) { \
-            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(((p) >> 4) & 0xf)); \
+        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & REG_MASK)); \
+        if ((p) & OFFS_REG_MASK) { \
+            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
             SLJIT_ASSERT(!((i) & ~0x3)); \
         } \
-        SLJIT_ASSERT(((p) >> 9) == 0); \
+        SLJIT_ASSERT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
     } \
     else \
         SLJIT_ASSERT_STOP();
@@ -631,23 +652,23 @@
     if ((p) >= SLJIT_FLOAT_REG1 && (p) <= SLJIT_FLOAT_REG6) \
         SLJIT_ASSERT(i == 0); \
     else if ((p) & SLJIT_MEM) { \
-        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & 0xf)); \
-        if ((p) & 0xf0) { \
-            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(((p) >> 4) & 0xf)); \
-            SLJIT_ASSERT(((p) & 0xf0) != (SLJIT_LOCALS_REG << 4) && !(i & ~0x3)); \
+        SLJIT_ASSERT(FUNCTION_CHECK_IS_REG((p) & REG_MASK)); \
+        if ((p) & OFFS_REG_MASK) { \
+            SLJIT_ASSERT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
+            SLJIT_ASSERT(((p) & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_LOCALS_REG) && !(i & ~0x3)); \
         } else \
-            SLJIT_ASSERT((((p) >> 4) & 0xf) == 0); \
-        SLJIT_ASSERT(((p) >> 9) == 0); \
+            SLJIT_ASSERT(OFFS_REG(p) == 0); \
+        SLJIT_ASSERT(!((p) & ~(SLJIT_MEM | SLJIT_IMM | REG_MASK | OFFS_REG_MASK))); \
     } \
     else \
         SLJIT_ASSERT_STOP();


 #define FUNCTION_CHECK_OP1() \
     if (GET_OPCODE(op) >= SLJIT_MOVU && GET_OPCODE(op) <= SLJIT_MOVU_P) { \
-        SLJIT_ASSERT(!(src & SLJIT_MEM) || (src & 0xf) != SLJIT_LOCALS_REG); \
-        SLJIT_ASSERT(!(dst & SLJIT_MEM) || (dst & 0xf) != SLJIT_LOCALS_REG); \
-        if ((src & SLJIT_MEM) && (src & 0xf)) \
-            SLJIT_ASSERT((dst & 0xf) != (src & 0xf) && ((dst >> 4) & 0xf) != (src & 0xf)); \
+        SLJIT_ASSERT(!(src & SLJIT_MEM) || (src & REG_MASK) != SLJIT_LOCALS_REG); \
+        SLJIT_ASSERT(!(dst & SLJIT_MEM) || (dst & REG_MASK) != SLJIT_LOCALS_REG); \
+        if ((src & SLJIT_MEM) && (src & REG_MASK)) \
+            SLJIT_ASSERT((dst & REG_MASK) != (src & REG_MASK) && OFFS_REG(dst) != (src & REG_MASK)); \
     }


 #endif
@@ -670,7 +691,9 @@
     (char*)"f4", (char*)"f5", (char*)"f6"
 };


-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) \
+    || (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+    || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #ifdef _WIN64
 #    define SLJIT_PRINT_D    "I64"
 #else
@@ -684,18 +707,18 @@
     if ((p) & SLJIT_IMM) \
         fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", (i)); \
     else if ((p) & SLJIT_MEM) { \
-        if ((p) & 0xf) { \
+        if ((p) & REG_MASK) { \
             if (i) { \
-                if (((p) >> 4) & 0xf) \
-                    fprintf(compiler->verbose, "[%s + %s * %d]", reg_names[(p) & 0xF], reg_names[((p) >> 4)& 0xF], 1 << (i)); \
+                if ((p) & OFFS_REG_MASK) \
+                    fprintf(compiler->verbose, "[%s + %s * %d]", reg_names[(p) & REG_MASK], reg_names[OFFS_REG(p)], 1 << (i)); \
                 else \
-                    fprintf(compiler->verbose, "[%s + #%" SLJIT_PRINT_D "d]", reg_names[(p) & 0xF], (i)); \
+                    fprintf(compiler->verbose, "[%s + #%" SLJIT_PRINT_D "d]", reg_names[(p) & REG_MASK], (i)); \
             } \
             else { \
-                if (((p) >> 4) & 0xf) \
-                    fprintf(compiler->verbose, "[%s + %s]", reg_names[(p) & 0xF], reg_names[((p) >> 4)& 0xF]); \
+                if ((p) & OFFS_REG_MASK) \
+                    fprintf(compiler->verbose, "[%s + %s]", reg_names[(p) & REG_MASK], reg_names[OFFS_REG(p)]); \
                 else \
-                    fprintf(compiler->verbose, "[%s]", reg_names[(p) & 0xF]); \
+                    fprintf(compiler->verbose, "[%s]", reg_names[(p) & REG_MASK]); \
             } \
         } \
         else \
@@ -704,18 +727,18 @@
         fprintf(compiler->verbose, "%s", reg_names[p]);
 #define sljit_verbose_fparam(p, i) \
     if ((p) & SLJIT_MEM) { \
-        if ((p) & 0xf) { \
+        if ((p) & REG_MASK) { \
             if (i) { \
-                if (((p) >> 4) & 0xf) \
-                    fprintf(compiler->verbose, "[%s + %s * %d]", reg_names[(p) & 0xF], reg_names[((p) >> 4)& 0xF], 1 << (i)); \
+                if ((p) & OFFS_REG_MASK) \
+                    fprintf(compiler->verbose, "[%s + %s * %d]", reg_names[(p) & REG_MASK], reg_names[OFFS_REG(p)], 1 << (i)); \
                 else \
-                    fprintf(compiler->verbose, "[%s + #%" SLJIT_PRINT_D "d]", reg_names[(p) & 0xF], (i)); \
+                    fprintf(compiler->verbose, "[%s + #%" SLJIT_PRINT_D "d]", reg_names[(p) & REG_MASK], (i)); \
             } \
             else { \
-                if (((p) >> 4) & 0xF) \
-                    fprintf(compiler->verbose, "[%s + %s]", reg_names[(p) & 0xF], reg_names[((p) >> 4)& 0xF]); \
+                if ((p) & OFFS_REG_MASK) \
+                    fprintf(compiler->verbose, "[%s + %s]", reg_names[(p) & REG_MASK], reg_names[OFFS_REG(p)]); \
                 else \
-                    fprintf(compiler->verbose, "[%s]", reg_names[(p) & 0xF]); \
+                    fprintf(compiler->verbose, "[%s]", reg_names[(p) & REG_MASK]); \
             } \
         } \
         else \
@@ -1345,6 +1368,8 @@
 #    include "sljitNativeARM_v5.c"
 #elif (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2)
 #    include "sljitNativeARM_Thumb2.c"
+#elif (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
+#    include "sljitNativeARM_64.c"
 #elif (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
 #    include "sljitNativePPC_common.c"
 #elif (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
@@ -1371,6 +1396,19 @@
     check_sljit_emit_cmp(compiler, type, src1, src1w, src2, src2w);


     condition = type & 0xff;
+#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
+    if ((condition == SLJIT_C_EQUAL || condition == SLJIT_C_NOT_EQUAL)) {
+        if ((src1 & SLJIT_IMM) && !src1w) {
+            src1 = src2;
+            src1w = src2w;
+            src2 = SLJIT_IMM;
+            src2w = 0;
+        }
+        if ((src2 & SLJIT_IMM) && !src2w)
+            return emit_cmp_to0(compiler, type, src1, src1w);
+    }
+#endif
+
     if (SLJIT_UNLIKELY((src1 & SLJIT_IMM) && !(src2 & SLJIT_IMM))) {
         /* Immediate is prefered as second argument by most architectures. */
         switch (condition) {


Modified: code/trunk/sljit/sljitLir.h
===================================================================
--- code/trunk/sljit/sljitLir.h    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitLir.h    2014-01-30 06:10:21 UTC (rev 1453)
@@ -265,6 +265,12 @@
     sljit_sw cache_argw;
 #endif


+#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
+    sljit_si locals_offset;
+    sljit_si cache_arg;
+    sljit_sw cache_argw;
+#endif
+
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) || (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     sljit_sw imm;
     sljit_si cache_arg;
@@ -478,11 +484,11 @@


 /* Register output: simply the name of the register.
    For destination, you can use SLJIT_UNUSED as well. */
-#define SLJIT_MEM        0x100
+#define SLJIT_MEM        0x80
 #define SLJIT_MEM0()        (SLJIT_MEM)
 #define SLJIT_MEM1(r1)        (SLJIT_MEM | (r1))
-#define SLJIT_MEM2(r1, r2)    (SLJIT_MEM | (r1) | ((r2) << 4))
-#define SLJIT_IMM        0x200
+#define SLJIT_MEM2(r1, r2)    (SLJIT_MEM | (r1) | ((r2) << 8))
+#define SLJIT_IMM        0x40


 /* Set 32 bit operation mode (I) on 64 bit CPUs. The flag is totally ignored on
    32 bit CPUs. If this flag is set for an arithmetic operation, it uses only the
@@ -575,7 +581,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op);


 /* Notes for MOV instructions:
-   U = Mov with update (post form). If source or destination defined as SLJIT_MEM1(r1)
+   U = Mov with update (pre form). If source or destination defined as SLJIT_MEM1(r1)
        or SLJIT_MEM2(r1, r2), r1 is increased by the sum of r2 and the constant argument
    UB = unsigned byte (8 bit)
    SB = signed byte (8 bit)
@@ -602,7 +608,7 @@
 /* Flags: I - (never set any flags)
    Note: see SLJIT_INT_OP for further details. */
 #define SLJIT_MOV_UI            11
-/* No SLJIT_INT_OP form, since it the same as SLJIT_IMOVU. */
+/* No SLJIT_INT_OP form, since it is the same as SLJIT_IMOV. */
 /* Flags: I - (never set any flags)
    Note: see SLJIT_INT_OP for further details. */
 #define SLJIT_MOV_SI            12


Added: code/trunk/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_64.c                            (rev 0)
+++ code/trunk/sljit/sljitNativeARM_64.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -0,0 +1,1865 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright 2009-2012 Zoltan Herczeg (hzmester@???). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
+{
+    return "ARM-64" SLJIT_CPUINFO;
+}
+
+/* Length of an instruction word */
+typedef sljit_ui sljit_ins;
+
+#define TMP_ZERO    0
+
+#define TMP_REG1    (SLJIT_NO_REGISTERS + 1)
+#define TMP_REG2    (SLJIT_NO_REGISTERS + 2)
+#define TMP_REG3    (SLJIT_NO_REGISTERS + 3)
+#define TMP_REG4    (SLJIT_NO_REGISTERS + 4)
+#define TMP_LR        (SLJIT_NO_REGISTERS + 5)
+#define TMP_SP        (SLJIT_NO_REGISTERS + 6)
+
+#define TMP_FREG1    (0)
+#define TMP_FREG2    (SLJIT_FLOAT_REG6 + 1)
+
+static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 7] = {
+  31, 0, 1, 2, 3, 4, 19, 20, 21, 22, 23, 29, 9, 10, 11, 12, 30, 31
+};
+
+#define W_OP (1 << 31)
+#define RD(rd) (reg_map[rd])
+#define RT(rt) (reg_map[rt])
+#define RN(rn) (reg_map[rn] << 5)
+#define RT2(rt2) (reg_map[rt2] << 10)
+#define RM(rm) (reg_map[rm] << 16)
+#define VD(vd) (vd)
+#define VT(vt) (vt)
+#define VN(vn) ((vn) << 5)
+#define VM(vm) ((vm) << 16)
+
+/* --------------------------------------------------------------------- */
+/*  Instruction forms                                                    */
+/* --------------------------------------------------------------------- */
+
+#define ADC 0x9a000000
+#define ADD 0x8b000000
+#define ADDI 0x91000000
+#define AND 0x8a000000
+#define ANDI 0x92000000
+#define ASRV 0x9ac02800
+#define B 0x14000000
+#define B_CC 0x54000000
+#define BL 0x94000000
+#define BLR 0xd63f0000
+#define BR 0xd61f0000
+#define BRK 0xd4200000
+#define CBZ 0xb4000000
+#define CLZ 0xdac01000
+#define CSINC 0x9a800400
+#define EOR 0xca000000
+#define EORI 0xd2000000
+#define FABS 0x1e60c000
+#define FADD 0x1e602800
+#define FCMP 0x1e602000
+#define FDIV 0x1e601800
+#define FMOV 0x1e604000
+#define FMUL 0x1e600800
+#define FNEG 0x1e614000
+#define FSUB 0x1e603800
+#define LDRI 0xf9400000
+#define LDP 0xa9400000
+#define LDP_PST 0xa8c00000
+#define LSLV 0x9ac02000
+#define LSRV 0x9ac02400
+#define MADD 0x9b000000
+#define MOVK 0xf2800000
+#define MOVN 0x92800000
+#define MOVZ 0xd2800000
+#define NOP 0xd503201f
+#define ORN 0xaa200000
+#define ORR 0xaa000000
+#define ORRI 0xb2000000
+#define RET 0xd65f0000
+#define SBC 0xda000000
+#define SBFM 0x93000000
+#define SDIV 0x9ac00c00
+#define SMADDL 0x9b200000
+#define SMULH 0x9b403c00
+#define STP 0xa9000000
+#define STRI 0xf9000000
+#define STR_F 0x3d000000
+#define STR_FR 0x3c206800
+#define STP_PRE 0xa9800000
+#define STUR_F 0x3c000000
+#define SUB 0xcb000000
+#define SUBI 0xd1000000
+#define SUBS 0xeb000000
+#define UBFM 0xd3000000
+#define UDIV 0x9ac00800
+#define UMULH 0x9bc03c00
+
+/* Appends one instruction word to the compiler's instruction buffer and
+   increments compiler->size; fails if the buffer cannot be extended. */
+static sljit_si push_inst(struct sljit_compiler *compiler, sljit_ins ins)
+{
+    sljit_ins *ptr = (sljit_ins*)ensure_buf(compiler, sizeof(sljit_ins));
+    FAIL_IF(!ptr);
+    *ptr = ins;
+    compiler->size++;
+    return SLJIT_SUCCESS;
+}
+
+static SLJIT_INLINE sljit_si emit_imm64_const(struct sljit_compiler *compiler, sljit_si dst, sljit_uw imm)
+{
+    FAIL_IF(push_inst(compiler, MOVZ | RD(dst) | ((imm & 0xffff) << 5)));
+    FAIL_IF(push_inst(compiler, MOVK | RD(dst) | (((imm >> 16) & 0xffff) << 5) | (1 << 21)));
+    FAIL_IF(push_inst(compiler, MOVK | RD(dst) | (((imm >> 32) & 0xffff) << 5) | (2 << 21)));
+    return push_inst(compiler, MOVK | RD(dst) | ((imm >> 48) << 5) | (3 << 21));
+}
+
+static SLJIT_INLINE void modify_imm64_const(sljit_ins* inst, sljit_uw new_imm)
+{
+    sljit_si dst = inst[0] & 0x1f;
+    SLJIT_ASSERT((inst[0] & 0xffe00000) == MOVZ && (inst[1] & 0xffe00000) == (MOVK | (1 << 21)));
+    inst[0] = MOVZ | dst | ((new_imm & 0xffff) << 5);
+    inst[1] = MOVK | dst | (((new_imm >> 16) & 0xffff) << 5) | (1 << 21);
+    inst[2] = MOVK | dst | (((new_imm >> 32) & 0xffff) << 5) | (2 << 21);
+    inst[3] = MOVK | dst | ((new_imm >> 48) << 5) | (3 << 21);
+}
+
+static SLJIT_INLINE sljit_si detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
+{
+    sljit_sw diff;
+
+    if (jump->flags & SLJIT_REWRITABLE_JUMP)
+        return 0;
+
+    if (jump->flags & JUMP_ADDR)
+        diff = ((sljit_sw)jump->u.target - (sljit_sw)(code_ptr + 4));
+    else {
+        SLJIT_ASSERT(jump->flags & JUMP_LABEL);
+        diff = ((sljit_sw)(code + jump->u.label->size) - (sljit_sw)(code_ptr + 4));
+    }
+
+    if (jump->flags & IS_COND) {
+        diff += sizeof(sljit_ins);
+        if (diff <= 0xfffff && diff >= -0x100000) {
+            code_ptr[-5] ^= (jump->flags & IS_CBZ) ? (0x1 << 24) : 0x1;
+            jump->addr -= sizeof(sljit_ins);
+            jump->flags |= PATCH_COND;
+            return 5;
+        }
+        diff -= sizeof(sljit_ins);
+    }
+
+    if (diff > 0x7ffffff || diff < -0x8000000)
+        return 0;
+
+    jump->flags |= PATCH_B;
+    return 4;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
+{
+    struct sljit_memory_fragment *buf;
+    sljit_ins *code;
+    sljit_ins *code_ptr;
+    sljit_ins *buf_ptr;
+    sljit_ins *buf_end;
+    sljit_uw word_count;
+    sljit_uw addr;
+
+    struct sljit_label *label;
+    struct sljit_jump *jump;
+    struct sljit_const *const_;
+
+    CHECK_ERROR_PTR();
+    check_sljit_generate_code(compiler);
+    reverse_buf(compiler);
+
+    code = (sljit_ins*)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins));
+    PTR_FAIL_WITH_EXEC_IF(code);
+    buf = compiler->buf;
+
+    code_ptr = code;
+    word_count = 0;
+    label = compiler->labels;
+    jump = compiler->jumps;
+    const_ = compiler->consts;
+
+    do {
+        buf_ptr = (sljit_ins*)buf->memory;
+        buf_end = buf_ptr + (buf->used_size >> 2);
+        do {
+            *code_ptr = *buf_ptr++;
+            /* These structures are ordered by their address. */
+            SLJIT_ASSERT(!label || label->size >= word_count);
+            SLJIT_ASSERT(!jump || jump->addr >= word_count);
+            SLJIT_ASSERT(!const_ || const_->addr >= word_count);
+            if (label && label->size == word_count) {
+                label->addr = (sljit_uw)code_ptr;
+                label->size = code_ptr - code;
+                label = label->next;
+            }
+            if (jump && jump->addr == word_count) {
+                    jump->addr = (sljit_uw)(code_ptr - 4);
+                    code_ptr -= detect_jump_type(jump, code_ptr, code);
+                    jump = jump->next;
+            }
+            if (const_ && const_->addr == word_count) {
+                const_->addr = (sljit_uw)code_ptr;
+                const_ = const_->next;
+            }
+            code_ptr ++;
+            word_count ++;
+        } while (buf_ptr < buf_end);
+
+        buf = buf->next;
+    } while (buf);
+
+    if (label && label->size == word_count) {
+        label->addr = (sljit_uw)code_ptr;
+        label->size = code_ptr - code;
+        label = label->next;
+    }
+
+    SLJIT_ASSERT(!label);
+    SLJIT_ASSERT(!jump);
+    SLJIT_ASSERT(!const_);
+    SLJIT_ASSERT(code_ptr - code <= (sljit_sw)compiler->size);
+
+    jump = compiler->jumps;
+    while (jump) {
+        do {
+            addr = (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
+            buf_ptr = (sljit_ins*)jump->addr;
+            if (jump->flags & PATCH_B) {
+                addr = (sljit_sw)(addr - jump->addr) >> 2;
+                SLJIT_ASSERT((sljit_sw)addr <= 0x1ffffff && (sljit_sw)addr >= -0x2000000);
+                buf_ptr[0] = ((jump->flags & IS_BL) ? BL : B) | (addr & 0x3ffffff);
+                if (jump->flags & IS_COND)
+                    buf_ptr[-1] -= (4 << 5);
+                break;
+            }
+            if (jump->flags & PATCH_COND) {
+                addr = (sljit_sw)(addr - jump->addr) >> 2;
+                SLJIT_ASSERT((sljit_sw)addr <= 0x3ffff && (sljit_sw)addr >= -0x40000);
+                buf_ptr[0] = (buf_ptr[0] & ~0xffffe0) | ((addr & 0x7ffff) << 5);
+                break;
+            }
+            modify_imm64_const(buf_ptr, addr);
+        } while (0);
+        jump = jump->next;
+    }
+
+    compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = (code_ptr - code) * sizeof(sljit_ins);
+    /* SLJIT_CACHE_FLUSH(code, code_ptr); */
+    return code;
+}
+
+/* --------------------------------------------------------------------- */
+/*  Core code generator functions.                                       */
+/* --------------------------------------------------------------------- */
+
+#define COUNT_TRAILING_ZERO(value, result) \
+    result = 0; \
+    if (!(value & 0xffffffff)) { \
+        result += 32; \
+        value >>= 32; \
+    } \
+    if (!(value & 0xffff)) { \
+        result += 16; \
+        value >>= 16; \
+    } \
+    if (!(value & 0xff)) { \
+        result += 8; \
+        value >>= 8; \
+    } \
+    if (!(value & 0xf)) { \
+        result += 4; \
+        value >>= 4; \
+    } \
+    if (!(value & 0x3)) { \
+        result += 2; \
+        value >>= 2; \
+    } \
+    if (!(value & 0x1)) { \
+        result += 1; \
+        value >>= 1; \
+    }
+
+#define LOGICAL_IMM_CHECK 0x100
+
+static sljit_ins logical_imm(sljit_sw imm, sljit_si len)
+{
+    sljit_si negated, ones, right;
+    sljit_uw mask, uimm;
+    sljit_ins ins;
+
+    if (len & LOGICAL_IMM_CHECK) {
+        len &= ~LOGICAL_IMM_CHECK;
+        if (len == 32 && (imm == 0 || imm == -1))
+            return 0;
+        if (len == 16 && ((sljit_si)imm == 0 || (sljit_si)imm == -1))
+            return 0;
+    }
+
+    SLJIT_ASSERT((len == 32 && imm != 0 && imm != -1)
+        || (len == 16 && (sljit_si)imm != 0 && (sljit_si)imm != -1));
+    uimm = (sljit_uw)imm;
+    while (1) {
+        if (len <= 0) {
+            SLJIT_ASSERT_STOP();
+            return 0;
+        }
+        mask = ((sljit_uw)1 << len) - 1;
+        if ((uimm & mask) != ((uimm >> len) & mask))
+            break;
+        len >>= 1;
+    }
+
+    len <<= 1;
+
+    negated = 0;
+    if (uimm & 0x1) {
+        negated = 1;
+        uimm = ~uimm;
+    }
+
+    if (len < 64)
+        uimm &= ((sljit_uw)1 << len) - 1;
+
+    /* Unsigned right shift. */
+    COUNT_TRAILING_ZERO(uimm, right);
+
+    /* Signed shift. We also know that the highest bit is set. */
+    imm = (sljit_sw)~uimm;
+    SLJIT_ASSERT(imm < 0);
+
+    COUNT_TRAILING_ZERO(imm, ones);
+
+    if (~imm)
+        return 0;
+
+    if (len == 64)
+        ins = 1 << 22;
+    else
+        ins = (0x3f - ((len << 1) - 1)) << 10;
+
+    if (negated)
+        return ins | ((len - ones - 1) << 10) | ((len - ones - right) << 16);
+
+    return ins | ((ones - 1) << 10) | ((len - right) << 16);
+}
+
+#undef COUNT_TRAILING_ZERO
+
+/* Loads the constant simm into register dst. The cases are ordered so
+   that the shortest instruction sequence which can produce the value is
+   tried first. Returns the result of the last push_inst call, or
+   SLJIT_SUCCESS after the generic sequence. */
+static sljit_si load_immediate(struct sljit_compiler *compiler, sljit_si dst, sljit_sw simm)
+{
+    sljit_uw imm = (sljit_uw)simm;
+    sljit_si i, zeros, ones, first;
+    sljit_ins bitmask;
+
+    /* The value fits into 16 bits: a single MOVZ is enough. */
+    if (imm <= 0xffff)
+        return push_inst(compiler, MOVZ | RD(dst) | (imm << 5));
+
+    /* Small negative value: a single MOVN with the inverted low 16 bits. */
+    if (simm >= -0x10000 && simm < 0)
+        return push_inst(compiler, MOVN | RD(dst) | ((~imm & 0xffff) << 5));
+
+    if (imm <= 0xffffffffl) {
+        /* The value fits into 32 bits. MOVN and ORRI are emitted with W_OP
+           toggled (presumably the 32 bit instruction form - confirm
+           against the W_OP definition). */
+        if ((imm & 0xffff0000l) == 0xffff0000)
+            return push_inst(compiler, (MOVN ^ W_OP) | RD(dst) | ((~imm & 0xffff) << 5));
+        /* (1 << 21) selects the second 16 bit chunk, see (i << 21) below. */
+        if ((imm & 0xffff) == 0xffff)
+            return push_inst(compiler, (MOVN ^ W_OP) | RD(dst) | ((~imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
+        /* A non-zero result of logical_imm is or-ed into an ORRI whose
+           source operand is TMP_ZERO. */
+        bitmask = logical_imm(simm, 16);
+        if (bitmask != 0)
+            return push_inst(compiler, (ORRI ^ W_OP) | RD(dst) | RN(TMP_ZERO) | bitmask);
+    }
+    else {
+        bitmask = logical_imm(simm, 32);
+        if (bitmask != 0)
+            return push_inst(compiler, ORRI | RD(dst) | RN(TMP_ZERO) | bitmask);
+    }
+
+    /* Two instructions: MOVZ for the low chunk, MOVK for the second one. */
+    if (imm <= 0xffffffffl) {
+        FAIL_IF(push_inst(compiler, MOVZ | RD(dst) | ((imm & 0xffff) << 5)));
+        return push_inst(compiler, MOVK | RD(dst) | ((imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
+    }
+
+    /* Two instructions: MOVN for the low chunk, MOVK for the second one. */
+    if (simm >= -0x100000000l && simm < 0) {
+        FAIL_IF(push_inst(compiler, MOVN | RD(dst) | ((~imm & 0xffff) << 5)));
+        return push_inst(compiler, MOVK | RD(dst) | ((imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
+    }
+
+    /* Many more constants could be constructed from an ORR and MOVx pair,
+    but finding such a pair is costly, so it is not attempted here. */
+
+    /* Count the 16 bit chunks which are all zeros or all ones. */
+    zeros = 0;
+    ones = 0;
+    for (i = 4; i > 0; i--) {
+        if ((simm & 0xffff) == 0)
+            zeros++;
+        if ((simm & 0xffff) == 0xffff)
+            ones++;
+        simm >>= 16;
+    }
+
+    /* The loop above consumed simm, restore it. */
+    simm = (sljit_sw)imm;
+    first = 1;
+    if (ones > zeros) {
+        /* More all-one chunks: work on the inverted value. Chunks which are
+           zero after the inversion are skipped, the first remaining chunk
+           is emitted with MOVN, the others with MOVK. */
+        simm = ~simm;
+        for (i = 0; i < 4; i++) {
+            if (!(simm & 0xffff)) {
+                simm >>= 16;
+                continue;
+            }
+            if (first) {
+                first = 0;
+                FAIL_IF(push_inst(compiler, MOVN | RD(dst) | ((simm & 0xffff) << 5) | (i << 21)));
+            }
+            else
+                /* MOVK takes the original chunk, hence the second inversion. */
+                FAIL_IF(push_inst(compiler, MOVK | RD(dst) | ((~simm & 0xffff) << 5) | (i << 21)));
+            simm >>= 16;
+        }
+        return SLJIT_SUCCESS;
+    }
+
+    /* Otherwise skip the zero chunks: MOVZ for the first non-zero chunk,
+       MOVK for the rest. */
+    for (i = 0; i < 4; i++) {
+        if (!(simm & 0xffff)) {
+            simm >>= 16;
+            continue;
+        }
+        if (first) {
+            first = 0;
+            FAIL_IF(push_inst(compiler, MOVZ | RD(dst) | ((simm & 0xffff) << 5) | (i << 21)));
+        }
+        else
+            FAIL_IF(push_inst(compiler, MOVK | RD(dst) | ((simm & 0xffff) << 5) | (i << 21)));
+        simm >>= 16;
+    }
+    return SLJIT_SUCCESS;
+}
+
+/* Modifier bits of the flags argument of emit_op_imm. The low 16 bits
+   of flags contain the opcode (emit_op_imm extracts it by flags & 0xffff). */
+/* arg1 / arg2 is an immediate value, not a register. */
+#define ARG1_IMM    0x0010000
+#define ARG2_IMM    0x0020000
+/* emit_op_imm toggles bit 31 of the emitted instructions when this is set. */
+#define INT_OP        0x0040000
+/* The status flags must be set by the operation. */
+#define SET_FLAGS    0x0080000
+/* Together with SET_FLAGS: CHECK_FLAGS redirects the result to TMP_ZERO. */
+#define UNUSED_RETURN    0x0100000
+/* NOTE(review): the SLOW_* bits are not referenced by emit_op_imm, their
+   users are outside of this section. */
+#define SLOW_DEST    0x0200000
+#define SLOW_SRC1    0x0400000
+#define SLOW_SRC2    0x0800000
+
+/* If SET_FLAGS is requested, or flag_bits into inv_bits, and replace dst
+   by TMP_ZERO when UNUSED_RETURN is set as well. Expands to a bare if
+   statement and expects the locals flags, inv_bits and dst to be in scope. */
+#define CHECK_FLAGS(flag_bits) \
+    if (flags & SET_FLAGS) { \
+        inv_bits |= flag_bits; \
+        if (flags & UNUSED_RETURN) \
+            dst = TMP_ZERO; \
+    }
+
+/* Emits the instruction(s) of a single operation.
+
+   flags: the opcode in the low 16 bits, combined with the ARG1_IMM,
+          ARG2_IMM, INT_OP, SET_FLAGS and UNUSED_RETURN modifiers.
+   dst:   destination register.
+   arg1, arg2: register indexes, or immediate values when the matching
+          ARGx_IMM bit is set.
+
+   An immediate form of the instruction is tried first. When none is
+   suitable, the immediate is loaded into TMP_REG1 / TMP_REG2 (or is
+   replaced by TMP_ZERO when it is 0) and the register form is used.
+   Returns the result of the last push_inst call or SLJIT_SUCCESS. */
+static sljit_si emit_op_imm(struct sljit_compiler *compiler, sljit_si flags, sljit_si dst, sljit_sw arg1, sljit_sw arg2)
+{
+    /* dst must be register, TMP_REG1
+       arg1 must be register, TMP_REG1, imm
+       arg2 must be register, TMP_REG2, imm */
+    /* inv_bits is xor-ed into the opcodes: bit 31 is toggled for INT_OP,
+       and CHECK_FLAGS adds the flag setting bits to it. */
+    sljit_ins inv_bits = (flags & INT_OP) ? (1 << 31) : 0;
+    sljit_ins inst_bits;
+    sljit_si op = (flags & 0xffff);
+    sljit_si reg;
+    sljit_sw imm, nimm;
+
+    if (SLJIT_UNLIKELY((flags & (ARG1_IMM | ARG2_IMM)) == (ARG1_IMM | ARG2_IMM))) {
+        /* Both are immediates. */
+        flags &= ~ARG1_IMM;
+        /* TMP_ZERO is not used as the first operand of ADD and SUB. */
+        if (arg1 == 0 && op != SLJIT_ADD && op != SLJIT_SUB)
+            arg1 = TMP_ZERO;
+        else {
+            FAIL_IF(load_immediate(compiler, TMP_REG1, arg1));
+            arg1 = TMP_REG1;
+        }
+    }
+
+    if (flags & (ARG1_IMM | ARG2_IMM)) {
+        /* Exactly one immediate operand remains: reg is the register
+           operand, imm is the immediate. */
+        reg = (flags & ARG2_IMM) ? arg1 : arg2;
+        imm = (flags & ARG2_IMM) ? arg2 : arg1;
+
+        switch (op) {
+        case SLJIT_MUL:
+        case SLJIT_NEG:
+        case SLJIT_CLZ:
+        case SLJIT_ADDC:
+        case SLJIT_SUBC:
+            /* No form with immediate operand (except imm 0, which
+            is represented by a ZERO register). */
+            break;
+        case SLJIT_MOV:
+            SLJIT_ASSERT(!(flags & SET_FLAGS) && (flags & ARG2_IMM) && arg1 == TMP_REG1);
+            return load_immediate(compiler, dst, imm);
+        case SLJIT_NOT:
+            /* The inverted constant is computed here and loaded directly. */
+            SLJIT_ASSERT(flags & ARG2_IMM);
+            FAIL_IF(load_immediate(compiler, dst, (flags & INT_OP) ? (~imm & 0xffffffff) : ~imm));
+            goto set_flags;
+        case SLJIT_SUB:
+            if (flags & ARG1_IMM)
+                break;
+            /* reg - imm is handled as reg + (-imm). */
+            imm = -imm;
+            /* Fall through. */
+        case SLJIT_ADD:
+            if (imm == 0) {
+                CHECK_FLAGS(1 << 29);
+                return push_inst(compiler, ((op == SLJIT_ADD ? ADDI : SUBI) ^ inv_bits) | RD(dst) | RN(reg));
+            }
+            /* A 12 bit immediate, added or subtracted. */
+            if (imm > 0 && imm <= 0xfff) {
+                CHECK_FLAGS(1 << 29);
+                return push_inst(compiler, (ADDI ^ inv_bits) | RD(dst) | RN(reg) | (imm << 10));
+            }
+            nimm = -imm;
+            if (nimm > 0 && nimm <= 0xfff) {
+                CHECK_FLAGS(1 << 29);
+                return push_inst(compiler, (SUBI ^ inv_bits) | RD(dst) | RN(reg) | (nimm << 10));
+            }
+            /* A 12 bit immediate whose low 12 bits are zero: (1 << 22) is
+               set and imm >> 12 is encoded. */
+            if (imm > 0 && imm <= 0xffffff && !(imm & 0xfff)) {
+                CHECK_FLAGS(1 << 29);
+                return push_inst(compiler, (ADDI ^ inv_bits) | RD(dst) | RN(reg) | ((imm >> 12) << 10) | (1 << 22));
+            }
+            if (nimm > 0 && nimm <= 0xffffff && !(nimm & 0xfff)) {
+                CHECK_FLAGS(1 << 29);
+                return push_inst(compiler, (SUBI ^ inv_bits) | RD(dst) | RN(reg) | ((nimm >> 12) << 10) | (1 << 22));
+            }
+            /* A 24 bit immediate is split into two instructions. This is
+               only done when the flags are not needed. */
+            if (imm > 0 && imm <= 0xffffff && !(flags & SET_FLAGS)) {
+                FAIL_IF(push_inst(compiler, (ADDI ^ inv_bits) | RD(dst) | RN(reg) | ((imm >> 12) << 10) | (1 << 22)));
+                return push_inst(compiler, (ADDI ^ inv_bits) | RD(dst) | RN(dst) | ((imm & 0xfff) << 10));
+            }
+            if (nimm > 0 && nimm <= 0xffffff && !(flags & SET_FLAGS)) {
+                FAIL_IF(push_inst(compiler, (SUBI ^ inv_bits) | RD(dst) | RN(reg) | ((nimm >> 12) << 10) | (1 << 22)));
+                return push_inst(compiler, (SUBI ^ inv_bits) | RD(dst) | RN(dst) | ((nimm & 0xfff) << 10));
+            }
+            break;
+        case SLJIT_AND:
+            /* A zero result of logical_imm means: use the register form. */
+            inst_bits = logical_imm(imm, LOGICAL_IMM_CHECK | ((flags & INT_OP) ? 16 : 32));
+            if (!inst_bits)
+                break;
+            CHECK_FLAGS(3 << 29);
+            return push_inst(compiler, (ANDI ^ inv_bits) | RD(dst) | RN(reg) | inst_bits);
+        case SLJIT_OR:
+        case SLJIT_XOR:
+            inst_bits = logical_imm(imm, LOGICAL_IMM_CHECK | ((flags & INT_OP) ? 16 : 32));
+            if (!inst_bits)
+                break;
+            if (op == SLJIT_OR)
+                inst_bits |= ORRI;
+            else
+                inst_bits |= EORI;
+            FAIL_IF(push_inst(compiler, (inst_bits ^ inv_bits) | RD(dst) | RN(reg)));
+            goto set_flags;
+        case SLJIT_SHL:
+            if (flags & ARG1_IMM)
+                break;
+            /* Shift left by a constant, emitted as UBFM. */
+            if (flags & INT_OP) {
+                imm &= 0x1f;
+                FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | ((-imm & 0x1f) << 16) | ((31 - imm) << 10)));
+            }
+            else {
+                imm &= 0x3f;
+                FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | (1 << 22) | ((-imm & 0x3f) << 16) | ((63 - imm) << 10)));
+            }
+            goto set_flags;
+        case SLJIT_LSHR:
+        case SLJIT_ASHR:
+            if (flags & ARG1_IMM)
+                break;
+            /* The arithmetic shift toggles bit 30 of UBFM. The toggled
+               opcode is kept in inst_bits instead of changing inv_bits,
+               because inv_bits is applied to the flag setting instruction
+               at set_flags as well, and bit 30 must not be toggled there
+               (the register form of the shifts does not toggle it either). */
+            inst_bits = (op == SLJIT_ASHR) ? (UBFM ^ (1 << 30)) : UBFM;
+            if (flags & INT_OP) {
+                imm &= 0x1f;
+                FAIL_IF(push_inst(compiler, (inst_bits ^ inv_bits) | RD(dst) | RN(arg1) | (imm << 16) | (31 << 10)));
+            }
+            else {
+                imm &= 0x3f;
+                FAIL_IF(push_inst(compiler, (inst_bits ^ inv_bits) | RD(dst) | RN(arg1) | (1 << 22) | (imm << 16) | (63 << 10)));
+            }
+            goto set_flags;
+        default:
+            SLJIT_ASSERT_STOP();
+            break;
+        }
+
+        /* No immediate form was suitable: move the immediate into a
+           register (zero is represented by TMP_ZERO). */
+        if (flags & ARG2_IMM) {
+            if (arg2 == 0)
+                arg2 = TMP_ZERO;
+            else {
+                FAIL_IF(load_immediate(compiler, TMP_REG2, arg2));
+                arg2 = TMP_REG2;
+            }
+        }
+        else {
+            if (arg1 == 0)
+                arg1 = TMP_ZERO;
+            else {
+                FAIL_IF(load_immediate(compiler, TMP_REG1, arg1));
+                arg1 = TMP_REG1;
+            }
+        }
+    }
+
+    /* Both arguments are registers. */
+    switch (op) {
+    case SLJIT_MOV:
+    case SLJIT_MOV_P:
+    case SLJIT_MOVU:
+    case SLJIT_MOVU_P:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        /* Moving a register onto itself needs no instruction. */
+        if (dst == arg2)
+            return SLJIT_SUCCESS;
+        return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(arg2));
+    case SLJIT_MOV_UB:
+    case SLJIT_MOVU_UB:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        return push_inst(compiler, (UBFM ^ (1 << 31)) | RD(dst) | RN(arg2) | (7 << 10));
+    case SLJIT_MOV_SB:
+    case SLJIT_MOVU_SB:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        if (!(flags & INT_OP))
+            inv_bits |= 1 << 22;
+        return push_inst(compiler, (SBFM ^ inv_bits) | RD(dst) | RN(arg2) | (7 << 10));
+    case SLJIT_MOV_UH:
+    case SLJIT_MOVU_UH:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        return push_inst(compiler, (UBFM ^ (1 << 31)) | RD(dst) | RN(arg2) | (15 << 10));
+    case SLJIT_MOV_SH:
+    case SLJIT_MOVU_SH:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        if (!(flags & INT_OP))
+            inv_bits |= 1 << 22;
+        return push_inst(compiler, (SBFM ^ inv_bits) | RD(dst) | RN(arg2) | (15 << 10));
+    case SLJIT_MOV_UI:
+    case SLJIT_MOVU_UI:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        if ((flags & INT_OP) && dst == arg2)
+            return SLJIT_SUCCESS;
+        return push_inst(compiler, (ORR ^ (1 << 31)) | RD(dst) | RN(TMP_ZERO) | RM(arg2));
+    case SLJIT_MOV_SI:
+    case SLJIT_MOVU_SI:
+        SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        if ((flags & INT_OP) && dst == arg2)
+            return SLJIT_SUCCESS;
+        return push_inst(compiler, SBFM | (1 << 22) | RD(dst) | RN(arg2) | (31 << 10));
+    case SLJIT_NOT:
+        SLJIT_ASSERT(arg1 == TMP_REG1);
+        FAIL_IF(push_inst(compiler, (ORN ^ inv_bits) | RD(dst) | RN(TMP_ZERO) | RM(arg2)));
+        goto set_flags;
+    case SLJIT_NEG:
+        /* Emitted as a subtraction from TMP_ZERO. */
+        SLJIT_ASSERT(arg1 == TMP_REG1);
+        if (flags & SET_FLAGS)
+            inv_bits |= 1 << 29;
+        return push_inst(compiler, (SUB ^ inv_bits) | RD(dst) | RN(TMP_ZERO) | RM(arg2));
+    case SLJIT_CLZ:
+        SLJIT_ASSERT(arg1 == TMP_REG1);
+        FAIL_IF(push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(arg2)));
+        goto set_flags;
+    case SLJIT_ADD:
+        CHECK_FLAGS(1 << 29);
+        return push_inst(compiler, (ADD ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
+    case SLJIT_ADDC:
+        CHECK_FLAGS(1 << 29);
+        return push_inst(compiler, (ADC ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
+    case SLJIT_SUB:
+        CHECK_FLAGS(1 << 29);
+        return push_inst(compiler, (SUB ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
+    case SLJIT_SUBC:
+        CHECK_FLAGS(1 << 29);
+        return push_inst(compiler, (SBC ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
+    case SLJIT_MUL:
+        if (!(flags & SET_FLAGS))
+            return push_inst(compiler, (MADD ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2) | RT2(TMP_ZERO));
+        /* With flags, three instructions are emitted, ending in a SUBS
+           which compares TMP_REG4 against dst shifted by 63 (presumably
+           the overflow check of the signed multiplication - confirm). */
+        if (flags & INT_OP) {
+            FAIL_IF(push_inst(compiler, SMADDL | RD(dst) | RN(arg1) | RM(arg2) | (31 << 10)));
+            FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG4) | RN(TMP_ZERO) | RM(dst) | (2 << 22) | (31 << 10)));
+            return push_inst(compiler, SUBS | RD(TMP_ZERO) | RN(TMP_REG4) | RM(dst) | (2 << 22) | (63 << 10));
+        }
+        FAIL_IF(push_inst(compiler, SMULH | RD(TMP_REG4) | RN(arg1) | RM(arg2)));
+        FAIL_IF(push_inst(compiler, MADD | RD(dst) | RN(arg1) | RM(arg2) | RT2(TMP_ZERO)));
+        return push_inst(compiler, SUBS | RD(TMP_ZERO) | RN(TMP_REG4) | RM(dst) | (2 << 22) | (63 << 10));
+    case SLJIT_AND:
+        CHECK_FLAGS(3 << 29);
+        return push_inst(compiler, (AND ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
+    case SLJIT_OR:
+        FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
+        goto set_flags;
+    case SLJIT_XOR:
+        FAIL_IF(push_inst(compiler, (EOR ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
+        goto set_flags;
+    case SLJIT_SHL:
+        FAIL_IF(push_inst(compiler, (LSLV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
+        goto set_flags;
+    case SLJIT_LSHR:
+        FAIL_IF(push_inst(compiler, (LSRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
+        goto set_flags;
+    case SLJIT_ASHR:
+        FAIL_IF(push_inst(compiler, (ASRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
+        goto set_flags;
+    }
+
+    SLJIT_ASSERT_STOP();
+    return SLJIT_SUCCESS;
+
+set_flags:
+    /* The operation above does not set the flags itself: when they are
+       requested, the result is compared to TMP_ZERO by a SUBS. */
+    if (flags & SET_FLAGS)
+        return push_inst(compiler, (SUBS ^ inv_bits) | RD(TMP_ZERO) | RN(dst) | RM(TMP_ZERO));
+    return SLJIT_SUCCESS;
+}
+
+/* Flags of the memory access helpers. STORE and SIGNED together form
+   the index (flags & 0x3) of the opcode tables below. */
+#define STORE        0x01
+#define SIGNED        0x02
+
+/* UPDATE: the base register is updated by the access.
+   ARG_TEST: getput_arg_fast only reports whether a single instruction
+   is enough, nothing is emitted. */
+#define UPDATE        0x04
+#define ARG_TEST    0x08
+
+/* Size of the accessed data. */
+#define BYTE_SIZE    0x000
+#define HALF_SIZE    0x100
+#define INT_SIZE    0x200
+#define WORD_SIZE    0x300
+
+/* Extracts the size code (0 - 3) from the flags. The users below or it
+   into bits 30-31 of the instruction and scale offsets by it. */
+#define MEM_SIZE_SHIFT(flags) ((flags) >> 8)
+
+/* Opcode tables indexed by (flags & 0x3): unsigned load, unsigned store,
+   signed load, signed store. Only the byte sized opcodes are listed, and
+   the signed store entries are the same as the unsigned ones. */
+
+/* Base register + immediate offset. */
+static SLJIT_CONST sljit_ins sljit_mem_imm[4] = {
+/* u l */ 0x39400000 /* ldrb [reg,imm] */,
+/* u s */ 0x39000000 /* strb [reg,imm] */,
+/* s l */ 0x39800000 /* ldrsb [reg,imm] */,
+/* s s */ 0x39000000 /* strb [reg,imm] */,
+};
+
+/* Base register + signed immediate offset (the users accept -256 .. 255). */
+static SLJIT_CONST sljit_ins sljit_mem_simm[4] = {
+/* u l */ 0x38400000 /* ldurb [reg,imm] */,
+/* u s */ 0x38000000 /* sturb [reg,imm] */,
+/* s l */ 0x38800000 /* ldursb [reg,imm] */,
+/* s s */ 0x38000000 /* sturb [reg,imm] */,
+};
+
+/* Base register + signed immediate offset with base register update. */
+static SLJIT_CONST sljit_ins sljit_mem_pre_simm[4] = {
+/* u l */ 0x38400c00 /* ldrb [reg,imm]! */,
+/* u s */ 0x38000c00 /* strb [reg,imm]! */,
+/* s l */ 0x38800c00 /* ldrsb [reg,imm]! */,
+/* s s */ 0x38000c00 /* strb [reg,imm]! */,
+};
+
+/* Base register + offset register. */
+static SLJIT_CONST sljit_ins sljit_mem_reg[4] = {
+/* u l */ 0x38606800 /* ldrb [reg,reg] */,
+/* u s */ 0x38206800 /* strb [reg,reg] */,
+/* s l */ 0x38a06800 /* ldrsb [reg,reg] */,
+/* s s */ 0x38206800 /* strb [reg,reg] */,
+};
+
+/* Helper function: computes dst = reg + value with a single instruction
+   which does not set the flags. Returns SLJIT_ERR_UNSUPPORTED without
+   emitting anything when value cannot be encoded in one instruction. */
+static sljit_si emit_set_delta(struct sljit_compiler *compiler, sljit_si dst, sljit_si reg, sljit_sw value)
+{
+    sljit_ins opcode = ADDI;
+
+    /* A negative delta is subtracted as a positive amount. */
+    if (value < 0) {
+        opcode = SUBI;
+        value = -value;
+    }
+
+    /* Plain 12 bit immediate. */
+    if (value <= 0xfff)
+        return push_inst(compiler, opcode | RD(dst) | RN(reg) | (value << 10));
+
+    /* 12 bit immediate with (1 << 22) set. The low 12 bits of value are
+       zero here, so value >> 2 is the same as (value >> 12) << 10. */
+    if (value <= 0xffffff && (value & 0xfff) == 0)
+        return push_inst(compiler, opcode | (1 << 22) | RD(dst) | RN(reg) | (value >> 2));
+
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
+/* Can perform an operation using at most 1 instruction.
+
+   flags: STORE / SIGNED / UPDATE / ARG_TEST and the size of the data.
+   reg:   the register which is loaded or stored.
+   arg, argw: the memory operand.
+
+   Returns 0 when a single instruction is not enough (nothing is emitted).
+   Returns 1 when ARG_TEST is set and a single instruction is enough
+   (nothing is emitted either). Otherwise the instruction is emitted and
+   -1 is returned; a push_inst failure leaves through FAIL_IF.
+
+   NOTE(review): when arg has no base register, (arg & REG_MASK) is 0 and
+   RN(0) is encoded by the immediate forms below - confirm that this is
+   the intended base for such operands. */
+static sljit_si getput_arg_fast(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg, sljit_sw argw)
+{
+    sljit_ui shift = MEM_SIZE_SHIFT(flags);
+
+    SLJIT_ASSERT(arg & SLJIT_MEM);
+
+    if (SLJIT_UNLIKELY(flags & UPDATE)) {
+        /* Base register update: only base + offset in the -256 .. 255
+           range is handled here. */
+        if ((arg & REG_MASK) && !(arg & OFFS_REG_MASK) && argw <= 255 && argw >= -256) {
+            if (SLJIT_UNLIKELY(flags & ARG_TEST))
+                return 1;
+
+            arg &= REG_MASK;
+            /* Keep the low 9 bits of the signed offset. */
+            argw &= 0x1ff;
+            FAIL_IF(push_inst(compiler, sljit_mem_pre_simm[flags & 0x3]
+                | (shift << 30) | RT(reg) | RN(arg) | (argw << 12)));
+            return -1;
+        }
+        return 0;
+    }
+
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+        /* Base + offset register: the shift of the offset register must be
+           zero or equal to the size code of the data. */
+        argw &= 0x3;
+        if (argw && argw != shift)
+            return 0;
+
+        if (SLJIT_UNLIKELY(flags & ARG_TEST))
+            return 1;
+
+        FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg)
+            | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw ? (1 << 12) : 0)));
+        return -1;
+    }
+
+    arg &= REG_MASK;
+    /* Non-negative offset which is a multiple of the data size and is not
+       greater than 0xfff after it is divided by the data size. */
+    if (argw >= 0 && (argw >> shift) <= 0xfff && (argw & ((1 << shift) - 1)) == 0) {
+        if (SLJIT_UNLIKELY(flags & ARG_TEST))
+            return 1;
+
+        FAIL_IF(push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30)
+            | RT(reg) | RN(arg) | (argw << (10 - shift))));
+        return -1;
+    }
+
+    /* Otherwise only the -256 .. 255 range can be encoded. */
+    if (argw > 255 || argw < -256)
+        return 0;
+
+    if (SLJIT_UNLIKELY(flags & ARG_TEST))
+        return 1;
+
+    FAIL_IF(push_inst(compiler, sljit_mem_simm[flags & 0x3] | (shift << 30)
+        | RT(reg) | RN(arg) | ((argw & 0x1ff) << 12)));
+    return -1;
+}
+
+/* Tells whether the address computed for (arg, argw) is worth keeping
+   for the following (next_arg, next_argw) operand, see getput_arg below.
+   Returns 1 if so, 0 otherwise.
+   Note: can_cache is called only for binary operators. Those
+   operators always use word arguments without write back. */
+static sljit_si can_cache(sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
+{
+    sljit_sw delta;
+
+    /* Operands with an offset register are never cached. */
+    if (arg & OFFS_REG_MASK)
+        return 0;
+
+    /* The next operand must be a memory operand. */
+    if (!(next_arg & SLJIT_MEM))
+        return 0;
+
+    /* With a base register, equal offsets are always reusable. */
+    if ((arg & REG_MASK) && argw == next_argw)
+        return 1;
+
+    delta = argw - next_argw;
+    if (delta > 0xfff || delta < -0xfff)
+        return 0;
+
+    /* The offsets are close enough: without a base register that is
+       sufficient, otherwise the two operands must be the same. */
+    if (!(arg & REG_MASK))
+        return 1;
+    return (arg == next_arg) ? 1 : 0;
+}
+
+/* Emit the necessary instructions. See can_cache above.
+
+   Performs the memory access described by (arg, argw) on register reg
+   when getput_arg_fast could not do it with one instruction. TMP_REG3 is
+   used for the address computation, and compiler->cache_arg /
+   compiler->cache_argw describe the value which is kept in TMP_REG3.
+   (next_arg, next_argw) is the memory operand which follows, it is only
+   used to decide whether the computed address is worth caching.
+   Returns the result of the last push_inst call. */
+static sljit_si getput_arg(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg,
+    sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
+{
+    sljit_ui shift = MEM_SIZE_SHIFT(flags);
+    sljit_si tmp_r, other_r;
+    sljit_sw diff;
+
+    SLJIT_ASSERT(arg & SLJIT_MEM);
+    if (!(next_arg & SLJIT_MEM)) {
+        next_arg = 0;
+        next_argw = 0;
+    }
+
+    /* A load can compute the address in its own target register, a
+       store must use TMP_REG3. */
+    tmp_r = (flags & STORE) ? TMP_REG3 : reg;
+
+    if (SLJIT_UNLIKELY((flags & UPDATE) && (arg & REG_MASK))) {
+        /* Update only applies if a base register exists. */
+        other_r = OFFS_REG(arg);
+        if (!other_r) {
+            other_r = arg & REG_MASK;
+            /* Offsets up to 24 bits: adjust the base register first (by
+               at most two instructions), then access [base]. */
+            if (other_r != reg && argw >= 0 && argw <= 0xffffff) {
+                if ((argw & 0xfff) != 0)
+                    FAIL_IF(push_inst(compiler, ADDI | RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
+                if (argw >> 12)
+                    FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(other_r) | RN(other_r) | ((argw >> 12) << 10)));
+                return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(other_r));
+            }
+            else if (other_r != reg && argw < 0 && argw >= -0xffffff) {
+                argw = -argw;
+                if ((argw & 0xfff) != 0)
+                    FAIL_IF(push_inst(compiler, SUBI | RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
+                if (argw >> 12)
+                    FAIL_IF(push_inst(compiler, SUBI | (1 << 22) | RD(other_r) | RN(other_r) | ((argw >> 12) << 10)));
+                return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(other_r));
+            }
+
+            /* The offset is needed in a register: reuse or adjust the
+               constant cached in TMP_REG3 if possible. */
+            if (compiler->cache_arg == SLJIT_MEM) {
+                if (argw == compiler->cache_argw) {
+                    other_r = TMP_REG3;
+                    argw = 0;
+                }
+                else if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
+                    FAIL_IF(compiler->error);
+                    compiler->cache_argw = argw;
+                    other_r = TMP_REG3;
+                    argw = 0;
+                }
+            }
+
+            /* argw is still non-zero when the cache could not be used. */
+            if (argw) {
+                FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+                compiler->cache_arg = SLJIT_MEM;
+                compiler->cache_argw = argw;
+                other_r = TMP_REG3;
+                argw = 0;
+            }
+        }
+
+        /* No caching here. */
+        arg &= REG_MASK;
+        argw &= 0x3;
+        if (!argw || argw == shift) {
+            /* The shift is encodable: access [arg, other_r] first, then
+               update the base register. */
+            FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(other_r) | (argw ? (1 << 12) : 0)));
+            return push_inst(compiler, ADD | RD(arg) | RN(arg) | RM(other_r) | (argw << 10));
+        }
+        if (arg != reg) {
+            FAIL_IF(push_inst(compiler, ADD | RD(arg) | RN(arg) | RM(other_r) | (argw << 10)));
+            return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg));
+        }
+        /* The base is the accessed register as well: compute the address
+           in TMP_REG4 and move it to the base after the access. */
+        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG4) | RN(arg) | RM(other_r) | (argw << 10)));
+        FAIL_IF(push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(TMP_REG4)));
+        return push_inst(compiler, ORR | RD(arg) | RN(TMP_ZERO) | RM(TMP_REG4));
+    }
+
+    if (arg & OFFS_REG_MASK) {
+        /* Base + shifted offset register which getput_arg_fast could not
+           encode: compute the address into tmp_r first. */
+        other_r = OFFS_REG(arg);
+        arg &= REG_MASK;
+        FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RN(arg) | RM(other_r) | ((argw & 0x3) << 10)));
+        return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(tmp_r));
+    }
+
+    if (compiler->cache_arg == arg) {
+        /* TMP_REG3 contains the address of (arg, cache_argw). */
+        diff = argw - compiler->cache_argw;
+        if (diff <= 255 && diff >= -256)
+            return push_inst(compiler, sljit_mem_simm[flags & 0x3] | (shift << 30)
+                | RT(reg) | RN(TMP_REG3) | ((diff & 0x1ff) << 12));
+        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, diff) != SLJIT_ERR_UNSUPPORTED) {
+            FAIL_IF(compiler->error);
+            /* TMP_REG3 has been moved to the address of (arg, argw): the
+               cache description must follow it, and the access must go
+               through TMP_REG3 (arg still contains the SLJIT_MEM bit
+               here, so it cannot be passed to RN). */
+            compiler->cache_argw = argw;
+            return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(TMP_REG3));
+        }
+    }
+
+    /* Non-negative offset up to 24 bits which is a multiple of the data
+       size: add the upper part to the base, encode the low 12 bits in the
+       access. */
+    if (argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
+        FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | RN(arg & REG_MASK) | ((argw >> 12) << 10)));
+        return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30)
+            | RT(reg) | RN(tmp_r) | ((argw & 0xfff) << (10 - shift)));
+    }
+
+    /* From here next_arg is a boolean: non-zero when the following operand
+       has the same base register and a different offset which is not
+       farther than 0xfff, so caching base + offset is useful. */
+    diff = argw - next_argw;
+    next_arg = (arg & REG_MASK) && (arg == next_arg) && diff <= 0xfff && diff >= -0xfff && diff != 0;
+    arg &= REG_MASK;
+
+    if (arg && compiler->cache_arg == SLJIT_MEM) {
+        /* TMP_REG3 contains a constant: use it as the offset register,
+           after adjusting it when necessary. */
+        if (compiler->cache_argw == argw)
+            return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
+        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
+            FAIL_IF(compiler->error);
+            compiler->cache_argw = argw;
+            return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
+        }
+    }
+
+    /* Nothing can be reused: fill TMP_REG3 and record its content. */
+    compiler->cache_argw = argw;
+    if (next_arg && emit_set_delta(compiler, TMP_REG3, arg, argw) != SLJIT_ERR_UNSUPPORTED) {
+        /* TMP_REG3 = base + offset, computed by one instruction. */
+        FAIL_IF(compiler->error);
+        compiler->cache_arg = SLJIT_MEM | arg;
+        arg = 0;
+    }
+    else {
+        /* TMP_REG3 = offset. */
+        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+        compiler->cache_arg = SLJIT_MEM;
+
+        if (next_arg) {
+            /* Add the base as well, so the next operand can use it. */
+            FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG3) | RN(TMP_REG3) | RM(arg)));
+            compiler->cache_arg = SLJIT_MEM | arg;
+            arg = 0;
+        }
+    }
+
+    /* arg is non-zero when TMP_REG3 contains only the offset. */
+    if (arg)
+        return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
+    return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(TMP_REG3));
+}
+
+/* Performs a single memory access on reg. The one instruction form is
+   tried first; otherwise the address cache is cleared and the generic
+   sequence is emitted without a following operand. */
+static SLJIT_INLINE sljit_si emit_op_mem(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg, sljit_sw argw)
+{
+    if (!getput_arg_fast(compiler, flags, reg, arg, argw)) {
+        compiler->cache_arg = 0;
+        compiler->cache_argw = 0;
+        return getput_arg(compiler, flags, reg, arg, argw, 0, 0);
+    }
+
+    /* The fast path has emitted the access (or has failed). */
+    return compiler->error;
+}
+
+/* Same as emit_op_mem, except that the address cache is kept, and the
+   operand which follows (arg2, arg2w) is passed to getput_arg. */
+static SLJIT_INLINE sljit_si emit_op_mem2(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg1, sljit_sw arg1w, sljit_si arg2, sljit_sw arg2w)
+{
+    if (!getput_arg_fast(compiler, flags, reg, arg1, arg1w))
+        return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
+
+    /* The fast path has emitted the access (or has failed). */
+    return compiler->error;
+}
+
+/* --------------------------------------------------------------------- */
+/*  Entry, exit                                                          */
+/* --------------------------------------------------------------------- */
+
+/* Emits the function prologue: allocates the stack frame, stores the
+   register pair written by the STP_PRE instruction and the saved
+   registers, sets SLJIT_LOCALS_REG, and copies the first args scratch
+   registers into the saved registers. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compiler, sljit_si args, sljit_si scratches, sljit_si saveds, sljit_si local_size)
+{
+    CHECK_ERROR();
+    check_sljit_emit_enter(compiler, args, scratches, saveds, local_size);
+
+    compiler->scratches = scratches;
+    compiler->saveds = saveds;
+#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
+    compiler->logical_local_size = local_size;
+#endif
+    /* The frame starts with 2 + saveds words, the locals follow them.
+       The total size is rounded up to a multiple of 16. */
+    compiler->locals_offset = (2 + saveds) * sizeof(sljit_sw);
+    local_size = (compiler->locals_offset + local_size + 15) & ~15;
+    compiler->local_size = local_size;
+
+    /* The 7 bit field of STP_PRE contains the negative frame size in 8 byte
+       units, so a frame up to 64 << 3 bytes is allocated by the store. */
+    if (local_size <= (64 << 3))
+        FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR)
+            | RN(TMP_SP) | ((-(local_size >> 3) & 0x7f) << 15)));
+    else {
+        /* Larger frame: subtract everything above 64 << 3 bytes from
+           TMP_SP first (in at most two steps), the store does the rest. */
+        local_size -= (64 << 3);
+        if (local_size > 0xfff) {
+            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+            local_size &= 0xfff;
+        }
+        if (local_size)
+            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
+        FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR) | RN(TMP_SP) | (0x40 << 15)));
+    }
+
+    /* SLJIT_LOCALS_REG = TMP_SP (an ADDI with zero immediate). */
+    FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_LOCALS_REG) | RN(TMP_SP)));
+
+    /* Store the saved registers from word index 2: in pairs where
+       possible, a single store handles an odd count. */
+    if (saveds >= 2)
+        FAIL_IF(push_inst(compiler, STP | RT(SLJIT_SAVED_REG1) | RT2(SLJIT_SAVED_REG2) | RN(TMP_SP) | (2 << 15)));
+    if (saveds >= 4)
+        FAIL_IF(push_inst(compiler, STP | RT(SLJIT_SAVED_REG3) | RT2(SLJIT_SAVED_EREG1) | RN(TMP_SP) | (4 << 15)));
+    if (saveds == 1)
+        FAIL_IF(push_inst(compiler, STRI | RT(SLJIT_SAVED_REG1) | RN(TMP_SP) | (2 << 10)));
+    if (saveds == 3)
+        FAIL_IF(push_inst(compiler, STRI | RT(SLJIT_SAVED_REG3) | RN(TMP_SP) | (4 << 10)));
+    if (saveds == 5)
+        FAIL_IF(push_inst(compiler, STRI | RT(SLJIT_SAVED_EREG2) | RN(TMP_SP) | (6 << 10)));
+
+    /* Copy the incoming arguments into the saved registers. */
+    if (args >= 1)
+        FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_SAVED_REG1) | RN(TMP_ZERO) | RM(SLJIT_SCRATCH_REG1)));
+    if (args >= 2)
+        FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_SAVED_REG2) | RN(TMP_ZERO) | RM(SLJIT_SCRATCH_REG2)));
+    if (args >= 3)
+        FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_SAVED_REG3) | RN(TMP_ZERO) | RM(SLJIT_SCRATCH_REG3)));
+
+    return SLJIT_SUCCESS;
+}
+
+/* Records the same frame description in the compiler as sljit_emit_enter
+   does (scratches, saveds, locals_offset, local_size), but does not emit
+   any instruction. */
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_context(struct sljit_compiler *compiler, sljit_si args, sljit_si scratches, sljit_si saveds, sljit_si local_size)
+{
+    CHECK_ERROR_VOID();
+    check_sljit_set_context(compiler, args, scratches, saveds, local_size);
+
+    compiler->scratches = scratches;
+    compiler->saveds = saveds;
+#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
+    compiler->logical_local_size = local_size;
+#endif
+    /* 2 + saveds words precede the locals, and the total frame size is
+       rounded up to a multiple of 16. */
+    compiler->locals_offset = (2 + saveds) * sizeof(sljit_sw);
+    compiler->local_size = (compiler->locals_offset + local_size + 15) & ~15;
+}
+
+/* Emits the function epilogue: moves the return value by
+   emit_mov_before_return, reloads the saved registers, releases the
+   stack frame and returns through TMP_LR. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw)
+{
+    sljit_si saveds, local_size;
+
+    CHECK_ERROR();
+    check_sljit_emit_return(compiler, op, src, srcw);
+
+    FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
+
+    saveds = compiler->saveds;
+
+    /* Reload the saved registers from the offsets used by sljit_emit_enter. */
+    if (saveds >= 2)
+        FAIL_IF(push_inst(compiler, LDP | RT(SLJIT_SAVED_REG1) | RT2(SLJIT_SAVED_REG2) | RN(TMP_SP) | (2 << 15)));
+    if (saveds >= 4)
+        FAIL_IF(push_inst(compiler, LDP | RT(SLJIT_SAVED_REG3) | RT2(SLJIT_SAVED_EREG1) | RN(TMP_SP) | (4 << 15)));
+    if (saveds == 1)
+        FAIL_IF(push_inst(compiler, LDRI | RT(SLJIT_SAVED_REG1) | RN(TMP_SP) | (2 << 10)));
+    if (saveds == 3)
+        FAIL_IF(push_inst(compiler, LDRI | RT(SLJIT_SAVED_REG3) | RN(TMP_SP) | (4 << 10)));
+    if (saveds == 5)
+        FAIL_IF(push_inst(compiler, LDRI | RT(SLJIT_SAVED_EREG2) | RN(TMP_SP) | (6 << 10)));
+
+    local_size = compiler->local_size;
+
+    /* The 7 bit field of LDP_PST contains the frame size in 8 byte units.
+       Frames up to 62 << 3 bytes are released by the load itself. */
+    if (local_size <= (62 << 3))
+        FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
+            | RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15)));
+    else {
+        /* Larger frame: the load releases 62 << 3 bytes, the rest is
+           added to TMP_SP in at most two steps. */
+        FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR) | RN(TMP_SP) | (0x3e << 15)));
+        local_size -= (62 << 3);
+        if (local_size > 0xfff) {
+            FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+            local_size &= 0xfff;
+        }
+        if (local_size)
+            FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
+    }
+
+    FAIL_IF(push_inst(compiler, RET | RN(TMP_LR)));
+    return SLJIT_SUCCESS;
+}
+
+/* --------------------------------------------------------------------- */
+/*  Operators                                                            */
+/* --------------------------------------------------------------------- */
+
+/* Emits an operation which has no explicit operands. The multiplications
+   and divisions work on SLJIT_SCRATCH_REG1 and SLJIT_SCRATCH_REG2, and
+   place their two results into the same registers. Opcodes which are not
+   listed below emit nothing, and SLJIT_SUCCESS is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
+{
+    /* Bit 31 of the division instructions is toggled for SLJIT_INT_OP. */
+    sljit_ins inv_bits = (op & SLJIT_INT_OP) ? (1 << 31) : 0;
+
+    CHECK_ERROR();
+    check_sljit_emit_op0(compiler, op);
+
+    op = GET_OPCODE(op);
+
+    if (op == SLJIT_BREAKPOINT)
+        return push_inst(compiler, BRK);
+
+    if (op == SLJIT_NOP)
+        return push_inst(compiler, NOP);
+
+    if (op == SLJIT_UMUL || op == SLJIT_SMUL) {
+        /* Keep the first operand in TMP_REG1, since SLJIT_SCRATCH_REG1 is
+           overwritten by the MADD before the SMULH / UMULH reads it. */
+        FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_SCRATCH_REG1)));
+        FAIL_IF(push_inst(compiler, MADD | RD(SLJIT_SCRATCH_REG1) | RN(SLJIT_SCRATCH_REG1) | RM(SLJIT_SCRATCH_REG2) | RT2(TMP_ZERO)));
+        return push_inst(compiler, (op == SLJIT_SMUL ? SMULH : UMULH) | RD(SLJIT_SCRATCH_REG2) | RN(TMP_REG1) | RM(SLJIT_SCRATCH_REG2));
+    }
+
+    if (op == SLJIT_UDIV || op == SLJIT_SDIV) {
+        /* SLJIT_SCRATCH_REG1 receives the result of the division, and
+           SLJIT_SCRATCH_REG2 receives the saved first operand minus
+           result * second operand (presumably the remainder). */
+        FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_SCRATCH_REG1)));
+        FAIL_IF(push_inst(compiler, ((op == SLJIT_SDIV ? SDIV : UDIV) ^ inv_bits) | RD(SLJIT_SCRATCH_REG1) | RN(SLJIT_SCRATCH_REG1) | RM(SLJIT_SCRATCH_REG2)));
+        FAIL_IF(push_inst(compiler, (MADD ^ inv_bits) | RD(SLJIT_SCRATCH_REG2) | RN(SLJIT_SCRATCH_REG1) | RM(SLJIT_SCRATCH_REG2) | RT2(TMP_ZERO)));
+        return push_inst(compiler, (SUB ^ inv_bits) | RD(SLJIT_SCRATCH_REG2) | RN(TMP_REG1) | RM(SLJIT_SCRATCH_REG2));
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+/* Emits a single-source operation.
+   op:        opcode combined with flag bits. GET_OPCODE(op) selects either
+              a move (SLJIT_MOV .. SLJIT_MOVU_P) or another unary operation
+              that is forwarded to emit_op_imm.
+   dst, dstw: destination operand (register, memory or SLJIT_UNUSED).
+   src, srcw: source operand (register, memory or immediate).
+   Returns SLJIT_SUCCESS, or an error code when emitting fails. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
+    sljit_si dst, sljit_sw dstw,
+    sljit_si src, sljit_sw srcw)
+{
+    sljit_si dst_r, flags, mem_flags;
+    sljit_si op_flags = GET_ALL_FLAGS(op);
+
+    CHECK_ERROR();
+    check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+    ADJUST_LOCAL_OFFSET(src, srcw);
+
+    compiler->cache_arg = 0;
+    compiler->cache_argw = 0;
+
+    /* The result is computed into TMP_REG1 unless dst itself is a register. */
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+
+    op = GET_OPCODE(op);
+    if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
+        /* Select the access size / signedness / update flags of the move.
+           Immediate sources are narrowed to the width of the move here. */
+        switch (op) {
+        case SLJIT_MOV:
+        case SLJIT_MOV_P:
+            flags = WORD_SIZE;
+            break;
+        case SLJIT_MOV_UB:
+            flags = BYTE_SIZE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_ub)srcw;
+            break;
+        case SLJIT_MOV_SB:
+            flags = BYTE_SIZE | SIGNED;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_sb)srcw;
+            break;
+        case SLJIT_MOV_UH:
+            flags = HALF_SIZE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_uh)srcw;
+            break;
+        case SLJIT_MOV_SH:
+            flags = HALF_SIZE | SIGNED;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_sh)srcw;
+            break;
+        case SLJIT_MOV_UI:
+            flags = INT_SIZE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_ui)srcw;
+            break;
+        case SLJIT_MOV_SI:
+            flags = INT_SIZE | SIGNED;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_si)srcw;
+            break;
+        case SLJIT_MOVU:
+        case SLJIT_MOVU_P:
+            flags = WORD_SIZE | UPDATE;
+            break;
+        case SLJIT_MOVU_UB:
+            flags = BYTE_SIZE | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_ub)srcw;
+            break;
+        case SLJIT_MOVU_SB:
+            flags = BYTE_SIZE | SIGNED | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_sb)srcw;
+            break;
+        case SLJIT_MOVU_UH:
+            flags = HALF_SIZE | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_uh)srcw;
+            break;
+        case SLJIT_MOVU_SH:
+            flags = HALF_SIZE | SIGNED | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_sh)srcw;
+            break;
+        case SLJIT_MOVU_UI:
+            flags = INT_SIZE | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_ui)srcw;
+            break;
+        case SLJIT_MOVU_SI:
+            flags = INT_SIZE | SIGNED | UPDATE;
+            if (src & SLJIT_IMM)
+                srcw = (sljit_si)srcw;
+            break;
+        default:
+            SLJIT_ASSERT_STOP();
+            flags = 0;
+            break;
+        }
+
+        if (src & SLJIT_IMM)
+            FAIL_IF(emit_op_imm(compiler, SLJIT_MOV | ARG2_IMM, dst_r, TMP_REG1, srcw));
+        else if (src & SLJIT_MEM) {
+            if (getput_arg_fast(compiler, flags, dst_r, src, srcw))
+                FAIL_IF(compiler->error);
+            else
+                FAIL_IF(getput_arg(compiler, flags, dst_r, src, srcw, dst, dstw));
+        } else {
+            /* Register source: a register destination is handled by
+               emit_op_imm; otherwise src is stored to memory directly. */
+            if (dst_r != TMP_REG1)
+                return emit_op_imm(compiler, op | ((op_flags & SLJIT_INT_OP) ? INT_OP : 0), dst_r, TMP_REG1, src);
+            dst_r = src;
+        }
+
+        if (dst & SLJIT_MEM) {
+            if (getput_arg_fast(compiler, flags | STORE, dst_r, dst, dstw))
+                return compiler->error;
+            else
+                return getput_arg(compiler, flags | STORE, dst_r, dst, dstw, 0, 0);
+        }
+        return SLJIT_SUCCESS;
+    }
+
+    /* Non-move unary operations. */
+    flags = GET_FLAGS(op_flags) ? SET_FLAGS : 0;
+    mem_flags = WORD_SIZE;
+    if (op_flags & SLJIT_INT_OP) {
+        flags |= INT_OP;
+        mem_flags = INT_SIZE;
+    }
+
+    if (dst == SLJIT_UNUSED)
+        flags |= UNUSED_RETURN;
+
+    /* A memory source is loaded into TMP_REG2 first. */
+    if (src & SLJIT_MEM) {
+        if (getput_arg_fast(compiler, mem_flags, TMP_REG2, src, srcw))
+            FAIL_IF(compiler->error);
+        else
+            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src, srcw, dst, dstw));
+        src = TMP_REG2;
+    }
+
+    if (src & SLJIT_IMM) {
+        flags |= ARG2_IMM;
+        if (op_flags & SLJIT_INT_OP)
+            srcw = (sljit_si)srcw;
+    } else
+        srcw = src;
+
+    /* Propagate a failure instead of reporting success (or storing a
+       result that was never computed) after the operation failed. */
+    FAIL_IF(emit_op_imm(compiler, flags | op, dst_r, TMP_REG1, srcw));
+
+    if (dst & SLJIT_MEM) {
+        if (getput_arg_fast(compiler, mem_flags | STORE, dst_r, dst, dstw))
+            return compiler->error;
+        else
+            return getput_arg(compiler, mem_flags | STORE, dst_r, dst, dstw, 0, 0);
+    }
+    return SLJIT_SUCCESS;
+}
+
+/* Emits a two-source operation selected by GET_OPCODE(op).
+   dst, dstw:   destination operand (register, memory or SLJIT_UNUSED).
+   src1, src1w: first source operand (register, memory or immediate).
+   src2, src2w: second source operand (register, memory or immediate).
+   Memory sources are loaded into TMP_REG1 / TMP_REG2 before the operation.
+   Returns SLJIT_SUCCESS, or an error code when emitting fails. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
+    sljit_si dst, sljit_sw dstw,
+    sljit_si src1, sljit_sw src1w,
+    sljit_si src2, sljit_sw src2w)
+{
+    sljit_si dst_r, flags, mem_flags;
+
+    CHECK_ERROR();
+    check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+    ADJUST_LOCAL_OFFSET(src1, src1w);
+    ADJUST_LOCAL_OFFSET(src2, src2w);
+
+    compiler->cache_arg = 0;
+    compiler->cache_argw = 0;
+
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+    flags = GET_FLAGS(op) ? SET_FLAGS : 0;
+    mem_flags = WORD_SIZE;
+    if (op & SLJIT_INT_OP) {
+        flags |= INT_OP;
+        mem_flags = INT_SIZE;
+    }
+
+    if (dst == SLJIT_UNUSED)
+        flags |= UNUSED_RETURN;
+
+    /* Remember which operands cannot be accessed through the fast path;
+       they are handled by getput_arg below. */
+    if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, mem_flags | STORE | ARG_TEST, TMP_REG1, dst, dstw))
+        flags |= SLOW_DEST;
+
+    if (src1 & SLJIT_MEM) {
+        if (getput_arg_fast(compiler, mem_flags, TMP_REG1, src1, src1w))
+            FAIL_IF(compiler->error);
+        else
+            flags |= SLOW_SRC1;
+    }
+    if (src2 & SLJIT_MEM) {
+        if (getput_arg_fast(compiler, mem_flags, TMP_REG2, src2, src2w))
+            FAIL_IF(compiler->error);
+        else
+            flags |= SLOW_SRC2;
+    }
+
+    if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
+        /* Both sources are slow: choose the load order from what can_cache
+           reports for the (src1, src2) and (src1, dst) pairs. */
+        if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
+            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, src1, src1w));
+            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, dst, dstw));
+        }
+        else {
+            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, src2, src2w));
+            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, dst, dstw));
+        }
+    }
+    else if (flags & SLOW_SRC1)
+        FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, dst, dstw));
+    else if (flags & SLOW_SRC2)
+        FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, dst, dstw));
+
+    if (src1 & SLJIT_MEM)
+        src1 = TMP_REG1;
+    if (src2 & SLJIT_MEM)
+        src2 = TMP_REG2;
+
+    /* emit_op_imm receives either the immediate value or the register
+       number in the same argument slot. */
+    if (src1 & SLJIT_IMM)
+        flags |= ARG1_IMM;
+    else
+        src1w = src1;
+    if (src2 & SLJIT_IMM)
+        flags |= ARG2_IMM;
+    else
+        src2w = src2;
+
+    /* Propagate a failure instead of reporting success (or storing a
+       result that was never computed) after the operation failed. */
+    FAIL_IF(emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src1w, src2w));
+
+    if (dst & SLJIT_MEM) {
+        if (!(flags & SLOW_DEST)) {
+            getput_arg_fast(compiler, mem_flags | STORE, dst_r, dst, dstw);
+            return compiler->error;
+        }
+        return getput_arg(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+/* Returns the reg_map entry that belongs to the given sljit register. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
+{
+    sljit_si mapped;
+
+    check_sljit_get_register_index(reg);
+    mapped = reg_map[reg];
+    return mapped;
+}
+
+/* Returns the index of a floating point register. No translation table
+   is involved: the sljit register number is returned unchanged. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
+{
+    sljit_si index;
+
+    check_sljit_get_float_register_index(reg);
+    index = reg;
+    return index;
+}
+
+/* Appends one caller-supplied instruction word to the instruction stream.
+   instruction: points to the instruction word to emit.
+   size:        byte size of the instruction; asserted to be 4.
+   Returns the result of push_inst. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
+    void *instruction, sljit_si size)
+{
+    sljit_ins *word;
+
+    CHECK_ERROR();
+    check_sljit_emit_op_custom(compiler, instruction, size);
+    SLJIT_ASSERT(size == 4);
+
+    word = (sljit_ins*)instruction;
+    return push_inst(compiler, word[0]);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Floating point operators                                             */
+/* --------------------------------------------------------------------- */
+
+/* Reports whether floating point operations may be used.
+   This backend always answers 1 (available). */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
+{
+    const sljit_si fpu_present = 1;
+
+    return fpu_present;
+}
+
+/* Emits a floating point load or store between register reg and the
+   memory operand (arg, argw).
+   flags: access size (read through MEM_SIZE_SHIFT) and optionally STORE;
+          without STORE a load is emitted.
+   The cheapest addressing form that fits is tried first; the remaining
+   cases build the offset in TMP_REG3, whose current content is tracked
+   through compiler->cache_arg / compiler->cache_argw. */
+static sljit_si emit_fop_mem(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg, sljit_sw argw)
+{
+    sljit_ui size_shift = MEM_SIZE_SHIFT(flags);
+    sljit_ins op_bits = (size_shift << 30);
+    sljit_si index_r;
+    sljit_sw delta;
+
+    SLJIT_ASSERT(arg & SLJIT_MEM);
+
+    /* Bit 22 is set for every access that is not a store. */
+    if ((flags & STORE) == 0)
+        op_bits |= 1 << 22;
+
+    if (arg & OFFS_REG_MASK) {
+        /* base register + (index register << argw) */
+        index_r = OFFS_REG(arg);
+        arg &= REG_MASK;
+        argw &= 3;
+
+        if (argw == 0)
+            return push_inst(compiler, STR_FR | op_bits | VT(reg)
+                | RN(arg) | RM(index_r));
+        if (argw == size_shift)
+            return push_inst(compiler, STR_FR | op_bits | VT(reg)
+                | RN(arg) | RM(index_r) | (1 << 12));
+
+        /* Any other shift amount: compute the address into TMP_REG1. */
+        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RN(arg) | RM(index_r) | (argw << 10)));
+        arg = TMP_REG1;
+        argw = 0;
+    }
+
+    arg &= REG_MASK;
+    if (arg) {
+        /* Non-negative offset that is a multiple of the access size. */
+        if (argw >= 0 && ((argw >> size_shift) <= 0xfff) && (argw & ((1 << size_shift) - 1)) == 0)
+            return push_inst(compiler, STR_F | op_bits | VT(reg) | RN(arg) | (argw << (10 - size_shift)));
+
+        /* Offset in the [-256, 255] range. */
+        if (argw >= -256 && argw <= 255)
+            return push_inst(compiler, STUR_F | op_bits | VT(reg) | RN(arg) | ((argw & 0x1ff) << 12));
+    }
+
+    /* Slow cases: the offset has to be available in TMP_REG3. */
+    if (compiler->cache_arg == SLJIT_MEM && compiler->cache_argw != argw) {
+        delta = argw - compiler->cache_argw;
+        if (!arg && delta >= -256 && delta <= 255)
+            return push_inst(compiler, STUR_F | op_bits | VT(reg) | RN(TMP_REG3) | ((delta & 0x1ff) << 12));
+        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, delta) != SLJIT_ERR_UNSUPPORTED) {
+            FAIL_IF(compiler->error);
+            compiler->cache_argw = argw;
+        }
+    }
+
+    if (compiler->cache_arg != SLJIT_MEM || compiler->cache_argw != argw) {
+        compiler->cache_arg = SLJIT_MEM;
+        compiler->cache_argw = argw;
+        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+    }
+
+    if ((arg & REG_MASK) == 0)
+        return push_inst(compiler, STR_F | op_bits | VT(reg) | RN(TMP_REG3));
+    return push_inst(compiler, STR_FR | op_bits | VT(reg) | RN(arg) | RM(TMP_REG3));
+}
+
+/* Emits a single-source floating point operation.
+   op:        SLJIT_CMPD, SLJIT_MOVD, SLJIT_NEGD or SLJIT_ABSD, optionally
+              combined with SLJIT_SINGLE_OP for single precision.
+   dst, dstw: destination operand; for SLJIT_CMPD it is the first operand
+              of the comparison.
+   src, srcw: source operand.
+   Returns SLJIT_SUCCESS, or an error code when emitting fails. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
+    sljit_si dst, sljit_sw dstw,
+    sljit_si src, sljit_sw srcw)
+{
+    sljit_si dst_r, mem_flags = (op & SLJIT_SINGLE_OP) ? INT_SIZE : WORD_SIZE;
+    /* XOR-ed into the opcode to select the single precision form. */
+    sljit_ins inv_bits = (op & SLJIT_SINGLE_OP) ? (1 << 22) : 0;
+
+    CHECK_ERROR();
+    check_sljit_emit_fop1(compiler, op, dst, dstw, src, srcw);
+
+    compiler->cache_arg = 0;
+    compiler->cache_argw = 0;
+
+    if (GET_OPCODE(op) == SLJIT_CMPD) {
+        /* Both operands are only read; memory operands are loaded into
+           the temporary float registers. Load failures are propagated. */
+        if (dst & SLJIT_MEM) {
+            FAIL_IF(emit_fop_mem(compiler, mem_flags, TMP_FREG1, dst, dstw));
+            dst = TMP_FREG1;
+        }
+        if (src & SLJIT_MEM) {
+            FAIL_IF(emit_fop_mem(compiler, mem_flags, TMP_FREG2, src, srcw));
+            src = TMP_FREG2;
+        }
+        return push_inst(compiler, (FCMP ^ inv_bits) | VN(dst) | VM(src));
+    }
+
+    dst_r = (dst <= REG_MASK) ? dst : TMP_FREG1;
+    if (src & SLJIT_MEM) {
+        /* Load straight into the result register. */
+        FAIL_IF(emit_fop_mem(compiler, mem_flags, dst_r, src, srcw));
+        src = dst_r;
+    }
+
+    switch (GET_OPCODE(op)) {
+    case SLJIT_MOVD:
+        if (src != dst_r)
+            FAIL_IF(push_inst(compiler, (FMOV ^ inv_bits) | VD(dst_r) | VN(src)));
+        break;
+    case SLJIT_NEGD:
+        FAIL_IF(push_inst(compiler, (FNEG ^ inv_bits) | VD(dst_r) | VN(src)));
+        break;
+    case SLJIT_ABSD:
+        FAIL_IF(push_inst(compiler, (FABS ^ inv_bits) | VD(dst_r) | VN(src)));
+        break;
+    }
+
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;
+    return emit_fop_mem(compiler, mem_flags | STORE, TMP_FREG1, dst, dstw);
+}
+
+/* Emits a two-source floating point operation.
+   op:          SLJIT_ADDD, SLJIT_SUBD, SLJIT_MULD or SLJIT_DIVD, optionally
+                combined with SLJIT_SINGLE_OP for single precision.
+   dst, dstw:   destination operand.
+   src1, src1w: first source operand.
+   src2, src2w: second source operand.
+   Returns SLJIT_SUCCESS, or an error code when emitting fails. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
+    sljit_si dst, sljit_sw dstw,
+    sljit_si src1, sljit_sw src1w,
+    sljit_si src2, sljit_sw src2w)
+{
+    sljit_si dst_r, mem_flags = (op & SLJIT_SINGLE_OP) ? INT_SIZE : WORD_SIZE;
+    /* XOR-ed into the opcode to select the single precision form. */
+    sljit_ins inv_bits = (op & SLJIT_SINGLE_OP) ? (1 << 22) : 0;
+
+    CHECK_ERROR();
+    check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
+
+    compiler->cache_arg = 0;
+    compiler->cache_argw = 0;
+
+    dst_r = (dst <= REG_MASK) ? dst : TMP_FREG1;
+
+    /* Memory sources are loaded into the temporary float registers;
+       a failing load is propagated to the caller. */
+    if (src1 & SLJIT_MEM) {
+        FAIL_IF(emit_fop_mem(compiler, mem_flags, TMP_FREG1, src1, src1w));
+        src1 = TMP_FREG1;
+    }
+    if (src2 & SLJIT_MEM) {
+        FAIL_IF(emit_fop_mem(compiler, mem_flags, TMP_FREG2, src2, src2w));
+        src2 = TMP_FREG2;
+    }
+
+    switch (GET_OPCODE(op)) {
+    case SLJIT_ADDD:
+        FAIL_IF(push_inst(compiler, (FADD ^ inv_bits) | VD(dst_r) | VN(src1) | VM(src2)));
+        break;
+    case SLJIT_SUBD:
+        FAIL_IF(push_inst(compiler, (FSUB ^ inv_bits) | VD(dst_r) | VN(src1) | VM(src2)));
+        break;
+    case SLJIT_MULD:
+        FAIL_IF(push_inst(compiler, (FMUL ^ inv_bits) | VD(dst_r) | VN(src1) | VM(src2)));
+        break;
+    case SLJIT_DIVD:
+        FAIL_IF(push_inst(compiler, (FDIV ^ inv_bits) | VD(dst_r) | VN(src1) | VM(src2)));
+        break;
+    }
+
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;
+    return emit_fop_mem(compiler, mem_flags | STORE, TMP_FREG1, dst, dstw);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Other instructions                                                   */
+/* --------------------------------------------------------------------- */
+
+/* Copies TMP_LR into dst at the entry of a fast-called code block.
+   dst, dstw: where TMP_LR is saved (register, memory or SLJIT_UNUSED). */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw)
+{
+    CHECK_ERROR();
+    check_sljit_emit_fast_enter(compiler, dst, dstw);
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+
+    /* An unused destination is uncommon but allowed: nothing to emit. */
+    if (dst == SLJIT_UNUSED)
+        return SLJIT_SUCCESS;
+
+    /* Memory destination. */
+    if (dst > REG_MASK)
+        return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_LR, dst, dstw);
+
+    /* Register destination. */
+    return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(TMP_LR));
+}
+
+/* Moves the value of src into TMP_LR and emits a RET through TMP_LR.
+   src, srcw: register, memory operand or immediate. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    check_sljit_emit_fast_return(compiler, src, srcw);
+    ADJUST_LOCAL_OFFSET(src, srcw);
+
+    if (src > REG_MASK) {
+        if (src & SLJIT_MEM)
+            FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_LR, src, srcw));
+        else if (src & SLJIT_IMM)
+            FAIL_IF(load_immediate(compiler, TMP_LR, srcw));
+    }
+    else {
+        /* Plain register source. */
+        FAIL_IF(push_inst(compiler, ORR | RD(TMP_LR) | RN(TMP_ZERO) | RM(src)));
+    }
+
+    return push_inst(compiler, RET | RN(TMP_LR));
+}
+
+/* --------------------------------------------------------------------- */
+/*  Conditional instructions                                             */
+/* --------------------------------------------------------------------- */
+
+/* Translates an sljit condition type (SLJIT_C_*) into the 4 bit condition
+   field used by the callers in this file.
+   NOTE(review): the values look like the inverse of the AArch64 condition
+   for each type (e.g. EQUAL maps to 0x1); confirm against the callers.
+   Unknown types hit SLJIT_ASSERT_STOP and yield 0xe. */
+static sljit_uw get_cc(sljit_si type)
+{
+    sljit_uw cc;
+
+    switch (type) {
+    case SLJIT_C_EQUAL:
+    case SLJIT_C_MUL_NOT_OVERFLOW:
+    case SLJIT_C_FLOAT_EQUAL:
+        cc = 0x1;
+        break;
+    case SLJIT_C_NOT_EQUAL:
+    case SLJIT_C_MUL_OVERFLOW:
+    case SLJIT_C_FLOAT_NOT_EQUAL:
+        cc = 0x0;
+        break;
+    case SLJIT_C_LESS:
+    case SLJIT_C_FLOAT_LESS:
+        cc = 0x2;
+        break;
+    case SLJIT_C_GREATER_EQUAL:
+    case SLJIT_C_FLOAT_GREATER_EQUAL:
+        cc = 0x3;
+        break;
+    case SLJIT_C_GREATER:
+    case SLJIT_C_FLOAT_GREATER:
+        cc = 0x9;
+        break;
+    case SLJIT_C_LESS_EQUAL:
+    case SLJIT_C_FLOAT_LESS_EQUAL:
+        cc = 0x8;
+        break;
+    case SLJIT_C_SIG_LESS:
+        cc = 0xa;
+        break;
+    case SLJIT_C_SIG_GREATER_EQUAL:
+        cc = 0xb;
+        break;
+    case SLJIT_C_SIG_GREATER:
+        cc = 0xd;
+        break;
+    case SLJIT_C_SIG_LESS_EQUAL:
+        cc = 0xc;
+        break;
+    case SLJIT_C_OVERFLOW:
+    case SLJIT_C_FLOAT_UNORDERED:
+        cc = 0x7;
+        break;
+    case SLJIT_C_NOT_OVERFLOW:
+    case SLJIT_C_FLOAT_ORDERED:
+        cc = 0x6;
+        break;
+    default:
+        SLJIT_ASSERT_STOP();
+        cc = 0xe;
+        break;
+    }
+
+    return cc;
+}
+
+/* Returns a label for the current position of the instruction stream.
+   The most recent label is reused when its recorded size equals the
+   current compiler->size; otherwise a new label is allocated.
+   Returns NULL (through the PTR_FAIL_IF / CHECK_ERROR_PTR macros) on error. */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
+{
+    struct sljit_label *label;
+
+    CHECK_ERROR_PTR();
+    check_sljit_emit_label(compiler);
+
+    label = compiler->last_label;
+    if (label && label->size == compiler->size)
+        return label;
+
+    label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
+    PTR_FAIL_IF(!label);
+    set_label(label, compiler);
+    return label;
+}
+
+/* Emits a jump or call whose target is filled in later.
+   type: jump type in the low 8 bits, optionally combined with
+         SLJIT_REWRITABLE_JUMP.
+   The emitted sequence is an optional conditional branch (for types below
+   SLJIT_JUMP), a 64 bit constant load into TMP_REG1 and a BR / BLR through
+   TMP_REG1. jump->addr records compiler->size just before the BR / BLR. */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
+{
+    struct sljit_jump *jump;
+    sljit_si is_call;
+
+    CHECK_ERROR_PTR();
+    check_sljit_emit_jump(compiler, type);
+
+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    PTR_FAIL_IF(!jump);
+    set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
+
+    type &= 0xff;
+    is_call = (type >= SLJIT_FAST_CALL);
+
+    if (type < SLJIT_JUMP) {
+        jump->flags |= IS_COND;
+        PTR_FAIL_IF(push_inst(compiler, B_CC | (6 << 5) | get_cc(type)));
+    }
+    else if (is_call)
+        jump->flags |= IS_BL;
+
+    PTR_FAIL_IF(emit_imm64_const(compiler, TMP_REG1, 0));
+    jump->addr = compiler->size;
+    PTR_FAIL_IF(push_inst(compiler, (is_call ? BLR : BR) | RN(TMP_REG1)));
+
+    return jump;
+}
+
+/* Emits a compare-with-zero jump built from CBZ (or its variant selected
+   by bit 24).
+   type:      SLJIT_C_EQUAL or SLJIT_C_NOT_EQUAL in the low 8 bits, optionally
+              combined with SLJIT_INT_OP and SLJIT_REWRITABLE_JUMP.
+   src, srcw: value to test; memory and immediate sources are first
+              loaded into TMP_REG1.
+   Returns the new jump, or NULL on error. */
+static SLJIT_INLINE struct sljit_jump* emit_cmp_to0(struct sljit_compiler *compiler, sljit_si type,
+    sljit_si src, sljit_sw srcw)
+{
+    struct sljit_jump *jump;
+    sljit_si cond = type & 0xff;
+    sljit_ins inv_bits = (type & SLJIT_INT_OP) ? (1 << 31) : 0;
+
+    SLJIT_ASSERT(cond == SLJIT_C_EQUAL || cond == SLJIT_C_NOT_EQUAL);
+
+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    PTR_FAIL_IF(!jump);
+    set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
+    jump->flags |= IS_CBZ | IS_COND;
+
+    if (src & SLJIT_MEM) {
+        PTR_FAIL_IF(emit_op_mem(compiler, (type & SLJIT_INT_OP) ? INT_SIZE : WORD_SIZE, TMP_REG1, src, srcw));
+        src = TMP_REG1;
+    }
+    else if (src & SLJIT_IMM) {
+        PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
+        src = TMP_REG1;
+    }
+    SLJIT_ASSERT(FAST_IS_REG(src));
+
+    if (cond == SLJIT_C_EQUAL)
+        inv_bits |= 1 << 24;
+
+    /* Same layout as sljit_emit_jump: conditional branch, constant load
+       of the target into TMP_REG1, then BR. */
+    PTR_FAIL_IF(push_inst(compiler, (CBZ ^ inv_bits) | (6 << 5) | RT(src)));
+    PTR_FAIL_IF(emit_imm64_const(compiler, TMP_REG1, 0));
+    jump->addr = compiler->size;
+    PTR_FAIL_IF(push_inst(compiler, BR | RN(TMP_REG1)));
+    return jump;
+}
+
+/* Emits an indirect jump or call.
+   type:      jump type; values from SLJIT_FAST_CALL upwards use BLR,
+              everything else uses BR.
+   src, srcw: target address as register, memory operand or immediate.
+              An immediate target is recorded in a sljit_jump (JUMP_ADDR)
+              and reached through a constant load into TMP_REG1. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
+{
+    struct sljit_jump *jump;
+    sljit_si is_call;
+
+    CHECK_ERROR();
+    check_sljit_emit_ijump(compiler, type, src, srcw);
+    ADJUST_LOCAL_OFFSET(src, srcw);
+
+    is_call = (type >= SLJIT_FAST_CALL);
+
+    if (src & SLJIT_IMM) {
+        jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+        FAIL_IF(!jump);
+        set_jump(jump, compiler, JUMP_ADDR | (is_call ? IS_BL : 0));
+        jump->u.target = srcw;
+
+        FAIL_IF(emit_imm64_const(compiler, TMP_REG1, 0));
+        jump->addr = compiler->size;
+        return push_inst(compiler, (is_call ? BLR : BR) | RN(TMP_REG1));
+    }
+
+    /* In ARM, we don't need to touch the arguments. */
+    if (src & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw));
+        src = TMP_REG1;
+    }
+    return push_inst(compiler, (is_call ? BLR : BR) | RN(src));
+}
+
+/* Materializes a condition as a value, or combines it with src.
+   op:        opcodes below SLJIT_ADD store the condition value itself into
+              dst; other opcodes compute "src op condition value" through
+              emit_op_imm.
+   dst, dstw: destination operand (nothing is emitted for SLJIT_UNUSED).
+   src, srcw: first operand of the binary form.
+   type:      condition type, translated by get_cc.
+   Returns SLJIT_SUCCESS, or an error code when emitting fails. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
+    sljit_si dst, sljit_sw dstw,
+    sljit_si src, sljit_sw srcw,
+    sljit_si type)
+{
+    sljit_si dst_r, flags, mem_flags;
+    sljit_sw arg1;
+    sljit_ins cc;
+
+    CHECK_ERROR();
+    check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+    ADJUST_LOCAL_OFFSET(src, srcw);
+
+    if (dst == SLJIT_UNUSED)
+        return SLJIT_SUCCESS;
+
+    cc = get_cc(type);
+    dst_r = (dst <= REG_MASK) ? dst : TMP_REG1;
+
+    if (GET_OPCODE(op) < SLJIT_ADD) {
+        /* Move form: the condition value goes directly into dst. */
+        FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(dst_r) | RN(TMP_ZERO) | RM(TMP_ZERO)));
+        if (dst_r != TMP_REG1)
+            return SLJIT_SUCCESS;
+        return emit_op_mem(compiler, (GET_OPCODE(op) == SLJIT_MOV ? WORD_SIZE : INT_SIZE) | STORE, TMP_REG1, dst, dstw);
+    }
+
+    compiler->cache_arg = 0;
+    compiler->cache_argw = 0;
+    flags = GET_FLAGS(op) ? SET_FLAGS : 0;
+    mem_flags = WORD_SIZE;
+    if (op & SLJIT_INT_OP) {
+        flags |= INT_OP;
+        mem_flags = INT_SIZE;
+    }
+
+    /* Select the first argument of emit_op_imm: a register number, or the
+       immediate value itself when ARG1_IMM is set (passing src in that
+       case would hand over the operand kind bits instead of the value). */
+    if (src & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, src, srcw, dst, dstw));
+        arg1 = TMP_REG1;
+    } else if (src & SLJIT_IMM) {
+        flags |= ARG1_IMM;
+        arg1 = srcw;
+    } else
+        arg1 = src;
+
+    /* The condition value is produced in TMP_REG2 and used as the second
+       argument of the operation. */
+    FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(TMP_ZERO)));
+    FAIL_IF(emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, arg1, TMP_REG2));
+
+    if (dst_r != TMP_REG1)
+        return SLJIT_SUCCESS;
+    return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
+}
+
+/* Emits a constant load whose value is set through emit_imm64_const and
+   registers it as a sljit_const.
+   dst, dstw:  destination operand; a memory destination is written from
+               the register that received the constant.
+   init_value: initial value of the constant.
+   Returns the new sljit_const, or NULL on error. */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
+{
+    struct sljit_const *const_;
+    sljit_si dst_r;
+
+    CHECK_ERROR_PTR();
+    check_sljit_emit_const(compiler, dst, dstw, init_value);
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+
+    const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
+    PTR_FAIL_IF(!const_);
+    set_const(const_, compiler);
+
+    /* Load into dst when it is a register, into TMP_REG1 otherwise. */
+    if (SLOW_IS_REG(dst))
+        dst_r = dst;
+    else
+        dst_r = TMP_REG1;
+    PTR_FAIL_IF(emit_imm64_const(compiler, dst_r, init_value));
+
+    if (dst & SLJIT_MEM) {
+        PTR_FAIL_IF(emit_op_mem(compiler, WORD_SIZE | STORE, dst_r, dst, dstw));
+    }
+    return const_;
+}
+
+/* Rewrites the target of an already generated jump.
+   addr:     address of the constant load sequence belonging to the jump.
+   new_addr: new target address.
+   The patched instructions are flushed afterwards so that the processor
+   does not keep executing stale copies of them. */
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
+{
+    sljit_ins* inst = (sljit_ins*)addr;
+    modify_imm64_const(inst, new_addr);
+    /* NOTE(review): assumes the sequence rewritten by modify_imm64_const
+       is four instructions long - confirm against its definition. */
+    SLJIT_CACHE_FLUSH(inst, inst + 4);
+}
+
+/* Rewrites the value of an already generated constant load.
+   addr:         address of the constant load sequence.
+   new_constant: new value to be loaded.
+   The patched instructions are flushed afterwards so that the processor
+   does not keep executing stale copies of them. */
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
+{
+    sljit_ins* inst = (sljit_ins*)addr;
+    modify_imm64_const(inst, new_constant);
+    /* NOTE(review): assumes the sequence rewritten by modify_imm64_const
+       is four instructions long - confirm against its definition. */
+    SLJIT_CACHE_FLUSH(inst, inst + 4);
+}


Modified: code/trunk/sljit/sljitNativeARM_Thumb2.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_Thumb2.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeARM_Thumb2.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -207,7 +207,7 @@
         COPY_BITS(imm, 12 + 16, 16, 4) | COPY_BITS(imm, 11 + 16, 26, 1) | COPY_BITS(imm, 8 + 16, 12, 3) | ((imm & 0xff0000) >> 16));
 }


-static SLJIT_INLINE void modify_imm32_const(sljit_uh* inst, sljit_uw new_imm)
+static SLJIT_INLINE void modify_imm32_const(sljit_uh *inst, sljit_uw new_imm)
 {
     sljit_si dst = inst[1] & 0x0f00;
     SLJIT_ASSERT(((inst[0] & 0xfbf0) == (MOVW >> 16)) && ((inst[2] & 0xfbf0) == (MOVT >> 16)) && dst == (inst[3] & 0x0f00));
@@ -238,33 +238,33 @@
     if (jump->flags & IS_COND) {
         SLJIT_ASSERT(!(jump->flags & IS_BL));
         if (diff <= 127 && diff >= -128) {
-            jump->flags |= B_TYPE1;
+            jump->flags |= PATCH_TYPE1;
             return 5;
         }
         if (diff <= 524287 && diff >= -524288) {
-            jump->flags |= B_TYPE2;
+            jump->flags |= PATCH_TYPE2;
             return 4;
         }
         /* +1 comes from the prefix IT instruction. */
         diff--;
         if (diff <= 8388607 && diff >= -8388608) {
-            jump->flags |= B_TYPE3;
+            jump->flags |= PATCH_TYPE3;
             return 3;
         }
     }
     else if (jump->flags & IS_BL) {
         if (diff <= 8388607 && diff >= -8388608) {
-            jump->flags |= BL_TYPE6;
+            jump->flags |= PATCH_BL;
             return 3;
         }
     }
     else {
         if (diff <= 1023 && diff >= -1024) {
-            jump->flags |= B_TYPE4;
+            jump->flags |= PATCH_TYPE4;
             return 4;
         }
         if (diff <= 8388607 && diff >= -8388608) {
-            jump->flags |= B_TYPE5;
+            jump->flags |= PATCH_TYPE5;
             return 3;
         }
     }
@@ -272,15 +272,6 @@
     return 0;
 }


-static SLJIT_INLINE void inline_set_jump_addr(sljit_uw addr, sljit_uw new_addr, sljit_si flush)
-{
-    sljit_uh* inst = (sljit_uh*)addr;
-    modify_imm32_const(inst, new_addr);
-    if (flush) {
-        SLJIT_CACHE_FLUSH(inst, inst + 3);
-    }
-}
-
 static SLJIT_INLINE void set_jump_instruction(struct sljit_jump *jump)
 {
     sljit_si type = (jump->flags >> 4) & 0xf;
@@ -289,7 +280,7 @@
     sljit_si s, j1, j2;


     if (SLJIT_UNLIKELY(type == 0)) {
-        inline_set_jump_addr(jump->addr, (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target, 0);
+        modify_imm32_const((sljit_uh*)jump->addr, (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target);
         return;
     }


@@ -425,6 +416,10 @@
     return (void*)((sljit_uw)code | 0x1);
 }


+/* --------------------------------------------------------------------- */
+/*  Core code generator functions.                                       */
+/* --------------------------------------------------------------------- */
+
 #define INVALID_IMM    0x80000000
 static sljit_uw get_imm(sljit_uw imm)
 {
@@ -502,7 +497,6 @@
 #define ARG1_IMM    0x0010000
 #define ARG2_IMM    0x0020000
 #define KEEP_FLAGS    0x0040000
-#define SET_MULOV    0x0080000
 /* SET_FLAGS must be 0x100000 as it is also the value of S bit (can be used for optimization). */
 #define SET_FLAGS    0x0100000
 #define UNUSED_RETURN    0x0200000
@@ -516,7 +510,7 @@
        arg1 must be register, TMP_REG1, imm
        arg2 must be register, TMP_REG2, imm */
     sljit_si reg;
-    sljit_uw imm, negated_imm;
+    sljit_uw imm, nimm;


     if (SLJIT_UNLIKELY((flags & (ARG1_IMM | ARG2_IMM)) == (ARG1_IMM | ARG2_IMM))) {
         /* Both are immediates. */
@@ -530,6 +524,10 @@
         imm = (flags & ARG2_IMM) ? arg2 : arg1;


         switch (flags & 0xffff) {
+        case SLJIT_CLZ:
+        case SLJIT_MUL:
+            /* No form with immediate operand. */
+            break;
         case SLJIT_MOV:
             SLJIT_ASSERT(!(flags & SET_FLAGS) && (flags & ARG2_IMM) && arg1 == TMP_REG1);
             return load_immediate(compiler, dst, imm);
@@ -537,30 +535,27 @@
             if (!(flags & SET_FLAGS))
                 return load_immediate(compiler, dst, ~imm);
             /* Since the flags should be set, we just fallback to the register mode.
-               Although I could do some clever things here, "NOT IMM" does not worth the efforts. */
+               Although some clever things could be done here, "NOT IMM" is not worth the effort. */
             break;
-        case SLJIT_CLZ:
-            /* No form with immediate operand. */
-            break;
         case SLJIT_ADD:
-            negated_imm = (sljit_uw)-(sljit_sw)imm;
+            nimm = -imm;
             if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(reg, dst)) {
                 if (imm <= 0x7)
                     return push_inst16(compiler, ADDSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-                if (negated_imm <= 0x7)
-                    return push_inst16(compiler, SUBSI3 | IMM3(negated_imm) | RD3(dst) | RN3(reg));
+                if (nimm <= 0x7)
+                    return push_inst16(compiler, SUBSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
                 if (reg == dst) {
                     if (imm <= 0xff)
                         return push_inst16(compiler, ADDSI8 | IMM8(imm) | RDN3(dst));
-                    if (negated_imm <= 0xff)
-                        return push_inst16(compiler, SUBSI8 | IMM8(negated_imm) | RDN3(dst));
+                    if (nimm <= 0xff)
+                        return push_inst16(compiler, SUBSI8 | IMM8(nimm) | RDN3(dst));
                 }
             }
             if (!(flags & SET_FLAGS)) {
                 if (imm <= 0xfff)
                     return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(imm));
-                if (negated_imm <= 0xfff)
-                    return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(negated_imm));
+                if (nimm <= 0xfff)
+                    return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(nimm));
             }
             imm = get_imm(imm);
             if (imm != INVALID_IMM)
@@ -572,64 +567,60 @@
                 return push_inst32(compiler, ADCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
         case SLJIT_SUB:
-            if (flags & ARG2_IMM) {
-                negated_imm = (sljit_uw)-(sljit_sw)imm;
-                if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(reg, dst)) {
-                    if (imm <= 0x7)
-                        return push_inst16(compiler, SUBSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-                    if (negated_imm <= 0x7)
-                        return push_inst16(compiler, ADDSI3 | IMM3(negated_imm) | RD3(dst) | RN3(reg));
-                    if (reg == dst) {
-                        if (imm <= 0xff)
-                            return push_inst16(compiler, SUBSI8 | IMM8(imm) | RDN3(dst));
-                        if (negated_imm <= 0xff)
-                            return push_inst16(compiler, ADDSI8 | IMM8(negated_imm) | RDN3(dst));
-                    }
-                    if (imm <= 0xff && (flags & UNUSED_RETURN))
-                        return push_inst16(compiler, CMPI | IMM8(imm) | RDN3(reg));
-                }
-                if (!(flags & SET_FLAGS)) {
-                    if (imm <= 0xfff)
-                        return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm));
-                    if (negated_imm <= 0xfff)
-                        return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(negated_imm));
-                }
-                imm = get_imm(imm);
-                if (imm != INVALID_IMM)
-                    return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
-            }
-            else {
+            if (flags & ARG1_IMM) {
                 if (!(flags & KEEP_FLAGS) && imm == 0 && IS_2_LO_REGS(reg, dst))
                     return push_inst16(compiler, RSBSI | RD3(dst) | RN3(reg));
                 imm = get_imm(imm);
                 if (imm != INVALID_IMM)
                     return push_inst32(compiler, RSB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
+                break;
             }
+            nimm = -imm;
+            if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(reg, dst)) {
+                if (imm <= 0x7)
+                    return push_inst16(compiler, SUBSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
+                if (nimm <= 0x7)
+                    return push_inst16(compiler, ADDSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
+                if (reg == dst) {
+                    if (imm <= 0xff)
+                        return push_inst16(compiler, SUBSI8 | IMM8(imm) | RDN3(dst));
+                    if (nimm <= 0xff)
+                        return push_inst16(compiler, ADDSI8 | IMM8(nimm) | RDN3(dst));
+                }
+                if (imm <= 0xff && (flags & UNUSED_RETURN))
+                    return push_inst16(compiler, CMPI | IMM8(imm) | RDN3(reg));
+            }
+            if (!(flags & SET_FLAGS)) {
+                if (imm <= 0xfff)
+                    return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm));
+                if (nimm <= 0xfff)
+                    return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(nimm));
+            }
+            imm = get_imm(imm);
+            if (imm != INVALID_IMM)
+                return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
         case SLJIT_SUBC:
-            if (flags & ARG2_IMM) {
-                imm = get_imm(imm);
-                if (imm != INVALID_IMM)
-                    return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
-            }
+            if (flags & ARG1_IMM)
+                break;
+            imm = get_imm(imm);
+            if (imm != INVALID_IMM)
+                return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
-        case SLJIT_MUL:
-            /* No form with immediate operand. */
-            break;
         case SLJIT_AND:
+            nimm = get_imm(imm);
+            if (nimm != INVALID_IMM)
+                return push_inst32(compiler, ANDI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
             imm = get_imm(imm);
             if (imm != INVALID_IMM)
-                return push_inst32(compiler, ANDI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
-            imm = get_imm(~((flags & ARG2_IMM) ? arg2 : arg1));
-            if (imm != INVALID_IMM)
                 return push_inst32(compiler, BICI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
         case SLJIT_OR:
+            nimm = get_imm(imm);
+            if (nimm != INVALID_IMM)
+                return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
             imm = get_imm(imm);
             if (imm != INVALID_IMM)
-                return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
-            imm = get_imm(~((flags & ARG2_IMM) ? arg2 : arg1));
-            if (imm != INVALID_IMM)
                 return push_inst32(compiler, ORNI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
         case SLJIT_XOR:
@@ -638,50 +629,32 @@
                 return push_inst32(compiler, EORI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
             break;
         case SLJIT_SHL:
-            if (flags & ARG2_IMM) {
-                imm &= 0x1f;
-                if (imm == 0) {
-                    if (!(flags & SET_FLAGS))
-                        return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
-                    if (IS_2_LO_REGS(dst, reg))
-                        return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
-                    return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
-                }
+        case SLJIT_LSHR:
+        case SLJIT_ASHR:
+            if (flags & ARG1_IMM)
+                break;
+            imm &= 0x1f;
+            if (imm == 0) {
+                if (!(flags & SET_FLAGS))
+                    return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
+                if (IS_2_LO_REGS(dst, reg))
+                    return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
+                return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
+            }
+            switch (flags & 0xffff) {
+            case SLJIT_SHL:
                 if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(dst, reg))
                     return push_inst16(compiler, LSLSI | RD3(dst) | RN3(reg) | (imm << 6));
                 return push_inst32(compiler, LSL_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
-            }
-            break;
-        case SLJIT_LSHR:
-            if (flags & ARG2_IMM) {
-                imm &= 0x1f;
-                if (imm == 0) {
-                    if (!(flags & SET_FLAGS))
-                        return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
-                    if (IS_2_LO_REGS(dst, reg))
-                        return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
-                    return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
-                }
+            case SLJIT_LSHR:
                 if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(dst, reg))
                     return push_inst16(compiler, LSRSI | RD3(dst) | RN3(reg) | (imm << 6));
                 return push_inst32(compiler, LSR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
-            }
-            break;
-        case SLJIT_ASHR:
-            if (flags & ARG2_IMM) {
-                imm &= 0x1f;
-                if (imm == 0) {
-                    if (!(flags & SET_FLAGS))
-                        return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
-                    if (IS_2_LO_REGS(dst, reg))
-                        return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
-                    return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
-                }
+            default: /* SLJIT_ASHR */
                 if (!(flags & KEEP_FLAGS) && IS_2_LO_REGS(dst, reg))
                     return push_inst16(compiler, ASRSI | RD3(dst) | RN3(reg) | (imm << 6));
                 return push_inst32(compiler, ASR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
             }
-            break;
         default:
             SLJIT_ASSERT_STOP();
             break;
@@ -708,6 +681,8 @@
     case SLJIT_MOVU_SI:
     case SLJIT_MOVU_P:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
+        if (dst == arg2)
+            return SLJIT_SUCCESS;
         return push_inst16(compiler, MOV | SET_REGS44(dst, arg2));
     case SLJIT_MOV_UB:
     case SLJIT_MOVU_UB:
@@ -834,7 +809,7 @@
   s = store
 */


-static SLJIT_CONST sljit_uw sljit_mem16[12] = {
+static SLJIT_CONST sljit_ins sljit_mem16[12] = {
/* w u l */ 0x5800 /* ldr */,
/* w u s */ 0x5000 /* str */,
/* w s l */ 0x5800 /* ldr */,
@@ -851,7 +826,7 @@
/* h s s */ 0x5200 /* strh */,
};

-static SLJIT_CONST sljit_uw sljit_mem16_imm5[12] = {
+static SLJIT_CONST sljit_ins sljit_mem16_imm5[12] = {
/* w u l */ 0x6800 /* ldr imm5 */,
/* w u s */ 0x6000 /* str imm5 */,
/* w s l */ 0x6800 /* ldr imm5 */,
@@ -870,7 +845,7 @@

 #define MEM_IMM8    0xc00
 #define MEM_IMM12    0x800000
-static SLJIT_CONST sljit_uw sljit_mem32[12] = {
+static SLJIT_CONST sljit_ins sljit_mem32[12] = {
 /* w u l */ 0xf8500000 /* ldr.w */,
 /* w u s */ 0xf8400000 /* str.w */,
 /* w s l */ 0xf8500000 /* ldr.w */,
@@ -911,69 +886,71 @@
 /* Can perform an operation using at most 1 instruction. */
 static sljit_si getput_arg_fast(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg, sljit_sw argw)
 {
-    sljit_si tmp;
+    sljit_si other_r, shift;


     SLJIT_ASSERT(arg & SLJIT_MEM);


     if (SLJIT_UNLIKELY(flags & UPDATE)) {
-        if ((arg & 0xf) && !(arg & 0xf0) && argw <= 0xff && argw >= -0xff) {
-            flags &= ~UPDATE;
-            arg &= 0xf;
+        if ((arg & REG_MASK) && !(arg & OFFS_REG_MASK) && argw <= 0xff && argw >= -0xff) {
             if (SLJIT_UNLIKELY(flags & ARG_TEST))
                 return 1;


+            flags &= ~UPDATE;
+            arg &= 0xf;
             if (argw >= 0)
                 argw |= 0x200;
             else {
                 argw = -argw;
             }
+
             SLJIT_ASSERT(argw >= 0 && (argw & 0xff) <= 0xff);
             FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | MEM_IMM8 | RT4(reg) | RN4(arg) | 0x100 | argw));
             return -1;
         }
-        return (flags & ARG_TEST) ? SLJIT_SUCCESS : 0;
+        return 0;
     }


-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
-        argw &= 0x3;
-        tmp = (arg >> 4) & 0xf;
-        arg &= 0xf;
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         if (SLJIT_UNLIKELY(flags & ARG_TEST))
             return 1;


-        if (!argw && IS_3_LO_REGS(reg, arg, tmp))
-            FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(tmp)));
+        argw &= 0x3;
+        other_r = OFFS_REG(arg);
+        arg &= 0xf;
+
+        if (!argw && IS_3_LO_REGS(reg, arg, other_r))
+            FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(other_r)));
         else
-            FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(tmp) | (argw << 4)));
+            FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(other_r) | (argw << 4)));
         return -1;
     }


-    if (!(arg & 0xf) || argw > 0xfff || argw < -0xff)
-        return (flags & ARG_TEST) ? SLJIT_SUCCESS : 0;
+    if (!(arg & REG_MASK) || argw > 0xfff || argw < -0xff)
+        return 0;


     if (SLJIT_UNLIKELY(flags & ARG_TEST))
         return 1;


     arg &= 0xf;
     if (IS_2_LO_REGS(reg, arg) && sljit_mem16_imm5[flags]) {
-        tmp = 3;
+        shift = 3;
         if (IS_WORD_SIZE(flags)) {
             if (OFFSET_CHECK(0x1f, 2))
-                tmp = 2;
+                shift = 2;
         }
         else if (flags & BYTE_SIZE)
         {
             if (OFFSET_CHECK(0x1f, 0))
-                tmp = 0;
+                shift = 0;
         }
         else {
             SLJIT_ASSERT(flags & HALF_SIZE);
             if (OFFSET_CHECK(0x1f, 1))
-                tmp = 1;
+                shift = 1;
         }


-        if (tmp != 3) {
-            FAIL_IF(push_inst16(compiler, sljit_mem16_imm5[flags] | RD3(reg) | RN3(arg) | (argw << (6 - tmp))));
+        if (shift != 3) {
+            FAIL_IF(push_inst16(compiler, sljit_mem16_imm5[flags] | RD3(reg) | RN3(arg) | (argw << (6 - shift))));
             return -1;
         }
     }
@@ -996,12 +973,13 @@
    operators always uses word arguments without write back. */
 static sljit_si can_cache(sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
 {
-    /* Simple operation except for updates. */
-    if ((arg & 0xf0) || !(next_arg & SLJIT_MEM))
+    sljit_sw diff;
+    if ((arg & OFFS_REG_MASK) || !(next_arg & SLJIT_MEM))
         return 0;


-    if (!(arg & 0xf)) {
-        if ((sljit_uw)(argw - next_argw) <= 0xfff || (sljit_uw)(next_argw - argw) <= 0xfff)
+    if (!(arg & REG_MASK)) {
+        diff = argw - next_argw;
+        if (diff <= 0xfff && diff >= -0xfff)
             return 1;
         return 0;
     }
@@ -1009,17 +987,19 @@
     if (argw == next_argw)
         return 1;


-    if (arg == next_arg && ((sljit_uw)(argw - next_argw) <= 0xfff || (sljit_uw)(next_argw - argw) <= 0xfff))
+    diff = argw - next_argw;
+    if (arg == next_arg && diff <= 0xfff && diff >= -0xfff)
         return 1;


     return 0;
 }


 /* Emit the necessary instructions. See can_cache above. */
-static sljit_si getput_arg(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
+static sljit_si getput_arg(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg,
+    sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
 {
-    sljit_si tmp_r;
-    sljit_sw tmp;
+    sljit_si tmp_r, other_r;
+    sljit_sw diff;


     SLJIT_ASSERT(arg & SLJIT_MEM);
     if (!(next_arg & SLJIT_MEM)) {
@@ -1029,69 +1009,76 @@


     tmp_r = (flags & STORE) ? TMP_REG3 : reg;


-    if (SLJIT_UNLIKELY(flags & UPDATE)) {
-        flags &= ~UPDATE;
+    if (SLJIT_UNLIKELY((flags & UPDATE) && (arg & REG_MASK))) {
         /* Update only applies if a base register exists. */
-        if (arg & 0xf) {
-            /* There is no caching here. */
-            tmp = (arg & 0xf0) >> 4;
-            arg &= 0xf;
+        /* There is no caching here. */
+        other_r = OFFS_REG(arg);
+        arg &= 0xf;
+        flags &= ~UPDATE;


-            if (!tmp) {
-                if (!(argw & ~0xfff)) {
-                    FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(arg) | argw));
-                    return push_inst32(compiler, ADDWI | RD4(arg) | RN4(arg) | IMM12(argw));
-                }
+        if (!other_r) {
+            if (!(argw & ~0xfff)) {
+                FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(arg) | argw));
+                return push_inst32(compiler, ADDWI | RD4(arg) | RN4(arg) | IMM12(argw));
+            }


-                if (compiler->cache_arg == SLJIT_MEM) {
-                    if (argw == compiler->cache_argw) {
-                        tmp = TMP_REG3;
-                        argw = 0;
-                    }
-                    else if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
-                        FAIL_IF(compiler->error);
-                        compiler->cache_argw = argw;
-                        tmp = TMP_REG3;
-                        argw = 0;
-                    }
+            if (compiler->cache_arg == SLJIT_MEM) {
+                if (argw == compiler->cache_argw) {
+                    other_r = TMP_REG3;
+                    argw = 0;
                 }
-
-                if (argw) {
-                    FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-                    compiler->cache_arg = SLJIT_MEM;
+                else if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
+                    FAIL_IF(compiler->error);
                     compiler->cache_argw = argw;
-                    tmp = TMP_REG3;
+                    other_r = TMP_REG3;
                     argw = 0;
                 }
             }


-            argw &= 0x3;
-            if (!argw && IS_3_LO_REGS(reg, arg, tmp)) {
-                FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(tmp)));
-                return push_inst16(compiler, ADD | SET_REGS44(arg, tmp));
+            if (argw) {
+                FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+                compiler->cache_arg = SLJIT_MEM;
+                compiler->cache_argw = argw;
+                other_r = TMP_REG3;
+                argw = 0;
             }
-            FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(tmp) | (argw << 4)));
-            return push_inst32(compiler, ADD_W | RD4(arg) | RN4(arg) | RM4(tmp) | (argw << 6));
         }
+
+        argw &= 0x3;
+        if (!argw && IS_3_LO_REGS(reg, arg, other_r)) {
+            FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(other_r)));
+            return push_inst16(compiler, ADD | SET_REGS44(arg, other_r));
+        }
+        FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(other_r) | (argw << 4)));
+        return push_inst32(compiler, ADD_W | RD4(arg) | RN4(arg) | RM4(other_r) | (argw << 6));
     }
+    flags &= ~UPDATE;


-    SLJIT_ASSERT(!(arg & 0xf0));
+    SLJIT_ASSERT(!(arg & OFFS_REG_MASK));


     if (compiler->cache_arg == arg) {
-        if (!((argw - compiler->cache_argw) & ~0xfff))
-            return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(TMP_REG3) | (argw - compiler->cache_argw));
+        diff = argw - compiler->cache_argw;
+        if (!(diff & ~0xfff))
+            return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(TMP_REG3) | diff);
         if (!((compiler->cache_argw - argw) & ~0xff))
             return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM8 | RT4(reg) | RN4(TMP_REG3) | (compiler->cache_argw - argw));
-        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
+        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, diff) != SLJIT_ERR_UNSUPPORTED) {
             FAIL_IF(compiler->error);
             return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(TMP_REG3) | 0);
         }
     }


-    next_arg = (arg & 0xf) && (arg == next_arg);
+    next_arg = (arg & REG_MASK) && (arg == next_arg) && (argw != next_argw);
     arg &= 0xf;
-    if (arg && compiler->cache_arg == SLJIT_MEM && compiler->cache_argw == argw)
-        return push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(TMP_REG3));
+    if (arg && compiler->cache_arg == SLJIT_MEM) {
+        if (compiler->cache_argw == argw)
+            return push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(TMP_REG3));
+        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
+            FAIL_IF(compiler->error);
+            compiler->cache_argw = argw;
+            return push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(TMP_REG3));
+        }
+    }


     compiler->cache_argw = argw;
     if (next_arg && emit_set_delta(compiler, TMP_REG3, arg, argw) != SLJIT_ERR_UNSUPPORTED) {
@@ -1103,7 +1090,8 @@
         FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
         compiler->cache_arg = SLJIT_MEM;


-        if (next_arg) {
+        diff = argw - next_argw;
+        if (next_arg && diff <= 0xfff && diff >= -0xfff) {
             FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG3, arg)));
             compiler->cache_arg = SLJIT_MEM | arg;
             arg = 0;
@@ -1270,11 +1258,9 @@
     op = GET_OPCODE(op);
     switch (op) {
     case SLJIT_BREAKPOINT:
-        push_inst16(compiler, BKPT);
-        break;
+        return push_inst16(compiler, BKPT);
     case SLJIT_NOP:
-        push_inst16(compiler, NOP);
-        break;
+        return push_inst16(compiler, NOP);
     case SLJIT_UMUL:
     case SLJIT_SMUL:
         return push_inst32(compiler, (op == SLJIT_UMUL ? UMULL : SMULL)
@@ -1321,7 +1307,7 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_r = (dst >= SLJIT_SCRATCH_REG1 && dst <= TMP_REG3) ? dst : TMP_REG1;
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     op = GET_OPCODE(op);
     if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
@@ -1454,7 +1440,7 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_r = (dst >= SLJIT_SCRATCH_REG1 && dst <= TMP_REG3) ? dst : TMP_REG1;
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
     flags = (GET_FLAGS(op) ? SET_FLAGS : 0) | ((op & SLJIT_KEEP_FLAGS) ? KEEP_FLAGS : 0);


     if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, WORD_SIZE | STORE | ARG_TEST, TMP_REG1, dst, dstw))
@@ -1505,9 +1491,6 @@
     if (dst == SLJIT_UNUSED)
         flags |= UNUSED_RETURN;


-    if (GET_OPCODE(op) == SLJIT_MUL && (op & SLJIT_SET_O))
-        flags |= SET_MULOV;
-
     emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src1w, src2w);


     if (dst & SLJIT_MEM) {
@@ -1564,20 +1547,21 @@
     SLJIT_ASSERT(arg & SLJIT_MEM);


     /* Fast loads and stores. */
-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
-        FAIL_IF(push_inst32(compiler, ADD_W | RD4(TMP_REG2) | RN4(arg & 0xf) | RM4((arg & 0xf0) >> 4) | ((argw & 0x3) << 6)));
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+        FAIL_IF(push_inst32(compiler, ADD_W | RD4(TMP_REG2) | RN4(arg & REG_MASK) | RM4(OFFS_REG(arg)) | ((argw & 0x3) << 6)));
         arg = SLJIT_MEM | TMP_REG2;
         argw = 0;
     }


-    if ((arg & 0xf) && (argw & 0x3) == 0) {
+    if ((arg & REG_MASK) && (argw & 0x3) == 0) {
         if (!(argw & ~0x3fc))
-            return push_inst32(compiler, inst | 0x800000 | RN4(arg & 0xf) | DD4(reg) | (argw >> 2));
+            return push_inst32(compiler, inst | 0x800000 | RN4(arg & REG_MASK) | DD4(reg) | (argw >> 2));
         if (!(-argw & ~0x3fc))
-            return push_inst32(compiler, inst | RN4(arg & 0xf) | DD4(reg) | (-argw >> 2));
+            return push_inst32(compiler, inst | RN4(arg & REG_MASK) | DD4(reg) | (-argw >> 2));
     }


-    SLJIT_ASSERT(!(arg & 0xf0));
+    /* Slow cases */
+    SLJIT_ASSERT(!(arg & OFFS_REG_MASK));
     if (compiler->cache_arg == arg) {
         tmp = argw - compiler->cache_argw;
         if (!(tmp & ~0x3fc))
@@ -1591,20 +1575,20 @@
         }
     }


-    if (arg & 0xf) {
-        if (emit_set_delta(compiler, TMP_REG1, arg & 0xf, argw) != SLJIT_ERR_UNSUPPORTED) {
+    if (arg & REG_MASK) {
+        if (emit_set_delta(compiler, TMP_REG1, arg & REG_MASK, argw) != SLJIT_ERR_UNSUPPORTED) {
             FAIL_IF(compiler->error);
             return push_inst32(compiler, inst | 0x800000 | RN4(TMP_REG1) | DD4(reg));
         }
         imm = get_imm(argw & ~0x3fc);
         if (imm != INVALID_IMM) {
-            FAIL_IF(push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(arg & 0xf) | imm));
+            FAIL_IF(push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(arg & REG_MASK) | imm));
             return push_inst32(compiler, inst | 0x800000 | RN4(TMP_REG1) | DD4(reg) | ((argw & 0x3fc) >> 2));
         }
         imm = get_imm(-argw & ~0x3fc);
         if (imm != INVALID_IMM) {
             argw = -argw;
-            FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(arg & 0xf) | imm));
+            FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(arg & REG_MASK) | imm));
             return push_inst32(compiler, inst | RN4(TMP_REG1) | DD4(reg) | ((argw & 0x3fc) >> 2));
         }
     }
@@ -1612,13 +1596,9 @@
     compiler->cache_arg = arg;
     compiler->cache_argw = argw;


-    if (SLJIT_UNLIKELY(!(arg & 0xf)))
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-    else {
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-        if (arg & 0xf)
-            FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG3, (arg & 0xf))));
-    }
+    FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+    if (arg & REG_MASK)
+        FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG3, (arg & REG_MASK))));
     return push_inst32(compiler, inst | 0x800000 | RN4(TMP_REG3) | DD4(reg));
 }


@@ -1649,7 +1629,7 @@
         return push_inst32(compiler, VMRS);
     }


-    dst_r = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : dst;
+    dst_r = (dst <= REG_MASK) ? dst : TMP_FREG1;
     if (src & SLJIT_MEM) {
         emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, dst_r, src, srcw);
         src = dst_r;
@@ -1668,9 +1648,9 @@
         break;
     }


-    if (dst & SLJIT_MEM)
-        return emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP), TMP_FREG1, dst, dstw);
-    return SLJIT_SUCCESS;
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;
+    return emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP), TMP_FREG1, dst, dstw);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
@@ -1687,7 +1667,7 @@
     compiler->cache_argw = 0;
     op ^= SLJIT_SINGLE_OP;


-    dst_r = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : dst;
+    dst_r = (dst <= REG_MASK) ? dst : TMP_FREG1;
     if (src1 & SLJIT_MEM) {
         emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, TMP_FREG1, src1, src1w);
         src1 = TMP_FREG1;
@@ -1712,9 +1692,9 @@
         break;
     }


-    if (dst & SLJIT_MEM)
-        return emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP), TMP_FREG1, dst, dstw);
-    return SLJIT_SUCCESS;
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;
+    return emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP), TMP_FREG1, dst, dstw);
 }


 #undef FPU_LOAD
@@ -1733,7 +1713,7 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= TMP_REG3)
+    if (dst <= REG_MASK)
         return push_inst16(compiler, MOV | SET_REGS44(dst, TMP_REG3));


     /* Memory. */
@@ -1752,7 +1732,7 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
+    if (src <= REG_MASK)
         FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG3, src)));
     else if (src & SLJIT_MEM) {
         if (getput_arg_fast(compiler, WORD_SIZE, TMP_REG3, src, srcw))
@@ -1846,7 +1826,7 @@
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
 {
     struct sljit_jump *jump;
-    sljit_si cc;
+    sljit_ins cc;


     CHECK_ERROR_PTR();
     check_sljit_emit_jump(compiler, type);
@@ -1885,25 +1865,23 @@
     ADJUST_LOCAL_OFFSET(src, srcw);


     /* In ARM, we don't need to touch the arguments. */
-    if (src & SLJIT_IMM) {
-        jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
-        FAIL_IF(!jump);
-        set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
-        jump->u.target = srcw;
-
-        FAIL_IF(emit_imm32_const(compiler, TMP_REG1, 0));
-        jump->addr = compiler->size;
-        FAIL_IF(push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(TMP_REG1)));
-    }
-    else {
-        if (src <= TMP_REG3)
+    if (!(src & SLJIT_IMM)) {
+        if (FAST_IS_REG(src))
             return push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(src));


         FAIL_IF(emit_op_mem(compiler, WORD_SIZE, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, src, srcw));
         if (type >= SLJIT_FAST_CALL)
             return push_inst16(compiler, BLX | RN3(TMP_REG1));
     }
-    return SLJIT_SUCCESS;
+
+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    FAIL_IF(!jump);
+    set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
+    jump->u.target = srcw;
+
+    FAIL_IF(emit_imm32_const(compiler, TMP_REG1, 0));
+    jump->addr = compiler->size;
+    return push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(TMP_REG1));
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
@@ -1912,8 +1890,7 @@
     sljit_si type)
 {
     sljit_si dst_r, flags = GET_ALL_FLAGS(op);
-    sljit_ins ins;
-    sljit_uw cc;
+    sljit_ins cc, ins;


     CHECK_ERROR();
     check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
@@ -1925,7 +1902,7 @@


     op = GET_OPCODE(op);
     cc = get_cc(type);
-    dst_r = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;


     if (op < SLJIT_ADD) {
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | (((cc & 0x1) ^ 0x1) << 3) | 0x4));
@@ -1936,11 +1913,13 @@
             FAIL_IF(push_inst16(compiler, MOVSI | RDN3(dst_r) | 1));
             FAIL_IF(push_inst16(compiler, MOVSI | RDN3(dst_r) | 0));
         }
-        return dst_r == TMP_REG2 ? emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, dst, dstw) : SLJIT_SUCCESS;
+        if (dst_r != TMP_REG2)
+            return SLJIT_SUCCESS;
+        return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, dst, dstw);
     }


     ins = (op == SLJIT_AND ? ANDI : (op == SLJIT_OR ? ORRI : EORI));
-    if ((op == SLJIT_OR || op == SLJIT_XOR) && dst <= TMP_REG3 && dst == src) {
+    if ((op == SLJIT_OR || op == SLJIT_XOR) && FAST_IS_REG(dst) && dst == src) {
         /* Does not change the other bits. */
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
         FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst) | 1));
@@ -1956,18 +1935,25 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
     if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, WORD_SIZE, TMP_REG1, src, srcw, dst, dstw));
-        src = TMP_REG1;
+        FAIL_IF(emit_op_mem2(compiler, WORD_SIZE, TMP_REG2, src, srcw, dst, dstw));
+        src = TMP_REG2;
         srcw = 0;
     } else if (src & SLJIT_IMM) {
-        FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
-        src = TMP_REG1;
+        FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));
+        src = TMP_REG2;
         srcw = 0;
     }


-    FAIL_IF(push_inst16(compiler, IT | (cc << 4) | (((cc & 0x1) ^ 0x1) << 3) | 0x4));
-    FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 1));
-    FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 0));
+    if (op == SLJIT_AND || src != dst_r) {
+        FAIL_IF(push_inst16(compiler, IT | (cc << 4) | (((cc & 0x1) ^ 0x1) << 3) | 0x4));
+        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 1));
+        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 0));
+    }
+    else {
+        FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
+        FAIL_IF(push_inst32(compiler, ins | RN4(src) | RD4(dst_r) | 1));
+    }
+
     if (dst_r == TMP_REG2)
         FAIL_IF(emit_op_mem2(compiler, WORD_SIZE | STORE, TMP_REG2, dst, dstw, 0, 0));


@@ -1993,7 +1979,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    dst_r = (dst <= TMP_REG3) ? dst : TMP_REG1;
+    dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
     PTR_FAIL_IF(emit_imm32_const(compiler, dst_r, init_value));


     if (dst & SLJIT_MEM)
@@ -2003,12 +1989,14 @@


 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
 {
-    inline_set_jump_addr(addr, new_addr, 1);
+    sljit_uh *inst = (sljit_uh*)addr;
+    modify_imm32_const(inst, new_addr);
+    SLJIT_CACHE_FLUSH(inst, inst + 3);
 }


 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
 {
-    sljit_uh* inst = (sljit_uh*)addr;
+    sljit_uh *inst = (sljit_uh*)addr;
     modify_imm32_const(inst, new_constant);
     SLJIT_CACHE_FLUSH(inst, inst + 3);
 }


Modified: code/trunk/sljit/sljitNativeARM_v5.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_v5.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeARM_v5.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -1379,54 +1379,57 @@
             EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(MVN_DP, 0, reg, SLJIT_UNUSED, imm));
             return -1;
         }
-        return (inp_flags & ARG_TEST) ? SLJIT_SUCCESS : 0;
+        return 0;
     }


     SLJIT_ASSERT(arg & SLJIT_MEM);


     /* Fast loads/stores. */
-    if (arg & 0xf) {
-        if (!(arg & 0xf0)) {
-            if (IS_TYPE1_TRANSFER(inp_flags)) {
-                if (argw >= 0 && argw <= 0xfff) {
-                    if (inp_flags & ARG_TEST)
-                        return 1;
-                    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf, argw));
-                    return -1;
-                }
-                if (argw < 0 && argw >= -0xfff) {
-                    if (inp_flags & ARG_TEST)
-                        return 1;
-                    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & 0xf, -argw));
-                    return -1;
-                }
-            }
-            else {
-                if (argw >= 0 && argw <= 0xff) {
-                    if (inp_flags & ARG_TEST)
-                        return 1;
-                    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf, TYPE2_TRANSFER_IMM(argw)));
-                    return -1;
-                }
-                if (argw < 0 && argw >= -0xff) {
-                    if (inp_flags & ARG_TEST)
-                        return 1;
-                    argw = -argw;
-                    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & 0xf, TYPE2_TRANSFER_IMM(argw)));
-                    return -1;
-                }
-            }
+    if (!(arg & REG_MASK))
+        return 0;
+
+    if (arg & OFFS_REG_MASK) {
+        if ((argw & 0x3) != 0 && !IS_TYPE1_TRANSFER(inp_flags))
+            return 0;
+
+        if (inp_flags & ARG_TEST)
+            return 1;
+        EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK,
+            RM(OFFS_REG(arg)) | (IS_TYPE1_TRANSFER(inp_flags) ? SRC2_IMM : 0) | ((argw & 0x3) << 7)));
+        return -1;
+    }
+
+    if (IS_TYPE1_TRANSFER(inp_flags)) {
+        if (argw >= 0 && argw <= 0xfff) {
+            if (inp_flags & ARG_TEST)
+                return 1;
+            EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, argw));
+            return -1;
         }
-        else if ((argw & 0x3) == 0 || IS_TYPE1_TRANSFER(inp_flags)) {
+        if (argw < 0 && argw >= -0xfff) {
             if (inp_flags & ARG_TEST)
                 return 1;
-            EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf,
-                RM((arg >> 4) & 0xf) | (IS_TYPE1_TRANSFER(inp_flags) ? SRC2_IMM : 0) | ((argw & 0x3) << 7)));
+            EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & REG_MASK, -argw));
             return -1;
         }
     }
+    else {
+        if (argw >= 0 && argw <= 0xff) {
+            if (inp_flags & ARG_TEST)
+                return 1;
+            EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, TYPE2_TRANSFER_IMM(argw)));
+            return -1;
+        }
+        if (argw < 0 && argw >= -0xff) {
+            if (inp_flags & ARG_TEST)
+                return 1;
+            argw = -argw;
+            EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & REG_MASK, TYPE2_TRANSFER_IMM(argw)));
+            return -1;
+        }
+    }


-    return (inp_flags & ARG_TEST) ? SLJIT_SUCCESS : 0;
+    return 0;
 }


 /* See getput_arg below.
@@ -1439,10 +1442,10 @@
         return 0;


     /* Always a simple operation. */
-    if (arg & 0xf0)
+    if (arg & OFFS_REG_MASK)
         return 0;


-    if (!(arg & 0xf)) {
+    if (!(arg & REG_MASK)) {
         /* Immediate access. */
         if ((next_arg & SLJIT_MEM) && ((sljit_uw)argw - (sljit_uw)next_argw <= 0xfff || (sljit_uw)next_argw - (sljit_uw)argw <= 0xfff))
             return 1;
@@ -1469,7 +1472,7 @@


 #define TEST_WRITE_BACK() \
     if (inp_flags & WRITE_BACK) { \
-        tmp_r = arg & 0xf; \
+        tmp_r = arg & REG_MASK; \
         if (reg == tmp_r) { \
             /* This can only happen for stores */ \
             /* since ldr reg, [reg, ...]! has no meaning */ \
@@ -1497,7 +1500,7 @@
     tmp_r = (inp_flags & LOAD_DATA) ? reg : TMP_REG3;
     max_delta = IS_TYPE1_TRANSFER(inp_flags) ? 0xfff : 0xff;


-    if ((arg & 0xf) == SLJIT_UNUSED) {
+    if ((arg & REG_MASK) == SLJIT_UNUSED) {
         /* Write back is not used. */
         imm = (sljit_uw)(argw - compiler->cache_argw);
         if ((compiler->cache_arg & SLJIT_IMM) && (imm <= (sljit_uw)max_delta || imm >= (sljit_uw)-max_delta)) {
@@ -1530,11 +1533,11 @@
         return SLJIT_SUCCESS;
     }


-    if (arg & 0xf0) {
+    if (arg & OFFS_REG_MASK) {
         SLJIT_ASSERT((argw & 0x3) && !(max_delta & 0xf00));
         if (inp_flags & WRITE_BACK)
-            tmp_r = arg & 0xf;
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_r, arg & 0xf, RM((arg >> 4) & 0xf) | ((argw & 0x3) << 7)));
+            tmp_r = arg & REG_MASK;
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_r, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7)));
         EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, 0, reg, tmp_r, TYPE2_TRANSFER_IMM(0)));
         return SLJIT_SUCCESS;
     }
@@ -1555,7 +1558,7 @@
     imm = get_imm(argw & ~max_delta);
     if (imm) {
         TEST_WRITE_BACK();
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_r, arg & 0xf, imm));
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_r, arg & REG_MASK, imm));
         GETPUT_ARG_DATA_TRANSFER(1, inp_flags & WRITE_BACK, reg, tmp_r, argw & max_delta);
         return SLJIT_SUCCESS;
     }
@@ -1564,14 +1567,14 @@
     if (imm) {
         argw = -argw;
         TEST_WRITE_BACK();
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(SUB_DP, 0, tmp_r, arg & 0xf, imm));
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(SUB_DP, 0, tmp_r, arg & REG_MASK, imm));
         GETPUT_ARG_DATA_TRANSFER(0, inp_flags & WRITE_BACK, reg, tmp_r, argw & max_delta);
         return SLJIT_SUCCESS;
     }


     if ((compiler->cache_arg & SLJIT_IMM) && compiler->cache_argw == argw) {
         TEST_WRITE_BACK();
-        EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf, RM(TMP_REG3) | (max_delta & 0xf00 ? SRC2_IMM : 0)));
+        EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, RM(TMP_REG3) | (max_delta & 0xf00 ? SRC2_IMM : 0)));
         return SLJIT_SUCCESS;
     }


@@ -1583,7 +1586,7 @@
         compiler->cache_argw = argw;


         TEST_WRITE_BACK();
-        EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf, RM(TMP_REG3) | (max_delta & 0xf00 ? SRC2_IMM : 0)));
+        EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, RM(TMP_REG3) | (max_delta & 0xf00 ? SRC2_IMM : 0)));
         return SLJIT_SUCCESS;
     }


@@ -1591,7 +1594,7 @@
     if (arg == next_arg && !(inp_flags & WRITE_BACK) && (imm <= (sljit_uw)max_delta || imm >= (sljit_uw)-max_delta)) {
         SLJIT_ASSERT(inp_flags & LOAD_DATA);
         FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG3, TMP_REG3, reg_map[arg & 0xf]));
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG3, TMP_REG3, reg_map[arg & REG_MASK]));


         compiler->cache_arg = arg;
         compiler->cache_argw = argw;
@@ -1600,14 +1603,14 @@
         return SLJIT_SUCCESS;
     }


-    if ((arg & 0xf) == tmp_r) {
+    if ((arg & REG_MASK) == tmp_r) {
         compiler->cache_arg = SLJIT_IMM;
         compiler->cache_argw = argw;
         tmp_r = TMP_REG3;
     }


     FAIL_IF(load_immediate(compiler, tmp_r, argw));
-    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & 0xf, reg_map[tmp_r] | (max_delta & 0xf00 ? SRC2_IMM : 0)));
+    EMIT_INSTRUCTION(EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, reg_map[tmp_r] | (max_delta & 0xf00 ? SRC2_IMM : 0)));
     return SLJIT_SUCCESS;
 }


@@ -1653,7 +1656,7 @@
             return SLJIT_SUCCESS;
         dst_r = TMP_REG2;
     }
-    else if (dst <= TMP_REG3) {
+    else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -1672,9 +1675,9 @@
     }


     /* Source 1. */
-    if (src1 <= TMP_REG3)
+    if (FAST_IS_REG(src1))
         src1_r = src1;
-    else if (src2 <= TMP_REG3) {
+    else if (FAST_IS_REG(src2)) {
         flags |= ARGS_SWAPPED;
         src1_r = src2;
         src2 = src1;
@@ -1720,7 +1723,7 @@


     /* Source 2. */
     if (src2_r == 0) {
-        if (src2 <= TMP_REG3) {
+        if (FAST_IS_REG(src2)) {
             src2_r = src2;
             flags |= REG_SOURCE;
             if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -2056,18 +2059,18 @@
     sljit_sw inst = VSTR_F32 | (flags & (SLJIT_SINGLE_OP | FPU_LOAD));
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG1, arg & 0xf, RM((arg >> 4) & 0xf) | ((argw & 0x3) << 7)));
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG1, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7)));
         arg = SLJIT_MEM | TMP_REG1;
         argw = 0;
     }


     /* Fast loads and stores. */
-    if ((arg & 0xf)) {
+    if ((arg & REG_MASK)) {
         if (!(argw & ~0x3fc))
-            return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, arg & 0xf, reg, argw >> 2));
+            return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, arg & REG_MASK, reg, argw >> 2));
         if (!(-argw & ~0x3fc))
-            return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 0, arg & 0xf, reg, (-argw) >> 2));
+            return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 0, arg & REG_MASK, reg, (-argw) >> 2));
     }


     if (compiler->cache_arg == arg) {
@@ -2083,29 +2086,29 @@
         }
     }


-    if (arg & 0xf) {
-        if (emit_set_delta(compiler, TMP_REG1, arg & 0xf, argw) != SLJIT_ERR_UNSUPPORTED) {
+    if (arg & REG_MASK) {
+        if (emit_set_delta(compiler, TMP_REG1, arg & REG_MASK, argw) != SLJIT_ERR_UNSUPPORTED) {
             FAIL_IF(compiler->error);
             return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, TMP_REG1, reg, 0));
         }
         imm = get_imm(argw & ~0x3fc);
         if (imm) {
-            EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG1, arg & 0xf, imm));
+            EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG1, arg & REG_MASK, imm));
             return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, TMP_REG1, reg, (argw & 0x3fc) >> 2));
         }
         imm = get_imm(-argw & ~0x3fc);
         if (imm) {
             argw = -argw;
-            EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(SUB_DP, 0, TMP_REG1, arg & 0xf, imm));
+            EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(SUB_DP, 0, TMP_REG1, arg & REG_MASK, imm));
             return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 0, TMP_REG1, reg, (argw & 0x3fc) >> 2));
         }
     }


     compiler->cache_arg = arg;
     compiler->cache_argw = argw;
-    if (arg & 0xf) {
+    if (arg & REG_MASK) {
         FAIL_IF(load_immediate(compiler, TMP_REG1, argw));
-        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG3, arg & 0xf, reg_map[TMP_REG1]));
+        EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG3, arg & REG_MASK, reg_map[TMP_REG1]));
     }
     else
         FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
@@ -2128,11 +2131,11 @@
     op ^= SLJIT_SINGLE_OP;


     if (GET_OPCODE(op) == SLJIT_CMPD) {
-        if (dst > SLJIT_FLOAT_REG6) {
+        if (dst & SLJIT_MEM) {
             FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, TMP_FREG1, dst, dstw));
             dst = TMP_FREG1;
         }
-        if (src > SLJIT_FLOAT_REG6) {
+        if (src & SLJIT_MEM) {
             FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, TMP_FREG2, src, srcw));
             src = TMP_FREG2;
         }
@@ -2141,9 +2144,9 @@
         return SLJIT_SUCCESS;
     }


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : dst;
+    dst_fr = FAST_IS_REG(dst) ? dst : TMP_FREG1;


-    if (src > SLJIT_FLOAT_REG6) {
+    if (src & SLJIT_MEM) {
         FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, dst_fr, src, srcw));
         src = dst_fr;
     }
@@ -2184,14 +2187,14 @@
     compiler->cache_argw = 0;
     op ^= SLJIT_SINGLE_OP;


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : dst;
+    dst_fr = FAST_IS_REG(dst) ? dst : TMP_FREG1;


-    if (src2 > SLJIT_FLOAT_REG6) {
+    if (src2 & SLJIT_MEM) {
         FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, TMP_FREG2, src2, src2w));
         src2 = TMP_FREG2;
     }


-    if (src1 > SLJIT_FLOAT_REG6) {
+    if (src1 & SLJIT_MEM) {
         FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_SINGLE_OP) | FPU_LOAD, TMP_FREG1, src1, src1w));
         src1 = TMP_FREG1;
     }
@@ -2238,7 +2241,7 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= TMP_REG3)
+    if (FAST_IS_REG(dst))
         return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, RM(TMP_REG3)));


     /* Memory. */
@@ -2257,7 +2260,7 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
+    if (FAST_IS_REG(src))
         EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG3, SLJIT_UNUSED, RM(src)));
     else if (src & SLJIT_MEM) {
         if (getput_arg_fast(compiler, WORD_DATA | LOAD_DATA, TMP_REG3, src, srcw))
@@ -2398,26 +2401,8 @@
     ADJUST_LOCAL_OFFSET(src, srcw);


     /* In ARM, we don't need to touch the arguments. */
-    if (src & SLJIT_IMM) {
-        jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
-        FAIL_IF(!jump);
-        set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
-        jump->u.target = srcw;
-
-#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-        if (type >= SLJIT_FAST_CALL)
-            FAIL_IF(prepare_blx(compiler));
-        FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0), 0));
-        if (type >= SLJIT_FAST_CALL)
-            FAIL_IF(emit_blx(compiler));
-#else
-        FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
-        FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)));
-#endif
-        jump->addr = compiler->size;
-    }
-    else {
-        if (src <= TMP_REG3)
+    if (!(src & SLJIT_IMM)) {
+        if (FAST_IS_REG(src))
             return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(src));


         SLJIT_ASSERT(src & SLJIT_MEM);
@@ -2425,6 +2410,22 @@
         return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG2));
     }


+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    FAIL_IF(!jump);
+    set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
+    jump->u.target = srcw;
+
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+    if (type >= SLJIT_FAST_CALL)
+        FAIL_IF(prepare_blx(compiler));
+    FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0), 0));
+    if (type >= SLJIT_FAST_CALL)
+        FAIL_IF(emit_blx(compiler));
+#else
+    FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
+    FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)));
+#endif
+    jump->addr = compiler->size;
     return SLJIT_SUCCESS;
 }


@@ -2446,7 +2447,7 @@

     op = GET_OPCODE(op);
     cc = get_cc(type);
-    dst_r = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;


     if (op < SLJIT_ADD) {
         EMIT_INSTRUCTION(EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_r, SLJIT_UNUSED, SRC2_IMM | 0));
@@ -2455,7 +2456,7 @@
     }


     ins = (op == SLJIT_AND ? AND_DP : (op == SLJIT_OR ? ORR_DP : EOR_DP));
-    if ((op == SLJIT_OR || op == SLJIT_XOR) && dst <= TMP_REG3 && dst == src) {
+    if ((op == SLJIT_OR || op == SLJIT_XOR) && FAST_IS_REG(dst) && dst == src) {
         EMIT_INSTRUCTION((EMIT_DATA_PROCESS_INS(ins, 0, dst, dst, SRC2_IMM | 1) & ~COND_MASK) | cc);
         /* The condition must always be set, even if the ORR/EOR is not executed above. */
         return (flags & SLJIT_SET_E) ? push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG1, SLJIT_UNUSED, RM(dst))) : SLJIT_SUCCESS;
@@ -2493,7 +2494,7 @@
     const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
     PTR_FAIL_IF(!const_);


-    reg = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     PTR_FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, reg, TMP_PC, 0), init_value));
@@ -2503,7 +2504,7 @@
 #endif
     set_const(const_, compiler);


-    if (reg == TMP_REG2 && dst != SLJIT_UNUSED)
+    if (dst & SLJIT_MEM)
         PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
     return const_;
 }


Modified: code/trunk/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/trunk/sljit/sljitNativeMIPS_common.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeMIPS_common.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -198,7 +198,7 @@
     return (flags & IS_BIT26_COND) ? (1 << 26) : (1 << 16);
 }


-static SLJIT_INLINE sljit_ins* optimize_jump(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
+static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
 {
     sljit_sw diff;
     sljit_uw target_addr;
@@ -237,25 +237,35 @@
             return inst;
         }
     }
+    else {
+        diff = ((sljit_sw)target_addr - (sljit_sw)(inst + 1)) >> 2;
+        if (diff <= SIMM_MAX && diff >= SIMM_MIN) {
+            jump->flags |= PATCH_B;


-    diff = ((sljit_sw)target_addr - (sljit_sw)(inst + 1)) >> 2;
-    if (diff <= SIMM_MAX && diff >= SIMM_MIN) {
-        jump->flags |= PATCH_B;
-
-        if (!(jump->flags & IS_COND)) {
-            inst[0] = (jump->flags & IS_JAL) ? BAL : B;
+            if (!(jump->flags & IS_COND)) {
+                inst[0] = (jump->flags & IS_JAL) ? BAL : B;
+                inst[1] = NOP;
+                return inst + 1;
+            }
+            inst[0] = inst[0] ^ invert_branch(jump->flags);
             inst[1] = NOP;
+            jump->addr -= sizeof(sljit_ins);
             return inst + 1;
         }
-        inst[0] = inst[0] ^ invert_branch(jump->flags);
-        inst[1] = NOP;
-        jump->addr -= sizeof(sljit_ins);
-        return inst + 1;
     }


     if (jump->flags & IS_COND) {
-        if ((target_addr & ~0xfffffff) == ((jump->addr + 3 * sizeof(sljit_ins)) & ~0xfffffff)) {
+        if ((jump->flags & IS_MOVABLE) && (target_addr & ~0xfffffff) == ((jump->addr + 2 * sizeof(sljit_ins)) & ~0xfffffff)) {
             jump->flags |= PATCH_J;
+            saved_inst = inst[0];
+            inst[0] = inst[-1];
+            inst[-1] = (saved_inst & 0xffff0000) | 3;
+            inst[1] = J;
+            inst[2] = NOP;
+            return inst + 2;
+        }
+        else if ((target_addr & ~0xfffffff) == ((jump->addr + 3 * sizeof(sljit_ins)) & ~0xfffffff)) {
+            jump->flags |= PATCH_J;
             inst[0] = (inst[0] & 0xffff0000) | 3;
             inst[1] = NOP;
             inst[2] = J;
@@ -267,14 +277,12 @@
     }


     /* J instuctions. */
-    if (jump->flags & IS_MOVABLE) {
-        if ((target_addr & ~0xfffffff) == (jump->addr & ~0xfffffff)) {
-            jump->flags |= PATCH_J;
-            inst[0] = inst[-1];
-            inst[-1] = (jump->flags & IS_JAL) ? JAL : J;
-            jump->addr -= sizeof(sljit_ins);
-            return inst;
-        }
+    if ((jump->flags & IS_MOVABLE) && (target_addr & ~0xfffffff) == (jump->addr & ~0xfffffff)) {
+        jump->flags |= PATCH_J;
+        inst[0] = inst[-1];
+        inst[-1] = (jump->flags & IS_JAL) ? JAL : J;
+        jump->addr -= sizeof(sljit_ins);
+        return inst;
     }


     if ((target_addr & ~0xfffffff) == ((jump->addr + sizeof(sljit_ins)) & ~0xfffffff)) {
@@ -342,7 +350,7 @@
 #else
 #error "Implementation required"
 #endif
-                code_ptr = optimize_jump(jump, code_ptr, code);
+                code_ptr = detect_jump_type(jump, code_ptr, code);
                 jump = jump->next;
             }
             if (const_ && const_->addr == word_count) {
@@ -612,11 +620,11 @@
 {
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if ((!(flags & WRITE_BACK) || !(arg & 0xf)) && !(arg & 0xf0) && argw <= SIMM_MAX && argw >= SIMM_MIN) {
+    if ((!(flags & WRITE_BACK) || !(arg & REG_MASK)) && !(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN) {
         /* Works for both absoulte and relative addresses. */
         if (SLJIT_UNLIKELY(flags & ARG_TEST))
             return 1;
-        FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(arg & 0xf)
+        FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(arg & REG_MASK)
             | TA(reg_ar) | IMM(argw), ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? reg_ar : MOVABLE_INS));
         return -1;
     }
@@ -631,10 +639,10 @@
     SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));


     /* Simple operation except for updates. */
-    if (arg & 0xf0) {
+    if (arg & OFFS_REG_MASK) {
         argw &= 0x3;
         next_argw &= 0x3;
-        if (argw && argw == next_argw && (arg == next_arg || (arg & 0xf0) == (next_arg & 0xf0)))
+        if (argw && argw == next_argw && (arg == next_arg || (arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK)))
             return 1;
         return 0;
     }
@@ -666,9 +674,9 @@
         tmp_ar = DR(TMP_REG1);
         delay_slot = MOVABLE_INS;
     }
-    base = arg & 0xf;
+    base = arg & REG_MASK;


-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;
         if ((flags & WRITE_BACK) && reg_ar == DR(base)) {
             SLJIT_ASSERT(!(flags & LOAD_DATA) && DR(TMP_REG1) != reg_ar);
@@ -681,7 +689,7 @@
             if (!(flags & WRITE_BACK)) {
                 if (arg == compiler->cache_arg)
                     return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar), delay_slot);
-                if ((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg) {
+                if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
                     if (arg == next_arg && argw == (next_argw & 0x3)) {
                         compiler->cache_arg = arg;
                         compiler->cache_argw = argw;
@@ -693,7 +701,7 @@
                 }
             }
             else {
-                if ((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg) {
+                if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
                     FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(TMP_REG3) | D(base), DR(base)));
                     return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(base) | TA(reg_ar), delay_slot);
                 }
@@ -701,23 +709,23 @@
         }


         if (SLJIT_UNLIKELY(argw)) {
-            compiler->cache_arg = SLJIT_MEM | (arg & 0xf0);
+            compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
             compiler->cache_argw = argw;
-            FAIL_IF(push_inst(compiler, SLL_W | T((arg >> 4) & 0xf) | D(TMP_REG3) | SH_IMM(argw), DR(TMP_REG3)));
+            FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(arg)) | D(TMP_REG3) | SH_IMM(argw), DR(TMP_REG3)));
         }


         if (!(flags & WRITE_BACK)) {
             if (arg == next_arg && argw == (next_argw & 0x3)) {
                 compiler->cache_arg = arg;
                 compiler->cache_argw = argw;
-                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? ((arg >> 4) & 0xf) : TMP_REG3) | D(TMP_REG3), DR(TMP_REG3)));
+                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? OFFS_REG(arg) : TMP_REG3) | D(TMP_REG3), DR(TMP_REG3)));
                 tmp_ar = DR(TMP_REG3);
             }
             else
-                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? ((arg >> 4) & 0xf) : TMP_REG3) | DA(tmp_ar), tmp_ar));
+                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? OFFS_REG(arg) : TMP_REG3) | DA(tmp_ar), tmp_ar));
             return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
         }
-        FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? ((arg >> 4) & 0xf) : TMP_REG3) | D(base), DR(base)));
+        FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(!argw ? OFFS_REG(arg) : TMP_REG3) | D(base), DR(base)));
         return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(base) | TA(reg_ar), delay_slot);
     }


@@ -829,7 +837,7 @@
         if (GET_FLAGS(op))
             flags |= UNUSED_DEST;
     }
-    else if (dst <= TMP_REG3) {
+    else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -862,7 +870,7 @@
     }


     /* Source 1. */
-    if (src1 <= TMP_REG3) {
+    if (FAST_IS_REG(src1)) {
         src1_r = src1;
         flags |= REG1_SOURCE;
     }
@@ -883,7 +891,7 @@
     }


     /* Source 2. */
-    if (src2 <= TMP_REG3) {
+    if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
         if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -1150,14 +1158,14 @@
     compiler->cache_argw = 0;


     if (GET_OPCODE(op) == SLJIT_CMPD) {
-        if (dst > SLJIT_FLOAT_REG6) {
+        if (dst & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, dst, dstw, src, srcw));
             dst = TMP_FREG1;
         }
         else
             dst <<= 1;


-        if (src > SLJIT_FLOAT_REG6) {
+        if (src & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src, srcw, 0, 0));
             src = TMP_FREG2;
         }
@@ -1185,9 +1193,9 @@
         return push_inst(compiler, C_UN_fmt | FMT(op) | FT(src) | FS(dst), FCSR_FCC);
     }


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : (dst << 1);
+    dst_fr = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;


-    if (src > SLJIT_FLOAT_REG6) {
+    if (src & SLJIT_MEM) {
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_fr, src, srcw, dst, dstw));
         src = dst_fr;
     }
@@ -1229,9 +1237,9 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG2 : (dst << 1);
+    dst_fr = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG2;


-    if (src1 > SLJIT_FLOAT_REG6) {
+    if (src1 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
             FAIL_IF(compiler->error);
             src1 = TMP_FREG1;
@@ -1241,7 +1249,7 @@
     else
         src1 <<= 1;


-    if (src2 > SLJIT_FLOAT_REG6) {
+    if (src2 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
             FAIL_IF(compiler->error);
             src2 = TMP_FREG2;
@@ -1309,7 +1317,7 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= TMP_REG3)
+    if (FAST_IS_REG(dst))
         return push_inst(compiler, ADDU_W | SA(RETURN_ADDR_REG) | TA(0) | D(dst), DR(dst));


     /* Memory. */
@@ -1322,7 +1330,7 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
+    if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, ADDU_W | S(src) | TA(0) | DA(RETURN_ADDR_REG), RETURN_ADDR_REG));
     else if (src & SLJIT_MEM)
         FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, RETURN_ADDR_REG, src, srcw));
@@ -1631,14 +1639,14 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    if (src1 > SLJIT_FLOAT_REG6) {
+    if (src1 & SLJIT_MEM) {
         PTR_FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(type) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
         src1 = TMP_FREG1;
     }
     else
         src1 <<= 1;


-    if (src2 > SLJIT_FLOAT_REG6) {
+    if (src2 & SLJIT_MEM) {
         PTR_FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(type) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
         src2 = TMP_FREG2;
     }
@@ -1714,7 +1722,7 @@
     check_sljit_emit_ijump(compiler, type, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3) {
+    if (FAST_IS_REG(src)) {
         if (DR(src) != 4)
             src_r = src;
         else
@@ -1779,7 +1787,7 @@
         return SLJIT_SUCCESS;


     op = GET_OPCODE(op);
-    sugg_dst_ar = DR((op < SLJIT_ADD && dst <= TMP_REG3) ? dst : TMP_REG2);
+    sugg_dst_ar = DR((op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2);


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
@@ -1877,7 +1885,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));



Modified: code/trunk/sljit/sljitNativePPC_32.c
===================================================================
--- code/trunk/sljit/sljitNativePPC_32.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativePPC_32.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -32,7 +32,7 @@
         return push_inst(compiler, ADDI | D(reg) | A(0) | IMM(imm));


     if (!(imm & ~0xffff))
-        return push_inst(compiler, ORI | S(ZERO_REG) | A(reg) | IMM(imm));
+        return push_inst(compiler, ORI | S(TMP_ZERO) | A(reg) | IMM(imm));


     FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | IMM(imm >> 16)));
     return (imm & 0xffff) ? push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm)) : SLJIT_SUCCESS;


Modified: code/trunk/sljit/sljitNativePPC_64.c
===================================================================
--- code/trunk/sljit/sljitNativePPC_64.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativePPC_64.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -52,7 +52,7 @@
         return push_inst(compiler, ADDI | D(reg) | A(0) | IMM(imm));


     if (!(imm & ~0xffff))
-        return push_inst(compiler, ORI | S(ZERO_REG) | A(reg) | IMM(imm));
+        return push_inst(compiler, ORI | S(TMP_ZERO) | A(reg) | IMM(imm));


     if (imm <= SLJIT_W(0x7fffffff) && imm >= SLJIT_W(-0x80000000)) {
         FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | IMM(imm >> 16)));


Modified: code/trunk/sljit/sljitNativePPC_common.c
===================================================================
--- code/trunk/sljit/sljitNativePPC_common.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativePPC_common.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -86,7 +86,7 @@
 #define TMP_REG1    (SLJIT_NO_REGISTERS + 1)
 #define TMP_REG2    (SLJIT_NO_REGISTERS + 2)
 #define TMP_REG3    (SLJIT_NO_REGISTERS + 3)
-#define ZERO_REG    (SLJIT_NO_REGISTERS + 4)
+#define TMP_ZERO    (SLJIT_NO_REGISTERS + 4)


 #define TMP_FREG1    (0)
 #define TMP_FREG2    (SLJIT_FLOAT_REG6 + 1)
@@ -229,10 +229,11 @@
     return SLJIT_SUCCESS;
 }


-static SLJIT_INLINE sljit_si optimize_jump(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
+static SLJIT_INLINE sljit_si detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
 {
     sljit_sw diff;
     sljit_uw target_addr;
+    sljit_sw extra_jump_flags;


     if (jump->flags & SLJIT_REWRITABLE_JUMP)
         return 0;
@@ -245,17 +246,8 @@
     }
     diff = ((sljit_sw)target_addr - (sljit_sw)(code_ptr)) & ~0x3l;


-    if (jump->flags & UNCOND_B) {
-        if (diff <= 0x01ffffff && diff >= -0x02000000) {
-            jump->flags |= PATCH_B;
-            return 1;
-        }
-        if (target_addr <= 0x03ffffff) {
-            jump->flags |= PATCH_B | ABSOLUTE_B;
-            return 1;
-        }
-    }
-    else {
+    extra_jump_flags = 0;
+    if (jump->flags & COND_B) {
         if (diff <= 0x7fff && diff >= -0x8000) {
             jump->flags |= PATCH_B;
             return 1;
@@ -264,7 +256,19 @@
             jump->flags |= PATCH_B | ABSOLUTE_B;
             return 1;
         }
+        extra_jump_flags = REMOVE_COND;
+
+        diff -= sizeof(sljit_ins);
     }
+
+    if (diff <= 0x01ffffff && diff >= -0x02000000) {
+        jump->flags |= PATCH_B | extra_jump_flags;
+        return 1;
+    }
+    if (target_addr <= 0x03ffffff) {
+        jump->flags |= PATCH_B | ABSOLUTE_B | extra_jump_flags;
+        return 1;
+    }
     return 0;
 }


@@ -323,7 +327,7 @@
 #else
                 jump->addr = (sljit_uw)(code_ptr - 6);
 #endif
-                if (optimize_jump(jump, code_ptr, code)) {
+                if (detect_jump_type(jump, code_ptr, code)) {
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
                     code_ptr[-3] = code_ptr[0];
                     code_ptr -= 3;
@@ -331,11 +335,17 @@
                     code_ptr[-6] = code_ptr[0];
                     code_ptr -= 6;
 #endif
+                    if (jump->flags & REMOVE_COND) {
+                        code_ptr[0] = BCx | (2 << 2) | ((code_ptr[0] ^ (8 << 21)) & 0x03ff0001);
+                        code_ptr++;
+                        jump->addr += sizeof(sljit_ins);
+                        code_ptr[0] = Bx;
+                        jump->flags -= COND_B;
+                    }
                 }
                 jump = jump->next;
             }
             if (const_ && const_->addr == word_count) {
-                /* Just recording the address. */
                 const_->addr = (sljit_uw)code_ptr;
                 const_ = const_->next;
             }
@@ -367,29 +377,27 @@
             addr = (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
             buf_ptr = (sljit_ins*)jump->addr;
             if (jump->flags & PATCH_B) {
-                if (jump->flags & UNCOND_B) {
+                if (jump->flags & COND_B) {
                     if (!(jump->flags & ABSOLUTE_B)) {
                         addr = addr - jump->addr;
-                        SLJIT_ASSERT((sljit_sw)addr <= 0x01ffffff && (sljit_sw)addr >= -0x02000000);
-                        *buf_ptr = Bx | (addr & 0x03fffffc) | ((*buf_ptr) & 0x1);
+                        SLJIT_ASSERT((sljit_sw)addr <= 0x7fff && (sljit_sw)addr >= -0x8000);
+                        *buf_ptr = BCx | (addr & 0xfffc) | ((*buf_ptr) & 0x03ff0001);
                     }
                     else {
-                        SLJIT_ASSERT(addr <= 0x03ffffff);
-                        *buf_ptr = Bx | (addr & 0x03fffffc) | 0x2 | ((*buf_ptr) & 0x1);
+                        SLJIT_ASSERT(addr <= 0xffff);
+                        *buf_ptr = BCx | (addr & 0xfffc) | 0x2 | ((*buf_ptr) & 0x03ff0001);
                     }
                 }
                 else {
                     if (!(jump->flags & ABSOLUTE_B)) {
                         addr = addr - jump->addr;
-                        SLJIT_ASSERT((sljit_sw)addr <= 0x7fff && (sljit_sw)addr >= -0x8000);
-                        *buf_ptr = BCx | (addr & 0xfffc) | ((*buf_ptr) & 0x03ff0001);
+                        SLJIT_ASSERT((sljit_sw)addr <= 0x01ffffff && (sljit_sw)addr >= -0x02000000);
+                        *buf_ptr = Bx | (addr & 0x03fffffc) | ((*buf_ptr) & 0x1);
                     }
                     else {
-                        addr = addr & ~0x3l;
-                        SLJIT_ASSERT(addr <= 0xffff);
-                        *buf_ptr = BCx | (addr & 0xfffc) | 0x2 | ((*buf_ptr) & 0x03ff0001);
+                        SLJIT_ASSERT(addr <= 0x03ffffff);
+                        *buf_ptr = Bx | (addr & 0x03fffffc) | 0x2 | ((*buf_ptr) & 0x1);
                     }
-
                 }
                 break;
             }
@@ -503,7 +511,7 @@
 #endif


     FAIL_IF(push_inst(compiler, MFLR | D(0)));
-    FAIL_IF(push_inst(compiler, STACK_STORE | S(ZERO_REG) | A(SLJIT_LOCALS_REG) | IMM(-(sljit_si)(sizeof(sljit_sw))) ));
+    FAIL_IF(push_inst(compiler, STACK_STORE | S(TMP_ZERO) | A(SLJIT_LOCALS_REG) | IMM(-(sljit_si)(sizeof(sljit_sw))) ));
     if (saveds >= 1)
         FAIL_IF(push_inst(compiler, STACK_STORE | S(SLJIT_SAVED_REG1) | A(SLJIT_LOCALS_REG) | IMM(-2 * (sljit_si)(sizeof(sljit_sw))) ));
     if (saveds >= 2)
@@ -520,7 +528,7 @@
     FAIL_IF(push_inst(compiler, STACK_STORE | S(0) | A(SLJIT_LOCALS_REG) | IMM(sizeof(sljit_sw)) ));
 #endif


-    FAIL_IF(push_inst(compiler, ADDI | D(ZERO_REG) | A(0) | 0));
+    FAIL_IF(push_inst(compiler, ADDI | D(TMP_ZERO) | A(0) | 0));
     if (args >= 1)
         FAIL_IF(push_inst(compiler, OR | S(SLJIT_SCRATCH_REG1) | A(SLJIT_SAVED_REG1) | B(SLJIT_SCRATCH_REG1)));
     if (args >= 2)
@@ -602,7 +610,7 @@
         FAIL_IF(push_inst(compiler, STACK_LOAD | D(SLJIT_SAVED_REG2) | A(SLJIT_LOCALS_REG) | IMM(-3 * (sljit_si)(sizeof(sljit_sw))) ));
     if (compiler->saveds >= 1)
         FAIL_IF(push_inst(compiler, STACK_LOAD | D(SLJIT_SAVED_REG1) | A(SLJIT_LOCALS_REG) | IMM(-2 * (sljit_si)(sizeof(sljit_sw))) ));
-    FAIL_IF(push_inst(compiler, STACK_LOAD | D(ZERO_REG) | A(SLJIT_LOCALS_REG) | IMM(-(sljit_si)(sizeof(sljit_sw))) ));
+    FAIL_IF(push_inst(compiler, STACK_LOAD | D(TMP_ZERO) | A(SLJIT_LOCALS_REG) | IMM(-(sljit_si)(sizeof(sljit_sw))) ));


     FAIL_IF(push_inst(compiler, MTLR | S(0)));
     FAIL_IF(push_inst(compiler, BLR));
@@ -625,7 +633,7 @@
    It contains 32 items, but not all are different. */


 /* 64 bit only: [reg+imm] must be aligned to 4 bytes. */
-#define ADDR_MODE2    0x10000
+#define INT_ALIGNED    0x10000
 /* 64-bit only: there is no lwau instruction. */
 #define UPDATE_REQ    0x20000


@@ -636,7 +644,7 @@
 #else
 #define ARCH_32_64(a, b)    b
 #define INST_CODE_AND_DST(inst, flags, reg) \
-    (((inst) & ~(ADDR_MODE2 | UPDATE_REQ)) | (((flags) & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg)))
+    (((inst) & ~(INT_ALIGNED | UPDATE_REQ)) | (((flags) & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg)))
 #endif


static SLJIT_CONST sljit_ins data_transfer_insts[64 + 8] = {
@@ -645,13 +653,13 @@

/* Word. */

-/* u w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | ADDR_MODE2 | 0x0 /* std */),
-/* u w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | ADDR_MODE2 | 0x0 /* ld */),
+/* u w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
+/* u w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
/* u w n x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
/* u w n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),

-/* u w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | ADDR_MODE2 | 0x1 /* stdu */),
-/* u w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | ADDR_MODE2 | 0x1 /* ldu */),
+/* u w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
+/* u w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
/* u w w x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
/* u w w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),

@@ -695,13 +703,13 @@

/* Word. */

-/* s w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | ADDR_MODE2 | 0x0 /* std */),
-/* s w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | ADDR_MODE2 | 0x0 /* ld */),
+/* s w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
+/* s w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
/* s w n x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
/* s w n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),

-/* s w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | ADDR_MODE2 | 0x1 /* stdu */),
-/* s w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | ADDR_MODE2 | 0x1 /* ldu */),
+/* s w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
+/* s w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
/* s w w x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
/* s w w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),

@@ -732,12 +740,12 @@
/* Int. */

/* s i n i s */ HI(36) /* stw */,
-/* s i n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | ADDR_MODE2 | 0x2 /* lwa */),
+/* s i n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x2 /* lwa */),
/* s i n x s */ HI(31) | LO(151) /* stwx */,
/* s i n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(341) /* lwax */),

/* s i w i s */ HI(37) /* stwu */,
-/* s i w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | ADDR_MODE2 | UPDATE_REQ | 0x2 /* lwa */),
+/* s i w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | UPDATE_REQ | 0x2 /* lwa */),
/* s i w x s */ HI(31) | LO(183) /* stwux */,
/* s i w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(373) /* lwaux */),

@@ -761,74 +769,48 @@
 static sljit_si getput_arg_fast(struct sljit_compiler *compiler, sljit_si inp_flags, sljit_si reg, sljit_si arg, sljit_sw argw)
 {
     sljit_ins inst;
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    sljit_si tmp_reg;
-#endif


+    /* Should work when (arg & REG_MASK) == 0. */
+    SLJIT_COMPILE_ASSERT(A(0) == 0, a0_must_be_0);
     SLJIT_ASSERT(arg & SLJIT_MEM);
-    if (!(arg & 0xf)) {
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-        if (argw <= SIMM_MAX && argw >= SIMM_MIN) {
-            if (inp_flags & ARG_TEST)
-                return 1;


-            inst = data_transfer_insts[(inp_flags & ~WRITE_BACK) & MEM_MASK];
-            SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-            push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | IMM(argw));
-            return -1;
-        }
-#else
-        inst = data_transfer_insts[(inp_flags & ~WRITE_BACK) & MEM_MASK];
-        if (argw <= SIMM_MAX && argw >= SIMM_MIN &&
-                (!(inst & ADDR_MODE2) || (argw & 0x3) == 0)) {
-            if (inp_flags & ARG_TEST)
-                return 1;
+    if (arg & OFFS_REG_MASK) {
+        if (argw & 0x3)
+            return 0;
+        if (inp_flags & ARG_TEST)
+            return 1;


-            push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | IMM(argw));
-            return -1;
-        }
-#endif
-        return 0;
+        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
+        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+        FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(OFFS_REG(arg))));
+        return -1;
     }


-    if (!(arg & 0xf0)) {
+    if (SLJIT_UNLIKELY(!(arg & REG_MASK)))
+        inp_flags &= ~WRITE_BACK;
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    inst = data_transfer_insts[inp_flags & MEM_MASK];
+    SLJIT_ASSERT((arg & REG_MASK) || !(inst & UPDATE_REQ));
+
+    if (argw > SIMM_MAX || argw < SIMM_MIN || ((inst & INT_ALIGNED) && (argw & 0x3)) || (inst & UPDATE_REQ))
+        return 0;
+    if (inp_flags & ARG_TEST)
+        return 1;
+#endif
+
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-        if (argw <= SIMM_MAX && argw >= SIMM_MIN) {
-            if (inp_flags & ARG_TEST)
-                return 1;
+    if (argw > SIMM_MAX || argw < SIMM_MIN)
+        return 0;
+    if (inp_flags & ARG_TEST)
+        return 1;


-            inst = data_transfer_insts[inp_flags & MEM_MASK];
-            SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-            push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | IMM(argw));
-            return -1;
-        }
-#else
-        inst = data_transfer_insts[inp_flags & MEM_MASK];
-        if (argw <= SIMM_MAX && argw >= SIMM_MIN && (!(inst & ADDR_MODE2) || (argw & 0x3) == 0)) {
-            if (inp_flags & ARG_TEST)
-                return 1;
+    inst = data_transfer_insts[inp_flags & MEM_MASK];
+    SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+#endif


-            if ((inp_flags & WRITE_BACK) && (inst & UPDATE_REQ)) {
-                tmp_reg = (inp_flags & LOAD_DATA) ? (arg & 0xf) : TMP_REG3;
-                if (push_inst(compiler, ADDI | D(tmp_reg) | A(arg & 0xf) | IMM(argw)))
-                    return -1;
-                arg = tmp_reg | SLJIT_MEM;
-                argw = 0;
-            }
-            push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | IMM(argw));
-            return -1;
-        }
-#endif
-    }
-    else if (!(argw & 0x3)) {
-        if (inp_flags & ARG_TEST)
-            return 1;
-        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-        push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | B((arg >> 4) & 0xf));
-        return -1;
-    }
-    return 0;
+    FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | IMM(argw)));
+    return -1;
 }


 /* See getput_arg below.
@@ -836,35 +818,50 @@
    uses word arguments without write back. */
 static sljit_si can_cache(sljit_si arg, sljit_sw argw, sljit_si next_arg, sljit_sw next_argw)
 {
+    sljit_sw high_short, next_high_short;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    sljit_sw diff;
+#endif
+
     SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));


-    if (!(arg & 0xf))
-        return (next_arg & SLJIT_MEM) && ((sljit_uw)argw - (sljit_uw)next_argw <= SIMM_MAX || (sljit_uw)next_argw - (sljit_uw)argw <= SIMM_MAX);
+    if (arg & OFFS_REG_MASK)
+        return ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && (argw & 0x3) == (next_argw & 0x3));


-    if (arg & 0xf0)
-        return ((arg & 0xf0) == (next_arg & 0xf0) && (argw & 0x3) == (next_argw & 0x3));
+    if (next_arg & OFFS_REG_MASK)
+        return 0;


-    if (argw <= SIMM_MAX && argw >= SIMM_MIN) {
-        if (arg == next_arg && (next_argw >= SIMM_MAX && next_argw <= SIMM_MIN))
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+    high_short = (argw + ((argw & 0x8000) << 1)) & ~0xffff;
+    next_high_short = (next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
+    return high_short == next_high_short;
+#else
+    if (argw <= SLJIT_W(0x7fffffff) && argw >= SLJIT_W(-0x80000000)) {
+        high_short = (argw + ((argw & 0x8000) << 1)) & ~0xffff;
+        next_high_short = (next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
+        if (high_short == next_high_short)
             return 1;
     }


-    if (arg == next_arg && ((sljit_uw)argw - (sljit_uw)next_argw <= SIMM_MAX || (sljit_uw)next_argw - (sljit_uw)argw <= SIMM_MAX))
+    diff = argw - next_argw;
+    if (!(arg & REG_MASK))
+        return diff <= SIMM_MAX && diff >= SIMM_MIN;
+
+    if (arg == next_arg && diff <= SIMM_MAX && diff >= SIMM_MIN)
         return 1;


     return 0;
+#endif
 }


 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #define ADJUST_CACHED_IMM(imm) \
-    if ((inst & ADDR_MODE2) && (imm & 0x3)) { \
+    if ((inst & INT_ALIGNED) && (imm & 0x3)) { \
         /* Adjust cached value. Fortunately this is really a rare case */ \
         compiler->cache_argw += imm & 0x3; \
         FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG3) | A(TMP_REG3) | (imm & 0x3))); \
         imm &= ~0x3; \
     }
-#else
-#define ADJUST_CACHED_IMM(imm)
 #endif


 /* Emit the necessary instructions. See can_cache above. */
@@ -872,72 +869,125 @@
 {
     sljit_si tmp_r;
     sljit_ins inst;
+    sljit_sw high_short, next_high_short;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    sljit_sw diff;
+#endif


     SLJIT_ASSERT(arg & SLJIT_MEM);


     tmp_r = ((inp_flags & LOAD_DATA) && ((inp_flags) & MEM_MASK) <= GPR_REG) ? reg : TMP_REG1;
     /* Special case for "mov reg, [reg, ... ]". */
-    if ((arg & 0xf) == tmp_r)
+    if ((arg & REG_MASK) == tmp_r)
         tmp_r = TMP_REG1;


-    if (!(arg & 0xf)) {
-        inst = data_transfer_insts[(inp_flags & ~WRITE_BACK) & MEM_MASK];
-        if ((compiler->cache_arg & SLJIT_IMM) && (((sljit_uw)argw - (sljit_uw)compiler->cache_argw) <= SIMM_MAX || ((sljit_uw)compiler->cache_argw - (sljit_uw)argw) <= SIMM_MAX)) {
-            argw = argw - compiler->cache_argw;
-            ADJUST_CACHED_IMM(argw);
-            SLJIT_ASSERT(!(inst & UPDATE_REQ));
-            return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(argw));
-        }
-
-        if ((next_arg & SLJIT_MEM) && (argw - next_argw <= SIMM_MAX || next_argw - argw <= SIMM_MAX)) {
-            SLJIT_ASSERT(inp_flags & LOAD_DATA);
-
-            compiler->cache_arg = SLJIT_IMM;
-            compiler->cache_argw = argw;
-            tmp_r = TMP_REG3;
-        }
-
-        FAIL_IF(load_immediate(compiler, tmp_r, argw));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_r));
-    }
-
-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;
         /* Otherwise getput_arg_fast would capture it. */
         SLJIT_ASSERT(argw);


-        if ((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg && argw == compiler->cache_argw)
+        if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg && argw == compiler->cache_argw)
             tmp_r = TMP_REG3;
         else {
-            if ((arg & 0xf0) == (next_arg & 0xf0) && argw == (next_argw & 0x3)) {
-                compiler->cache_arg = SLJIT_MEM | (arg & 0xf0);
+            if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == (next_argw & 0x3)) {
+                compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
                 compiler->cache_argw = argw;
                 tmp_r = TMP_REG3;
             }
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-            FAIL_IF(push_inst(compiler, RLWINM | S((arg >> 4) & 0xf) | A(tmp_r) | (argw << 11) | ((31 - argw) << 1)));
+            FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(arg)) | A(tmp_r) | (argw << 11) | ((31 - argw) << 1)));
 #else
-            FAIL_IF(push_inst(compiler, RLDI(tmp_r, (arg >> 4) & 0xf, argw, 63 - argw, 1)));
+            FAIL_IF(push_inst(compiler, RLDI(tmp_r, OFFS_REG(arg), argw, 63 - argw, 1)));
 #endif
         }
         inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | B(tmp_r));
+        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(tmp_r));
     }


+    if (SLJIT_UNLIKELY(!(arg & REG_MASK)))
+        inp_flags &= ~WRITE_BACK;
+
     inst = data_transfer_insts[inp_flags & MEM_MASK];
+    SLJIT_ASSERT((arg & REG_MASK) || !(inst & UPDATE_REQ));


-    if (compiler->cache_arg == arg && ((sljit_uw)argw - (sljit_uw)compiler->cache_argw <= SIMM_MAX || (sljit_uw)compiler->cache_argw - (sljit_uw)argw <= SIMM_MAX)) {
-        SLJIT_ASSERT(!(inp_flags & WRITE_BACK));
-        argw = argw - compiler->cache_argw;
-        ADJUST_CACHED_IMM(argw);
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(argw));
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    if (argw <= SLJIT_W(0x7fff7fff) && argw >= SLJIT_W(-0x80000000)
+            && (!(inst & INT_ALIGNED) || !(argw & 0x3)) && !(inst & UPDATE_REQ)) {
+#endif
+
+        arg &= REG_MASK;
+        high_short = (sljit_si)(argw + ((argw & 0x8000) << 1)) & ~0xffff;
+        /* The getput_arg_fast should handle this otherwise. */
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+        SLJIT_ASSERT(high_short && high_short <= SLJIT_W(0x7fffffff) && high_short >= SLJIT_W(-0x80000000));
+#else
+        SLJIT_ASSERT(high_short && !(inst & (INT_ALIGNED | UPDATE_REQ)));
+#endif
+
+        if (inp_flags & WRITE_BACK) {
+            if (arg == reg) {
+                FAIL_IF(push_inst(compiler, OR | S(reg) | A(tmp_r) | B(reg)));
+                reg = tmp_r;
+            }
+            tmp_r = arg;
+            FAIL_IF(push_inst(compiler, ADDIS | D(arg) | A(arg) | IMM(high_short >> 16)));
+        }
+        else if (compiler->cache_arg != arg || high_short != compiler->cache_argw) {
+            if ((next_arg & SLJIT_MEM) && !(next_arg & OFFS_REG_MASK)) {
+                next_high_short = (sljit_si)(next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
+                if (high_short == next_high_short) {
+                    compiler->cache_arg = SLJIT_IMM | arg;
+                    compiler->cache_argw = next_high_short;
+                    tmp_r = TMP_REG3;
+                }
+            }
+            FAIL_IF(push_inst(compiler, ADDIS | D(tmp_r) | A(arg & REG_MASK) | IMM(high_short >> 16)));
+        }
+        else
+            tmp_r = TMP_REG3;
+
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_r) | IMM(argw));
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     }


-    if ((compiler->cache_arg & SLJIT_IMM) && compiler->cache_argw == argw) {
+    /* Everything else is PPC-64 only. */
+    if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
+        diff = argw - compiler->cache_argw;
+        if ((compiler->cache_arg & SLJIT_IMM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
+            ADJUST_CACHED_IMM(diff);
+            return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(diff));
+        }
+
+        diff = argw - next_argw;
+        if ((next_arg & SLJIT_MEM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
+            SLJIT_ASSERT(inp_flags & LOAD_DATA);
+
+            compiler->cache_arg = SLJIT_IMM;
+            compiler->cache_argw = argw;
+            tmp_r = TMP_REG3;
+        }
+
+        FAIL_IF(load_immediate(compiler, tmp_r, argw));
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_r));
+    }
+
+    diff = argw - compiler->cache_argw;
+    if (compiler->cache_arg == arg && diff <= SIMM_MAX && diff >= SIMM_MIN) {
+        SLJIT_ASSERT(!(inp_flags & WRITE_BACK) && !(inst & UPDATE_REQ));
+        ADJUST_CACHED_IMM(diff);
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(diff));
+    }
+
+    if ((compiler->cache_arg & SLJIT_IMM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
         inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | B(TMP_REG3));
+        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+        if (compiler->cache_argw != argw) {
+            FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG3) | A(TMP_REG3) | IMM(diff)));
+            compiler->cache_argw = argw;
+        }
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(TMP_REG3));
     }


     if (argw == next_argw && (next_arg & SLJIT_MEM)) {
@@ -948,14 +998,15 @@
         compiler->cache_argw = argw;


         inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | B(TMP_REG3));
+        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(TMP_REG3));
     }


-    if (arg == next_arg && !(inp_flags & WRITE_BACK) && ((sljit_uw)argw - (sljit_uw)next_argw <= SIMM_MAX || (sljit_uw)next_argw - (sljit_uw)argw <= SIMM_MAX)) {
+    diff = argw - next_argw;
+    if (arg == next_arg && !(inp_flags & WRITE_BACK) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
         SLJIT_ASSERT(inp_flags & LOAD_DATA);
         FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-        FAIL_IF(push_inst(compiler, ADD | D(TMP_REG3) | A(TMP_REG3) | B(arg & 0xf)));
+        FAIL_IF(push_inst(compiler, ADD | D(TMP_REG3) | A(TMP_REG3) | B(arg & REG_MASK)));


         compiler->cache_arg = arg;
         compiler->cache_argw = argw;
@@ -963,11 +1014,22 @@
         return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3));
     }


+    if ((next_arg & SLJIT_MEM) && !(next_arg & OFFS_REG_MASK) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
+        SLJIT_ASSERT(inp_flags & LOAD_DATA);
+        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+
+        compiler->cache_arg = SLJIT_IMM;
+        compiler->cache_argw = argw;
+        tmp_r = TMP_REG3;
+    }
+    else
+        FAIL_IF(load_immediate(compiler, tmp_r, argw));
+
     /* Get the indexed version instead of the normal one. */
     inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-    SLJIT_ASSERT(!(inst & (ADDR_MODE2 | UPDATE_REQ)));
-    FAIL_IF(load_immediate(compiler, tmp_r, argw));
-    return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & 0xf) | B(tmp_r));
+    SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+    return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(tmp_r));
+#endif
 }


 static SLJIT_INLINE sljit_si emit_op_mem2(struct sljit_compiler *compiler, sljit_si flags, sljit_si reg, sljit_si arg1, sljit_sw arg1w, sljit_si arg2, sljit_sw arg2w)
@@ -1003,7 +1065,7 @@
             return SLJIT_SUCCESS;
         dst_r = TMP_REG2;
     }
-    else if (dst <= ZERO_REG) {
+    else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -1022,7 +1084,7 @@
     }


     /* Source 1. */
-    if (src1 <= ZERO_REG) {
+    if (FAST_IS_REG(src1)) {
         src1_r = src1;
         flags |= REG1_SOURCE;
     }
@@ -1038,7 +1100,7 @@
         src1_r = 0;


     /* Source 2. */
-    if (src2 <= ZERO_REG) {
+    if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
         if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -1115,7 +1177,6 @@
     case SLJIT_BREAKPOINT:
     case SLJIT_NOP:
         return push_inst(compiler, NOP);
-        break;
     case SLJIT_UMUL:
     case SLJIT_SMUL:
         FAIL_IF(push_inst(compiler, OR | S(SLJIT_SCRATCH_REG1) | A(TMP_REG1) | B(SLJIT_SCRATCH_REG1)));
@@ -1165,14 +1226,14 @@


     op = GET_OPCODE(op);
     if ((src & SLJIT_IMM) && srcw == 0)
-        src = ZERO_REG;
+        src = TMP_ZERO;


     if (op_flags & SLJIT_SET_O)
-        FAIL_IF(push_inst(compiler, MTXER | S(ZERO_REG)));
+        FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));


     if (op_flags & SLJIT_INT_OP) {
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
-            if (src <= ZERO_REG && src == dst) {
+            if (FAST_IS_REG(src) && src == dst) {
                 if (!TYPE_CAST_NEEDED(op))
                     return SLJIT_SUCCESS;
             }
@@ -1320,9 +1381,9 @@
     ADJUST_LOCAL_OFFSET(src2, src2w);


     if ((src1 & SLJIT_IMM) && src1w == 0)
-        src1 = ZERO_REG;
+        src1 = TMP_ZERO;
     if ((src2 & SLJIT_IMM) && src2w == 0)
-        src2 = ZERO_REG;
+        src2 = TMP_ZERO;


 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     if (op & SLJIT_INT_OP) {
@@ -1337,7 +1398,7 @@
     }
 #endif
     if (op & SLJIT_SET_O)
-        FAIL_IF(push_inst(compiler, MTXER | S(ZERO_REG)));
+        FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));
     if (src2 == TMP_REG2)
         flags |= ALT_KEEP_CACHE;


@@ -1563,12 +1624,12 @@
     compiler->cache_argw = 0;


     if (GET_OPCODE(op) == SLJIT_CMPD) {
-        if (dst > SLJIT_FLOAT_REG6) {
+        if (dst & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, dst, dstw, src, srcw));
             dst = TMP_FREG1;
         }


-        if (src > SLJIT_FLOAT_REG6) {
+        if (src & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src, srcw, 0, 0));
             src = TMP_FREG2;
         }
@@ -1576,9 +1637,9 @@
         return push_inst(compiler, FCMPU | CRD(4) | FA(dst) | FB(src));
     }


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : dst;
+    dst_fr = FAST_IS_REG(dst) ? dst : TMP_FREG1;


-    if (src > SLJIT_FLOAT_REG6) {
+    if (src & SLJIT_MEM) {
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_fr, src, srcw, dst, dstw));
         src = dst_fr;
     }
@@ -1618,9 +1679,9 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG2 : dst;
+    dst_fr = FAST_IS_REG(dst) ? dst : TMP_FREG2;


-    if (src1 > SLJIT_FLOAT_REG6) {
+    if (src1 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
             FAIL_IF(compiler->error);
             src1 = TMP_FREG1;
@@ -1628,7 +1689,7 @@
             flags |= ALT_FORM1;
     }


-    if (src2 > SLJIT_FLOAT_REG6) {
+    if (src2 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
             FAIL_IF(compiler->error);
             src2 = TMP_FREG2;
@@ -1697,7 +1758,7 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= ZERO_REG)
+    if (FAST_IS_REG(dst))
         return push_inst(compiler, MFLR | D(dst));


     /* Memory. */
@@ -1711,7 +1772,7 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= ZERO_REG)
+    if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, MTLR | S(src)));
     else {
         if (src & SLJIT_MEM)
@@ -1824,8 +1885,8 @@
     type &= 0xff;


     /* In PPC, we don't need to touch the arguments. */
-    if (type >= SLJIT_JUMP)
-        jump->flags |= UNCOND_B;
+    if (type < SLJIT_JUMP)
+        jump->flags |= COND_B;


     PTR_FAIL_IF(emit_const(compiler, TMP_REG1, 0));
     PTR_FAIL_IF(push_inst(compiler, MTCTR | S(TMP_REG1)));
@@ -1843,12 +1904,12 @@
     check_sljit_emit_ijump(compiler, type, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= ZERO_REG)
+    if (FAST_IS_REG(src))
         src_r = src;
     else if (src & SLJIT_IMM) {
         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
         FAIL_IF(!jump);
-        set_jump(jump, compiler, JUMP_ADDR | UNCOND_B);
+        set_jump(jump, compiler, JUMP_ADDR);
         jump->u.target = srcw;


         FAIL_IF(emit_const(compiler, TMP_REG2, 0));
@@ -1880,6 +1941,7 @@
 {
     sljit_si reg, input_flags;
     sljit_si flags = GET_ALL_FLAGS(op);
+    sljit_sw original_dstw = dstw;


     CHECK_ERROR();
     check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
@@ -1889,7 +1951,7 @@
         return SLJIT_SUCCESS;


     op = GET_OPCODE(op);
-    reg = (op < SLJIT_ADD && dst <= ZERO_REG) ? dst : TMP_REG2;
+    reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
@@ -2001,13 +2063,15 @@
         op = SLJIT_MOV;
         input_flags = WORD_DATA;
 #endif
-        return (reg == TMP_REG2) ? emit_op(compiler, op, input_flags, dst, dstw, TMP_REG1, 0, TMP_REG2, 0) : SLJIT_SUCCESS;
+        if (reg != TMP_REG2)
+            return SLJIT_SUCCESS;
+        return emit_op(compiler, op, input_flags, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
     }


 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
     compiler->skip_checks = 1;
 #endif
-    return sljit_emit_op2(compiler, op | flags, dst, dstw, src, srcw, TMP_REG2, 0);
+    return sljit_emit_op2(compiler, op | flags, dst, original_dstw, src, srcw, TMP_REG2, 0);
 }


 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
@@ -2023,7 +2087,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = (dst <= ZERO_REG) ? dst : TMP_REG2;
+    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));



Modified: code/trunk/sljit/sljitNativeSPARC_common.c
===================================================================
--- code/trunk/sljit/sljitNativeSPARC_common.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeSPARC_common.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -87,7 +87,7 @@
 #define TMP_REG2    (SLJIT_NO_REGISTERS + 2)
 #define TMP_REG3    (SLJIT_NO_REGISTERS + 3)
 #define TMP_REG4    (SLJIT_NO_REGISTERS + 4)
-#define LINK_REG    (SLJIT_NO_REGISTERS + 5)
+#define TMP_LINK    (SLJIT_NO_REGISTERS + 5)


 #define TMP_FREG1    (0)
 #define TMP_FREG2    ((SLJIT_FLOAT_REG6 + 1) << 1)
@@ -190,7 +190,7 @@
     return SLJIT_SUCCESS;
 }


-static SLJIT_INLINE sljit_ins* optimize_jump(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
+static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
 {
     sljit_sw diff;
     sljit_uw target_addr;
@@ -311,7 +311,7 @@
 #else
                 jump->addr = (sljit_uw)(code_ptr - 6);
 #endif
-                code_ptr = optimize_jump(jump, code_ptr, code);
+                code_ptr = detect_jump_type(jump, code_ptr, code);
                 jump = jump->next;
             }
             if (const_ && const_->addr == word_count) {
@@ -465,7 +465,7 @@
     CHECK_ERROR();
     check_sljit_emit_return(compiler, op, src, srcw);


-    if (op != SLJIT_MOV || !(src <= TMP_REG3)) {
+    if (op != SLJIT_MOV || !FAST_IS_REG(src)) {
         FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
         src = SLJIT_SCRATCH_REG1;
     }
@@ -516,15 +516,15 @@
 {
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if (!(flags & WRITE_BACK) || !(arg & 0xf)) {
-        if ((!(arg & 0xf0) && argw <= SIMM_MAX && argw >= SIMM_MIN)
-                || ((arg & 0xf0) && (argw & 0x3) == 0)) {
+    if (!(flags & WRITE_BACK) || !(arg & REG_MASK)) {
+        if ((!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN)
+                || ((arg & OFFS_REG_MASK) && (argw & 0x3) == 0)) {
             /* Works for both absolute and relative addresses (immediate case). */
             if (SLJIT_UNLIKELY(flags & ARG_TEST))
                 return 1;
             FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK]
                 | ((flags & MEM_MASK) <= GPR_REG ? D(reg) : DA(reg))
-                | S1(arg & 0xf) | ((arg & 0xf0) ? S2((arg >> 4) & 0xf) : IMM(argw)),
+                | S1(arg & REG_MASK) | ((arg & OFFS_REG_MASK) ? S2(OFFS_REG(arg)) : IMM(argw)),
                 ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS));
             return -1;
         }
@@ -540,11 +540,11 @@
     SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));


     /* Simple operation except for updates. */
-    if (arg & 0xf0) {
+    if (arg & OFFS_REG_MASK) {
         argw &= 0x3;
         SLJIT_ASSERT(argw);
         next_argw &= 0x3;
-        if ((arg & 0xf0) == (next_arg & 0xf0) && argw == next_argw)
+        if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == next_argw)
             return 1;
         return 0;
     }
@@ -566,25 +566,25 @@
         next_argw = 0;
     }


-    base = arg & 0xf;
-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
+    base = arg & REG_MASK;
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;
         SLJIT_ASSERT(argw != 0);


         /* Using the cache. */
-        if (((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg) && (argw == compiler->cache_argw))
+        if (((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) && (argw == compiler->cache_argw))
             arg2 = TMP_REG3;
         else {
-            if ((arg & 0xf0) == (next_arg & 0xf0) && argw == (next_argw & 0x3)) {
-                compiler->cache_arg = SLJIT_MEM | (arg & 0xf0);
+            if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == (next_argw & 0x3)) {
+                compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
                 compiler->cache_argw = argw;
                 arg2 = TMP_REG3;
             }
-            else if ((flags & LOAD_DATA) && ((flags & MEM_MASK) <= GPR_REG) && reg != base && (reg << 4) != (arg & 0xf0))
+            else if ((flags & LOAD_DATA) && ((flags & MEM_MASK) <= GPR_REG) && reg != base && reg != OFFS_REG(arg))
                 arg2 = reg;
             else /* It must be a mov operation, so tmp1 must be free to use. */
                 arg2 = TMP_REG1;
-            FAIL_IF(push_inst(compiler, SLL_W | D(arg2) | S1((arg >> 4) & 0xf) | IMM_ARG | argw, DR(arg2)));
+            FAIL_IF(push_inst(compiler, SLL_W | D(arg2) | S1(OFFS_REG(arg)) | IMM_ARG | argw, DR(arg2)));
         }
     }
     else {
@@ -658,7 +658,7 @@
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI && !(src2 & SLJIT_MEM))
             return SLJIT_SUCCESS;
     }
-    else if (dst <= TMP_REG3) {
+    else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -689,7 +689,7 @@
     }


     /* Source 1. */
-    if (src1 <= TMP_REG3)
+    if (FAST_IS_REG(src1))
         src1_r = src1;
     else if (src1 & SLJIT_IMM) {
         if (src1w) {
@@ -708,7 +708,7 @@
     }


     /* Source 2. */
-    if (src2 <= TMP_REG3) {
+    if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
         if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -963,14 +963,14 @@
     compiler->cache_argw = 0;


     if (GET_OPCODE(op) == SLJIT_CMPD) {
-        if (dst > SLJIT_FLOAT_REG6) {
+        if (dst & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, dst, dstw, src, srcw));
             dst = TMP_FREG1;
         }
         else
             dst <<= 1;


-        if (src > SLJIT_FLOAT_REG6) {
+        if (src & SLJIT_MEM) {
             FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src, srcw, 0, 0));
             src = TMP_FREG2;
         }
@@ -980,9 +980,9 @@
         return push_inst(compiler, SELECT_FOP(op, FCMPS, FCMPD) | S1A(dst) | S2A(src), FCC_IS_SET | MOVABLE_INS);
     }


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG1 : (dst << 1);
+    dst_fr = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;


-    if (src > SLJIT_FLOAT_REG6) {
+    if (src & SLJIT_MEM) {
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_fr, src, srcw, dst, dstw));
         src = dst_fr;
     }
@@ -1031,9 +1031,9 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_fr = (dst > SLJIT_FLOAT_REG6) ? TMP_FREG2 : (dst << 1);
+    dst_fr = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG2;


-    if (src1 > SLJIT_FLOAT_REG6) {
+    if (src1 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
             FAIL_IF(compiler->error);
             src1 = TMP_FREG1;
@@ -1043,7 +1043,7 @@
     else
         src1 <<= 1;


-    if (src2 > SLJIT_FLOAT_REG6) {
+    if (src2 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
             FAIL_IF(compiler->error);
             src2 = TMP_FREG2;
@@ -1114,11 +1114,11 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= TMP_REG3)
-        return push_inst(compiler, OR | D(dst) | S1(0) | S2(LINK_REG), DR(dst));
+    if (FAST_IS_REG(dst))
+        return push_inst(compiler, OR | D(dst) | S1(0) | S2(TMP_LINK), DR(dst));


     /* Memory. */
-    return emit_op_mem(compiler, WORD_DATA, LINK_REG, dst, dstw);
+    return emit_op_mem(compiler, WORD_DATA, TMP_LINK, dst, dstw);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
@@ -1127,14 +1127,14 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
-        FAIL_IF(push_inst(compiler, OR | D(LINK_REG) | S1(0) | S2(src), DR(LINK_REG)));
+    if (FAST_IS_REG(src))
+        FAIL_IF(push_inst(compiler, OR | D(TMP_LINK) | S1(0) | S2(src), DR(TMP_LINK)));
     else if (src & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, LINK_REG, src, srcw));
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_LINK, src, srcw));
     else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, LINK_REG, srcw));
+        FAIL_IF(load_immediate(compiler, TMP_LINK, srcw));


-    FAIL_IF(push_inst(compiler, JMPL | D(0) | S1(LINK_REG) | IMM(8), UNMOVABLE_INS));
+    FAIL_IF(push_inst(compiler, JMPL | D(0) | S1(TMP_LINK) | IMM(8), UNMOVABLE_INS));
     return push_inst(compiler, NOP, UNMOVABLE_INS);
 }


@@ -1269,7 +1269,7 @@
     }


     PTR_FAIL_IF(emit_const(compiler, TMP_REG2, 0));
-    PTR_FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? LINK_REG : 0) | S1(TMP_REG2) | IMM(0), UNMOVABLE_INS));
+    PTR_FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(TMP_REG2) | IMM(0), UNMOVABLE_INS));
     jump->addr = compiler->size;
     PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));


@@ -1285,7 +1285,7 @@
     check_sljit_emit_ijump(compiler, type, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
+    if (FAST_IS_REG(src))
         src_r = src;
     else if (src & SLJIT_IMM) {
         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
@@ -1305,7 +1305,7 @@
         src_r = TMP_REG2;
     }


-    FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? LINK_REG : 0) | S1(src_r) | IMM(0), UNMOVABLE_INS));
+    FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(src_r) | IMM(0), UNMOVABLE_INS));
     if (jump)
         jump->addr = compiler->size;
     return push_inst(compiler, NOP, UNMOVABLE_INS);
@@ -1327,7 +1327,7 @@


 #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
     op = GET_OPCODE(op);
-    reg = (op < SLJIT_ADD && dst <= TMP_REG3) ? dst : TMP_REG2;
+    reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
@@ -1368,12 +1368,11 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const(compiler, reg, init_value));


     if (dst & SLJIT_MEM)
         PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
-
     return const_;
 }


Modified: code/trunk/sljit/sljitNativeTILEGX.c
===================================================================
--- code/trunk/sljit/sljitNativeTILEGX.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeTILEGX.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -900,7 +900,7 @@
     return flush_buffer(compiler);
 }


-static SLJIT_INLINE sljit_ins * optimize_jump(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
+static SLJIT_INLINE sljit_ins * detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
 {
     sljit_sw diff;
     sljit_uw target_addr;
@@ -1043,7 +1043,7 @@
                 else
                     jump->addr = (sljit_uw)(code_ptr - 3);


-                code_ptr = optimize_jump(jump, code_ptr, code);
+                code_ptr = detect_jump_type(jump, code_ptr, code);
                 jump = jump->next;
             }


@@ -1330,13 +1330,13 @@
 {
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if ((!(flags & WRITE_BACK) || !(arg & 0xf))
-            && !(arg & 0xf0) && argw <= SIMM_16BIT_MAX && argw >= SIMM_16BIT_MIN) {
+    if ((!(flags & WRITE_BACK) || !(arg & REG_MASK))
+            && !(arg & OFFS_REG_MASK) && argw <= SIMM_16BIT_MAX && argw >= SIMM_16BIT_MIN) {
         /* Works for both absoulte and relative addresses. */
         if (SLJIT_UNLIKELY(flags & ARG_TEST))
             return 1;


-        FAIL_IF(ADDLI(ADDR_TMP_mapped, reg_map[arg & 0xf], argw));
+        FAIL_IF(ADDLI(ADDR_TMP_mapped, reg_map[arg & REG_MASK], argw));


         if (flags & LOAD_DATA)
             FAIL_IF(PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, ADDR_TMP_mapped));
@@ -1357,11 +1357,11 @@
     SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));


     /* Simple operation except for updates. */
-    if (arg & 0xf0) {
+    if (arg & OFFS_REG_MASK) {
         argw &= 0x3;
         next_argw &= 0x3;
         if (argw && argw == next_argw
-                && (arg == next_arg || (arg & 0xf0) == (next_arg & 0xf0)))
+                && (arg == next_arg || (arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK)))
             return 1;
         return 0;
     }
@@ -1393,9 +1393,9 @@
     else
         tmp_ar = TMP_REG1_mapped;


-    base = arg & 0xf;
+    base = arg & REG_MASK;


-    if (SLJIT_UNLIKELY(arg & 0xf0)) {
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;


         if ((flags & WRITE_BACK) && reg_ar == reg_map[base]) {
@@ -1414,7 +1414,7 @@
                         return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
                 }


-                if ((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg) {
+                if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
                     if (arg == next_arg && argw == (next_argw & 0x3)) {
                         compiler->cache_arg = arg;
                         compiler->cache_argw = argw;
@@ -1432,7 +1432,7 @@
                         return PB2(data_transfer_insts[flags & MEM_MASK], tmp_ar, reg_ar);
                 }
             } else {
-                if ((SLJIT_MEM | (arg & 0xf0)) == compiler->cache_arg) {
+                if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
                     FAIL_IF(ADD(reg_map[base], reg_map[base], TMP_REG3_mapped));
                     if (flags & LOAD_DATA)
                         return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, reg_map[base]);
@@ -1443,19 +1443,19 @@
         }


         if (SLJIT_UNLIKELY(argw)) {
-            compiler->cache_arg = SLJIT_MEM | (arg & 0xf0);
+            compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
             compiler->cache_argw = argw;
-            FAIL_IF(SHLI(TMP_REG3_mapped, reg_map[((arg >> 4) & 0xf)], argw));
+            FAIL_IF(SHLI(TMP_REG3_mapped, reg_map[OFFS_REG(arg)], argw));
         }


         if (!(flags & WRITE_BACK)) {
             if (arg == next_arg && argw == (next_argw & 0x3)) {
                 compiler->cache_arg = arg;
                 compiler->cache_argw = argw;
-                FAIL_IF(ADD(TMP_REG3_mapped, reg_map[base], reg_map[!argw ? ((arg >> 4) & 0xf) : TMP_REG3]));
+                FAIL_IF(ADD(TMP_REG3_mapped, reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));
                 tmp_ar = TMP_REG3_mapped;
             } else
-                FAIL_IF(ADD(tmp_ar, reg_map[base], reg_map[!argw ? ((arg >> 4) & 0xf) : TMP_REG3]));
+                FAIL_IF(ADD(tmp_ar, reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));


             if (flags & LOAD_DATA)
                 return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, tmp_ar);
@@ -1463,7 +1463,7 @@
                 return PB2(data_transfer_insts[flags & MEM_MASK], tmp_ar, reg_ar);
         }


-        FAIL_IF(ADD(reg_map[base], reg_map[base], reg_map[!argw ? ((arg >> 4) & 0xf) : TMP_REG3]));
+        FAIL_IF(ADD(reg_map[base], reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));


         if (flags & LOAD_DATA)
             return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, reg_map[base]);
@@ -1598,7 +1598,7 @@
     if (dst == SLJIT_UNUSED)
         return SLJIT_SUCCESS;


-    if (dst <= TMP_REG3)
+    if (FAST_IS_REG(dst))
         return ADD(reg_map[dst], RA, ZERO);


     /* Memory. */
@@ -1611,7 +1611,7 @@
     check_sljit_emit_fast_return(compiler, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3)
+    if (FAST_IS_REG(src))
         FAIL_IF(ADD(RA, reg_map[src], ZERO));


     else if (src & SLJIT_MEM)
@@ -2002,7 +2002,7 @@
             return SLJIT_SUCCESS;
         if (GET_FLAGS(op))
             flags |= UNUSED_DEST;
-    } else if (dst <= TMP_REG3) {
+    } else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
         if (op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -2037,7 +2037,7 @@
     }


     /* Source 1. */
-    if (src1 <= TMP_REG3) {
+    if (FAST_IS_REG(src1)) {
         src1_r = src1;
         flags |= REG1_SOURCE;
     } else if (src1 & SLJIT_IMM) {
@@ -2055,7 +2055,7 @@
     }


     /* Source 2. */
-    if (src2 <= TMP_REG3) {
+    if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
         if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_SI)
@@ -2120,7 +2120,7 @@
         return SLJIT_SUCCESS;


     op = GET_OPCODE(op);
-    sugg_dst_ar = reg_map[(op < SLJIT_ADD && dst <= TMP_REG3) ? dst : TMP_REG2];
+    sugg_dst_ar = reg_map[(op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2];


     compiler->cache_arg = 0;
     compiler->cache_argw = 0;
@@ -2353,7 +2353,7 @@
     check_sljit_emit_ijump(compiler, type, src, srcw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (src <= TMP_REG3) {
+    if (FAST_IS_REG(src)) {
         if (reg_map[src] != 0)
             src_r = src;
         else
@@ -2549,7 +2549,7 @@
     PTR_FAIL_IF(!const_);
     set_const(const_, compiler);


-    reg = (dst <= TMP_REG3) ? dst : TMP_REG2;
+    reg = FAST_IS_REG(dst) ? dst : TMP_REG2;


     PTR_FAIL_IF(emit_const_64(compiler, reg, init_value, 1));



Modified: code/trunk/sljit/sljitNativeX86_32.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_32.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeX86_32.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -89,11 +89,11 @@
     FAIL_IF(!inst);


     INC_SIZE(size);
-    PUSH_REG(reg_map[TMP_REGISTER]);
+    PUSH_REG(reg_map[TMP_REG1]);
 #if !(defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (args > 0) {
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_map[TMP_REGISTER] << 3) | 0x4 /* esp */;
+        *inst++ = MOD_REG | (reg_map[TMP_REG1] << 3) | 0x4 /* esp */;
     }
 #endif
     if (saveds > 2)
@@ -121,17 +121,17 @@
 #else
     if (args > 0) {
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG1] << 3) | reg_map[TMP_REG1];
         *inst++ = sizeof(sljit_sw) * 2;
     }
     if (args > 1) {
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG2] << 3) | reg_map[TMP_REG1];
         *inst++ = sizeof(sljit_sw) * 3;
     }
     if (args > 2) {
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG3] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = MOD_DISP8 | (reg_map[SLJIT_SAVED_REG3] << 3) | reg_map[TMP_REG1];
         *inst++ = sizeof(sljit_sw) * 4;
     }
 #endif
@@ -245,7 +245,7 @@
         POP_REG(reg_map[SLJIT_SAVED_REG2]);
     if (compiler->saveds > 2)
         POP_REG(reg_map[SLJIT_SAVED_REG3]);
-    POP_REG(reg_map[TMP_REGISTER]);
+    POP_REG(reg_map[TMP_REG1]);
 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (compiler->args > 2)
         RET_I16(sizeof(sljit_sw));
@@ -301,9 +301,9 @@
     /* Calculate size of b. */
     inst_size += 1; /* mod r/m byte. */
     if (b & SLJIT_MEM) {
-        if ((b & 0x0f) == SLJIT_UNUSED)
+        if ((b & REG_MASK) == SLJIT_UNUSED)
             inst_size += sizeof(sljit_sw);
-        else if (immb != 0 && !(b & 0xf0)) {
+        else if (immb != 0 && !(b & OFFS_REG_MASK)) {
             /* Immediate operand. */
             if (immb <= 127 && immb >= -128)
                 inst_size += sizeof(sljit_sb);
@@ -311,10 +311,10 @@
                 inst_size += sizeof(sljit_sw);
         }


-        if ((b & 0xf) == SLJIT_LOCALS_REG && !(b & 0xf0))
-            b |= SLJIT_LOCALS_REG << 4;
+        if ((b & REG_MASK) == SLJIT_LOCALS_REG && !(b & OFFS_REG_MASK))
+            b |= TO_OFFS_REG(SLJIT_LOCALS_REG);


-        if ((b & 0xf0) != SLJIT_UNUSED)
+        if ((b & OFFS_REG_MASK) != SLJIT_UNUSED)
             inst_size += 1; /* SIB byte. */
     }


@@ -393,8 +393,8 @@
 #else
         *buf_ptr++ |= MOD_REG + reg_map[b];
 #endif
-    else if ((b & 0x0f) != SLJIT_UNUSED) {
-        if ((b & 0xf0) == SLJIT_UNUSED || (b & 0xf0) == (SLJIT_LOCALS_REG << 4)) {
+    else if ((b & REG_MASK) != SLJIT_UNUSED) {
+        if ((b & OFFS_REG_MASK) == SLJIT_UNUSED || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_LOCALS_REG)) {
             if (immb != 0) {
                 if (immb <= 127 && immb >= -128)
                     *buf_ptr |= 0x40;
@@ -402,11 +402,11 @@
                     *buf_ptr |= 0x80;
             }


-            if ((b & 0xf0) == SLJIT_UNUSED)
-                *buf_ptr++ |= reg_map[b & 0x0f];
+            if ((b & OFFS_REG_MASK) == SLJIT_UNUSED)
+                *buf_ptr++ |= reg_map[b & REG_MASK];
             else {
                 *buf_ptr++ |= 0x04;
-                *buf_ptr++ = reg_map[b & 0x0f] | (reg_map[(b >> 4) & 0x0f] << 3);
+                *buf_ptr++ = reg_map[b & REG_MASK] | (reg_map[OFFS_REG(b)] << 3);
             }


             if (immb != 0) {
@@ -420,7 +420,7 @@
         }
         else {
             *buf_ptr++ |= 0x04;
-            *buf_ptr++ = reg_map[b & 0x0f] | (reg_map[(b >> 4) & 0x0f] << 3) | (immb << 6);
+            *buf_ptr++ = reg_map[b & REG_MASK] | (reg_map[OFFS_REG(b)] << 3) | (immb << 6);
         }
     }
     else {
@@ -495,9 +495,9 @@


     /* For UNUSED dst. Uncommon, but possible. */
     if (dst == SLJIT_UNUSED)
-        dst = TMP_REGISTER;
+        dst = TMP_REG1;


-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         /* Unused dest is possible here. */
         inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
         FAIL_IF(!inst);
@@ -524,7 +524,7 @@


     CHECK_EXTRA_REGS(src, srcw, (void)0);


-    if (src <= TMP_REGISTER) {
+    if (FAST_IS_REG(src)) {
         inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
         FAIL_IF(!inst);



Modified: code/trunk/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_64.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeX86_64.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -420,12 +420,12 @@
     size &= 0xf;
     inst_size = size;


-    if ((b & SLJIT_MEM) && !(b & 0xf0) && NOT_HALFWORD(immb)) {
+    if ((b & SLJIT_MEM) && !(b & OFFS_REG_MASK) && NOT_HALFWORD(immb)) {
         if (emit_load_imm64(compiler, TMP_REG3, immb))
             return NULL;
         immb = 0;
-        if (b & 0xf)
-            b |= TMP_REG3 << 4;
+        if (b & REG_MASK)
+            b |= TO_OFFS_REG(TMP_REG3);
         else
             b |= TMP_REG3;
     }
@@ -445,12 +445,12 @@
     /* Calculate size of b. */
     inst_size += 1; /* mod r/m byte. */
     if (b & SLJIT_MEM) {
-        if ((b & 0x0f) == SLJIT_UNUSED)
+        if ((b & REG_MASK) == SLJIT_UNUSED)
             inst_size += 1 + sizeof(sljit_si); /* SIB byte required to avoid RIP based addressing. */
         else {
-            if (reg_map[b & 0x0f] >= 8)
+            if (reg_map[b & REG_MASK] >= 8)
                 rex |= REX_B;
-            if (immb != 0 && !(b & 0xf0)) {
+            if (immb != 0 && !(b & OFFS_REG_MASK)) {
                 /* Immediate operand. */
                 if (immb <= 127 && immb >= -128)
                     inst_size += sizeof(sljit_sb);
@@ -459,12 +459,12 @@
             }
         }


-        if ((b & 0xf) == SLJIT_LOCALS_REG && !(b & 0xf0))
-            b |= SLJIT_LOCALS_REG << 4;
+        if ((b & REG_MASK) == SLJIT_LOCALS_REG && !(b & OFFS_REG_MASK))
+            b |= TO_OFFS_REG(SLJIT_LOCALS_REG);


-        if ((b & 0xf0) != SLJIT_UNUSED) {
+        if ((b & OFFS_REG_MASK) != SLJIT_UNUSED) {
             inst_size += 1; /* SIB byte. */
-            if (reg_map[(b >> 4) & 0x0f] >= 8)
+            if (reg_map[OFFS_REG(b)] >= 8)
                 rex |= REX_X;
         }
     }
@@ -563,8 +563,8 @@
 #else
         *buf_ptr++ |= MOD_REG + reg_lmap[b];
 #endif
-    else if ((b & 0x0f) != SLJIT_UNUSED) {
-        if ((b & 0xf0) == SLJIT_UNUSED || (b & 0xf0) == (SLJIT_LOCALS_REG << 4)) {
+    else if ((b & REG_MASK) != SLJIT_UNUSED) {
+        if ((b & OFFS_REG_MASK) == SLJIT_UNUSED || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_LOCALS_REG)) {
             if (immb != 0) {
                 if (immb <= 127 && immb >= -128)
                     *buf_ptr |= 0x40;
@@ -572,11 +572,11 @@
                     *buf_ptr |= 0x80;
             }


-            if ((b & 0xf0) == SLJIT_UNUSED)
-                *buf_ptr++ |= reg_lmap[b & 0x0f];
+            if ((b & OFFS_REG_MASK) == SLJIT_UNUSED)
+                *buf_ptr++ |= reg_lmap[b & REG_MASK];
             else {
                 *buf_ptr++ |= 0x04;
-                *buf_ptr++ = reg_lmap[b & 0x0f] | (reg_lmap[(b >> 4) & 0x0f] << 3);
+                *buf_ptr++ = reg_lmap[b & REG_MASK] | (reg_lmap[OFFS_REG(b)] << 3);
             }


             if (immb != 0) {
@@ -590,7 +590,7 @@
         }
         else {
             *buf_ptr++ |= 0x04;
-            *buf_ptr++ = reg_lmap[b & 0x0f] | (reg_lmap[(b >> 4) & 0x0f] << 3) | (immb << 6);
+            *buf_ptr++ = reg_lmap[b & REG_MASK] | (reg_lmap[OFFS_REG(b)] << 3) | (immb << 6);
         }
     }
     else {
@@ -662,9 +662,9 @@


     /* For UNUSED dst. Uncommon, but possible. */
     if (dst == SLJIT_UNUSED)
-        dst = TMP_REGISTER;
+        dst = TMP_REG1;


-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         if (reg_map[dst] < 8) {
             inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
             FAIL_IF(!inst);
@@ -698,11 +698,11 @@
     ADJUST_LOCAL_OFFSET(src, srcw);


     if ((src & SLJIT_IMM) && NOT_HALFWORD(srcw)) {
-        FAIL_IF(emit_load_imm64(compiler, TMP_REGISTER, srcw));
-        src = TMP_REGISTER;
+        FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
+        src = TMP_REG1;
     }


-    if (src <= TMP_REGISTER) {
+    if (FAST_IS_REG(src)) {
         if (reg_map[src] < 8) {
             inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 1);
             FAIL_IF(!inst);
@@ -765,7 +765,7 @@
         return SLJIT_SUCCESS; /* Empty instruction. */


     if (src & SLJIT_IMM) {
-        if (dst <= TMP_REGISTER) {
+        if (FAST_IS_REG(dst)) {
             if (sign || ((sljit_uw)srcw <= 0x7fffffff)) {
                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, (sljit_sw)(sljit_si)srcw, dst, dstw);
                 FAIL_IF(!inst);
@@ -782,9 +782,9 @@
         return SLJIT_SUCCESS;
     }


-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


-    if ((dst & SLJIT_MEM) && (src <= TMP_REGISTER))
+    if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
         dst_r = src;
     else {
         if (sign) {


Modified: code/trunk/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_common.c    2014-01-28 16:07:52 UTC (rev 1452)
+++ code/trunk/sljit/sljitNativeX86_common.c    2014-01-30 06:10:21 UTC (rev 1453)
@@ -64,7 +64,7 @@
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)


 /* Last register + 1. */
-#define TMP_REGISTER    (SLJIT_NO_REGISTERS + 1)
+#define TMP_REG1    (SLJIT_NO_REGISTERS + 1)


 static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 2] = {
     0, 0, 2, 1, 0, 0, 3, 6, 7, 0, 0, 4, 5
@@ -85,7 +85,7 @@
 #else /* SLJIT_CONFIG_X86_32 */


 /* Last register + 1. */
-#define TMP_REGISTER    (SLJIT_NO_REGISTERS + 1)
+#define TMP_REG1    (SLJIT_NO_REGISTERS + 1)
 #define TMP_REG2    (SLJIT_NO_REGISTERS + 2)
 #define TMP_REG3    (SLJIT_NO_REGISTERS + 3)


@@ -633,7 +633,7 @@
     CPU cycles if the stack is large enough. However, you don't know it in
     advance, so it must always be called. I think this is a bad design in
     general even if it has some reasons. */
-    *(sljit_si*)alloca(local_size) = 0;
+    *(volatile sljit_si*)alloca(local_size) = 0;
 }


 #endif
@@ -653,20 +653,20 @@
     if (dst == SLJIT_UNUSED) {
         /* No destination, doesn't need to setup flags. */
         if (src & SLJIT_MEM) {
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
             FAIL_IF(!inst);
             *inst = MOV_r_rm;
         }
         return SLJIT_SUCCESS;
     }
-    if (src <= TMP_REGISTER) {
+    if (FAST_IS_REG(src)) {
         inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
         FAIL_IF(!inst);
         *inst = MOV_rm_r;
         return SLJIT_SUCCESS;
     }
     if (src & SLJIT_IMM) {
-        if (dst <= TMP_REGISTER) {
+        if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
             return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
@@ -692,7 +692,7 @@
         *inst = MOV_rm_i32;
         return SLJIT_SUCCESS;
     }
-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
         FAIL_IF(!inst);
         *inst = MOV_r_rm;
@@ -700,10 +700,10 @@
     }


     /* Memory to memory move. Requires two instruction. */
-    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src, srcw);
+    inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
     FAIL_IF(!inst);
     *inst = MOV_r_rm;
-    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+    inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
     FAIL_IF(!inst);
     *inst = MOV_rm_r;
     return SLJIT_SUCCESS;
@@ -745,13 +745,13 @@
         SLJIT_COMPILE_ASSERT(
             reg_map[SLJIT_SCRATCH_REG1] == 0
             && reg_map[SLJIT_SCRATCH_REG2] == 2
-            && reg_map[TMP_REGISTER] > 7,
+            && reg_map[TMP_REG1] > 7,
             invalid_register_assignment_for_div_mul);
 #else
         SLJIT_COMPILE_ASSERT(
             reg_map[SLJIT_SCRATCH_REG1] == 0
             && reg_map[SLJIT_SCRATCH_REG2] < 7
-            && reg_map[TMP_REGISTER] == 2,
+            && reg_map[TMP_REG1] == 2,
             invalid_register_assignment_for_div_mul);
 #endif
         compiler->mode32 = op & SLJIT_INT_OP;
@@ -760,10 +760,10 @@
         op = GET_OPCODE(op);
         if (op == SLJIT_UDIV) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
-            EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_SCRATCH_REG2, 0);
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
             inst = emit_x86_instruction(compiler, 1, SLJIT_SCRATCH_REG2, 0, SLJIT_SCRATCH_REG2, 0);
 #else
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
 #endif
             FAIL_IF(!inst);
             *inst = XOR_r_rm;
@@ -771,7 +771,7 @@


         if (op == SLJIT_SDIV) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
-            EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_SCRATCH_REG2, 0);
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
 #endif


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -800,7 +800,7 @@
         FAIL_IF(!inst);
         INC_SIZE(2);
         *inst++ = GROUP_F7;
-        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REGISTER] : reg_map[SLJIT_SCRATCH_REG2]);
+        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_SCRATCH_REG2]);
 #else
 #ifdef _WIN64
         size = (!compiler->mode32 || op >= SLJIT_UDIV) ? 3 : 2;
@@ -816,7 +816,7 @@
         else if (op >= SLJIT_UDIV)
             *inst++ = REX_B;
         *inst++ = GROUP_F7;
-        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REGISTER] : reg_lmap[SLJIT_SCRATCH_REG2]);
+        *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_SCRATCH_REG2]);
 #else
         if (!compiler->mode32)
             *inst++ = REX_W;
@@ -839,7 +839,7 @@
             break;
         }
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
-        EMIT_MOV(compiler, SLJIT_SCRATCH_REG2, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, SLJIT_SCRATCH_REG2, 0, TMP_REG1, 0);
 #endif
         break;
     }
@@ -873,7 +873,7 @@
         return SLJIT_SUCCESS; /* Empty instruction. */


     if (src & SLJIT_IMM) {
-        if (dst <= TMP_REGISTER) {
+        if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
             return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
@@ -889,13 +889,13 @@
         return SLJIT_SUCCESS;
     }


-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


-    if ((dst & SLJIT_MEM) && src <= TMP_REGISTER) {
+    if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
         if (reg_map[src] >= 4) {
-            SLJIT_ASSERT(dst_r == TMP_REGISTER);
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src, 0);
+            SLJIT_ASSERT(dst_r == TMP_REG1);
+            EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
         } else
             dst_r = src;
 #else
@@ -903,9 +903,9 @@
 #endif
     }
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    else if (src <= TMP_REGISTER && reg_map[src] >= 4) {
+    else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
         /* src, dst are registers. */
-        SLJIT_ASSERT(dst >= SLJIT_SCRATCH_REG1 && dst <= TMP_REGISTER);
+        SLJIT_ASSERT(SLOW_IS_REG(dst));
         if (reg_map[dst] < 4) {
             if (dst != src)
                 EMIT_MOV(compiler, dst, 0, src, 0);
@@ -946,25 +946,25 @@


     if (dst & SLJIT_MEM) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        if (dst_r == TMP_REGISTER) {
+        if (dst_r == TMP_REG1) {
             /* Find a non-used register, whose reg_map[src] < 4. */
-            if ((dst & 0xf) == SLJIT_SCRATCH_REG1) {
-                if ((dst & 0xf0) == (SLJIT_SCRATCH_REG2 << 4))
+            if ((dst & REG_MASK) == SLJIT_SCRATCH_REG1) {
+                if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SCRATCH_REG2))
                     work_r = SLJIT_SCRATCH_REG3;
                 else
                     work_r = SLJIT_SCRATCH_REG2;
             }
             else {
-                if ((dst & 0xf0) != (SLJIT_SCRATCH_REG1 << 4))
+                if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
                     work_r = SLJIT_SCRATCH_REG1;
-                else if ((dst & 0xf) == SLJIT_SCRATCH_REG2)
+                else if ((dst & REG_MASK) == SLJIT_SCRATCH_REG2)
                     work_r = SLJIT_SCRATCH_REG3;
                 else
                     work_r = SLJIT_SCRATCH_REG2;
             }


             if (work_r == SLJIT_SCRATCH_REG1) {
-                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REGISTER]);
+                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
             }
             else {
                 inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
@@ -977,7 +977,7 @@
             *inst = MOV_rm8_r8;


             if (work_r == SLJIT_SCRATCH_REG1) {
-                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REGISTER]);
+                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
             }
             else {
                 inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
@@ -1015,7 +1015,7 @@
         return SLJIT_SUCCESS; /* Empty instruction. */


     if (src & SLJIT_IMM) {
-        if (dst <= TMP_REGISTER) {
+        if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
             return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
 #else
@@ -1031,9 +1031,9 @@
         return SLJIT_SUCCESS;
     }


-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


-    if ((dst & SLJIT_MEM) && src <= TMP_REGISTER)
+    if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
         dst_r = src;
     else {
         inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
@@ -1058,8 +1058,8 @@
     sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_F7;
         *inst |= opcode;
@@ -1073,7 +1073,7 @@
         *inst |= opcode;
         return SLJIT_SUCCESS;
     }
-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
         inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
         FAIL_IF(!inst);
@@ -1081,12 +1081,12 @@
         *inst |= opcode;
         return SLJIT_SUCCESS;
     }
-    EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+    EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
     FAIL_IF(!inst);
     *inst++ = GROUP_F7;
     *inst |= opcode;
-    EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+    EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     return SLJIT_SUCCESS;
 }


@@ -1097,17 +1097,17 @@
     sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_F7;
         *inst |= NOT_rm;
-        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst = OR_r_rm;
         return SLJIT_SUCCESS;
     }
-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         EMIT_MOV(compiler, dst, 0, src, srcw);
         inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
         FAIL_IF(!inst);
@@ -1118,15 +1118,15 @@
         *inst = OR_r_rm;
         return SLJIT_SUCCESS;
     }
-    EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+    EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+    inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
     FAIL_IF(!inst);
     *inst++ = GROUP_F7;
     *inst |= NOT_rm;
-    inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, TMP_REGISTER, 0);
+    inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
     FAIL_IF(!inst);
     *inst = OR_r_rm;
-    EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+    EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     return SLJIT_SUCCESS;
 }


@@ -1140,15 +1140,15 @@
     SLJIT_UNUSED_ARG(op_flags);
     if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
         /* Just set the zero flag. */
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src, srcw);
-        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_F7;
         *inst |= NOT_rm;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
 #else
-        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
 #endif
         FAIL_IF(!inst);
         *inst |= SHR;
@@ -1156,24 +1156,24 @@
     }


     if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_IMM, srcw);
-        src = TMP_REGISTER;
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
+        src = TMP_REG1;
         srcw = 0;
     }


-    inst = emit_x86_instruction(compiler, 2, TMP_REGISTER, 0, src, srcw);
+    inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
     FAIL_IF(!inst);
     *inst++ = GROUP_0F;
     *inst = BSR_r_rm;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-    if (dst <= TMP_REGISTER)
+    if (FAST_IS_REG(dst))
         dst_r = dst;
     else {
         /* Find an unused temporary register. */
-        if ((dst & 0xf) != SLJIT_SCRATCH_REG1 && (dst & 0xf0) != (SLJIT_SCRATCH_REG1 << 4))
+        if ((dst & REG_MASK) != SLJIT_SCRATCH_REG1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
             dst_r = SLJIT_SCRATCH_REG1;
-        else if ((dst & 0xf) != SLJIT_SCRATCH_REG2 && (dst & 0xf0) != (SLJIT_SCRATCH_REG2 << 4))
+        else if ((dst & REG_MASK) != SLJIT_SCRATCH_REG2 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG2))
             dst_r = SLJIT_SCRATCH_REG2;
         else
             dst_r = SLJIT_SCRATCH_REG3;
@@ -1181,7 +1181,7 @@
     }
     EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
 #else
-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
     compiler->mode32 = 0;
     EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
     compiler->mode32 = op_flags & SLJIT_INT_OP;
@@ -1191,7 +1191,7 @@
         get_cpu_features();


     if (cpu_has_cmov) {
-        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst++ = GROUP_0F;
         *inst = CMOVNE_r_rm;
@@ -1204,7 +1204,7 @@
         *inst++ = JE_i8;
         *inst++ = 2;
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REGISTER];
+        *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
 #else
         inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
         FAIL_IF(!inst);
@@ -1212,9 +1212,9 @@


         *inst++ = JE_i8;
         *inst++ = 3;
-        *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REGISTER] >= 8 ? REX_B : 0);
+        *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
         *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REGISTER];
+        *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
 #endif
     }


@@ -1271,7 +1271,7 @@
 #endif

         if (op_flags & SLJIT_INT_OP) {
-            if (src <= TMP_REGISTER && src == dst) {
+            if (FAST_IS_REG(src) && src == dst) {
                 if (!TYPE_CAST_NEEDED(op))
                     return SLJIT_SUCCESS;
             }
@@ -1322,8 +1322,8 @@
 #endif
         }


-        if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & 0xf) && (srcw != 0 || (src & 0xf0) != 0)) {
-            inst = emit_x86_instruction(compiler, 1, src & 0xf, 0, src, srcw);
+        if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
+            inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
             FAIL_IF(!inst);
             *inst = LEA_r_m;
             src &= SLJIT_MEM | 0xf;
@@ -1333,7 +1333,7 @@
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
         if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
             SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_LOCALS_REG));
-            dst = TMP_REGISTER;
+            dst = TMP_REG1;
         }
 #endif


@@ -1369,12 +1369,12 @@
         }


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REGISTER)
-            return emit_mov(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), dstw, TMP_REGISTER, 0);
+        if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
+            return emit_mov(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), dstw, TMP_REG1, 0);
 #endif


-        if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & 0xf) && (dstw != 0 || (dst & 0xf0) != 0)) {
-            inst = emit_x86_instruction(compiler, 1, dst & 0xf, 0, dst, dstw);
+        if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
+            inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
             FAIL_IF(!inst);
             *inst = LEA_r_m;
         }
@@ -1447,12 +1447,12 @@
     sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
-            BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
+            BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
         }
         else {
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
@@ -1472,20 +1472,20 @@
                 BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
             }
         }
-        else if (dst <= TMP_REGISTER) {
+        else if (FAST_IS_REG(dst)) {
             inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
-        else if (src2 <= TMP_REGISTER) {
+        else if (FAST_IS_REG(src2)) {
             /* Special exception for sljit_emit_op_flags. */
             inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
         else {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src2, src2w);
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
@@ -1506,19 +1506,19 @@
                 BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
             }
         }
-        else if (dst <= TMP_REGISTER) {
+        else if (FAST_IS_REG(dst)) {
             inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
-        else if (src1 <= TMP_REGISTER) {
+        else if (FAST_IS_REG(src1)) {
             inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
         else {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
@@ -1526,7 +1526,7 @@
     }


     /* General version. */
-    if (dst <= TMP_REGISTER) {
+    if (FAST_IS_REG(dst)) {
         EMIT_MOV(compiler, dst, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
             BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
@@ -1539,16 +1539,16 @@
     }
     else {
         /* This version requires less memory writing. */
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
-            BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
+            BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
         }
         else {
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
-        EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     }


     return SLJIT_SUCCESS;
@@ -1563,12 +1563,12 @@
     sljit_ub* inst;


     if (dst == SLJIT_UNUSED) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
-            BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
+            BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
         }
         else {
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
@@ -1588,19 +1588,19 @@
                 BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
             }
         }
-        else if (dst <= TMP_REGISTER) {
+        else if (FAST_IS_REG(dst)) {
             inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
-        else if (src2 <= TMP_REGISTER) {
+        else if (FAST_IS_REG(src2)) {
             inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
         else {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src2, src2w);
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, dst, dstw);
+            EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
             FAIL_IF(!inst);
             *inst = op_mr;
         }
@@ -1608,7 +1608,7 @@
     }


     /* General version. */
-    if (dst <= TMP_REGISTER && dst != src2) {
+    if (FAST_IS_REG(dst) && dst != src2) {
         EMIT_MOV(compiler, dst, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
             BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
@@ -1621,16 +1621,16 @@
     }
     else {
         /* This version requires less memory writing. */
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         if (src2 & SLJIT_IMM) {
-            BINARY_IMM(op_imm, op_mr, src2w, TMP_REGISTER, 0);
+            BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
         }
         else {
-            inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
             FAIL_IF(!inst);
             *inst = op_rm;
         }
-        EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     }


     return SLJIT_SUCCESS;
@@ -1644,7 +1644,7 @@
     sljit_ub* inst;
     sljit_si dst_r;


-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


     /* Register destination. */
     if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
@@ -1752,7 +1752,7 @@
     else {
         /* Neither argument is immediate. */
         if (ADDRESSING_DEPENDS_ON(src2, dst_r))
-            dst_r = TMP_REGISTER;
+            dst_r = TMP_REG1;
         EMIT_MOV(compiler, dst_r, 0, src1, src1w);
         inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
         FAIL_IF(!inst);
@@ -1760,8 +1760,8 @@
         *inst = IMUL_r_rm;
     }


-    if (dst_r == TMP_REGISTER)
-        EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+    if (dst_r == TMP_REG1)
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);


     return SLJIT_SUCCESS;
 }
@@ -1782,10 +1782,10 @@
             return SLJIT_ERR_UNSUPPORTED;
     }


-    dst_r = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


-    if (src1 <= TMP_REGISTER) {
-        if (src2 <= TMP_REGISTER || src2 == TMP_REGISTER) {
+    if (FAST_IS_REG(src1)) {
+        if (FAST_IS_REG(src2)) {
             inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
             FAIL_IF(!inst);
             *inst = LEA_r_m;
@@ -1803,7 +1803,7 @@
             done = 1;
         }
     }
-    else if (src2 <= TMP_REGISTER) {
+    else if (FAST_IS_REG(src2)) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
             inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
@@ -1818,8 +1818,8 @@
     }


     if (done) {
-        if (dst_r == TMP_REGISTER)
-            return emit_mov(compiler, dst, dstw, TMP_REGISTER, 0);
+        if (dst_r == TMP_REG1)
+            return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
         return SLJIT_SUCCESS;
     }
     return SLJIT_ERR_UNSUPPORTED;
@@ -1840,7 +1840,7 @@
         return SLJIT_SUCCESS;
     }


-    if (src1 <= TMP_REGISTER) {
+    if (FAST_IS_REG(src1)) {
         if (src2 & SLJIT_IMM) {
             BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
         }
@@ -1852,7 +1852,7 @@
         return SLJIT_SUCCESS;
     }


-    if (src2 <= TMP_REGISTER && !(src1 & SLJIT_IMM)) {
+    if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
         inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
         FAIL_IF(!inst);
         *inst = CMP_rm_r;
@@ -1861,15 +1861,15 @@


     if (src2 & SLJIT_IMM) {
         if (src1 & SLJIT_IMM) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            src1 = TMP_REGISTER;
+            EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+            src1 = TMP_REG1;
             src1w = 0;
         }
         BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
     }
     else {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
         FAIL_IF(!inst);
         *inst = CMP_r_rm;
     }
@@ -1900,7 +1900,7 @@
         return SLJIT_SUCCESS;
     }


-    if (src1 <= TMP_REGISTER) {
+    if (FAST_IS_REG(src1)) {
         if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
             if (IS_HALFWORD(src2w) || compiler->mode32) {
@@ -1928,7 +1928,7 @@
         return SLJIT_SUCCESS;
     }


-    if (src2 <= TMP_REGISTER) {
+    if (FAST_IS_REG(src2)) {
         if (src1 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
             if (IS_HALFWORD(src1w) || compiler->mode32) {
@@ -1956,28 +1956,28 @@
         return SLJIT_SUCCESS;
     }


-    EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+    EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
     if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         if (IS_HALFWORD(src2w) || compiler->mode32) {
-            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
+            inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
             FAIL_IF(!inst);
             *inst = GROUP_F7;
         }
         else {
             FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
-            inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REGISTER, 0);
+            inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
             FAIL_IF(!inst);
             *inst = TEST_rm_r;
         }
 #else
-        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst = GROUP_F7;
 #endif
     }
     else {
-        inst = emit_x86_instruction(compiler, 1, TMP_REGISTER, 0, src2, src2w);
+        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
         FAIL_IF(!inst);
         *inst = TEST_rm_r;
     }
@@ -2000,21 +2000,21 @@
             return SLJIT_SUCCESS;
         }
         if (dst == SLJIT_UNUSED) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
+            EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
             FAIL_IF(!inst);
             *inst |= mode;
             return SLJIT_SUCCESS;
         }
         if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+            EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+            inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
             FAIL_IF(!inst);
             *inst |= mode;
-            EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+            EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
             return SLJIT_SUCCESS;
         }
-        if (dst <= TMP_REGISTER) {
+        if (FAST_IS_REG(dst)) {
             EMIT_MOV(compiler, dst, 0, src1, src1w);
             inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
             FAIL_IF(!inst);
@@ -2022,36 +2022,36 @@
             return SLJIT_SUCCESS;
         }


-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
-        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst |= mode;
-        EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
         return SLJIT_SUCCESS;
     }


     if (dst == SLJIT_PREF_SHIFT_REG) {
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst |= mode;
-        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
     }
-    else if (dst <= TMP_REGISTER && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
+    else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
         if (src1 != dst)
             EMIT_MOV(compiler, dst, 0, src1, src1w);
-        EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_PREF_SHIFT_REG, 0);
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
         FAIL_IF(!inst);
         *inst |= mode;
-        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
     }
     else {
         /* This case is really difficult, since ecx itself may used for
            addressing, and we must ensure to work even in that case. */
-        EMIT_MOV(compiler, TMP_REGISTER, 0, src1, src1w);
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
 #else
@@ -2059,7 +2059,7 @@
         EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
 #endif
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REGISTER, 0);
+        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst |= mode;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -2067,7 +2067,7 @@
 #else
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw));
 #endif
-        EMIT_MOV(compiler, dst, dstw, TMP_REGISTER, 0);
+        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
     }


     return SLJIT_SUCCESS;
@@ -2098,12 +2098,12 @@
     if (!set_flags)
         return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);


-    if (!(dst <= TMP_REGISTER))
+    if (!FAST_IS_REG(dst))
         FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));


     FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));


-    if (dst <= TMP_REGISTER)
+    if (FAST_IS_REG(dst))
         return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
     return SLJIT_SUCCESS;
 }
@@ -2329,7 +2329,7 @@


     if (GET_OPCODE(op) == SLJIT_CMPD) {
         compiler->flags_saved = 0;
-        if (dst <= SLJIT_FLOAT_REG6)
+        if (FAST_IS_REG(dst))
             dst_r = dst;
         else {
             dst_r = TMP_FREG;
@@ -2339,15 +2339,15 @@
     }


     if (op == SLJIT_MOVD) {
-        if (dst <= SLJIT_FLOAT_REG6)
+        if (FAST_IS_REG(dst))
             return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
-        if (src <= SLJIT_FLOAT_REG6)
+        if (FAST_IS_REG(src))
             return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
         return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
     }


-    if (dst >= SLJIT_FLOAT_REG1 && dst <= SLJIT_FLOAT_REG6) {
+    if (SLOW_IS_REG(dst)) {
         dst_r = dst;
         if (dst != src)
             FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
@@ -2386,7 +2386,7 @@
     compiler->mode32 = 1;
 #endif


-    if (dst <= SLJIT_FLOAT_REG6) {
+    if (FAST_IS_REG(dst)) {
         dst_r = dst;
         if (dst == src1)
             ; /* Do nothing here. */
@@ -2548,8 +2548,8 @@
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
         if (src == SLJIT_SCRATCH_REG3) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src, 0);
-            src = TMP_REGISTER;
+            EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
+            src = TMP_REG1;
         }
         if (src == SLJIT_MEM1(SLJIT_LOCALS_REG) && type >= SLJIT_CALL3)
             srcw += sizeof(sljit_sw);
@@ -2557,8 +2557,8 @@
 #endif
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
         if (src == SLJIT_SCRATCH_REG3) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, src, 0);
-            src = TMP_REGISTER;
+            EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
+            src = TMP_REG1;
         }
 #endif
         FAIL_IF(call_with_args(compiler, type));
@@ -2626,22 +2626,22 @@
     cond_set = get_jump_code(type) + 0x10;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && dst <= TMP_REGISTER && dst == src) {
+    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
         inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
         FAIL_IF(!inst);
         INC_SIZE(4 + 3);
         /* Set low register to conditional flag. */
-        *inst++ = (reg_map[TMP_REGISTER] <= 7) ? REX : REX_B;
+        *inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
         *inst++ = GROUP_0F;
         *inst++ = cond_set;
-        *inst++ = MOD_REG | reg_lmap[TMP_REGISTER];
-        *inst++ = REX | (reg_map[TMP_REGISTER] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
+        *inst++ = MOD_REG | reg_lmap[TMP_REG1];
+        *inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
         *inst++ = OR_rm8_r8;
-        *inst++ = MOD_REG | (reg_lmap[TMP_REGISTER] << 3) | reg_lmap[dst];
+        *inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
         return SLJIT_SUCCESS;
     }


-    reg = (op == SLJIT_MOV && dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;


     inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
     FAIL_IF(!inst);
@@ -2656,19 +2656,19 @@
     *inst++ = MOVZX_r_rm8;
     *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];


-    if (reg != TMP_REGISTER)
+    if (reg != TMP_REG1)
         return SLJIT_SUCCESS;


     if (GET_OPCODE(op) < SLJIT_ADD) {
         compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
-        return emit_mov(compiler, dst, dstw, TMP_REGISTER, 0);
+        return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
     }
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
     compiler->skip_checks = 1;
 #endif
-    return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REGISTER, 0);
+    return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
 #else /* SLJIT_CONFIG_X86_64 */
-    if (GET_OPCODE(op) < SLJIT_ADD && dst <= TMP_REGISTER) {
+    if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
         if (reg_map[dst] <= 4) {
             /* Low byte is accessible. */
             inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
@@ -2690,7 +2690,7 @@
             get_cpu_features();


         if (cpu_has_cmov) {
-            EMIT_MOV(compiler, TMP_REGISTER, 0, SLJIT_IMM, 1);
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
             /* a xor reg, reg operation would overwrite the flags. */
             EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);


@@ -2701,14 +2701,14 @@
             *inst++ = GROUP_0F;
             /* cmovcc = setcc - 0x50. */
             *inst++ = cond_set - 0x50;
-            *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REGISTER];
+            *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
             return SLJIT_SUCCESS;
         }


         inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
         FAIL_IF(!inst);
         INC_SIZE(1 + 3 + 3 + 1);
-        *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
         /* Set al to conditional flag. */
         *inst++ = GROUP_0F;
         *inst++ = cond_set;
@@ -2717,24 +2717,24 @@
         *inst++ = GROUP_0F;
         *inst++ = MOVZX_r_rm8;
         *inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
-        *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
         return SLJIT_SUCCESS;
     }


-    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && dst <= TMP_REGISTER && dst == src && reg_map[dst] <= 4) {
+    if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
         SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SCRATCH_REG1] == 0, scratch_reg1_must_be_eax);
         if (dst != SLJIT_SCRATCH_REG1) {
             inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
             FAIL_IF(!inst);
             INC_SIZE(1 + 3 + 2 + 1);
             /* Set low register to conditional flag. */
-            *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+            *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
             *inst++ = GROUP_0F;
             *inst++ = cond_set;
             *inst++ = MOD_REG | 0 /* eax */;
             *inst++ = OR_rm8_r8;
             *inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
-            *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+            *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
         }
         else {
             inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
@@ -2742,23 +2742,23 @@
             INC_SIZE(2 + 3 + 2 + 2);
             /* Set low register to conditional flag. */
             *inst++ = XCHG_r_rm;
-            *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REGISTER];
+            *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
             *inst++ = GROUP_0F;
             *inst++ = cond_set;
             *inst++ = MOD_REG | 1 /* ecx */;
             *inst++ = OR_rm8_r8;
             *inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
             *inst++ = XCHG_r_rm;
-            *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REGISTER];
+            *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
         }
         return SLJIT_SUCCESS;
     }


-    /* Set TMP_REGISTER to the bit. */
+    /* Set TMP_REG1 to the bit. */
     inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
     FAIL_IF(!inst);
     INC_SIZE(1 + 3 + 3 + 1);
-    *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+    *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
     /* Set al to conditional flag. */
     *inst++ = GROUP_0F;
     *inst++ = cond_set;
@@ -2768,15 +2768,15 @@
     *inst++ = MOVZX_r_rm8;
     *inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;


-    *inst++ = XCHG_EAX_r + reg_map[TMP_REGISTER];
+    *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];


     if (GET_OPCODE(op) < SLJIT_ADD)
-        return emit_mov(compiler, dst, dstw, TMP_REGISTER, 0);
+        return emit_mov(compiler, dst, dstw, TMP_REG1, 0);


 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
     compiler->skip_checks = 1;
 #endif
-    return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REGISTER, 0);
+    return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
 #endif /* SLJIT_CONFIG_X86_64 */
 }


@@ -2796,12 +2796,12 @@

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     if (NOT_HALFWORD(offset)) {
-        FAIL_IF(emit_load_imm64(compiler, TMP_REGISTER, offset));
+        FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
-        SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REGISTER, 0) != SLJIT_ERR_UNSUPPORTED);
+        SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
         return compiler->error;
 #else
-        return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REGISTER, 0);
+        return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0);
 #endif
     }
 #endif
@@ -2831,13 +2831,13 @@


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     compiler->mode32 = 0;
-    reg = (dst <= TMP_REGISTER) ? dst : TMP_REGISTER;
+    reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     if (emit_load_imm64(compiler, reg, init_value))
         return NULL;
 #else
     if (dst == SLJIT_UNUSED)
-        dst = TMP_REGISTER;
+        dst = TMP_REG1;


     if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
         return NULL;
@@ -2850,8 +2850,8 @@
     *inst++ = 1;


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-    if (reg == TMP_REGISTER && dst != SLJIT_UNUSED)
-        if (emit_mov(compiler, dst, dstw, TMP_REGISTER, 0))
+    if (dst & SLJIT_MEM)
+        if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
             return NULL;
 #endif