[Pcre-svn] [904] code/trunk/src: JIT compiler update.

Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [904] code/trunk/src: JIT compiler update.
Revision: 904
          http://www.exim.org/viewvc/pcre2?view=rev&revision=904
Author:   zherczeg
Date:     2018-01-05 09:30:45 +0000 (Fri, 05 Jan 2018)
Log Message:
-----------
JIT compiler update.
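
This revision removes the auto-updating SLJIT_MOVU* move opcodes from the
sljit layer and replaces them with the new sljit_emit_mem() / sljit_emit_fmem()
interface, which emits a single load or store with base-register update only
when the target CPU actually has such an instruction form. The sketch below is
not part of the patch (the fill_words helper, register choices and offsets are
illustrative, and error checking is omitted); it only shows the
probe-then-specialize pattern that the pcre2_jit_compile.c hunks apply:

  #include "sljitLir.h"

  /* Fill `length` stack words starting at `offset` with the value in R0. */
  static void fill_words(struct sljit_compiler *compiler, sljit_sw offset, sljit_sw length)
  {
  struct sljit_label *loop;

  /* With SLJIT_MEM_SUPP nothing is emitted; the call only reports whether
     a store with pre-update addressing exists on this target. */
  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE,
      SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
    {
    /* Fast loop: the store instruction itself advances the base register,
       so R1 starts one word before the area. */
    sljit_get_local_base(compiler, SLJIT_R1, 0, offset - (sljit_sw)sizeof(sljit_sw));
    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length);
    loop = sljit_emit_label(compiler);
    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE,
        SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
    sljit_set_label(sljit_emit_jump(compiler, SLJIT_NOT_ZERO), loop);
    }
  else
    {
    /* Portable fallback: plain store followed by an explicit ADD on the base. */
    sljit_get_local_base(compiler, SLJIT_R1, 0, offset);
    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length);
    loop = sljit_emit_label(compiler);
    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
    sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
    sljit_set_label(sljit_emit_jump(compiler, SLJIT_NOT_ZERO), loop);
    }
  }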


Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/sljit/sljitConfigInternal.h
    code/trunk/src/sljit/sljitLir.c
    code/trunk/src/sljit/sljitLir.h
    code/trunk/src/sljit/sljitNativeARM_32.c
    code/trunk/src/sljit/sljitNativeARM_64.c
    code/trunk/src/sljit/sljitNativeARM_T2_32.c
    code/trunk/src/sljit/sljitNativeMIPS_common.c
    code/trunk/src/sljit/sljitNativePPC_common.c
    code/trunk/src/sljit/sljitNativeSPARC_common.c
    code/trunk/src/sljit/sljitNativeX86_32.c
    code/trunk/src/sljit/sljitNativeX86_64.c
    code/trunk/src/sljit/sljitNativeX86_common.c


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/pcre2_jit_compile.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -573,16 +573,13 @@


 #if PCRE2_CODE_UNIT_WIDTH == 8
 #define MOV_UCHAR  SLJIT_MOV_U8
-#define MOVU_UCHAR SLJIT_MOVU_U8
 #define IN_UCHARS(x) (x)
 #elif PCRE2_CODE_UNIT_WIDTH == 16
 #define MOV_UCHAR  SLJIT_MOV_U16
-#define MOVU_UCHAR SLJIT_MOVU_U16
 #define UCHAR_SHIFT (1)
 #define IN_UCHARS(x) ((x) * 2)
 #elif PCRE2_CODE_UNIT_WIDTH == 32
 #define MOV_UCHAR  SLJIT_MOV_U32
-#define MOVU_UCHAR SLJIT_MOVU_U32
 #define UCHAR_SHIFT (2)
 #define IN_UCHARS(x) ((x) * 4)
 #else
@@ -2712,12 +2709,25 @@
   }
 else
   {
-  GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START);
-  OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
-  loop = LABEL();
-  OP1(SLJIT_MOVU, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_R0, 0);
-  OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
-  JUMPTO(SLJIT_NOT_ZERO, loop);
+  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
+    {
+    GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START);
+    OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
+    loop = LABEL();
+    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
+    OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
+    JUMPTO(SLJIT_NOT_ZERO, loop);
+    }
+  else
+    {
+    GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START + sizeof(sljit_sw));
+    OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
+    loop = LABEL();
+    OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
+    OP2(SLJIT_ADD, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
+    OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
+    JUMPTO(SLJIT_NOT_ZERO, loop);
+    }
   }
 }


@@ -2750,12 +2760,25 @@
   }
 else
   {
-  GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + sizeof(sljit_sw));
-  OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
-  loop = LABEL();
-  OP1(SLJIT_MOVU, SLJIT_MEM1(TMP2), sizeof(sljit_sw), TMP1, 0);
-  OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
-  JUMPTO(SLJIT_NOT_ZERO, loop);
+  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)) == SLJIT_SUCCESS)
+    {
+    GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + sizeof(sljit_sw));
+    OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
+    loop = LABEL();
+    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
+    OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
+    JUMPTO(SLJIT_NOT_ZERO, loop);
+    }
+  else
+    {
+    GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + 2 * sizeof(sljit_sw));
+    OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
+    loop = LABEL();
+    OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0);
+    OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, sizeof(sljit_sw));
+    OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
+    JUMPTO(SLJIT_NOT_ZERO, loop);
+    }
   }


OP1(SLJIT_MOV, STACK_TOP, 0, ARGUMENTS, 0);
@@ -2796,6 +2819,7 @@
{
DEFINE_COMPILER;
struct sljit_label *loop;
+BOOL has_pre;

/* At this point we can freely use all registers. */
OP1(SLJIT_MOV, SLJIT_S2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1));
@@ -2812,21 +2836,31 @@
OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, match_data),
SLJIT_IMM, SLJIT_OFFSETOF(pcre2_match_data, ovector) - sizeof(PCRE2_SIZE));

-GET_LOCAL_BASE(SLJIT_S0, 0, OVECTOR_START);
+has_pre = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)) == SLJIT_SUCCESS;
+
+GET_LOCAL_BASE(SLJIT_S0, 0, OVECTOR_START - (has_pre ? sizeof(sljit_sw) : 0));
OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, begin));

loop = LABEL();
-OP2(SLJIT_SUB, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_S0), 0, SLJIT_R0, 0);
-OP2(SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, sizeof(sljit_sw));
+
+if (has_pre)
+ sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
+else
+ {
+ OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_S0), 0);
+ OP2(SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, sizeof(sljit_sw));
+ }
+
+OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, sizeof(PCRE2_SIZE));
+OP2(SLJIT_SUB, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_R0, 0);
/* Copy the integer value to the output buffer */
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
OP2(SLJIT_ASHR, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, UCHAR_SHIFT);
#endif
+
SLJIT_ASSERT(sizeof(PCRE2_SIZE) == 4 || sizeof(PCRE2_SIZE) == 8);
-if (sizeof(PCRE2_SIZE) == 4)
- OP1(SLJIT_MOVU_U32, SLJIT_MEM1(SLJIT_R2), sizeof(PCRE2_SIZE), SLJIT_S1, 0);
-else
- OP1(SLJIT_MOVU, SLJIT_MEM1(SLJIT_R2), sizeof(PCRE2_SIZE), SLJIT_S1, 0);
+OP1(((sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV), SLJIT_MEM1(SLJIT_R2), 0, SLJIT_S1, 0);
+
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, loop);

@@ -2833,15 +2867,31 @@
 /* Calculate the return value, which is the maximum ovector value. */
 if (topbracket > 1)
   {
-  GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + topbracket * 2 * sizeof(sljit_sw));
-  OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);
+  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw))) == SLJIT_SUCCESS)
+    {
+    GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + topbracket * 2 * sizeof(sljit_sw));
+    OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);


-  /* OVECTOR(0) is never equal to SLJIT_S2. */
-  loop = LABEL();
-  OP1(SLJIT_MOVU, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw)));
-  OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
-  CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
-  OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
+    /* OVECTOR(0) is never equal to SLJIT_S2. */
+    loop = LABEL();
+    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw)));
+    OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
+    CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
+    OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
+    }
+  else
+    {
+    GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + (topbracket - 1) * 2 * sizeof(sljit_sw));
+    OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);
+
+    /* OVECTOR(0) is never equal to SLJIT_S2. */
+    loop = LABEL();
+    OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), 0);
+    OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * (sljit_sw)sizeof(sljit_sw));
+    OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
+    CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
+    OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
+    }
   }
 else
   OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1);
@@ -5988,85 +6038,184 @@
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }


-#define CHAR1 STR_END
-#define CHAR2 STACK_TOP
-
static void do_casefulcmp(compiler_common *common)
{
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_label *label;
+int char1_reg;
+int char2_reg;

-sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+if (sljit_get_register_index(TMP3) < 0)
+ {
+ char1_reg = STR_END;
+ char2_reg = STACK_TOP;
+ }
+else
+ {
+ char1_reg = TMP3;
+ char2_reg = RETURN_ADDR;
+ }
+
+sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0);
-OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, CHAR2, 0);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

-label = LABEL();
-OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
-jump = CMP(SLJIT_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
-JUMPTO(SLJIT_NOT_ZERO, label);
+if (char1_reg == STR_END)
+ {
+ OP1(SLJIT_MOV, TMP3, 0, char1_reg, 0);
+ OP1(SLJIT_MOV, RETURN_ADDR, 0, char2_reg, 0);
+ }

-JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0);
-OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+ {
+ label = LABEL();
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+ jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
+ OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPTO(SLJIT_NOT_ZERO, label);
+
+ JUMPHERE(jump);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+ }
+else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+ {
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+ label = LABEL();
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+ jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
+ OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPTO(SLJIT_NOT_ZERO, label);
+
+ JUMPHERE(jump);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ }
+else
+ {
+ label = LABEL();
+ OP1(MOV_UCHAR, char1_reg, 0, SLJIT_MEM1(TMP1), 0);
+ OP1(MOV_UCHAR, char2_reg, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
+ OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPTO(SLJIT_NOT_ZERO, label);
+
+ JUMPHERE(jump);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+ }
+
+if (char1_reg == STR_END)
+ {
+ OP1(SLJIT_MOV, char1_reg, 0, TMP3, 0);
+ OP1(SLJIT_MOV, char2_reg, 0, RETURN_ADDR, 0);
+ }
+
+sljit_emit_fast_return(compiler, TMP1, 0);
}

-#define LCC_TABLE STACK_LIMIT
-
static void do_caselesscmp(compiler_common *common)
{
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_label *label;
+int char1_reg = STR_END;
+int char2_reg;
+int lcc_table;
+int opt_type = 0;

-sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+if (sljit_get_register_index(TMP3) < 0)
+ {
+ char2_reg = STACK_TOP;
+ lcc_table = STACK_LIMIT;
+ }
+else
+ {
+ char2_reg = RETURN_ADDR;
+ lcc_table = TMP3;
+ }
+
+if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+ opt_type = 1;
+else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+ opt_type = 2;
+
+sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

-OP1(SLJIT_MOV, TMP3, 0, LCC_TABLE, 0);
-OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, CHAR1, 0);
-OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, CHAR2, 0);
-OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, char1_reg, 0);

-label = LABEL();
-OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+if (char2_reg == STACK_TOP)
+ {
+ OP1(SLJIT_MOV, TMP3, 0, char2_reg, 0);
+ OP1(SLJIT_MOV, RETURN_ADDR, 0, lcc_table, 0);
+ }
+
+OP1(SLJIT_MOV, lcc_table, 0, SLJIT_IMM, common->lcc);
+
+if (opt_type == 1)
+ {
+ label = LABEL();
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+ }
+else if (opt_type == 2)
+ {
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+ label = LABEL();
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+ sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+ }
+else
+ {
+ label = LABEL();
+ OP1(MOV_UCHAR, char1_reg, 0, SLJIT_MEM1(TMP1), 0);
+ OP1(MOV_UCHAR, char2_reg, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+ }
+
#if PCRE2_CODE_UNIT_WIDTH != 8
-jump = CMP(SLJIT_GREATER, CHAR1, 0, SLJIT_IMM, 255);
+jump = CMP(SLJIT_GREATER, char1_reg, 0, SLJIT_IMM, 255);
#endif
-OP1(SLJIT_MOV_U8, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0);
+OP1(SLJIT_MOV_U8, char1_reg, 0, SLJIT_MEM2(lcc_table, char1_reg), 0);
#if PCRE2_CODE_UNIT_WIDTH != 8
JUMPHERE(jump);
-jump = CMP(SLJIT_GREATER, CHAR2, 0, SLJIT_IMM, 255);
+jump = CMP(SLJIT_GREATER, char2_reg, 0, SLJIT_IMM, 255);
#endif
-OP1(SLJIT_MOV_U8, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0);
+OP1(SLJIT_MOV_U8, char2_reg, 0, SLJIT_MEM2(lcc_table, char2_reg), 0);
#if PCRE2_CODE_UNIT_WIDTH != 8
JUMPHERE(jump);
#endif
-jump = CMP(SLJIT_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
+
+if (opt_type == 0)
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_NOT_ZERO, label);

JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0);
-OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
-OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+
+if (opt_type == 2)
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+if (char2_reg == STACK_TOP)
+ {
+ OP1(SLJIT_MOV, char2_reg, 0, TMP3, 0);
+ OP1(SLJIT_MOV, lcc_table, 0, RETURN_ADDR, 0);
+ }
+
+OP1(SLJIT_MOV, char1_reg, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
+sljit_emit_fast_return(compiler, TMP1, 0);
}

-#undef LCC_TABLE
-#undef CHAR1
-#undef CHAR2
-
#if defined SUPPORT_UNICODE

static PCRE2_SPTR SLJIT_FUNC do_utf_caselesscmp(PCRE2_SPTR src1, PCRE2_SPTR src2, PCRE2_SPTR end1, PCRE2_SPTR end2)
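
The do_casefulcmp()/do_caselesscmp() rework above also changes how the
fast-call return address is handled: sljit_emit_fast_enter() now stores it
into a stack local instead of the RETURN_ADDR register, so that register can
be reused as a scratch inside the helper on targets without virtual
registers, and sljit_emit_fast_return() is given a scratch register loaded
back from that local. A minimal sketch of that shape (the LOCAL0 offset and
register choice are illustrative, error checking omitted):

  #include "sljitLir.h"

  #define LOCAL0 0  /* illustrative stack offset for the saved return address */

  static void fast_helper_sketch(struct sljit_compiler *compiler)
  {
  /* Entry: spill the return address to [SP + LOCAL0]. */
  sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCAL0);

  /* ... helper body; the register that used to hold the return address
     is now free for other work ... */

  /* Exit: reload the address into a scratch register and return through it. */
  sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
  sljit_emit_fast_return(compiler, SLJIT_R1, 0);
  }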

Modified: code/trunk/src/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/src/sljit/sljitConfigInternal.h    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitConfigInternal.h    2018-01-05 09:30:45 UTC (rev 904)
@@ -588,13 +588,13 @@


#elif (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)

-#define SLJIT_NUMBER_OF_REGISTERS 25
+#define SLJIT_NUMBER_OF_REGISTERS 26
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
#define SLJIT_LOCALS_OFFSET_BASE (2 * sizeof(sljit_sw))

#elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)

-#define SLJIT_NUMBER_OF_REGISTERS 22
+#define SLJIT_NUMBER_OF_REGISTERS 23
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 17
#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) || (defined _AIX)
#define SLJIT_LOCALS_OFFSET_BASE ((6 + 8) * sizeof(sljit_sw))

Modified: code/trunk/src/sljit/sljitLir.c
===================================================================
--- code/trunk/src/sljit/sljitLir.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitLir.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -99,10 +99,10 @@


 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 #define TYPE_CAST_NEEDED(op) \
-    (((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S32) || ((op) >= SLJIT_MOVU_U8 && (op) <= SLJIT_MOVU_S32))
+    ((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S32)
 #else
 #define TYPE_CAST_NEEDED(op) \
-    (((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S16) || ((op) >= SLJIT_MOVU_U8 && (op) <= SLJIT_MOVU_S16))
+    ((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S16)
 #endif


 #define BUF_SIZE    4096
@@ -685,80 +685,106 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)


 #define FUNCTION_CHECK_IS_REG(r) \
-    (((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) || \
-    ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0))
+    (((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) \
+    || ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0))


-#define FUNCTION_CHECK_IS_REG_OR_UNUSED(r) \
-    ((r) == SLJIT_UNUSED || \
-    ((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) || \
-    ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0))
+#define FUNCTION_CHECK_IS_FREG(fr) \
+    (((fr) >= SLJIT_FR0 && (fr) < (SLJIT_FR0 + compiler->fscratches)) \
+    || ((fr) > (SLJIT_FS0 - compiler->fsaveds) && (fr) <= SLJIT_FS0))


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-#define CHECK_NOT_VIRTUAL_REGISTER(p) \
-    CHECK_ARGUMENT((p) < SLJIT_R3 || (p) > SLJIT_R6);
+#define CHECK_IF_VIRTUAL_REGISTER(p) ((p) <= SLJIT_S3 && (p) >= SLJIT_S8)
 #else
-#define CHECK_NOT_VIRTUAL_REGISTER(p)
+#define CHECK_IF_VIRTUAL_REGISTER(p) 0
 #endif


-#define FUNCTION_CHECK_SRC(p, i) \
-    CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1); \
-    if (FUNCTION_CHECK_IS_REG(p)) \
-        CHECK_ARGUMENT((i) == 0); \
-    else if ((p) == SLJIT_IMM) \
-        ; \
-    else if ((p) == (SLJIT_MEM1(SLJIT_SP))) \
-        CHECK_ARGUMENT((i) >= 0 && (i) < compiler->logical_local_size); \
-    else { \
-        CHECK_ARGUMENT((p) & SLJIT_MEM); \
-        CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG_OR_UNUSED((p) & REG_MASK)); \
-        CHECK_NOT_VIRTUAL_REGISTER((p) & REG_MASK); \
-        if ((p) & OFFS_REG_MASK) { \
-            CHECK_ARGUMENT(((p) & REG_MASK) != SLJIT_UNUSED); \
-            CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
-            CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
-            CHECK_ARGUMENT(!((i) & ~0x3)); \
-        } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
+static sljit_s32 function_check_src_mem(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+    if (compiler->scratches == -1 || compiler->saveds == -1)
+        return 0;
+
+    if (!(p & SLJIT_MEM))
+        return 0;
+
+    if (!((p & REG_MASK) == SLJIT_UNUSED || FUNCTION_CHECK_IS_REG(p & REG_MASK)))
+        return 0;
+
+    if (CHECK_IF_VIRTUAL_REGISTER(p & REG_MASK))
+        return 0;
+
+    if (p & OFFS_REG_MASK) {
+        if ((p & REG_MASK) == SLJIT_UNUSED)
+            return 0;
+
+        if (!(FUNCTION_CHECK_IS_REG(OFFS_REG(p))))
+            return 0;
+
+        if (CHECK_IF_VIRTUAL_REGISTER(OFFS_REG(p)))
+            return 0;
+
+        if ((i & ~0x3) != 0)
+            return 0;
     }


+    return (p & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK)) == 0;
+}
+
+#define FUNCTION_CHECK_SRC_MEM(p, i) \
+    CHECK_ARGUMENT(function_check_src_mem(compiler, p, i));
+
+static sljit_s32 function_check_src(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+    if (compiler->scratches == -1 || compiler->saveds == -1)
+        return 0;
+
+    if (FUNCTION_CHECK_IS_REG(p))
+        return (i == 0);
+
+    if (p == SLJIT_IMM)
+        return 1;
+
+    if (p == SLJIT_MEM1(SLJIT_SP))
+        return (i >= 0 && i < compiler->logical_local_size);
+
+    return function_check_src_mem(compiler, p, i);
+}
+
+#define FUNCTION_CHECK_SRC(p, i) \
+    CHECK_ARGUMENT(function_check_src(compiler, p, i));
+
+static sljit_s32 function_check_dst(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i, sljit_s32 unused)
+{
+    if (compiler->scratches == -1 || compiler->saveds == -1)
+        return 0;
+
+    if (FUNCTION_CHECK_IS_REG(p) || ((unused) && (p) == SLJIT_UNUSED))
+        return (i == 0);
+
+    if (p == SLJIT_MEM1(SLJIT_SP))
+        return (i >= 0 && i < compiler->logical_local_size);
+
+    return function_check_src_mem(compiler, p, i);
+}
+
 #define FUNCTION_CHECK_DST(p, i, unused) \
-    CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1); \
-    if (FUNCTION_CHECK_IS_REG(p) || ((unused) && (p) == SLJIT_UNUSED)) \
-        CHECK_ARGUMENT((i) == 0); \
-    else if ((p) == (SLJIT_MEM1(SLJIT_SP))) \
-        CHECK_ARGUMENT((i) >= 0 && (i) < compiler->logical_local_size); \
-    else { \
-        CHECK_ARGUMENT((p) & SLJIT_MEM); \
-        CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG_OR_UNUSED((p) & REG_MASK)); \
-        CHECK_NOT_VIRTUAL_REGISTER((p) & REG_MASK); \
-        if ((p) & OFFS_REG_MASK) { \
-            CHECK_ARGUMENT(((p) & REG_MASK) != SLJIT_UNUSED); \
-            CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
-            CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
-            CHECK_ARGUMENT(!((i) & ~0x3)); \
-        } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
-    }
+    CHECK_ARGUMENT(function_check_dst(compiler, p, i, unused));


+static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+    if (compiler->scratches == -1 || compiler->saveds == -1)
+        return 0;
+
+    if (FUNCTION_CHECK_IS_FREG(p))
+        return (i == 0);
+
+    if (p == SLJIT_MEM1(SLJIT_SP))
+        return (i >= 0 && i < compiler->logical_local_size);
+
+    return function_check_src_mem(compiler, p, i);
+}
+
 #define FUNCTION_FCHECK(p, i) \
-    CHECK_ARGUMENT(compiler->fscratches != -1 && compiler->fsaveds != -1); \
-    if (((p) >= SLJIT_FR0 && (p) < (SLJIT_FR0 + compiler->fscratches)) || \
-            ((p) > (SLJIT_FS0 - compiler->fsaveds) && (p) <= SLJIT_FS0)) \
-        CHECK_ARGUMENT(i == 0); \
-    else if ((p) == (SLJIT_MEM1(SLJIT_SP))) \
-        CHECK_ARGUMENT((i) >= 0 && (i) < compiler->logical_local_size); \
-    else { \
-        CHECK_ARGUMENT((p) & SLJIT_MEM); \
-        CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG_OR_UNUSED((p) & REG_MASK)); \
-        CHECK_NOT_VIRTUAL_REGISTER((p) & REG_MASK); \
-        if ((p) & OFFS_REG_MASK) { \
-            CHECK_ARGUMENT(((p) & REG_MASK) != SLJIT_UNUSED); \
-            CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(OFFS_REG(p))); \
-            CHECK_NOT_VIRTUAL_REGISTER(OFFS_REG(p)); \
-            CHECK_ARGUMENT(((p) & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SP) && !(i & ~0x3)); \
-        } \
-        CHECK_ARGUMENT(!((p) & ~(SLJIT_MEM | REG_MASK | OFFS_REG_MASK))); \
-    }
+    CHECK_ARGUMENT(function_fcheck(compiler, p, i));


#endif /* SLJIT_ARGUMENT_CHECKS */

@@ -779,64 +805,72 @@
 #    define SLJIT_PRINT_D    ""
 #endif


-#define sljit_verbose_reg(compiler, r) \
-    do { \
-        if ((r) < (SLJIT_R0 + compiler->scratches)) \
-            fprintf(compiler->verbose, "r%d", (r) - SLJIT_R0); \
-        else if ((r) != SLJIT_SP) \
-            fprintf(compiler->verbose, "s%d", SLJIT_NUMBER_OF_REGISTERS - (r)); \
-        else \
-            fprintf(compiler->verbose, "sp"); \
-    } while (0)
+static void sljit_verbose_reg(struct sljit_compiler *compiler, sljit_s32 r)
+{
+    if (r < (SLJIT_R0 + compiler->scratches))
+        fprintf(compiler->verbose, "r%d", r - SLJIT_R0);
+    else if (r != SLJIT_SP)
+        fprintf(compiler->verbose, "s%d", SLJIT_NUMBER_OF_REGISTERS - r);
+    else
+        fprintf(compiler->verbose, "sp");
+}


-#define sljit_verbose_param(compiler, p, i) \
-    if ((p) & SLJIT_IMM) \
-        fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", (i)); \
-    else if ((p) & SLJIT_MEM) { \
-        if ((p) & REG_MASK) { \
-            fputc('[', compiler->verbose); \
-            sljit_verbose_reg(compiler, (p) & REG_MASK); \
-            if ((p) & OFFS_REG_MASK) { \
-                fprintf(compiler->verbose, " + "); \
-                sljit_verbose_reg(compiler, OFFS_REG(p)); \
-                if (i) \
-                    fprintf(compiler->verbose, " * %d", 1 << (i)); \
-            } \
-            else if (i) \
-                fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", (i)); \
-            fputc(']', compiler->verbose); \
-        } \
-        else \
-            fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i)); \
-    } else if (p) \
-        sljit_verbose_reg(compiler, p); \
-    else \
+static void sljit_verbose_freg(struct sljit_compiler *compiler, sljit_s32 r)
+{
+    if (r < (SLJIT_FR0 + compiler->fscratches))
+        fprintf(compiler->verbose, "fr%d", r - SLJIT_FR0);
+    else
+        fprintf(compiler->verbose, "fs%d", SLJIT_NUMBER_OF_FLOAT_REGISTERS - r);
+}
+
+static void sljit_verbose_param(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+    if ((p) & SLJIT_IMM)
+        fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", (i));
+    else if ((p) & SLJIT_MEM) {
+        if ((p) & REG_MASK) {
+            fputc('[', compiler->verbose);
+            sljit_verbose_reg(compiler, (p) & REG_MASK);
+            if ((p) & OFFS_REG_MASK) {
+                fprintf(compiler->verbose, " + ");
+                sljit_verbose_reg(compiler, OFFS_REG(p));
+                if (i)
+                    fprintf(compiler->verbose, " * %d", 1 << (i));
+            }
+            else if (i)
+                fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", (i));
+            fputc(']', compiler->verbose);
+        }
+        else
+            fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i));
+    } else if (p)
+        sljit_verbose_reg(compiler, p);
+    else
         fprintf(compiler->verbose, "unused");
+}


-#define sljit_verbose_fparam(compiler, p, i) \
-    if ((p) & SLJIT_MEM) { \
-        if ((p) & REG_MASK) { \
-            fputc('[', compiler->verbose); \
-            sljit_verbose_reg(compiler, (p) & REG_MASK); \
-            if ((p) & OFFS_REG_MASK) { \
-                fprintf(compiler->verbose, " + "); \
-                sljit_verbose_reg(compiler, OFFS_REG(p)); \
-                if (i) \
-                    fprintf(compiler->verbose, "%d", 1 << (i)); \
-            } \
-            else if (i) \
-                fprintf(compiler->verbose, "%" SLJIT_PRINT_D "d", (i)); \
-            fputc(']', compiler->verbose); \
-        } \
-        else \
-            fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i)); \
-    } \
-    else { \
-        if ((p) < (SLJIT_FR0 + compiler->fscratches)) \
-            fprintf(compiler->verbose, "fr%d", (p) - SLJIT_FR0); \
-        else \
-            fprintf(compiler->verbose, "fs%d", SLJIT_NUMBER_OF_FLOAT_REGISTERS - (p)); \
+static void sljit_verbose_fparam(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+    if ((p) & SLJIT_MEM) {
+        if ((p) & REG_MASK) {
+            fputc('[', compiler->verbose);
+            sljit_verbose_reg(compiler, (p) & REG_MASK);
+            if ((p) & OFFS_REG_MASK) {
+                fprintf(compiler->verbose, " + ");
+                sljit_verbose_reg(compiler, OFFS_REG(p));
+                if (i)
+                    fprintf(compiler->verbose, "%d", 1 << (i));
+            }
+            else if (i)
+                fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", (i));
+            fputc(']', compiler->verbose);
+        }
+        else
+            fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i));
     }
+    else
+        sljit_verbose_freg(compiler, p);
+}


 static const char* op0_names[] = {
     (char*)"breakpoint", (char*)"nop", (char*)"lmul.uw", (char*)"lmul.sw",
@@ -1070,6 +1104,7 @@
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     FUNCTION_CHECK_SRC(src, srcw);
+    CHECK_ARGUMENT(src != SLJIT_IMM);
     compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1128,9 +1163,6 @@
     case SLJIT_MOV:
     case SLJIT_MOV_U32:
     case SLJIT_MOV_P:
-    case SLJIT_MOVU:
-    case SLJIT_MOVU_U32:
-    case SLJIT_MOVU_P:
         /* Nothing allowed */
         CHECK_ARGUMENT(!(op & (SLJIT_I32_OP | SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
         break;
@@ -1143,28 +1175,17 @@
     FUNCTION_CHECK_DST(dst, dstw, 1);
     FUNCTION_CHECK_SRC(src, srcw);


-    if (GET_OPCODE(op) >= SLJIT_NOT)
+    if (GET_OPCODE(op) >= SLJIT_NOT) {
+        CHECK_ARGUMENT(src != SLJIT_IMM);
         compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_I32_OP | SLJIT_SET_Z));
-    else if (GET_OPCODE(op) >= SLJIT_MOVU) {
-        CHECK_ARGUMENT(!(src & SLJIT_MEM) || (src & REG_MASK) != SLJIT_SP);
-        CHECK_ARGUMENT(!(dst & SLJIT_MEM) || (dst & REG_MASK) != SLJIT_SP);
-        if ((src & REG_MASK) != SLJIT_UNUSED) {
-            CHECK_ARGUMENT((src & REG_MASK) != (dst & REG_MASK) && (src & REG_MASK) != OFFS_REG(dst));
-            CHECK_ARGUMENT((src & OFFS_REG_MASK) == SLJIT_UNUSED || srcw == 0);
-        }
-        if ((dst & REG_MASK) != SLJIT_UNUSED) {
-            CHECK_ARGUMENT((dst & REG_MASK) != OFFS_REG(src));
-            CHECK_ARGUMENT((dst & OFFS_REG_MASK) == SLJIT_UNUSED || dstw == 0);
-        }
-        compiler->last_flags = 0;
     }
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-        if (GET_OPCODE(op) <= SLJIT_MOVU_P)
+        if (GET_OPCODE(op) <= SLJIT_MOV_P)
         {
-            fprintf(compiler->verbose, "  mov%s%s%s ", (GET_OPCODE(op) >= SLJIT_MOVU) ? "u" : "",
-                !(op & SLJIT_I32_OP) ? "" : "32", (op != SLJIT_MOV32 && op != SLJIT_MOVU32) ? op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE] : "");
+            fprintf(compiler->verbose, "  mov%s%s ", !(op & SLJIT_I32_OP) ? "" : "32",
+                (op != SLJIT_MOV32) ? op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE] : "");
         }
         else
         {
@@ -1746,6 +1767,8 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
     CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
+
+    CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1);
     CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
     if (src != SLJIT_IMM) {
         CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src));
@@ -1762,7 +1785,7 @@
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {
         fprintf(compiler->verbose, "  cmov%s %s%s, ",
-            !(dst_reg & SLJIT_I32_OP) ? "" : ".i",
+            !(dst_reg & SLJIT_I32_OP) ? "" : "32",
             jump_names[type & 0xff], JUMP_POSTFIX(type));
         sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
         fprintf(compiler->verbose, ", ");
@@ -1773,6 +1796,72 @@
     CHECK_RETURN_OK;
 }


+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P);
+    CHECK_ARGUMENT(!(type & SLJIT_I32_OP) || ((type & 0xff) != SLJIT_MOV && (type & 0xff) != SLJIT_MOV_U32 && (type & 0xff) != SLJIT_MOV_P));
+    CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
+    CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
+    CHECK_ARGUMENT((type & ~(0xff | SLJIT_I32_OP | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
+
+    FUNCTION_CHECK_SRC_MEM(mem, memw);
+    CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg));
+
+    CHECK_ARGUMENT((mem & REG_MASK) != SLJIT_UNUSED && (mem & REG_MASK) != reg);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+    if (!(type & SLJIT_MEM_SUPP) && SLJIT_UNLIKELY(!!compiler->verbose)) {
+        if (sljit_emit_mem(compiler, type | SLJIT_MEM_SUPP, reg, mem, memw) == SLJIT_ERR_UNSUPPORTED)
+            fprintf(compiler->verbose, "  //");
+
+        fprintf(compiler->verbose, "  mem%s.%s%s%s ",
+            !(type & SLJIT_I32_OP) ? "" : "32",
+            (type & SLJIT_MEM_STORE) ? "st" : "ld",
+            op1_names[(type & 0xff) - SLJIT_OP1_BASE],
+            (type & SLJIT_MEM_PRE) ? ".pre" : ".post");
+        sljit_verbose_reg(compiler, reg);
+        fprintf(compiler->verbose, ", ");
+        sljit_verbose_param(compiler, mem, memw);
+        fprintf(compiler->verbose, "\n");
+    }
+#endif
+    CHECK_RETURN_OK;
+}
+
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 freg,
+    sljit_s32 mem, sljit_sw memw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
+    CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
+    CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
+    CHECK_ARGUMENT((type & ~(0xff | SLJIT_I32_OP | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
+
+    FUNCTION_CHECK_SRC_MEM(mem, memw);
+    CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+    if (!(type & SLJIT_MEM_SUPP) && SLJIT_UNLIKELY(!!compiler->verbose)) {
+        if (sljit_emit_fmem(compiler, type | SLJIT_MEM_SUPP, freg, mem, memw) == SLJIT_ERR_UNSUPPORTED)
+            fprintf(compiler->verbose, "  //");
+
+        fprintf(compiler->verbose, "  fmem.%s%s%s ",
+            (type & SLJIT_MEM_STORE) ? "st" : "ld",
+            !(type & SLJIT_I32_OP) ? ".f64" : ".f32",
+            (type & SLJIT_MEM_PRE) ? ".pre" : ".post");
+        sljit_verbose_freg(compiler, freg);
+        fprintf(compiler->verbose, ", ");
+        sljit_verbose_param(compiler, mem, memw);
+        fprintf(compiler->verbose, "\n");
+    }
+#endif
+    CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
     /* Any offset is allowed. */
@@ -2046,6 +2135,49 @@
     return sljit_emit_jump(compiler, type);
 }


+#if !(defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+    && !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+    && !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(reg);
+    SLJIT_UNUSED_ARG(mem);
+    SLJIT_UNUSED_ARG(memw);
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
+#endif
+
+#if !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+    && !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 freg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(freg);
+    SLJIT_UNUSED_ARG(mem);
+    SLJIT_UNUSED_ARG(memw);
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
+#endif
+
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
@@ -2398,6 +2530,28 @@
     return SLJIT_ERR_UNSUPPORTED;
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 reg, sljit_s32 mem, sljit_sw memw)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(reg);
+    SLJIT_UNUSED_ARG(mem);
+    SLJIT_UNUSED_ARG(memw);
+    SLJIT_UNREACHABLE();
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 mem, sljit_sw memw)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(freg);
+    SLJIT_UNUSED_ARG(mem);
+    SLJIT_UNUSED_ARG(memw);
+    SLJIT_UNREACHABLE();
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
     SLJIT_UNUSED_ARG(compiler);


Modified: code/trunk/src/sljit/sljitLir.h
===================================================================
--- code/trunk/src/sljit/sljitLir.h    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitLir.h    2018-01-05 09:30:45 UTC (rev 904)
@@ -153,8 +153,8 @@
         is not available at all.
 */


-/* When SLJIT_UNUSED is specified as the destination of sljit_emit_op1 and
-   and sljit_emit_op2 operations the result is discarded. If no status
+/* When SLJIT_UNUSED is specified as the destination of sljit_emit_op1
+   or sljit_emit_op2 operations the result is discarded. If no status
    flags are set, no instructions are emitted for these operations. Data
    prefetch is a special exception, see SLJIT_MOV operation. Other SLJIT
    operations do not support SLJIT_UNUSED as a destination operand. */
@@ -422,15 +422,8 @@
     sljit_uw shift_imm;
 #endif


-#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
-    sljit_s32 cache_arg;
-    sljit_sw cache_argw;
-#endif
-
 #if (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
     sljit_sw imm;
-    sljit_s32 cache_arg;
-    sljit_sw cache_argw;
 #endif


 #if (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
@@ -565,12 +558,10 @@
 #define SLJIT_HAS_FPU            0
 /* [Limitation] Some registers are virtual registers. */
 #define SLJIT_HAS_VIRTUAL_REGISTERS    1
-/* [Emulated] Some forms of move with pre update is supported. */
-#define SLJIT_HAS_PRE_UPDATE        2
 /* [Emulated] Count leading zero is supported. */
-#define SLJIT_HAS_CLZ            3
+#define SLJIT_HAS_CLZ            2
 /* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_CMOV            4
+#define SLJIT_HAS_CMOV            3


 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 /* [Not emulated] SSE2 support is available on x86. */
@@ -657,26 +648,31 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 src, sljit_sw srcw);


-/* Fast calling mechanism for utility functions (see SLJIT_FAST_CALL). All registers and
- even the stack frame is passed to the callee. The return address is preserved in
- dst/dstw by sljit_emit_fast_enter (the type of the value stored by this function
- is sljit_p), and sljit_emit_fast_return can use this as a return value later. */
+/* Generating entry and exit points for fast call functions (see SLJIT_FAST_CALL).
+ Both sljit_emit_fast_enter and sljit_emit_fast_return functions preserve the
+ values of all registers and stack frame. The return address is stored in the
+ dst argument of sljit_emit_fast_enter, and this return address can be passed
+ to sljit_emit_fast_return to continue the execution after the fast call.

-/* Note: only for sljit specific, non ABI compilant calls. Fast, since only a few machine
- instructions are needed. Excellent for small uility functions, where saving registers
- and setting up a new stack frame would cost too much performance. However, it is still
- possible to return to the address of the caller (or anywhere else). */
+ Fast calls are cheap operations (usually only a single call instruction is
+ emitted) but they do not preserve any registers. However the callee function
+ can freely use / update any registers and stack values which can be
+ efficiently exploited by various optimizations. Registers can be saved
+ manually by the callee function if needed.

-/* Note: may destroy flags. */
+ Although returning to a different address by sljit_emit_fast_return is possible,
+ this address usually cannot be predicted by the return address predictor of
+ modern CPUs which may reduce performance. Furthermore using sljit_emit_ijump
+ to return is also inefficient since return address prediction is usually
+ triggered by a specific form of ijump.

-/* Note: although sljit_emit_fast_return could be replaced by an ijump, it is not suggested,
- since many architectures do clever branch prediction on call / return instruction pairs. */
+ Flags: - (does not modify flags). */

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw);
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw);

 /*
-   Source and destination values for arithmetical instructions
+   Source and destination operands for arithmetical instructions
     imm              - a simple immediate value (cannot be used as a destination)
     reg              - any of the registers (immediate argument must be 0)
     [imm]            - absolute immediate memory address
@@ -717,6 +713,9 @@
    arm-t2: [reg+imm], -255 <= imm <= 4095
            [reg+(reg<<imm)] is supported
            Write back is supported only for [reg+imm], where -255 <= imm <= 255
+   arm64:  [reg+imm], -256 <= imm <= 255, 0 <= aligned imm <= 4095 * alignment
+           [reg+(reg<<imm)] is supported
+           Write back is supported only for [reg+imm], where -256 <= imm <= 255
    ppc:    [reg+imm], -65536 <= imm <= 65535. 64 bit loads/stores and 32 bit
                 signed load on 64 bit requires immediates divisible by 4.
                 [reg+imm] is not supported for signed 8 bit values.
@@ -728,8 +727,7 @@
            [reg+reg] is supported
 */


-/* Register output: simply the name of the register.
-   For destination, you can use SLJIT_UNUSED as well. */
+/* Macros for specifying operand types. */
 #define SLJIT_MEM        0x80
 #define SLJIT_MEM0()        (SLJIT_MEM)
 #define SLJIT_MEM1(r1)        (SLJIT_MEM | (r1))
@@ -898,43 +896,14 @@
    S32 - signed int (32 bit) data transfer
    P   - pointer (sljit_p) data transfer


-   U = move with update (pre form). If source or destination defined as
-       SLJIT_MEM1(r1) or SLJIT_MEM2(r1, r2), r1 is increased by the
-       offset part of the address.
-
-   Register arguments and base registers can only be used once for move
-   with update instructions. The shift value of SLJIT_MEM2 addressing
-   mode must also be 0. Reason: SLJIT_MOVU instructions are expected to
-   be in high-performance loops where complex instruction emulation
-   would be too costly.
-
-   Examples for invalid move with update instructions:
-
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), 8);
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0, SLJIT_R0, 0);
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 0, SLJIT_MEM1(SLJIT_R0), 8);
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 0, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0);
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_R2, 0, SLJIT_MEM2(SLJIT_R0, SLJIT_R1), 1);
-
-   The following example is valid, since only the offset register is
-   used multiple times:
-
-   sljit_emit_op1(..., SLJIT_MOVU_U8,
-       SLJIT_MEM2(SLJIT_R0, SLJIT_R2), 0, SLJIT_MEM2(SLJIT_R1, SLJIT_R2), 0);
-
-   If the destination of a MOV without update instruction is SLJIT_UNUSED
-   and the source operand is a memory address the compiler emits a prefetch
-   instruction if this instruction is supported by the current CPU.
-   Higher data sizes bring the data closer to the core: a MOV with word
-   size loads the data into a higher level cache than a byte size. Otherwise
-   the type does not affect the prefetch instruction. Furthermore a prefetch
-   instruction never fails, so it can be used to prefetch a data from an
-   address and check whether that address is NULL afterwards.
+   If the destination of a MOV instruction is SLJIT_UNUSED and the source
+   operand is a memory address the compiler emits a prefetch instruction
+   if this instruction is supported by the current CPU. Higher data sizes
+   bring the data closer to the core: a MOV with word size loads the data
+   into a higher level cache than a byte size. Otherwise the type does not
+   affect the prefetch instruction. Furthermore a prefetch instruction
+   never fails, so it can be used to prefetch a data from an address and
+   check whether that address is NULL afterwards.
 */


 /* Flags: - (does not modify flags) */
@@ -959,41 +928,23 @@
 #define SLJIT_MOV_S32            (SLJIT_OP1_BASE + 6)
 /* Flags: - (does not modify flags) */
 #define SLJIT_MOV32            (SLJIT_MOV_S32 | SLJIT_I32_OP)
-/* Flags: - (does not modify flags) */
+/* Flags: - (does not modify flags)
+   Note: load a pointer sized data, useful on x32 (a 32 bit mode on x86-64
+         where all x64 features are available, e.g. 16 register) or similar
+         compiling modes */
 #define SLJIT_MOV_P            (SLJIT_OP1_BASE + 7)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU            (SLJIT_OP1_BASE + 8)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU_U8            (SLJIT_OP1_BASE + 9)
-#define SLJIT_MOVU32_U8            (SLJIT_MOVU_U8 | SLJIT_I32_OP)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU_S8            (SLJIT_OP1_BASE + 10)
-#define SLJIT_MOVU32_S8            (SLJIT_MOVU_S8 | SLJIT_I32_OP)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU_U16            (SLJIT_OP1_BASE + 11)
-#define SLJIT_MOVU32_U16            (SLJIT_MOVU_U16 | SLJIT_I32_OP)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU_S16            (SLJIT_OP1_BASE + 12)
-#define SLJIT_MOVU32_S16        (SLJIT_MOVU_S16 | SLJIT_I32_OP)
-/* Flags: - (may destroy flags)
-   Note: no SLJIT_MOVU32_U32 form, since it is the same as SLJIT_MOVU32 */
-#define SLJIT_MOVU_U32            (SLJIT_OP1_BASE + 13)
-/* Flags: - (may destroy flags)
-   Note: no SLJIT_MOVU32_S32 form, since it is the same as SLJIT_MOVU32 */
-#define SLJIT_MOVU_S32            (SLJIT_OP1_BASE + 14)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU32            (SLJIT_MOVU_S32 | SLJIT_I32_OP)
-/* Flags: - (may destroy flags) */
-#define SLJIT_MOVU_P            (SLJIT_OP1_BASE + 15)
-/* Flags: Z */
-#define SLJIT_NOT            (SLJIT_OP1_BASE + 16)
+/* Flags: Z
+   Note: immediate source argument is not supported */
+#define SLJIT_NOT            (SLJIT_OP1_BASE + 8)
 #define SLJIT_NOT32            (SLJIT_NOT | SLJIT_I32_OP)
-/* Flags: Z | OVERFLOW */
-#define SLJIT_NEG            (SLJIT_OP1_BASE + 17)
+/* Flags: Z | OVERFLOW
+   Note: immediate source argument is not supported */
+#define SLJIT_NEG            (SLJIT_OP1_BASE + 9)
 #define SLJIT_NEG32            (SLJIT_NEG | SLJIT_I32_OP)
 /* Count leading zeroes
-   Flags: - (may destroy flags) */
-#define SLJIT_CLZ            (SLJIT_OP1_BASE + 18)
+   Flags: - (may destroy flags)
+   Note: immediate source argument is not supported */
+#define SLJIT_CLZ            (SLJIT_OP1_BASE + 10)
 #define SLJIT_CLZ32            (SLJIT_CLZ | SLJIT_I32_OP)


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
@@ -1294,7 +1245,7 @@

 /* Emit a conditional mov instruction which moves source to destination,
    if the condition is satisfied. Unlike other arithmetic operations this
-   instruction does not support memory accesses.
+   instruction does not support memory access.


    type must be between SLJIT_EQUAL and SLJIT_ORDERED_F64
    dst_reg must be a valid register and it can be combined
@@ -1306,6 +1257,51 @@
     sljit_s32 dst_reg,
     sljit_s32 src, sljit_sw srcw);


+/* The following flags are used by sljit_emit_mem() and sljit_emit_fmem(). */
+
+/* When SLJIT_MEM_SUPP is passed, no instructions are emitted.
+   Instead the function returns with SLJIT_SUCCESS if the instruction
+   form is supported and SLJIT_ERR_UNSUPPORTED otherwise. This flag
+   allows runtime checking of available instruction forms. */
+#define SLJIT_MEM_SUPP        0x0200
+/* Memory load operation. This is the default. */
+#define SLJIT_MEM_LOAD        0x0000
+/* Memory store operation. */
+#define SLJIT_MEM_STORE        0x0400
+/* Base register is updated before the memory access. */
+#define SLJIT_MEM_PRE        0x0800
+/* Base register is updated after the memory access. */
+#define SLJIT_MEM_POST        0x1000
+
+/* Emit a single memory load or store with update instruction. When the
+   requested instruction from is not supported by the CPU, it returns
+   with SLJIT_ERR_UNSUPPORTED instead of emulating the instruction. This
+   allows specializing tight loops based on the supported instruction
+   forms (see SLJIT_MEM_SUPP flag).
+
+   type must be between SLJIT_MOV and SLJIT_MOV_P and can be
+     combined with SLJIT_MEM_* flags. Either SLJIT_MEM_PRE
+     or SLJIT_MEM_POST must be specified.
+   reg is the source or destination register, and must be
+     different from the base register of the mem operand
+   mem must be a SLJIT_MEM1() or SLJIT_MEM2() operand
+
+   Flags: - (does not modify flags) */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw);
+
+/* Same as sljit_emit_mem except the following:
+
+   type must be SLJIT_MOV_F64 or SLJIT_MOV_F32 and can be
+     combined with SLJIT_MEM_* flags. Either SLJIT_MEM_PRE
+     or SLJIT_MEM_POST must be specified.
+   freg is the source or destination floating point register */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 freg,
+    sljit_s32 mem, sljit_sw memw);
+
 /* Copies the base address of SLJIT_SP + offset to dst. The offset can be
    anything to negate the effect of relative addressing. For example if an
    array of sljit_sw values is stored on the stack from offset 0x40, and R0
@@ -1358,9 +1354,9 @@
 #if (defined SLJIT_UTIL_STACK && SLJIT_UTIL_STACK)


 /* The sljit_stack is a utility extension of sljit, which provides
-   a top-down stack. The stack starts at base and goes down to
-   max_limit, so the memory region for this stack is between
-   max_limit (inclusive) and base (exclusive). However the
+   a top-down stack. The stack top is stored in base and the stack
+   goes down to max_limit, so the memory region for this stack is
+   between max_limit (inclusive) and base (exclusive). However the
    application can only use the region between limit (inclusive)
    and base (exclusive). The sljit_stack_resize can be used to
    extend this region up to max_limit.
@@ -1368,8 +1364,8 @@
    This feature uses the "address space reserve" feature of modern
    operating systems, so instead of allocating a huge memory block
    applications can allocate a small region and extend it later
-   without moving the memory area. Hence pointers can be stored
-   in this area. */
+   without moving the memory area. Hence the region is never moved
+   so pointers are valid after resize. */


 /* Note: base and max_limit fields are aligned to PAGE_SIZE bytes
      (usually 4 Kbyte or more).
@@ -1389,9 +1385,13 @@
 };


 /* Returns NULL if unsuccessful.
-   Note: max_limit contains the maximum stack size in bytes.
-   Note: limit contains the starting stack size in bytes.
-   Note: the top field is initialized to base.
+
+   Note:
+     max_limit field contains the lower bound address of the stack.
+     limit field contains the current starting address of the stack.
+     base field contains the end address of the stack.
+     top field is initialized to base.
+
    Note: see sljit_create_compiler for the explanation of allocator_data. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_stack* SLJIT_FUNC sljit_allocate_stack(sljit_uw limit, sljit_uw max_limit, void *allocator_data);
 SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_free_stack(struct sljit_stack *stack, void *allocator_data);
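
For reference, a minimal usage sketch of the sljit_emit_fmem() entry point
documented in the hunk above (registers and the step size are illustrative;
the call is added to the sljit API here but is not used by the
pcre2_jit_compile.c hunks in this revision):

  #include "sljitLir.h"

  /* Load a double with post-update addressing when the CPU has that form. */
  static int load_f64_post(struct sljit_compiler *compiler)
  {
  sljit_s32 type = SLJIT_MOV_F64 | SLJIT_MEM_POST;

  if (sljit_emit_fmem(compiler, type | SLJIT_MEM_SUPP, SLJIT_FR0,
      SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64)) != SLJIT_SUCCESS)
    return 0;  /* form not available on this target; caller must fall back */

  /* Loads [R0] into FR0, then advances R0 by sizeof(sljit_f64). */
  sljit_emit_fmem(compiler, type, SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), sizeof(sljit_f64));
  return 1;
  }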


Modified: code/trunk/src/sljit/sljitNativeARM_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_32.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeARM_32.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -837,7 +837,6 @@
         return 1;
 #endif


-    case SLJIT_HAS_PRE_UPDATE:
     case SLJIT_HAS_CLZ:
     case SLJIT_HAS_CMOV:
         return 1;
@@ -852,18 +851,16 @@
 /* --------------------------------------------------------------------- */


 /* Creates an index in data_transfer_insts array. */
-#define WORD_DATA    0x00
-#define BYTE_DATA    0x01
-#define HALF_DATA    0x02
-#define PRELOAD_DATA    0x03
-#define SIGNED_DATA    0x04
+#define WORD_SIZE    0x00
+#define BYTE_SIZE    0x01
+#define HALF_SIZE    0x02
+#define PRELOAD        0x03
+#define SIGNED        0x04
 #define LOAD_DATA    0x08


-/* emit_op inp_flags.
-   WRITE_BACK must be the first, since it is a flag. */
-#define WRITE_BACK    0x10
-#define ALLOW_IMM    0x20
-#define ALLOW_INV_IMM    0x40
+/* Flag bits for emit_op. */
+#define ALLOW_IMM    0x10
+#define ALLOW_INV_IMM    0x20
 #define ALLOW_ANY_IMM    (ALLOW_IMM | ALLOW_INV_IMM)


/* s/l - store/load (1 bit)
@@ -884,7 +881,7 @@
/* l u w */ 0xe5100000 /* ldr */,
/* l u b */ 0xe5500000 /* ldrb */,
/* l u h */ 0xe11000b0 /* ldrh */,
-/* l u p */ 0xf5500000 /* preload data */,
+/* l u p */ 0xf5500000 /* preload */,
/* l s w */ 0xe5100000 /* ldr */,
/* l s b */ 0xe11000d0 /* ldrsb */,
/* l s h */ 0xe11000f0 /* ldrsh */,
@@ -891,8 +888,8 @@
/* l s N */ 0x00000000 /* not allowed */,
};

-#define EMIT_DATA_TRANSFER(type, add, wb, target_reg, base_reg, arg) \
-    (data_transfer_insts[(type) & 0xf] | ((add) << 23) | ((wb) << (21 - 4)) | RD(target_reg) | RN(base_reg) | (arg))
+#define EMIT_DATA_TRANSFER(type, add, target_reg, base_reg, arg) \
+    (data_transfer_insts[(type) & 0xf] | ((add) << 23) | RD(target_reg) | RN(base_reg) | (arg))


 /* Normal ldr/str instruction.
    Type2: ldrsb, ldrh, ldrsh */
@@ -1325,7 +1322,7 @@
     FAIL_IF(generate_int(compiler, reg, ~imm, 0));


     /* Load integer. */
-    return push_inst_with_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, reg, TMP_PC, 0), imm);
+    return push_inst_with_literal(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1, reg, TMP_PC, 0), imm);
 #else
     FAIL_IF(push_inst(compiler, MOVW | RD(reg) | ((imm << 4) & 0xf0000) | (imm & 0xfff)));
     if (imm <= 0xffff)
@@ -1337,16 +1334,13 @@
 static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
     sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
-    sljit_uw offset_reg, imm;
+    sljit_uw imm, offset_reg;
     sljit_uw is_type1_transfer = IS_TYPE1_TRANSFER(flags);


     SLJIT_ASSERT (arg & SLJIT_MEM);
     SLJIT_ASSERT((arg & REG_MASK) != tmp_reg);


-    SLJIT_COMPILE_ASSERT(WRITE_BACK == 0x10, optimized_for_emit_data_transfer);
-
     if ((arg & REG_MASK) == SLJIT_UNUSED) {
-        /* Write back is not used. */
         if (is_type1_transfer) {
             FAIL_IF(load_immediate(compiler, tmp_reg, argw & ~0xfff));
             argw &= 0xfff;
@@ -1356,7 +1350,8 @@
             argw &= 0xff;
         }


-        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, tmp_reg, is_type1_transfer ? argw : TYPE2_TRANSFER_IMM(argw)));
+        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, tmp_reg,
+            is_type1_transfer ? argw : TYPE2_TRANSFER_IMM(argw)));
     }


     if (arg & OFFS_REG_MASK) {
@@ -1365,14 +1360,12 @@
         argw &= 0x3;


         if (argw != 0 && !is_type1_transfer) {
-            SLJIT_ASSERT(!(flags & WRITE_BACK));
-
             FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | RM(offset_reg) | (argw << 7)));
-            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, tmp_reg, TYPE2_TRANSFER_IMM(0)));
+            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, tmp_reg, TYPE2_TRANSFER_IMM(0)));
         }


         /* Bit 25: RM is offset. */
-        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & WRITE_BACK, reg, arg,
+        return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg,
             RM(offset_reg) | (is_type1_transfer ? (1 << 25) : 0) | (argw << 7)));
     }


@@ -1382,60 +1375,55 @@
         if (argw > 0xfff) {
             imm = get_imm(argw & ~0xfff);
             if (imm) {
-                offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, ADD | RD(offset_reg) | RN(arg) | imm));
+                FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | imm));
                 argw = argw & 0xfff;
-                arg = offset_reg;
+                arg = tmp_reg;
             }
         }
         else if (argw < -0xfff) {
             imm = get_imm(-argw & ~0xfff);
             if (imm) {
-                offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, SUB | RD(offset_reg) | RN(arg) | imm));
+                FAIL_IF(push_inst(compiler, SUB | RD(tmp_reg) | RN(arg) | imm));
                 argw = -(-argw & 0xfff);
-                arg = offset_reg;
+                arg = tmp_reg;
             }
         }


-        if (argw >= 0 && argw <= 0xfff) {
-            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & WRITE_BACK, reg, arg & REG_MASK, argw));
-        }
-        if (argw < 0 && argw >= -0xfff) {
-            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, flags & WRITE_BACK, reg, arg & REG_MASK, -argw));
-        }
+        if (argw >= 0 && argw <= 0xfff)
+            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg, argw));
+
+        if (argw < 0 && argw >= -0xfff)
+            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, reg, arg, -argw));
     }
     else {
         if (argw > 0xff) {
             imm = get_imm(argw & ~0xff);
             if (imm) {
-                offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, ADD | RD(offset_reg) | RN(arg) | imm));
+                FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | imm));
                 argw = argw & 0xff;
-                arg = offset_reg;
+                arg = tmp_reg;
             }
         }
         else if (argw < -0xff) {
             imm = get_imm(-argw & ~0xff);
             if (imm) {
-                offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, SUB | RD(offset_reg) | RN(arg) | imm));
+                FAIL_IF(push_inst(compiler, SUB | RD(tmp_reg) | RN(arg) | imm));
                 argw = -(-argw & 0xff);
-                arg = offset_reg;
+                arg = tmp_reg;
             }
         }


-        if (argw >= 0 && argw <= 0xff) {
-            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & WRITE_BACK, reg, arg, TYPE2_TRANSFER_IMM(argw)));
-        }
+        if (argw >= 0 && argw <= 0xff)
+            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg, TYPE2_TRANSFER_IMM(argw)));
+
         if (argw < 0 && argw >= -0xff) {
             argw = -argw;
-            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, flags & WRITE_BACK, reg, arg, TYPE2_TRANSFER_IMM(argw)));
+            return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, reg, arg, TYPE2_TRANSFER_IMM(argw)));
         }
     }


     FAIL_IF(load_immediate(compiler, tmp_reg, argw));
-    return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & WRITE_BACK, reg, arg,
+    return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg,
         RM(tmp_reg) | (is_type1_transfer ? (1 << 25) : 0)));
 }


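With WRITE_BACK gone, EMIT_DATA_TRANSFER only encodes the offset direction: its second argument becomes bit 23, the ARM U (add/subtract) bit, while P stays set and W stays clear, i.e. plain offset addressing. A worked example against the opcode table above (illustrative, not part of the patch):

    /* WORD_SIZE | LOAD_DATA selects 0xe5100000 (ldr); add = 1 sets bit 23,
       so the 12-bit immediate is added to the base register:              */
    EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1, reg, base, 0x10)
        /* -> 0xe5900010 | RD(reg) | RN(base)  ==  ldr reg, [base, #16]    */
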
@@ -1538,10 +1526,10 @@
     /* Destination. */
     dst_reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


-    if (op <= SLJIT_MOVU_P) {
+    if (op <= SLJIT_MOV_P) {
         if (dst & SLJIT_MEM) {
-            if (inp_flags & BYTE_DATA)
-                inp_flags &= ~SIGNED_DATA;
+            if (inp_flags & BYTE_SIZE)
+                inp_flags &= ~SIGNED;


             if (FAST_IS_REG(src2))
                 return emit_op_mem(compiler, inp_flags, src2, dst, dstw, TMP_REG2);
@@ -1553,7 +1541,7 @@


     /* Source 2. */
     if (src2_reg == 0) {
-        src2_reg = (op <= SLJIT_MOVU_P) ? dst_reg : TMP_REG2;
+        src2_reg = (op <= SLJIT_MOV_P) ? dst_reg : TMP_REG2;


         if (FAST_IS_REG(src2))
             src2_reg = src2;
@@ -1674,7 +1662,7 @@
     if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
         if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
-            return emit_op_mem(compiler, PRELOAD_DATA | LOAD_DATA, TMP_PC, src, srcw, TMP_REG1);
+            return emit_op_mem(compiler, PRELOAD | LOAD_DATA, TMP_PC, src, srcw, TMP_REG1);
 #endif
         return SLJIT_SUCCESS;
     }
@@ -1687,35 +1675,17 @@
         return emit_op(compiler, SLJIT_MOV, ALLOW_ANY_IMM, dst, dstw, TMP_REG1, 0, src, srcw);


     case SLJIT_MOV_U8:
-        return emit_op(compiler, SLJIT_MOV_U8, ALLOW_ANY_IMM | BYTE_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+        return emit_op(compiler, SLJIT_MOV_U8, ALLOW_ANY_IMM | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);


     case SLJIT_MOV_S8:
-        return emit_op(compiler, SLJIT_MOV_S8, ALLOW_ANY_IMM | SIGNED_DATA | BYTE_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+        return emit_op(compiler, SLJIT_MOV_S8, ALLOW_ANY_IMM | SIGNED | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);


     case SLJIT_MOV_U16:
-        return emit_op(compiler, SLJIT_MOV_U16, ALLOW_ANY_IMM | HALF_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+        return emit_op(compiler, SLJIT_MOV_U16, ALLOW_ANY_IMM | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);


     case SLJIT_MOV_S16:
-        return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED_DATA | HALF_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+        return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);


-    case SLJIT_MOVU:
-    case SLJIT_MOVU_U32:
-    case SLJIT_MOVU_S32:
-    case SLJIT_MOVU_P:
-        return emit_op(compiler, SLJIT_MOV, ALLOW_ANY_IMM | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-    case SLJIT_MOVU_U8:
-        return emit_op(compiler, SLJIT_MOV_U8, ALLOW_ANY_IMM | BYTE_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
-
-    case SLJIT_MOVU_S8:
-        return emit_op(compiler, SLJIT_MOV_S8, ALLOW_ANY_IMM | SIGNED_DATA | BYTE_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
-
-    case SLJIT_MOVU_U16:
-        return emit_op(compiler, SLJIT_MOV_U16, ALLOW_ANY_IMM | HALF_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
-
-    case SLJIT_MOVU_S16:
-        return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED_DATA | HALF_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
-
     case SLJIT_NOT:
         return emit_op(compiler, op, ALLOW_ANY_IMM, dst, dstw, TMP_REG1, 0, src, srcw);


@@ -2037,7 +2007,7 @@
         return push_inst(compiler, MOV | RD(dst) | RM(TMP_REG2));


     /* Memory. */
-    return emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw, TMP_REG1);
+    return emit_op_mem(compiler, WORD_SIZE, TMP_REG2, dst, dstw, TMP_REG1);
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
@@ -2050,10 +2020,8 @@

     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG2) | RM(src)));
-    else if (src & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG1));
-    else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));
+    else
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG1));


     return push_inst(compiler, BX | RM(TMP_REG2));
 }
@@ -2150,7 +2118,7 @@
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     if (type >= SLJIT_FAST_CALL)
         PTR_FAIL_IF(prepare_blx(compiler));
-    PTR_FAIL_IF(push_inst_with_unique_literal(compiler, ((EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0,
+    PTR_FAIL_IF(push_inst_with_unique_literal(compiler, ((EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
         type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0)) & ~COND_MASK) | get_cc(type), 0));


     if (jump->flags & SLJIT_REWRITABLE_JUMP) {
@@ -2276,7 +2244,7 @@
                     }
                     FAIL_IF(push_inst(compiler, MOV | (stack_offset << 10) | (word_arg_offset >> 2)));
                 } else
-                    FAIL_IF(push_inst(compiler, data_transfer_insts[WORD_DATA] | 0x800000 | RN(SLJIT_SP) | (word_arg_offset << 10) | (stack_offset - 16)));
+                    FAIL_IF(push_inst(compiler, data_transfer_insts[WORD_SIZE] | 0x800000 | RN(SLJIT_SP) | (word_arg_offset << 10) | (stack_offset - 16)));
             }
             break;
         }
@@ -2427,7 +2395,7 @@
         }


         SLJIT_ASSERT(src & SLJIT_MEM);
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
         return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1));
     }


@@ -2440,7 +2408,7 @@
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     if (type >= SLJIT_FAST_CALL)
         FAIL_IF(prepare_blx(compiler));
-    FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0), 0));
+    FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0), 0));
     if (type >= SLJIT_FAST_CALL)
         FAIL_IF(emit_blx(compiler));
 #else
@@ -2460,7 +2428,7 @@


 #ifdef __SOFTFP__
     if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
         src = TMP_REG1;
     }


@@ -2505,7 +2473,7 @@
         FAIL_IF(push_inst(compiler, MOV | RD(dst_reg) | SRC2_IMM | 0));
         FAIL_IF(push_inst(compiler, ((MOV | RD(dst_reg) | SRC2_IMM | 1) & ~COND_MASK) | cc));
         if (dst & SLJIT_MEM)
-            return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
+            return emit_op_mem(compiler, WORD_SIZE, TMP_REG1, dst, dstw, TMP_REG2);
         return SLJIT_SUCCESS;
     }


@@ -2512,7 +2480,7 @@
     ins = (op == SLJIT_AND ? AND : (op == SLJIT_OR ? ORR : EOR));


     if (dst & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, dst, dstw, TMP_REG2));
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, dst, dstw, TMP_REG2));


     FAIL_IF(push_inst(compiler, ((ins | RD(dst_reg) | RN(dst_reg) | SRC2_IMM | 1) & ~COND_MASK) | cc));


@@ -2520,7 +2488,7 @@
         FAIL_IF(push_inst(compiler, ((ins | RD(dst_reg) | RN(dst_reg) | SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));


     if (dst & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2));
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, dst, dstw, TMP_REG2));


     if (flags & SLJIT_SET_Z)
         return push_inst(compiler, MOV | SET_FLAGS | RD(TMP_REG2) | RM(dst_reg));
@@ -2564,6 +2532,110 @@
     return push_inst(compiler, ((MOV | RD(dst_reg) | RM(src)) & ~COND_MASK) | cc);
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_s32 flags;
+    sljit_uw is_type1_transfer, inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+    is_type1_transfer = 1;
+
+    switch (type & 0xff) {
+    case SLJIT_MOV:
+    case SLJIT_MOV_U32:
+    case SLJIT_MOV_S32:
+    case SLJIT_MOV_P:
+        flags = WORD_SIZE;
+        break;
+    case SLJIT_MOV_U8:
+        flags = BYTE_SIZE;
+        break;
+    case SLJIT_MOV_S8:
+        if (!(type & SLJIT_MEM_STORE))
+            is_type1_transfer = 0;
+        flags = BYTE_SIZE | SIGNED;
+        break;
+    case SLJIT_MOV_U16:
+        is_type1_transfer = 0;
+        flags = HALF_SIZE;
+        break;
+    case SLJIT_MOV_S16:
+        is_type1_transfer = 0;
+        flags = HALF_SIZE | SIGNED;
+        break;
+    default:
+        SLJIT_UNREACHABLE();
+        flags = WORD_SIZE;
+        break;
+    }
+
+    if (!(type & SLJIT_MEM_STORE))
+        flags |= LOAD_DATA;
+
+    SLJIT_ASSERT(is_type1_transfer == !!IS_TYPE1_TRANSFER(flags));
+
+    if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+        if (!is_type1_transfer && memw != 0)
+            return SLJIT_ERR_UNSUPPORTED;
+    }
+    else {
+        if (is_type1_transfer) {
+            if (memw > 4095 || memw < -4095)
+                return SLJIT_ERR_UNSUPPORTED;
+        }
+        else {
+            if (memw > 255 || memw < -255)
+                return SLJIT_ERR_UNSUPPORTED;
+        }
+    }
+
+    if (type & SLJIT_MEM_SUPP)
+        return SLJIT_SUCCESS;
+
+    if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+        memw &= 0x3;
+
+        inst = EMIT_DATA_TRANSFER(flags, 1, reg, mem & REG_MASK, RM(OFFS_REG(mem)) | (memw << 7));
+
+        if (is_type1_transfer)
+            inst |= (1 << 25);
+
+        if (type & SLJIT_MEM_PRE)
+            inst |= (1 << 21);
+        else
+            inst ^= (1 << 24);
+
+        return push_inst(compiler, inst);
+    }
+
+    inst = EMIT_DATA_TRANSFER(flags, 0, reg, mem & REG_MASK, 0);
+
+    if (type & SLJIT_MEM_PRE)
+        inst |= (1 << 21);
+    else
+        inst ^= (1 << 24);
+
+    if (is_type1_transfer) {
+        if (memw >= 0)
+            inst |= (1 << 23);
+        else
+            memw = -memw;
+
+        return push_inst(compiler, inst | memw);
+    }
+
+    if (memw >= 0)
+        inst |= (1 << 23);
+    else
+        memw = -memw;
+
+    return push_inst(compiler, inst | TYPE2_TRANSFER_IMM(memw));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
     struct sljit_const *const_;
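
The new sljit_emit_mem above builds its indexed forms directly on EMIT_DATA_TRANSFER. For reference, the standard ARM single data transfer bits it manipulates are summarised here:

    /* bit 25  register offset (type 1 word/byte transfers only)
       bit 24  P - pre-indexed when set, post-indexed when clear
       bit 23  U - add the offset when set, subtract when clear
       bit 21  W - write the updated address back to the base register     */

SLJIT_MEM_PRE therefore only has to add the W bit (P is already set in the table entries), while the post-update case clears P instead, since post-indexed transfers always write back.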
@@ -2579,7 +2651,7 @@
     reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;


 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-    PTR_FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, reg, TMP_PC, 0), init_value));
+    PTR_FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1, reg, TMP_PC, 0), init_value));
     compiler->patches++;
 #else
     PTR_FAIL_IF(emit_imm(compiler, reg, init_value));
@@ -2587,7 +2659,7 @@
     set_const(const_, compiler);


     if (dst & SLJIT_MEM)
-        PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw, TMP_REG1));
+        PTR_FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, dst, dstw, TMP_REG1));
     return const_;
 }



Modified: code/trunk/src/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_64.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeARM_64.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -36,15 +36,15 @@


 #define TMP_REG1    (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
-#define TMP_REG3    (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_LR        (SLJIT_NUMBER_OF_REGISTERS + 5)
-#define TMP_SP        (SLJIT_NUMBER_OF_REGISTERS + 6)
+#define TMP_LR        (SLJIT_NUMBER_OF_REGISTERS + 4)
+#define TMP_SP        (SLJIT_NUMBER_OF_REGISTERS + 5)


 #define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
 #define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


+/* r18 - platform register, currently not used */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 8] = {
-  31, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 29, 9, 10, 11, 30, 31
+    31, 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 29, 9, 10, 30, 31
 };


 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
@@ -116,10 +116,13 @@
 #define SMULH 0x9b403c00
 #define STP 0xa9000000
 #define STP_PRE 0xa9800000
+#define STRB 0x38206800
+#define STRBI 0x39000000
 #define STRI 0xf9000000
 #define STR_FI 0x3d000000
 #define STR_FR 0x3c206800
 #define STUR_FI 0x3c000000
+#define STURBI 0x38000000
 #define SUB 0xcb000000
 #define SUBI 0xd1000000
 #define SUBS 0xeb000000
@@ -197,6 +200,7 @@
         code_ptr[-2] = code_ptr[0];
         return 2;
     }
+
     if (target_addr <= 0xffffffffffffl) {
         if (jump->flags & IS_COND)
             code_ptr[-5] -= (1 << 5);
@@ -339,7 +343,6 @@
         return 1;
 #endif


-    case SLJIT_HAS_PRE_UPDATE:
     case SLJIT_HAS_CLZ:
     case SLJIT_HAS_CMOV:
         return 1;
@@ -398,6 +401,7 @@


     SLJIT_ASSERT((len == 32 && imm != 0 && imm != -1)
         || (len == 16 && (sljit_s32)imm != 0 && (sljit_s32)imm != -1));
+
     uimm = (sljit_uw)imm;
     while (1) {
         if (len <= 0) {
@@ -404,6 +408,7 @@
             SLJIT_UNREACHABLE();
             return 0;
         }
+
         mask = ((sljit_uw)1 << len) - 1;
         if ((uimm & mask) != ((uimm >> len) & mask))
             break;
@@ -452,39 +457,42 @@
     sljit_s32 i, zeros, ones, first;
     sljit_ins bitmask;


+    /* Handling simple immediates first. */
     if (imm <= 0xffff)
         return push_inst(compiler, MOVZ | RD(dst) | (imm << 5));


-    if (simm >= -0x10000 && simm < 0)
+    if (simm < 0 && simm >= -0x10000)
         return push_inst(compiler, MOVN | RD(dst) | ((~imm & 0xffff) << 5));


     if (imm <= 0xffffffffl) {
+        if ((imm & 0xffff) == 0)
+            return push_inst(compiler, MOVZ | RD(dst) | ((imm >> 16) << 5) | (1 << 21));
         if ((imm & 0xffff0000l) == 0xffff0000)
             return push_inst(compiler, (MOVN ^ W_OP) | RD(dst) | ((~imm & 0xffff) << 5));
         if ((imm & 0xffff) == 0xffff)
             return push_inst(compiler, (MOVN ^ W_OP) | RD(dst) | ((~imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
+
         bitmask = logical_imm(simm, 16);
         if (bitmask != 0)
             return push_inst(compiler, (ORRI ^ W_OP) | RD(dst) | RN(TMP_ZERO) | bitmask);
-    }
-    else {
-        bitmask = logical_imm(simm, 32);
-        if (bitmask != 0)
-            return push_inst(compiler, ORRI | RD(dst) | RN(TMP_ZERO) | bitmask);
-    }


-    if (imm <= 0xffffffffl) {
         FAIL_IF(push_inst(compiler, MOVZ | RD(dst) | ((imm & 0xffff) << 5)));
         return push_inst(compiler, MOVK | RD(dst) | ((imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
     }


-    if (simm >= -0x100000000l && simm < 0) {
+    bitmask = logical_imm(simm, 32);
+    if (bitmask != 0)
+        return push_inst(compiler, ORRI | RD(dst) | RN(TMP_ZERO) | bitmask);
+
+    if (simm < 0 && simm >= -0x100000000l) {
+        if ((imm & 0xffff) == 0xffff)
+            return push_inst(compiler, MOVN | RD(dst) | ((~imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
+
         FAIL_IF(push_inst(compiler, MOVN | RD(dst) | ((~imm & 0xffff) << 5)));
         return push_inst(compiler, MOVK | RD(dst) | ((imm & 0xffff0000l) >> (16 - 5)) | (1 << 21));
     }


-    /* A large amount of number can be constructed from ORR and MOVx,
-    but computing them is costly. We don't  */
+    /* A large number of constants can be constructed from ORR and MOVx, but computing them is costly. */


     zeros = 0;
     ones = 0;
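
The branch added above gives 32-bit constants whose low halfword is zero their own single-instruction path: (1 << 21) sets the MOVZ hw field to 1, i.e. a 16-bit shift. A small worked example (not taken from the patch):

    /* imm = 0x12340000: (imm & 0xffff) == 0, so one instruction is enough.
       MOVZ | RD(dst) | ((imm >> 16) << 5) | (1 << 21)
         -> movz dst, #0x1234, lsl #16                                     */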
@@ -537,9 +545,6 @@
 #define INT_OP        0x0040000
 #define SET_FLAGS    0x0080000
 #define UNUSED_RETURN    0x0100000
-#define SLOW_DEST    0x0200000
-#define SLOW_SRC1    0x0400000
-#define SLOW_SRC2    0x0800000


 #define CHECK_FLAGS(flag_bits) \
     if (flags & SET_FLAGS) { \
@@ -697,40 +702,32 @@
     switch (op) {
     case SLJIT_MOV:
     case SLJIT_MOV_P:
-    case SLJIT_MOVU:
-    case SLJIT_MOVU_P:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         if (dst == arg2)
             return SLJIT_SUCCESS;
         return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(arg2));
     case SLJIT_MOV_U8:
-    case SLJIT_MOVU_U8:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         return push_inst(compiler, (UBFM ^ (1 << 31)) | RD(dst) | RN(arg2) | (7 << 10));
     case SLJIT_MOV_S8:
-    case SLJIT_MOVU_S8:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         if (!(flags & INT_OP))
             inv_bits |= 1 << 22;
         return push_inst(compiler, (SBFM ^ inv_bits) | RD(dst) | RN(arg2) | (7 << 10));
     case SLJIT_MOV_U16:
-    case SLJIT_MOVU_U16:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         return push_inst(compiler, (UBFM ^ (1 << 31)) | RD(dst) | RN(arg2) | (15 << 10));
     case SLJIT_MOV_S16:
-    case SLJIT_MOVU_S16:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         if (!(flags & INT_OP))
             inv_bits |= 1 << 22;
         return push_inst(compiler, (SBFM ^ inv_bits) | RD(dst) | RN(arg2) | (15 << 10));
     case SLJIT_MOV_U32:
-    case SLJIT_MOVU_U32:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         if ((flags & INT_OP) && dst == arg2)
             return SLJIT_SUCCESS;
         return push_inst(compiler, (ORR ^ (1 << 31)) | RD(dst) | RN(TMP_ZERO) | RM(arg2));
     case SLJIT_MOV_S32:
-    case SLJIT_MOVU_S32:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG1);
         if ((flags & INT_OP) && dst == arg2)
             return SLJIT_SUCCESS;
@@ -799,294 +796,69 @@
     return SLJIT_SUCCESS;
 }


-#define STORE        0x01
-#define SIGNED        0x02
+#define STORE        0x10
+#define SIGNED        0x20


-#define UPDATE        0x04
-#define ARG_TEST    0x08
+#define BYTE_SIZE    0x0
+#define HALF_SIZE    0x1
+#define INT_SIZE    0x2
+#define WORD_SIZE    0x3


-#define BYTE_SIZE    0x000
-#define HALF_SIZE    0x100
-#define INT_SIZE    0x200
-#define WORD_SIZE    0x300
+#define MEM_SIZE_SHIFT(flags) ((flags) & 0x3)


-#define MEM_SIZE_SHIFT(flags) ((flags) >> 8)
-
-static const sljit_ins sljit_mem_imm[4] = {
-/* u l */ 0x39400000 /* ldrb [reg,imm] */,
-/* u s */ 0x39000000 /* strb [reg,imm] */,
-/* s l */ 0x39800000 /* ldrsb [reg,imm] */,
-/* s s */ 0x39000000 /* strb [reg,imm] */,
-};
-
-static const sljit_ins sljit_mem_simm[4] = {
-/* u l */ 0x38400000 /* ldurb [reg,imm] */,
-/* u s */ 0x38000000 /* sturb [reg,imm] */,
-/* s l */ 0x38800000 /* ldursb [reg,imm] */,
-/* s s */ 0x38000000 /* sturb [reg,imm] */,
-};
-
-static const sljit_ins sljit_mem_pre_simm[4] = {
-/* u l */ 0x38400c00 /* ldrb [reg,imm]! */,
-/* u s */ 0x38000c00 /* strb [reg,imm]! */,
-/* s l */ 0x38800c00 /* ldrsb [reg,imm]! */,
-/* s s */ 0x38000c00 /* strb [reg,imm]! */,
-};
-
-static const sljit_ins sljit_mem_reg[4] = {
-/* u l */ 0x38606800 /* ldrb [reg,reg] */,
-/* u s */ 0x38206800 /* strb [reg,reg] */,
-/* s l */ 0x38a06800 /* ldrsb [reg,reg] */,
-/* s s */ 0x38206800 /* strb [reg,reg] */,
-};
-
-/* Helper function. Dst should be reg + value, using at most 1 instruction, flags does not set. */
-static sljit_s32 emit_set_delta(struct sljit_compiler *compiler, sljit_s32 dst, sljit_s32 reg, sljit_sw value)
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
+    sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
-    if (value >= 0) {
-        if (value <= 0xfff)
-            return push_inst(compiler, ADDI | RD(dst) | RN(reg) | (value << 10));
-        if (value <= 0xffffff && !(value & 0xfff))
-            return push_inst(compiler, ADDI | (1 << 22) | RD(dst) | RN(reg) | (value >> 2));
-    }
-    else {
-        value = -value;
-        if (value <= 0xfff)
-            return push_inst(compiler, SUBI | RD(dst) | RN(reg) | (value << 10));
-        if (value <= 0xffffff && !(value & 0xfff))
-            return push_inst(compiler, SUBI | (1 << 22) | RD(dst) | RN(reg) | (value >> 2));
-    }
-    return SLJIT_ERR_UNSUPPORTED;
-}
-
-/* Can perform an operation using at most 1 instruction. */
-static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
     sljit_u32 shift = MEM_SIZE_SHIFT(flags);
+    sljit_u32 type = (shift << 30);


+    if (!(flags & STORE))
+        type |= (flags & SIGNED) ? 0x00800000 : 0x00400000;
+
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if (SLJIT_UNLIKELY(flags & UPDATE)) {
-        if ((arg & REG_MASK) && !(arg & OFFS_REG_MASK) && argw <= 255 && argw >= -256) {
-            if (SLJIT_UNLIKELY(flags & ARG_TEST))
-                return 1;
-
-            arg &= REG_MASK;
-            argw &= 0x1ff;
-            FAIL_IF(push_inst(compiler, sljit_mem_pre_simm[flags & 0x3]
-                | (shift << 30) | RT(reg) | RN(arg) | (argw << 12)));
-            return -1;
-        }
-        return 0;
-    }
-
     if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;
-        if (argw && argw != shift)
-            return 0;


-        if (SLJIT_UNLIKELY(flags & ARG_TEST))
-            return 1;
+        if (argw == 0 || argw == shift)
+            return push_inst(compiler, STRB | type | RT(reg)
+                | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw ? (1 << 12) : 0));


-        FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg)
-            | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw ? (1 << 12) : 0)));
-        return -1;
+        FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw << 10)));
+        return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg));
     }


     arg &= REG_MASK;


-    if (arg == SLJIT_UNUSED)
-        return 0;
+    if (arg == SLJIT_UNUSED) {
+        FAIL_IF(load_immediate(compiler, tmp_reg, argw & ~(0xfff << shift)));


-    if (argw >= 0 && (argw >> shift) <= 0xfff && (argw & ((1 << shift) - 1)) == 0) {
-        if (SLJIT_UNLIKELY(flags & ARG_TEST))
-            return 1;
+        argw = (argw >> shift) & 0xfff;


-        FAIL_IF(push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30)
-            | RT(reg) | RN(arg) | (argw << (10 - shift))));
-        return -1;
+        return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg) | (argw << 10));
     }


-    if (argw > 255 || argw < -256)
-        return 0;
-
-    if (SLJIT_UNLIKELY(flags & ARG_TEST))
-        return 1;
-
-    FAIL_IF(push_inst(compiler, sljit_mem_simm[flags & 0x3] | (shift << 30)
-        | RT(reg) | RN(arg) | ((argw & 0x1ff) << 12)));
-    return -1;
-}
-
-/* see getput_arg below.
-   Note: can_cache is called only for binary operators. Those
-   operators always uses word arguments without write back. */
-static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-    sljit_sw diff;
-    if ((arg & OFFS_REG_MASK) || !(next_arg & SLJIT_MEM))
-        return 0;
-
-    if (!(arg & REG_MASK)) {
-        diff = argw - next_argw;
-        if (diff <= 0xfff && diff >= -0xfff)
-            return 1;
-        return 0;
-    }
-
-    if (argw == next_argw)
-        return 1;
-
-    diff = argw - next_argw;
-    if (arg == next_arg && diff <= 0xfff && diff >= -0xfff)
-        return 1;
-
-    return 0;
-}
-
-/* Emit the necessary instructions. See can_cache above. */
-static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
-    sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-    sljit_u32 shift = MEM_SIZE_SHIFT(flags);
-    sljit_s32 tmp_r, other_r;
-    sljit_sw diff;
-
-    SLJIT_ASSERT(arg & SLJIT_MEM);
-    if (!(next_arg & SLJIT_MEM)) {
-        next_arg = 0;
-        next_argw = 0;
-    }
-
-    tmp_r = ((flags & STORE) || (flags == (WORD_SIZE | SIGNED))) ? TMP_REG3 : reg;
-
-    if (SLJIT_UNLIKELY((flags & UPDATE) && (arg & REG_MASK))) {
-        /* Update only applies if a base register exists. */
-        other_r = OFFS_REG(arg);
-        if (!other_r) {
-            other_r = arg & REG_MASK;
-            SLJIT_ASSERT(other_r != reg);
-
-            if (argw >= 0 && argw <= 0xffffff) {
-                if ((argw & 0xfff) != 0)
-                    FAIL_IF(push_inst(compiler, ADDI | RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
-                if (argw >> 12)
-                    FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(other_r) | RN(other_r) | ((argw >> 12) << 10)));
-                return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(other_r));
-            }
-            else if (argw < 0 && argw >= -0xffffff) {
-                argw = -argw;
-                if ((argw & 0xfff) != 0)
-                    FAIL_IF(push_inst(compiler, SUBI | RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
-                if (argw >> 12)
-                    FAIL_IF(push_inst(compiler, SUBI | (1 << 22) | RD(other_r) | RN(other_r) | ((argw >> 12) << 10)));
-                return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(other_r));
-            }
-
-            if (compiler->cache_arg == SLJIT_MEM) {
-                if (argw == compiler->cache_argw) {
-                    other_r = TMP_REG3;
-                    argw = 0;
-                }
-                else if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
-                    FAIL_IF(compiler->error);
-                    compiler->cache_argw = argw;
-                    other_r = TMP_REG3;
-                    argw = 0;
-                }
-            }
-
-            if (argw) {
-                FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-                compiler->cache_arg = SLJIT_MEM;
-                compiler->cache_argw = argw;
-                other_r = TMP_REG3;
-                argw = 0;
-            }
+    if (argw >= 0 && (argw & ((1 << shift) - 1)) == 0) {
+        if ((argw >> shift) <= 0xfff) {
+            return push_inst(compiler, STRBI | type | RT(reg) | RN(arg) | (argw << (10 - shift)));
         }


-        /* No caching here. */
-        arg &= REG_MASK;
-        FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(other_r)));
-        return push_inst(compiler, ADD | RD(arg) | RN(arg) | RM(other_r));
-    }
+        if (argw <= 0xffffff) {
+            FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_reg) | RN(arg) | ((argw >> 12) << 10)));


-    if (arg & OFFS_REG_MASK) {
-        other_r = OFFS_REG(arg);
-        arg &= REG_MASK;
-        FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RN(arg) | RM(other_r) | ((argw & 0x3) << 10)));
-        return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(tmp_r));
-    }
-
-    if (compiler->cache_arg == arg) {
-        diff = argw - compiler->cache_argw;
-        if (diff <= 255 && diff >= -256)
-            return push_inst(compiler, sljit_mem_simm[flags & 0x3] | (shift << 30)
-                | RT(reg) | RN(TMP_REG3) | ((diff & 0x1ff) << 12));
-        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, diff) != SLJIT_ERR_UNSUPPORTED) {
-            FAIL_IF(compiler->error);
-            return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg));
+            argw = ((argw & 0xfff) >> shift);
+            return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg) | (argw << 10));
         }
     }


-    diff = argw - next_argw;
-    next_arg = (arg & REG_MASK) && (arg == next_arg) && diff <= 0xfff && diff >= -0xfff && diff != 0;
-    arg &= REG_MASK;
+    if (argw <= 255 && argw >= -256)
+        return push_inst(compiler, STURBI | type | RT(reg) | RN(arg) | ((argw & 0x1ff) << 12));


-    if (arg != SLJIT_UNUSED && argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
-        FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | RN(arg) | ((argw >> 12) << 10)));
-        return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30)
-            | RT(reg) | RN(tmp_r) | ((argw & 0xfff) << (10 - shift)));
-    }
+    FAIL_IF(load_immediate(compiler, tmp_reg, argw));


-    if (arg && compiler->cache_arg == SLJIT_MEM) {
-        if (compiler->cache_argw == argw)
-            return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
-        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
-            FAIL_IF(compiler->error);
-            compiler->cache_argw = argw;
-            return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
-        }
-    }
-
-    compiler->cache_argw = argw;
-    if (next_arg && emit_set_delta(compiler, TMP_REG3, arg, argw) != SLJIT_ERR_UNSUPPORTED) {
-        FAIL_IF(compiler->error);
-        compiler->cache_arg = SLJIT_MEM | arg;
-        arg = 0;
-    }
-    else {
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-        compiler->cache_arg = SLJIT_MEM;
-
-        if (next_arg) {
-            FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG3) | RN(TMP_REG3) | RM(arg)));
-            compiler->cache_arg = SLJIT_MEM | arg;
-            arg = 0;
-        }
-    }
-
-    if (arg)
-        return push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
-    return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift << 30) | RT(reg) | RN(TMP_REG3));
+    return push_inst(compiler, STRB | type | RT(reg) | RN(arg) | RM(tmp_reg));
 }


-static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
-    if (getput_arg_fast(compiler, flags, reg, arg, argw))
-        return compiler->error;
-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-    return getput_arg(compiler, flags, reg, arg, argw, 0, 0);
-}
-
-static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
-{
-    if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
-        return compiler->error;
-    return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
-}
-
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
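
The rewritten emit_op_mem drops the cache_arg/cache_argw machinery and assembles the AArch64 load/store encoding directly from the constants defined above: the access size goes into bits 31:30 (shift << 30), 0x00400000 turns the store opcode into the matching load and 0x00800000 into the sign-extending load, STRBI (0x39000000) is the scaled unsigned 12-bit immediate form, STURBI (0x38000000) the unscaled signed 9-bit form, and STRB (0x38206800) the register-offset form. One assembled value as a sanity check (illustrative only):

    /* 32-bit load with an unsigned scaled offset:
       STRBI | (MEM_SIZE_SHIFT(INT_SIZE) << 30) | 0x00400000
         = 0x39000000 | 0x80000000 | 0x00400000 = 0xb9400000
         -> ldr wT, [xN, #imm12 * 4]                                       */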
@@ -1320,9 +1092,6 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);
     ADJUST_LOCAL_OFFSET(src, srcw);


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-
     if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
         if (op <= SLJIT_MOV_P && (src & SLJIT_MEM)) {
             SLJIT_ASSERT(reg_map[1] == 0 && reg_map[3] == 2 && reg_map[5] == 4);
@@ -1335,7 +1104,7 @@
                 dst = 1;


             /* Signed word sized load is the prefetch instruction. */
-            return emit_op_mem(compiler, WORD_SIZE | SIGNED, dst, src, srcw);
+            return emit_op_mem(compiler, WORD_SIZE | SIGNED, dst, src, srcw, TMP_REG1);
         }
         return SLJIT_SUCCESS;
     }
@@ -1343,106 +1112,67 @@
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     op = GET_OPCODE(op);
-    if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
+    if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
+        /* Both operands are registers. */
+        if (dst_r != TMP_REG1 && FAST_IS_REG(src))
+            return emit_op_imm(compiler, op | ((op_flags & SLJIT_I32_OP) ? INT_OP : 0), dst_r, TMP_REG1, src);
+
         switch (op) {
         case SLJIT_MOV:
         case SLJIT_MOV_P:
-            flags = WORD_SIZE;
+            mem_flags = WORD_SIZE;
             break;
         case SLJIT_MOV_U8:
-            flags = BYTE_SIZE;
+            mem_flags = BYTE_SIZE;
             if (src & SLJIT_IMM)
                 srcw = (sljit_u8)srcw;
             break;
         case SLJIT_MOV_S8:
-            flags = BYTE_SIZE | SIGNED;
+            mem_flags = BYTE_SIZE | SIGNED;
             if (src & SLJIT_IMM)
                 srcw = (sljit_s8)srcw;
             break;
         case SLJIT_MOV_U16:
-            flags = HALF_SIZE;
+            mem_flags = HALF_SIZE;
             if (src & SLJIT_IMM)
                 srcw = (sljit_u16)srcw;
             break;
         case SLJIT_MOV_S16:
-            flags = HALF_SIZE | SIGNED;
+            mem_flags = HALF_SIZE | SIGNED;
             if (src & SLJIT_IMM)
                 srcw = (sljit_s16)srcw;
             break;
         case SLJIT_MOV_U32:
-            flags = INT_SIZE;
+            mem_flags = INT_SIZE;
             if (src & SLJIT_IMM)
                 srcw = (sljit_u32)srcw;
             break;
         case SLJIT_MOV_S32:
-            flags = INT_SIZE | SIGNED;
+            mem_flags = INT_SIZE | SIGNED;
             if (src & SLJIT_IMM)
                 srcw = (sljit_s32)srcw;
             break;
-        case SLJIT_MOVU:
-        case SLJIT_MOVU_P:
-            flags = WORD_SIZE | UPDATE;
-            break;
-        case SLJIT_MOVU_U8:
-            flags = BYTE_SIZE | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_u8)srcw;
-            break;
-        case SLJIT_MOVU_S8:
-            flags = BYTE_SIZE | SIGNED | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s8)srcw;
-            break;
-        case SLJIT_MOVU_U16:
-            flags = HALF_SIZE | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_u16)srcw;
-            break;
-        case SLJIT_MOVU_S16:
-            flags = HALF_SIZE | SIGNED | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s16)srcw;
-            break;
-        case SLJIT_MOVU_U32:
-            flags = INT_SIZE | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_u32)srcw;
-            break;
-        case SLJIT_MOVU_S32:
-            flags = INT_SIZE | SIGNED | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s32)srcw;
-            break;
         default:
             SLJIT_UNREACHABLE();
-            flags = 0;
+            mem_flags = 0;
             break;
         }


         if (src & SLJIT_IMM)
             FAIL_IF(emit_op_imm(compiler, SLJIT_MOV | ARG2_IMM, dst_r, TMP_REG1, srcw));
-        else if (src & SLJIT_MEM) {
-            if (getput_arg_fast(compiler, flags, dst_r, src, srcw))
-                FAIL_IF(compiler->error);
-            else
-                FAIL_IF(getput_arg(compiler, flags, dst_r, src, srcw, dst, dstw));
-        } else {
-            if (dst_r != TMP_REG1)
-                return emit_op_imm(compiler, op | ((op_flags & SLJIT_I32_OP) ? INT_OP : 0), dst_r, TMP_REG1, src);
+        else if (!(src & SLJIT_MEM))
             dst_r = src;
-        }
+        else
+            FAIL_IF(emit_op_mem(compiler, mem_flags, dst_r, src, srcw, TMP_REG1));


-        if (dst & SLJIT_MEM) {
-            if (getput_arg_fast(compiler, flags | STORE, dst_r, dst, dstw))
-                return compiler->error;
-            else
-                return getput_arg(compiler, flags | STORE, dst_r, dst, dstw, 0, 0);
-        }
+        if (dst & SLJIT_MEM)
+            return emit_op_mem(compiler, mem_flags | STORE, dst_r, dst, dstw, TMP_REG2);
         return SLJIT_SUCCESS;
     }


     flags = HAS_FLAGS(op_flags) ? SET_FLAGS : 0;
     mem_flags = WORD_SIZE;
+
     if (op_flags & SLJIT_I32_OP) {
         flags |= INT_OP;
         mem_flags = INT_SIZE;
@@ -1452,28 +1182,14 @@
         flags |= UNUSED_RETURN;


     if (src & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, mem_flags, TMP_REG2, src, srcw))
-            FAIL_IF(compiler->error);
-        else
-            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, mem_flags, TMP_REG2, src, srcw, TMP_REG2));
         src = TMP_REG2;
     }


-    if (src & SLJIT_IMM) {
-        flags |= ARG2_IMM;
-        if (op_flags & SLJIT_I32_OP)
-            srcw = (sljit_s32)srcw;
-    } else
-        srcw = src;
+    emit_op_imm(compiler, flags | op, dst_r, TMP_REG1, src);


-    emit_op_imm(compiler, flags | op, dst_r, TMP_REG1, srcw);
-
-    if (dst & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, mem_flags | STORE, dst_r, dst, dstw))
-            return compiler->error;
-        else
-            return getput_arg(compiler, mem_flags | STORE, dst_r, dst, dstw, 0, 0);
-    }
+    if (SLJIT_UNLIKELY(dst & SLJIT_MEM))
+        return emit_op_mem(compiler, mem_flags | STORE, dst_r, dst, dstw, TMP_REG2);
     return SLJIT_SUCCESS;
 }


@@ -1490,9 +1206,6 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-
     if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
         return SLJIT_SUCCESS;


@@ -1499,6 +1212,7 @@
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
     flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
     mem_flags = WORD_SIZE;
+
     if (op & SLJIT_I32_OP) {
         flags |= INT_OP;
         mem_flags = INT_SIZE;
@@ -1507,46 +1221,21 @@
     if (dst == SLJIT_UNUSED)
         flags |= UNUSED_RETURN;


-    if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, mem_flags | STORE | ARG_TEST, TMP_REG1, dst, dstw))
-        flags |= SLOW_DEST;
-
     if (src1 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, mem_flags, TMP_REG1, src1, src1w))
-            FAIL_IF(compiler->error);
-        else
-            flags |= SLOW_SRC1;
+        FAIL_IF(emit_op_mem(compiler, mem_flags, TMP_REG1, src1, src1w, TMP_REG1));
+        src1 = TMP_REG1;
     }
+
     if (src2 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, mem_flags, TMP_REG2, src2, src2w))
-            FAIL_IF(compiler->error);
-        else
-            flags |= SLOW_SRC2;
+        FAIL_IF(emit_op_mem(compiler, mem_flags, TMP_REG2, src2, src2w, TMP_REG2));
+        src2 = TMP_REG2;
     }


-    if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
-        if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, src1, src1w));
-            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, dst, dstw));
-        }
-        else {
-            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, src2, src2w));
-            FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, dst, dstw));
-        }
-    }
-    else if (flags & SLOW_SRC1)
-        FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG1, src1, src1w, dst, dstw));
-    else if (flags & SLOW_SRC2)
-        FAIL_IF(getput_arg(compiler, mem_flags, TMP_REG2, src2, src2w, dst, dstw));
-
-    if (src1 & SLJIT_MEM)
-        src1 = TMP_REG1;
-    if (src2 & SLJIT_MEM)
-        src2 = TMP_REG2;
-
     if (src1 & SLJIT_IMM)
         flags |= ARG1_IMM;
     else
         src1w = src1;
+
     if (src2 & SLJIT_IMM)
         flags |= ARG2_IMM;
     else
@@ -1554,14 +1243,8 @@


     emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src1w, src2w);


-    if (dst & SLJIT_MEM) {
-        if (!(flags & SLOW_DEST)) {
-            getput_arg_fast(compiler, mem_flags | STORE, dst_r, dst, dstw);
-            return compiler->error;
-        }
-        return getput_arg(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
-    }
-
+    if (dst & SLJIT_MEM)
+        return emit_op_mem(compiler, mem_flags | STORE, dst_r, dst, dstw, TMP_REG2);
     return SLJIT_SUCCESS;
 }


@@ -1593,54 +1276,50 @@
 static sljit_s32 emit_fop_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
 {
     sljit_u32 shift = MEM_SIZE_SHIFT(flags);
-    sljit_ins ins_bits = (shift << 30);
-    sljit_s32 other_r;
-    sljit_sw diff;
+    sljit_ins type = (shift << 30);


     SLJIT_ASSERT(arg & SLJIT_MEM);


     if (!(flags & STORE))
-        ins_bits |= 1 << 22;
+        type |= 0x00400000;


     if (arg & OFFS_REG_MASK) {
         argw &= 3;
-        if (!argw || argw == shift)
-            return push_inst(compiler, STR_FR | ins_bits | VT(reg)
+        if (argw == 0 || argw == shift)
+            return push_inst(compiler, STR_FR | type | VT(reg)
                 | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw ? (1 << 12) : 0));
-        other_r = OFFS_REG(arg);
-        arg &= REG_MASK;
-        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RN(arg) | RM(other_r) | (argw << 10)));
-        arg = TMP_REG1;
-        argw = 0;
+
+        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | (argw << 10)));
+        return push_inst(compiler, STR_FI | type | VT(reg) | RN(TMP_REG1));
     }


     arg &= REG_MASK;
-    if (arg && argw >= 0 && ((argw >> shift) <= 0xfff) && (argw & ((1 << shift) - 1)) == 0)
-        return push_inst(compiler, STR_FI | ins_bits | VT(reg) | RN(arg) | (argw << (10 - shift)));


-    if (arg && argw <= 255 && argw >= -256)
-        return push_inst(compiler, STUR_FI | ins_bits | VT(reg) | RN(arg) | ((argw & 0x1ff) << 12));
+    if (arg == SLJIT_UNUSED) {
+        FAIL_IF(load_immediate(compiler, TMP_REG1, argw & ~(0xfff << shift)));


-    /* Slow cases */
-    if (compiler->cache_arg == SLJIT_MEM && argw != compiler->cache_argw) {
-        diff = argw - compiler->cache_argw;
-        if (!arg && diff <= 255 && diff >= -256)
-            return push_inst(compiler, STUR_FI | ins_bits | VT(reg) | RN(TMP_REG3) | ((diff & 0x1ff) << 12));
-        if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, argw - compiler->cache_argw) != SLJIT_ERR_UNSUPPORTED) {
-            FAIL_IF(compiler->error);
-            compiler->cache_argw = argw;
-        }
+        argw = (argw >> shift) & 0xfff;
+
+        return push_inst(compiler, STR_FI | type | VT(reg) | RN(TMP_REG1) | (argw << 10));
     }


-    if (compiler->cache_arg != SLJIT_MEM || argw != compiler->cache_argw) {
-        compiler->cache_arg = SLJIT_MEM;
-        compiler->cache_argw = argw;
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+    if (argw >= 0 && (argw & ((1 << shift) - 1)) == 0) {
+        if ((argw >> shift) <= 0xfff)
+            return push_inst(compiler, STR_FI | type | VT(reg) | RN(arg) | (argw << (10 - shift)));
+
+        if (argw <= 0xffffff) {
+            FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(TMP_REG1) | RN(arg) | ((argw >> 12) << 10)));
+
+            argw = ((argw & 0xfff) >> shift);
+            return push_inst(compiler, STR_FI | type | VT(reg) | RN(TMP_REG1) | (argw << 10));
+        }
     }


-    if (arg & REG_MASK)
-        return push_inst(compiler, STR_FR | ins_bits | VT(reg) | RN(arg) | RM(TMP_REG3));
-    return push_inst(compiler, STR_FI | ins_bits | VT(reg) | RN(TMP_REG3));
+    if (argw <= 255 && argw >= -256)
+        return push_inst(compiler, STUR_FI | type | VT(reg) | RN(arg) | ((argw & 0x1ff) << 12));
+
+    FAIL_IF(load_immediate(compiler, TMP_REG1, argw));
+    return push_inst(compiler, STR_FR | type | VT(reg) | RN(arg) | RM(TMP_REG1));
 }


 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
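
emit_fop_mem follows the same pattern for the FP registers: shift << 30 selects single or double precision, 0x00400000 flips a store into a load, STR_FI (0x3d000000) is the scaled-immediate form, STUR_FI (0x3c000000) the unscaled 9-bit form, STR_FR (0x3c206800) the register-offset form, and TMP_REG1 is the only scratch base now that the TMP_REG3 cache is gone. A sketch of one encoding:

    /* double-precision load, unsigned scaled offset:
       STR_FI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x00400000
         = 0x3d000000 | 0xc0000000 | 0x00400000 = 0xfd400000
         -> ldr dT, [xN, #imm12 * 8]                                       */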
@@ -1661,7 +1340,7 @@
     FAIL_IF(push_inst(compiler, (FCVTZS ^ inv_bits) | RD(dst_r) | VN(src)));


     if (dst & SLJIT_MEM)
-        return emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_S32_FROM_F64) ? INT_SIZE : WORD_SIZE) | STORE, TMP_REG1, dst, dstw);
+        return emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_S32_FROM_F64) ? INT_SIZE : WORD_SIZE) | STORE, TMP_REG1, dst, dstw, TMP_REG2);
     return SLJIT_SUCCESS;
 }


@@ -1676,7 +1355,7 @@
         inv_bits |= (1 << 31);


     if (src & SLJIT_MEM) {
-        emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) ? INT_SIZE : WORD_SIZE), TMP_REG1, src, srcw);
+        emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) ? INT_SIZE : WORD_SIZE), TMP_REG1, src, srcw, TMP_REG1);
         src = TMP_REG1;
     } else if (src & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -1722,10 +1401,8 @@
     sljit_ins inv_bits;


     CHECK_ERROR();
-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;


-    SLJIT_COMPILE_ASSERT((INT_SIZE ^ 0x100) == WORD_SIZE, must_be_one_bit_difference);
+    SLJIT_COMPILE_ASSERT((INT_SIZE ^ 0x1) == WORD_SIZE, must_be_one_bit_difference);
     SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);


     inv_bits = (op & SLJIT_F32_OP) ? (1 << 22) : 0;
@@ -1732,7 +1409,7 @@
     dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (src & SLJIT_MEM) {
-        emit_fop_mem(compiler, (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) ? (mem_flags ^ 0x100) : mem_flags, dst_r, src, srcw);
+        emit_fop_mem(compiler, (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) ? (mem_flags ^ 0x1) : mem_flags, dst_r, src, srcw);
         src = dst_r;
     }


@@ -1775,9 +1452,6 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-
     dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
     if (src1 & SLJIT_MEM) {
         emit_fop_mem(compiler, mem_flags, TMP_FREG1, src1, src1w);
@@ -1822,7 +1496,7 @@
         return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | RM(TMP_LR));


     /* Memory. */
-    return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_LR, dst, dstw);
+    return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_LR, dst, dstw, TMP_REG1);
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
@@ -1833,10 +1507,8 @@

     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, ORR | RD(TMP_LR) | RN(TMP_ZERO) | RM(src)));
-    else if (src & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_LR, src, srcw));
-    else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, TMP_LR, srcw));
+    else
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_LR, src, srcw, TMP_REG1));


     return push_inst(compiler, RET | RN(TMP_LR));
 }
@@ -1971,7 +1643,7 @@
     jump->flags |= IS_CBZ | IS_COND;


     if (src & SLJIT_MEM) {
-        PTR_FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG1, src, srcw));
+        PTR_FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
         src = TMP_REG1;
     }
     else if (src & SLJIT_IMM) {
@@ -1978,6 +1650,7 @@
         PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
         src = TMP_REG1;
     }
+
     SLJIT_ASSERT(FAST_IS_REG(src));


     if ((type & 0xff) == SLJIT_EQUAL)
@@ -2000,7 +1673,7 @@


     if (!(src & SLJIT_IMM)) {
         if (src & SLJIT_MEM) {
-            FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw));
+            FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
             src = TMP_REG1;
         }
         return push_inst(compiler, ((type >= SLJIT_FAST_CALL) ? BLR : BR) | RN(src));
@@ -2048,15 +1721,18 @@


     if (GET_OPCODE(op) < SLJIT_ADD) {
         FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(dst_r) | RN(TMP_ZERO) | RM(TMP_ZERO)));
-        if (dst_r != TMP_REG1)
-            return SLJIT_SUCCESS;
-        return emit_op_mem(compiler, (GET_OPCODE(op) == SLJIT_MOV ? WORD_SIZE : INT_SIZE) | STORE, TMP_REG1, dst, dstw);
+
+        if (dst_r == TMP_REG1) {
+            mem_flags = (GET_OPCODE(op) == SLJIT_MOV ? WORD_SIZE : INT_SIZE) | STORE;
+            return emit_op_mem(compiler, mem_flags, TMP_REG1, dst, dstw, TMP_REG2);
+        }
+
+        return SLJIT_SUCCESS;
     }


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
     flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
     mem_flags = WORD_SIZE;
+
     if (op & SLJIT_I32_OP) {
         flags |= INT_OP;
         mem_flags = INT_SIZE;
@@ -2065,7 +1741,7 @@
     src_r = dst;


     if (dst & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, dst, dstw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, mem_flags, TMP_REG1, dst, dstw, TMP_REG1));
         src_r = TMP_REG1;
     }


@@ -2073,7 +1749,7 @@
     emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src_r, TMP_REG2);


     if (dst & SLJIT_MEM)
-        return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 0, 0);
+        return emit_op_mem(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, TMP_REG2);
     return SLJIT_SUCCESS;
 }


@@ -2101,6 +1777,85 @@
     return push_inst(compiler, (CSEL ^ inv_bits) | (cc << 12) | RD(dst_reg) | RN(dst_reg) | RM(src));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_u32 sign = 0, inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+    if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
+        return SLJIT_ERR_UNSUPPORTED;
+
+    if (type & SLJIT_MEM_SUPP)
+        return SLJIT_SUCCESS;
+
+    switch (type & 0xff) {
+    case SLJIT_MOV:
+    case SLJIT_MOV_P:
+        inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400;
+        break;
+    case SLJIT_MOV_S8:
+        sign = 1;
+    case SLJIT_MOV_U8:
+        inst = STURBI | (MEM_SIZE_SHIFT(BYTE_SIZE) << 30) | 0x400;
+        break;
+    case SLJIT_MOV_S16:
+        sign = 1;
+    case SLJIT_MOV_U16:
+        inst = STURBI | (MEM_SIZE_SHIFT(HALF_SIZE) << 30) | 0x400;
+        break;
+    case SLJIT_MOV_S32:
+        sign = 1;
+    case SLJIT_MOV_U32:
+        inst = STURBI | (MEM_SIZE_SHIFT(INT_SIZE) << 30) | 0x400;
+        break;
+    default:
+        SLJIT_UNREACHABLE();
+        inst = STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400;
+        break;
+    }
+
+    if (!(type & SLJIT_MEM_STORE))
+        inst |= sign ? 0x00800000 : 0x00400000;
+
+    if (type & SLJIT_MEM_PRE)
+        inst |= 0x800;
+
+    return push_inst(compiler, inst | RT(reg) | RN(mem & REG_MASK) | ((memw & 0x1ff) << 12));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 freg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_u32 inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+    if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
+        return SLJIT_ERR_UNSUPPORTED;
+
+    if (type & SLJIT_MEM_SUPP)
+        return SLJIT_SUCCESS;
+
+    inst = STUR_FI | 0x80000400;
+
+    if (!(type & SLJIT_F32_OP))
+        inst |= 0x40000000;
+
+    if (!(type & SLJIT_MEM_STORE))
+        inst |= 0x00400000;
+
+    if (type & SLJIT_MEM_PRE)
+        inst |= 0x800;
+
+    return push_inst(compiler, inst | VT(freg) | RN(mem & REG_MASK) | ((memw & 0x1ff) << 12));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
     struct sljit_const *const_;
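
The ARM-64 sljit_emit_mem and sljit_emit_fmem above use the write-back (indexed) load/store encodings: 0x400 (bit 10) selects the indexed variant with post-update semantics, or-ing in 0x800 (bit 11) turns it into pre-update, and the signed 9-bit offset lands in bits 20:12, hence (memw & 0x1ff) << 12. Register-offset addressing and offsets outside -256..255 are rejected with SLJIT_ERR_UNSUPPORTED so the caller can fall back to an explicit add. One assembled value, as an illustration only:

    /* 64-bit post-update store (SLJIT_MOV | SLJIT_MEM_STORE, no SLJIT_MEM_PRE):
       STURBI | (MEM_SIZE_SHIFT(WORD_SIZE) << 30) | 0x400
         = 0x38000000 | 0xc0000000 | 0x400 = 0xf8000400
         -> str xT, [xN], #imm9                                            */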
@@ -2118,7 +1873,7 @@
     PTR_FAIL_IF(emit_imm64_const(compiler, dst_r, init_value));


     if (dst & SLJIT_MEM)
-        PTR_FAIL_IF(emit_op_mem(compiler, WORD_SIZE | STORE, dst_r, dst, dstw));
+        PTR_FAIL_IF(emit_op_mem(compiler, WORD_SIZE | STORE, dst_r, dst, dstw, TMP_REG2));
     return const_;
 }



Modified: code/trunk/src/sljit/sljitNativeARM_T2_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_T2_32.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeARM_T2_32.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -453,7 +453,6 @@
         return 1;
 #endif


-    case SLJIT_HAS_PRE_UPDATE:
     case SLJIT_HAS_CLZ:
     case SLJIT_HAS_CMOV:
         return 1;
@@ -738,34 +737,26 @@
     case SLJIT_MOV_U32:
     case SLJIT_MOV_S32:
     case SLJIT_MOV_P:
-    case SLJIT_MOVU:
-    case SLJIT_MOVU_U32:
-    case SLJIT_MOVU_S32:
-    case SLJIT_MOVU_P:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG2);
         if (dst == arg2)
             return SLJIT_SUCCESS;
         return push_inst16(compiler, MOV | SET_REGS44(dst, arg2));
     case SLJIT_MOV_U8:
-    case SLJIT_MOVU_U8:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG2);
         if (IS_2_LO_REGS(dst, arg2))
             return push_inst16(compiler, UXTB | RD3(dst) | RN3(arg2));
         return push_inst32(compiler, UXTB_W | RD4(dst) | RM4(arg2));
     case SLJIT_MOV_S8:
-    case SLJIT_MOVU_S8:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG2);
         if (IS_2_LO_REGS(dst, arg2))
             return push_inst16(compiler, SXTB | RD3(dst) | RN3(arg2));
         return push_inst32(compiler, SXTB_W | RD4(dst) | RM4(arg2));
     case SLJIT_MOV_U16:
-    case SLJIT_MOVU_U16:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG2);
         if (IS_2_LO_REGS(dst, arg2))
             return push_inst16(compiler, UXTH | RD3(dst) | RN3(arg2));
         return push_inst32(compiler, UXTH_W | RD4(dst) | RM4(arg2));
     case SLJIT_MOV_S16:
-    case SLJIT_MOVU_S16:
         SLJIT_ASSERT(!(flags & SET_FLAGS) && arg1 == TMP_REG2);
         if (IS_2_LO_REGS(dst, arg2))
             return push_inst16(compiler, SXTH | RD3(dst) | RN3(arg2));
@@ -849,8 +840,6 @@
 #define HALF_SIZE    0x08
 #define PRELOAD        0x0c


-#define UPDATE        0x10
-
 #define IS_WORD_SIZE(flags)        (!(flags & (BYTE_SIZE | HALF_SIZE)))
 #define OFFSET_CHECK(imm, shift)    (!(argw & ~(imm << shift)))


@@ -949,12 +938,10 @@
     sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
     sljit_s32 other_r;
-    sljit_s32 update = flags & UPDATE;
     sljit_uw tmp;


     SLJIT_ASSERT(arg & SLJIT_MEM);
     SLJIT_ASSERT((arg & REG_MASK) != tmp_reg);
-    flags &= ~UPDATE;
     arg &= ~SLJIT_MEM;


     if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
@@ -970,63 +957,6 @@
         return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(tmp_reg));
     }


-    if (SLJIT_UNLIKELY(update)) {
-        SLJIT_ASSERT(reg != arg);
-
-        if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-            other_r = OFFS_REG(arg);
-            arg &= 0xf;
-
-            if (IS_3_LO_REGS(reg, arg, other_r))
-                FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(other_r)));
-            else
-                FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(other_r)));
-            return push_inst16(compiler, ADD | SET_REGS44(arg, other_r));
-        }
-
-        if (argw > 0xff) {
-            tmp = get_imm(argw & ~0xff);
-            if (tmp != INVALID_IMM) {
-                push_inst32(compiler, ADD_WI | RD4(arg) | RN4(arg) | tmp);
-                argw = argw & 0xff;
-            }
-        }
-        else if (argw < -0xff) {
-            tmp = get_imm(-argw & ~0xff);
-            if (tmp != INVALID_IMM) {
-                push_inst32(compiler, SUB_WI | RD4(arg) | RN4(arg) | tmp);
-                argw = -(-argw & 0xff);
-            }
-        }
-
-        if (argw == 0) {
-            if (IS_2_LO_REGS(reg, arg) && sljit_mem16_imm5[flags])
-                return push_inst16(compiler, sljit_mem16_imm5[flags] | RD3(reg) | RN3(arg));
-            return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(arg));
-        }
-
-        if (argw <= 0xff && argw >= -0xff) {
-            if (argw >= 0)
-                argw |= 0x200;
-            else {
-                argw = -argw;
-            }
-
-            SLJIT_ASSERT(argw >= 0 && (argw & 0xff) <= 0xff);
-            return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM8 | RT4(reg) | RN4(arg) | 0x100 | argw);
-        }
-
-        FAIL_IF(load_immediate(compiler, tmp_reg, argw));
-
-        SLJIT_ASSERT(reg != tmp_reg);
-
-        if (IS_3_LO_REGS(reg, arg, tmp_reg))
-            FAIL_IF(push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(tmp_reg)));
-        else
-            FAIL_IF(push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(tmp_reg)));
-        return push_inst16(compiler, ADD | SET_REGS44(arg, tmp_reg));
-    }
-
     if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
         argw &= 0x3;
         other_r = OFFS_REG(arg);
@@ -1300,7 +1230,7 @@
     dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;


     op = GET_OPCODE(op);
-    if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
+    if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
         switch (op) {
         case SLJIT_MOV:
         case SLJIT_MOV_U32:
@@ -1328,32 +1258,6 @@
             if (src & SLJIT_IMM)
                 srcw = (sljit_s16)srcw;
             break;
-        case SLJIT_MOVU:
-        case SLJIT_MOVU_U32:
-        case SLJIT_MOVU_S32:
-        case SLJIT_MOVU_P:
-            flags = WORD_SIZE | UPDATE;
-            break;
-        case SLJIT_MOVU_U8:
-            flags = BYTE_SIZE | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_u8)srcw;
-            break;
-        case SLJIT_MOVU_S8:
-            flags = BYTE_SIZE | SIGNED | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s8)srcw;
-            break;
-        case SLJIT_MOVU_U16:
-            flags = HALF_SIZE | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_u16)srcw;
-            break;
-        case SLJIT_MOVU_S16:
-            flags = HALF_SIZE | SIGNED | UPDATE;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s16)srcw;
-            break;
         default:
             SLJIT_UNREACHABLE();
             flags = 0;
@@ -1363,7 +1267,7 @@
         if (src & SLJIT_IMM)
             FAIL_IF(emit_op_imm(compiler, SLJIT_MOV | ARG2_IMM, dst_r, TMP_REG2, srcw));
         else if (src & SLJIT_MEM) {
-            FAIL_IF(emit_op_mem(compiler, flags, dst_r, src, srcw, ((flags & UPDATE) && dst_r == TMP_REG1) ? TMP_REG2 : TMP_REG1));
+            FAIL_IF(emit_op_mem(compiler, flags, dst_r, src, srcw, TMP_REG1));
         } else {
             if (dst_r != TMP_REG1)
                 return emit_op_imm(compiler, op, dst_r, TMP_REG2, src);
@@ -1373,7 +1277,7 @@
         if (!(dst & SLJIT_MEM))
             return SLJIT_SUCCESS;


-        return emit_op_mem(compiler, flags | STORE, dst_r, dst, dstw, (dst_r == TMP_REG1) ? TMP_REG2 : TMP_REG1);
+        return emit_op_mem(compiler, flags | STORE, dst_r, dst, dstw, TMP_REG2);
     }


     if (op == SLJIT_NEG) {
@@ -1386,20 +1290,16 @@


     flags = HAS_FLAGS(op_flags) ? SET_FLAGS : 0;


-    if (src & SLJIT_IMM)
-        flags |= ARG2_IMM;
-    else if (src & SLJIT_MEM) {
+    if (src & SLJIT_MEM) {
         FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
-        srcw = TMP_REG1;
+        src = TMP_REG1;
     }
-    else
-        srcw = src;


-    emit_op_imm(compiler, flags | op, dst_r, TMP_REG2, srcw);
+    emit_op_imm(compiler, flags | op, dst_r, TMP_REG2, src);


-    if (!(dst & SLJIT_MEM))
-        return SLJIT_SUCCESS;
-    return emit_op_mem(compiler, flags | STORE, dst_r, dst, dstw, TMP_REG2);
+    if (SLJIT_UNLIKELY(dst & SLJIT_MEM))
+        return emit_op_mem(compiler, flags | STORE, dst_r, dst, dstw, TMP_REG2);
+    return SLJIT_SUCCESS;
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
@@ -1713,11 +1613,9 @@

     if (FAST_IS_REG(src))
         FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG2, src)));
-    else if (src & SLJIT_MEM) {
+    else
         FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, src, srcw, TMP_REG2));
-    }
-    else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));
+
     return push_inst16(compiler, BX | RN3(TMP_REG2));
 }


@@ -2231,6 +2129,63 @@
         | COPY_BITS(tmp, 12 + 16, 16, 4) | COPY_BITS(tmp, 11 + 16, 26, 1) | COPY_BITS(tmp, 8 + 16, 12, 3) | ((tmp & 0xff0000) >> 16));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_s32 flags;
+    sljit_ins inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+    if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -255))
+        return SLJIT_ERR_UNSUPPORTED;
+
+    if (type & SLJIT_MEM_SUPP)
+        return SLJIT_SUCCESS;
+
+    switch (type & 0xff) {
+    case SLJIT_MOV:
+    case SLJIT_MOV_U32:
+    case SLJIT_MOV_S32:
+    case SLJIT_MOV_P:
+        flags = WORD_SIZE;
+        break;
+    case SLJIT_MOV_U8:
+        flags = BYTE_SIZE;
+        break;
+    case SLJIT_MOV_S8:
+        flags = BYTE_SIZE | SIGNED;
+        break;
+    case SLJIT_MOV_U16:
+        flags = HALF_SIZE;
+        break;
+    case SLJIT_MOV_S16:
+        flags = HALF_SIZE | SIGNED;
+        break;
+    default:
+        SLJIT_UNREACHABLE();
+        flags = WORD_SIZE;
+        break;
+    }
+
+    if (type & SLJIT_MEM_STORE)
+        flags |= STORE;
+
+    inst = sljit_mem32[flags] | 0x900;
+
+    if (type & SLJIT_MEM_PRE)
+        inst |= 0x400;
+
+    if (memw >= 0)
+        inst |= 0x200;
+    else
+        memw = -memw;
+
+    return push_inst32(compiler, inst | RT4(reg) | RN4(mem & REG_MASK) | memw);
+}
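
As with the ARM64 version, a hedged reading of the constants against the T32 load/store (immediate with writeback) encoding; the patch itself does not document them:

    /* Illustration only.  sljit_mem32[flags] | 0x900 selects the imm8 form with
       the W (writeback) bit set, i.e. post-indexed unless P is also set:
         | 0x400  sets P, giving the pre-indexed form [Rn, #imm]!
         | 0x200  sets U, so the (made positive above) offset is added
       and memw itself, at most 255 after the range check, fills the imm8 field. */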
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
     struct sljit_const *const_;


Modified: code/trunk/src/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_common.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeMIPS_common.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -558,21 +558,20 @@


 #define MEM_MASK    0x1f


-#define WRITE_BACK    0x00020
-#define ARG_TEST    0x00040
-#define ALT_KEEP_CACHE    0x00080
-#define CUMULATIVE_OP    0x00100
-#define LOGICAL_OP    0x00200
-#define IMM_OP        0x00400
-#define SRC2_IMM    0x00800
+#define ARG_TEST    0x00020
+#define ALT_KEEP_CACHE    0x00040
+#define CUMULATIVE_OP    0x00080
+#define LOGICAL_OP    0x00100
+#define IMM_OP        0x00200
+#define SRC2_IMM    0x00400


-#define UNUSED_DEST    0x01000
-#define REG_DEST    0x02000
-#define REG1_SOURCE    0x04000
-#define REG2_SOURCE    0x08000
-#define SLOW_SRC1    0x10000
-#define SLOW_SRC2    0x20000
-#define SLOW_DEST    0x40000
+#define UNUSED_DEST    0x00800
+#define REG_DEST    0x01000
+#define REG1_SOURCE    0x02000
+#define REG2_SOURCE    0x04000
+#define SLOW_SRC1    0x08000
+#define SLOW_SRC2    0x10000
+#define SLOW_DEST    0x20000


 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 #define STACK_STORE    SW
@@ -756,7 +755,7 @@
 {
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if ((!(flags & WRITE_BACK) || !(arg & REG_MASK)) && !(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN) {
+    if (!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN) {
         /* Works for both absolute and relative addresses. */
         if (SLJIT_UNLIKELY(flags & ARG_TEST))
             return 1;
@@ -813,12 +812,6 @@
     base = arg & REG_MASK;


     if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-        if (SLJIT_UNLIKELY(flags & WRITE_BACK)) {
-            SLJIT_ASSERT(argw == 0);
-            FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(OFFS_REG(arg)) | D(base), DR(base)));
-            return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(base) | TA(reg_ar), delay_slot);
-        }
-
         argw &= 0x3;


         /* Using the cache. */
@@ -855,29 +848,6 @@
         return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
     }


-    if (SLJIT_UNLIKELY(flags & WRITE_BACK) && base) {
-        if (argw <= SIMM_MAX && argw >= SIMM_MIN) {
-            if (argw)
-                FAIL_IF(push_inst(compiler, ADDIU_W | S(base) | T(base) | IMM(argw), DR(base)));
-        }
-        else {
-            if (compiler->cache_arg == SLJIT_MEM && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN) {
-                if (argw != compiler->cache_argw) {
-                    FAIL_IF(push_inst(compiler, ADDIU_W | S(TMP_REG3) | T(TMP_REG3) | IMM(argw - compiler->cache_argw), DR(TMP_REG3)));
-                    compiler->cache_argw = argw;
-                }
-                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(TMP_REG3) | D(base), DR(base)));
-            }
-            else {
-                compiler->cache_arg = SLJIT_MEM;
-                compiler->cache_argw = argw;
-                FAIL_IF(load_immediate(compiler, DR(TMP_REG3), argw));
-                FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(TMP_REG3) | D(base), DR(base)));
-            }
-        }
-        return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(base) | TA(reg_ar), delay_slot);
-    }
-
     if (compiler->cache_arg == arg && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN) {
         if (argw != compiler->cache_argw) {
             FAIL_IF(push_inst(compiler, ADDIU_W | S(TMP_REG3) | T(TMP_REG3) | IMM(argw - compiler->cache_argw), DR(TMP_REG3)));
@@ -951,7 +921,7 @@
     else if (FAST_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+        if (op >= SLJIT_MOV && op <= SLJIT_MOV_P)
             sugg_src2_r = dst_r;
     }
     else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, DR(TMP_REG1), dst, dstw))
@@ -1005,7 +975,7 @@
     if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
-        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOV_P)
             dst_r = src2_r;
     }
     else if (src2 & SLJIT_IMM) {
@@ -1016,7 +986,7 @@
             }
             else {
                 src2_r = 0;
-                if ((op >= SLJIT_MOV && op <= SLJIT_MOVU_S32) && (dst & SLJIT_MEM))
+                if ((op >= SLJIT_MOV && op <= SLJIT_MOV_P) && (dst & SLJIT_MEM))
                     dst_r = 0;
             }
         }
@@ -1155,11 +1125,8 @@
     }


 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-    if ((op & SLJIT_I32_OP) && GET_OPCODE(op) >= SLJIT_NOT) {
+    if ((op & SLJIT_I32_OP) && GET_OPCODE(op) >= SLJIT_NOT)
         flags |= INT_DATA | SIGNED_DATA;
-        if (src & SLJIT_IMM)
-            srcw = (sljit_s32)srcw;
-    }
 #endif


     switch (GET_OPCODE(op)) {
@@ -1193,36 +1160,6 @@
     case SLJIT_MOV_S16:
         return emit_op(compiler, SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);


-    case SLJIT_MOVU:
-    case SLJIT_MOVU_P:
-        return emit_op(compiler, SLJIT_MOV, WORD_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-    case SLJIT_MOVU_U32:
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-        return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-#else
-        return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u32)srcw : srcw);
-#endif
-
-    case SLJIT_MOVU_S32:
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-        return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-#else
-        return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s32)srcw : srcw);
-#endif
-
-    case SLJIT_MOVU_U8:
-        return emit_op(compiler, SLJIT_MOV_U8, BYTE_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
-
-    case SLJIT_MOVU_S8:
-        return emit_op(compiler, SLJIT_MOV_S8, BYTE_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
-
-    case SLJIT_MOVU_U16:
-        return emit_op(compiler, SLJIT_MOV_U16, HALF_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
-
-    case SLJIT_MOVU_S16:
-        return emit_op(compiler, SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
-
     case SLJIT_NOT:
         return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);


@@ -1233,6 +1170,7 @@
         return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
     }


+    SLJIT_UNREACHABLE();
     return SLJIT_SUCCESS;


 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1304,6 +1242,7 @@
         return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
     }


+    SLJIT_UNREACHABLE();
     return SLJIT_SUCCESS;


#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1595,10 +1534,8 @@

     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, ADDU_W | S(src) | TA(0) | DA(RETURN_ADDR_REG), RETURN_ADDR_REG));
-    else if (src & SLJIT_MEM)
+    else
         FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, RETURN_ADDR_REG, src, srcw));
-    else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, RETURN_ADDR_REG, srcw));


     FAIL_IF(push_inst(compiler, JR | SA(RETURN_ADDR_REG), UNMOVABLE_INS));
     return push_inst(compiler, NOP, UNMOVABLE_INS);


Modified: code/trunk/src/sljit/sljitNativePPC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativePPC_common.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativePPC_common.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -93,11 +93,10 @@


 #define TMP_REG1    (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
-#define TMP_REG3    (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_ZERO    (SLJIT_NUMBER_OF_REGISTERS + 5)
+#define TMP_ZERO    (SLJIT_NUMBER_OF_REGISTERS + 4)


 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
-#define TMP_CALL_REG    (SLJIT_NUMBER_OF_REGISTERS + 6)
+#define TMP_CALL_REG    (SLJIT_NUMBER_OF_REGISTERS + 5)
 #else
 #define TMP_CALL_REG    TMP_REG2
 #endif
@@ -106,7 +105,7 @@
 #define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
-    0, 3, 4, 5, 6, 7, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 8, 9, 10, 31, 12
+    0, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 9, 10, 31, 12
 };


 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
@@ -540,7 +539,6 @@
         return 1;
 #endif


-    case SLJIT_HAS_PRE_UPDATE:
     case SLJIT_HAS_CLZ:
         return 1;


@@ -558,46 +556,40 @@
 /* Creates an index in data_transfer_insts array. */
 #define LOAD_DATA    0x01
 #define INDEXED        0x02
-#define WRITE_BACK    0x04
+#define SIGNED_DATA    0x04
+
 #define WORD_DATA    0x00
 #define BYTE_DATA    0x08
 #define HALF_DATA    0x10
 #define INT_DATA    0x18
-#define SIGNED_DATA    0x20
 /* Separates integer and floating point registers */
-#define GPR_REG        0x3f
-#define DOUBLE_DATA    0x40
+#define GPR_REG        0x1f
+#define DOUBLE_DATA    0x20


 #define MEM_MASK    0x7f


/* Other inp_flags. */

-#define ARG_TEST    0x000100
 /* Integer operation and set flags -> requires exts on 64 bit systems. */
-#define ALT_SIGN_EXT    0x000200
+#define ALT_SIGN_EXT    0x000100
 /* This flag affects the RC() and OERC() macros. */
 #define ALT_SET_FLAGS    0x000400
-#define ALT_KEEP_CACHE    0x000800
-#define ALT_FORM1    0x010000
-#define ALT_FORM2    0x020000
-#define ALT_FORM3    0x040000
-#define ALT_FORM4    0x080000
-#define ALT_FORM5    0x100000
+#define ALT_FORM1    0x001000
+#define ALT_FORM2    0x002000
+#define ALT_FORM3    0x004000
+#define ALT_FORM4    0x008000
+#define ALT_FORM5    0x010000


 /* Source and destination is register. */
 #define REG_DEST    0x000001
 #define REG1_SOURCE    0x000002
 #define REG2_SOURCE    0x000004
-/* getput_arg_fast returned true. */
-#define FAST_DEST    0x000008
-/* Multiple instructions are required. */
-#define SLOW_DEST    0x000010
 /*
-ALT_SIGN_EXT        0x000200
-ALT_SET_FLAGS        0x000400
-ALT_FORM1        0x010000
+ALT_SIGN_EXT        0x000100
+ALT_SET_FLAGS        0x000200
+ALT_FORM1        0x001000
 ...
-ALT_FORM5        0x100000 */
+ALT_FORM5        0x010000 */


 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
 #include "sljitNativePPC_32.c"
@@ -745,17 +737,17 @@
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */


-/* i/x - immediate/indexed form
-   n/w - no write-back / write-back (1 bit)
-   s/l - store/load (1 bit)
+/* s/l - store/load (1 bit)
+   i/x - immediate/indexed form
    u/s - signed/unsigned (1 bit)
    w/b/h/i - word/byte/half/int allowed (2 bit)
-   It contans 32 items, but not all are different. */


+   Some opcodes are repeated (e.g. store signed / unsigned byte is the same instruction). */
+
 /* 64 bit only: [reg+imm] must be aligned to 4 bytes. */
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #define INT_ALIGNED    0x10000
-/* 64-bit only: there is no lwau instruction. */
-#define UPDATE_REQ    0x20000
+#endif


 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
 #define ARCH_32_64(a, b)    a
@@ -764,406 +756,217 @@
 #else
 #define ARCH_32_64(a, b)    b
 #define INST_CODE_AND_DST(inst, flags, reg) \
-    (((inst) & ~(INT_ALIGNED | UPDATE_REQ)) | (((flags) & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg)))
+    (((inst) & ~INT_ALIGNED) | (((flags) & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg)))
 #endif


static const sljit_ins data_transfer_insts[64 + 16] = {

-/* -------- Unsigned -------- */
+/* -------- Integer -------- */

/* Word. */

-/* u w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
-/* u w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
-/* u w n x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
-/* u w n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),
+/* w u i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
+/* w u i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
+/* w u x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
+/* w u x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),

-/* u w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
-/* u w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
-/* u w w x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
-/* u w w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),
+/* w s i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
+/* w s i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
+/* w s x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
+/* w s x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),

/* Byte. */

-/* u b n i s */ HI(38) /* stb */,
-/* u b n i l */ HI(34) /* lbz */,
-/* u b n x s */ HI(31) | LO(215) /* stbx */,
-/* u b n x l */ HI(31) | LO(87) /* lbzx */,
+/* b u i s */ HI(38) /* stb */,
+/* b u i l */ HI(34) /* lbz */,
+/* b u x s */ HI(31) | LO(215) /* stbx */,
+/* b u x l */ HI(31) | LO(87) /* lbzx */,

-/* u b w i s */ HI(39) /* stbu */,
-/* u b w i l */ HI(35) /* lbzu */,
-/* u b w x s */ HI(31) | LO(247) /* stbux */,
-/* u b w x l */ HI(31) | LO(119) /* lbzux */,
+/* b s i s */ HI(38) /* stb */,
+/* b s i l */ HI(34) /* lbz */ /* EXTS_REQ */,
+/* b s x s */ HI(31) | LO(215) /* stbx */,
+/* b s x l */ HI(31) | LO(87) /* lbzx */ /* EXTS_REQ */,

/* Half. */

-/* u h n i s */ HI(44) /* sth */,
-/* u h n i l */ HI(40) /* lhz */,
-/* u h n x s */ HI(31) | LO(407) /* sthx */,
-/* u h n x l */ HI(31) | LO(279) /* lhzx */,
+/* h u i s */ HI(44) /* sth */,
+/* h u i l */ HI(40) /* lhz */,
+/* h u x s */ HI(31) | LO(407) /* sthx */,
+/* h u x l */ HI(31) | LO(279) /* lhzx */,

-/* u h w i s */ HI(45) /* sthu */,
-/* u h w i l */ HI(41) /* lhzu */,
-/* u h w x s */ HI(31) | LO(439) /* sthux */,
-/* u h w x l */ HI(31) | LO(311) /* lhzux */,
+/* h s i s */ HI(44) /* sth */,
+/* h s i l */ HI(42) /* lha */,
+/* h s x s */ HI(31) | LO(407) /* sthx */,
+/* h s x l */ HI(31) | LO(343) /* lhax */,

/* Int. */

-/* u i n i s */ HI(36) /* stw */,
-/* u i n i l */ HI(32) /* lwz */,
-/* u i n x s */ HI(31) | LO(151) /* stwx */,
-/* u i n x l */ HI(31) | LO(23) /* lwzx */,
+/* i u i s */ HI(36) /* stw */,
+/* i u i l */ HI(32) /* lwz */,
+/* i u x s */ HI(31) | LO(151) /* stwx */,
+/* i u x l */ HI(31) | LO(23) /* lwzx */,

-/* u i w i s */ HI(37) /* stwu */,
-/* u i w i l */ HI(33) /* lwzu */,
-/* u i w x s */ HI(31) | LO(183) /* stwux */,
-/* u i w x l */ HI(31) | LO(55) /* lwzux */,
+/* i s i s */ HI(36) /* stw */,
+/* i s i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x2 /* lwa */),
+/* i s x s */ HI(31) | LO(151) /* stwx */,
+/* i s x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(341) /* lwax */),

-/* -------- Signed -------- */
+/* -------- Floating point -------- */

+/* d i s */ HI(54) /* stfd */,
+/* d i l */ HI(50) /* lfd */,
+/* d x s */ HI(31) | LO(727) /* stfdx */,
+/* d x l */ HI(31) | LO(599) /* lfdx */,
+
+/* s i s */ HI(52) /* stfs */,
+/* s i l */ HI(48) /* lfs */,
+/* s x s */ HI(31) | LO(663) /* stfsx */,
+/* s x l */ HI(31) | LO(535) /* lfsx */,
+};
+
+static const sljit_ins updated_data_transfer_insts[64] = {
+
+/* -------- Integer -------- */
+
/* Word. */

-/* s w n i s */ ARCH_32_64(HI(36) /* stw */, HI(62) | INT_ALIGNED | 0x0 /* std */),
-/* s w n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x0 /* ld */),
-/* s w n x s */ ARCH_32_64(HI(31) | LO(151) /* stwx */, HI(31) | LO(149) /* stdx */),
-/* s w n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(21) /* ldx */),
+/* w u i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
+/* w u i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
+/* w u x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
+/* w u x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),

-/* s w w i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
-/* s w w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
-/* s w w x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
-/* s w w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),
+/* w s i s */ ARCH_32_64(HI(37) /* stwu */, HI(62) | INT_ALIGNED | 0x1 /* stdu */),
+/* w s i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | 0x1 /* ldu */),
+/* w s x s */ ARCH_32_64(HI(31) | LO(183) /* stwux */, HI(31) | LO(181) /* stdux */),
+/* w s x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(53) /* ldux */),

/* Byte. */

-/* s b n i s */ HI(38) /* stb */,
-/* s b n i l */ HI(34) /* lbz */ /* EXTS_REQ */,
-/* s b n x s */ HI(31) | LO(215) /* stbx */,
-/* s b n x l */ HI(31) | LO(87) /* lbzx */ /* EXTS_REQ */,
+/* b u i s */ HI(39) /* stbu */,
+/* b u i l */ HI(35) /* lbzu */,
+/* b u x s */ HI(31) | LO(247) /* stbux */,
+/* b u x l */ HI(31) | LO(119) /* lbzux */,

-/* s b w i s */ HI(39) /* stbu */,
-/* s b w i l */ HI(35) /* lbzu */ /* EXTS_REQ */,
-/* s b w x s */ HI(31) | LO(247) /* stbux */,
-/* s b w x l */ HI(31) | LO(119) /* lbzux */ /* EXTS_REQ */,
+/* b s i s */ HI(39) /* stbu */,
+/* b s i l */ 0 /* no such instruction */,
+/* b s x s */ HI(31) | LO(247) /* stbux */,
+/* b s x l */ 0 /* no such instruction */,

/* Half. */

-/* s h n i s */ HI(44) /* sth */,
-/* s h n i l */ HI(42) /* lha */,
-/* s h n x s */ HI(31) | LO(407) /* sthx */,
-/* s h n x l */ HI(31) | LO(343) /* lhax */,
+/* h u i s */ HI(45) /* sthu */,
+/* h u i l */ HI(41) /* lhzu */,
+/* h u x s */ HI(31) | LO(439) /* sthux */,
+/* h u x l */ HI(31) | LO(311) /* lhzux */,

-/* s h w i s */ HI(45) /* sthu */,
-/* s h w i l */ HI(43) /* lhau */,
-/* s h w x s */ HI(31) | LO(439) /* sthux */,
-/* s h w x l */ HI(31) | LO(375) /* lhaux */,
+/* h s i s */ HI(45) /* sthu */,
+/* h s i l */ HI(43) /* lhau */,
+/* h s x s */ HI(31) | LO(439) /* sthux */,
+/* h s x l */ HI(31) | LO(375) /* lhaux */,

/* Int. */

-/* s i n i s */ HI(36) /* stw */,
-/* s i n i l */ ARCH_32_64(HI(32) /* lwz */, HI(58) | INT_ALIGNED | 0x2 /* lwa */),
-/* s i n x s */ HI(31) | LO(151) /* stwx */,
-/* s i n x l */ ARCH_32_64(HI(31) | LO(23) /* lwzx */, HI(31) | LO(341) /* lwax */),
+/* i u i s */ HI(37) /* stwu */,
+/* i u i l */ HI(33) /* lwzu */,
+/* i u x s */ HI(31) | LO(183) /* stwux */,
+/* i u x l */ HI(31) | LO(55) /* lwzux */,

-/* s i w i s */ HI(37) /* stwu */,
-/* s i w i l */ ARCH_32_64(HI(33) /* lwzu */, HI(58) | INT_ALIGNED | UPDATE_REQ | 0x2 /* lwa */),
-/* s i w x s */ HI(31) | LO(183) /* stwux */,
-/* s i w x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(373) /* lwaux */),
+/* i s i s */ HI(37) /* stwu */,
+/* i s i l */ ARCH_32_64(HI(33) /* lwzu */, 0 /* no such instruction */),
+/* i s x s */ HI(31) | LO(183) /* stwux */,
+/* i s x l */ ARCH_32_64(HI(31) | LO(55) /* lwzux */, HI(31) | LO(373) /* lwaux */),

-/* -------- Double -------- */
+/* -------- Floating point -------- */

-/* d n i s */ HI(54) /* stfd */,
-/* d n i l */ HI(50) /* lfd */,
-/* d n x s */ HI(31) | LO(727) /* stfdx */,
-/* d n x l */ HI(31) | LO(599) /* lfdx */,
+/* d i s */ HI(55) /* stfdu */,
+/* d i l */ HI(51) /* lfdu */,
+/* d x s */ HI(31) | LO(759) /* stfdux */,
+/* d x l */ HI(31) | LO(631) /* lfdux */,

-/* d w i s */ HI(55) /* stfdu */,
-/* d w i l */ HI(51) /* lfdu */,
-/* d w x s */ HI(31) | LO(759) /* stfdux */,
-/* d w x l */ HI(31) | LO(631) /* lfdux */,
-
-/* s n i s */ HI(52) /* stfs */,
-/* s n i l */ HI(48) /* lfs */,
-/* s n x s */ HI(31) | LO(663) /* stfsx */,
-/* s n x l */ HI(31) | LO(535) /* lfsx */,
-
-/* s w i s */ HI(53) /* stfsu */,
-/* s w i l */ HI(49) /* lfsu */,
-/* s w x s */ HI(31) | LO(695) /* stfsux */,
-/* s w x l */ HI(31) | LO(567) /* lfsux */,
+/* s i s */ HI(53) /* stfsu */,
+/* s i l */ HI(49) /* lfsu */,
+/* s x s */ HI(31) | LO(695) /* stfsux */,
+/* s x l */ HI(31) | LO(567) /* lfsux */,
};

#undef ARCH_32_64
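
Both tables are indexed directly by the inp_flags bits defined earlier in this hunk (LOAD_DATA, INDEXED, SIGNED_DATA plus the size selectors), which is why their row order matches. A small worked example, for illustration only:

    sljit_s32 inp_flags = HALF_DATA | SIGNED_DATA | LOAD_DATA;    /* 0x10 | 0x04 | 0x01 = 21 */
    sljit_ins inst = data_transfer_insts[inp_flags & MEM_MASK];   /* slot 21, "h s i l" -> lha  */
    inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK]; /* slot 23, "h s x l" -> lhax */

The same index picks the write-back variant out of updated_data_transfer_insts, with 0 marking combinations such as the sign-extending byte loads that have no update instruction.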

 /* Simple cases, (no caching is required). */
-static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_s32 inp_flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flags, sljit_s32 reg,
+    sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
     sljit_ins inst;
+    sljit_s32 offs_reg;
+    sljit_sw high_short;


     /* Should work when (arg & REG_MASK) == 0. */
     SLJIT_ASSERT(A(0) == 0);
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if (arg & OFFS_REG_MASK) {
-        if (argw & 0x3)
-            return 0;
-        if (inp_flags & ARG_TEST)
-            return 1;
+    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+        argw &= 0x3;
+        offs_reg = OFFS_REG(arg);


-        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
-        FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(OFFS_REG(arg))));
-        return -1;
-    }
-
-    if (SLJIT_UNLIKELY(!(arg & REG_MASK)))
-        inp_flags &= ~WRITE_BACK;
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    inst = data_transfer_insts[inp_flags & MEM_MASK];
-    SLJIT_ASSERT((arg & REG_MASK) || !(inst & UPDATE_REQ));
-
-    if (argw > SIMM_MAX || argw < SIMM_MIN || ((inst & INT_ALIGNED) && (argw & 0x3)) || (inst & UPDATE_REQ))
-        return 0;
-    if (inp_flags & ARG_TEST)
-        return 1;
-#endif
-
+        if (argw != 0) {
 #if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-    if (argw > SIMM_MAX || argw < SIMM_MIN)
-        return 0;
-    if (inp_flags & ARG_TEST)
-        return 1;
-
-    inst = data_transfer_insts[inp_flags & MEM_MASK];
-    SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
+            FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(arg)) | A(tmp_reg) | (argw << 11) | ((31 - argw) << 1)));
+#else
+            FAIL_IF(push_inst(compiler, RLDI(tmp_reg, OFFS_REG(arg), argw, 63 - argw, 1)));
 #endif
+            offs_reg = tmp_reg;
+        }


-    FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | IMM(argw)));
-    return -1;
-}
+        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];


-/* See getput_arg below.
-   Note: can_cache is called only for binary operators. Those operator always
-   uses word arguments without write back. */
-static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-    sljit_sw high_short, next_high_short;
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    sljit_sw diff;
+        SLJIT_ASSERT(!(inst & INT_ALIGNED));
 #endif


-    SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));
-
-    if (arg & OFFS_REG_MASK)
-        return ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && (argw & 0x3) == (next_argw & 0x3));
-
-    if (next_arg & OFFS_REG_MASK)
-        return 0;
-
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-    high_short = (argw + ((argw & 0x8000) << 1)) & ~0xffff;
-    next_high_short = (next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
-    return high_short == next_high_short;
-#else
-    if (argw <= 0x7fffffffl && argw >= -0x80000000l) {
-        high_short = (argw + ((argw & 0x8000) << 1)) & ~0xffff;
-        next_high_short = (next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
-        if (high_short == next_high_short)
-            return 1;
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(offs_reg));
     }


-    diff = argw - next_argw;
-    if (!(arg & REG_MASK))
-        return diff <= SIMM_MAX && diff >= SIMM_MIN;
+    inst = data_transfer_insts[inp_flags & MEM_MASK];
+    arg &= REG_MASK;


-    if (arg == next_arg && diff <= SIMM_MAX && diff >= SIMM_MIN)
-        return 1;
-
-    return 0;
-#endif
-}
-
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-#define ADJUST_CACHED_IMM(imm) \
-    if ((inst & INT_ALIGNED) && (imm & 0x3)) { \
-        /* Adjust cached value. Fortunately this is really a rare case */ \
-        compiler->cache_argw += imm & 0x3; \
-        FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG3) | A(TMP_REG3) | (imm & 0x3))); \
-        imm &= ~0x3; \
-    }
-#endif
+    if ((inst & INT_ALIGNED) && (argw & 0x3) != 0) {
+        FAIL_IF(load_immediate(compiler, tmp_reg, argw));


-/* Emit the necessary instructions. See can_cache above. */
-static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 inp_flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-    sljit_s32 tmp_r;
-    sljit_ins inst;
-    sljit_sw high_short, next_high_short;
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    sljit_sw diff;
-#endif
-
-    SLJIT_ASSERT(arg & SLJIT_MEM);
-
-    tmp_r = ((inp_flags & LOAD_DATA) && ((inp_flags) & MEM_MASK) <= GPR_REG) ? reg : TMP_REG1;
-    /* Special case for "mov reg, [reg, ... ]". */
-    if ((arg & REG_MASK) == tmp_r)
-        tmp_r = TMP_REG1;
-
-    if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-        argw &= 0x3;
-        /* Otherwise getput_arg_fast would capture it. */
-        SLJIT_ASSERT(argw);
-
-        if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg && argw == compiler->cache_argw)
-            tmp_r = TMP_REG3;
-        else {
-            if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == (next_argw & 0x3)) {
-                compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
-                compiler->cache_argw = argw;
-                tmp_r = TMP_REG3;
-            }
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-            FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(arg)) | A(tmp_r) | (argw << 11) | ((31 - argw) << 1)));
-#else
-            FAIL_IF(push_inst(compiler, RLDI(tmp_r, OFFS_REG(arg), argw, 63 - argw, 1)));
-#endif
-        }
         inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(tmp_r));
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
     }
+#endif


-    if (SLJIT_UNLIKELY(!(arg & REG_MASK)))
-        inp_flags &= ~WRITE_BACK;
+    if (argw <= SIMM_MAX && argw >= SIMM_MIN)
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | IMM(argw));


-    inst = data_transfer_insts[inp_flags & MEM_MASK];
-    SLJIT_ASSERT((arg & REG_MASK) || !(inst & UPDATE_REQ));
-
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    if (argw <= 0x7fff7fffl && argw >= -0x80000000l
-            && (!(inst & INT_ALIGNED) || !(argw & 0x3)) && !(inst & UPDATE_REQ)) {
+    if (argw <= 0x7fff7fffl && argw >= -0x80000000l) {
 #endif


-        arg &= REG_MASK;
         high_short = (sljit_s32)(argw + ((argw & 0x8000) << 1)) & ~0xffff;
-        /* The getput_arg_fast should handle this otherwise. */
+
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
         SLJIT_ASSERT(high_short && high_short <= 0x7fffffffl && high_short >= -0x80000000l);
 #else
-        SLJIT_ASSERT(high_short && !(inst & (INT_ALIGNED | UPDATE_REQ)));
+        SLJIT_ASSERT(high_short);
 #endif


-        if (inp_flags & WRITE_BACK) {
-            tmp_r = arg;
-            FAIL_IF(push_inst(compiler, ADDIS | D(arg) | A(arg) | IMM(high_short >> 16)));
-        }
-        else if (compiler->cache_arg != (SLJIT_MEM | arg) || high_short != compiler->cache_argw) {
-            if ((next_arg & SLJIT_MEM) && !(next_arg & OFFS_REG_MASK)) {
-                next_high_short = (sljit_s32)(next_argw + ((next_argw & 0x8000) << 1)) & ~0xffff;
-                if (high_short == next_high_short) {
-                    compiler->cache_arg = SLJIT_MEM | arg;
-                    compiler->cache_argw = high_short;
-                    tmp_r = TMP_REG3;
-                }
-            }
-            FAIL_IF(push_inst(compiler, ADDIS | D(tmp_r) | A(arg & REG_MASK) | IMM(high_short >> 16)));
-        }
-        else
-            tmp_r = TMP_REG3;
+        FAIL_IF(push_inst(compiler, ADDIS | D(tmp_reg) | A(arg) | IMM(high_short >> 16)));
+        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_reg) | IMM(argw));


-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_r) | IMM(argw));
-
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     }


-    /* Everything else is PPC-64 only. */
-    if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
-        diff = argw - compiler->cache_argw;
-        if ((compiler->cache_arg & SLJIT_IMM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-            ADJUST_CACHED_IMM(diff);
-            return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(diff));
-        }
+    /* The rest is PPC-64 only. */


-        diff = argw - next_argw;
-        if ((next_arg & SLJIT_MEM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-            SLJIT_ASSERT(inp_flags & LOAD_DATA);
+    FAIL_IF(load_immediate(compiler, tmp_reg, argw));


-            compiler->cache_arg = SLJIT_IMM;
-            compiler->cache_argw = argw;
-            tmp_r = TMP_REG3;
-        }
-
-        FAIL_IF(load_immediate(compiler, tmp_r, argw));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_r));
-    }
-
-    diff = argw - compiler->cache_argw;
-    if (compiler->cache_arg == arg && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-        SLJIT_ASSERT(!(inp_flags & WRITE_BACK) && !(inst & UPDATE_REQ));
-        ADJUST_CACHED_IMM(diff);
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3) | IMM(diff));
-    }
-
-    if ((compiler->cache_arg & SLJIT_IMM) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
-        if (compiler->cache_argw != argw) {
-            FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG3) | A(TMP_REG3) | IMM(diff)));
-            compiler->cache_argw = argw;
-        }
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(TMP_REG3));
-    }
-
-    if (argw == next_argw && (next_arg & SLJIT_MEM)) {
-        SLJIT_ASSERT(inp_flags & LOAD_DATA);
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-
-        compiler->cache_arg = SLJIT_IMM;
-        compiler->cache_argw = argw;
-
-        inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-        SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(TMP_REG3));
-    }
-
-    diff = argw - next_argw;
-    if (arg == next_arg && !(inp_flags & WRITE_BACK) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-        SLJIT_ASSERT(inp_flags & LOAD_DATA);
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-        FAIL_IF(push_inst(compiler, ADD | D(TMP_REG3) | A(TMP_REG3) | B(arg & REG_MASK)));
-
-        compiler->cache_arg = arg;
-        compiler->cache_argw = argw;
-
-        return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(TMP_REG3));
-    }
-
-    if ((next_arg & SLJIT_MEM) && !(next_arg & OFFS_REG_MASK) && diff <= SIMM_MAX && diff >= SIMM_MIN) {
-        SLJIT_ASSERT(inp_flags & LOAD_DATA);
-        FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-
-        compiler->cache_arg = SLJIT_IMM;
-        compiler->cache_argw = argw;
-        tmp_r = TMP_REG3;
-    }
-    else
-        FAIL_IF(load_immediate(compiler, tmp_r, argw));
-
-    /* Get the indexed version instead of the normal one. */
     inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
-    SLJIT_ASSERT(!(inst & (INT_ALIGNED | UPDATE_REQ)));
-    return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(tmp_r));
+    return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
 #endif
 }
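
The split of argw above relies on the PowerPC D-form displacement being sign-extended, so the upper half loaded by ADDIS must be rounded up whenever bit 15 of the offset is set. A worked example, for illustration only:

    /* argw = 0x12348765 (bit 15 of the low half set):
         high_short = (argw + ((argw & 0x8000) << 1)) & ~0xffff
                    = (0x12348765 + 0x10000) & ~0xffff = 0x12350000
       ADDIS puts 0x1235 into the upper half of tmp_reg, and the D-form access
       sign-extends IMM(argw) = 0x8765 to -0x789b, so the effective offset is
       0x12350000 - 0x789b = 0x12348765 again. */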


-static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
-{
-    if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
-        return compiler->error;
-    return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
-}
-
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 input_flags,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
@@ -1171,40 +974,21 @@
 {
     /* arg1 goes to TMP_REG1 or src reg
        arg2 goes to TMP_REG2, imm or src reg
-       TMP_REG3 can be used for caching
-       result goes to TMP_REG2, so put result can use TMP_REG1 and TMP_REG3. */
-    sljit_s32 dst_r;
+       result goes to TMP_REG2, so put result can use TMP_REG1. */
+    sljit_s32 dst_r = TMP_REG2;
     sljit_s32 src1_r;
     sljit_s32 src2_r;
     sljit_s32 sugg_src2_r = TMP_REG2;
     sljit_s32 flags = input_flags & (ALT_FORM1 | ALT_FORM2 | ALT_FORM3 | ALT_FORM4 | ALT_FORM5 | ALT_SIGN_EXT | ALT_SET_FLAGS);


-    if (!(input_flags & ALT_KEEP_CACHE)) {
-        compiler->cache_arg = 0;
-        compiler->cache_argw = 0;
-    }
-
     /* Destination check. */
-    if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
-        dst_r = TMP_REG2;
-    }
-    else if (FAST_IS_REG(dst)) {
+    if (SLOW_IS_REG(dst)) {
         dst_r = dst;
         flags |= REG_DEST;
-        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+
+        if (op >= SLJIT_MOV && op <= SLJIT_MOV_P)
             sugg_src2_r = dst_r;
     }
-    else {
-        SLJIT_ASSERT(dst & SLJIT_MEM);
-        if (getput_arg_fast(compiler, input_flags | ARG_TEST, TMP_REG2, dst, dstw)) {
-            flags |= FAST_DEST;
-            dst_r = TMP_REG2;
-        }
-        else {
-            flags |= SLOW_DEST;
-            dst_r = 0;
-        }
-    }


     /* Source 1. */
     if (FAST_IS_REG(src1)) {
@@ -1215,18 +999,17 @@
         FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
         src1_r = TMP_REG1;
     }
-    else if (getput_arg_fast(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w)) {
-        FAIL_IF(compiler->error);
+    else {
+        FAIL_IF(emit_op_mem(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, TMP_REG1));
         src1_r = TMP_REG1;
     }
-    else
-        src1_r = 0;


     /* Source 2. */
     if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
-        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+
+        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOV_P)
             dst_r = src2_r;
     }
     else if (src2 & SLJIT_IMM) {
@@ -1233,62 +1016,17 @@
         FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w));
         src2_r = sugg_src2_r;
     }
-    else if (getput_arg_fast(compiler, input_flags | LOAD_DATA, sugg_src2_r, src2, src2w)) {
-        FAIL_IF(compiler->error);
+    else {
+        FAIL_IF(emit_op_mem(compiler, input_flags | LOAD_DATA, sugg_src2_r, src2, src2w, TMP_REG2));
         src2_r = sugg_src2_r;
     }
-    else
-        src2_r = 0;


-    /* src1_r, src2_r and dst_r can be zero (=unprocessed).
-       All arguments are complex addressing modes, and it is a binary operator. */
-    if (src1_r == 0 && src2_r == 0 && dst_r == 0) {
-        if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-            FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG2, src2, src2w, src1, src1w));
-            FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
-        }
-        else {
-            FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, src2, src2w));
-            FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG2, src2, src2w, dst, dstw));
-        }
-        src1_r = TMP_REG1;
-        src2_r = TMP_REG2;
-    }
-    else if (src1_r == 0 && src2_r == 0) {
-        FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, src2, src2w));
-        src1_r = TMP_REG1;
-    }
-    else if (src1_r == 0 && dst_r == 0) {
-        FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
-        src1_r = TMP_REG1;
-    }
-    else if (src2_r == 0 && dst_r == 0) {
-        FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, sugg_src2_r, src2, src2w, dst, dstw));
-        src2_r = sugg_src2_r;
-    }
+    FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));


-    if (dst_r == 0)
-        dst_r = TMP_REG2;
+    if (!(dst & SLJIT_MEM))
+        return SLJIT_SUCCESS;


-    if (src1_r == 0) {
-        FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, TMP_REG1, src1, src1w, 0, 0));
-        src1_r = TMP_REG1;
-    }
-
-    if (src2_r == 0) {
-        FAIL_IF(getput_arg(compiler, input_flags | LOAD_DATA, sugg_src2_r, src2, src2w, 0, 0));
-        src2_r = sugg_src2_r;
-    }
-
-    FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));
-
-    if (flags & (FAST_DEST | SLOW_DEST)) {
-        if (flags & FAST_DEST)
-            FAIL_IF(getput_arg_fast(compiler, input_flags, dst_r, dst, dstw));
-        else
-            FAIL_IF(getput_arg(compiler, input_flags, dst_r, dst, dstw, 0, 0));
-    }
-    return SLJIT_SUCCESS;
+    return emit_op_mem(compiler, input_flags, dst_r, dst, dstw, TMP_REG1);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
@@ -1397,34 +1135,26 @@
             return SLJIT_SUCCESS;
     }


+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     if (op_flags & SLJIT_I32_OP) {
         if (op < SLJIT_NOT) {
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
             if (src & SLJIT_MEM) {
                 if (op == SLJIT_MOV_S32)
                     op = SLJIT_MOV_U32;
-                if (op == SLJIT_MOVU_S32)
-                    op = SLJIT_MOVU_U32;
             }
             else if (src & SLJIT_IMM) {
                 if (op == SLJIT_MOV_U32)
                     op = SLJIT_MOV_S32;
-                if (op == SLJIT_MOVU_U32)
-                    op = SLJIT_MOVU_S32;
             }
-#endif
         }
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
         else {
             /* Most operations expect sign extended arguments. */
             flags |= INT_DATA | SIGNED_DATA;
-            if (src & SLJIT_IMM)
-                srcw = (sljit_s32)srcw;
             if (HAS_FLAGS(op_flags))
                 flags |= ALT_SIGN_EXT;
         }
+    }
 #endif
-    }


     switch (op) {
     case SLJIT_MOV:
@@ -1455,34 +1185,6 @@
     case SLJIT_MOV_S16:
         return EMIT_MOV(SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA, (sljit_s16));


-    case SLJIT_MOVU:
-    case SLJIT_MOVU_P:
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-    case SLJIT_MOVU_U32:
-    case SLJIT_MOVU_S32:
-#endif
-        return emit_op(compiler, SLJIT_MOV, flags | WORD_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-    case SLJIT_MOVU_U32:
-        return EMIT_MOV(SLJIT_MOV_U32, INT_DATA | WRITE_BACK, (sljit_u32));
-
-    case SLJIT_MOVU_S32:
-        return EMIT_MOV(SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | WRITE_BACK, (sljit_s32));
-#endif
-
-    case SLJIT_MOVU_U8:
-        return EMIT_MOV(SLJIT_MOV_U8, BYTE_DATA | WRITE_BACK, (sljit_u8));
-
-    case SLJIT_MOVU_S8:
-        return EMIT_MOV(SLJIT_MOV_S8, BYTE_DATA | SIGNED_DATA | WRITE_BACK, (sljit_s8));
-
-    case SLJIT_MOVU_U16:
-        return EMIT_MOV(SLJIT_MOV_U16, HALF_DATA | WRITE_BACK, (sljit_u16));
-
-    case SLJIT_MOVU_S16:
-        return EMIT_MOV(SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA | WRITE_BACK, (sljit_s16));
-
     case SLJIT_NOT:
         return emit_op(compiler, SLJIT_NOT, flags, dst, dstw, TMP_REG1, 0, src, srcw);


@@ -1570,8 +1272,6 @@
 #endif
     if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
         FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));
-    if (src2 == TMP_REG2)
-        flags |= ALT_KEEP_CACHE;


     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
@@ -1774,7 +1474,7 @@
 /*  Floating point operators                                             */
 /* --------------------------------------------------------------------- */


-#define FLOAT_DATA(op) (DOUBLE_DATA | ((op & SLJIT_F32_OP) >> 5))
+#define FLOAT_DATA(op) (DOUBLE_DATA | ((op & SLJIT_F32_OP) >> 6))
#define SELECT_FOP(op, single, double) ((op & SLJIT_F32_OP) ? single : double)
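
The shift in FLOAT_DATA changes from 5 to 6 because DOUBLE_DATA moved from 0x40 down to 0x20 when the unused flag bits were compacted, so the single-precision selector has to move down with it. Roughly (values taken from the defines above, shown for illustration):

    /* old: DOUBLE_DATA 0x40, (SLJIT_F32_OP >> 5) == 0x08 -> F32 rows at index 0x48
       new: DOUBLE_DATA 0x20, (SLJIT_F32_OP >> 6) == 0x04 -> F32 rows at index 0x24 */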

 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
@@ -1798,7 +1498,7 @@
 {
     if (src & SLJIT_MEM) {
         /* We can ignore the temporary data store on the stack from caching point of view. */
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, TMP_REG1));
         src = TMP_FREG1;
     }


@@ -1808,10 +1508,10 @@

     if (op == SLJIT_CONV_SW_FROM_F64) {
         if (FAST_IS_REG(dst)) {
-            FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, 0, 0));
-            return emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, 0, 0);
+            FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
+            return emit_op_mem(compiler, WORD_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1);
         }
-        return emit_op_mem2(compiler, DOUBLE_DATA, TMP_FREG1, dst, dstw, 0, 0);
+        return emit_op_mem(compiler, DOUBLE_DATA, TMP_FREG1, dst, dstw, TMP_REG1);
     }
 #else
     FAIL_IF(push_inst(compiler, FCTIWZ | FD(TMP_FREG1) | FB(src)));
@@ -1820,7 +1520,7 @@
     if (FAST_IS_REG(dst)) {
         FAIL_IF(load_immediate(compiler, TMP_REG1, FLOAT_TMP_MEM_OFFSET));
         FAIL_IF(push_inst(compiler, STFIWX | FS(TMP_FREG1) | A(SLJIT_SP) | B(TMP_REG1)));
-        return emit_op_mem2(compiler, INT_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, 0, 0);
+        return emit_op_mem(compiler, INT_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1);
     }


     SLJIT_ASSERT(dst & SLJIT_MEM);
@@ -1871,21 +1571,21 @@
         if (FAST_IS_REG(src))
             FAIL_IF(push_inst(compiler, EXTSW | S(src) | A(TMP_REG1)));
         else
-            FAIL_IF(emit_op_mem2(compiler, INT_DATA | SIGNED_DATA | LOAD_DATA, TMP_REG1, src, srcw, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
+            FAIL_IF(emit_op_mem(compiler, INT_DATA | SIGNED_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
         src = TMP_REG1;
     }


     if (FAST_IS_REG(src)) {
-        FAIL_IF(emit_op_mem2(compiler, WORD_DATA, src, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
-        FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA, src, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
+        FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
     }
     else
-        FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, src, srcw, TMP_REG1));


     FAIL_IF(push_inst(compiler, FCFID | FD(dst_r) | FB(TMP_FREG1)));


     if (dst & SLJIT_MEM)
-        return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
+        return emit_op_mem(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, TMP_REG1);
     if (op & SLJIT_F32_OP)
         return push_inst(compiler, FRSP | FD(dst_r) | FB(dst_r));
     return SLJIT_SUCCESS;
@@ -1901,7 +1601,7 @@
         invert_sign = 0;
     }
     else if (!FAST_IS_REG(src)) {
-        FAIL_IF(emit_op_mem2(compiler, WORD_DATA | SIGNED_DATA | LOAD_DATA, TMP_REG1, src, srcw, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW));
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | SIGNED_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
         src = TMP_REG1;
     }


@@ -1913,17 +1613,17 @@
     FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG2) | A(0) | 0x4330));
     if (invert_sign)
         FAIL_IF(push_inst(compiler, XORIS | S(src) | A(TMP_REG1) | 0x8000));
-    FAIL_IF(emit_op_mem2(compiler, WORD_DATA, TMP_REG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_HI, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
-    FAIL_IF(emit_op_mem2(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_HI));
+    FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_HI, TMP_REG1));
+    FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, TMP_REG2));
     FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG1) | A(0) | 0x8000));
-    FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW));
-    FAIL_IF(emit_op_mem2(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
-    FAIL_IF(emit_op_mem2(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW));
+    FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));
+    FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET_LOW, TMP_REG2));
+    FAIL_IF(emit_op_mem(compiler, DOUBLE_DATA | LOAD_DATA, TMP_FREG2, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, TMP_REG1));


     FAIL_IF(push_inst(compiler, FSUB | FD(dst_r) | FA(TMP_FREG1) | FB(TMP_FREG2)));


     if (dst & SLJIT_MEM)
-        return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
+        return emit_op_mem(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, TMP_REG1);
     if (op & SLJIT_F32_OP)
         return push_inst(compiler, FRSP | FD(dst_r) | FB(dst_r));
     return SLJIT_SUCCESS;
@@ -1936,12 +1636,12 @@
     sljit_s32 src2, sljit_sw src2w)
 {
     if (src1 & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, TMP_REG1));
         src1 = TMP_FREG1;
     }


     if (src2 & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, TMP_REG2));
         src2 = TMP_FREG2;
     }


@@ -1955,8 +1655,6 @@
     sljit_s32 dst_r;


     CHECK_ERROR();
-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;


     SLJIT_COMPILE_ASSERT((SLJIT_F32_OP == 0x100) && !(DOUBLE_DATA & 0x4), float_transfer_bit_error);
     SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
@@ -1967,7 +1665,7 @@
     dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, TMP_REG1));
         src = dst_r;
     }


@@ -1996,7 +1694,7 @@
     }


     if (dst & SLJIT_MEM)
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), dst_r, dst, dstw, 0, 0));
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op), dst_r, dst, dstw, TMP_REG1));
     return SLJIT_SUCCESS;
 }


@@ -2005,7 +1703,7 @@
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
 {
-    sljit_s32 dst_r, flags = 0;
+    sljit_s32 dst_r;


     CHECK_ERROR();
     CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
@@ -2013,47 +1711,18 @@
     ADJUST_LOCAL_OFFSET(src1, src1w);
     ADJUST_LOCAL_OFFSET(src2, src2w);


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-
     dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG2;


     if (src1 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
-            FAIL_IF(compiler->error);
-            src1 = TMP_FREG1;
-        } else
-            flags |= ALT_FORM1;
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, TMP_REG1));
+        src1 = TMP_FREG1;
     }


     if (src2 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
-            FAIL_IF(compiler->error);
-            src2 = TMP_FREG2;
-        } else
-            flags |= ALT_FORM2;
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, TMP_REG2));
+        src2 = TMP_FREG2;
     }


-    if ((flags & (ALT_FORM1 | ALT_FORM2)) == (ALT_FORM1 | ALT_FORM2)) {
-        if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, src1, src1w));
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
-        }
-        else {
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
-        }
-    }
-    else if (flags & ALT_FORM1)
-        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
-    else if (flags & ALT_FORM2)
-        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
-
-    if (flags & ALT_FORM1)
-        src1 = TMP_FREG1;
-    if (flags & ALT_FORM2)
-        src2 = TMP_FREG2;
-
     switch (GET_OPCODE(op)) {
     case SLJIT_ADD_F64:
         FAIL_IF(push_inst(compiler, SELECT_FOP(op, FADDS, FADD) | FD(dst_r) | FA(src1) | FB(src2)));
@@ -2072,13 +1741,12 @@
         break;
     }


-    if (dst_r == TMP_FREG2)
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG2, dst, dstw, 0, 0));
+    if (dst & SLJIT_MEM)
+        FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(op), TMP_FREG2, dst, dstw, TMP_REG1));


     return SLJIT_SUCCESS;
 }


-#undef FLOAT_DATA
#undef SELECT_FOP

 /* --------------------------------------------------------------------- */
@@ -2108,12 +1776,10 @@
     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, MTLR | S(src)));
     else {
-        if (src & SLJIT_MEM)
-            FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
-        else if (src & SLJIT_IMM)
-            FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));
+        FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
         FAIL_IF(push_inst(compiler, MTLR | S(TMP_REG2)));
     }
+
     return push_inst(compiler, BLR);
 }


@@ -2340,11 +2006,8 @@
     op = GET_OPCODE(op);
     reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;


-    compiler->cache_arg = 0;
-    compiler->cache_argw = 0;
-
     if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
-        FAIL_IF(emit_op_mem2(compiler, input_flags | LOAD_DATA, TMP_REG1, dst, dstw, dst, dstw));
+        FAIL_IF(emit_op_mem(compiler, input_flags | LOAD_DATA, TMP_REG1, dst, dstw, TMP_REG1));


     invert = 0;
     cr_bit = 0;
@@ -2440,7 +2103,7 @@
     if (op < SLJIT_ADD) {
         if (!(dst & SLJIT_MEM))
             return SLJIT_SUCCESS;
-        return emit_op_mem2(compiler, input_flags, reg, dst, dstw, reg, 0);
+        return emit_op_mem(compiler, input_flags, reg, dst, dstw, TMP_REG1);
     }


 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
@@ -2462,6 +2125,139 @@
    return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 reg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_s32 mem_flags;
+    sljit_ins inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+    if (type & SLJIT_MEM_POST)
+        return SLJIT_ERR_UNSUPPORTED;
+
+    switch (type & 0xff) {
+    case SLJIT_MOV:
+    case SLJIT_MOV_P:
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+    case SLJIT_MOV_U32:
+    case SLJIT_MOV_S32:
+#endif
+        mem_flags = WORD_DATA;
+        break;
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    case SLJIT_MOV_U32:
+        mem_flags = INT_DATA;
+        break;
+
+    case SLJIT_MOV_S32:
+        mem_flags = INT_DATA;
+
+        if (!(type & SLJIT_MEM_STORE) && !(type & SLJIT_I32_OP)) {
+            if (mem & OFFS_REG_MASK)
+                mem_flags |= SIGNED_DATA;
+            else
+                return SLJIT_ERR_UNSUPPORTED;
+        }
+        break;
+#endif
+
+    case SLJIT_MOV_U8:
+    case SLJIT_MOV_S8:
+        mem_flags = BYTE_DATA;
+        break;
+
+    case SLJIT_MOV_U16:
+        mem_flags = HALF_DATA;
+        break;
+
+    case SLJIT_MOV_S16:
+        mem_flags = HALF_DATA | SIGNED_DATA;
+        break;
+
+    default:
+        SLJIT_UNREACHABLE();
+        mem_flags = WORD_DATA;
+        break;
+    }
+
+    if (!(type & SLJIT_MEM_STORE))
+        mem_flags |= LOAD_DATA;
+
+    if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+        if (memw != 0)
+            return SLJIT_ERR_UNSUPPORTED;
+
+        if (type & SLJIT_MEM_SUPP)
+            return SLJIT_SUCCESS;
+
+        inst = updated_data_transfer_insts[mem_flags | INDEXED];
+        FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, 0, reg) | A(mem & REG_MASK) | B(OFFS_REG(mem))));
+    }
+    else {
+        if (memw > SIMM_MAX || memw < SIMM_MIN)
+            return SLJIT_ERR_UNSUPPORTED;
+
+        inst = updated_data_transfer_insts[mem_flags];
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+        if ((inst & INT_ALIGNED) && (memw & 0x3) != 0)
+            return SLJIT_ERR_UNSUPPORTED;
+#endif
+
+        if (type & SLJIT_MEM_SUPP)
+            return SLJIT_SUCCESS;
+
+        FAIL_IF(push_inst(compiler, INST_CODE_AND_DST(inst, 0, reg) | A(mem & REG_MASK) | IMM(memw)));
+    }
+
+    if ((mem_flags & LOAD_DATA) && (type & 0xff) == SLJIT_MOV_S8)
+        return push_inst(compiler, EXTSB | S(reg) | A(reg));
+    return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 freg,
+    sljit_s32 mem, sljit_sw memw)
+{
+    sljit_s32 mem_flags;
+    sljit_ins inst;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+    if (type & SLJIT_MEM_POST)
+        return SLJIT_ERR_UNSUPPORTED;
+
+    if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+        if (memw != 0)
+            return SLJIT_ERR_UNSUPPORTED;
+    }
+    else {
+        if (memw > SIMM_MAX || memw < SIMM_MIN)
+            return SLJIT_ERR_UNSUPPORTED;
+    }
+
+    if (type & SLJIT_MEM_SUPP)
+        return SLJIT_SUCCESS;
+
+    mem_flags = FLOAT_DATA(type);
+
+    if (!(type & SLJIT_MEM_STORE))
+        mem_flags |= LOAD_DATA;
+
+    if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+        inst = updated_data_transfer_insts[mem_flags | INDEXED];
+        return push_inst(compiler, INST_CODE_AND_DST(inst, DOUBLE_DATA, freg) | A(mem & REG_MASK) | B(OFFS_REG(mem)));
+    }
+
+    inst = updated_data_transfer_insts[mem_flags];
+    return push_inst(compiler, INST_CODE_AND_DST(inst, DOUBLE_DATA, freg) | A(mem & REG_MASK) | IMM(memw));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
     struct sljit_const *const_;
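
Note on using the new emitter: the sljit_emit_mem/sljit_emit_fmem entry points added above return SLJIT_ERR_UNSUPPORTED for post-update addressing, out-of-range immediates and a few width/alignment combinations, so callers are expected to probe with SLJIT_MEM_SUPP first and keep a fallback path. A minimal caller-side sketch follows (illustrative only, not part of this revision; the store_with_update helper and the register choices are invented for the example, the flag names are the ones introduced here):

    /* Assumes #include "sljitLir.h". Emits "store R0 to [R1 + sizeof(sljit_sw)]
       and advance R1", either as one pre-update store when the backend supports
       it, or as a plain store followed by an explicit ADD. */
    static sljit_s32 store_with_update(struct sljit_compiler *compiler)
    {
        sljit_s32 type = SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE;
        sljit_s32 err;

        if (sljit_emit_mem(compiler, type | SLJIT_MEM_SUPP, SLJIT_R0,
                SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
            return sljit_emit_mem(compiler, type, SLJIT_R0,
                SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));

        /* Fallback: the same effect in two steps. */
        err = sljit_emit_op1(compiler, SLJIT_MOV,
            SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw), SLJIT_R0, 0);
        if (err != SLJIT_SUCCESS)
            return err;
        return sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R1, 0,
            SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
    }

On PPC the supported case maps onto the update (stwu/stdu-style) forms of the load/store instructions, which is why an in-range signed immediate or a zero-offset indexed address is required.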


Modified: code/trunk/src/sljit/sljitNativeSPARC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeSPARC_common.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeSPARC_common.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -449,18 +449,17 @@


 #define MEM_MASK    0x1f


-#define WRITE_BACK    0x00020
-#define ARG_TEST    0x00040
-#define ALT_KEEP_CACHE    0x00080
-#define CUMULATIVE_OP    0x00100
-#define IMM_OP        0x00200
-#define SRC2_IMM    0x00400
+#define ARG_TEST    0x00020
+#define ALT_KEEP_CACHE    0x00040
+#define CUMULATIVE_OP    0x00080
+#define IMM_OP        0x00100
+#define SRC2_IMM    0x00200


-#define REG_DEST    0x00800
-#define REG2_SOURCE    0x01000
-#define SLOW_SRC1    0x02000
-#define SLOW_SRC2    0x04000
-#define SLOW_DEST    0x08000
+#define REG_DEST    0x00400
+#define REG2_SOURCE    0x00800
+#define SLOW_SRC1    0x01000
+#define SLOW_SRC2    0x02000
+#define SLOW_DEST    0x04000


/* SET_FLAGS (0x10 << 19) also belongs here! */

@@ -562,18 +561,16 @@
 {
     SLJIT_ASSERT(arg & SLJIT_MEM);


-    if (!(flags & WRITE_BACK) || !(arg & REG_MASK)) {
-        if ((!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN)
-                || ((arg & OFFS_REG_MASK) && (argw & 0x3) == 0)) {
-            /* Works for both absoulte and relative addresses (immediate case). */
-            if (SLJIT_UNLIKELY(flags & ARG_TEST))
-                return 1;
-            FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK]
-                | ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg))
-                | S1(arg & REG_MASK) | ((arg & OFFS_REG_MASK) ? S2(OFFS_REG(arg)) : IMM(argw)),
-                ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS));
-            return -1;
-        }
+    if ((!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN)
+            || ((arg & OFFS_REG_MASK) && (argw & 0x3) == 0)) {
+        /* Works for both absolute and relative addresses (immediate case). */
+        if (SLJIT_UNLIKELY(flags & ARG_TEST))
+            return 1;
+        FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK]
+            | ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg))
+            | S1(arg & REG_MASK) | ((arg & OFFS_REG_MASK) ? S2(OFFS_REG(arg)) : IMM(argw)),
+            ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS));
+        return -1;
     }
     return 0;
 }
@@ -658,10 +655,7 @@
     delay_slot = ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS;
     if (!base)
         return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(arg2) | IMM(0), delay_slot);
-    if (!(flags & WRITE_BACK))
-        return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(base) | S2(arg2), delay_slot);
-    FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(base) | S2(arg2), delay_slot));
-    return push_inst(compiler, ADD | D(base) | S1(base) | S2(arg2), DR(base));
+    return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(base) | S2(arg2), delay_slot);
 }


 static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
@@ -703,7 +697,7 @@
         if (FAST_IS_REG(dst)) {
             dst_r = dst;
             flags |= REG_DEST;
-            if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+            if (op >= SLJIT_MOV && op <= SLJIT_MOV_P)
                 sugg_src2_r = dst_r;
         }
         else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1, dst, dstw))
@@ -754,7 +748,7 @@
     if (FAST_IS_REG(src2)) {
         src2_r = src2;
         flags |= REG2_SOURCE;
-        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
+        if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOV_P)
             dst_r = src2_r;
     }
     else if (src2 & SLJIT_IMM) {
@@ -765,7 +759,7 @@
             }
             else {
                 src2_r = 0;
-                if ((op >= SLJIT_MOV && op <= SLJIT_MOVU_S32) && (dst & SLJIT_MEM))
+                if ((op >= SLJIT_MOV && op <= SLJIT_MOV_P) && (dst & SLJIT_MEM))
                     dst_r = 0;
             }
         }
@@ -891,28 +885,6 @@
     case SLJIT_MOV_S16:
         return emit_op(compiler, SLJIT_MOV_S16, flags | HALF_DATA | SIGNED_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);


-    case SLJIT_MOVU:
-    case SLJIT_MOVU_P:
-        return emit_op(compiler, SLJIT_MOV, flags | WORD_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-    case SLJIT_MOVU_U32:
-        return emit_op(compiler, SLJIT_MOV_U32, flags | INT_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-    case SLJIT_MOVU_S32:
-        return emit_op(compiler, SLJIT_MOV_S32, flags | INT_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);
-
-    case SLJIT_MOVU_U8:
-        return emit_op(compiler, SLJIT_MOV_U8, flags | BYTE_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
-
-    case SLJIT_MOVU_S8:
-        return emit_op(compiler, SLJIT_MOV_S8, flags | BYTE_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
-
-    case SLJIT_MOVU_U16:
-        return emit_op(compiler, SLJIT_MOV_U16, flags | HALF_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
-
-    case SLJIT_MOVU_S16:
-        return emit_op(compiler, SLJIT_MOV_S16, flags | HALF_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
-
     case SLJIT_NOT:
     case SLJIT_CLZ:
         return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
@@ -1227,10 +1199,8 @@


     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, OR | D(TMP_LINK) | S1(0) | S2(src), DR(TMP_LINK)));
-    else if (src & SLJIT_MEM)
+    else
         FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_LINK, src, srcw));
-    else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, TMP_LINK, srcw));


     FAIL_IF(push_inst(compiler, JMPL | D(0) | S1(TMP_LINK) | IMM(8), UNMOVABLE_INS));
     return push_inst(compiler, NOP, UNMOVABLE_INS);
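
For reference, the deleted SLJIT_MOVU* cases above are the other half of this change: the auto-updating move opcodes are gone from sljit_emit_op1, and code that relied on them moves to sljit_emit_mem, with the probe/fallback pattern sketched after the PPC section. A rough before/after, assuming the usual pre-update semantics of the old opcodes; illustrative only:

    /* Old (removed): store R0 at [R1 + sizeof(sljit_sw)] and update R1. */
    sljit_emit_op1(compiler, SLJIT_MOVU, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw),
        SLJIT_R0, 0);

    /* New: request the same thing explicitly; may return SLJIT_ERR_UNSUPPORTED
       on targets without a suitable addressing mode, so probe first. */
    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE,
        SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));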


Modified: code/trunk/src/sljit/sljitNativeX86_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_32.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeX86_32.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -855,7 +855,7 @@
         INC_SIZE(1 + 1);
         PUSH_REG(reg_map[src]);
     }
-    else if (src & SLJIT_MEM) {
+    else {
         inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
         FAIL_IF(!inst);
         *inst++ = GROUP_FF;
@@ -865,17 +865,7 @@
         FAIL_IF(!inst);
         INC_SIZE(1);
     }
-    else {
-        /* SLJIT_IMM. */
-        inst = (sljit_u8*)ensure_buf(compiler, 1 + 5 + 1);
-        FAIL_IF(!inst);


-        INC_SIZE(5 + 1);
-        *inst++ = PUSH_i32;
-        sljit_unaligned_store_sw(inst, srcw);
-        inst += sizeof(sljit_sw);
-    }
-
     RET();
     return SLJIT_SUCCESS;
 }


Modified: code/trunk/src/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_64.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeX86_64.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -41,24 +41,31 @@


 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type)
 {
+    /* The relative jump below is specialized for this case. */
+    SLJIT_ASSERT(reg_map[TMP_REG2] >= 8);
+
+    int short_addr = !(jump->flags & SLJIT_REWRITABLE_JUMP) && !(jump->flags & JUMP_LABEL) && (jump->u.target <= 0xffffffff);
+
     if (type < SLJIT_JUMP) {
         /* Invert type. */
         *code_ptr++ = get_jump_code(type ^ 0x1) - 0x10;
-        *code_ptr++ = 10 + 3;
+        *code_ptr++ = short_addr ? (6 + 3) : (10 + 3);
     }


-    *code_ptr++ = REX_W | ((reg_map[TMP_REG2] <= 7) ? 0 : REX_B);
+    *code_ptr++ = short_addr ? REX_B : (REX_W | REX_B);
     *code_ptr++ = MOV_r_i32 | reg_lmap[TMP_REG2];
     jump->addr = (sljit_uw)code_ptr;


     if (jump->flags & JUMP_LABEL)
         jump->flags |= PATCH_MD;
+    else if (short_addr)
+        sljit_unaligned_store_s32(code_ptr, (sljit_s32)jump->u.target);
     else
         sljit_unaligned_store_sw(code_ptr, jump->u.target);


-    code_ptr += sizeof(sljit_sw);
-    if (reg_map[TMP_REG2] >= 8)
-        *code_ptr++ = REX_B;
+    code_ptr += short_addr ? sizeof(sljit_s32) : sizeof(sljit_sw);
+
+    *code_ptr++ = REX_B;
     *code_ptr++ = GROUP_FF;
     *code_ptr++ = MOD_REG | (type >= SLJIT_FAST_CALL ? CALL_rm : JMP_rm) | reg_lmap[TMP_REG2];


@@ -755,11 +762,6 @@
     CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if ((src & SLJIT_IMM) && NOT_HALFWORD(srcw)) {
-        FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
-        src = TMP_REG1;
-    }
-
     if (FAST_IS_REG(src)) {
         if (reg_map[src] < 8) {
             inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 1);
@@ -777,7 +779,7 @@
             PUSH_REG(reg_lmap[src]);
         }
     }
-    else if (src & SLJIT_MEM) {
+    else {
         /* REX_W is not necessary (src is not immediate). */
         compiler->mode32 = 1;
         inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
@@ -789,18 +791,7 @@
         FAIL_IF(!inst);
         INC_SIZE(1);
     }
-    else {
-        SLJIT_ASSERT(IS_HALFWORD(srcw));
-        /* SLJIT_IMM. */
-        inst = (sljit_u8*)ensure_buf(compiler, 1 + 5 + 1);
-        FAIL_IF(!inst);


-        INC_SIZE(5 + 1);
-        *inst++ = PUSH_i32;
-        sljit_unaligned_store_s32(inst, srcw);
-        inst += sizeof(sljit_s32);
-    }
-
     RET();
     return SLJIT_SUCCESS;
 }


Modified: code/trunk/src/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_common.c    2018-01-01 17:27:55 UTC (rev 903)
+++ code/trunk/src/sljit/sljitNativeX86_common.c    2018-01-05 09:30:45 UTC (rev 904)
@@ -39,7 +39,7 @@
      1 - ECX
      2 - EDX
      3 - EBX
-     4 - none
+     4 - ESP
      5 - EBP
      6 - ESI
      7 - EDI
@@ -51,7 +51,7 @@
      1 - RCX
      2 - RDX
      3 - RBX
-     4 - none
+     4 - RSP
      5 - RBP
      6 - RSI
      7 - RDI
@@ -477,11 +477,7 @@
         code_ptr += sizeof(sljit_s8);
     } else {
         jump->flags |= PATCH_MW;
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        code_ptr += sizeof(sljit_sw);
-#else
         code_ptr += sizeof(sljit_s32);
-#endif
     }


     return code_ptr;
@@ -1135,7 +1131,7 @@
         return SLJIT_SUCCESS;
     }


-    if (dst == SLJIT_UNUSED)
+    if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
         dst = TMP_REG1;


     if (FAST_IS_REG(dst)) {
@@ -1202,12 +1198,6 @@


     SLJIT_UNUSED_ARG(op_flags);


-    if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
-        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
-        src = TMP_REG1;
-        srcw = 0;
-    }
-
     if (cpu_has_cmov == -1)
         get_cpu_features();


@@ -1262,13 +1252,9 @@
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
 {
-    sljit_s32 update = 0;
     sljit_s32 op_flags = GET_ALL_FLAGS(op);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     sljit_s32 dst_is_ereg = 0;
-    sljit_s32 src_is_ereg = 0;
-#else
-#    define src_is_ereg 0
 #endif


     CHECK_ERROR();
@@ -1277,7 +1263,7 @@
     ADJUST_LOCAL_OFFSET(src, srcw);


     CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
-    CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
+    CHECK_EXTRA_REGS(src, srcw, (void)0);
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
     compiler->mode32 = op_flags & SLJIT_I32_OP;
 #endif
@@ -1290,7 +1276,7 @@


     op = GET_OPCODE(op);


-    if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
+    if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
         compiler->mode32 = 0;
 #endif
@@ -1305,24 +1291,14 @@
             if (src & SLJIT_MEM) {
                 if (op == SLJIT_MOV_S32)
                     op = SLJIT_MOV_U32;
-                if (op == SLJIT_MOVU_S32)
-                    op = SLJIT_MOVU_U32;
             }
             else if (src & SLJIT_IMM) {
                 if (op == SLJIT_MOV_U32)
                     op = SLJIT_MOV_S32;
-                if (op == SLJIT_MOVU_U32)
-                    op = SLJIT_MOVU_S32;
             }
 #endif
         }


-        SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
-        if (op >= SLJIT_MOVU) {
-            update = 1;
-            op -= 8;
-        }
-
         if (src & SLJIT_IMM) {
             switch (op) {
             case SLJIT_MOV_U8:
@@ -1394,28 +1370,6 @@
         if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
             return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
 #endif
-
-        if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK)) {
-            if ((src & OFFS_REG_MASK) != 0) {
-                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
-                        (src & REG_MASK), 0, (src & REG_MASK), 0, OFFS_REG(dst), 0));
-            }
-            else if (srcw != 0) {
-                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
-                        (src & REG_MASK), 0, (src & REG_MASK), 0, SLJIT_IMM, srcw));
-            }
-        }
-
-        if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK)) {
-            if ((dst & OFFS_REG_MASK) != 0) {
-                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
-                        (dst & REG_MASK), 0, (dst & REG_MASK), 0, OFFS_REG(dst), 0));
-            }
-            else if (dstw != 0) {
-                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
-                        (dst & REG_MASK), 0, (dst & REG_MASK), 0, SLJIT_IMM, dstw));
-            }
-        }
         return SLJIT_SUCCESS;
     }


@@ -1433,10 +1387,6 @@
     }


     return SLJIT_SUCCESS;
-
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-#    undef src_is_ereg
-#endif
 }


#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)