[Pcre-svn] [1312] code/trunk/src: JIT compiler update

Page principale
Supprimer ce message
Auteur: Subversion repository
Date:  
À: pcre-svn
Sujet: [Pcre-svn] [1312] code/trunk/src: JIT compiler update
Revision: 1312
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1312
Author:   zherczeg
Date:     2021-05-27 09:11:15 +0100 (Thu, 27 May 2021)
Log Message:
-----------
JIT compiler update


Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/sljit/sljitConfigInternal.h
    code/trunk/src/sljit/sljitLir.c
    code/trunk/src/sljit/sljitLir.h
    code/trunk/src/sljit/sljitNativeARM_32.c
    code/trunk/src/sljit/sljitNativeARM_64.c
    code/trunk/src/sljit/sljitNativeARM_T2_32.c
    code/trunk/src/sljit/sljitNativeMIPS_32.c
    code/trunk/src/sljit/sljitNativeMIPS_64.c
    code/trunk/src/sljit/sljitNativeMIPS_common.c
    code/trunk/src/sljit/sljitNativePPC_32.c
    code/trunk/src/sljit/sljitNativePPC_64.c
    code/trunk/src/sljit/sljitNativePPC_common.c
    code/trunk/src/sljit/sljitNativeS390X.c
    code/trunk/src/sljit/sljitNativeSPARC_32.c
    code/trunk/src/sljit/sljitNativeSPARC_common.c
    code/trunk/src/sljit/sljitNativeX86_common.c


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/pcre2_jit_compile.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -8135,7 +8135,7 @@
     }
   else
     OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL);
-  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
+  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO));


   if (!common->endonly)
     compile_simple_assertion_matchingpath(common, OP_EODN, cc, backtracks);
@@ -8155,7 +8155,7 @@
     }
   else
     OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL);
-  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
+  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO));
   check_partial(common, FALSE);
   jump[0] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[1]);
@@ -8195,7 +8195,7 @@
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin));
     add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0));
     OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL);
-    add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
+    add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO));
     }
   else
     {
@@ -8202,7 +8202,7 @@
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin));
     add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0));
     OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL);
-    add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
+    add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO));
     }
   return cc;


@@ -8221,7 +8221,7 @@
     jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP2, 0);
     OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL);
     }
-  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
+  add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO));
   jump[0] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[1]);


@@ -9575,11 +9575,11 @@

/* Check return value. */
OP2(SLJIT_SUB32 | SLJIT_SET_Z | SLJIT_SET_SIG_GREATER, SLJIT_UNUSED, 0, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
-add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_SIG_GREATER32));
+add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_SIG_GREATER));
if (common->abort_label == NULL)
- add_jump(compiler, &common->abort, JUMP(SLJIT_NOT_EQUAL32) /* SIG_LESS */);
+ add_jump(compiler, &common->abort, JUMP(SLJIT_NOT_EQUAL) /* SIG_LESS */);
else
- JUMPTO(SLJIT_NOT_EQUAL32 /* SIG_LESS */, common->abort_label);
+ JUMPTO(SLJIT_NOT_EQUAL /* SIG_LESS */, common->abort_label);
return cc + callout_length;
}


Modified: code/trunk/src/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/src/sljit/sljitConfigInternal.h    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitConfigInternal.h    2021-05-27 08:11:15 UTC (rev 1312)
@@ -761,6 +761,18 @@
 #define SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS \
     (SLJIT_NUMBER_OF_FLOAT_REGISTERS - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS)


+/********************************/
+/* CPU status flags management. */
+/********************************/
+
+#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+    || (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+    || (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
+    || (defined SLJIT_CONFIG_SPARC && SLJIT_CONFIG_SPARC) \
+    || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+#define SLJIT_HAS_STATUS_FLAGS_STATE 1
+#endif
+
 /*************************************/
 /* Debug and verbose related macros. */
 /*************************************/


Modified: code/trunk/src/sljit/sljitLir.c
===================================================================
--- code/trunk/src/sljit/sljitLir.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitLir.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -532,13 +532,21 @@
         put_label->label = label;
 }


+#define SLJIT_CURRENT_FLAGS_ALL \
+    (SLJIT_CURRENT_FLAGS_I32_OP | SLJIT_CURRENT_FLAGS_ADD_SUB | SLJIT_CURRENT_FLAGS_COMPARE)
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler, sljit_s32 current_flags)
 {
     SLJIT_UNUSED_ARG(compiler);
     SLJIT_UNUSED_ARG(current_flags);


+#if (defined SLJIT_HAS_STATUS_FLAGS_STATE && SLJIT_HAS_STATUS_FLAGS_STATE)
+    compiler->status_flags_state = current_flags;
+#endif
+
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    if ((current_flags & ~(VARIABLE_FLAG_MASK | SLJIT_I32_OP | SLJIT_SET_Z)) == 0) {
+    compiler->last_flags = 0;
+    if ((current_flags & ~(VARIABLE_FLAG_MASK | SLJIT_SET_Z | SLJIT_CURRENT_FLAGS_ALL)) == 0) {
         compiler->last_flags = GET_FLAG_TYPE(current_flags) | (current_flags & (SLJIT_I32_OP | SLJIT_SET_Z));
     }
 #endif
@@ -968,7 +976,7 @@
 };


 #define JUMP_POSTFIX(type) \
-    ((type & 0xff) <= SLJIT_MUL_NOT_OVERFLOW ? ((type & SLJIT_I32_OP) ? "32" : "") \
+    ((type & 0xff) <= SLJIT_NOT_OVERFLOW ? ((type & SLJIT_I32_OP) ? "32" : "") \
     : ((type & 0xff) <= SLJIT_ORDERED_F64 ? ((type & SLJIT_F32_OP) ? ".f32" : ".f64") : ""))


 static char* jump_names[] = {
@@ -978,7 +986,6 @@
     (char*)"sig_less", (char*)"sig_greater_equal",
     (char*)"sig_greater", (char*)"sig_less_equal",
     (char*)"overflow", (char*)"not_overflow",
-    (char*)"mul_overflow", (char*)"mul_not_overflow",
     (char*)"carry", (char*)"",
     (char*)"equal", (char*)"not_equal",
     (char*)"less", (char*)"greater_equal",
@@ -1278,7 +1285,7 @@
     case SLJIT_MUL:
         CHECK_ARGUMENT(!(op & SLJIT_SET_Z));
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
-            || GET_FLAG_TYPE(op) == SLJIT_MUL_OVERFLOW);
+            || GET_FLAG_TYPE(op) == SLJIT_OVERFLOW);
         break;
     case SLJIT_ADD:
         CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)
@@ -1601,9 +1608,7 @@
             CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
         else
             CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-                || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
-                || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));
-        CHECK_ARGUMENT((type & SLJIT_I32_OP) == (compiler->last_flags & SLJIT_I32_OP));
+                || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));
     }
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1818,8 +1823,7 @@
         CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
     else
         CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
-            || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));
+            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));


     FUNCTION_CHECK_DST(dst, dstw, 0);


@@ -1858,8 +1862,7 @@
         CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
     else
         CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
-            || ((type & 0xff) == SLJIT_MUL_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_MUL_OVERFLOW));
+            || ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
     if (SLJIT_UNLIKELY(!!compiler->verbose)) {


Modified: code/trunk/src/sljit/sljitLir.h
===================================================================
--- code/trunk/src/sljit/sljitLir.h    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitLir.h    2021-05-27 08:11:15 UTC (rev 1312)
@@ -412,6 +412,10 @@
     /* Executable size for statistical purposes. */
     sljit_uw executable_size;


+#if (defined SLJIT_HAS_STATUS_FLAGS_STATE && SLJIT_HAS_STATUS_FLAGS_STATE)
+    sljit_s32 status_flags_state;
+#endif
+
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     sljit_s32 args;
     sljit_s32 locals_offset;
@@ -460,7 +464,7 @@


 #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
     /* Need to allocate register save area to make calls. */
-    sljit_s32 have_save_area;
+    sljit_s32 mode;
 #endif


 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -996,7 +1000,7 @@
 #define SLJIT_SUBC            (SLJIT_OP2_BASE + 3)
 #define SLJIT_SUBC32            (SLJIT_SUBC | SLJIT_I32_OP)
 /* Note: integer mul
-   Flags: MUL_OVERFLOW */
+   Flags: OVERFLOW */
 #define SLJIT_MUL            (SLJIT_OP2_BASE + 4)
 #define SLJIT_MUL32            (SLJIT_MUL | SLJIT_I32_OP)
 /* Flags: Z */
@@ -1141,89 +1145,69 @@


 /* Integer comparison types. */
 #define SLJIT_EQUAL            0
-#define SLJIT_EQUAL32            (SLJIT_EQUAL | SLJIT_I32_OP)
-#define SLJIT_ZERO            0
-#define SLJIT_ZERO32            (SLJIT_ZERO | SLJIT_I32_OP)
+#define SLJIT_ZERO            SLJIT_EQUAL
 #define SLJIT_NOT_EQUAL            1
-#define SLJIT_NOT_EQUAL32        (SLJIT_NOT_EQUAL | SLJIT_I32_OP)
-#define SLJIT_NOT_ZERO            1
-#define SLJIT_NOT_ZERO32        (SLJIT_NOT_ZERO | SLJIT_I32_OP)
+#define SLJIT_NOT_ZERO            SLJIT_NOT_EQUAL


 #define SLJIT_LESS            2
-#define SLJIT_LESS32            (SLJIT_LESS | SLJIT_I32_OP)
 #define SLJIT_SET_LESS            SLJIT_SET(SLJIT_LESS)
 #define SLJIT_GREATER_EQUAL        3
-#define SLJIT_GREATER_EQUAL32        (SLJIT_GREATER_EQUAL | SLJIT_I32_OP)
 #define SLJIT_SET_GREATER_EQUAL        SLJIT_SET(SLJIT_GREATER_EQUAL)
 #define SLJIT_GREATER            4
-#define SLJIT_GREATER32            (SLJIT_GREATER | SLJIT_I32_OP)
 #define SLJIT_SET_GREATER        SLJIT_SET(SLJIT_GREATER)
 #define SLJIT_LESS_EQUAL        5
-#define SLJIT_LESS_EQUAL32        (SLJIT_LESS_EQUAL | SLJIT_I32_OP)
 #define SLJIT_SET_LESS_EQUAL        SLJIT_SET(SLJIT_LESS_EQUAL)
 #define SLJIT_SIG_LESS            6
-#define SLJIT_SIG_LESS32        (SLJIT_SIG_LESS | SLJIT_I32_OP)
 #define SLJIT_SET_SIG_LESS        SLJIT_SET(SLJIT_SIG_LESS)
 #define SLJIT_SIG_GREATER_EQUAL        7
-#define SLJIT_SIG_GREATER_EQUAL32    (SLJIT_SIG_GREATER_EQUAL | SLJIT_I32_OP)
 #define SLJIT_SET_SIG_GREATER_EQUAL    SLJIT_SET(SLJIT_SIG_GREATER_EQUAL)
 #define SLJIT_SIG_GREATER        8
-#define SLJIT_SIG_GREATER32        (SLJIT_SIG_GREATER | SLJIT_I32_OP)
 #define SLJIT_SET_SIG_GREATER        SLJIT_SET(SLJIT_SIG_GREATER)
 #define SLJIT_SIG_LESS_EQUAL        9
-#define SLJIT_SIG_LESS_EQUAL32        (SLJIT_SIG_LESS_EQUAL | SLJIT_I32_OP)
 #define SLJIT_SET_SIG_LESS_EQUAL    SLJIT_SET(SLJIT_SIG_LESS_EQUAL)


 #define SLJIT_OVERFLOW            10
-#define SLJIT_OVERFLOW32        (SLJIT_OVERFLOW | SLJIT_I32_OP)
 #define SLJIT_SET_OVERFLOW        SLJIT_SET(SLJIT_OVERFLOW)
 #define SLJIT_NOT_OVERFLOW        11
-#define SLJIT_NOT_OVERFLOW32        (SLJIT_NOT_OVERFLOW | SLJIT_I32_OP)


-#define SLJIT_MUL_OVERFLOW        12
-#define SLJIT_MUL_OVERFLOW32        (SLJIT_MUL_OVERFLOW | SLJIT_I32_OP)
-#define SLJIT_SET_MUL_OVERFLOW        SLJIT_SET(SLJIT_MUL_OVERFLOW)
-#define SLJIT_MUL_NOT_OVERFLOW        13
-#define SLJIT_MUL_NOT_OVERFLOW32    (SLJIT_MUL_NOT_OVERFLOW | SLJIT_I32_OP)
-
 /* There is no SLJIT_CARRY or SLJIT_NOT_CARRY. */
-#define SLJIT_SET_CARRY            SLJIT_SET(14)
+#define SLJIT_SET_CARRY            SLJIT_SET(12)


 /* Floating point comparison types. */
-#define SLJIT_EQUAL_F64            16
+#define SLJIT_EQUAL_F64            14
 #define SLJIT_EQUAL_F32            (SLJIT_EQUAL_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_EQUAL_F        SLJIT_SET(SLJIT_EQUAL_F64)
-#define SLJIT_NOT_EQUAL_F64        17
+#define SLJIT_NOT_EQUAL_F64        15
 #define SLJIT_NOT_EQUAL_F32        (SLJIT_NOT_EQUAL_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_NOT_EQUAL_F        SLJIT_SET(SLJIT_NOT_EQUAL_F64)
-#define SLJIT_LESS_F64            18
+#define SLJIT_LESS_F64            16
 #define SLJIT_LESS_F32            (SLJIT_LESS_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_LESS_F        SLJIT_SET(SLJIT_LESS_F64)
-#define SLJIT_GREATER_EQUAL_F64        19
+#define SLJIT_GREATER_EQUAL_F64        17
 #define SLJIT_GREATER_EQUAL_F32        (SLJIT_GREATER_EQUAL_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_GREATER_EQUAL_F    SLJIT_SET(SLJIT_GREATER_EQUAL_F64)
-#define SLJIT_GREATER_F64        20
+#define SLJIT_GREATER_F64        18
 #define SLJIT_GREATER_F32        (SLJIT_GREATER_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_GREATER_F        SLJIT_SET(SLJIT_GREATER_F64)
-#define SLJIT_LESS_EQUAL_F64        21
+#define SLJIT_LESS_EQUAL_F64        19
 #define SLJIT_LESS_EQUAL_F32        (SLJIT_LESS_EQUAL_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_LESS_EQUAL_F        SLJIT_SET(SLJIT_LESS_EQUAL_F64)
-#define SLJIT_UNORDERED_F64        22
+#define SLJIT_UNORDERED_F64        20
 #define SLJIT_UNORDERED_F32        (SLJIT_UNORDERED_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_UNORDERED_F        SLJIT_SET(SLJIT_UNORDERED_F64)
-#define SLJIT_ORDERED_F64        23
+#define SLJIT_ORDERED_F64        21
 #define SLJIT_ORDERED_F32        (SLJIT_ORDERED_F64 | SLJIT_F32_OP)
 #define SLJIT_SET_ORDERED_F        SLJIT_SET(SLJIT_ORDERED_F64)


 /* Unconditional jump types. */
-#define SLJIT_JUMP            24
+#define SLJIT_JUMP            22
     /* Fast calling method. See sljit_emit_fast_enter / SLJIT_FAST_RETURN. */
-#define SLJIT_FAST_CALL            25
+#define SLJIT_FAST_CALL            23
     /* Called function must be declared with the SLJIT_FUNC attribute. */
-#define SLJIT_CALL            26
+#define SLJIT_CALL            24
     /* Called function must be declared with cdecl attribute.
        This is the default attribute for C functions. */
-#define SLJIT_CALL_CDECL        27
+#define SLJIT_CALL_CDECL        25


 /* The target can be changed during runtime (see: sljit_set_jump_addr). */
 #define SLJIT_REWRITABLE_JUMP        0x1000
@@ -1534,9 +1518,23 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
     void *instruction, sljit_s32 size);


-/* Define the currently available CPU status flags. It is usually used after an
-   sljit_emit_op_custom call to define which flags are set. */
+/* Flags were set by a 32 bit operation. */
+#define SLJIT_CURRENT_FLAGS_I32_OP        SLJIT_I32_OP


+/* Flags were set by an ADD, ADDC, SUB, SUBC, or NEG operation. */
+#define SLJIT_CURRENT_FLAGS_ADD_SUB        0x01
+
+/* Flags were set by a SUB with unused destination.
+   Must be combined with SLJIT_CURRENT_FLAGS_ADD_SUB. */
+#define SLJIT_CURRENT_FLAGS_COMPARE        0x02
+
+/* Define the currently available CPU status flags. It is usually used after
+   an sljit_emit_label or sljit_emit_op_custom operations to define which CPU
+   status flags are available.
+
+   The current_flags must be a valid combination of SLJIT_SET_* and
+   SLJIT_CURRENT_FLAGS_* constants. */
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler,
     sljit_s32 current_flags);



Modified: code/trunk/src/sljit/sljitNativeARM_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_32.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeARM_32.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -1197,6 +1197,8 @@


     case SLJIT_ADD:
         SLJIT_ASSERT(!(flags & INV_IMM));
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
+
         if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | SET_FLAGS) && !(flags & ARGS_SWAPPED))
             return push_inst(compiler, CMN | SET_FLAGS | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
         return push_inst(compiler, ADD | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
@@ -1207,6 +1209,8 @@


     case SLJIT_SUB:
         SLJIT_ASSERT(!(flags & INV_IMM));
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
+
         if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | SET_FLAGS) && !(flags & ARGS_SWAPPED))
             return push_inst(compiler, CMP | SET_FLAGS | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
         return push_inst(compiler, (!(flags & ARGS_SWAPPED) ? SUB : RSB) | (flags & SET_FLAGS)
@@ -1220,6 +1224,7 @@
     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & INV_IMM));
         SLJIT_ASSERT(!(src2 & SRC2_IMM));
+        compiler->status_flags_state = 0;


         if (!HAS_FLAGS(op))
             return push_inst(compiler, MUL | (reg_map[dst] << 16) | (reg_map[src2] << 8) | reg_map[src1]);
@@ -2153,16 +2158,14 @@
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */


-static sljit_uw get_cc(sljit_s32 type)
+static sljit_uw get_cc(struct sljit_compiler *compiler, sljit_s32 type)
 {
     switch (type) {
     case SLJIT_EQUAL:
-    case SLJIT_MUL_NOT_OVERFLOW:
     case SLJIT_EQUAL_F64:
         return 0x00000000;


     case SLJIT_NOT_EQUAL:
-    case SLJIT_MUL_OVERFLOW:
     case SLJIT_NOT_EQUAL_F64:
         return 0x10000000;


@@ -2195,10 +2198,16 @@
         return 0xd0000000;


     case SLJIT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x10000000;
+
     case SLJIT_UNORDERED_F64:
         return 0x60000000;


     case SLJIT_NOT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x00000000;
+
     case SLJIT_ORDERED_F64:
         return 0x70000000;


@@ -2242,7 +2251,7 @@
     if (type >= SLJIT_FAST_CALL)
         PTR_FAIL_IF(prepare_blx(compiler));
     PTR_FAIL_IF(push_inst_with_unique_literal(compiler, ((EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
-        type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0)) & ~COND_MASK) | get_cc(type), 0));
+        type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0)) & ~COND_MASK) | get_cc(compiler, type), 0));


     if (jump->flags & SLJIT_REWRITABLE_JUMP) {
         jump->addr = compiler->size;
@@ -2260,7 +2269,7 @@
     if (type >= SLJIT_FAST_CALL)
         jump->flags |= IS_BL;
     PTR_FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
-    PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)) & ~COND_MASK) | get_cc(type)));
+    PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)) & ~COND_MASK) | get_cc(compiler, type)));
     jump->addr = compiler->size;
 #endif
     return jump;
@@ -2589,7 +2598,7 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);


     op = GET_OPCODE(op);
-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);
     dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG1;


     if (op < SLJIT_ADD) {
@@ -2629,7 +2638,7 @@


     dst_reg &= ~SLJIT_I32_OP;


-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);


     if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
         tmp = get_imm(srcw);


Modified: code/trunk/src/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_64.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeARM_64.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -644,6 +644,7 @@
             imm = -imm;
             /* Fall through. */
         case SLJIT_ADD:
+            compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
             if (imm == 0) {
                 CHECK_FLAGS(1 << 29);
                 return push_inst(compiler, ((op == SLJIT_ADD ? ADDI : SUBI) ^ inv_bits) | RD(dst) | RN(reg));
@@ -781,6 +782,7 @@
         break; /* Set flags. */
     case SLJIT_NEG:
         SLJIT_ASSERT(arg1 == TMP_REG1);
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         if (flags & SET_FLAGS)
             inv_bits |= 1 << 29;
         return push_inst(compiler, (SUB ^ inv_bits) | RD(dst) | RN(TMP_ZERO) | RM(arg2));
@@ -789,6 +791,7 @@
         return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(arg2));
     case SLJIT_ADD:
         CHECK_FLAGS(1 << 29);
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return push_inst(compiler, (ADD ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
     case SLJIT_ADDC:
         CHECK_FLAGS(1 << 29);
@@ -795,11 +798,13 @@
         return push_inst(compiler, (ADC ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
     case SLJIT_SUB:
         CHECK_FLAGS(1 << 29);
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return push_inst(compiler, (SUB ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
     case SLJIT_SUBC:
         CHECK_FLAGS(1 << 29);
         return push_inst(compiler, (SBC ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
     case SLJIT_MUL:
+        compiler->status_flags_state = 0;
         if (!(flags & SET_FLAGS))
             return push_inst(compiler, (MADD ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2) | RT2(TMP_ZERO));
         if (flags & INT_OP) {
@@ -1600,16 +1605,14 @@
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */


-static sljit_uw get_cc(sljit_s32 type)
+static sljit_uw get_cc(struct sljit_compiler *compiler, sljit_s32 type)
 {
     switch (type) {
     case SLJIT_EQUAL:
-    case SLJIT_MUL_NOT_OVERFLOW:
     case SLJIT_EQUAL_F64:
         return 0x1;


     case SLJIT_NOT_EQUAL:
-    case SLJIT_MUL_OVERFLOW:
     case SLJIT_NOT_EQUAL_F64:
         return 0x0;


@@ -1642,10 +1645,16 @@
         return 0xc;


     case SLJIT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x0;
+
     case SLJIT_UNORDERED_F64:
         return 0x7;


     case SLJIT_NOT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x1;
+
     case SLJIT_ORDERED_F64:
         return 0x6;


@@ -1685,7 +1694,7 @@

     if (type < SLJIT_JUMP) {
         jump->flags |= IS_COND;
-        PTR_FAIL_IF(push_inst(compiler, B_CC | (6 << 5) | get_cc(type)));
+        PTR_FAIL_IF(push_inst(compiler, B_CC | (6 << 5) | get_cc(compiler, type)));
     }
     else if (type >= SLJIT_FAST_CALL)
         jump->flags |= IS_BL;
@@ -1799,7 +1808,7 @@
     CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);
     dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


     if (GET_OPCODE(op) < SLJIT_ADD) {
@@ -1854,7 +1863,7 @@
         srcw = 0;
     }


-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);
     dst_reg &= ~SLJIT_I32_OP;


     return push_inst(compiler, (CSEL ^ inv_bits) | (cc << 12) | RD(dst_reg) | RN(dst_reg) | RM(src));


Modified: code/trunk/src/sljit/sljitNativeARM_T2_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_T2_32.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeARM_T2_32.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -610,6 +610,7 @@
                Although some clever things could be done here, "NOT IMM" does not worth the efforts. */
             break;
         case SLJIT_ADD:
+            compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
             nimm = -(sljit_sw)imm;
             if (IS_2_LO_REGS(reg, dst)) {
                 if (imm <= 0x7)
@@ -643,6 +644,7 @@
             break;
         case SLJIT_SUB:
             /* SUB operation can be replaced by ADD because of the negative carry flag. */
+            compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
             if (flags & ARG1_IMM) {
                 if (imm == 0 && IS_2_LO_REGS(reg, dst))
                     return push_inst16(compiler, RSBSI | RD3(dst) | RN3(reg));
@@ -801,6 +803,7 @@
         FAIL_IF(push_inst32(compiler, CLZ | RN4(arg2) | RD4(dst) | RM4(arg2)));
         return SLJIT_SUCCESS;
     case SLJIT_ADD:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         if (IS_3_LO_REGS(dst, arg1, arg2))
             return push_inst16(compiler, ADDS | RD3(dst) | RN3(arg1) | RM3(arg2));
         if (dst == arg1 && !(flags & SET_FLAGS))
@@ -811,6 +814,7 @@
             return push_inst16(compiler, ADCS | RD3(dst) | RN3(arg2));
         return push_inst32(compiler, ADC_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
     case SLJIT_SUB:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         if (flags & UNUSED_RETURN) {
             if (IS_2_LO_REGS(arg1, arg2))
                 return push_inst16(compiler, CMP | RD3(arg1) | RN3(arg2));
@@ -824,6 +828,7 @@
             return push_inst16(compiler, SBCS | RD3(dst) | RN3(arg2));
         return push_inst32(compiler, SBC_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
     case SLJIT_MUL:
+        compiler->status_flags_state = 0;
         if (!(flags & SET_FLAGS))
             return push_inst32(compiler, MUL | RD4(dst) | RN4(arg1) | RM4(arg2));
         SLJIT_ASSERT(dst != TMP_REG2);
@@ -1760,16 +1765,14 @@
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */


-static sljit_uw get_cc(sljit_s32 type)
+static sljit_uw get_cc(struct sljit_compiler *compiler, sljit_s32 type)
 {
     switch (type) {
     case SLJIT_EQUAL:
-    case SLJIT_MUL_NOT_OVERFLOW:
     case SLJIT_EQUAL_F64:
         return 0x0;


     case SLJIT_NOT_EQUAL:
-    case SLJIT_MUL_OVERFLOW:
     case SLJIT_NOT_EQUAL_F64:
         return 0x1;


@@ -1802,10 +1805,16 @@
         return 0xd;


     case SLJIT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x1;
+
     case SLJIT_UNORDERED_F64:
         return 0x6;


     case SLJIT_NOT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return 0x0;
+
     case SLJIT_ORDERED_F64:
         return 0x7;


@@ -1847,7 +1856,7 @@
     PTR_FAIL_IF(emit_imm32_const(compiler, TMP_REG1, 0));
     if (type < SLJIT_JUMP) {
         jump->flags |= IS_COND;
-        cc = get_cc(type);
+        cc = get_cc(compiler, type);
         jump->flags |= cc << 8;
         PTR_FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
     }
@@ -2177,7 +2186,7 @@
     ADJUST_LOCAL_OFFSET(dst, dstw);


     op = GET_OPCODE(op);
-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);
     dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;


     if (op < SLJIT_ADD) {
@@ -2229,7 +2238,7 @@


     dst_reg &= ~SLJIT_I32_OP;


-    cc = get_cc(type & 0xff);
+    cc = get_cc(compiler, type & 0xff);


     if (!(src & SLJIT_IMM)) {
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));


Modified: code/trunk/src/sljit/sljitNativeMIPS_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_32.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeMIPS_32.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -367,7 +367,7 @@
     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & SRC2_IMM));


-        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW) {
+        if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW) {
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
             return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
 #else /* SLJIT_MIPS_REV < 1 */


Modified: code/trunk/src/sljit/sljitNativeMIPS_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_64.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeMIPS_64.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -458,7 +458,7 @@
     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & SRC2_IMM));


-        if (GET_FLAG_TYPE(op) != SLJIT_MUL_OVERFLOW) {
+        if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW) {
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
             return push_inst(compiler, SELECT_OP(DMUL, MUL) | S(src1) | T(src2) | D(dst), DR(dst));
 #elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)


Modified: code/trunk/src/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeMIPS_common.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeMIPS_common.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -1377,6 +1377,7 @@
         return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);


     case SLJIT_NEG:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return emit_op(compiler, SLJIT_SUB | GET_ALL_FLAGS(op), flags | IMM_OP, dst, dstw, SLJIT_IMM, 0, src, srcw);


     case SLJIT_CLZ:
@@ -1424,13 +1425,16 @@
     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
     case SLJIT_ADDC:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);


     case SLJIT_SUB:
     case SLJIT_SUBC:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);


     case SLJIT_MUL:
+        compiler->status_flags_state = 0;
         return emit_op(compiler, op, flags | CUMULATIVE_OP, dst, dstw, src1, src1w, src2, src2w);


     case SLJIT_AND:
@@ -1860,7 +1864,6 @@
     case SLJIT_SIG_LESS:
     case SLJIT_SIG_GREATER:
     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
         BR_Z(OTHER_FLAG);
         break;
     case SLJIT_GREATER_EQUAL:
@@ -1868,7 +1871,6 @@
     case SLJIT_SIG_GREATER_EQUAL:
     case SLJIT_SIG_LESS_EQUAL:
     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
         BR_NZ(OTHER_FLAG);
         break;
     case SLJIT_NOT_EQUAL_F64:
@@ -2127,8 +2129,12 @@
         FAIL_IF(push_inst(compiler, SLTIU | SA(EQUAL_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
         src_ar = dst_ar;
         break;
-    case SLJIT_MUL_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
+    case SLJIT_OVERFLOW:
+    case SLJIT_NOT_OVERFLOW:
+        if (compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB) {
+            src_ar = OTHER_FLAG;
+            break;
+        }
         FAIL_IF(push_inst(compiler, SLTIU | SA(OTHER_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
         src_ar = dst_ar;
         type ^= 0x1; /* Flip type bit for the XORI below. */
@@ -2219,7 +2225,6 @@
     case SLJIT_SIG_LESS:
     case SLJIT_SIG_GREATER:
     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
         ins = MOVN | TA(OTHER_FLAG);
         break;
     case SLJIT_GREATER_EQUAL:
@@ -2227,7 +2232,6 @@
     case SLJIT_SIG_GREATER_EQUAL:
     case SLJIT_SIG_LESS_EQUAL:
     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
         ins = MOVZ | TA(OTHER_FLAG);
         break;
     case SLJIT_EQUAL_F64:


Modified: code/trunk/src/sljit/sljitNativePPC_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativePPC_32.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativePPC_32.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -119,9 +119,10 @@
             SLJIT_ASSERT(src2 == TMP_REG2);
             return push_inst(compiler, ADDIC | D(dst) | A(src1) | compiler->imm);
         }
+        SLJIT_ASSERT(!(flags & ALT_FORM4));
         if (!(flags & ALT_SET_FLAGS))
             return push_inst(compiler, ADD | D(dst) | A(src1) | B(src2));
-        if (flags & ALT_FORM4)
+        if (flags & ALT_FORM5)
             return push_inst(compiler, ADDC | RC(ALT_SET_FLAGS) | D(dst) | A(src1) | B(src2));
         return push_inst(compiler, ADD | RC(flags) | D(dst) | A(src1) | B(src2));


@@ -143,24 +144,29 @@
         }


         if (flags & ALT_FORM2) {
+            if (flags & ALT_FORM3) {
+                FAIL_IF(push_inst(compiler, CMPI | CRD(0) | A(src1) | compiler->imm));
+                if (!(flags & ALT_FORM4))
+                    return SLJIT_SUCCESS;
+                return push_inst(compiler, ADDI | D(dst) | A(src1) | (-compiler->imm & 0xffff));
+            }
+            FAIL_IF(push_inst(compiler, CMP | CRD(0) | A(src1) | B(src2)));
+            if (!(flags & ALT_FORM4))
+                return SLJIT_SUCCESS;
+            return push_inst(compiler, SUBF | D(dst) | A(src2) | B(src1));
+        }
+
+        if (flags & ALT_FORM3) {
             /* Setting XER SO is not enough, CR SO is also needed. */
             return push_inst(compiler, SUBF | OE(ALT_SET_FLAGS) | RC(ALT_SET_FLAGS) | D(dst) | A(src2) | B(src1));
         }


-        if (flags & ALT_FORM3) {
+        if (flags & ALT_FORM4) {
             /* Flags does not set: BIN_IMM_EXTS unnecessary. */
             SLJIT_ASSERT(src2 == TMP_REG2);
             return push_inst(compiler, SUBFIC | D(dst) | A(src1) | compiler->imm);
         }


-        if (flags & ALT_FORM4) {
-            if (flags & ALT_FORM5) {
-                SLJIT_ASSERT(src2 == TMP_REG2);
-                return push_inst(compiler, CMPI | CRD(0) | A(src1) | compiler->imm);
-            }
-            return push_inst(compiler, CMP | CRD(0) | A(src1) | B(src2));
-        }
-
         if (!(flags & ALT_SET_FLAGS))
             return push_inst(compiler, SUBF | D(dst) | A(src2) | B(src1));
         if (flags & ALT_FORM5)


Modified: code/trunk/src/sljit/sljitNativePPC_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativePPC_64.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativePPC_64.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -252,10 +252,17 @@
             BIN_IMM_EXTS();
             return push_inst(compiler, ADDIC | D(dst) | A(src1) | compiler->imm);
         }
+        if (flags & ALT_FORM4) {
+            if (flags & ALT_FORM5)
+                FAIL_IF(push_inst(compiler, ADDI | D(dst) | A(src1) | compiler->imm));
+            else
+                FAIL_IF(push_inst(compiler, ADD | D(dst) | A(src1) | B(src2)));
+            return push_inst(compiler, CMPI | A(dst) | 0);
+        }
         if (!(flags & ALT_SET_FLAGS))
             return push_inst(compiler, ADD | D(dst) | A(src1) | B(src2));
         BIN_EXTS();
-        if (flags & ALT_FORM4)
+        if (flags & ALT_FORM5)
             return push_inst(compiler, ADDC | RC(ALT_SET_FLAGS) | D(dst) | A(src1) | B(src2));
         return push_inst(compiler, ADD | RC(flags) | D(dst) | A(src1) | B(src2));


@@ -278,6 +285,19 @@
         }


         if (flags & ALT_FORM2) {
+            if (flags & ALT_FORM3) {
+                FAIL_IF(push_inst(compiler, CMPI | CRD(0 | ((flags & ALT_SIGN_EXT) ? 0 : 1)) | A(src1) | compiler->imm));
+                if (!(flags & ALT_FORM4))
+                    return SLJIT_SUCCESS;
+                return push_inst(compiler, ADDI | D(dst) | A(src1) | (-compiler->imm & 0xffff));
+            }
+            FAIL_IF(push_inst(compiler, CMP | CRD(0 | ((flags & ALT_SIGN_EXT) ? 0 : 1)) | A(src1) | B(src2)));
+            if (!(flags & ALT_FORM4))
+                return SLJIT_SUCCESS;
+            return push_inst(compiler, SUBF | D(dst) | A(src2) | B(src1));
+        }
+
+        if (flags & ALT_FORM3) {
             if (flags & ALT_SIGN_EXT) {
                 FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, src1, 32, 31, 1)));
                 src1 = TMP_REG1;
@@ -291,20 +311,12 @@
             return SLJIT_SUCCESS;
         }


-        if (flags & ALT_FORM3) {
+        if (flags & ALT_FORM4) {
             /* Flags does not set: BIN_IMM_EXTS unnecessary. */
             SLJIT_ASSERT(src2 == TMP_REG2);
             return push_inst(compiler, SUBFIC | D(dst) | A(src1) | compiler->imm);
         }


-        if (flags & ALT_FORM4) {
-            if (flags & ALT_FORM5) {
-                SLJIT_ASSERT(src2 == TMP_REG2);
-                return push_inst(compiler, CMPI | CRD(0 | ((flags & ALT_SIGN_EXT) ? 0 : 1)) | A(src1) | compiler->imm);
-            }
-            return push_inst(compiler, CMP | CRD(0 | ((flags & ALT_SIGN_EXT) ? 0 : 1)) | A(src1) | B(src2));
-        }
-
         if (!(flags & ALT_SET_FLAGS))
             return push_inst(compiler, SUBF | D(dst) | A(src2) | B(src1));
         BIN_EXTS();


Modified: code/trunk/src/sljit/sljitNativePPC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativePPC_common.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativePPC_common.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -1324,6 +1324,25 @@
     ((src) & SLJIT_IMM)
 #endif


+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+/* Form selectors for sljit_emit_op2 below: decide which ALT_FORM* encoding
+   handles the flag combination requested by `op`.  On PPC64 the 32-bit
+   (SLJIT_I32_OP) variants need extra cases because the 64-bit condition
+   register bits do not directly reflect 32-bit results. */
+/* ADD form 1: overflow requested, or 32-bit add that sets both Z and carry. */
+#define TEST_ADD_FORM1(op) \
+    (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW \
+        || (op & (SLJIT_I32_OP | SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_I32_OP | SLJIT_SET_Z | SLJIT_SET_CARRY))
+/* SUB form 2: a signed comparison flag, or 32-bit sub setting only Z. */
+#define TEST_SUB_FORM2(op) \
+    ((GET_FLAG_TYPE(op) >= SLJIT_SIG_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) \
+        || (op & (SLJIT_I32_OP | SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_I32_OP | SLJIT_SET_Z))
+/* SUB form 3: overflow requested, or 32-bit sub that sets Z. */
+#define TEST_SUB_FORM3(op) \
+    (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW \
+        || (op & (SLJIT_I32_OP | SLJIT_SET_Z)) == (SLJIT_I32_OP | SLJIT_SET_Z))
+#else
+/* On 32-bit PowerPC the native flags match directly; only the flag type
+   itself selects the special forms. */
+#define TEST_ADD_FORM1(op) \
+    (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
+#define TEST_SUB_FORM2(op) \
+    (GET_FLAG_TYPE(op) >= SLJIT_SIG_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL)
+#define TEST_SUB_FORM3(op) \
+    (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
+#endif
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
@@ -1362,7 +1381,7 @@


     switch (GET_OPCODE(op)) {
     case SLJIT_ADD:
-        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
+        if (TEST_ADD_FORM1(op))
             return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM1, dst, dstw, src1, src1w, src2, src2w);


         if (!HAS_FLAGS(op) && ((src1 | src2) & SLJIT_IMM)) {
@@ -1392,6 +1411,20 @@
                 return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM2 | ALT_FORM4, dst, dstw, src2, src2w, TMP_REG2, 0);
             }
         }
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+        if ((op & (SLJIT_I32_OP | SLJIT_SET_Z)) == (SLJIT_I32_OP | SLJIT_SET_Z)) {
+            if (TEST_SL_IMM(src2, src2w)) {
+                compiler->imm = src2w & 0xffff;
+                return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM4 | ALT_FORM5, dst, dstw, src1, src1w, TMP_REG2, 0);
+            }
+            if (TEST_SL_IMM(src1, src1w)) {
+                compiler->imm = src1w & 0xffff;
+                return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM4 | ALT_FORM5, dst, dstw, src2, src2w, TMP_REG2, 0);
+            }
+            return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM4, dst, dstw, src1, src1w, src2, src2w);
+        }
+#endif
         if (HAS_FLAGS(op)) {
             if (TEST_SL_IMM(src2, src2w)) {
                 compiler->imm = src2w & 0xffff;
@@ -1402,7 +1435,7 @@
                 return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM3, dst, dstw, src2, src2w, TMP_REG2, 0);
             }
         }
-        return emit_op(compiler, SLJIT_ADD, flags | ((GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)) ? ALT_FORM4 : 0), dst, dstw, src1, src1w, src2, src2w);
+        return emit_op(compiler, SLJIT_ADD, flags | ((GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);


     case SLJIT_ADDC:
         return emit_op(compiler, SLJIT_ADDC, flags, dst, dstw, src1, src1w, src2, src2w);
@@ -1424,18 +1457,36 @@
             return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM1 | ALT_FORM3, dst, dstw, src1, src1w, src2, src2w);
         }


-        if (GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
+        if (dst == SLJIT_UNUSED && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
+            if (TEST_SL_IMM(src2, src2w)) {
+                compiler->imm = src2w & 0xffff;
+                return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2 | ALT_FORM3, dst, dstw, src1, src1w, TMP_REG2, 0);
+            }
             return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2, dst, dstw, src1, src1w, src2, src2w);
+        }


-        if (!HAS_FLAGS(op) && ((src1 | src2) & SLJIT_IMM)) {
-            if (TEST_SL_IMM(src2, -src2w)) {
-                compiler->imm = (-src2w) & 0xffff;
-                return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM2, dst, dstw, src1, src1w, TMP_REG2, 0);
+        if (TEST_SUB_FORM2(op)) {
+            if ((src2 & SLJIT_IMM) && src2w >= -SIMM_MAX && src2w <= SIMM_MAX) {
+                compiler->imm = src2w & 0xffff;
+                return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2 | ALT_FORM3 | ALT_FORM4, dst, dstw, src1, src1w, TMP_REG2, 0);
             }
-            if (TEST_SL_IMM(src1, src1w)) {
-                compiler->imm = src1w & 0xffff;
-                return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM3, dst, dstw, src2, src2w, TMP_REG2, 0);
-            }
+            return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2 | ALT_FORM4, dst, dstw, src1, src1w, src2, src2w);
+        }
+
+        if (TEST_SUB_FORM3(op))
+            return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM3, dst, dstw, src1, src1w, src2, src2w);
+
+        if (TEST_SL_IMM(src2, -src2w)) {
+            compiler->imm = (-src2w) & 0xffff;
+            return emit_op(compiler, SLJIT_ADD, flags | (!HAS_FLAGS(op) ? ALT_FORM2 : ALT_FORM3), dst, dstw, src1, src1w, TMP_REG2, 0);
+        }
+
+        if (TEST_SL_IMM(src1, src1w) && !(op & SLJIT_SET_Z)) {
+            compiler->imm = src1w & 0xffff;
+            return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM4, dst, dstw, src2, src2w, TMP_REG2, 0);
+        }
+
+        if (!HAS_FLAGS(op)) {
             if (TEST_SH_IMM(src2, -src2w)) {
                 compiler->imm = ((-src2w) >> 16) & 0xffff;
                 return emit_op(compiler, SLJIT_ADD, flags |  ALT_FORM2 | ALT_FORM3, dst, dstw, src1, src1w, TMP_REG2, 0);
@@ -1447,18 +1498,6 @@
             }
         }


-        if (dst == SLJIT_UNUSED && GET_FLAG_TYPE(op) != GET_FLAG_TYPE(SLJIT_SET_CARRY)) {
-            if (TEST_SL_IMM(src2, src2w)) {
-                compiler->imm = src2w & 0xffff;
-                return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM4 | ALT_FORM5, dst, dstw, src1, src1w, TMP_REG2, 0);
-            }
-            return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM4, dst, dstw, src1, src1w, src2, src2w);
-        }
-
-        if (TEST_SL_IMM(src2, -src2w)) {
-            compiler->imm = (-src2w) & 0xffff;
-            return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM3, dst, dstw, src1, src1w, TMP_REG2, 0);
-        }
         /* We know ALT_SIGN_EXT is set if it is an SLJIT_I32_OP on 64 bit systems. */
         return emit_op(compiler, SLJIT_SUB, flags | ((GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY)) ? ALT_FORM5 : 0), dst, dstw, src1, src1w, src2, src2w);


@@ -1536,6 +1575,10 @@
     return SLJIT_SUCCESS;
 }


+#undef TEST_ADD_FORM1
+#undef TEST_SUB_FORM2
+#undef TEST_SUB_FORM3
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 src, sljit_sw srcw)
 {
@@ -1941,11 +1984,9 @@
         return (4 << 21) | ((4 + 1) << 16);


     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
         return (12 << 21) | (3 << 16);


     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
         return (4 << 21) | (3 << 16);


     case SLJIT_EQUAL_F64:
@@ -2143,12 +2184,10 @@
         break;


     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
         cr_bit = 3;
         break;


     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
         cr_bit = 3;
         invert = 1;
         break;


Modified: code/trunk/src/sljit/sljitNativeS390X.c
===================================================================
--- code/trunk/src/sljit/sljitNativeS390X.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeS390X.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -45,7 +45,7 @@
 static const sljit_ins sljit_ins_const = (sljit_ins)1 << 48;


 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-    14, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1
+    0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1
 };


 /* there are also a[2-15] available, but they are slower to access and
@@ -120,8 +120,7 @@
 /* Convert SLJIT register to hardware register. */
 static SLJIT_INLINE sljit_gpr gpr(sljit_s32 r)
 {
-    SLJIT_ASSERT(r != SLJIT_UNUSED);
-    SLJIT_ASSERT(r < (sljit_s32)(sizeof(reg_map) / sizeof(reg_map[0])));
+    SLJIT_ASSERT(r >= 0 && r < (sljit_s32)(sizeof(reg_map) / sizeof(reg_map[0])));
     return reg_map[r];
 }


@@ -172,51 +171,93 @@
     return SLJIT_SUCCESS;
 }


+#define SLJIT_ADD_SUB_NO_COMPARE(status_flags_state) \
+    (((status_flags_state) & (SLJIT_CURRENT_FLAGS_ADD_SUB | SLJIT_CURRENT_FLAGS_COMPARE)) == SLJIT_CURRENT_FLAGS_ADD_SUB)
+
 /* Map the given type to a 4-bit condition code mask. */
-static SLJIT_INLINE sljit_u8 get_cc(sljit_s32 type) {
-    const sljit_u8 eq = 1 << 3; /* equal {,to zero} */
-    const sljit_u8 lt = 1 << 2; /* less than {,zero} */
-    const sljit_u8 gt = 1 << 1; /* greater than {,zero} */
-    const sljit_u8 ov = 1 << 0; /* {overflow,NaN} */
+static SLJIT_INLINE sljit_u8 get_cc(struct sljit_compiler *compiler, sljit_s32 type) {
+    const sljit_u8 cc0 = 1 << 3; /* equal {,to zero} */
+    const sljit_u8 cc1 = 1 << 2; /* less than {,zero} */
+    const sljit_u8 cc2 = 1 << 1; /* greater than {,zero} */
+    const sljit_u8 cc3 = 1 << 0; /* {overflow,NaN} */


     switch (type) {
     case SLJIT_EQUAL:
+        if (SLJIT_ADD_SUB_NO_COMPARE(compiler->status_flags_state)) {
+            sljit_s32 type = GET_FLAG_TYPE(compiler->status_flags_state);
+            if (type >= SLJIT_SIG_LESS && type <= SLJIT_SIG_LESS_EQUAL)
+                return cc0;
+            if (type == SLJIT_OVERFLOW)
+                return (cc0 | cc3);
+            return (cc0 | cc2);
+        }
+
     case SLJIT_EQUAL_F64:
-        return eq;
+        return cc0;


     case SLJIT_NOT_EQUAL:
+        if (SLJIT_ADD_SUB_NO_COMPARE(compiler->status_flags_state)) {
+            sljit_s32 type = GET_FLAG_TYPE(compiler->status_flags_state);
+            if (type >= SLJIT_SIG_LESS && type <= SLJIT_SIG_LESS_EQUAL)
+                return (cc1 | cc2 | cc3);
+            if (type == SLJIT_OVERFLOW)
+                return (cc1 | cc2);
+            return (cc1 | cc3);
+        }
+
     case SLJIT_NOT_EQUAL_F64:
-        return ~eq;
+        return (cc1 | cc2 | cc3);


     case SLJIT_LESS:
+        return cc1;
+
+    case SLJIT_GREATER_EQUAL:
+        return (cc0 | cc2 | cc3);
+
+    case SLJIT_GREATER:
+        if (compiler->status_flags_state & SLJIT_CURRENT_FLAGS_COMPARE)
+            return cc2;
+        return cc3;
+
+    case SLJIT_LESS_EQUAL:
+        if (compiler->status_flags_state & SLJIT_CURRENT_FLAGS_COMPARE)
+            return (cc0 | cc1);
+        return (cc0 | cc1 | cc2);
+
     case SLJIT_SIG_LESS:
     case SLJIT_LESS_F64:
-        return lt;
+        return cc1;


-    case SLJIT_LESS_EQUAL:
     case SLJIT_SIG_LESS_EQUAL:
     case SLJIT_LESS_EQUAL_F64:
-        return (lt | eq);
+        return (cc0 | cc1);


-    case SLJIT_GREATER:
     case SLJIT_SIG_GREATER:
-    case SLJIT_GREATER_F64:
-        return gt;
+        /* Overflow is considered greater, see SLJIT_SUB. */
+        return cc2 | cc3;


-    case SLJIT_GREATER_EQUAL:
     case SLJIT_SIG_GREATER_EQUAL:
-    case SLJIT_GREATER_EQUAL_F64:
-        return (gt | eq);
+        return (cc0 | cc2 | cc3);


     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
+        if (compiler->status_flags_state & SLJIT_SET_Z)
+            return (cc2 | cc3);
+
     case SLJIT_UNORDERED_F64:
-        return ov;
+        return cc3;


     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
+        if (compiler->status_flags_state & SLJIT_SET_Z)
+            return (cc0 | cc1);
+
     case SLJIT_ORDERED_F64:
-        return ~ov;
+        return (cc0 | cc1 | cc2);
+
+    case SLJIT_GREATER_F64:
+        return cc2;
+
+    case SLJIT_GREATER_EQUAL_F64:
+        return (cc0 | cc2);
     }


     SLJIT_UNREACHABLE();
@@ -346,19 +387,20 @@
 #define is_u32(d)    (0 <= (d) && (d) <= 0xffffffffL)


 #define CHECK_SIGNED(v, bitlen) \
-    ((v) == (((v) << (sizeof(v) * 8 - bitlen)) >> (sizeof(v) * 8 - bitlen)))
+    ((v) >= -(1 << ((bitlen) - 1)) && (v) < (1 << ((bitlen) - 1)))


+#define is_s8(d)    CHECK_SIGNED((d), 8)
 #define is_s16(d)    CHECK_SIGNED((d), 16)
 #define is_s20(d)    CHECK_SIGNED((d), 20)
-#define is_s32(d)    CHECK_SIGNED((d), 32)
+#define is_s32(d)    ((d) == (sljit_s32)(d))


-static SLJIT_INLINE sljit_uw disp_s20(sljit_s32 d)
+static SLJIT_INLINE sljit_ins disp_s20(sljit_s32 d)
 {
+    SLJIT_ASSERT(is_s20(d));
+
     sljit_uw dh = (d >> 12) & 0xff;
     sljit_uw dl = (d << 8) & 0xfff00;
-
-    SLJIT_ASSERT(is_s20(d));
-    return dh | dl;
+    return (dh | dl) << 8;
 }


 /* TODO(carenas): variadic macro is not strictly needed */
@@ -372,12 +414,6 @@
     return (pattern) | ((dst & 0xf) << 4) | (src & 0xf); \
 }


-/* ADD */
-SLJIT_S390X_RR(ar, 0x1a00)
-
-/* ADD LOGICAL */
-SLJIT_S390X_RR(alr, 0x1e00)
-
/* AND */
SLJIT_S390X_RR(nr, 0x1400)

@@ -387,12 +423,6 @@
/* BRANCH ON CONDITION */
SLJIT_S390X_RR(bcr, 0x0700) /* TODO(mundaym): type for mask? */

-/* COMPARE */
-SLJIT_S390X_RR(cr, 0x1900)
-
-/* COMPARE LOGICAL */
-SLJIT_S390X_RR(clr, 0x1500)
-
/* DIVIDE */
SLJIT_S390X_RR(dr, 0x1d00)

@@ -408,12 +438,6 @@
/* OR */
SLJIT_S390X_RR(or, 0x1600)

-/* SUBTRACT */
-SLJIT_S390X_RR(sr, 0x1b00)
-
-/* SUBTRACT LOGICAL */
-SLJIT_S390X_RR(slr, 0x1f00)
-
#undef SLJIT_S390X_RR

 /* RRE form instructions */
@@ -423,25 +447,9 @@
     return (pattern) | ((dst & 0xf) << 4) | (src & 0xf); \
 }


-/* ADD */
-SLJIT_S390X_RRE(agr, 0xb9080000)
-
-/* ADD LOGICAL */
-SLJIT_S390X_RRE(algr, 0xb90a0000)
-
-/* ADD LOGICAL WITH CARRY */
-SLJIT_S390X_RRE(alcr, 0xb9980000)
-SLJIT_S390X_RRE(alcgr, 0xb9880000)
-
/* AND */
SLJIT_S390X_RRE(ngr, 0xb9800000)

-/* COMPARE */
-SLJIT_S390X_RRE(cgr, 0xb9200000)
-
-/* COMPARE LOGICAL */
-SLJIT_S390X_RRE(clgr, 0xb9210000)
-
/* DIVIDE LOGICAL */
SLJIT_S390X_RRE(dlr, 0xb9970000)
SLJIT_S390X_RRE(dlgr, 0xb9870000)
@@ -482,8 +490,6 @@
SLJIT_S390X_RRE(mlgr, 0xb9860000)

/* MULTIPLY SINGLE */
-SLJIT_S390X_RRE(msr, 0xb2520000)
-SLJIT_S390X_RRE(msgr, 0xb90c0000)
SLJIT_S390X_RRE(msgfr, 0xb91c0000)

/* OR */
@@ -492,13 +498,6 @@
/* SUBTRACT */
SLJIT_S390X_RRE(sgr, 0xb9090000)

-/* SUBTRACT LOGICAL */
-SLJIT_S390X_RRE(slgr, 0xb90b0000)
-
-/* SUBTRACT LOGICAL WITH BORROW */
-SLJIT_S390X_RRE(slbr, 0xb9990000)
-SLJIT_S390X_RRE(slbgr, 0xb9890000)
-
#undef SLJIT_S390X_RRE

/* RI-a form instructions */
@@ -509,13 +508,8 @@
}

/* ADD HALFWORD IMMEDIATE */
-SLJIT_S390X_RIA(ahi, 0xa70a0000, sljit_s16)
SLJIT_S390X_RIA(aghi, 0xa70b0000, sljit_s16)

-/* COMPARE HALFWORD IMMEDIATE */
-SLJIT_S390X_RIA(chi, 0xa70e0000, sljit_s16)
-SLJIT_S390X_RIA(cghi, 0xa70f0000, sljit_s16)
-
/* LOAD HALFWORD IMMEDIATE */
SLJIT_S390X_RIA(lhi, 0xa7080000, sljit_s16)
SLJIT_S390X_RIA(lghi, 0xa7090000, sljit_s16)
@@ -533,9 +527,6 @@
/* OR IMMEDIATE */
SLJIT_S390X_RIA(oilh, 0xa50a0000, sljit_u16)

-/* TEST UNDER MASK */
-SLJIT_S390X_RIA(tmlh, 0xa7000000, sljit_u16)
-
#undef SLJIT_S390X_RIA

/* RIL-a form instructions (requires extended immediate facility) */
@@ -547,31 +538,14 @@
}

/* ADD IMMEDIATE */
-SLJIT_S390X_RILA(afi, 0xc20900000000, sljit_s32)
SLJIT_S390X_RILA(agfi, 0xc20800000000, sljit_s32)

/* ADD IMMEDIATE HIGH */
SLJIT_S390X_RILA(aih, 0xcc0800000000, sljit_s32) /* TODO(mundaym): high-word facility? */

-/* ADD LOGICAL IMMEDIATE */
-SLJIT_S390X_RILA(alfi, 0xc20b00000000, sljit_u32)
-SLJIT_S390X_RILA(algfi, 0xc20a00000000, sljit_u32)
-
/* AND IMMEDIATE */
SLJIT_S390X_RILA(nihf, 0xc00a00000000, sljit_u32)
-SLJIT_S390X_RILA(nilf, 0xc00b00000000, sljit_u32)

-/* COMPARE IMMEDIATE */
-SLJIT_S390X_RILA(cfi, 0xc20d00000000, sljit_s32)
-SLJIT_S390X_RILA(cgfi, 0xc20c00000000, sljit_s32)
-
-/* COMPARE IMMEDIATE HIGH */
-SLJIT_S390X_RILA(cih, 0xcc0d00000000, sljit_s32) /* TODO(mundaym): high-word facility? */
-
-/* COMPARE LOGICAL IMMEDIATE */
-SLJIT_S390X_RILA(clfi, 0xc20f00000000, sljit_u32)
-SLJIT_S390X_RILA(clgfi, 0xc20e00000000, sljit_u32)
-
/* EXCLUSIVE OR IMMEDIATE */
SLJIT_S390X_RILA(xilf, 0xc00700000000, sljit_u32)

@@ -586,8 +560,8 @@
SLJIT_S390X_RILA(llihf, 0xc00e00000000, sljit_u32)
SLJIT_S390X_RILA(llilf, 0xc00f00000000, sljit_u32)

-/* OR IMMEDIATE */
-SLJIT_S390X_RILA(oilf, 0xc00d00000000, sljit_u32)
+/* SUBTRACT LOGICAL IMMEDIATE */
+SLJIT_S390X_RILA(slfi, 0xc20500000000, sljit_u32)

#undef SLJIT_S390X_RILA

@@ -606,18 +580,6 @@
     return (pattern) | ri | xi | bi | di; \
 }


-/* ADD */
-SLJIT_S390X_RXA(a, 0x5a000000)
-
-/* ADD LOGICAL */
-SLJIT_S390X_RXA(al, 0x5e000000)
-
-/* AND */
-SLJIT_S390X_RXA(n, 0x54000000)
-
-/* EXCLUSIVE OR */
-SLJIT_S390X_RXA(x, 0x57000000)
-
/* LOAD */
SLJIT_S390X_RXA(l, 0x58000000)

@@ -630,9 +592,6 @@
/* MULTIPLY SINGLE */
SLJIT_S390X_RXA(ms, 0x71000000)

-/* OR */
-SLJIT_S390X_RXA(o, 0x56000000)
-
/* STORE */
SLJIT_S390X_RXA(st, 0x50000000)

@@ -642,12 +601,6 @@
/* STORE HALFWORD */
SLJIT_S390X_RXA(sth, 0x40000000)

-/* SUBTRACT */
-SLJIT_S390X_RXA(s, 0x5b000000)
-
-/* SUBTRACT LOGICAL */
-SLJIT_S390X_RXA(sl, 0x5f000000)
-
#undef SLJIT_S390X_RXA

 /* RXY-a instructions */
@@ -660,31 +613,11 @@
     ri = (sljit_ins)(r & 0xf) << 36; \
     xi = (sljit_ins)(x & 0xf) << 32; \
     bi = (sljit_ins)(b & 0xf) << 28; \
-    di = (sljit_ins)disp_s20(d) << 8; \
+    di = disp_s20(d); \
 \
     return (pattern) | ri | xi | bi | di; \
 }


-/* ADD */
-SLJIT_S390X_RXYA(ay,    0xe3000000005a, have_ldisp())
-SLJIT_S390X_RXYA(ag,    0xe30000000008, 1)
-
-/* ADD LOGICAL */
-SLJIT_S390X_RXYA(aly,   0xe3000000005e, have_ldisp())
-SLJIT_S390X_RXYA(alg,   0xe3000000000a, 1)
-
-/* ADD LOGICAL WITH CARRY */
-SLJIT_S390X_RXYA(alc,   0xe30000000098, 1)
-SLJIT_S390X_RXYA(alcg,  0xe30000000088, 1)
-
-/* AND */
-SLJIT_S390X_RXYA(ny,    0xe30000000054, have_ldisp())
-SLJIT_S390X_RXYA(ng,    0xe30000000080, 1)
-
-/* EXCLUSIVE OR */
-SLJIT_S390X_RXYA(xy,    0xe30000000057, have_ldisp())
-SLJIT_S390X_RXYA(xg,    0xe30000000082, 1)
-
 /* LOAD */
 SLJIT_S390X_RXYA(ly,    0xe30000000058, have_ldisp())
 SLJIT_S390X_RXYA(lg,    0xe30000000004, 1)
@@ -713,10 +646,6 @@
 SLJIT_S390X_RXYA(msy,   0xe30000000051, have_ldisp())
 SLJIT_S390X_RXYA(msg,   0xe3000000000c, 1)


-/* OR */
-SLJIT_S390X_RXYA(oy,    0xe30000000056, have_ldisp())
-SLJIT_S390X_RXYA(og,    0xe30000000081, 1)
-
 /* STORE */
 SLJIT_S390X_RXYA(sty,   0xe30000000050, have_ldisp())
 SLJIT_S390X_RXYA(stg,   0xe30000000024, 1)
@@ -727,41 +656,8 @@
 /* STORE HALFWORD */
 SLJIT_S390X_RXYA(sthy,  0xe30000000070, have_ldisp())


-/* SUBTRACT */
-SLJIT_S390X_RXYA(sy,    0xe3000000005b, have_ldisp())
-SLJIT_S390X_RXYA(sg,    0xe30000000009, 1)
-
-/* SUBTRACT LOGICAL */
-SLJIT_S390X_RXYA(sly,   0xe3000000005f, have_ldisp())
-SLJIT_S390X_RXYA(slg,   0xe3000000000b, 1)
-
-/* SUBTRACT LOGICAL WITH BORROW */
-SLJIT_S390X_RXYA(slb,   0xe30000000099, 1)
-SLJIT_S390X_RXYA(slbg,  0xe30000000089, 1)
-
 #undef SLJIT_S390X_RXYA


-/* RS-a instructions */
-#define SLJIT_S390X_RSA(name, pattern) \
-SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, sljit_sw d, sljit_gpr b) \
-{ \
-    sljit_ins r1 = (sljit_ins)(reg & 0xf) << 20; \
-    sljit_ins b2 = (sljit_ins)(b & 0xf) << 12; \
-    sljit_ins d2 = (sljit_ins)(d & 0xfff); \
-    return (pattern) | r1 | b2 | d2; \
-}
-
-/* SHIFT LEFT SINGLE LOGICAL */
-SLJIT_S390X_RSA(sll, 0x89000000)
-
-/* SHIFT RIGHT SINGLE */
-SLJIT_S390X_RSA(sra, 0x8a000000)
-
-/* SHIFT RIGHT SINGLE LOGICAL */
-SLJIT_S390X_RSA(srl, 0x88000000)
-
-#undef SLJIT_S390X_RSA
-
 /* RSY-a instructions */
 #define SLJIT_S390X_RSYA(name, pattern, cond) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src, sljit_sw d, sljit_gpr b) \
@@ -772,7 +668,7 @@
     r1 = (sljit_ins)(dst & 0xf) << 36; \
     r3 = (sljit_ins)(src & 0xf) << 32; \
     b2 = (sljit_ins)(b & 0xf) << 28; \
-    d2 = (sljit_ins)disp_s20(d) << 8; \
+    d2 = disp_s20(d); \
 \
     return (pattern) | r1 | r3 | b2 | d2; \
 }
@@ -786,9 +682,6 @@
 /* SHIFT RIGHT SINGLE */
 SLJIT_S390X_RSYA(srag,  0xeb000000000a, 1)


-/* SHIFT RIGHT SINGLE LOGICAL */
-SLJIT_S390X_RSYA(srlg, 0xeb000000000c, 1)
-
/* STORE MULTIPLE */
SLJIT_S390X_RSYA(stmg, 0xeb0000000024, 1)

@@ -831,26 +724,6 @@

#undef SLJIT_S390X_RIEF

-/* RRF-a instructions */
-#define SLJIT_S390X_RRFA(name, pattern, cond) \
-SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src1, sljit_gpr src2) \
-{ \
-    sljit_ins r1, r2, r3; \
-\
-    SLJIT_ASSERT(cond); \
-    r1 = (sljit_ins)(dst & 0xf) << 4; \
-    r2 = (sljit_ins)(src1 & 0xf); \
-    r3 = (sljit_ins)(src2 & 0xf) << 12; \
-\
-    return (pattern) | r3 | r1 | r2; \
-}
-
-/* MULTIPLY */
-SLJIT_S390X_RRFA(msrkc,  0xb9fd0000, have_misc2())
-SLJIT_S390X_RRFA(msgrkc, 0xb9ed0000, have_misc2())
-
-#undef SLJIT_S390X_RRFA
-
 /* RRF-c instructions (require load/store-on-condition 1 facility) */
 #define SLJIT_S390X_RRFC(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src, sljit_uw mask) \
@@ -919,6 +792,13 @@
     return 0x07f0 | target;
 }


+/* BRANCH RELATIVE ON CONDITION (RI form, opcode 0xa7?4): branches when the
+   current condition code matches `mask`; `target` is a 16-bit signed
+   relative offset (presumably in halfwords, as on z/Architecture — the
+   update_zero_overflow caller passes instruction-length counts; confirm). */
+SLJIT_S390X_INSTRUCTION(brc, sljit_uw mask, sljit_sw target)
+{
+    sljit_ins m1 = (sljit_ins)(mask & 0xf) << 20;
+    sljit_ins ri2 = (sljit_ins)target & 0xffff;
+    return 0xa7040000L | m1 | ri2;
+}
+
 SLJIT_S390X_INSTRUCTION(brcl, sljit_uw mask, sljit_sw target)
 {
     sljit_ins m1 = (sljit_ins)(mask & 0xf) << 36;
@@ -940,6 +820,12 @@
     return 0xb2220000 | ((sljit_ins)(dst & 0xf) << 4);
 }


+/* SET PROGRAM MASK: loads the condition code and program mask from `dst`;
+   used together with ipm() (insert program mask) to save and later restore
+   the condition code — see update_zero_overflow. */
+SLJIT_S390X_INSTRUCTION(spm, sljit_gpr dst)
+{
+    return 0x0400 | ((sljit_ins)(dst & 0xf) << 4);
+}
+
 /* ROTATE THEN INSERT SELECTED BITS HIGH (ZERO) */
 SLJIT_S390X_INSTRUCTION(risbhgz, sljit_gpr dst, sljit_gpr src, sljit_u8 start, sljit_u8 end, sljit_u8 rot)
 {
@@ -948,33 +834,23 @@


#undef SLJIT_S390X_INSTRUCTION

-/* load condition code as needed to match type */
-static sljit_s32 push_load_cc(struct sljit_compiler *compiler, sljit_s32 type)
+static sljit_s32 update_zero_overflow(struct sljit_compiler *compiler, sljit_s32 op, sljit_gpr dst_r)
 {
-    type &= ~SLJIT_I32_OP;
-    switch (type) {
-    case SLJIT_ZERO:
-    case SLJIT_NOT_ZERO:
-        return push_inst(compiler, cih(flag_r, 0));
-        break;
-    default:
-        return push_inst(compiler, tmlh(flag_r, 0x3000));
-        break;
-    }
+    /* Condition codes: bits 18 and 19.
+       Transformation:
+         0 (zero and no overflow) : unchanged
+         1 (non-zero and no overflow) : unchanged
+         2 (zero and overflow) : decreased by 1
+         3 (non-zero and overflow) : decreased by 1 if non-zero */
+    FAIL_IF(push_inst(compiler, brc(0xc, 2 + 2 + ((op & SLJIT_I32_OP) ? 1 : 2) + 2 + 3 + 1)));
+    FAIL_IF(push_inst(compiler, ipm(flag_r)));
+    FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ? or(dst_r, dst_r) : ogr(dst_r, dst_r)));
+    FAIL_IF(push_inst(compiler, brc(0x8, 2 + 3)));
+    FAIL_IF(push_inst(compiler, slfi(flag_r, 0x10000000)));
+    FAIL_IF(push_inst(compiler, spm(flag_r)));
     return SLJIT_SUCCESS;
 }


-static sljit_s32 push_store_zero_flag(struct sljit_compiler *compiler, sljit_s32 op, sljit_gpr source)
-{
-    /* insert low 32-bits into high 32-bits of flag register */
-    FAIL_IF(push_inst(compiler, risbhgz(flag_r, source, 0, 31, 32)));
-    if (!(op & SLJIT_I32_OP)) {
-        /* OR high 32-bits with high 32-bits of flag register */
-        return push_inst(compiler, rosbg(flag_r, source, 0, 31, 0));
-    }
-    return SLJIT_SUCCESS;
-}
-
 /* load 64-bit immediate into register without clobbering flags */
 static sljit_s32 push_load_imm_inst(struct sljit_compiler *compiler, sljit_gpr target, sljit_sw v)
 {
@@ -1088,9 +964,10 @@
 #define WHEN(cond, r, i1, i2, addr) \
     (cond) ? EVAL(i1, r, addr) : EVAL(i2, r, addr)


+/* May clobber tmp1. */
 static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst,
         sljit_s32 src, sljit_sw srcw,
-        sljit_gpr tmp /* clobbered */, sljit_s32 is_32bit)
+        sljit_s32 is_32bit)
 {
     struct addr addr;
     sljit_ins ins;
@@ -1097,9 +974,9 @@


     SLJIT_ASSERT(src & SLJIT_MEM);
     if (have_ldisp() || !is_32bit)
-        FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp));
+        FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
     else
-        FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp));
+        FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1));


     if (is_32bit)
         ins = WHEN(is_u12(addr.offset), dst, l, ly, addr);
@@ -1109,9 +986,10 @@
     return push_inst(compiler, ins);
 }


+/* May clobber tmp1. */
 static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src,
         sljit_s32 dst, sljit_sw dstw,
-        sljit_gpr tmp /* clobbered */, sljit_s32 is_32bit)
+        sljit_s32 is_32bit)
 {
     struct addr addr;
     sljit_ins ins;
@@ -1118,9 +996,9 @@


     SLJIT_ASSERT(dst & SLJIT_MEM);
     if (have_ldisp() || !is_32bit)
-        FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp));
+        FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
     else
-        FAIL_IF(make_addr_bx(compiler, &addr, dst, dstw, tmp));
+        FAIL_IF(make_addr_bx(compiler, &addr, dst, dstw, tmp1));


     if (is_32bit)
         ins = WHEN(is_u12(addr.offset), src, st, sty, addr);
@@ -1132,6 +1010,358 @@


#undef WHEN

+static sljit_s32 emit_move(struct sljit_compiler *compiler,
+    sljit_gpr dst_r,
+    sljit_s32 src, sljit_sw srcw)
+{
+    SLJIT_ASSERT(!SLOW_IS_REG(src) || dst_r != gpr(src & REG_MASK));
+
+    if (src & SLJIT_IMM)
+        return push_load_imm_inst(compiler, dst_r, srcw);
+
+    if (src & SLJIT_MEM)
+        return load_word(compiler, dst_r, src, srcw, (compiler->mode & SLJIT_I32_OP) != 0);
+
+    sljit_gpr src_r = gpr(src & REG_MASK);
+    return push_inst(compiler, (compiler->mode & SLJIT_I32_OP) ? lr(dst_r, src_r) : lgr(dst_r, src_r));
+}
+
+static sljit_s32 emit_rr(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_gpr dst_r = tmp0;
+    sljit_gpr src_r = tmp1;
+    sljit_s32 needs_move = 1;
+
+    if (SLOW_IS_REG(dst)) {
+        dst_r = gpr(dst & REG_MASK);
+
+        if (dst == src1)
+            needs_move = 0;
+        else if (dst == src2) {
+            dst_r = tmp0;
+            needs_move = 2;
+        }
+    }
+
+    if (needs_move)
+        FAIL_IF(emit_move(compiler, dst_r, src1, src1w));
+
+    if (FAST_IS_REG(src2))
+        src_r = gpr(src2 & REG_MASK);
+    else
+        FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+
+    FAIL_IF(push_inst(compiler, ins | (dst_r << 4) | src_r));
+
+    if (needs_move != 2)
+        return SLJIT_SUCCESS;
+
+    dst_r = gpr(dst & REG_MASK);
+    return push_inst(compiler, (compiler->mode & SLJIT_I32_OP) ? lr(dst_r, tmp0) : lgr(dst_r, tmp0));
+}
+
+static sljit_s32 emit_rrf(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+    sljit_gpr src1_r = tmp0;
+    sljit_gpr src2_r = tmp1;
+
+    if (FAST_IS_REG(src1))
+        src1_r = gpr(src1 & REG_MASK);
+    else
+        FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
+
+    if (FAST_IS_REG(src2))
+        src2_r = gpr(src2 & REG_MASK);
+    else
+        FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+
+    return push_inst(compiler, ins | (dst_r << 4) | src1_r | (src2_r << 12));
+}
+
+typedef enum {
+    RI_A,
+    RIL_A,
+} emit_ril_type;
+
+static sljit_s32 emit_ri(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_sw src2w,
+    emit_ril_type type)
+{
+    sljit_gpr dst_r = tmp0;
+    sljit_s32 needs_move = 1;
+
+    if (SLOW_IS_REG(dst)) {
+        dst_r = gpr(dst & REG_MASK);
+
+        if (dst == src1)
+            needs_move = 0;
+    }
+
+    if (needs_move)
+        FAIL_IF(emit_move(compiler, dst_r, src1, src1w));
+
+    if (type == RIL_A)
+        return push_inst(compiler, ins | (dst_r << 36) | (src2w & 0xffffffff));
+    return push_inst(compiler, ins | (dst_r << 20) | (src2w & 0xffff));
+}
+
+static sljit_s32 emit_rie_d(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_sw src2w)
+{
+    sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+    sljit_gpr src_r = tmp0;
+
+    if (!SLOW_IS_REG(src1))
+        FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
+    else
+        src_r = gpr(src1 & REG_MASK);
+
+    return push_inst(compiler, ins | (dst_r << 36) | (src_r << 32) | (src2w & 0xffff) << 16);
+}
+
+typedef enum {
+    RX_A,
+    RXY_A,
+} emit_rx_type;
+
+static sljit_s32 emit_rx(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w,
+    emit_rx_type type)
+{
+    sljit_gpr dst_r = tmp0;
+    sljit_s32 needs_move = 1;
+    sljit_gpr base, index;
+
+    SLJIT_ASSERT(src2 & SLJIT_MEM);
+
+    if (SLOW_IS_REG(dst)) {
+        dst_r = gpr(dst);
+
+        if (dst == src1)
+            needs_move = 0;
+        else if (dst == (src2 & REG_MASK) || (dst == OFFS_REG(src2))) {
+            dst_r = tmp0;
+            needs_move = 2;
+        }
+    }
+
+    if (needs_move)
+        FAIL_IF(emit_move(compiler, dst_r, src1, src1w));
+
+    base = gpr(src2 & REG_MASK);
+    index = tmp0;
+
+    if (src2 & OFFS_REG_MASK) {
+        index = gpr(OFFS_REG(src2));
+
+        if (src2w != 0) {
+            FAIL_IF(push_inst(compiler, sllg(tmp1, index, src2w & 0x3, 0)));
+            src2w = 0;
+            index = tmp1;
+        }
+    } else if ((type == RX_A && !is_u12(src2w)) || (type == RXY_A && !is_s20(src2w))) {
+        FAIL_IF(push_load_imm_inst(compiler, tmp1, src2w));
+
+        if (src2 & REG_MASK)
+            index = tmp1;
+        else
+            base = tmp1;
+        src2w = 0;
+    }
+
+    if (type == RX_A)
+        ins |= (dst_r << 20) | (index << 16) | (base << 12) | src2w;
+    else
+        ins |= (dst_r << 36) | (index << 32) | (base << 28) | disp_s20(src2w);
+
+    FAIL_IF(push_inst(compiler, ins));
+
+    if (needs_move != 2)
+        return SLJIT_SUCCESS;
+
+    dst_r = gpr(dst);
+    return push_inst(compiler, (compiler->mode & SLJIT_I32_OP) ? lr(dst_r, tmp0) : lgr(dst_r, tmp0));
+}
+
+static sljit_s32 emit_siy(struct sljit_compiler *compiler, sljit_ins ins,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_sw srcw)
+{
+    SLJIT_ASSERT(dst & SLJIT_MEM);
+
+    sljit_gpr dst_r = tmp1;
+
+    if (dst & OFFS_REG_MASK) {
+        sljit_gpr index = tmp1;
+
+        if ((dstw & 0x3) == 0)
+            index = gpr(OFFS_REG(dst));
+        else
+            FAIL_IF(push_inst(compiler, sllg(tmp1, index, dstw & 0x3, 0)));
+
+        FAIL_IF(push_inst(compiler, la(tmp1, 0, dst_r, index)));
+        dstw = 0;
+    }
+    else if (!is_s20(dstw)) {
+        FAIL_IF(push_load_imm_inst(compiler, tmp1, dstw));
+
+        if (dst & REG_MASK)
+            FAIL_IF(push_inst(compiler, la(tmp1, 0, dst_r, tmp1)));
+
+        dstw = 0;
+    }
+    else
+        dst_r = gpr(dst & REG_MASK);
+
+    return push_inst(compiler, ins | ((srcw & 0xff) << 32) | (dst_r << 28) | disp_s20(dstw));
+}
+
+struct ins_forms {
+    sljit_ins op_r;
+    sljit_ins op_gr;
+    sljit_ins op_rk;
+    sljit_ins op_grk;
+    sljit_ins op;
+    sljit_ins op_y;
+    sljit_ins op_g;
+};
+
+static sljit_s32 emit_commutative(struct sljit_compiler *compiler, const struct ins_forms *forms,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_s32 mode = compiler->mode;
+    sljit_ins ins, ins_k;
+
+    if ((src1 | src2) & SLJIT_MEM) {
+        sljit_ins ins12, ins20;
+
+        if (mode & SLJIT_I32_OP) {
+            ins12 = forms->op;
+            ins20 = forms->op_y;
+        }
+        else {
+            ins12 = 0;
+            ins20 = forms->op_g;
+        }
+
+        if (ins12 && ins20) {
+            /* Extra instructions needed for address computation can be executed independently. */
+            if ((src2 & SLJIT_MEM) && (!(src1 & SLJIT_MEM)
+                    || ((src1 & OFFS_REG_MASK) ? (src1w & 0x3) == 0 : is_s20(src1w)))) {
+                if ((src2 & OFFS_REG_MASK) || is_u12(src2w) || !is_s20(src2w))
+                    return emit_rx(compiler, ins12, dst, src1, src1w, src2, src2w, RX_A);
+
+                return emit_rx(compiler, ins20, dst, src1, src1w, src2, src2w, RXY_A);
+            }
+
+            if (src1 & SLJIT_MEM) {
+                if ((src1 & OFFS_REG_MASK) || is_u12(src1w) || !is_s20(src1w))
+                    return emit_rx(compiler, ins12, dst, src2, src2w, src1, src1w, RX_A);
+
+                return emit_rx(compiler, ins20, dst, src2, src2w, src1, src1w, RXY_A);
+            }
+        }
+        else if (ins12 || ins20) {
+            emit_rx_type rx_type;
+
+            if (ins12) {
+                rx_type = RX_A;
+                ins = ins12;
+            }
+            else {
+                rx_type = RXY_A;
+                ins = ins20;
+            }
+
+            if ((src2 & SLJIT_MEM) && (!(src1 & SLJIT_MEM)
+                    || ((src1 & OFFS_REG_MASK) ? (src1w & 0x3) == 0 : (rx_type == RX_A ? is_u12(src1w) : is_s20(src1w)))))
+                return emit_rx(compiler, ins, dst, src1, src1w, src2, src2w, rx_type);
+
+            if (src1 & SLJIT_MEM)
+                return emit_rx(compiler, ins, dst, src2, src2w, src1, src1w, rx_type);
+        }
+    }
+
+    if (mode & SLJIT_I32_OP) {
+        ins = forms->op_r;
+        ins_k = forms->op_rk;
+    }
+    else {
+        ins = forms->op_gr;
+        ins_k = forms->op_grk;
+    }
+
+    SLJIT_ASSERT(ins != 0 || ins_k != 0);
+
+    if (ins && SLOW_IS_REG(dst)) {
+        if (dst == src1)
+            return emit_rr(compiler, ins, dst, src1, src1w, src2, src2w);
+
+        if (dst == src2)
+            return emit_rr(compiler, ins, dst, src2, src2w, src1, src1w);
+    }
+
+    if (ins_k == 0)
+        return emit_rr(compiler, ins, dst, src1, src1w, src2, src2w);
+
+    return emit_rrf(compiler, ins_k, dst, src1, src1w, src2, src2w);
+}
+
+static sljit_s32 emit_non_commutative(struct sljit_compiler *compiler, const struct ins_forms *forms,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_s32 mode = compiler->mode;
+    sljit_ins ins;
+
+    if (src2 & SLJIT_MEM) {
+        sljit_ins ins12, ins20;
+
+        if (mode & SLJIT_I32_OP) {
+            ins12 = forms->op;
+            ins20 = forms->op_y;
+        }
+        else {
+            ins12 = 0;
+            ins20 = forms->op_g;
+        }
+
+        if (ins12 && ins20) {
+            if ((src2 & OFFS_REG_MASK) || is_u12(src2w) || !is_s20(src2w))
+                return emit_rx(compiler, ins12, dst, src1, src1w, src2, src2w, RX_A);
+
+            return emit_rx(compiler, ins20, dst, src1, src1w, src2, src2w, RXY_A);
+        }
+        else if (ins12)
+            return emit_rx(compiler, ins12, dst, src1, src1w, src2, src2w, RX_A);
+        else if (ins20)
+            return emit_rx(compiler, ins20, dst, src1, src1w, src2, src2w, RXY_A);
+    }
+
+    ins = (mode & SLJIT_I32_OP) ? forms->op_rk : forms->op_grk;
+
+    if (ins == 0 || (SLOW_IS_REG(dst) && dst == src1))
+        return emit_rr(compiler, (mode & SLJIT_I32_OP) ? forms->op_r : forms->op_gr, dst, src1, src1w, src2, src2w);
+
+    return emit_rrf(compiler, ins, dst, src1, src1w, src2, src2w);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
 {
     struct sljit_label *label;
@@ -1560,6 +1790,7 @@
         /* TODO(carenas): implement prefetch? */
         return SLJIT_SUCCESS;
     }
+
     if (opcode >= SLJIT_MOV && opcode <= SLJIT_MOV_P) {
         /* LOAD REGISTER */
         if (FAST_IS_REG(dst) && FAST_IS_REG(src)) {
@@ -1610,11 +1841,6 @@
                 SLJIT_UNREACHABLE();
             }
             FAIL_IF(push_inst(compiler, ins));
-            if (HAS_FLAGS(op)) {
-                /* only handle zero flag */
-                SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-                return push_store_zero_flag(compiler, op, dst_r);
-            }
             return SLJIT_SUCCESS;
         }
         /* LOAD IMMEDIATE */
@@ -1691,11 +1917,6 @@
                 SLJIT_UNREACHABLE();
             }
             FAIL_IF(push_inst(compiler, ins));
-            if (HAS_FLAGS(op)) {
-                /* only handle zero flag */
-                SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-                return push_store_zero_flag(compiler, op, reg);
-            }
             return SLJIT_SUCCESS;
         }
         /* STORE and STORE IMMEDIATE */
@@ -1724,11 +1945,6 @@
             case SLJIT_MOV_P:
             case SLJIT_MOV:
                 FAIL_IF(push_inst(compiler, LEVAL(stg)));
-                if (HAS_FLAGS(op)) {
-                    /* only handle zero flag */
-                    SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-                    return push_store_zero_flag(compiler, op, reg);
-                }
                 return SLJIT_SUCCESS;
             default:
                 SLJIT_UNREACHABLE();
@@ -1768,11 +1984,6 @@
                 FAIL_IF(make_addr_bxy(compiler, &mem, dst, dstw, tmp1));
                 FAIL_IF(push_inst(compiler,
                     EVAL(stg, tmp0, mem)));
-                if (HAS_FLAGS(op)) {
-                    /* only handle zero flag */
-                    SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-                    return push_store_zero_flag(compiler, op, tmp0);
-                }
                 return SLJIT_SUCCESS;
             default:
                 SLJIT_UNREACHABLE();
@@ -1786,8 +1997,10 @@
     dst_r = SLOW_IS_REG(dst) ? gpr(REG_MASK & dst) : tmp0;
     src_r = FAST_IS_REG(src) ? gpr(REG_MASK & src) : tmp0;
     if (src & SLJIT_MEM)
-        FAIL_IF(load_word(compiler, src_r, src, srcw, tmp1, src & SLJIT_I32_OP));
+        FAIL_IF(load_word(compiler, src_r, src, srcw, src & SLJIT_I32_OP));


+    compiler->status_flags_state = op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z);
+
     /* TODO(mundaym): optimize loads and stores */
     switch (opcode | (op & SLJIT_I32_OP)) {
     case SLJIT_NOT:
@@ -1811,9 +2024,11 @@
         }
         break;
     case SLJIT_NEG:
+        compiler->status_flags_state |= SLJIT_CURRENT_FLAGS_ADD_SUB;
         FAIL_IF(push_inst(compiler, lcgr(dst_r, src_r)));
         break;
     case SLJIT_NEG32:
+        compiler->status_flags_state |= SLJIT_CURRENT_FLAGS_ADD_SUB;
         FAIL_IF(push_inst(compiler, lcr(dst_r, src_r)));
         break;
     case SLJIT_CLZ:
@@ -1840,17 +2055,12 @@
         SLJIT_UNREACHABLE();
     }


-    /* write condition code to emulated flag register */
-    if (op & VARIABLE_FLAG_MASK)
-        FAIL_IF(push_inst(compiler, ipm(flag_r)));
+    if ((op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_SET_Z | SLJIT_SET_OVERFLOW))
+        FAIL_IF(update_zero_overflow(compiler, op, dst_r));


-    /* write zero flag to emulated flag register */
-    if (op & SLJIT_SET_Z)
-        FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
-
     /* TODO(carenas): doesn't need FAIL_IF */
     if ((dst != SLJIT_UNUSED) && (dst & SLJIT_MEM))
-        FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
+        FAIL_IF(store_word(compiler, dst_r, dst, dstw, op & SLJIT_I32_OP));


     return SLJIT_SUCCESS;
 }
@@ -1888,530 +2098,554 @@
     return 0;
 }


-/* Report whether we have an instruction for:
-     op dst src imm
-   where dst and src are separate registers. */
-static int have_op_3_imm(sljit_s32 op, sljit_sw imm) {
-    return 0; /* TODO(mundaym): implement */
-}
+static const struct ins_forms add_forms = {
+    0x1a00, /* ar */
+    0xb9080000, /* agr */
+    0xb9f80000, /* ark */
+    0xb9e80000, /* agrk */
+    0x5a000000, /* a */
+    0xe3000000005a, /* ay */
+    0xe30000000008, /* ag */
+};


-/* Report whether we have an instruction for:
-     op reg imm
-  where reg is both a source and the destination. */
-static int have_op_2_imm(sljit_s32 op, sljit_sw imm) {
-    switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
-    case SLJIT_ADD32:
-    case SLJIT_ADD:
-        if (!HAS_FLAGS(op) || sets_signed_flag(op))
-            return have_eimm() ? is_s32(imm) : is_s16(imm);
+static const struct ins_forms logical_add_forms = {
+    0x1e00, /* alr */
+    0xb90a0000, /* algr */
+    0xb9fa0000, /* alrk */
+    0xb9ea0000, /* algrk */
+    0x5e000000, /* al */
+    0xe3000000005e, /* aly */
+    0xe3000000000a, /* alg */
+};


-        return have_eimm() && is_u32(imm);
-    case SLJIT_MUL32:
-    case SLJIT_MUL:
-        /* TODO(mundaym): general extension check */
-        /* for ms{,g}fi */
-        if (op & VARIABLE_FLAG_MASK)
-            return 0;
+static sljit_s32 sljit_emit_add(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    int sets_overflow = (op & VARIABLE_FLAG_MASK) == SLJIT_SET_OVERFLOW;
+    int sets_zero_overflow = (op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_SET_Z | SLJIT_SET_OVERFLOW);
+    const struct ins_forms *forms;
+    sljit_ins ins;


-        return have_genext() && is_s16(imm);
-    case SLJIT_OR32:
-    case SLJIT_XOR32:
-    case SLJIT_AND32:
-        /* only use if have extended immediate facility */
-        /* this ensures flags are set correctly */
-        return have_eimm();
-    case SLJIT_AND:
-    case SLJIT_OR:
-    case SLJIT_XOR:
-        /* TODO(mundaym): make this more flexible */
-        /* avoid using immediate variations, flags */
-        /* won't be set correctly */
-        return 0;
-    case SLJIT_ADDC32:
-    case SLJIT_ADDC:
-        /* no ADD LOGICAL WITH CARRY IMMEDIATE */
-        return 0;
-    case SLJIT_SUB:
-    case SLJIT_SUB32:
-    case SLJIT_SUBC:
-    case SLJIT_SUBC32:
-        /* no SUBTRACT IMMEDIATE */
-        /* TODO(mundaym): SUBTRACT LOGICAL IMMEDIATE */
-        return 0;
+    if (src2 & SLJIT_IMM) {
+        if (!sets_zero_overflow && is_s8(src2w) && (src1 & SLJIT_MEM) && (dst == src1 && dstw == src1w)) {
+            if (sets_overflow)
+                ins = (op & SLJIT_I32_OP) ? 0xeb000000006a /* asi */ : 0xeb000000007a /* agsi */;
+            else
+                ins = (op & SLJIT_I32_OP) ? 0xeb000000006e /* alsi */ : 0xeb000000007e /* algsi */;
+            return emit_siy(compiler, ins, dst, dstw, src2w);
+        }
+
+        if (is_s16(src2w)) {
+            if (sets_overflow)
+                ins = (op & SLJIT_I32_OP) ? 0xec00000000d8 /* ahik */ : 0xec00000000d9 /* aghik */;
+            else
+                ins = (op & SLJIT_I32_OP) ? 0xec00000000da /* alhsik */ : 0xec00000000db /* alghsik */;
+            FAIL_IF(emit_rie_d(compiler, ins, dst, src1, src1w, src2w));
+            goto done;
+        }
+
+        if (!sets_overflow) {
+            if ((op & SLJIT_I32_OP) || is_u32(src2w)) {
+                ins = (op & SLJIT_I32_OP) ? 0xc20b00000000 /* alfi */ : 0xc20a00000000 /* algfi */;
+                FAIL_IF(emit_ri(compiler, ins, dst, src1, src1w, src2w, RIL_A));
+                goto done;
+            }
+            if (is_u32(-src2w)) {
+                FAIL_IF(emit_ri(compiler, 0xc20400000000 /* slgfi */, dst, src1, src1w, -src2w, RIL_A));
+                goto done;
+            }
+        }
+        else if ((op & SLJIT_I32_OP) || is_s32(src2w)) {
+            ins = (op & SLJIT_I32_OP) ? 0xc20900000000 /* afi */ : 0xc20800000000 /* agfi */;
+            FAIL_IF(emit_ri(compiler, ins, dst, src1, src1w, src2w, RIL_A));
+            goto done;
+        }
     }
-    return 0;
+
+    forms = sets_overflow ? &add_forms : &logical_add_forms;
+    FAIL_IF(emit_commutative(compiler, forms, dst, dstw, src1, src1w, src2, src2w));
+
+done:
+    if (sets_zero_overflow)
+        FAIL_IF(update_zero_overflow(compiler, op, SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0));
+
+    if (dst & SLJIT_MEM)
+        return store_word(compiler, tmp0, dst, dstw, op & SLJIT_I32_OP);
+
+    return SLJIT_SUCCESS;
 }


-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
+static const struct ins_forms sub_forms = {
+    0x1b00, /* sr */
+    0xb9090000, /* sgr */
+    0xb9f90000, /* srk */
+    0xb9e90000, /* sgrk */
+    0x5b000000, /* s */
+    0xe3000000005b, /* sy */
+    0xe30000000009, /* sg */
+};
+
+static const struct ins_forms logical_sub_forms = {
+    0x1f00, /* slr */
+    0xb90b0000, /* slgr */
+    0xb9fb0000, /* slrk */
+    0xb9eb0000, /* slgrk */
+    0x5f000000, /* sl */
+    0xe3000000005f, /* sly */
+    0xe3000000000b, /* slg */
+};
+
+static sljit_s32 sljit_emit_sub(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
 {
-    CHECK_ERROR();
-    CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
-    ADJUST_LOCAL_OFFSET(dst, dstw);
-    ADJUST_LOCAL_OFFSET(src1, src1w);
-    ADJUST_LOCAL_OFFSET(src2, src2w);
+    int sets_signed = sets_signed_flag(op);
+    int sets_zero_overflow = (op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_SET_Z | SLJIT_SET_OVERFLOW);
+    const struct ins_forms *forms;
+    sljit_ins ins;


-    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
-        return SLJIT_SUCCESS;
+    if (dst == SLJIT_UNUSED && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
+        int compare_signed = GET_FLAG_TYPE(op) >= SLJIT_SIG_LESS;


-    sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+        compiler->status_flags_state |= SLJIT_CURRENT_FLAGS_COMPARE;


-    if (is_commutative(op)) {
-        #define SWAP_ARGS \
-        do {                         \
-            sljit_s32 t = src1;  \
-            sljit_sw tw = src1w; \
-            src1 = src2;         \
-            src1w = src2w;       \
-            src2 = t;            \
-            src2w = tw;          \
-        } while(0);
-
-        /* prefer immediate in src2 */
-        if (src1 & SLJIT_IMM) {
-            SWAP_ARGS
+        if (src2 & SLJIT_IMM) {
+            if (compare_signed || ((op & VARIABLE_FLAG_MASK) == 0 && is_s32(src2w)))
+            {
+                if ((op & SLJIT_I32_OP) || is_s32(src2w)) {
+                    ins = (op & SLJIT_I32_OP) ? 0xc20d00000000 /* cfi */ : 0xc20c00000000 /* cgfi */;
+                    return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A);
+                }
+            }
+            else {
+                if ((op & SLJIT_I32_OP) || is_u32(src2w)) {
+                    ins = (op & SLJIT_I32_OP) ? 0xc20f00000000 /* clfi */ : 0xc20e00000000 /* clgfi */;
+                    return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A);
+                }
+                if (is_s16(src2w))
+                    return emit_rie_d(compiler, 0xec00000000db /* alghsik */, SLJIT_UNUSED, src1, src1w, src2w);
+            }
         }
+        else if (src2 & SLJIT_MEM) {
+            if ((op & SLJIT_I32_OP) && ((src2 & OFFS_REG_MASK) || is_u12(src2w))) {
+                ins = compare_signed ? 0x59000000 /* c */ : 0x55000000 /* cl */;
+                return emit_rx(compiler, ins, src1, src1, src1w, src2, src2w, RX_A);
+            }


-        /* prefer to have src1 use same register as dst */
-        if (FAST_IS_REG(src2) && gpr(src2 & REG_MASK) == dst_r) {
-            SWAP_ARGS
+            if (compare_signed)
+                ins = (op & SLJIT_I32_OP) ? 0xe30000000059 /* cy */ : 0xe30000000020 /* cg */;
+            else
+                ins = (op & SLJIT_I32_OP) ? 0xe30000000055 /* cly */ : 0xe30000000021 /* clg */;
+            return emit_rx(compiler, ins, src1, src1, src1w, src2, src2w, RXY_A);
         }


-        /* prefer memory argument in src2 */
-        if (FAST_IS_REG(src2) && (src1 & SLJIT_MEM)) {
-            SWAP_ARGS
-        }
-        #undef SWAP_ARGS
+        if (compare_signed)
+            ins = (op & SLJIT_I32_OP) ? 0x1900 /* cr */ : 0xb9200000 /* cgr */;
+        else
+            ins = (op & SLJIT_I32_OP) ? 0x1500 /* clr */ : 0xb9210000 /* clgr */;
+        return emit_rr(compiler, ins, src1, src1, src1w, src2, src2w);
     }


-    /* src1 must be in a register */
-    sljit_gpr src1_r = FAST_IS_REG(src1) ? gpr(src1 & REG_MASK) : tmp0;
-    if (src1 & SLJIT_IMM)
-        FAIL_IF(push_load_imm_inst(compiler, src1_r, src1w));
+    if (src2 & SLJIT_IMM) {
+        sljit_sw neg_src2w = -src2w;


-    if (src1 & SLJIT_MEM)
-        FAIL_IF(load_word(compiler, src1_r, src1, src1w, tmp1, op & SLJIT_I32_OP));
+        if (sets_signed || neg_src2w != 0 || (op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == 0) {
+            if (!sets_zero_overflow && is_s8(neg_src2w) && (src1 & SLJIT_MEM) && (dst == src1 && dstw == src1w)) {
+                if (sets_signed)
+                    ins = (op & SLJIT_I32_OP) ? 0xeb000000006a /* asi */ : 0xeb000000007a /* agsi */;
+                else
+                    ins = (op & SLJIT_I32_OP) ? 0xeb000000006e /* alsi */ : 0xeb000000007e /* algsi */;
+                return emit_siy(compiler, ins, dst, dstw, neg_src2w);
+            }


-    /* emit comparison before subtract */
-    if (GET_OPCODE(op) == SLJIT_SUB && (op & VARIABLE_FLAG_MASK)) {
-        sljit_sw cmp = 0;
-        switch (GET_FLAG_TYPE(op)) {
-        case SLJIT_LESS:
-        case SLJIT_LESS_EQUAL:
-        case SLJIT_GREATER:
-        case SLJIT_GREATER_EQUAL:
-            cmp = 1; /* unsigned */
-            break;
-        case SLJIT_EQUAL:
-        case SLJIT_SIG_LESS:
-        case SLJIT_SIG_LESS_EQUAL:
-        case SLJIT_SIG_GREATER:
-        case SLJIT_SIG_GREATER_EQUAL:
-            cmp = -1; /* signed */
-            break;
+            if (is_s16(neg_src2w)) {
+                if (sets_signed)
+                    ins = (op & SLJIT_I32_OP) ? 0xec00000000d8 /* ahik */ : 0xec00000000d9 /* aghik */;
+                else
+                    ins = (op & SLJIT_I32_OP) ? 0xec00000000da /* alhsik */ : 0xec00000000db /* alghsik */;
+                FAIL_IF(emit_rie_d(compiler, ins, dst, src1, src1w, neg_src2w));
+                goto done;
+            }
         }
-        if (cmp) {
-            /* clear flags - no need to generate now */
-            op &= ~VARIABLE_FLAG_MASK;
-            sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
-            if (src2 & SLJIT_IMM) {
-                #define LEVAL(i) i(src1_r, src2w)
-                if (cmp > 0 && is_u32(src2w)) {
-                    /* unsigned */
-                    FAIL_IF(push_inst(compiler,
-                    WHEN2(op & SLJIT_I32_OP, clfi, clgfi)));
-                }
-                else if (cmp < 0 && is_s16(src2w)) {
-                    /* signed */
-                    FAIL_IF(push_inst(compiler,
-                    WHEN2(op & SLJIT_I32_OP, chi, cghi)));
-                }
-                else if (cmp < 0 && is_s32(src2w)) {
-                    /* signed */
-                    FAIL_IF(push_inst(compiler,
-                    WHEN2(op & SLJIT_I32_OP, cfi, cgfi)));
-                }
-                #undef LEVAL
-                #define LEVAL(i) i(src1_r, src2_r)
-                else {
-                    FAIL_IF(push_load_imm_inst(compiler, src2_r, src2w));
-                    if (cmp > 0) {
-                        /* unsigned */
-                        FAIL_IF(push_inst(compiler,
-                        WHEN2(op & SLJIT_I32_OP, clr, clgr)));
-                    }
-                    if (cmp < 0) {
-                        /* signed */
-                        FAIL_IF(push_inst(compiler,
-                        WHEN2(op & SLJIT_I32_OP, cr, cgr)));
-                    }
-                }
+
+        if (!sets_signed) {
+            if ((op & SLJIT_I32_OP) || is_u32(src2w)) {
+                ins = (op & SLJIT_I32_OP) ? 0xc20500000000 /* slfi */ : 0xc20400000000 /* slgfi */;
+                FAIL_IF(emit_ri(compiler, ins, dst, src1, src1w, src2w, RIL_A));
+                goto done;
             }
-            else {
-                if (src2 & SLJIT_MEM) {
-                    /* TODO(mundaym): comparisons with memory */
-                    /* load src2 into register */
-                    FAIL_IF(load_word(compiler, src2_r, src2, src2w, tmp1, op & SLJIT_I32_OP));
-                }
-                if (cmp > 0) {
-                    /* unsigned */
-                    FAIL_IF(push_inst(compiler,
-                        WHEN2(op & SLJIT_I32_OP, clr, clgr)));
-                }
-                if (cmp < 0) {
-                    /* signed */
-                    FAIL_IF(push_inst(compiler,
-                        WHEN2(op & SLJIT_I32_OP, cr, cgr)));
-                }
-                #undef LEVAL
+            if (is_u32(neg_src2w)) {
+                FAIL_IF(emit_ri(compiler, 0xc20a00000000 /* algfi */, dst, src1, src1w, neg_src2w, RIL_A));
+                goto done;
             }
-            FAIL_IF(push_inst(compiler, ipm(flag_r)));
         }
+        else if ((op & SLJIT_I32_OP) || is_s32(neg_src2w)) {
+            ins = (op & SLJIT_I32_OP) ? 0xc20900000000 /* afi */ : 0xc20800000000 /* agfi */;
+            FAIL_IF(emit_ri(compiler, ins, dst, src1, src1w, neg_src2w, RIL_A));
+            goto done;
+        }
     }


-    if (!HAS_FLAGS(op) && dst == SLJIT_UNUSED)
-        return SLJIT_SUCCESS;
+    forms = sets_signed ? &sub_forms : &logical_sub_forms;
+    FAIL_IF(emit_non_commutative(compiler, forms, dst, dstw, src1, src1w, src2, src2w));


-    /* need to specify signed or logical operation */
-    int signed_flags = sets_signed_flag(op);
+done:
+    if (sets_signed) {
+        sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;


-    if (is_shift(op)) {
-        /* handle shifts first, they have more constraints than other operations */
-        sljit_sw d = 0;
-        sljit_gpr b = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : r0;
-        if (src2 & SLJIT_IMM)
-            d = src2w & ((op & SLJIT_I32_OP) ? 31 : 63);
+        if ((op & VARIABLE_FLAG_MASK) != SLJIT_SET_OVERFLOW) {
+            /* In case of overflow, the sign bit of the two source operands must be different, and
+                 - the first operand is greater if the sign bit of the result is set
+                 - the first operand is less if the sign bit of the result is not set
+               The -result operation sets the correct sign, because the result cannot be zero.
+               The overflow is considered greater, since the result must be equal to INT_MIN so its sign bit is set. */
+            FAIL_IF(push_inst(compiler, brc(0xe, 2 + 2)));
+            FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ? lcr(tmp1, dst_r) : lcgr(tmp1, dst_r)));
+        }
+        else if (op & SLJIT_SET_Z)
+            FAIL_IF(update_zero_overflow(compiler, op, dst_r));
+    }


-        if (src2 & SLJIT_MEM) {
-            /* shift amount (b) cannot be in r0 (i.e. tmp0) */
-            FAIL_IF(load_word(compiler, tmp1, src2, src2w, tmp1, op & SLJIT_I32_OP));
-            b = tmp1;
+    if (dst & SLJIT_MEM)
+        return store_word(compiler, tmp0, dst, dstw, op & SLJIT_I32_OP);
+
+    return SLJIT_SUCCESS;
+}
+
+static const struct ins_forms multiply_forms = {
+    0xb2520000, /* msr */
+    0xb90c0000, /* msgr */
+    0xb9fd0000, /* msrkc */
+    0xb9ed0000, /* msgrkc */
+    0x71000000, /* ms */
+    0xe30000000051, /* msy */
+    0xe3000000000c, /* msg */
+};
+
+static const struct ins_forms multiply_overflow_forms = {
+    0,
+    0,
+    0xb9fd0000, /* msrkc */
+    0xb9ed0000, /* msgrkc */
+    0,
+    0xe30000000053, /* msc */
+    0xe30000000083, /* msgc */
+};
+
+static sljit_s32 sljit_emit_multiply(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_ins ins;
+
+    if (HAS_FLAGS(op)) {
+        /* if have_misc2 fails, this operation should be emulated. 32 bit emulation:
+        FAIL_IF(push_inst(compiler, lgfr(tmp0, src1_r)));
+        FAIL_IF(push_inst(compiler, msgfr(tmp0, src2_r)));
+        if (dst_r != tmp0) {
+            FAIL_IF(push_inst(compiler, lr(dst_r, tmp0)));
         }
-        /* src1 and dst share the same register in the base 32-bit ISA */
-        /* TODO(mundaym): not needed when distinct-operand facility is available */
-        int workaround_alias = op & SLJIT_I32_OP && src1_r != dst_r;
-        if (workaround_alias) {
-            /* put src1 into tmp0 so we can overwrite it */
-            FAIL_IF(push_inst(compiler, lr(tmp0, src1_r)));
-            src1_r = tmp0;
-        }
-        switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
-        case SLJIT_SHL:
-            FAIL_IF(push_inst(compiler, sllg(dst_r, src1_r, d, b)));
-            break;
-        case SLJIT_SHL32:
-            FAIL_IF(push_inst(compiler, sll(src1_r, d, b)));
-            break;
-        case SLJIT_LSHR:
-            FAIL_IF(push_inst(compiler, srlg(dst_r, src1_r, d, b)));
-            break;
-        case SLJIT_LSHR32:
-            FAIL_IF(push_inst(compiler, srl(src1_r, d, b)));
-            break;
-        case SLJIT_ASHR:
-            FAIL_IF(push_inst(compiler, srag(dst_r, src1_r, d, b)));
-            break;
-        case SLJIT_ASHR32:
-            FAIL_IF(push_inst(compiler, sra(src1_r, d, b)));
-            break;
-        default:
-            SLJIT_UNREACHABLE();
-        }
-        if (workaround_alias && dst_r != src1_r)
-            FAIL_IF(push_inst(compiler, lr(dst_r, src1_r)));
+        FAIL_IF(push_inst(compiler, aih(tmp0, 1)));
+        FAIL_IF(push_inst(compiler, nihf(tmp0, ~1U)));
+        FAIL_IF(push_inst(compiler, ipm(flag_r)));
+        FAIL_IF(push_inst(compiler, oilh(flag_r, 0x2000))); */


+        return emit_commutative(compiler, &multiply_overflow_forms, dst, dstw, src1, src1w, src2, src2w);
     }
-    else if ((GET_OPCODE(op) == SLJIT_MUL) && HAS_FLAGS(op)) {
-        /* multiply instructions do not generally set flags so we need to manually */
-        /* detect overflow conditions */
-        /* TODO(mundaym): 64-bit overflow */
-        SLJIT_ASSERT(GET_FLAG_TYPE(op) == SLJIT_MUL_OVERFLOW ||
-                     GET_FLAG_TYPE(op) == SLJIT_MUL_NOT_OVERFLOW);
-        sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
-        if (src2 & SLJIT_IMM) {
-            /* load src2 into register */
-            FAIL_IF(push_load_imm_inst(compiler, src2_r, src2w));
+
+    if (src2 & SLJIT_IMM) {
+        if (is_s16(src2w)) {
+            ins = (op & SLJIT_I32_OP) ? 0xa70c0000 /* mhi */ : 0xa70d0000 /* mghi */;
+            return emit_ri(compiler, ins, dst, src1, src1w, src2w, RI_A);
         }
-        if (src2 & SLJIT_MEM) {
-            /* load src2 into register */
-            FAIL_IF(load_word(compiler, src2_r, src2, src2w, tmp1, op & SLJIT_I32_OP));
+
+        if (is_s32(src2w)) {
+            ins = (op & SLJIT_I32_OP) ? 0xc20100000000 /* msfi */ : 0xc20000000000 /* msgfi */;
+            return emit_ri(compiler, ins, dst, src1, src1w, src2w, RIL_A);
         }
-        if (have_misc2()) {
-            #define LEVAL(i) i(dst_r, src1_r, src2_r)
-            FAIL_IF(push_inst(compiler,
-                WHEN2(op & SLJIT_I32_OP, msrkc, msgrkc)));
-            #undef LEVAL
-        }
-        else if (op & SLJIT_I32_OP) {
-            op &= ~VARIABLE_FLAG_MASK;
-            FAIL_IF(push_inst(compiler, lgfr(tmp0, src1_r)));
-            FAIL_IF(push_inst(compiler, msgfr(tmp0, src2_r)));
-            if (dst_r != tmp0) {
-                FAIL_IF(push_inst(compiler, lr(dst_r, tmp0)));
-            }
-            FAIL_IF(push_inst(compiler, aih(tmp0, 1)));
-            FAIL_IF(push_inst(compiler, nihf(tmp0, ~1U)));
-            FAIL_IF(push_inst(compiler, ipm(flag_r)));
-            FAIL_IF(push_inst(compiler, oilh(flag_r, 0x2000)));
-        }
-        else
-            return SLJIT_ERR_UNSUPPORTED;
+    }


+    return emit_commutative(compiler, &multiply_forms, dst, dstw, src1, src1w, src2, src2w);
+}
+
+static sljit_s32 sljit_emit_bitwise_imm(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_uw imm, sljit_s32 count16)
+{
+    sljit_s32 mode = compiler->mode;
+    sljit_gpr dst_r = tmp0;
+    sljit_s32 needs_move = 1;
+
+    if (SLOW_IS_REG(dst)) {
+        dst_r = gpr(dst & REG_MASK);
+        if (dst == src1)
+            needs_move = 0;
     }
-    else if ((GET_OPCODE(op) == SLJIT_SUB) && (op & SLJIT_SET_Z) && !signed_flags) {
-        /* subtract logical instructions do not set the right flags unfortunately */
-        /* instead, negate src2 and issue an add logical */
-        /* TODO(mundaym): distinct operand facility where needed */
-        if (src1_r != dst_r && src1_r != tmp0) {
-            #define LEVAL(i) i(tmp0, src1_r)
-            FAIL_IF(push_inst(compiler,
-                WHEN2(op & SLJIT_I32_OP, lr, lgr)));
-            src1_r = tmp0;
-            #undef LEVAL
+
+    if (needs_move)
+        FAIL_IF(emit_move(compiler, dst_r, src1, src1w));
+
+    if (type == SLJIT_AND) {
+        if (!(mode & SLJIT_I32_OP))
+            FAIL_IF(push_inst(compiler, 0xc00a00000000 /* nihf */ | (dst_r << 36) | (imm >> 32)));
+        return push_inst(compiler, 0xc00b00000000 /* nilf */ | (dst_r << 36) | (imm & 0xffffffff));
+    }
+    else if (type == SLJIT_OR) {
+        if (count16 >= 3) {
+            FAIL_IF(push_inst(compiler, 0xc00c00000000 /* oihf */ | (dst_r << 36) | (imm >> 32)));
+            return push_inst(compiler, 0xc00d00000000 /* oilf */ | (dst_r << 36) | (imm & 0xffffffff));
         }
-        sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
-        if (src2 & SLJIT_IMM) {
-            /* load src2 into register */
-            FAIL_IF(push_load_imm_inst(compiler, src2_r, src2w));
+
+        if (count16 >= 2) {
+            if ((imm & 0x00000000ffffffffull) == 0)
+                return push_inst(compiler, 0xc00c00000000 /* oihf */ | (dst_r << 36) | (imm >> 32));
+            if ((imm & 0xffffffff00000000ull) == 0)
+                return push_inst(compiler, 0xc00d00000000 /* oilf */ | (dst_r << 36) | (imm & 0xffffffff));
         }
-        if (src2 & SLJIT_MEM) {
-            /* load src2 into register */
-            FAIL_IF(load_word(compiler, src2_r, src2, src2w, tmp1, op & SLJIT_I32_OP));
-        }
-        if (op & SLJIT_I32_OP) {
-            FAIL_IF(push_inst(compiler, lcr(tmp1, src2_r)));
-            FAIL_IF(push_inst(compiler, alr(src1_r, tmp1)));
-            if (src1_r != dst_r)
-                FAIL_IF(push_inst(compiler, lr(dst_r, src1_r)));
-        }
-        else {
-            FAIL_IF(push_inst(compiler, lcgr(tmp1, src2_r)));
-            FAIL_IF(push_inst(compiler, algr(src1_r, tmp1)));
-            if (src1_r != dst_r)
-                FAIL_IF(push_inst(compiler, lgr(dst_r, src1_r)));
-        }
+
+        if ((imm & 0xffff000000000000ull) != 0)
+            FAIL_IF(push_inst(compiler, 0xa5080000 /* oihh */ | (dst_r << 20) | (imm >> 48)));
+        if ((imm & 0x0000ffff00000000ull) != 0)
+            FAIL_IF(push_inst(compiler, 0xa5090000 /* oihl */ | (dst_r << 20) | ((imm >> 32) & 0xffff)));
+        if ((imm & 0x00000000ffff0000ull) != 0)
+            FAIL_IF(push_inst(compiler, 0xa50a0000 /* oilh */ | (dst_r << 20) | ((imm >> 16) & 0xffff)));
+        if ((imm & 0x000000000000ffffull) != 0 || imm == 0)
+            return push_inst(compiler, 0xa50b0000 /* oill */ | (dst_r << 20) | (imm & 0xffff));
+        return SLJIT_SUCCESS;
     }
-    else if ((src2 & SLJIT_IMM) && (src1_r == dst_r) && have_op_2_imm(op, src2w)) {
-        switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
-        #define LEVAL(i) i(dst_r, src2w)
-        case SLJIT_ADD:
-            if (!HAS_FLAGS(op) || signed_flags) {
-                FAIL_IF(push_inst(compiler,
-                    WHEN2(is_s16(src2w), aghi, agfi)));
-            }
-            else
-                FAIL_IF(push_inst(compiler, LEVAL(algfi)));


-            break;
-        case SLJIT_ADD32:
-            if (!HAS_FLAGS(op) || signed_flags)
-                FAIL_IF(push_inst(compiler,
-                    WHEN2(is_s16(src2w), ahi, afi)));
+    if ((imm & 0xffffffff00000000ull) != 0)
+        FAIL_IF(push_inst(compiler, 0xc00600000000 /* xihf */ | (dst_r << 36) | (imm >> 32)));
+    if ((imm & 0x00000000ffffffffull) != 0 || imm == 0)
+        return push_inst(compiler, 0xc00700000000 /* xilf */ | (dst_r << 36) | (imm & 0xffffffff));
+    return SLJIT_SUCCESS;
+}
+
+/* AND encodings for emit_commutative(); same field layout as the other
+   ins_forms tables in this file (RR pair, RRF distinct-operand pair,
+   memory forms). */
+static const struct ins_forms bitwise_and_forms = {
+    0x1400, /* nr */
+    0xb9800000, /* ngr */
+    0xb9f40000, /* nrk */
+    0xb9e40000, /* ngrk */
+    0x54000000, /* n */
+    0xe30000000054, /* ny */
+    0xe30000000080, /* ng */
+};
+
+/* OR encodings for emit_commutative(); same field layout as
+   bitwise_and_forms above. */
+static const struct ins_forms bitwise_or_forms = {
+    0x1600, /* or */
+    0xb9810000, /* ogr */
+    0xb9f60000, /* ork */
+    0xb9e60000, /* ogrk */
+    0x56000000, /* o */
+    0xe30000000056, /* oy */
+    0xe30000000081, /* og */
+};
+
+/* XOR encodings for emit_commutative(); same field layout as
+   bitwise_and_forms above. */
+static const struct ins_forms bitwise_xor_forms = {
+    0x1700, /* xr */
+    0xb9820000, /* xgr */
+    0xb9f70000, /* xrk */
+    0xb9e70000, /* xgrk */
+    0x57000000, /* x */
+    0xe30000000057, /* xy */
+    0xe30000000082, /* xg */
+};
+
+static sljit_s32 sljit_emit_bitwise(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_s32 type = GET_OPCODE(op);
+    const struct ins_forms *forms;
+
+    if ((src2 & SLJIT_IMM) && (!(op & SLJIT_SET_Z) || (type == SLJIT_AND && dst == SLJIT_UNUSED))) {
+        sljit_s32 count16 = 0;
+        sljit_uw imm = (sljit_uw)src2w;
+
+        if (op & SLJIT_I32_OP)
+            imm &= 0xffffffffull;
+
+        if ((imm & 0x000000000000ffffull) != 0 || imm == 0)
+            count16++;
+        if ((imm & 0x00000000ffff0000ull) != 0)
+            count16++;
+        if ((imm & 0x0000ffff00000000ull) != 0)
+            count16++;
+        if ((imm & 0xffff000000000000ull) != 0)
+            count16++;
+
+        if (type == SLJIT_AND && dst == SLJIT_UNUSED && count16 == 1) {
+            sljit_gpr src_r = tmp0;
+
+            if (FAST_IS_REG(src1))
+                src_r = gpr(src1 & REG_MASK);
             else
-                FAIL_IF(push_inst(compiler, LEVAL(alfi)));
+                FAIL_IF(emit_move(compiler, tmp0, src1, src1w));


-            break;
-        #undef LEVAL /* TODO(carenas): move down and refactor? */
-        case SLJIT_MUL:
-            FAIL_IF(push_inst(compiler, mhi(dst_r, src2w)));
-            break;
-        case SLJIT_MUL32:
-            FAIL_IF(push_inst(compiler, mghi(dst_r, src2w)));
-            break;
-        case SLJIT_OR32:
-            FAIL_IF(push_inst(compiler, oilf(dst_r, src2w)));
-            break;
-        case SLJIT_XOR32:
-            FAIL_IF(push_inst(compiler, xilf(dst_r, src2w)));
-            break;
-        case SLJIT_AND32:
-            FAIL_IF(push_inst(compiler, nilf(dst_r, src2w)));
-            break;
-        default:
-            SLJIT_UNREACHABLE();
+            if ((imm & 0x000000000000ffffull) != 0 || imm == 0)
+                return push_inst(compiler, 0xa7010000 | (src_r << 20) | imm);
+            if ((imm & 0x00000000ffff0000ull) != 0)
+                return push_inst(compiler, 0xa7000000 | (src_r << 20) | (imm >> 16));
+            if ((imm & 0x0000ffff00000000ull) != 0)
+                return push_inst(compiler, 0xa7030000 | (src_r << 20) | (imm >> 32));
+            return push_inst(compiler, 0xa7020000 | (src_r << 20) | (imm >> 48));
         }
+
+        if (!(op & SLJIT_SET_Z))
+            return sljit_emit_bitwise_imm(compiler, type, dst, dstw, src1, src1w, imm, count16);
     }
-    else if ((src2 & SLJIT_IMM) && have_op_3_imm(op, src2w)) {
-        abort(); /* TODO(mundaym): implement */
+
+    if (type == SLJIT_AND)
+        forms = &bitwise_and_forms;
+    else if (type == SLJIT_OR)
+        forms = &bitwise_or_forms;
+    else
+        forms = &bitwise_xor_forms;
+
+    return emit_commutative(compiler, forms, dst, dstw, src1, src1w, src2, src2w);
+}
+
+static sljit_s32 sljit_emit_shift(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    sljit_s32 type = GET_OPCODE(op);
+    sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+    sljit_gpr src_r = tmp0;
+    sljit_gpr base_r = tmp0;
+    sljit_ins imm = 0;
+    sljit_ins ins;
+
+    if (FAST_IS_REG(src1))
+        src_r = gpr(src1 & REG_MASK);
+    else
+        FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
+
+    if (src2 & SLJIT_IMM)
+        imm = src2w & ((op & SLJIT_I32_OP) ? 0x1f : 0x3f);
+    else if (FAST_IS_REG(src2))
+        base_r = gpr(src2 & REG_MASK);
+    else {
+        FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+        base_r = tmp1;
     }
-    else if ((src2 & SLJIT_MEM) && (dst_r == src1_r)) {
-        /* most 32-bit instructions can only handle 12-bit immediate offsets */
-        int need_u12 = !have_ldisp() &&
-            (op & SLJIT_I32_OP) &&
-            (GET_OPCODE(op) != SLJIT_ADDC) &&
-            (GET_OPCODE(op) != SLJIT_SUBC);
-        struct addr mem;
-        if (need_u12)
-            FAIL_IF(make_addr_bx(compiler, &mem, src2, src2w, tmp1));
+
+    if ((op & SLJIT_I32_OP) && dst_r == src_r) {
+        if (type == SLJIT_SHL)
+            ins = 0x89000000 /* sll */;
+        else if (type == SLJIT_LSHR)
+            ins = 0x88000000 /* srl */;
         else
-            FAIL_IF(make_addr_bxy(compiler, &mem, src2, src2w, tmp1));
+            ins = 0x8a000000 /* sra */;


-        int can_u12 = is_u12(mem.offset) ? 1 : 0;
-        sljit_ins ins = 0;
-        switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
-        /* 64-bit ops */
-        #define LEVAL(i) EVAL(i, dst_r, mem)
-        case SLJIT_ADD:
-            ins = WHEN2(signed_flags, ag, alg);
-            break;
-        case SLJIT_SUB:
-            ins = WHEN2(signed_flags, sg, slg);
-            break;
-        case SLJIT_ADDC:
-            ins = LEVAL(alcg);
-            break;
-        case SLJIT_SUBC:
-            ins = LEVAL(slbg);
-            break;
-        case SLJIT_MUL:
-            ins = LEVAL(msg);
-            break;
-        case SLJIT_OR:
-            ins = LEVAL(og);
-            break;
-        case SLJIT_XOR:
-            ins = LEVAL(xg);
-            break;
-        case SLJIT_AND:
-            ins = LEVAL(ng);
-            break;
-        /* 32-bit ops */
-        case SLJIT_ADD32:
-            if (signed_flags)
-                ins = WHEN2(can_u12, a, ay);
-            else
-                ins = WHEN2(can_u12, al, aly);
-            break;
-        case SLJIT_SUB32:
-            if (signed_flags)
-                ins = WHEN2(can_u12, s, sy);
-            else
-                ins = WHEN2(can_u12, sl, sly);
-            break;
-        case SLJIT_ADDC32:
-            ins = LEVAL(alc);
-            break;
-        case SLJIT_SUBC32:
-            ins = LEVAL(slb);
-            break;
-        case SLJIT_MUL32:
-            ins = WHEN2(can_u12, ms, msy);
-            break;
-        case SLJIT_OR32:
-            ins = WHEN2(can_u12, o, oy);
-            break;
-        case SLJIT_XOR32:
-            ins = WHEN2(can_u12, x, xy);
-            break;
-        case SLJIT_AND32:
-            ins = WHEN2(can_u12, n, ny);
-            break;
-        #undef LEVAL
-        default:
-            SLJIT_UNREACHABLE();
-        }
-        FAIL_IF(push_inst(compiler, ins));
+        FAIL_IF(push_inst(compiler, ins | (dst_r << 20) | (base_r << 12) | imm));
     }
     else {
-        sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
-        if (src2 & SLJIT_IMM) {
-            /* load src2 into register */
-            FAIL_IF(push_load_imm_inst(compiler, src2_r, src2w));
-        }
-        if (src2 & SLJIT_MEM) {
-            /* load src2 into register */
-            FAIL_IF(load_word(compiler, src2_r, src2, src2w, tmp1, op & SLJIT_I32_OP));
-        }
-        /* TODO(mundaym): distinct operand facility where needed */
-        #define LEVAL(i) i(tmp0, src1_r)
-        if (src1_r != dst_r && src1_r != tmp0) {
-            FAIL_IF(push_inst(compiler,
-                WHEN2(op & SLJIT_I32_OP, lr, lgr)));
-            src1_r = tmp0;
-        }
-        #undef LEVAL
-        sljit_ins ins = 0;
-        switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
-        #define LEVAL(i) i(src1_r, src2_r)
-        /* 64-bit ops */
-        case SLJIT_ADD:
-            ins = WHEN2(signed_flags, agr, algr);
-            break;
-        case SLJIT_SUB:
-            ins = WHEN2(signed_flags, sgr, slgr);
-            break;
-        case SLJIT_ADDC:
-            ins = LEVAL(alcgr);
-            break;
-        case SLJIT_SUBC:
-            ins = LEVAL(slbgr);
-            break;
-        case SLJIT_MUL:
-            ins = LEVAL(msgr);
-            break;
-        case SLJIT_AND:
-            ins = LEVAL(ngr);
-            break;
-        case SLJIT_OR:
-            ins = LEVAL(ogr);
-            break;
-        case SLJIT_XOR:
-            ins = LEVAL(xgr);
-            break;
-        /* 32-bit ops */
-        case SLJIT_ADD32:
-            ins = WHEN2(signed_flags, ar, alr);
-            break;
-        case SLJIT_SUB32:
-            ins = WHEN2(signed_flags, sr, slr);
-            break;
-        case SLJIT_ADDC32:
-            ins = LEVAL(alcr);
-            break;
-        case SLJIT_SUBC32:
-            ins = LEVAL(slbr);
-            break;
-        case SLJIT_MUL32:
-            ins = LEVAL(msr);
-            break;
-        case SLJIT_AND32:
-            ins = LEVAL(nr);
-            break;
-        case SLJIT_OR32:
-            ins = LEVAL(or);
-            break;
-        case SLJIT_XOR32:
-            ins = LEVAL(xr);
-            break;
-        #undef LEVAL
-        default:
-            SLJIT_UNREACHABLE();
-        }
-        FAIL_IF(push_inst(compiler, ins));
-        #define LEVAL(i) i(dst_r, src1_r)
-        if (src1_r != dst_r)
-            FAIL_IF(push_inst(compiler,
-                WHEN2(op & SLJIT_I32_OP, lr, lgr)));
-        #undef LEVAL
+        if (type == SLJIT_SHL)
+            ins = (op & SLJIT_I32_OP) ? 0xeb00000000df /* sllk */ : 0xeb000000000d /* sllg */;
+        else if (type == SLJIT_LSHR)
+            ins = (op & SLJIT_I32_OP) ? 0xeb00000000de /* srlk */ : 0xeb000000000c /* srlg */;
+        else
+            ins = (op & SLJIT_I32_OP) ? 0xeb00000000dc /* srak */ : 0xeb000000000a /* srag */;
+
+        FAIL_IF(push_inst(compiler, ins | (dst_r << 36) | (src_r << 32) | (base_r << 28) | (imm << 16)));
     }


-    /* write condition code to emulated flag register */
-    if (op & VARIABLE_FLAG_MASK)
-        FAIL_IF(push_inst(compiler, ipm(flag_r)));
+    if ((op & SLJIT_SET_Z) && type != SLJIT_ASHR)
+        return push_inst(compiler, (op & SLJIT_I32_OP) ? or(dst_r, dst_r) : ogr(dst_r, dst_r));


-    /* write zero flag to emulated flag register */
-    if (op & SLJIT_SET_Z)
-        FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
+    return SLJIT_SUCCESS;
+}


-    /* finally write the result to memory if required */
-    if (dst & SLJIT_MEM) {
-        SLJIT_ASSERT(dst_r != tmp1);
-        /* TODO(carenas): s/FAIL_IF/ return */
-        FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
+/* Add-with-carry encodings for the SLJIT_ADDC case of sljit_emit_op2().
+   No distinct-operand or 32-bit-memory variants exist, hence the zeros. */
+static const struct ins_forms addc_forms = {
+    0xb9980000, /* alcr */
+    0xb9880000, /* alcgr */
+    0,
+    0,
+    0,
+    0xe30000000098, /* alc */
+    0xe30000000088, /* alcg */
+};
+
+/* Subtract-with-borrow encodings for the SLJIT_SUBC case of
+   sljit_emit_op2() (emit_non_commutative). Zeros as in addc_forms. */
+static const struct ins_forms subc_forms = {
+    0xb9990000, /* slbr */
+    0xb9890000, /* slbgr */
+    0,
+    0,
+    0,
+    0xe30000000099, /* slb */
+    0xe30000000089, /* slbg */
+};
+
+/* Dispatch a two-operand operation to the per-opcode emitters.
+   Records the current flag-producing op class in status_flags_state so
+   get_cc() can pick the right condition code, and canonicalizes
+   commutative ops so an immediate ends up in src2. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
+    sljit_s32 dst, sljit_sw dstw,
+    sljit_s32 src1, sljit_sw src1w,
+    sljit_s32 src2, sljit_sw src2w)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+    ADJUST_LOCAL_OFFSET(dst, dstw);
+    ADJUST_LOCAL_OFFSET(src1, src1w);
+    ADJUST_LOCAL_OFFSET(src2, src2w);
+
+    if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+        return SLJIT_SUCCESS;
+
+    compiler->mode = op & SLJIT_I32_OP;
+    compiler->status_flags_state = op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z);
+
+    /* FIX: was '||', which is always true and tagged every opcode
+       (MUL, AND, shifts, ...) as ADD/SUB, breaking the overflow
+       condition-code selection in get_cc(). */
+    if (GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_SUBC)
+        compiler->status_flags_state |= SLJIT_CURRENT_FLAGS_ADD_SUB;
+
+    /* Move an immediate first operand into src2 for commutative ops
+       (three-XOR swap; src1/src2 are distinct locals, so this is safe). */
+    if (is_commutative(op) && (src1 & SLJIT_IMM) && !(src2 & SLJIT_IMM)) {
+        src1 ^= src2;
+        src2 ^= src1;
+        src1 ^= src2;
+
+        src1w ^= src2w;
+        src2w ^= src1w;
+        src1w ^= src2w;
    }


+    switch (GET_OPCODE(op)) {
+    case SLJIT_ADD:
+        return sljit_emit_add(compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    case SLJIT_ADDC:
+        FAIL_IF(emit_commutative(compiler, &addc_forms, dst, dstw, src1, src1w, src2, src2w));
+        if (dst & SLJIT_MEM)
+            return store_word(compiler, tmp0, dst, dstw, op & SLJIT_I32_OP);
+        return SLJIT_SUCCESS;
+    case SLJIT_SUB:
+        return sljit_emit_sub(compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    case SLJIT_SUBC:
+        FAIL_IF(emit_non_commutative(compiler, &subc_forms, dst, dstw, src1, src1w, src2, src2w));
+        if (dst & SLJIT_MEM)
+            return store_word(compiler, tmp0, dst, dstw, op & SLJIT_I32_OP);
+        return SLJIT_SUCCESS;
+    case SLJIT_MUL:
+        FAIL_IF(sljit_emit_multiply(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+        break;
+    case SLJIT_AND:
+    case SLJIT_OR:
+    case SLJIT_XOR:
+        FAIL_IF(sljit_emit_bitwise(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+        break;
+    case SLJIT_SHL:
+    case SLJIT_LSHR:
+    case SLJIT_ASHR:
+        FAIL_IF(sljit_emit_shift(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+        break;
+    }
+
+    /* Results of the fall-through cases land in tmp0 when dst is memory. */
+    if (dst & SLJIT_MEM)
+        return store_word(compiler, tmp0, dst, dstw, op & SLJIT_I32_OP);
     return SLJIT_SUCCESS;
 }


@@ -2429,7 +2663,7 @@
     case SLJIT_FAST_RETURN:
         src_r = FAST_IS_REG(src) ? gpr(src) : tmp1;
         if (src & SLJIT_MEM)
-            FAIL_IF(load_word(compiler, tmp1, src, srcw, tmp1, 0));
+            FAIL_IF(load_word(compiler, tmp1, src, srcw, 0));


         return push_inst(compiler, br(src_r));
     case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
@@ -2508,7 +2742,7 @@
         return push_inst(compiler, lgr(gpr(dst), fast_link_r));


     /* memory */
-    return store_word(compiler, fast_link_r, dst, dstw, tmp1, 0);
+    return store_word(compiler, fast_link_r, dst, dstw, 0);
 }


/* --------------------------------------------------------------------- */
@@ -2533,15 +2767,11 @@

 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
 {
-    sljit_u8 mask = ((type & 0xff) < SLJIT_JUMP) ? get_cc(type & 0xff) : 0xf;
+    sljit_u8 mask = ((type & 0xff) < SLJIT_JUMP) ? get_cc(compiler, type & 0xff) : 0xf;


     CHECK_ERROR_PTR();
     CHECK_PTR(check_sljit_emit_jump(compiler, type));


-    /* reload condition code */
-    if (mask != 0xf)
-        PTR_FAIL_IF(push_load_cc(compiler, type & 0xff));
-
     /* record jump */
     struct sljit_jump *jump = (struct sljit_jump *)
         ensure_abuf(compiler, sizeof(struct sljit_jump));
@@ -2586,7 +2816,7 @@
         FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
     }
     else if (src & SLJIT_MEM)
-        FAIL_IF(load_word(compiler, src_r, src, srcw, tmp1, 0 /* 64-bit */));
+        FAIL_IF(load_word(compiler, src_r, src, srcw, 0 /* 64-bit */));


     /* emit jump instruction */
     if (type >= SLJIT_FAST_CALL)
@@ -2614,7 +2844,7 @@
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)
 {
-    sljit_u8 mask = get_cc(type & 0xff);
+    sljit_u8 mask = get_cc(compiler, type & 0xff);


     CHECK_ERROR();
     CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
@@ -2625,9 +2855,11 @@
     case SLJIT_AND:
     case SLJIT_OR:
     case SLJIT_XOR:
+        compiler->status_flags_state = op & SLJIT_SET_Z;
+
         /* dst is also source operand */
         if (dst & SLJIT_MEM)
-            FAIL_IF(load_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
+            FAIL_IF(load_word(compiler, dst_r, dst, dstw, op & SLJIT_I32_OP));


         break;
     case SLJIT_MOV:
@@ -2639,9 +2871,6 @@
         SLJIT_UNREACHABLE();
     }


-    if (mask != 0xf)
-        FAIL_IF(push_load_cc(compiler, type & 0xff));
-
     /* TODO(mundaym): fold into cmov helper function? */
     #define LEVAL(i) i(loc_r, 1, mask)
     if (have_lscond2()) {
@@ -2672,14 +2901,9 @@
     #undef LEVAL
     }


-    /* set zero flag if needed */
-    if (op & SLJIT_SET_Z)
-        FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
-
     /* store result to memory if required */
-    /* TODO(carenas): s/FAIL_IF/ return */
     if (dst & SLJIT_MEM)
-        FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
+        return store_word(compiler, dst_r, dst, dstw, op & SLJIT_I32_OP);


     return SLJIT_SUCCESS;
 }
@@ -2688,7 +2912,7 @@
     sljit_s32 dst_reg,
     sljit_s32 src, sljit_sw srcw)
 {
-    sljit_u8 mask = get_cc(type & 0xff);
+    sljit_u8 mask = get_cc(compiler, type & 0xff);
     sljit_gpr dst_r = gpr(dst_reg & ~SLJIT_I32_OP);
     sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;


@@ -2695,9 +2919,6 @@
     CHECK_ERROR();
     CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));


-    if (mask != 0xf)
-        FAIL_IF(push_load_cc(compiler, type & 0xff));
-
     if (src & SLJIT_IMM) {
         /* TODO(mundaym): fast path with lscond2 */
         FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
@@ -2751,7 +2972,7 @@
     }


     if (dst & SLJIT_MEM)
-        PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, 0 /* always 64-bit */));
+        PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, 0 /* always 64-bit */));


     return (struct sljit_const*)const_;
 }
@@ -2798,7 +3019,7 @@
     }


     if (dst & SLJIT_MEM)
-        PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, 0));
+        PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, 0));


     return put_label;
 }


Modified: code/trunk/src/sljit/sljitNativeSPARC_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeSPARC_32.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeSPARC_32.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -93,6 +93,7 @@
         return push_inst(compiler, ADD | D(dst) | S1(dst) | IMM(1), UNMOVABLE_INS);


     case SLJIT_ADD:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return push_inst(compiler, ADD | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst) | (flags & SET_FLAGS));


     case SLJIT_ADDC:
@@ -99,6 +100,7 @@
         return push_inst(compiler, ADDC | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst) | (flags & SET_FLAGS));


     case SLJIT_SUB:
+        compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD_SUB;
         return push_inst(compiler, SUB | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst) | (flags & SET_FLAGS));


     case SLJIT_SUBC:
@@ -105,6 +107,7 @@
         return push_inst(compiler, SUBC | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst) | (flags & SET_FLAGS));


     case SLJIT_MUL:
+        compiler->status_flags_state = 0;
         FAIL_IF(push_inst(compiler, SMUL | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst)));
         if (!(flags & SET_FLAGS))
             return SLJIT_SUCCESS;


Modified: code/trunk/src/sljit/sljitNativeSPARC_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeSPARC_common.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeSPARC_common.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -1275,16 +1275,14 @@
     return label;
 }


-static sljit_ins get_cc(sljit_s32 type)
+static sljit_ins get_cc(struct sljit_compiler *compiler, sljit_s32 type)
 {
     switch (type) {
     case SLJIT_EQUAL:
-    case SLJIT_MUL_NOT_OVERFLOW:
     case SLJIT_NOT_EQUAL_F64: /* Unordered. */
         return DA(0x1);


     case SLJIT_NOT_EQUAL:
-    case SLJIT_MUL_OVERFLOW:
     case SLJIT_EQUAL_F64:
         return DA(0x9);


@@ -1317,10 +1315,16 @@
         return DA(0x2);


     case SLJIT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return DA(0x9);
+
     case SLJIT_UNORDERED_F64:
         return DA(0x7);


     case SLJIT_NOT_OVERFLOW:
+        if (!(compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD_SUB))
+            return DA(0x1);
+
     case SLJIT_ORDERED_F64:
         return DA(0xf);


@@ -1347,7 +1351,7 @@
         if (((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS) && !(compiler->delay_slot & ICC_IS_SET))
             jump->flags |= IS_MOVABLE;
 #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-        PTR_FAIL_IF(push_inst(compiler, BICC | get_cc(type ^ 1) | 5, UNMOVABLE_INS));
+        PTR_FAIL_IF(push_inst(compiler, BICC | get_cc(compiler, type ^ 1) | 5, UNMOVABLE_INS));
 #else
 #error "Implementation required"
 #endif
@@ -1357,7 +1361,7 @@
         if (((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS) && !(compiler->delay_slot & FCC_IS_SET))
             jump->flags |= IS_MOVABLE;
 #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-        PTR_FAIL_IF(push_inst(compiler, FBFCC | get_cc(type ^ 1) | 5, UNMOVABLE_INS));
+        PTR_FAIL_IF(push_inst(compiler, FBFCC | get_cc(compiler, type ^ 1) | 5, UNMOVABLE_INS));
 #else
 #error "Implementation required"
 #endif
@@ -1474,9 +1478,9 @@


     type &= 0xff;
     if (type < SLJIT_EQUAL_F64)
-        FAIL_IF(push_inst(compiler, BICC | get_cc(type) | 3, UNMOVABLE_INS));
+        FAIL_IF(push_inst(compiler, BICC | get_cc(compiler, type) | 3, UNMOVABLE_INS));
     else
-        FAIL_IF(push_inst(compiler, FBFCC | get_cc(type) | 3, UNMOVABLE_INS));
+        FAIL_IF(push_inst(compiler, FBFCC | get_cc(compiler, type) | 3, UNMOVABLE_INS));


     FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(1), UNMOVABLE_INS));
     FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(0), UNMOVABLE_INS));


Modified: code/trunk/src/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_common.c    2021-05-26 14:34:55 UTC (rev 1311)
+++ code/trunk/src/sljit/sljitNativeX86_common.c    2021-05-27 08:11:15 UTC (rev 1312)
@@ -411,11 +411,9 @@
         return 0x8e /* jle */;


     case SLJIT_OVERFLOW:
-    case SLJIT_MUL_OVERFLOW:
         return 0x80 /* jo */;


     case SLJIT_NOT_OVERFLOW:
-    case SLJIT_MUL_NOT_OVERFLOW:
         return 0x81 /* jno */;


     case SLJIT_UNORDERED_F64: