[Pcre-svn] [1716] code/trunk: JIT compiler update.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1716] code/trunk: JIT compiler update.
Revision: 1716
          http://vcs.pcre.org/viewvc?view=rev&revision=1716
Author:   zherczeg
Date:     2017-11-29 13:40:20 +0000 (Wed, 29 Nov 2017)
Log Message:
-----------
JIT compiler update.


Modified Paths:
--------------
    code/trunk/pcre_jit_compile.c
    code/trunk/sljit/sljitConfig.h
    code/trunk/sljit/sljitConfigInternal.h
    code/trunk/sljit/sljitLir.c
    code/trunk/sljit/sljitLir.h
    code/trunk/sljit/sljitNativeARM_32.c
    code/trunk/sljit/sljitNativeARM_64.c
    code/trunk/sljit/sljitNativeARM_T2_32.c
    code/trunk/sljit/sljitNativeMIPS_32.c
    code/trunk/sljit/sljitNativeMIPS_64.c
    code/trunk/sljit/sljitNativeMIPS_common.c
    code/trunk/sljit/sljitNativePPC_64.c
    code/trunk/sljit/sljitNativePPC_common.c
    code/trunk/sljit/sljitNativeSPARC_32.c
    code/trunk/sljit/sljitNativeSPARC_common.c
    code/trunk/sljit/sljitNativeX86_32.c
    code/trunk/sljit/sljitNativeX86_64.c
    code/trunk/sljit/sljitNativeX86_common.c
    code/trunk/sljit/sljitUtils.c


Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/pcre_jit_compile.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -214,7 +214,7 @@
   type_then_trap = 1
 };


-typedef int (SLJIT_CALL *jit_function)(jit_arguments *args);
+typedef int (SLJIT_FUNC *jit_function)(jit_arguments *args);

/* The following structure is the key data type for the recursive
code generator. It is allocated by compile_matchingpath, and contains
@@ -2439,7 +2439,7 @@
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), SLJIT_OFFSETOF(struct sljit_stack, base));
}

-static sljit_sw SLJIT_CALL do_search_mark(sljit_sw *current, const pcre_uchar *skip_arg)
+static sljit_sw SLJIT_FUNC do_search_mark(sljit_sw *current, const pcre_uchar *skip_arg)
{
while (current != NULL)
{
@@ -5248,7 +5248,7 @@

#if defined SUPPORT_UTF && defined SUPPORT_UCP

-static const pcre_uchar * SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arguments *args, pcre_uchar *end1)
+static const pcre_uchar * SLJIT_FUNC do_utf_caselesscmp(pcre_uchar *src1, jit_arguments *args, pcre_uchar *end1)
 {
 /* This function would be ineffective to do in JIT level. */
 sljit_u32 c1, c2;
@@ -6789,8 +6789,9 @@
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STACK_TOP, 0);
   OP1(SLJIT_MOV, SLJIT_R1, 0, ARGUMENTS, 0);
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, uchar_ptr), STR_PTR, 0);
-  sljit_emit_ijump(compiler, SLJIT_CALL3, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
+  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
   OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+
   if (common->mode == JIT_COMPILE)
     add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1));
   else
@@ -7125,7 +7126,7 @@
 return cc + 1 + LINK_SIZE;
 }


-static int SLJIT_CALL do_callout(struct jit_arguments *arguments, PUBL(callout_block) *callout_block, pcre_uchar **jit_ovector)
+static sljit_s32 SLJIT_FUNC do_callout(struct jit_arguments *arguments, PUBL(callout_block) *callout_block, pcre_uchar **jit_ovector)
{
const pcre_uchar *begin = arguments->begin;
int *offset_vector = arguments->offsets;
@@ -7207,7 +7208,7 @@
/* SLJIT_R0 = arguments */
OP1(SLJIT_MOV, SLJIT_R1, 0, STACK_TOP, 0);
GET_LOCAL_BASE(SLJIT_R2, 0, OVECTOR_START);
-sljit_emit_ijump(compiler, SLJIT_CALL3, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_callout));
+sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(S32) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_callout));
OP1(SLJIT_MOV_S32, SLJIT_RETURN_REG, 0, SLJIT_RETURN_REG, 0);
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
free_stack(common, CALLOUT_ARG_SIZE / sizeof(sljit_sw));
@@ -10439,7 +10440,7 @@
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STACK_TOP, 0);
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, (sljit_sw)(current->cc + 2));
- sljit_emit_ijump(compiler, SLJIT_CALL2, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_search_mark));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_search_mark));
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);

OP1(SLJIT_MOV, STR_PTR, 0, TMP1, 0);
@@ -11031,7 +11032,7 @@
common->compiler = compiler;

/* Main pcre_jit_exec entry. */
-sljit_emit_enter(compiler, 0, 1, 5, 5, 0, 0, private_data_size);
+sljit_emit_enter(compiler, 0, SLJIT_ARG1(SW), 5, 5, 0, 0, private_data_size);

/* Register init. */
reset_ovector(common, (re->top_bracket + 1) * 2);
@@ -11257,7 +11258,7 @@
OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(struct sljit_stack, top), STACK_TOP, 0);
OP2(SLJIT_SUB, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(struct sljit_stack, limit), SLJIT_IMM, STACK_GROWTH_RATE);

-sljit_emit_ijump(compiler, SLJIT_CALL2, SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_stack_resize));
+sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_stack_resize));
jump = CMP(SLJIT_NOT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack));
@@ -11529,7 +11530,7 @@
else if ((options & PCRE_PARTIAL_SOFT) != 0)
mode = JIT_PARTIAL_SOFT_COMPILE;

-if (functions->executable_funcs == NULL || functions->executable_funcs[mode] == NULL)
+if (functions == NULL || functions->executable_funcs[mode] == NULL)
return PCRE_ERROR_JIT_BADOPTION;

/* Sanity checks should be handled by pcre_exec. */

Modified: code/trunk/sljit/sljitConfig.h
===================================================================
--- code/trunk/sljit/sljitConfig.h    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitConfig.h    2017-11-29 13:40:20 UTC (rev 1716)
@@ -108,8 +108,10 @@


 /* Force cdecl calling convention even if a better calling
    convention (e.g. fastcall) is supported by the C compiler.
-   If this option is enabled, C functions without
-   SLJIT_CALL can also be called from JIT code. */
+   If this option is disabled (this is the default), functions
+   called from JIT should be defined with SLJIT_FUNC attribute.
+   Standard C functions can still be called by using the
+   SLJIT_CALL_CDECL jump type. */
 #ifndef SLJIT_USE_CDECL_CALLING_CONVENTION
 /* Disabled by default */
 #define SLJIT_USE_CDECL_CALLING_CONVENTION 0


Modified: code/trunk/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/sljit/sljitConfigInternal.h    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitConfigInternal.h    2017-11-29 13:40:20 UTC (rev 1716)
@@ -60,11 +60,13 @@
                        a single precision floating point array by index
      SLJIT_F64_SHIFT : the shift required to apply when accessing
                        a double precision floating point array by index
+     SLJIT_PREF_SHIFT_REG : x86 systems prefer ecx for shifting by register;
+                            the scratch register index of ecx is stored in this macro
      SLJIT_LOCALS_OFFSET : local space starting offset (SLJIT_SP + SLJIT_LOCALS_OFFSET)
      SLJIT_RETURN_ADDRESS_OFFSET : a return instruction always adds this offset to the return address


    Other macros:
-     SLJIT_CALL : C calling convention define for both calling JIT form C and C callbacks for JIT
+     SLJIT_FUNC : calling convention attribute for both calling JIT from C and C calling back from JIT
      SLJIT_W(number) : defining 64 bit constants on 64 bit architectures (compiler independent helper)
 */


@@ -471,44 +473,44 @@
/* Calling convention of functions generated by SLJIT or called from the generated code. */
/*****************************************************************************************/

-#ifndef SLJIT_CALL
+#ifndef SLJIT_FUNC

#if (defined SLJIT_USE_CDECL_CALLING_CONVENTION && SLJIT_USE_CDECL_CALLING_CONVENTION)

/* Force cdecl. */
-#define SLJIT_CALL
+#define SLJIT_FUNC

#elif (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

#if defined(__GNUC__) && !defined(__APPLE__)

-#define SLJIT_CALL __attribute__ ((fastcall))
+#define SLJIT_FUNC __attribute__ ((fastcall))
#define SLJIT_X86_32_FASTCALL 1

#elif defined(_MSC_VER)

-#define SLJIT_CALL __fastcall
+#define SLJIT_FUNC __fastcall
#define SLJIT_X86_32_FASTCALL 1

#elif defined(__BORLANDC__)

-#define SLJIT_CALL __msfastcall
+#define SLJIT_FUNC __msfastcall
#define SLJIT_X86_32_FASTCALL 1

#else /* Unknown compiler. */

/* The cdecl attribute is the default. */
-#define SLJIT_CALL
+#define SLJIT_FUNC

#endif

#else /* Non x86-32 architectures. */

-#define SLJIT_CALL
+#define SLJIT_FUNC

#endif /* SLJIT_CONFIG_X86_32 */

-#endif /* !SLJIT_CALL */
+#endif /* !SLJIT_FUNC */

#ifndef SLJIT_INDIRECT_CALL
#if ((defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) && (defined SLJIT_BIG_ENDIAN && SLJIT_BIG_ENDIAN)) \
@@ -557,24 +559,20 @@

#define SLJIT_NUMBER_OF_REGISTERS 12
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 9
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
#define SLJIT_LOCALS_OFFSET_BASE (compiler->locals_offset)
-#else
-/* Maximum 3 arguments are passed on the stack, +1 for double alignment. */
-#define SLJIT_LOCALS_OFFSET_BASE (compiler->locals_offset)
-#endif /* SLJIT_X86_32_FASTCALL */
+#define SLJIT_PREF_SHIFT_REG SLJIT_R2

#elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

+#define SLJIT_NUMBER_OF_REGISTERS 13
#ifndef _WIN64
-#define SLJIT_NUMBER_OF_REGISTERS 13
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 6
#define SLJIT_LOCALS_OFFSET_BASE 0
-#else
-#define SLJIT_NUMBER_OF_REGISTERS 13
+#else /* _WIN64 */
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
#define SLJIT_LOCALS_OFFSET_BASE (compiler->locals_offset)
-#endif /* _WIN64 */
+#endif /* !_WIN64 */
+#define SLJIT_PREF_SHIFT_REG SLJIT_R3

#elif (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5) || (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)

@@ -622,8 +620,9 @@
#define SLJIT_NUMBER_OF_REGISTERS 18
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 14
#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-/* Add +1 for double alignment. */
-#define SLJIT_LOCALS_OFFSET_BASE ((23 + 1) * sizeof(sljit_sw))
+/* saved registers (16), return struct pointer (1), space for 6 argument words (6),
+ 4th double arg (2), double alignment (1). */
+#define SLJIT_LOCALS_OFFSET_BASE ((16 + 1 + 6 + 2 + 1) * sizeof(sljit_sw))
#endif

#elif (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX)

Modified: code/trunk/sljit/sljitLir.c
===================================================================
--- code/trunk/sljit/sljitLir.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitLir.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -97,8 +97,13 @@
 #define GET_ALL_FLAGS(op) \
     ((op) & (SLJIT_I32_OP | SLJIT_SET_Z | VARIABLE_FLAG_MASK))


+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 #define TYPE_CAST_NEEDED(op) \
+    (((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S32) || ((op) >= SLJIT_MOVU_U8 && (op) <= SLJIT_MOVU_S32))
+#else
+#define TYPE_CAST_NEEDED(op) \
     (((op) >= SLJIT_MOV_U8 && (op) <= SLJIT_MOV_S16) || ((op) >= SLJIT_MOVU_U8 && (op) <= SLJIT_MOVU_S16))
+#endif


 #define BUF_SIZE    4096


@@ -118,6 +123,9 @@
 /* When reg can be unused. */
 #define SLOW_IS_REG(reg)    ((reg) > 0 && (reg) <= REG_MASK)


+/* Mask for argument types. */
+#define SLJIT_DEF_MASK ((1 << SLJIT_DEF_SHIFT) - 1)
+
 /* Jump flags. */
 #define JUMP_LABEL    0x1
 #define JUMP_ADDR    0x2
@@ -591,6 +599,19 @@
     compiler->buf = prev;
 }


+static SLJIT_INLINE sljit_s32 get_arg_count(sljit_s32 arg_types)
+{
+    sljit_s32 arg_count = 0;
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+    while (arg_types) {
+        arg_count++;
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return arg_count;
+}
+
 static SLJIT_INLINE void set_emit_enter(struct sljit_compiler *compiler,
     sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
@@ -864,9 +885,13 @@
     (char*)"greater", (char*)"less_equal",
     (char*)"unordered", (char*)"ordered",
     (char*)"jump", (char*)"fast_call",
-    (char*)"call0", (char*)"call1", (char*)"call2", (char*)"call3"
+    (char*)"call", (char*)"call.cdecl"
 };


+static char* call_arg_names[] = {
+    (char*)"void", (char*)"sw", (char*)"uw", (char*)"s32", (char*)"u32", (char*)"f32", (char*)"f64"
+};
+
 #endif /* SLJIT_VERBOSE */


/* --------------------------------------------------------------------- */
@@ -897,53 +922,104 @@
}

 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    sljit_s32 types, arg_count, curr_type;
+#endif
+
     SLJIT_UNUSED_ARG(compiler);


 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     CHECK_ARGUMENT(!(options & ~SLJIT_F64_ALIGNMENT));
-    CHECK_ARGUMENT(args >= 0 && args <= 3);
     CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS);
     CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_REGISTERS);
     CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS);
-    CHECK_ARGUMENT(args <= saveds);
     CHECK_ARGUMENT(fscratches >= 0 && fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE);
+    CHECK_ARGUMENT((arg_types & SLJIT_DEF_MASK) == 0);
+
+    types = (arg_types >> SLJIT_DEF_SHIFT);
+    arg_count = 0;
+    while (types != 0 && arg_count < 3) {
+        curr_type = (types & SLJIT_DEF_MASK);
+        CHECK_ARGUMENT(curr_type == SLJIT_ARG_TYPE_SW || curr_type == SLJIT_ARG_TYPE_UW);
+        arg_count++;
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    CHECK_ARGUMENT(arg_count <= saveds && types == 0);
+
     compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-    if (SLJIT_UNLIKELY(!!compiler->verbose))
-        fprintf(compiler->verbose, "  enter options:none args:%d scratches:%d saveds:%d fscratches:%d fsaveds:%d local_size:%d\n",
-            args, scratches, saveds, fscratches, fsaveds, local_size);
+    if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+        fprintf(compiler->verbose, "  enter options:%s args[", (options & SLJIT_F64_ALIGNMENT) ? "f64_align" : "");
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+        while (arg_types) {
+            fprintf(compiler->verbose, "%s", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+            arg_types >>= SLJIT_DEF_SHIFT;
+            if (arg_types)
+                fprintf(compiler->verbose, ",");
+        }
+
+        fprintf(compiler->verbose, "] scratches:%d saveds:%d fscratches:%d fsaveds:%d local_size:%d\n",
+            scratches, saveds, fscratches, fsaveds, local_size);
+    }
 #endif
     CHECK_RETURN_OK;
 }


 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    sljit_s32 types, arg_count, curr_type;
+#endif
+
+    SLJIT_UNUSED_ARG(compiler);
+
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     CHECK_ARGUMENT(!(options & ~SLJIT_F64_ALIGNMENT));
-    CHECK_ARGUMENT(args >= 0 && args <= 3);
     CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS);
     CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_REGISTERS);
     CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS);
-    CHECK_ARGUMENT(args <= saveds);
     CHECK_ARGUMENT(fscratches >= 0 && fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
     CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE);
+
+    types = (arg_types >> SLJIT_DEF_SHIFT);
+    arg_count = 0;
+    while (types != 0 && arg_count < 3) {
+        curr_type = (types & SLJIT_DEF_MASK);
+        CHECK_ARGUMENT(curr_type == SLJIT_ARG_TYPE_SW || curr_type == SLJIT_ARG_TYPE_UW);
+        arg_count++;
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    CHECK_ARGUMENT(arg_count <= saveds && types == 0);
+
     compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-    if (SLJIT_UNLIKELY(!!compiler->verbose))
-        fprintf(compiler->verbose, "  set_context options:none args:%d scratches:%d saveds:%d fscratches:%d fsaveds:%d local_size:%d\n",
-            args, scratches, saveds, fscratches, fsaveds, local_size);
+    if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+        fprintf(compiler->verbose, "  set_context options:%s args[", (options & SLJIT_F64_ALIGNMENT) ? "f64_align" : "");
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+        while (arg_types) {
+            fprintf(compiler->verbose, "%s", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+            arg_types >>= SLJIT_DEF_SHIFT;
+            if (arg_types)
+                fprintf(compiler->verbose, ",");
+        }
+
+        fprintf(compiler->verbose, "] scratches:%d saveds:%d fscratches:%d fsaveds:%d local_size:%d\n",
+            scratches, saveds, fscratches, fsaveds, local_size);
+    }
 #endif
     CHECK_RETURN_OK;
 }
@@ -1417,9 +1493,8 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
     CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_I32_OP)));
     CHECK_ARGUMENT((type & 0xff) != GET_FLAG_TYPE(SLJIT_SET_CARRY) && (type & 0xff) != (GET_FLAG_TYPE(SLJIT_SET_CARRY) + 1));
-    CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_CALL3);
+    CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_FAST_CALL);
     CHECK_ARGUMENT((type & 0xff) < SLJIT_JUMP || !(type & SLJIT_I32_OP));
-    CHECK_ARGUMENT((type & 0xff) <= SLJIT_CALL0 || ((type & 0xff) - SLJIT_CALL0) <= compiler->scratches);


     if ((type & 0xff) < SLJIT_JUMP) {
         if ((type & 0xff) <= SLJIT_NOT_ZERO)
@@ -1439,6 +1514,63 @@
     CHECK_RETURN_OK;
 }


+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    sljit_s32 i, types, curr_type, scratches, fscratches;
+
+    CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP)));
+    CHECK_ARGUMENT((type & 0xff) == SLJIT_CALL || (type & 0xff) == SLJIT_CALL_CDECL);
+
+    types = arg_types;
+    scratches = 0;
+    fscratches = 0;
+    for (i = 0; i < 5; i++) {
+        curr_type = (types & SLJIT_DEF_MASK);
+        CHECK_ARGUMENT(curr_type <= SLJIT_ARG_TYPE_F64);
+        if (i > 0) {
+            if (curr_type == 0) {
+                break;
+            }
+            if (curr_type >= SLJIT_ARG_TYPE_F32)
+                fscratches++;
+            else
+                scratches++;
+        } else {
+            if (curr_type >= SLJIT_ARG_TYPE_F32) {
+                CHECK_ARGUMENT(compiler->fscratches > 0);
+            } else if (curr_type >= SLJIT_ARG_TYPE_SW) {
+                CHECK_ARGUMENT(compiler->scratches > 0);
+            }
+        }
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    CHECK_ARGUMENT(compiler->scratches >= scratches);
+    CHECK_ARGUMENT(compiler->fscratches >= fscratches);
+    CHECK_ARGUMENT(types == 0);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+    if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+        fprintf(compiler->verbose, "  %s%s ret[%s", jump_names[type & 0xff],
+            !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+        if (arg_types) {
+            fprintf(compiler->verbose, "], args[");
+            do {
+                fprintf(compiler->verbose, "%s", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+                arg_types >>= SLJIT_DEF_SHIFT;
+                if (arg_types)
+                    fprintf(compiler->verbose, ",");
+            } while (arg_types);
+        }
+        fprintf(compiler->verbose, "]\n");
+    }
+#endif
+    CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_cmp(struct sljit_compiler *compiler, sljit_s32 type,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
@@ -1488,12 +1620,9 @@
     CHECK_RETURN_OK;
 }


-static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 src, sljit_sw srcw)
 {
-#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    compiler->last_flags = 0;
-#endif
-
     if (SLJIT_UNLIKELY(compiler->skip_checks)) {
         compiler->skip_checks = 0;
         CHECK_RETURN_OK;
@@ -1500,8 +1629,7 @@
     }


 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-    CHECK_ARGUMENT(type >= SLJIT_JUMP && type <= SLJIT_CALL3);
-    CHECK_ARGUMENT(type <= SLJIT_CALL0 || (type - SLJIT_CALL0) <= compiler->scratches);
+    CHECK_ARGUMENT(type >= SLJIT_JUMP && type <= SLJIT_FAST_CALL);
     FUNCTION_CHECK_SRC(src, srcw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1514,6 +1642,66 @@
     CHECK_RETURN_OK;
 }


+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    sljit_s32 i, types, curr_type, scratches, fscratches;
+
+    CHECK_ARGUMENT(type == SLJIT_CALL || type == SLJIT_CALL_CDECL);
+    FUNCTION_CHECK_SRC(src, srcw);
+
+    types = arg_types;
+    scratches = 0;
+    fscratches = 0;
+    for (i = 0; i < 5; i++) {
+        curr_type = (types & SLJIT_DEF_MASK);
+        CHECK_ARGUMENT(curr_type <= SLJIT_ARG_TYPE_F64);
+        if (i > 0) {
+            if (curr_type == 0) {
+                break;
+            }
+            if (curr_type >= SLJIT_ARG_TYPE_F32)
+                fscratches++;
+            else
+                scratches++;
+        } else {
+            if (curr_type >= SLJIT_ARG_TYPE_F32) {
+                CHECK_ARGUMENT(compiler->fscratches > 0);
+            } else if (curr_type >= SLJIT_ARG_TYPE_SW) {
+                CHECK_ARGUMENT(compiler->scratches > 0);
+            }
+        }
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    CHECK_ARGUMENT(compiler->scratches >= scratches);
+    CHECK_ARGUMENT(compiler->fscratches >= fscratches);
+    CHECK_ARGUMENT(types == 0);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+    if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+        fprintf(compiler->verbose, "  i%s%s ret[%s", jump_names[type & 0xff],
+            !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+        if (arg_types) {
+            fprintf(compiler->verbose, "], args[");
+            do {
+                fprintf(compiler->verbose, "%s", call_arg_names[arg_types & SLJIT_DEF_MASK]);
+                arg_types >>= SLJIT_DEF_SHIFT;
+                if (arg_types)
+                    fprintf(compiler->verbose, ",");
+            } while (arg_types);
+        }
+        fprintf(compiler->verbose, "], ");
+        sljit_verbose_param(compiler, src, srcw);
+        fprintf(compiler->verbose, "\n");
+    }
+#endif
+    CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)
@@ -1943,12 +2131,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     SLJIT_UNUSED_ARG(compiler);
     SLJIT_UNUSED_ARG(options);
-    SLJIT_UNUSED_ARG(args);
+    SLJIT_UNUSED_ARG(arg_types);
     SLJIT_UNUSED_ARG(scratches);
     SLJIT_UNUSED_ARG(saveds);
     SLJIT_UNUSED_ARG(fscratches);
@@ -1959,12 +2147,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     SLJIT_UNUSED_ARG(compiler);
     SLJIT_UNUSED_ARG(options);
-    SLJIT_UNUSED_ARG(args);
+    SLJIT_UNUSED_ARG(arg_types);
     SLJIT_UNUSED_ARG(scratches);
     SLJIT_UNUSED_ARG(saveds);
     SLJIT_UNUSED_ARG(fscratches);
@@ -2109,6 +2297,16 @@
     return NULL;
 }


+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(arg_types);
+    SLJIT_UNREACHABLE();
+    return NULL;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler *compiler, sljit_s32 type,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
@@ -2161,6 +2359,19 @@
     return SLJIT_ERR_UNSUPPORTED;
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    SLJIT_UNUSED_ARG(compiler);
+    SLJIT_UNUSED_ARG(type);
+    SLJIT_UNUSED_ARG(arg_types);
+    SLJIT_UNUSED_ARG(src);
+    SLJIT_UNUSED_ARG(srcw);
+    SLJIT_UNREACHABLE();
+    return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)


Modified: code/trunk/sljit/sljitLir.h
===================================================================
--- code/trunk/sljit/sljitLir.h    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitLir.h    2017-11-29 13:40:20 UTC (rev 1716)
@@ -213,14 +213,6 @@


 #define SLJIT_RETURN_REG    SLJIT_R0


-/* x86 prefers specific registers for special purposes. In case of shift
-   by register it supports only SLJIT_R2 for shift argument
-   (which is the src2 argument of sljit_emit_op2). If another register is
-   used, sljit must exchange data between registers which cause a minor
-   slowdown. Other architectures has no such limitation. */
-
-#define SLJIT_PREF_SHIFT_REG    SLJIT_R2
-
 /* --------------------------------------------------------------------- */
 /*  Floating point registers                                             */
 /* --------------------------------------------------------------------- */
@@ -258,6 +250,79 @@
 #define SLJIT_FIRST_SAVED_FLOAT_REG (SLJIT_FS0 - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS + 1)


 /* --------------------------------------------------------------------- */
+/*  Argument type definitions                                            */
+/* --------------------------------------------------------------------- */
+
+/* Argument type definitions.
+   Used by SLJIT_[DEF_]ARGx and SLJIT_[DEF_]RET macros. */
+
+#define SLJIT_ARG_TYPE_VOID 0
+#define SLJIT_ARG_TYPE_SW 1
+#define SLJIT_ARG_TYPE_UW 2
+#define SLJIT_ARG_TYPE_S32 3
+#define SLJIT_ARG_TYPE_U32 4
+#define SLJIT_ARG_TYPE_F32 5
+#define SLJIT_ARG_TYPE_F64 6
+
+/* The following argument type definitions are used by sljit_emit_enter,
+   sljit_set_context, sljit_emit_call, and sljit_emit_icall functions.
+   The following return type definitions are used by sljit_emit_call
+   and sljit_emit_icall functions.
+
+   When a function is called, the first integer argument must be placed
+   in SLJIT_R0, the second in SLJIT_R1, and so on. Similarly the first
+   floating point argument must be placed in SLJIT_FR0, the second in
+   SLJIT_FR1, and so on.
+
+   Example function definition:
+     sljit_f32 SLJIT_FUNC example_c_callback(sljit_sw arg_a,
+         sljit_f64 arg_b, sljit_u32 arg_c, sljit_f32 arg_d);
+
+   Argument type definition:
+     SLJIT_DEF_RET(SLJIT_ARG_TYPE_F32)
+        | SLJIT_DEF_ARG1(SLJIT_ARG_TYPE_SW) | SLJIT_DEF_ARG2(SLJIT_ARG_TYPE_F64)
+        | SLJIT_DEF_ARG3(SLJIT_ARG_TYPE_U32) | SLJIT_DEF_ARG4(SLJIT_ARG_TYPE_F32)
+
+   Short form of argument type definition:
+     SLJIT_RET(F32) | SLJIT_ARG1(SW) | SLJIT_ARG2(F64)
+        | SLJIT_ARG3(U32) | SLJIT_ARG4(F32)
+
+   Argument passing:
+     arg_a must be placed in SLJIT_R0
+     arg_c must be placed in SLJIT_R1
+     arg_b must be placed in SLJIT_FR0
+     arg_d must be placed in SLJIT_FR1
+
+Note:
+   The SLJIT_ARG_TYPE_VOID type is only supported by
+   SLJIT_DEF_RET, and SLJIT_ARG_TYPE_VOID is also the
+   default value when SLJIT_DEF_RET is not specified. */
+#define SLJIT_DEF_SHIFT 4
+#define SLJIT_DEF_RET(type) (type)
+#define SLJIT_DEF_ARG1(type) ((type) << SLJIT_DEF_SHIFT)
+#define SLJIT_DEF_ARG2(type) ((type) << (2 * SLJIT_DEF_SHIFT))
+#define SLJIT_DEF_ARG3(type) ((type) << (3 * SLJIT_DEF_SHIFT))
+#define SLJIT_DEF_ARG4(type) ((type) << (4 * SLJIT_DEF_SHIFT))
+
+/* Short form of the macros above.
+
+   For example the following definition:
+   SLJIT_DEF_RET(SLJIT_ARG_TYPE_SW) | SLJIT_DEF_ARG1(SLJIT_ARG_TYPE_F32)
+
+   can be shortened to:
+   SLJIT_RET(SW) | SLJIT_ARG1(F32)
+
+Note:
+   The VOID type is only supported by SLJIT_RET, and
+   VOID is also the default value when SLJIT_RET is
+   not specified. */
+#define SLJIT_RET(type) SLJIT_DEF_RET(SLJIT_ARG_TYPE_ ## type)
+#define SLJIT_ARG1(type) SLJIT_DEF_ARG1(SLJIT_ARG_TYPE_ ## type)
+#define SLJIT_ARG2(type) SLJIT_DEF_ARG2(SLJIT_ARG_TYPE_ ## type)
+#define SLJIT_ARG3(type) SLJIT_DEF_ARG3(SLJIT_ARG_TYPE_ ## type)
+#define SLJIT_ARG4(type) SLJIT_DEF_ARG4(SLJIT_ARG_TYPE_ ## type)
+
+/* --------------------------------------------------------------------- */
 /*  Main structures and functions                                        */
 /* --------------------------------------------------------------------- */


@@ -331,6 +396,7 @@
     sljit_s32 args;
     sljit_s32 locals_offset;
     sljit_s32 saveds_offset;
+    sljit_s32 stack_tmp_size;
 #endif


 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -505,8 +571,6 @@
 #define SLJIT_HAS_CLZ            3
 /* [Emulated] Conditional move is supported. */
 #define SLJIT_HAS_CMOV            4
-/* [Limitation] [Emulated] Shifting with register is limited to SLJIT_PREF_SHIFT_REG. */
-#define SLJIT_HAS_PREF_SHIFT_REG    5


 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 /* [Not emulated] SSE2 support is available on x86. */
@@ -519,10 +583,10 @@
    error, they return with SLJIT_SUCCESS. */


 /*
-   The executable code is a function call from the viewpoint of the C
+   The executable code is a function from the viewpoint of the C
    language. The function calls must obey to the ABI (Application
    Binary Interface) of the platform, which specify the purpose of
-   all machine registers and stack handling among other things. The
+   machine registers and stack handling among other things. The
    sljit_emit_enter function emits the necessary instructions for
    setting up a new context for the executable code and moves function
    arguments to the saved registers. Furthermore the options argument
@@ -529,17 +593,18 @@
    can be used to pass configuration options to the compiler. The
    available options are listed before sljit_emit_enter.


-   The number of sljit_sw arguments passed to the generated function
-   are specified in the "args" parameter. The number of arguments must
-   be less than or equal to 3. The first argument goes to SLJIT_S0,
-   the second goes to SLJIT_S1 and so on. The register set used by
-   the function must be declared as well. The number of scratch and
-   saved registers used by the function must be passed to sljit_emit_enter.
-   Only R registers between R0 and "scratches" argument can be used
-   later. E.g. if "scratches" is set to 2, the register set will be
-   limited to R0 and R1. The S registers and the floating point
+   The function argument list is the combination of SLJIT_ARGx
+   (SLJIT_DEF_ARG1) macros. Currently a maximum of 3 SW / UW
+   (SLJIT_ARG_TYPE_SW / SLJIT_ARG_TYPE_UW) arguments are supported.
+   The first argument goes to SLJIT_S0, the second goes to SLJIT_S1
+   and so on. The register set used by the function must be declared
+   as well. The number of scratch and saved registers used by the
+   function must be passed to sljit_emit_enter. Only R registers
+   between R0 and "scratches" argument can be used later. E.g. if
+   "scratches" is set to 2, the scratch register set will be limited
+   to SLJIT_R0 and SLJIT_R1. The S registers and the floating point
    registers ("fscratches" and "fsaveds") are specified in a similar
-   way. The sljit_emit_enter is also capable of allocating a stack
+   manner. The sljit_emit_enter is also capable of allocating a stack
    space for local variables. The "local_size" argument contains the
    size in bytes of this local area and its staring address is stored
    in SLJIT_SP. The memory area between SLJIT_SP (inclusive) and
@@ -566,7 +631,7 @@
 #define SLJIT_MAX_LOCAL_SIZE    65536


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);


 /* The machine code has a context (which contains the local stack space size,
@@ -580,7 +645,7 @@
          the previous context. */


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);


/* Return from machine code. The op argument can be SLJIT_UNUSED which means the
@@ -1136,25 +1201,32 @@

 /* Unconditional jump types. */
 #define SLJIT_JUMP            24
+    /* Fast calling method. See sljit_emit_fast_enter / sljit_emit_fast_return. */
 #define SLJIT_FAST_CALL            25
-#define SLJIT_CALL0            26
-#define SLJIT_CALL1            27
-#define SLJIT_CALL2            28
-#define SLJIT_CALL3            29
+    /* Called function must be declared with the SLJIT_FUNC attribute. */
+#define SLJIT_CALL            26
+    /* Called function must be declared with the cdecl attribute.
+       This is the default attribute for C functions. */
+#define SLJIT_CALL_CDECL        27


-/* Fast calling method. See sljit_emit_fast_enter / sljit_emit_fast_return. */
-
 /* The target can be changed during runtime (see: sljit_set_jump_addr). */
 #define SLJIT_REWRITABLE_JUMP        0x1000


 /* Emit a jump instruction. The destination is not set, only the type of the jump.
-    type must be between SLJIT_EQUAL and SLJIT_CALL3
+    type must be between SLJIT_EQUAL and SLJIT_FAST_CALL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP


-   Flags: does not modify flags for conditional and unconditional
-          jumps but destroy all flags for calls. */
+   Flags: does not modify flags. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type);


+/* Emit a C compiler (ABI) compatible function call.
+    type must be SLJIT_CALL or SLJIT_CALL_CDECL
+    type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
+    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+
+   Flags: destroy all flags. */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types);
+
 /* Basic arithmetic comparison. In most architectures it is implemented as
    an SLJIT_SUB operation (with SLJIT_UNUSED destination and setting
    appropriate flags) followed by a sljit_emit_jump. However some
@@ -1162,6 +1234,7 @@
    It is suggested to use this comparison form when appropriate.
     type must be between SLJIT_EQUAL and SLJIT_I_SIG_LESS_EQUAL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
+
    Flags: may destroy flags. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler *compiler, sljit_s32 type,
     sljit_s32 src1, sljit_sw src1w,
@@ -1186,15 +1259,23 @@
 /* Set the destination address of the jump to this label. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_target(struct sljit_jump *jump, sljit_uw target);


-/* Call function or jump anywhere. Both direct and indirect form
-    type must be between SLJIT_JUMP and SLJIT_CALL3
-    Direct form: set src to SLJIT_IMM() and srcw to the address
-    Indirect form: any other valid addressing mode
+/* Emit an indirect jump or fast call. Both direct and indirect forms are supported.
+   Direct form: set src to SLJIT_IMM() and srcw to the address
+   Indirect form: any other valid addressing mode
+    type must be between SLJIT_JUMP and SLJIT_FAST_CALL


-   Flags: does not modify flags for unconditional jumps but
-          destroy all flags for calls. */
+   Flags: does not modify flags. */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw);


+/* Emit a C compiler (ABI) compatible function call.
+   Direct form: set src to SLJIT_IMM() and srcw to the address
+   Indirect form: any other valid addressing mode
+    type must be SLJIT_CALL or SLJIT_CALL_CDECL
+    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+
+   Flags: destroy all flags. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types, sljit_s32 src, sljit_sw srcw);
+
 /* Perform the operation using the conditional flags as the second argument.
    Type must always be between SLJIT_EQUAL and SLJIT_ORDERED_F64. The value
    represented by the type is 1, if the condition represented by the type
@@ -1270,8 +1351,8 @@


#if (defined SLJIT_UTIL_GLOBAL_LOCK && SLJIT_UTIL_GLOBAL_LOCK)
/* This global lock is useful to compile common functions. */
-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_grab_lock(void);
-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_release_lock(void);
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_grab_lock(void);
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_release_lock(void);
#endif

 #if (defined SLJIT_UTIL_STACK && SLJIT_UTIL_STACK)
@@ -1312,8 +1393,8 @@
    Note: limit contains the starting stack size in bytes.
    Note: the top field is initialized to base.
    Note: see sljit_create_compiler for the explanation of allocator_data. */
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_stack* SLJIT_CALL sljit_allocate_stack(sljit_uw limit, sljit_uw max_limit, void *allocator_data);
-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_free_stack(struct sljit_stack *stack, void *allocator_data);
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_stack* SLJIT_FUNC sljit_allocate_stack(sljit_uw limit, sljit_uw max_limit, void *allocator_data);
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_free_stack(struct sljit_stack *stack, void *allocator_data);


 /* Can be used to increase (allocate) or decrease (free) the memory area.
    Returns with a non-zero value if unsuccessful. If new_limit is greater than
@@ -1321,7 +1402,7 @@
    since the growth ratio can be added to the current limit, and sljit_stack_resize
    will do all the necessary checks. The fields of the stack are not changed if
    sljit_stack_resize fails. */
-SLJIT_API_FUNC_ATTRIBUTE sljit_sw SLJIT_CALL sljit_stack_resize(struct sljit_stack *stack, sljit_u8 *new_limit);
+SLJIT_API_FUNC_ATTRIBUTE sljit_sw SLJIT_FUNC sljit_stack_resize(struct sljit_stack *stack, sljit_u8 *new_limit);


#endif /* (defined SLJIT_UTIL_STACK && SLJIT_UTIL_STACK) */


Modified: code/trunk/sljit/sljitNativeARM_32.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_32.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeARM_32.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -24,12 +24,18 @@
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */


+#ifdef __SOFTFP__
+#define ARM_ABI_INFO " ABI:softfp"
+#else
+#define ARM_ABI_INFO " ABI:hardfp"
+#endif
+
 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 {
 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
-    return "ARMv7" SLJIT_CPUINFO;
+    return "ARMv7" SLJIT_CPUINFO ARM_ABI_INFO;
 #elif (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-    return "ARMv5" SLJIT_CPUINFO;
+    return "ARMv5" SLJIT_CPUINFO ARM_ABI_INFO;
 #else
 #error "Internal error: Unknown ARM architecture"
 #endif
@@ -40,8 +46,8 @@
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_PC        (SLJIT_NUMBER_OF_REGISTERS + 4)


-#define TMP_FREG1    (0)
-#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 /* In ARM instruction words.
    Cache lines are usually 32 byte aligned. */
@@ -55,9 +61,13 @@


 /* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 14, 12, 15
+    0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
 };


+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+    0, 0, 1, 2, 3, 4, 5, 6, 7
+};
+
 #define RM(rm) (reg_map[rm])
 #define RD(rd) (reg_map[rd] << 12)
 #define RN(rn) (reg_map[rn] << 16)
@@ -72,32 +82,31 @@
 #define CONDITIONAL    0xe0000000
 #define PUSH_POOL    0xff000000


-/* DP - Data Processing instruction (use with EMIT_DATA_PROCESS_INS). */
-#define ADC_DP        0x5
-#define ADD_DP        0x4
-#define AND_DP        0x0
+#define ADC        0xe0a00000
+#define ADD        0xe0800000
+#define AND        0xe0000000
 #define B        0xea000000
-#define BIC_DP        0xe
+#define BIC        0xe1c00000
 #define BL        0xeb000000
 #define BLX        0xe12fff30
 #define BX        0xe12fff10
 #define CLZ        0xe16f0f10
-#define CMN_DP        0xb
-#define CMP_DP        0xa
+#define CMN        0xe1600000
+#define CMP        0xe1400000
 #define BKPT        0xe1200070
-#define EOR_DP        0x1
-#define MOV_DP        0xd
+#define EOR        0xe0200000
+#define MOV        0xe1a00000
 #define MUL        0xe0000090
-#define MVN_DP        0xf
+#define MVN        0xe1e00000
 #define NOP        0xe1a00000
-#define ORR_DP        0xc
+#define ORR        0xe1800000
 #define PUSH        0xe92d0000
 #define POP        0xe8bd0000
-#define RSB_DP        0x3
-#define RSC_DP        0x7
-#define SBC_DP        0x6
+#define RSB        0xe0600000
+#define RSC        0xe0e00000
+#define SBC        0xe0c00000
 #define SMULL        0xe0c00090
-#define SUB_DP        0x2
+#define SUB        0xe0400000
 #define UMULL        0xe0800090
 #define VABS_F32    0xeeb00ac0
 #define VADD_F32    0xee300a00
@@ -108,6 +117,7 @@
 #define VDIV_F32    0xee800a00
 #define VMOV_F32    0xeeb00a40
 #define VMOV        0xee000a10
+#define VMOV2        0xec400a10
 #define VMRS        0xeef1fa10
 #define VMUL_F32    0xee200a00
 #define VNEG_F32    0xeeb10a40
@@ -260,7 +270,9 @@
 {
     /* Must follow tightly the previous instruction (to be able to convert it to bl instruction). */
     SLJIT_ASSERT(compiler->cpool_diff == CONST_POOL_EMPTY || compiler->size - compiler->cpool_diff < MAX_DIFFERENCE(4092));
-    return push_inst(compiler, BLX | RM(TMP_REG2));
+    SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
+
+    return push_inst(compiler, BLX | RM(TMP_REG1));
 }


 static sljit_uw patch_pc_relative_loads(sljit_uw *last_pc_patch, sljit_uw *code_ptr, sljit_uw* const_pool, sljit_uw cpool_size)
@@ -889,10 +901,6 @@
 #define TYPE2_TRANSFER_IMM(imm) \
     (((imm) & 0xf) | (((imm) & 0xf0) << 4) | (1 << 22))


-/* Condition: AL. */
-#define EMIT_DATA_PROCESS_INS(opcode, set_flags, dst, src1, src2) \
-    (0xe0000000 | ((opcode) << 21) | (set_flags) | RD(dst) | RN(src1) | (src2))
-
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 inp_flags,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
@@ -899,15 +907,15 @@
     sljit_s32 src2, sljit_sw src2w);


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 size, i, tmp;
+    sljit_s32 args, size, i, tmp;
     sljit_uw push;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     /* Push saved registers, temporary registers
        stmdb sp!, {..., lr} */
@@ -929,25 +937,27 @@
     if (local_size > 0)
         FAIL_IF(emit_op(compiler, SLJIT_SUB, ALLOW_IMM, SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, local_size));


+    args = get_arg_count(arg_types);
+
     if (args >= 1)
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, SLJIT_S0, SLJIT_UNUSED, RM(SLJIT_R0))));
+        FAIL_IF(push_inst(compiler, MOV | RD(SLJIT_S0) | RM(SLJIT_R0)));
     if (args >= 2)
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, SLJIT_S1, SLJIT_UNUSED, RM(SLJIT_R1))));
+        FAIL_IF(push_inst(compiler, MOV | RD(SLJIT_S1) | RM(SLJIT_R1)));
     if (args >= 3)
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, SLJIT_S2, SLJIT_UNUSED, RM(SLJIT_R2))));
+        FAIL_IF(push_inst(compiler, MOV | RD(SLJIT_S2) | RM(SLJIT_R2)));


     return SLJIT_SUCCESS;
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     sljit_s32 size;


     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
     compiler->local_size = ((size + local_size + 7) & ~7) - size;
@@ -1009,12 +1019,12 @@
         SLJIT_ASSERT(!(flags & ARGS_SWAPPED)); \
         \
         if (compiler->shift_imm != 0) \
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags & SET_FLAGS, \
-                dst, SLJIT_UNUSED, (compiler->shift_imm << 7) | (opcode << 5) | RM(src2))); \
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags & SET_FLAGS, dst, SLJIT_UNUSED, RM(src2))); \
+            return push_inst(compiler, MOV | (flags & SET_FLAGS) | \
+                RD(dst) | (compiler->shift_imm << 7) | (opcode << 5) | RM(src2)); \
+        return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | RM(src2)); \
     } \
-    return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags & SET_FLAGS, \
-        dst, SLJIT_UNUSED, (reg_map[(flags & ARGS_SWAPPED) ? src1 : src2] << 8) | (opcode << 5) | 0x10 | RM((flags & ARGS_SWAPPED) ? src2 : src1)));
+    return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | \
+        (reg_map[(flags & ARGS_SWAPPED) ? src1 : src2] << 8) | (opcode << 5) | 0x10 | RM((flags & ARGS_SWAPPED) ? src2 : src1));


 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
     sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
@@ -1024,10 +1034,9 @@
         SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
         if (dst != src2) {
             if (src2 & SRC2_IMM) {
-                return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags & INV_IMM) ? MVN_DP : MOV_DP, 0,
-                    dst, SLJIT_UNUSED, src2));
+                return push_inst(compiler, ((flags & INV_IMM) ? MVN : MOV) | RD(dst) | src2);
             }
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, RM(src2)));
+            return push_inst(compiler, MOV | RD(dst) | RM(src2));
         }
         return SLJIT_SUCCESS;


@@ -1037,9 +1046,9 @@
         if (flags & MOVE_REG_CONV) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
             if (op == SLJIT_MOV_U8)
-                return push_inst(compiler, EMIT_DATA_PROCESS_INS(AND_DP, 0, dst, src2, SRC2_IMM | 0xff));
-            FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | RM(src2))));
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | (op == SLJIT_MOV_U8 ? 0x20 : 0x40) | RM(dst)));
+                return push_inst(compiler, AND | RD(dst) | RN(src2) | SRC2_IMM | 0xff);
+            FAIL_IF(push_inst(compiler, MOV | RD(dst) | (24 << 7) | RM(src2)));
+            return push_inst(compiler, MOV | RD(dst) | (24 << 7) | (op == SLJIT_MOV_U8 ? 0x20 : 0x40) | RM(dst));
 #else
             return push_inst(compiler, (op == SLJIT_MOV_U8 ? UXTB : SXTB) | RD(dst) | RM(src2));
 #endif
@@ -1046,8 +1055,7 @@
         }
         else if (dst != src2) {
             SLJIT_ASSERT(src2 & SRC2_IMM);
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags & INV_IMM) ? MVN_DP : MOV_DP, 0,
-                dst, SLJIT_UNUSED, src2));
+            return push_inst(compiler, ((flags & INV_IMM) ? MVN : MOV) | RD(dst) | src2);
         }
         return SLJIT_SUCCESS;


@@ -1056,8 +1064,8 @@
         SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
         if (flags & MOVE_REG_CONV) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-            FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | RM(src2))));
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | (op == SLJIT_MOV_U16 ? 0x20 : 0x40) | RM(dst)));
+            FAIL_IF(push_inst(compiler, MOV | RD(dst) | (16 << 7) | RM(src2)));
+            return push_inst(compiler, MOV | RD(dst) | (16 << 7) | (op == SLJIT_MOV_U16 ? 0x20 : 0x40) | RM(dst));
 #else
             return push_inst(compiler, (op == SLJIT_MOV_U16 ? UXTH : SXTH) | RD(dst) | RM(src2));
 #endif
@@ -1064,17 +1072,15 @@
         }
         else if (dst != src2) {
             SLJIT_ASSERT(src2 & SRC2_IMM);
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags & INV_IMM) ? MVN_DP : MOV_DP, 0,
-                dst, SLJIT_UNUSED, src2));
+            return push_inst(compiler, ((flags & INV_IMM) ? MVN : MOV) | RD(dst) | src2);
         }
         return SLJIT_SUCCESS;


     case SLJIT_NOT:
         if (src2 & SRC2_IMM) {
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags & INV_IMM) ? MOV_DP : MVN_DP, flags & SET_FLAGS,
-                dst, SLJIT_UNUSED, src2));
+            return push_inst(compiler, ((flags & INV_IMM) ? MOV : MVN) | (flags & SET_FLAGS) | RD(dst) | src2);
         }
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MVN_DP, flags & SET_FLAGS, dst, SLJIT_UNUSED, RM(src2)));
+        return push_inst(compiler, MVN | (flags & SET_FLAGS) | RD(dst) | RM(src2));


     case SLJIT_CLZ:
         SLJIT_ASSERT(!(flags & INV_IMM));
@@ -1085,28 +1091,24 @@
     case SLJIT_ADD:
         SLJIT_ASSERT(!(flags & INV_IMM));
         if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | SET_FLAGS) && !(flags & ARGS_SWAPPED))
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(CMN_DP, SET_FLAGS,
-                SLJIT_UNUSED, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, flags & SET_FLAGS,
-            dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+            return push_inst(compiler, CMN | SET_FLAGS | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, ADD | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_ADDC:
         SLJIT_ASSERT(!(flags & INV_IMM));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(ADC_DP, flags & SET_FLAGS,
-            dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, ADC | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_SUB:
         SLJIT_ASSERT(!(flags & INV_IMM));
         if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | SET_FLAGS) && !(flags & ARGS_SWAPPED))
-            return push_inst(compiler, EMIT_DATA_PROCESS_INS(CMP_DP, SET_FLAGS,
-                SLJIT_UNUSED, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & ARGS_SWAPPED) ? SUB_DP : RSB_DP, flags & SET_FLAGS,
-            dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+            return push_inst(compiler, CMP | SET_FLAGS | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, (!(flags & ARGS_SWAPPED) ? SUB : RSB) | (flags & SET_FLAGS)
+            | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_SUBC:
         SLJIT_ASSERT(!(flags & INV_IMM));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & ARGS_SWAPPED) ? SBC_DP : RSC_DP, flags & SET_FLAGS,
-            dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, (!(flags & ARGS_SWAPPED) ? SBC : RSC) | (flags & SET_FLAGS)
+            | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_MUL:
         SLJIT_ASSERT(!(flags & INV_IMM));
@@ -1118,19 +1120,19 @@
         FAIL_IF(push_inst(compiler, SMULL | (reg_map[TMP_REG1] << 16) | (reg_map[dst] << 12) | (reg_map[src2] << 8) | reg_map[src1]));


         /* cmp TMP_REG1, dst asr #31. */
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(CMP_DP, SET_FLAGS, SLJIT_UNUSED, TMP_REG1, RM(dst) | 0xfc0));
+        return push_inst(compiler, CMP | SET_FLAGS | RN(TMP_REG1) | RM(dst) | 0xfc0);


     case SLJIT_AND:
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & INV_IMM) ? AND_DP : BIC_DP, flags & SET_FLAGS,
-            dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, (!(flags & INV_IMM) ? AND : BIC) | (flags & SET_FLAGS)
+            | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_OR:
         SLJIT_ASSERT(!(flags & INV_IMM));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(ORR_DP, flags & SET_FLAGS, dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, ORR | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_XOR:
         SLJIT_ASSERT(!(flags & INV_IMM));
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(EOR_DP, flags & SET_FLAGS, dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
+        return push_inst(compiler, EOR | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));


     case SLJIT_SHL:
         EMIT_SHIFT_INS_AND_RETURN(0);
@@ -1293,8 +1295,8 @@
             return 0;
     }


-    FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(positive ? MOV_DP : MVN_DP, 0, reg, SLJIT_UNUSED, imm1)));
-    FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(positive ? ORR_DP : BIC_DP, 0, reg, reg, imm2)));
+    FAIL_IF(push_inst(compiler, (positive ? MOV : MVN) | RD(reg) | imm1));
+    FAIL_IF(push_inst(compiler, (positive ? ORR : BIC) | RD(reg) | RN(reg) | imm2));
     return 1;
 }
 #endif
@@ -1311,11 +1313,11 @@
     /* Create imm by 1 inst. */
     tmp = get_imm(imm);
     if (tmp)
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, reg, SLJIT_UNUSED, tmp));
+        return push_inst(compiler, MOV | RD(reg) | tmp);


     tmp = get_imm(~imm);
     if (tmp)
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MVN_DP, 0, reg, SLJIT_UNUSED, tmp));
+        return push_inst(compiler, MVN | RD(reg) | tmp);


 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     /* Create imm by 2 inst. */
@@ -1365,7 +1367,7 @@
         if (argw != 0 && !is_type1_transfer) {
             SLJIT_ASSERT(!(flags & WRITE_BACK));


-            FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_reg, arg, RM(offset_reg) | (argw << 7))));
+            FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | RM(offset_reg) | (argw << 7)));
             return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, tmp_reg, TYPE2_TRANSFER_IMM(0)));
         }


@@ -1381,7 +1383,7 @@
             imm = get_imm(argw & ~0xfff);
             if (imm) {
                 offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, offset_reg, arg, imm)));
+                FAIL_IF(push_inst(compiler, ADD | RD(offset_reg) | RN(arg) | imm));
                 argw = argw & 0xfff;
                 arg = offset_reg;
             }
@@ -1390,7 +1392,7 @@
             imm = get_imm(-argw & ~0xfff);
             if (imm) {
                 offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(SUB_DP, 0, offset_reg, arg, imm)));
+                FAIL_IF(push_inst(compiler, SUB | RD(offset_reg) | RN(arg) | imm));
                 argw = -(-argw & 0xfff);
                 arg = offset_reg;
             }
@@ -1408,7 +1410,7 @@
             imm = get_imm(argw & ~0xff);
             if (imm) {
                 offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, offset_reg, arg, imm)));
+                FAIL_IF(push_inst(compiler, ADD | RD(offset_reg) | RN(arg) | imm));
                 argw = argw & 0xff;
                 arg = offset_reg;
             }
@@ -1417,7 +1419,7 @@
             imm = get_imm(-argw & ~0xff);
             if (imm) {
                 offset_reg = (flags & WRITE_BACK) ? arg : tmp_reg;
-                FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(SUB_DP, 0, offset_reg, arg, imm)));
+                FAIL_IF(push_inst(compiler, SUB | RD(offset_reg) | RN(arg) | imm));
                 argw = -(-argw & 0xff);
                 arg = offset_reg;
             }
@@ -1785,7 +1787,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg << 1;
+    return (freg_map[reg] << 1);
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -1804,9 +1806,9 @@

 #define FPU_LOAD (1 << 20)
 #define EMIT_FPU_DATA_TRANSFER(inst, add, base, freg, offs) \
-    ((inst) | ((add) << 23) | (reg_map[base] << 16) | (freg << 12) | (offs))
+    ((inst) | ((add) << 23) | (reg_map[base] << 16) | (freg_map[freg] << 12) | (offs))
 #define EMIT_FPU_OPERATION(opcode, mode, dst, src1, src2) \
-    ((opcode) | (mode) | ((dst) << 12) | (src1) | ((src2) << 16))
+    ((opcode) | (mode) | (freg_map[dst] << 12) | freg_map[src1] | (freg_map[src2] << 16))


 static sljit_s32 emit_fop_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
 {
@@ -1817,7 +1819,7 @@
     arg &= ~SLJIT_MEM;


     if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG2, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7))));
+        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG2) | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | ((argw & 0x3) << 7)));
         arg = TMP_REG2;
         argw = 0;
     }
@@ -1831,13 +1833,13 @@


         imm = get_imm(argw & ~0x3fc);
         if (imm) {
-            FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG2, arg & REG_MASK, imm)));
+            FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG2) | RN(arg & REG_MASK) | imm));
             return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, TMP_REG2, reg, (argw & 0x3fc) >> 2));
         }
         imm = get_imm(-argw & ~0x3fc);
         if (imm) {
             argw = -argw;
-            FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(SUB_DP, 0, TMP_REG2, arg & REG_MASK, imm)));
+            FAIL_IF(push_inst(compiler, SUB | RD(TMP_REG2) | RN(arg & REG_MASK) | imm));
             return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 0, TMP_REG2, reg, (argw & 0x3fc) >> 2));
         }
     }
@@ -1844,7 +1846,7 @@


     if (arg) {
         FAIL_IF(load_immediate(compiler, TMP_REG2, argw));
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG2, arg & REG_MASK, RM(TMP_REG2))));
+        FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG2) | RN(arg & REG_MASK) | RM(TMP_REG2)));
     }
     else
         FAIL_IF(load_immediate(compiler, TMP_REG2, argw));
@@ -1866,7 +1868,7 @@
     FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VCVT_S32_F32, op & SLJIT_F32_OP, TMP_FREG1, src, 0)));


     if (FAST_IS_REG(dst))
-        return push_inst(compiler, VMOV | (1 << 20) | RD(dst) | (TMP_FREG1 << 16));
+        return push_inst(compiler, VMOV | (1 << 20) | RD(dst) | (freg_map[TMP_FREG1] << 16));


     /* Store the integer value from a VFP register. */
     return emit_fop_mem(compiler, 0, TMP_FREG1, dst, dstw);
@@ -1881,7 +1883,7 @@
     op ^= SLJIT_F32_OP;


     if (FAST_IS_REG(src))
-        FAIL_IF(push_inst(compiler, VMOV | RD(src) | (TMP_FREG1 << 16)));
+        FAIL_IF(push_inst(compiler, VMOV | RD(src) | (freg_map[TMP_FREG1] << 16)));
     else if (src & SLJIT_MEM) {
         /* Load the integer value into a VFP register. */
         FAIL_IF(emit_fop_mem(compiler, FPU_LOAD, TMP_FREG1, src, srcw));
@@ -1888,7 +1890,7 @@
     }
     else {
         FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
-        FAIL_IF(push_inst(compiler, VMOV | RD(TMP_REG1) | (TMP_FREG1 << 16)));
+        FAIL_IF(push_inst(compiler, VMOV | RD(TMP_REG1) | (freg_map[TMP_FREG1] << 16)));
     }


     FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VCVT_F32_S32, op & SLJIT_F32_OP, dst_r, TMP_FREG1, 0)));
@@ -2018,7 +2020,6 @@


#undef FPU_LOAD
#undef EMIT_FPU_DATA_TRANSFER
-#undef EMIT_FPU_OPERATION

 /* --------------------------------------------------------------------- */
 /*  Other instructions                                                   */
@@ -2030,13 +2031,13 @@
     CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
     ADJUST_LOCAL_OFFSET(dst, dstw);


-    SLJIT_ASSERT(reg_map[TMP_REG1] == 14);
+    SLJIT_ASSERT(reg_map[TMP_REG2] == 14);


     if (FAST_IS_REG(dst))
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, RM(TMP_REG1)));
+        return push_inst(compiler, MOV | RD(dst) | RM(TMP_REG2));


     /* Memory. */
-    return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
+    return emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw, TMP_REG1);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
@@ -2045,16 +2046,16 @@
     CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    SLJIT_ASSERT(reg_map[TMP_REG1] == 14);
+    SLJIT_ASSERT(reg_map[TMP_REG2] == 14);


     if (FAST_IS_REG(src))
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG1, 0, RM(src))));
+        FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG2) | RM(src)));
     else if (src & SLJIT_MEM)
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG2));
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG1));
     else if (src & SLJIT_IMM)
-        FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
+        FAIL_IF(load_immediate(compiler, TMP_REG2, srcw));


-    return push_inst(compiler, BX | RM(TMP_REG1));
+    return push_inst(compiler, BX | RM(TMP_REG2));
 }


 /* --------------------------------------------------------------------- */
@@ -2111,7 +2112,7 @@
         return 0x70000000;


     default:
-        SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL3);
+        SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_CDECL);
         return 0xe0000000;
     }
 }
@@ -2144,12 +2145,13 @@
     set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
     type &= 0xff;


-    /* In ARM, we don't need to touch the arguments. */
+    SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
+
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     if (type >= SLJIT_FAST_CALL)
         PTR_FAIL_IF(prepare_blx(compiler));
     PTR_FAIL_IF(push_inst_with_unique_literal(compiler, ((EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0,
-        type <= SLJIT_JUMP ? TMP_PC : TMP_REG2, TMP_PC, 0)) & ~COND_MASK) | get_cc(type), 0));
+        type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0)) & ~COND_MASK) | get_cc(type), 0));


     if (jump->flags & SLJIT_REWRITABLE_JUMP) {
         jump->addr = compiler->size;
@@ -2166,13 +2168,248 @@
 #else
     if (type >= SLJIT_FAST_CALL)
         jump->flags |= IS_BL;
-    PTR_FAIL_IF(emit_imm(compiler, TMP_REG2, 0));
-    PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG2)) & ~COND_MASK) | get_cc(type)));
+    PTR_FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
+    PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)) & ~COND_MASK) | get_cc(type)));
     jump->addr = compiler->size;
 #endif
     return jump;
 }


+#ifdef __SOFTFP__
+
+static sljit_s32 softfloat_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src)
+{
+    sljit_s32 stack_offset = 0;
+    sljit_s32 arg_count = 0;
+    sljit_s32 word_arg_offset = 0;
+    sljit_s32 float_arg_count = 0;
+    sljit_s32 types = 0;
+    sljit_s32 src_offset = 4 * sizeof(sljit_sw);
+    sljit_u8 offsets[4];
+
+    if (src && FAST_IS_REG(*src))
+        src_offset = reg_map[*src] * sizeof(sljit_sw);
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK);
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_f32);
+            arg_count++;
+            float_arg_count++;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_offset & 0x7)
+                stack_offset += sizeof(sljit_sw);
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_f64);
+            arg_count++;
+            float_arg_count++;
+            break;
+        default:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_sw);
+            arg_count++;
+            word_arg_offset += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    if (stack_offset > 16)
+        FAIL_IF(push_inst(compiler, SUB | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | (((stack_offset - 16) + 0x7) & ~0x7)));
+
+    /* Process arguments in reversed direction. */
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            arg_count--;
+            float_arg_count--;
+            stack_offset = offsets[arg_count];
+
+            if (stack_offset < 16) {
+                if (src_offset == stack_offset) {
+                    FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (src_offset >> 2)));
+                    *src = TMP_REG1;
+                }
+                FAIL_IF(push_inst(compiler, VMOV | 0x100000 | (float_arg_count << 16) | (stack_offset << 10)));
+            } else
+                FAIL_IF(push_inst(compiler, VSTR_F32 | 0x800000 | RN(SLJIT_SP) | (float_arg_count << 12) | ((stack_offset - 16) >> 2)));
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            arg_count--;
+            float_arg_count--;
+            stack_offset = offsets[arg_count];
+
+            SLJIT_ASSERT((stack_offset & 0x7) == 0);
+
+            if (stack_offset < 16) {
+                if (src_offset == stack_offset || src_offset == stack_offset + sizeof(sljit_sw)) {
+                    FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (src_offset >> 2)));
+                    *src = TMP_REG1;
+                }
+                FAIL_IF(push_inst(compiler, VMOV2 | 0x100000 | (stack_offset << 10) | ((stack_offset + sizeof(sljit_sw)) << 14) | float_arg_count));
+            } else
+                FAIL_IF(push_inst(compiler, VSTR_F32 | 0x800100 | RN(SLJIT_SP) | (float_arg_count << 12) | ((stack_offset - 16) >> 2)));
+            break;
+        default:
+            arg_count--;
+            word_arg_offset -= sizeof(sljit_sw);
+            stack_offset = offsets[arg_count];
+
+            SLJIT_ASSERT(stack_offset >= word_arg_offset);
+
+            if (stack_offset != word_arg_offset) {
+                if (stack_offset < 16) {
+                    if (src_offset == stack_offset) {
+                        FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | (src_offset >> 2)));
+                        *src = TMP_REG1;
+                    }
+                    else if (src_offset == word_arg_offset) {
+                        *src = 1 + (stack_offset >> 2);
+                        src_offset = stack_offset;
+                    }
+                    FAIL_IF(push_inst(compiler, MOV | (stack_offset << 10) | (word_arg_offset >> 2)));
+                } else
+                    FAIL_IF(push_inst(compiler, data_transfer_insts[WORD_DATA] | 0x800000 | RN(SLJIT_SP) | (word_arg_offset << 10) | (stack_offset - 16)));
+            }
+            break;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+static sljit_s32 softfloat_post_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+    sljit_s32 stack_size = 0;
+
+    if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F32)
+        FAIL_IF(push_inst(compiler, VMOV | (0 << 16) | (0 << 12)));
+    if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F64)
+        FAIL_IF(push_inst(compiler, VMOV2 | (1 << 16) | (0 << 12) | 0));
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            stack_size += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_size & 0x7)
+                stack_size += sizeof(sljit_sw);
+            stack_size += sizeof(sljit_f64);
+            break;
+        default:
+            stack_size += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    if (stack_size <= 16)
+        return SLJIT_SUCCESS;
+
+    return push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | (((stack_size - 16) + 0x7) & ~0x7));
+}
+
+#else /* !__SOFTFP__ */
+
+static sljit_s32 hardfloat_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+    sljit_u32 remap = 0;
+    sljit_u32 offset = 0;
+    sljit_u32 new_offset, mask;
+
+    /* Remove return value. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F32) {
+            new_offset = 0;
+            mask = 1;
+
+            while (remap & mask) {
+                new_offset++;
+                mask <<= 1;
+            }
+            remap |= mask;
+
+            if (offset != new_offset)
+                FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VMOV_F32,
+                    0, (new_offset >> 1) + 1, (offset >> 1) + 1, 0) | ((new_offset & 0x1) ? 0x400000 : 0)));
+
+            offset += 2;
+        }
+        else if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F64) {
+            new_offset = 0;
+            mask = 3;
+
+            while (remap & mask) {
+                new_offset += 2;
+                mask <<= 2;
+            }
+            remap |= mask;
+
+            if (offset != new_offset)
+                FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VMOV_F32, SLJIT_F32_OP, (new_offset >> 1) + 1, (offset >> 1) + 1, 0)));
+
+            offset += 2;
+        }
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+#endif /* __SOFTFP__ */
+
+#undef EMIT_FPU_OPERATION
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+#ifdef __SOFTFP__
+    struct sljit_jump *jump;
+#endif
+
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+#ifdef __SOFTFP__
+    PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    jump = sljit_emit_jump(compiler, type);
+    PTR_FAIL_IF(jump == NULL);
+
+    PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
+    return jump;
+#else /* !__SOFTFP__ */
+    PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_jump(compiler, type);
+#endif /* __SOFTFP__ */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     struct sljit_jump *jump;
@@ -2181,16 +2418,20 @@
     CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    /* In ARM, we don't need to touch the arguments. */
+    SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
+
     if (!(src & SLJIT_IMM)) {
-        if (FAST_IS_REG(src))
+        if (FAST_IS_REG(src)) {
+            SLJIT_ASSERT(reg_map[src] != 14);
             return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(src));
+        }


         SLJIT_ASSERT(src & SLJIT_MEM);
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG2));
-        return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG2));
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+        return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1));
     }


+    /* These jumps are converted to jump/call instructions when possible. */
     jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
     FAIL_IF(!jump);
     set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
@@ -2199,22 +2440,57 @@
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
     if (type >= SLJIT_FAST_CALL)
         FAIL_IF(prepare_blx(compiler));
-    FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : TMP_REG2, TMP_PC, 0), 0));
+    FAIL_IF(push_inst_with_unique_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0), 0));
     if (type >= SLJIT_FAST_CALL)
         FAIL_IF(emit_blx(compiler));
 #else
-    FAIL_IF(emit_imm(compiler, TMP_REG2, 0));
-    FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG2)));
+    FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
+    FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(TMP_REG1)));
 #endif
     jump->addr = compiler->size;
     return SLJIT_SUCCESS;
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+#ifdef __SOFTFP__
+    if (src & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+        src = TMP_REG1;
+    }
+
+    FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+
+    return softfloat_post_call_with_args(compiler, arg_types);
+#else /* !__SOFTFP__ */
+    FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+#endif /* __SOFTFP__ */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)
 {
-    sljit_s32 dst_r, flags = GET_ALL_FLAGS(op);
+    sljit_s32 dst_reg, flags = GET_ALL_FLAGS(op);
     sljit_uw cc, ins;


     CHECK_ERROR();
@@ -2223,31 +2499,31 @@


     op = GET_OPCODE(op);
     cc = get_cc(type & 0xff);
-    dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+    dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG1;


     if (op < SLJIT_ADD) {
-        FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_r, SLJIT_UNUSED, SRC2_IMM | 0)));
-        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_r, SLJIT_UNUSED, SRC2_IMM | 1) & ~COND_MASK) | cc));
+        FAIL_IF(push_inst(compiler, MOV | RD(dst_reg) | SRC2_IMM | 0));
+        FAIL_IF(push_inst(compiler, ((MOV | RD(dst_reg) | SRC2_IMM | 1) & ~COND_MASK) | cc));
         if (dst & SLJIT_MEM)
             return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
         return SLJIT_SUCCESS;
     }


-    ins = (op == SLJIT_AND ? AND_DP : (op == SLJIT_OR ? ORR_DP : EOR_DP));
+    ins = (op == SLJIT_AND ? AND : (op == SLJIT_OR ? ORR : EOR));


     if (dst & SLJIT_MEM)
         FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, dst, dstw, TMP_REG2));


-    FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, dst_r, SRC2_IMM | 1) & ~COND_MASK) | cc));
+    FAIL_IF(push_inst(compiler, ((ins | RD(dst_reg) | RN(dst_reg) | SRC2_IMM | 1) & ~COND_MASK) | cc));


     if (op == SLJIT_AND)
-        FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, dst_r, SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));
+        FAIL_IF(push_inst(compiler, ((ins | RD(dst_reg) | RN(dst_reg) | SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));


     if (dst & SLJIT_MEM)
         FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2));


     if (flags & SLJIT_SET_Z)
-        return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG2, SLJIT_UNUSED, RM(dst_r)));
+        return push_inst(compiler, MOV | SET_FLAGS | RD(TMP_REG2) | RM(dst_reg));
     return SLJIT_SUCCESS;
 }


@@ -2267,11 +2543,11 @@
     if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
         tmp = get_imm(srcw);
         if (tmp)
-            return push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, SLJIT_UNUSED, tmp) & ~COND_MASK) | cc);
+            return push_inst(compiler, ((MOV | RD(dst_reg) | tmp) & ~COND_MASK) | cc);


         tmp = get_imm(~srcw);
         if (tmp)
-            return push_inst(compiler, (EMIT_DATA_PROCESS_INS(MVN_DP, 0, dst_reg, SLJIT_UNUSED, tmp) & ~COND_MASK) | cc);
+            return push_inst(compiler, ((MVN | RD(dst_reg) | tmp) & ~COND_MASK) | cc);


 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
         tmp = (sljit_uw) srcw;
@@ -2285,7 +2561,7 @@
 #endif
     }


-    return push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, SLJIT_UNUSED, RM(src)) & ~COND_MASK) | cc);
+    return push_inst(compiler, ((MOV | RD(dst_reg) | RM(src)) & ~COND_MASK) | cc);
 }


SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)

Modified: code/trunk/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_64.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeARM_64.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -40,13 +40,17 @@
 #define TMP_LR        (SLJIT_NUMBER_OF_REGISTERS + 5)
 #define TMP_SP        (SLJIT_NUMBER_OF_REGISTERS + 6)


-#define TMP_FREG1    (0)
-#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 8] = {
31, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 29, 9, 10, 11, 30, 31
};

+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+    0, 0, 1, 2, 3, 4, 5, 6, 7
+};
+
 #define W_OP (1 << 31)
 #define RD(rd) (reg_map[rd])
 #define RT(rt) (reg_map[rt])
@@ -53,10 +57,10 @@
 #define RN(rn) (reg_map[rn] << 5)
 #define RT2(rt2) (reg_map[rt2] << 10)
 #define RM(rm) (reg_map[rm] << 16)
-#define VD(vd) (vd)
-#define VT(vt) (vt)
-#define VN(vn) ((vn) << 5)
-#define VM(vm) ((vm) << 16)
+#define VD(vd) (freg_map[vd])
+#define VT(vt) (freg_map[vt])
+#define VN(vn) (freg_map[vn] << 5)
+#define VM(vm) (freg_map[vm] << 16)


 /* --------------------------------------------------------------------- */
 /*  Instrucion forms                                                     */
@@ -1088,14 +1092,14 @@
 /* --------------------------------------------------------------------- */


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 i, tmp, offs, prev, saved_regs_size;
+    sljit_s32 args, i, tmp, offs, prev, saved_regs_size;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0);
     local_size += saved_regs_size + SLJIT_LOCALS_OFFSET;
@@ -1165,6 +1169,8 @@
         FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_SP) | (0 << 10)));
     }


+    args = get_arg_count(arg_types);
+
     if (args >= 1)
         FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_S0) | RN(TMP_ZERO) | RM(SLJIT_R0)));
     if (args >= 2)
@@ -1176,12 +1182,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0) + SLJIT_LOCALS_OFFSET;
     local_size = (local_size + 15) & ~0xf;
@@ -1568,7 +1574,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg;
+    return freg_map[reg];
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -1936,6 +1942,20 @@
     return jump;
 }


+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_jump(compiler, type);
+}
+
 static SLJIT_INLINE struct sljit_jump* emit_cmp_to0(struct sljit_compiler *compiler, sljit_s32 type,
     sljit_s32 src, sljit_sw srcw)
 {
@@ -1978,7 +1998,6 @@
     CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    /* In ARM, we don't need to touch the arguments. */
     if (!(src & SLJIT_IMM)) {
         if (src & SLJIT_MEM) {
             FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw));
@@ -1987,6 +2006,7 @@
         return push_inst(compiler, ((type >= SLJIT_FAST_CALL) ? BLR : BR) | RN(src));
     }


+    /* These jumps are converted to jump/call instructions when possible. */
     jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
     FAIL_IF(!jump);
     set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
@@ -1997,6 +2017,21 @@
     return push_inst(compiler, ((type >= SLJIT_FAST_CALL) ? BLR : BR) | RN(TMP_REG1));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)


Modified: code/trunk/sljit/sljitNativeARM_T2_32.c
===================================================================
--- code/trunk/sljit/sljitNativeARM_T2_32.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeARM_T2_32.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -26,7 +26,11 @@


 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 {
-    return "ARM-Thumb2" SLJIT_CPUINFO;
+#ifdef __SOFTFP__
+    return "ARM-Thumb2" SLJIT_CPUINFO " ABI:softfp";
+#else
+    return "ARM-Thumb2" SLJIT_CPUINFO " ABI:hardfp";
+#endif
 }


 /* Length of an instruction word. */
@@ -37,14 +41,18 @@
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_PC        (SLJIT_NUMBER_OF_REGISTERS + 4)


-#define TMP_FREG1    (0)
-#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 /* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
-    0, 0, 1, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 13, 3, 14, 15
+    0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
 };


+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+    0, 0, 1, 2, 3, 4, 5, 6, 7
+};
+
 #define COPY_BITS(src, from, to, bits) \
     ((from >= to ? (src >> (from - to)) : (src << (to - from))) & (((1 << bits) - 1) << to))


@@ -69,9 +77,9 @@
 #define RN4(rn) (reg_map[rn] << 16)
 #define RM4(rm) (reg_map[rm])
 #define RT4(rt) (reg_map[rt] << 12)
-#define DD4(dd) ((dd) << 12)
-#define DN4(dn) ((dn) << 16)
-#define DM4(dm) (dm)
+#define DD4(dd) (freg_map[dd] << 12)
+#define DN4(dn) (freg_map[dn] << 16)
+#define DM4(dm) (freg_map[dm])
 #define IMM5(imm) \
     (COPY_BITS(imm, 2, 12, 3) | ((imm & 0x3) << 6))
 #define IMM12(imm) \
@@ -178,6 +186,7 @@
 #define VDIV_F32    0xee800a00
 #define VMOV_F32    0xeeb00a40
 #define VMOV        0xee000a10
+#define VMOV2        0xec400a10
 #define VMRS        0xeef1fa10
 #define VMUL_F32    0xee200a00
 #define VNEG_F32    0xeeb10a40
@@ -208,10 +217,10 @@


 static SLJIT_INLINE sljit_s32 emit_imm32_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_uw imm)
 {
-    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst) |
-        COPY_BITS(imm, 12, 16, 4) | COPY_BITS(imm, 11, 26, 1) | COPY_BITS(imm, 8, 12, 3) | (imm & 0xff)));
-    return push_inst32(compiler, MOVT | RD4(dst) |
-        COPY_BITS(imm, 12 + 16, 16, 4) | COPY_BITS(imm, 11 + 16, 26, 1) | COPY_BITS(imm, 8 + 16, 12, 3) | ((imm & 0xff0000) >> 16));
+    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst)
+        | COPY_BITS(imm, 12, 16, 4) | COPY_BITS(imm, 11, 26, 1) | COPY_BITS(imm, 8, 12, 3) | (imm & 0xff)));
+    return push_inst32(compiler, MOVT | RD4(dst)
+        | COPY_BITS(imm, 12 + 16, 16, 4) | COPY_BITS(imm, 11 + 16, 26, 1) | COPY_BITS(imm, 8 + 16, 12, 3) | ((imm & 0xff0000) >> 16));
 }


 static SLJIT_INLINE void modify_imm32_const(sljit_u16 *inst, sljit_uw new_imm)
@@ -522,13 +531,13 @@
     }


     /* set low 16 bits, set hi 16 bits to 0. */
-    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst) |
-        COPY_BITS(imm, 12, 16, 4) | COPY_BITS(imm, 11, 26, 1) | COPY_BITS(imm, 8, 12, 3) | (imm & 0xff)));
+    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst)
+        | COPY_BITS(imm, 12, 16, 4) | COPY_BITS(imm, 11, 26, 1) | COPY_BITS(imm, 8, 12, 3) | (imm & 0xff)));


     /* set hi 16 bit if needed. */
     if (imm >= 0x10000)
-        return push_inst32(compiler, MOVT | RD4(dst) |
-            COPY_BITS(imm, 12 + 16, 16, 4) | COPY_BITS(imm, 11 + 16, 26, 1) | COPY_BITS(imm, 8 + 16, 12, 3) | ((imm & 0xff0000) >> 16));
+        return push_inst32(compiler, MOVT | RD4(dst)
+            | COPY_BITS(imm, 12 + 16, 16, 4) | COPY_BITS(imm, 11 + 16, 26, 1) | COPY_BITS(imm, 8 + 16, 12, 3) | ((imm & 0xff0000) >> 16));
     return SLJIT_SUCCESS;
 }


@@ -1088,15 +1097,15 @@
/* --------------------------------------------------------------------- */

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 size, i, tmp;
+    sljit_s32 args, size, i, tmp;
     sljit_ins push = 0;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
     for (i = SLJIT_S0; i >= tmp; i--)
@@ -1120,6 +1129,8 @@
             FAIL_IF(emit_op_imm(compiler, SLJIT_SUB | ARG2_IMM, SLJIT_SP, SLJIT_SP, local_size));
     }


+    args = get_arg_count(arg_types);
+
     if (args >= 1)
         FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_S0, SLJIT_R0)));
     if (args >= 2)
@@ -1131,14 +1142,14 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     sljit_s32 size;


     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
     compiler->local_size = ((size + local_size + 7) & ~7) - size;
@@ -1219,11 +1230,11 @@
     case SLJIT_DIV_UW:
     case SLJIT_DIV_SW:
         SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
-        SLJIT_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 12);
+        SLJIT_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 3);


         saved_reg_count = 0;
         if (compiler->scratches >= 4)
-            saved_reg_list[saved_reg_count++] = 12;
+            saved_reg_list[saved_reg_count++] = 3;
         if (compiler->scratches >= 3)
             saved_reg_list[saved_reg_count++] = 2;
         if (op >= SLJIT_DIV_UW)
@@ -1448,7 +1459,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg << 1;
+    return (freg_map[reg] << 1);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -1798,7 +1809,6 @@
     set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
     type &= 0xff;


-    /* In ARM, we don't need to touch the arguments. */
     PTR_FAIL_IF(emit_imm32_const(compiler, TMP_REG1, 0));
     if (type < SLJIT_JUMP) {
         jump->flags |= IS_COND;
@@ -1818,6 +1828,241 @@
     return jump;
 }


+#ifdef __SOFTFP__
+/* Argument marshalling for calls when __SOFTFP__ is defined.
+ *
+ * arg_types holds the return type in its lowest SLJIT_DEF_SHIFT bits followed
+ * by one type field per argument. Each argument is assigned a byte offset in
+ * a contiguous argument area (F32: 4 bytes, F64: 8 bytes on an 8 byte
+ * boundary, anything else: one sljit_sw). Offsets below 16 are served with
+ * register moves, larger ones are written relative to SLJIT_SP after the
+ * stack pointer has been lowered.
+ * NOTE(review): the 16 byte limit presumably corresponds to the four core
+ * argument registers of the ARM procedure call standard - confirm.
+ *
+ * src may be NULL. When it points to a register holding the call target and
+ * that register takes part in the shuffling, *src is rewritten so that it
+ * still names a register containing the target afterwards.
+ *
+ * Returns SLJIT_SUCCESS when the end is reached; every emitter call is wrapped
+ * in FAIL_IF, which is expected to return early on failure.
+ */
+static sljit_s32 softfloat_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src)
+{
+    sljit_s32 stack_offset = 0;
+    sljit_s32 arg_count = 0;
+    sljit_s32 word_arg_offset = 0;
+    sljit_s32 float_arg_count = 0;
+    sljit_s32 types = 0;
+    sljit_s32 src_offset = 4 * sizeof(sljit_sw);
+    sljit_u8 offsets[4];
+    /* src_offset keeps its initial value (4 words, never equal to an offset below 16) unless the target is in a register. */
+    if (src && FAST_IS_REG(*src))
+        src_offset = reg_map[*src] * sizeof(sljit_sw);
+    /* Drop the return type field. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+    /* First pass: assign an offset to every argument and collect the type fields in reversed order in 'types'. */
+    while (arg_types) {
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK);
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_f32);
+            arg_count++;
+            float_arg_count++;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_offset & 0x7)
+                stack_offset += sizeof(sljit_sw);
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_f64);
+            arg_count++;
+            float_arg_count++;
+            break;
+        default:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_sw);
+            arg_count++;
+            word_arg_offset += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+    /* Reserve stack space, rounded up to a multiple of 8, for the part of the area beyond the first 16 bytes. */
+    if (stack_offset > 16)
+        FAIL_IF(push_inst16(compiler, SUB_SP | (((stack_offset - 16) + 0x7) & ~0x7) >> 2));
+    /* NOTE(review): the MOV encodings below (4 | (1 << 7)) appear to hard-code register 12 as destination, hence the assert - confirm. */
+    SLJIT_ASSERT(reg_map[TMP_REG1] == 12);
+
+    /* Process arguments in reversed direction. */
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            arg_count--;
+            float_arg_count--;
+            stack_offset = offsets[arg_count];
+            /* Below 16: register transfer (saving the call target to TMP_REG1 first if its register is the destination); otherwise store to the stack. */
+            if (stack_offset < 16) {
+                if (src_offset == stack_offset) {
+                    FAIL_IF(push_inst16(compiler, MOV | (src_offset << 1) | 4 | (1 << 7)));
+                    *src = TMP_REG1;
+                }
+                FAIL_IF(push_inst32(compiler, VMOV | 0x100000 | (float_arg_count << 16) | (stack_offset << 10)));
+            } else
+                FAIL_IF(push_inst32(compiler, VSTR_F32 | 0x800000 | RN4(SLJIT_SP) | (float_arg_count << 12) | ((stack_offset - 16) >> 2)));
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            arg_count--;
+            float_arg_count--;
+            stack_offset = offsets[arg_count];
+
+            SLJIT_ASSERT((stack_offset & 0x7) == 0);
+            /* A double occupies two consecutive word slots, so the call target is checked against both of them. */
+            if (stack_offset < 16) {
+                if (src_offset == stack_offset || src_offset == stack_offset + sizeof(sljit_sw)) {
+                    FAIL_IF(push_inst16(compiler, MOV | (src_offset << 1) | 4 | (1 << 7)));
+                    *src = TMP_REG1;
+                }
+                FAIL_IF(push_inst32(compiler, VMOV2 | 0x100000 | (stack_offset << 10) | ((stack_offset + sizeof(sljit_sw)) << 14) | float_arg_count));
+            } else
+                FAIL_IF(push_inst32(compiler, VSTR_F32 | 0x800100 | RN4(SLJIT_SP) | (float_arg_count << 12) | ((stack_offset - 16) >> 2)));
+            break;
+        default:
+            arg_count--;
+            word_arg_offset -= sizeof(sljit_sw);
+            stack_offset = offsets[arg_count];
+
+            SLJIT_ASSERT(stack_offset >= word_arg_offset);
+            /* word_arg_offset counts word arguments only, stack_offset counts every argument; they differ when a float argument precedes this one. */
+            if (stack_offset != word_arg_offset) {
+                if (stack_offset < 16) {
+                    if (src_offset == stack_offset) {
+                        FAIL_IF(push_inst16(compiler, MOV | (src_offset << 1) | 4 | (1 << 7)));
+                        *src = TMP_REG1;
+                    }
+                    else if (src_offset == word_arg_offset) {
+                        *src = 1 + (stack_offset >> 2);
+                        src_offset = stack_offset;
+                    }
+                    FAIL_IF(push_inst16(compiler, MOV | (stack_offset >> 2) | (word_arg_offset << 1)));
+                } else
+                    FAIL_IF(push_inst16(compiler, STR_SP | (word_arg_offset << 6) | ((stack_offset - 16) >> 2)));
+            }
+            break;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+/* Code emitted after a call when __SOFTFP__ is defined.
+ *
+ * Emits one transfer instruction when the return type (lowest field of
+ * arg_types) is F32 or F64, then recomputes the size of the argument area and
+ * emits ADD_SP when it exceeded 16 bytes.
+ *
+ * Returns SLJIT_SUCCESS or the result of the final push_inst16.
+ */
+static sljit_s32 softfloat_post_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+    sljit_s32 stack_size = 0;
+    /* NOTE(review): VMOV / VMOV2 presumably copy the returned value from core
+     * registers into a floating point register - confirm against the encodings. */
+    if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F32)
+        FAIL_IF(push_inst32(compiler, VMOV | (0 << 16) | (0 << 12)));
+    if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F64)
+        FAIL_IF(push_inst32(compiler, VMOV2 | (1 << 16) | (0 << 12) | 0));
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+    /* Same size computation as the first loop of softfloat_call_with_args. */
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            stack_size += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_size & 0x7)
+                stack_size += sizeof(sljit_sw);
+            stack_size += sizeof(sljit_f64);
+            break;
+        default:
+            stack_size += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+    /* Up to 16 bytes no stack space was reserved before the call. */
+    if (stack_size <= 16)
+        return SLJIT_SUCCESS;
+    /* Same rounding as the SUB_SP emitted by softfloat_call_with_args. */
+    return push_inst16(compiler, ADD_SP | ((((stack_size - 16) + 0x7) & ~0x7) >> 2));
+}
+
+#else
+/* Argument shuffling for calls when __SOFTFP__ is not defined.
+ *
+ * Only the F32 / F64 entries of arg_types are acted upon; other argument
+ * types are skipped. 'remap' is a bitmask of destination slots that are
+ * already taken: an F32 gets the lowest free bit, an F64 the lowest free
+ * aligned pair of bits. 'offset' advances by 2 for every floating point
+ * argument. A VMOV_F32 is emitted only when offset and the chosen slot differ.
+ * NOTE(review): the slots presumably denote single precision register
+ * positions as assigned by the hard-float procedure call standard - confirm.
+ *
+ * Returns SLJIT_SUCCESS when the end is reached; emitter calls are wrapped in
+ * FAIL_IF.
+ */
+static sljit_s32 hardfloat_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+    sljit_u32 remap = 0;
+    sljit_u32 offset = 0;
+    sljit_u32 new_offset, mask;
+
+    /* Remove return value. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F32) {
+            new_offset = 0;
+            mask = 1;
+            /* Search the lowest slot whose bit is still clear. */
+            while (remap & mask) {
+                new_offset++;
+                mask <<= 1;
+            }
+            remap |= mask;
+            /* Bit 0 of new_offset selects the 0x400000 flag, the remaining bits select the register operand. */
+            if (offset != new_offset)
+                FAIL_IF(push_inst32(compiler, VMOV_F32 | DD4((new_offset >> 1) + 1)
+                    | ((new_offset & 0x1) ? 0x400000 : 0) | DM4((offset >> 1) + 1)));
+
+            offset += 2;
+        }
+        else if ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F64) {
+            new_offset = 0;
+            mask = 3;
+            /* Search the lowest aligned pair of slots with both bits clear. */
+            while (remap & mask) {
+                new_offset += 2;
+                mask <<= 2;
+            }
+            remap |= mask;
+
+            if (offset != new_offset)
+                FAIL_IF(push_inst32(compiler, VMOV_F32 | SLJIT_F32_OP | DD4((new_offset >> 1) + 1) | DM4((offset >> 1) + 1)));
+
+            offset += 2;
+        }
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+#endif
+/* Emits a call whose target is supplied through the returned jump object.
+ *
+ * The arguments described by arg_types are first arranged by
+ * softfloat_call_with_args or hardfloat_call_with_args (selected by
+ * __SOFTFP__), then the call itself is produced by sljit_emit_jump. In the
+ * __SOFTFP__ build softfloat_post_call_with_args is emitted after the call.
+ *
+ * Returns the jump created by sljit_emit_jump, or the value produced by the
+ * PTR_FAIL_IF / CHECK_*_PTR macros on failure (presumably NULL - confirm).
+ */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+#ifdef __SOFTFP__
+    struct sljit_jump *jump;
+#endif
+
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+#ifdef __SOFTFP__
+    PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL));
+    /* NOTE(review): skip_checks presumably keeps the nested sljit_emit_jump from repeating argument checks / verbose output - confirm. */
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    jump = sljit_emit_jump(compiler, type);
+    PTR_FAIL_IF(jump == NULL);
+    /* Return value transfer and stack release are emitted after the call instruction. */
+    PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
+    return jump;
+#else
+    PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_jump(compiler, type);
+#endif
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     struct sljit_jump *jump;
@@ -1826,10 +2071,13 @@
     CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    /* In ARM, we don't need to touch the arguments. */
+    SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
+
     if (!(src & SLJIT_IMM)) {
-        if (FAST_IS_REG(src))
+        if (FAST_IS_REG(src)) {
+            SLJIT_ASSERT(reg_map[src] != 14);
             return push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(src));
+        }


         FAIL_IF(emit_op_mem(compiler, WORD_SIZE, type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, src, srcw, TMP_REG1));
         if (type >= SLJIT_FAST_CALL)
@@ -1836,6 +2084,7 @@
             return push_inst16(compiler, BLX | RN3(TMP_REG1));
     }


+    /* These jumps are converted to jump/call instructions when possible. */
     jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
     FAIL_IF(!jump);
     set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_BL : 0));
@@ -1846,6 +2095,41 @@
     return push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(TMP_REG1));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+    /* Indirect call: the arguments described by arg_types are arranged first,
+     * then the call to src / srcw is emitted through sljit_emit_ijump. */
+#ifdef __SOFTFP__
+    if (src & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
+        src = TMP_REG1;
+    }
+    /* src is passed by address: the helper may replace it when the register holding the target is used for an argument. */
+    FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src));
+    /* NOTE(review): skip_checks presumably keeps the nested sljit_emit_ijump from repeating argument checks / verbose output - confirm. */
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+    /* Return value transfer and stack release are emitted after the call instruction. */
+    return softfloat_post_call_with_args(compiler, arg_types);
+#else /* !__SOFTFP__ */
+    FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+#endif /* __SOFTFP__ */
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)
@@ -1896,8 +2180,6 @@
         return SLJIT_SUCCESS;


     /* The condition must always be set, even if the ORR/EORI is not executed above. */
-    if (reg_map[dst_r] <= 7)
-        return push_inst16(compiler, MOVS | RD3(TMP_REG1) | RN3(dst_r));
     return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(TMP_REG1) | RM4(dst_r));
 }


@@ -1924,8 +2206,8 @@
     if (tmp < 0x10000) {
         /* set low 16 bits, set hi 16 bits to 0. */
         FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
-        return push_inst32(compiler, MOVW | RD4(dst_reg) |
-            COPY_BITS(tmp, 12, 16, 4) | COPY_BITS(tmp, 11, 26, 1) | COPY_BITS(tmp, 8, 12, 3) | (tmp & 0xff));
+        return push_inst32(compiler, MOVW | RD4(dst_reg)
+            | COPY_BITS(tmp, 12, 16, 4) | COPY_BITS(tmp, 11, 26, 1) | COPY_BITS(tmp, 8, 12, 3) | (tmp & 0xff));
     }


     tmp = get_imm(srcw);
@@ -1943,10 +2225,10 @@
     FAIL_IF(push_inst16(compiler, IT | (cc << 4) | ((cc & 0x1) << 3) | 0x4));


     tmp = (sljit_uw) srcw;
-    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst_reg) |
-        COPY_BITS(tmp, 12, 16, 4) | COPY_BITS(tmp, 11, 26, 1) | COPY_BITS(tmp, 8, 12, 3) | (tmp & 0xff)));
-    return push_inst32(compiler, MOVT | RD4(dst_reg) |
-        COPY_BITS(tmp, 12 + 16, 16, 4) | COPY_BITS(tmp, 11 + 16, 26, 1) | COPY_BITS(tmp, 8 + 16, 12, 3) | ((tmp & 0xff0000) >> 16));
+    FAIL_IF(push_inst32(compiler, MOVW | RD4(dst_reg)
+        | COPY_BITS(tmp, 12, 16, 4) | COPY_BITS(tmp, 11, 26, 1) | COPY_BITS(tmp, 8, 12, 3) | (tmp & 0xff)));
+    return push_inst32(compiler, MOVT | RD4(dst_reg)
+        | COPY_BITS(tmp, 12 + 16, 16, 4) | COPY_BITS(tmp, 11 + 16, 26, 1) | COPY_BITS(tmp, 8 + 16, 12, 3) | ((tmp & 0xff0000) >> 16));
 }


SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)

Modified: code/trunk/sljit/sljitNativeMIPS_32.c
===================================================================
--- code/trunk/sljit/sljitNativeMIPS_32.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeMIPS_32.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -435,3 +435,232 @@
     inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
     SLJIT_CACHE_FLUSH(inst, inst + 2);
 }
+/* Argument marshalling for the sljit_emit_call / sljit_emit_icall functions
+ * that follow.
+ *
+ * Pass 1 assigns every argument a byte offset in an argument area (F32: 4
+ * bytes, F64: 8 bytes on an 8 byte boundary, others: one sljit_sw). A float
+ * argument in one of the first two positions with no word argument before it
+ * gets the marker offset 254 + position instead.
+ * NOTE(review): the markers presumably identify the arguments which the
+ * calling convention passes in floating point registers - confirm.
+ * Pass 2 (reversed order) stores unmarked float arguments to the stack and
+ * stores or moves word arguments.
+ * Pass 3 (reversed order) copies the float marked 254 from SLJIT_FR0 to
+ * TMP_FREG1 and loads float arguments with an offset below 16 from the stack
+ * into the absolute registers 4 + offset / 4.
+ *
+ * Emission lags by one instruction: the last instruction generated is not
+ * pushed but returned through ins_ptr (NOP when nothing is pending). The
+ * callers in this hunk push it right after their JALR.
+ *
+ * Returns SLJIT_SUCCESS when the end is reached; emitter calls are wrapped in
+ * FAIL_IF.
+ */
+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_ins *ins_ptr)
+{
+    sljit_s32 stack_offset = 0;
+    sljit_s32 arg_count = 0;
+    sljit_s32 float_arg_count = 0;
+    sljit_s32 word_arg_count = 0;
+    sljit_s32 types = 0;
+    sljit_s32 arg_count_save, types_save;
+    sljit_ins prev_ins = NOP;
+    sljit_ins ins = NOP;
+    sljit_u8 offsets[4];
+    /* The code below relies on these fixed mappings of the temporary registers. */
+    SLJIT_ASSERT(reg_map[TMP_REG3] == 4 && freg_map[TMP_FREG1] == 12);
+    /* Drop the return type field. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+    /* Pass 1: offsets, counters, and 'types' = type fields in reversed order. */
+    while (arg_types) {
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK);
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+
+            if (word_arg_count == 0 && arg_count <= 1)
+                offsets[arg_count] = 254 + arg_count;
+
+            stack_offset += sizeof(sljit_f32);
+            arg_count++;
+            float_arg_count++;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_offset & 0x7)
+                stack_offset += sizeof(sljit_sw);
+            offsets[arg_count] = (sljit_u8)stack_offset;
+
+            if (word_arg_count == 0 && arg_count <= 1)
+                offsets[arg_count] = 254 + arg_count;
+
+            stack_offset += sizeof(sljit_f64);
+            arg_count++;
+            float_arg_count++;
+            break;
+        default:
+            offsets[arg_count] = (sljit_u8)stack_offset;
+            stack_offset += sizeof(sljit_sw);
+            arg_count++;
+            word_arg_count++;
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    /* Stack is aligned to 16 bytes, max two doubles can be placed on the stack. */
+    if (stack_offset > 16)
+        FAIL_IF(push_inst(compiler, ADDIU | S(SLJIT_SP) | T(SLJIT_SP) | IMM(-16), DR(SLJIT_SP)));
+    /* Pass 2 consumes 'types' and arg_count; keep copies for pass 3. */
+    types_save = types;
+    arg_count_save = arg_count;
+
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            arg_count--;
+            if (offsets[arg_count] < 254)
+                ins = SWC1 | S(SLJIT_SP) | FT(float_arg_count) | IMM(offsets[arg_count]);
+            float_arg_count--;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            arg_count--;
+            if (offsets[arg_count] < 254)
+                ins = SDC1 | S(SLJIT_SP) | FT(float_arg_count) | IMM(offsets[arg_count]);
+            float_arg_count--;
+            break;
+        default:
+            if (offsets[arg_count - 1] >= 16)
+                ins = SW | S(SLJIT_SP) | T(word_arg_count) | IMM(offsets[arg_count - 1]);
+            else if (arg_count != word_arg_count)
+                ins = ADDU | S(word_arg_count) | TA(0) | DA(4 + (offsets[arg_count - 1] >> 2));
+            else if (arg_count == 1)
+                ins = ADDU | S(SLJIT_R0) | TA(0) | D(TMP_REG3);
+
+            arg_count--;
+            word_arg_count--;
+            break;
+        }
+        /* Push the previously generated instruction and keep the new one pending. */
+        if (ins != NOP) {
+            if (prev_ins != NOP)
+                FAIL_IF(push_inst(compiler, prev_ins, MOVABLE_INS));
+            prev_ins = ins;
+            ins = NOP;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    /* Pass 3: walk the arguments again, in the same reversed order. */
+    types = types_save;
+    arg_count = arg_count_save;
+
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            arg_count--;
+            if (offsets[arg_count] == 254)
+                ins = MOV_S | FMT_S | FS(SLJIT_FR0) | FD(TMP_FREG1);
+            else if (offsets[arg_count] < 16)
+                ins = LW | S(SLJIT_SP) | TA(4 + (offsets[arg_count] >> 2)) | IMM(offsets[arg_count]);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            arg_count--;
+            if (offsets[arg_count] == 254)
+                ins = MOV_S | FMT_D | FS(SLJIT_FR0) | FD(TMP_FREG1);
+            else if (offsets[arg_count] < 16) {
+                if (prev_ins != NOP)
+                    FAIL_IF(push_inst(compiler, prev_ins, MOVABLE_INS));
+                prev_ins = LW | S(SLJIT_SP) | TA(4 + (offsets[arg_count] >> 2)) | IMM(offsets[arg_count]);
+                ins = LW | S(SLJIT_SP) | TA(5 + (offsets[arg_count] >> 2)) | IMM(offsets[arg_count] + sizeof(sljit_sw));
+            }
+            break;
+        default:
+            arg_count--;
+            break;
+        }
+
+        if (ins != NOP) {
+            if (prev_ins != NOP)
+                FAIL_IF(push_inst(compiler, prev_ins, MOVABLE_INS));
+            prev_ins = ins;
+            ins = NOP;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    /* Hand the still pending instruction (possibly NOP) to the caller. */
+    *ins_ptr = prev_ins;
+
+    return SLJIT_SUCCESS;
+}
+/* Code emitted after a call: recomputes the size of the argument area from
+ * arg_types and, when it exceeded 16 bytes, adds 16 to SLJIT_SP again (the
+ * counterpart of the ADDIU with -16 in call_with_args).
+ *
+ * Returns SLJIT_SUCCESS or the result of push_inst.
+ */
+static sljit_s32 post_call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+    sljit_s32 stack_offset = 0;
+    /* Drop the return type field. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+    /* Same size computation as the first loop of call_with_args. */
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            stack_offset += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (stack_offset & 0x7)
+                stack_offset += sizeof(sljit_sw);
+            stack_offset += sizeof(sljit_f64);
+            break;
+        default:
+            stack_offset += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    /* Stack is aligned to 16 bytes, max two doubles can be placed on the stack. */
+    if (stack_offset > 16)
+        return push_inst(compiler, ADDIU | S(SLJIT_SP) | T(SLJIT_SP) | IMM(16), DR(SLJIT_SP));
+
+    return SLJIT_SUCCESS;
+}
+/* Emits a call whose target is supplied through the returned jump object.
+ *
+ * Sequence: argument setup (call_with_args), a constant load of 0 into
+ * PIC_ADDR_REG, JALR through PIC_ADDR_REG, the instruction left pending by
+ * call_with_args, and finally post_call_with_args.
+ *
+ * Returns the jump object, or the value produced by the PTR_FAIL_IF /
+ * CHECK_*_PTR macros on failure (presumably NULL - confirm).
+ */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    struct sljit_jump *jump;
+    sljit_ins ins;
+
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    PTR_FAIL_IF(!jump);
+    set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
+    type &= 0xff;
+    /* 'ins' receives the last argument instruction, which is pushed after the JALR below. */
+    PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins));
+
+    SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
+    /* NOTE(review): the 0 is presumably a placeholder replaced with the real target when the jump is resolved - confirm. */
+    PTR_FAIL_IF(emit_const(compiler, PIC_ADDR_REG, 0));
+
+    jump->flags |= IS_JAL | IS_CALL;
+    PTR_FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
+    jump->addr = compiler->size;
+    PTR_FAIL_IF(push_inst(compiler, ins, UNMOVABLE_INS));
+
+    PTR_FAIL_IF(post_call_with_args(compiler, arg_types));
+
+    return jump;
+}
+/* Emits an indirect call to the address given by src / srcw.
+ *
+ * The target is first brought into PIC_ADDR_REG (immediate load, register
+ * move or memory load), then the arguments are set up by call_with_args and
+ * the call is made with JALR. The instruction left pending by call_with_args
+ * is pushed right after the JALR, followed by post_call_with_args.
+ *
+ * Returns the result of post_call_with_args, or an early return from the
+ * CHECK / FAIL_IF macros.
+ */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    sljit_ins ins;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+    SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
+    /* The target is fetched before call_with_args emits its argument moves. */
+    if (src & SLJIT_IMM)
+        FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
+    else if (FAST_IS_REG(src))
+        FAIL_IF(push_inst(compiler, ADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+    else if (src & SLJIT_MEM) {
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+    }
+
+    FAIL_IF(call_with_args(compiler, arg_types, &ins));
+
+    /* Register input. */
+    FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
+    FAIL_IF(push_inst(compiler, ins, UNMOVABLE_INS));
+    return post_call_with_args(compiler, arg_types);
+}


Modified: code/trunk/sljit/sljitNativeMIPS_64.c
===================================================================
--- code/trunk/sljit/sljitNativeMIPS_64.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeMIPS_64.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -537,3 +537,132 @@
     inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
     SLJIT_CACHE_FLUSH(inst, inst + 6);
 }
+/* Argument marshalling for the sljit_emit_call / sljit_emit_icall functions
+ * that follow.
+ *
+ * The first loop counts all, word and float arguments and collects the type
+ * fields in reversed order. The second loop walks the arguments from last to
+ * first: an argument whose overall position differs from its position within
+ * its own class (float or word) is moved from the register indexed by the
+ * class position to the register indexed by the overall position; the first
+ * argument is moved from SLJIT_FR0 to TMP_FREG1 (float) or from SLJIT_R0 to
+ * TMP_REG3 (word).
+ *
+ * Emission lags by one instruction: the last instruction generated is not
+ * pushed but returned through ins_ptr (NOP when nothing is pending). The
+ * callers in this hunk push it right after their JALR.
+ *
+ * Returns SLJIT_SUCCESS when the end is reached; emitter calls are wrapped in
+ * FAIL_IF.
+ */
+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_ins *ins_ptr)
+{
+    sljit_s32 arg_count = 0;
+    sljit_s32 word_arg_count = 0;
+    sljit_s32 float_arg_count = 0;
+    sljit_s32 types = 0;
+    sljit_ins prev_ins = NOP;
+    sljit_ins ins = NOP;
+    /* The code below relies on these fixed mappings of the temporary registers. */
+    SLJIT_ASSERT(reg_map[TMP_REG3] == 4 && freg_map[TMP_FREG1] == 12);
+    /* Drop the return type field. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK);
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+        case SLJIT_ARG_TYPE_F64:
+            arg_count++;
+            float_arg_count++;
+            break;
+        default:
+            arg_count++;
+            word_arg_count++;
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+    /* The counters now refer to the last argument and are decremented while walking backwards. */
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            if (arg_count != float_arg_count)
+                ins = MOV_S | FMT_S | FS(float_arg_count) | FD(arg_count);
+            else if (arg_count == 1)
+                ins = MOV_S | FMT_S | FS(SLJIT_FR0) | FD(TMP_FREG1);
+            arg_count--;
+            float_arg_count--;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (arg_count != float_arg_count)
+                ins = MOV_S | FMT_D | FS(float_arg_count) | FD(arg_count);
+            else if (arg_count == 1)
+                ins = MOV_S | FMT_D | FS(SLJIT_FR0) | FD(TMP_FREG1);
+            arg_count--;
+            float_arg_count--;
+            break;
+        default:
+            if (arg_count != word_arg_count)
+                ins = DADDU | S(word_arg_count) | TA(0) | D(arg_count);
+            else if (arg_count == 1)
+                ins = DADDU | S(SLJIT_R0) | TA(0) | D(TMP_REG3);
+            arg_count--;
+            word_arg_count--;
+            break;
+        }
+        /* Push the previously generated instruction and keep the new one pending. */
+        if (ins != NOP) {
+            if (prev_ins != NOP)
+                FAIL_IF(push_inst(compiler, prev_ins, MOVABLE_INS));
+            prev_ins = ins;
+            ins = NOP;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+    /* Hand the still pending instruction (possibly NOP) to the caller. */
+    *ins_ptr = prev_ins;
+
+    return SLJIT_SUCCESS;
+}
+/* Emits a call whose target is supplied through the returned jump object.
+ *
+ * Sequence: argument setup (call_with_args), a constant load of 0 into
+ * PIC_ADDR_REG, JALR through PIC_ADDR_REG, then the instruction left pending
+ * by call_with_args.
+ *
+ * Returns the jump object, or the value produced by the PTR_FAIL_IF /
+ * CHECK_*_PTR macros on failure (presumably NULL - confirm).
+ */
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    struct sljit_jump *jump;
+    sljit_ins ins;
+
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+    jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+    PTR_FAIL_IF(!jump);
+    set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
+    type &= 0xff;
+    /* 'ins' receives the last argument instruction, which is pushed after the JALR below. */
+    PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins));
+
+    SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
+    /* NOTE(review): the 0 is presumably a placeholder replaced with the real target when the jump is resolved - confirm. */
+    PTR_FAIL_IF(emit_const(compiler, PIC_ADDR_REG, 0));
+
+    jump->flags |= IS_JAL | IS_CALL;
+    PTR_FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
+    jump->addr = compiler->size;
+    PTR_FAIL_IF(push_inst(compiler, ins, UNMOVABLE_INS));
+
+    return jump;
+}
+/* Emits an indirect call to the address given by src / srcw.
+ *
+ * The target is first brought into PIC_ADDR_REG (immediate load, register
+ * move or memory load), then the arguments are set up by call_with_args and
+ * the call is made with JALR. The instruction left pending by call_with_args
+ * is pushed right after the JALR.
+ *
+ * Returns the result of the final push_inst, or an early return from the
+ * CHECK / FAIL_IF macros.
+ */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    sljit_ins ins;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+    SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
+    /* The target is fetched before call_with_args emits its argument moves. */
+    if (src & SLJIT_IMM)
+        FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
+    else if (FAST_IS_REG(src))
+        FAIL_IF(push_inst(compiler, DADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+    else if (src & SLJIT_MEM) {
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+    }
+
+    FAIL_IF(call_with_args(compiler, arg_types, &ins));
+
+    /* Register input. */
+    FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
+    return push_inst(compiler, ins, UNMOVABLE_INS);
+}


Modified: code/trunk/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/trunk/sljit/sljitNativeMIPS_common.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeMIPS_common.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -60,13 +60,27 @@
 #define EQUAL_FLAG    31
 #define OTHER_FLAG    1


-#define TMP_FREG1    (0)
-#define TMP_FREG2    ((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) << 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
     0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 3, 25, 4
 };


+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+    0, 0, 14, 2, 4, 6, 8, 12, 10
+};
+
+#else
+
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+    0, 0, 13, 14, 15, 16, 17, 12, 18
+};
+
+#endif
+
 /* --------------------------------------------------------------------- */
 /*  Instrucion forms                                                     */
 /* --------------------------------------------------------------------- */
@@ -74,21 +88,23 @@
 #define S(s)        (reg_map[s] << 21)
 #define T(t)        (reg_map[t] << 16)
 #define D(d)        (reg_map[d] << 11)
+#define FT(t)        (freg_map[t] << 16)
+#define FS(s)        (freg_map[s] << 11)
+#define FD(d)        (freg_map[d] << 6)
 /* Absolute registers. */
 #define SA(s)        ((s) << 21)
 #define TA(t)        ((t) << 16)
 #define DA(d)        ((d) << 11)
-#define FT(t)        ((t) << 16)
-#define FS(s)        ((s) << 11)
-#define FD(d)        ((d) << 6)
 #define IMM(imm)    ((imm) & 0xffff)
 #define SH_IMM(imm)    ((imm) << 6)


 #define DR(dr)        (reg_map[dr])
+#define FR(dr)        (freg_map[dr])
 #define HI(opcode)    ((opcode) << 26)
 #define LO(opcode)    (opcode)
 /* S = (16 << 21) D = (17 << 21) */
 #define FMT_S        (16 << 21)
+#define FMT_D        (17 << 21)


 #define ABS_S        (HI(17) | FMT_S | LO(5))
 #define ADD_S        (HI(17) | FMT_S | LO(0))
@@ -153,6 +169,7 @@
 #define OR        (HI(0) | LO(37))
 #define ORI        (HI(13))
 #define SD        (HI(63))
+#define SDC1        (HI(61))
 #define SLT        (HI(0) | LO(42))
 #define SLTI        (HI(10))
 #define SLTIU        (HI(11))
@@ -166,6 +183,7 @@
 #define SUB_S        (HI(17) | FMT_S | LO(1))
 #define SUBU        (HI(0) | LO(35))
 #define SW        (HI(43))
+#define SWC1        (HI(57))
 #define TRUNC_W_S    (HI(17) | FMT_S | LO(13))
 #define XOR        (HI(0) | LO(38))
 #define XORI        (HI(14))
@@ -564,6 +582,8 @@
 #define STACK_LOAD    LD
 #endif


+static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw);
+
#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
#include "sljitNativeMIPS_32.c"
#else
@@ -571,15 +591,15 @@
#endif

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     sljit_ins base;
-    sljit_s32 i, tmp, offs;
+    sljit_s32 args, i, tmp, offs;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1) + SLJIT_LOCALS_OFFSET;
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -616,6 +636,8 @@
         FAIL_IF(push_inst(compiler, STACK_STORE | base | T(i) | IMM(offs), MOVABLE_INS));
     }


+    args = get_arg_count(arg_types);
+
     if (args >= 1)
         FAIL_IF(push_inst(compiler, ADDU_W | SA(4) | TA(0) | D(SLJIT_S0), DR(SLJIT_S0)));
     if (args >= 2)
@@ -627,12 +649,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1) + SLJIT_LOCALS_OFFSET;
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1298,7 +1320,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg << 1;
+    return FR(reg);
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -1328,11 +1350,9 @@
#endif

     if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src, srcw, dst, dstw));
         src = TMP_FREG1;
     }
-    else
-        src <<= 1;


     FAIL_IF(push_inst(compiler, (TRUNC_W_S ^ (flags >> 19)) | FMT(op) | FS(src) | FD(TMP_FREG1), MOVABLE_INS));


@@ -1340,7 +1360,7 @@
         return push_inst(compiler, MFC1 | flags | T(dst) | FS(TMP_FREG1), MOVABLE_INS);


     /* Store the integer value from a VFP register. */
-    return emit_op_mem2(compiler, flags ? DOUBLE_DATA : SINGLE_DATA, TMP_FREG1, dst, dstw, 0, 0);
+    return emit_op_mem2(compiler, flags ? DOUBLE_DATA : SINGLE_DATA, FR(TMP_FREG1), dst, dstw, 0, 0);


 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 #    undef is_long
@@ -1357,13 +1377,13 @@
     sljit_s32 flags = (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW) << 21;
 #endif


-    sljit_s32 dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;
+    sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (FAST_IS_REG(src))
         FAIL_IF(push_inst(compiler, MTC1 | flags | T(src) | FS(TMP_FREG1), MOVABLE_INS));
     else if (src & SLJIT_MEM) {
         /* Load the integer value into a VFP register. */
-        FAIL_IF(emit_op_mem2(compiler, ((flags) ? DOUBLE_DATA : SINGLE_DATA) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem2(compiler, ((flags) ? DOUBLE_DATA : SINGLE_DATA) | LOAD_DATA, FR(TMP_FREG1), src, srcw, dst, dstw));
     }
     else {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -1377,7 +1397,7 @@
     FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | (((op & SLJIT_F32_OP) ^ SLJIT_F32_OP) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));


     if (dst & SLJIT_MEM)
-        return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
+        return emit_op_mem2(compiler, FLOAT_DATA(op), FR(TMP_FREG1), dst, dstw, 0, 0);
     return SLJIT_SUCCESS;


 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1392,18 +1412,14 @@
     sljit_ins inst;


     if (src1 & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
+        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src1, src1w, src2, src2w));
         src1 = TMP_FREG1;
     }
-    else
-        src1 <<= 1;


     if (src2 & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
+        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG2), src2, src2w, 0, 0));
         src2 = TMP_FREG2;
     }
-    else
-        src2 <<= 1;


     switch (GET_FLAG_TYPE(op)) {
     case SLJIT_EQUAL_F64:
@@ -1443,14 +1459,12 @@
     if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32)
         op ^= SLJIT_F32_OP;


-    dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (src & SLJIT_MEM) {
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, dst, dstw));
+        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(dst_r), src, srcw, dst, dstw));
         src = dst_r;
     }
-    else
-        src <<= 1;


     switch (GET_OPCODE(op)) {
     case SLJIT_MOV_F64:
@@ -1474,7 +1488,7 @@
     }


     if (dst & SLJIT_MEM)
-        return emit_op_mem2(compiler, FLOAT_DATA(op), dst_r, dst, dstw, 0, 0);
+        return emit_op_mem2(compiler, FLOAT_DATA(op), FR(dst_r), dst, dstw, 0, 0);
     return SLJIT_SUCCESS;
 }


@@ -1494,42 +1508,38 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG2;


     if (src1 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
+        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src1, src1w)) {
             FAIL_IF(compiler->error);
             src1 = TMP_FREG1;
         } else
             flags |= SLOW_SRC1;
     }
-    else
-        src1 <<= 1;


     if (src2 & SLJIT_MEM) {
-        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
+        if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG2), src2, src2w)) {
             FAIL_IF(compiler->error);
             src2 = TMP_FREG2;
         } else
             flags |= SLOW_SRC2;
     }
-    else
-        src2 <<= 1;


     if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
         if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, src1, src1w));
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
+            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG2), src2, src2w, src1, src1w));
+            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src1, src1w, dst, dstw));
         }
         else {
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
-            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
+            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src1, src1w, src2, src2w));
+            FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG2), src2, src2w, dst, dstw));
         }
     }
     else if (flags & SLOW_SRC1)
-        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
+        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG1), src1, src1w, dst, dstw));
     else if (flags & SLOW_SRC2)
-        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
+        FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, FR(TMP_FREG2), src2, src2w, dst, dstw));


     if (flags & SLOW_SRC1)
         src1 = TMP_FREG1;
@@ -1555,7 +1565,7 @@
     }


     if (dst_r == TMP_FREG2)
-        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG2, dst, dstw, 0, 0));
+        FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), FR(TMP_FREG2), dst, dstw, 0, 0));


     return SLJIT_SUCCESS;
 }
@@ -1705,19 +1715,16 @@
         PTR_FAIL_IF(push_inst(compiler, inst, UNMOVABLE_INS));


     PTR_FAIL_IF(emit_const(compiler, TMP_REG2, 0));
-    if (type <= SLJIT_JUMP) {
+
+    if (type <= SLJIT_JUMP)
         PTR_FAIL_IF(push_inst(compiler, JR | S(TMP_REG2), UNMOVABLE_INS));
-        jump->addr = compiler->size;
-        PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-    } else {
-        SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
-        /* Cannot be optimized out if type is >= CALL0. */
-        jump->flags |= IS_JAL | (type >= SLJIT_CALL0 ? IS_CALL : 0);
+    else {
+        jump->flags |= IS_JAL;
         PTR_FAIL_IF(push_inst(compiler, JALR | S(TMP_REG2) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
-        jump->addr = compiler->size;
-        /* A NOP if type < CALL1. */
-        PTR_FAIL_IF(push_inst(compiler, ADDU_W | S(SLJIT_R0) | TA(0) | DA(4), UNMOVABLE_INS));
     }
+
+    jump->addr = compiler->size;
+    PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
     return jump;
 }


@@ -1873,7 +1880,6 @@

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
-    sljit_s32 src_r = TMP_REG2;
     struct sljit_jump *jump = NULL;


     CHECK_ERROR();
@@ -1880,34 +1886,6 @@
     CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
     ADJUST_LOCAL_OFFSET(src, srcw);


-    if (FAST_IS_REG(src)) {
-        if (DR(src) != 4)
-            src_r = src;
-        else
-            FAIL_IF(push_inst(compiler, ADDU_W | S(src) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
-    }
-
-    if (type >= SLJIT_CALL0) {
-        SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
-        if (src & (SLJIT_IMM | SLJIT_MEM)) {
-            if (src & SLJIT_IMM)
-                FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
-            else {
-                SLJIT_ASSERT(src_r == TMP_REG2 && (src & SLJIT_MEM));
-                FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
-            }
-            FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
-            /* We need an extra instruction in any case. */
-            return push_inst(compiler, ADDU_W | S(SLJIT_R0) | TA(0) | DA(4), UNMOVABLE_INS);
-        }
-
-        /* Register input. */
-        if (type >= SLJIT_CALL1)
-            FAIL_IF(push_inst(compiler, ADDU_W | S(SLJIT_R0) | TA(0) | DA(4), 4));
-        FAIL_IF(push_inst(compiler, JALR | S(src_r) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
-        return push_inst(compiler, ADDU_W | S(src_r) | TA(0) | D(PIC_ADDR_REG), UNMOVABLE_INS);
-    }
-
     if (src & SLJIT_IMM) {
         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
         FAIL_IF(!jump);
@@ -1918,11 +1896,14 @@
             jump->flags |= IS_MOVABLE;


         FAIL_IF(emit_const(compiler, TMP_REG2, 0));
+        src = TMP_REG2;
     }
-    else if (src & SLJIT_MEM)
-        FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
+    else if (src & SLJIT_MEM) {
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(TMP_REG2), src, srcw));
+        src = TMP_REG2;
+    }


-    FAIL_IF(push_inst(compiler, JR | S(src_r), UNMOVABLE_INS));
+    FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
     if (jump)
         jump->addr = compiler->size;
     FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));


Modified: code/trunk/sljit/sljitNativePPC_64.c
===================================================================
--- code/trunk/sljit/sljitNativePPC_64.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativePPC_64.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -413,6 +413,61 @@
     return SLJIT_SUCCESS;
 }


+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src) /* Emits the register moves needed before a call whose arg_types mix float and word arguments; may redirect *src to TMP_CALL_REG. Returns SLJIT_SUCCESS unless a FAIL_IF bails out. */
+{
+    sljit_s32 arg_count = 0; /* Counts every argument, float or word. */
+    sljit_s32 word_arg_count = 0; /* Counts only the non-float arguments. */
+    sljit_s32 types = 0; /* The arg_types fields re-packed in reverse order by the first loop. */
+    sljit_s32 reg = 0; /* Stays 0 when src is NULL, so "arg_count == reg" below can never match (arg_count is at least 1 there). */
+
+    if (src)
+        reg = *src & REG_MASK; /* NOTE(review): presumably the register part of the call target operand - confirm. */
+
+    arg_types >>= SLJIT_DEF_SHIFT; /* Drop the lowest field; NOTE(review): presumably the return type - confirm. */
+
+    while (arg_types) { /* Pass 1: count the arguments and protect the call target register. */
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK); /* Push the current field, which reverses the order. */
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+        case SLJIT_ARG_TYPE_F64:
+            arg_count++; /* Float arguments advance only the overall count. */
+            break;
+        default:
+            arg_count++;
+            word_arg_count++;
+
+            if (arg_count != word_arg_count && arg_count == reg) { /* Pass 2 will copy this word argument into register arg_count, which is the call target. */
+                FAIL_IF(push_inst(compiler, OR | S(reg) | A(TMP_CALL_REG) | B(reg))); /* Copy reg into TMP_CALL_REG (OR of reg with itself). */
+                *src = TMP_CALL_REG; /* The caller has to use the copy as the target from now on. */
+            }
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    while (types) { /* Pass 2: walk the arguments from last to first. */
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+        case SLJIT_ARG_TYPE_F64:
+            arg_count--; /* Float arguments only consume a position. */
+            break;
+        default:
+            if (arg_count != word_arg_count) /* arg_count >= word_arg_count, so copying backwards never overwrites a register that is still to be read. */
+                FAIL_IF(push_inst(compiler, OR | S(word_arg_count) | A(arg_count) | B(word_arg_count))); /* Copy register word_arg_count into register arg_count. */
+
+            arg_count--;
+            word_arg_count--;
+            break;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 reg, sljit_sw init_value)
 {
     FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | IMM(init_value >> 48)));


Modified: code/trunk/sljit/sljitNativePPC_common.c
===================================================================
--- code/trunk/sljit/sljitNativePPC_common.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativePPC_common.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -102,13 +102,17 @@
 #define TMP_CALL_REG    TMP_REG2
 #endif


-#define TMP_FREG1    (0)
-#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
     0, 3, 4, 5, 6, 7, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 8, 9, 10, 31, 12
 };


+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { /* Table read by the FD/FS/FA/FB/FC macros and by sljit_get_float_register_index. */
+    0, 1, 2, 3, 4, 5, 6, 0, 7 /* The last two slots are the ones indexed by TMP_FREG1 and TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1 and + 2). */
+};
+
 /* --------------------------------------------------------------------- */
 /*  Instrucion forms                                                     */
 /* --------------------------------------------------------------------- */
@@ -117,11 +121,11 @@
 #define A(a)        (reg_map[a] << 16)
 #define B(b)        (reg_map[b] << 11)
 #define C(c)        (reg_map[c] << 6)
-#define FD(fd)        ((fd) << 21)
-#define FS(fs)        ((fs) << 21)
-#define FA(fa)        ((fa) << 16)
-#define FB(fb)        ((fb) << 11)
-#define FC(fc)        ((fc) << 6)
+#define FD(fd)        (freg_map[fd] << 21)
+#define FS(fs)        (freg_map[fs] << 21)
+#define FA(fa)        (freg_map[fa] << 16)
+#define FB(fb)        (freg_map[fb] << 11)
+#define FC(fc)        (freg_map[fc] << 6)
 #define IMM(imm)    ((imm) & 0xffff)
 #define CRD(d)        ((d) << 21)


@@ -610,14 +614,14 @@
#endif

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 i, tmp, offs;
+    sljit_s32 args, i, tmp, offs;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     FAIL_IF(push_inst(compiler, MFLR | D(0)));
     offs = -(sljit_s32)(sizeof(sljit_sw));
@@ -643,6 +647,9 @@
 #endif


     FAIL_IF(push_inst(compiler, ADDI | D(TMP_ZERO) | A(0) | 0));
+
+    args = get_arg_count(arg_types);
+
     if (args >= 1)
         FAIL_IF(push_inst(compiler, OR | S(SLJIT_R0) | A(SLJIT_S0) | B(SLJIT_R0)));
     if (args >= 2)
@@ -674,12 +681,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1) + SLJIT_LOCALS_OFFSET;
     compiler->local_size = (local_size + 15) & ~0xf;
@@ -1385,21 +1392,26 @@
     if (GET_FLAG_TYPE(op_flags) == SLJIT_OVERFLOW)
         FAIL_IF(push_inst(compiler, MTXER | S(TMP_ZERO)));


+    if (op < SLJIT_NOT && FAST_IS_REG(src) && src == dst) {
+        if (!TYPE_CAST_NEEDED(op))
+            return SLJIT_SUCCESS;
+    }
+
     if (op_flags & SLJIT_I32_OP) {
         if (op < SLJIT_NOT) {
-            if (FAST_IS_REG(src) && src == dst) {
-                if (!TYPE_CAST_NEEDED(op))
-                    return SLJIT_SUCCESS;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+            if (src & SLJIT_MEM) {
+                if (op == SLJIT_MOV_S32)
+                    op = SLJIT_MOV_U32;
+                if (op == SLJIT_MOVU_S32)
+                    op = SLJIT_MOVU_U32;
             }
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-            if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
-                op = SLJIT_MOV_U32;
-            if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
-                op = SLJIT_MOVU_U32;
-            if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
-                op = SLJIT_MOV_S32;
-            if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
-                op = SLJIT_MOVU_S32;
+            else if (src & SLJIT_IMM) {
+                if (op == SLJIT_MOV_U32)
+                    op = SLJIT_MOV_S32;
+                if (op == SLJIT_MOVU_U32)
+                    op = SLJIT_MOVU_S32;
+            }
 #endif
         }
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
@@ -1746,7 +1758,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg;
+    return freg_map[reg];
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -2183,7 +2195,7 @@
         return (4 << 21) | ((4 + 3) << 16);


     default:
-        SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL3);
+        SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_CDECL);
         return (20 << 21);
     }
 }
@@ -2209,7 +2221,7 @@
     if (type < SLJIT_JUMP)
         jump->flags |= IS_COND;
 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
-    if (type >= SLJIT_CALL0)
+    if (type >= SLJIT_CALL)
         jump->flags |= IS_CALL;
 #endif


@@ -2220,6 +2232,24 @@
     return jump;
 }


+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type, /* Emits a call described by arg_types: PPC64-only argument setup, then the jump itself through sljit_emit_jump, whose result is returned. */
+    sljit_s32 arg_types)
+{
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL)); /* NULL src: there is no target register operand to protect. */
+#endif
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1; /* NOTE(review): presumably keeps sljit_emit_jump from repeating the checks done above - confirm. */
+#endif
+
+    return sljit_emit_jump(compiler, type);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     struct sljit_jump *jump = NULL;
@@ -2231,7 +2261,7 @@


     if (FAST_IS_REG(src)) {
 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
-        if (type >= SLJIT_CALL0) {
+        if (type >= SLJIT_CALL) {
             FAIL_IF(push_inst(compiler, OR | S(src) | A(TMP_CALL_REG) | B(src)));
             src_r = TMP_CALL_REG;
         }
@@ -2241,12 +2271,13 @@
         src_r = src;
 #endif
     } else if (src & SLJIT_IMM) {
+        /* These jumps are converted to jump/call instructions when possible. */
         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
         FAIL_IF(!jump);
         set_jump(jump, compiler, JUMP_ADDR);
         jump->u.target = srcw;
 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
-        if (type >= SLJIT_CALL0)
+        if (type >= SLJIT_CALL)
             jump->flags |= IS_CALL;
 #endif
         FAIL_IF(emit_const(compiler, TMP_CALL_REG, 0));
@@ -2263,6 +2294,31 @@
     return push_inst(compiler, BCCTR | (20 << 21) | (type >= SLJIT_FAST_CALL ? 1 : 0));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type, /* Emits an indirect call to src/srcw described by arg_types: PPC64-only argument setup, then the jump through sljit_emit_ijump, whose result is returned. */
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+    if (src & SLJIT_MEM) { /* Memory target: load it before the argument registers are rearranged. */
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_CALL_REG, 0, TMP_REG1, 0, src, srcw)); /* TMP_CALL_REG receives the word read from src/srcw. */
+        src = TMP_CALL_REG;
+    }
+
+    FAIL_IF(call_with_args(compiler, arg_types, &src)); /* May change src to TMP_CALL_REG if an argument move would overwrite it. */
+#endif
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1; /* NOTE(review): presumably keeps sljit_emit_ijump from repeating the checks done above - confirm. */
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)


Modified: code/trunk/sljit/sljitNativeSPARC_32.c
===================================================================
--- code/trunk/sljit/sljitNativeSPARC_32.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeSPARC_32.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -138,6 +138,125 @@
     return SLJIT_SUCCESS;
 }


+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src) /* Emits the argument shuffling for a call: float arguments are stored to the stack and reloaded into integer registers, word arguments are moved up behind them; may redirect *src to TMP_REG1. */
+{
+    sljit_s32 reg_index = 8; /* Machine register number the next argument, of any type, ends up in. */
+    sljit_s32 word_reg_index = 8; /* Machine register number that currently holds the next word argument. */
+    sljit_s32 float_arg_index = 1; /* freg_map index of the next float argument. */
+    sljit_s32 double_arg_count = 0; /* Number of F64 arguments seen. */
+    sljit_s32 float_offset = (16 + 6) * sizeof(sljit_sw); /* Offset from SLJIT_SP of the first slot used for a float argument. */
+    sljit_s32 types = 0; /* The arg_types fields re-packed in reverse order by the first loop. */
+    sljit_s32 reg = 0; /* Machine register number of the call target; stays 0 when src is NULL. */
+    sljit_s32 move_to_tmp2 = 0; /* Set when the target register is about to be overwritten; NOTE(review): despite the name, the copy below goes to TMP_REG1. */
+
+    if (src)
+        reg = reg_map[*src & REG_MASK];
+
+    arg_types >>= SLJIT_DEF_SHIFT; /* Drop the lowest field; NOTE(review): presumably the return type - confirm. */
+
+    while (arg_types) { /* Pass 1: assign register positions and detect a clash with the call target. */
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK); /* Push the current field, which reverses the order. */
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            float_arg_index++;
+            if (reg_index == reg) /* The register this single will be loaded into is the call target. */
+                move_to_tmp2 = 1;
+            reg_index++;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            float_arg_index++;
+            double_arg_count++;
+            if (reg_index == reg || reg_index + 1 == reg) /* A double takes two consecutive registers; either may be the call target. */
+                move_to_tmp2 = 1;
+            reg_index += 2;
+            break;
+        default:
+            if (reg_index != word_reg_index && reg_index < 14 && reg_index == reg) /* This word argument gets moved (pass 3) into the call target register. */
+                move_to_tmp2 = 1;
+            reg_index++;
+            word_reg_index++;
+            break;
+        }
+
+        if (move_to_tmp2) {
+            move_to_tmp2 = 0;
+            if (reg < 14) /* NOTE(review): no copy is emitted when reg >= 14, yet *src is still redirected below - confirm this is intended. */
+                FAIL_IF(push_inst(compiler, OR | D(TMP_REG1) | S1(0) | S2A(reg), DR(TMP_REG1))); /* Copy machine register reg into TMP_REG1 (OR with reg_map[0]). */
+            *src = TMP_REG1; /* The caller has to use TMP_REG1 as the target from now on. */
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    arg_types = types; /* Pass 2 consumes this copy; types itself is kept for pass 3. */
+
+    while (arg_types) { /* Pass 2: store the float arguments to the stack, last argument first. */
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            float_arg_index--;
+            FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS)); /* Single-word float store. */
+            float_offset -= sizeof(sljit_f64); /* Singles also advance by a full sljit_f64-sized slot. */
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            float_arg_index--;
+            if (float_arg_index == 4 && double_arg_count == 4) { /* Fourth of four doubles: written as two single words at fixed offsets instead of one STDF. */
+                FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | S1(SLJIT_SP) | IMM((16 + 7) * sizeof(sljit_sw)), MOVABLE_INS));
+                FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | (1 << 25) | S1(SLJIT_SP) | IMM((16 + 8) * sizeof(sljit_sw)), MOVABLE_INS)); /* (1 << 25) sets the low bit of the register field: the odd register of the pair. */
+            }
+            else
+                FAIL_IF(push_inst(compiler, STDF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS)); /* Double-word float store. */
+            float_offset -= sizeof(sljit_f64);
+            break;
+        default:
+            break; /* Word arguments are not touched in this pass. */
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    float_offset = (16 + 6) * sizeof(sljit_sw); /* Rewind to the first slot for the reload pass. */
+
+    while (types) { /* Pass 3: fill the integer registers, last argument first. */
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            reg_index--;
+            if (reg_index < 14) /* Nothing is loaded for positions 14 and above. */
+                FAIL_IF(push_inst(compiler, LDUW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index)); /* Reload the stored single into integer register reg_index. */
+            float_offset -= sizeof(sljit_f64);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            reg_index -= 2;
+            if (reg_index < 14) {
+                if ((reg_index & 0x1) != 0) { /* Odd first register: load the two halves with separate word loads. */
+                    FAIL_IF(push_inst(compiler, LDUW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index));
+                    if (reg_index < 13) /* The second half is loaded only if its register is below 14 as well. */
+                        FAIL_IF(push_inst(compiler, LDUW | DA(reg_index + 1) | S1(SLJIT_SP) | IMM(float_offset + sizeof(sljit_sw)), reg_index + 1));
+                }
+                else 
+                    FAIL_IF(push_inst(compiler, LDD | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index)); /* Even first register: a single LDD is used instead. */
+            }
+            float_offset -= sizeof(sljit_f64);
+            break;
+        default:
+            reg_index--;
+            word_reg_index--;
+
+            if (reg_index != word_reg_index) { /* The word argument has to move because float arguments precede it. */
+                if (reg_index < 14)
+                    FAIL_IF(push_inst(compiler, OR | DA(reg_index) | S1(0) | S2A(word_reg_index), reg_index)); /* Copy register word_reg_index into register reg_index. */
+                else
+                    FAIL_IF(push_inst(compiler, STW | DA(word_reg_index) | S1(SLJIT_SP) | IMM(92), word_reg_index)); /* No register left: store the word at SLJIT_SP + 92 instead. */
+            }
+            break;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value)
 {
     FAIL_IF(push_inst(compiler, SETHI | D(dst) | ((init_value >> 10) & 0x3fffff), DR(dst)));


Modified: code/trunk/sljit/sljitNativeSPARC_common.c
===================================================================
--- code/trunk/sljit/sljitNativeSPARC_common.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeSPARC_common.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -90,24 +90,35 @@
 #define TMP_REG1    (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_REG3    (SLJIT_NUMBER_OF_REGISTERS + 4)
+/* This register is modified by calls, which affects the instruction
+   in the delay slot if it is used as a source register. */
 #define TMP_LINK    (SLJIT_NUMBER_OF_REGISTERS + 5)


-#define TMP_FREG1    (0)
-#define TMP_FREG2    ((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) << 1)
+#define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
-    0, 8, 9, 10, 13, 29, 28, 27, 23, 22, 21, 20, 19, 18, 17, 16, 26, 25, 24, 14, 1, 11, 12, 15
+    0, 8, 9, 10, 11, 29, 28, 27, 23, 22, 21, 20, 19, 18, 17, 16, 26, 25, 24, 14, 1, 12, 13, 15
 };


+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { /* Table read by the FD/FDN/FS1/FS2/FS2N macros and by sljit_get_float_register_index. */
+    0, 0, 2, 4, 6, 8, 10, 12, 14 /* Only even values are listed; FDN and FS2N OR in 0x1 to address the following odd register. The last two slots are the ones indexed by TMP_FREG1 and TMP_FREG2. */
+};
+
 /* --------------------------------------------------------------------- */
 /*  Instrucion forms                                                     */
 /* --------------------------------------------------------------------- */


 #define D(d)        (reg_map[d] << 25)
+#define FD(d)        (freg_map[d] << 25)
+#define FDN(d)        ((freg_map[d] | 0x1) << 25)
 #define DA(d)        ((d) << 25)
 #define S1(s1)        (reg_map[s1] << 14)
+#define FS1(s1)        (freg_map[s1] << 14)
+#define S1A(s1)        ((s1) << 14)
 #define S2(s2)        (reg_map[s2])
-#define S1A(s1)        ((s1) << 14)
+#define FS2(s2)        (freg_map[s2])
+#define FS2N(s2)    (freg_map[s2] | 0x1)
 #define S2A(s2)        (s2)
 #define IMM_ARG        0x2000
 #define DOP(op)        ((op) << 5)
@@ -144,6 +155,8 @@
 #define FSUBD        (OPC1(0x2) | OPC3(0x34) | DOP(0x46))
 #define FSUBS        (OPC1(0x2) | OPC3(0x34) | DOP(0x45))
 #define JMPL        (OPC1(0x2) | OPC3(0x38))
+#define LDD        (OPC1(0x3) | OPC3(0x03))
+#define LDUW        (OPC1(0x3) | OPC3(0x00))
 #define NOP        (OPC1(0x0) | OPC2(0x04))
 #define OR        (OPC1(0x2) | OPC3(0x02))
 #define ORN        (OPC1(0x2) | OPC3(0x06))
@@ -157,6 +170,9 @@
 #define SRAX        (OPC1(0x2) | OPC3(0x27) | (1 << 12))
 #define SRL        (OPC1(0x2) | OPC3(0x26))
 #define SRLX        (OPC1(0x2) | OPC3(0x26) | (1 << 12))
+#define STDF        (OPC1(0x3) | OPC3(0x27))
+#define STF        (OPC1(0x3) | OPC3(0x24))
+#define STW        (OPC1(0x3) | OPC3(0x04))
 #define SUB        (OPC1(0x2) | OPC3(0x04))
 #define SUBC        (OPC1(0x2) | OPC3(0x0c))
 #define TA        (OPC1(0x2) | OPC3(0x3a) | (8 << 25))
@@ -455,12 +471,12 @@
 #endif


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     local_size = (local_size + SLJIT_LOCALS_OFFSET + 7) & ~0x7;
     compiler->local_size = local_size;
@@ -479,12 +495,12 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


     compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 7) & ~0x7;
     return SLJIT_SUCCESS;
@@ -553,7 +569,7 @@
             if (SLJIT_UNLIKELY(flags & ARG_TEST))
                 return 1;
             FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK]
-                | ((flags & MEM_MASK) <= GPR_REG ? D(reg) : DA(reg))
+                | ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg))
                 | S1(arg & REG_MASK) | ((arg & OFFS_REG_MASK) ? S2(OFFS_REG(arg)) : IMM(argw)),
                 ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS));
             return -1;
@@ -638,7 +654,7 @@
         }
     }


-    dest = ((flags & MEM_MASK) <= GPR_REG ? D(reg) : DA(reg));
+    dest = ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg));
     delay_slot = ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS;
     if (!base)
         return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(arg2) | IMM(0), delay_slot);
@@ -962,7 +978,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-    return reg << 1;
+    return freg_map[reg];
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -990,10 +1006,8 @@
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
         src = TMP_FREG1;
     }
-    else
-        src <<= 1;


-    FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOI, FDTOI) | DA(TMP_FREG1) | S2A(src), MOVABLE_INS));
+    FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOI, FDTOI) | FD(TMP_FREG1) | FS2(src), MOVABLE_INS));


     if (FAST_IS_REG(dst)) {
         FAIL_IF(emit_op_mem2(compiler, SINGLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
@@ -1008,7 +1022,7 @@
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src, sljit_sw srcw)
 {
-    sljit_s32 dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;
+    sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (src & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -1027,7 +1041,7 @@
     }


     FAIL_IF(emit_op_mem2(compiler, SINGLE_DATA | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
-    FAIL_IF(push_inst(compiler, SELECT_FOP(op, FITOS, FITOD) | DA(dst_r) | S2A(TMP_FREG1), MOVABLE_INS));
+    FAIL_IF(push_inst(compiler, SELECT_FOP(op, FITOS, FITOD) | FD(dst_r) | FS2(TMP_FREG1), MOVABLE_INS));


     if (dst & SLJIT_MEM)
         return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
@@ -1042,17 +1056,13 @@
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
         src1 = TMP_FREG1;
     }
-    else
-        src1 <<= 1;


     if (src2 & SLJIT_MEM) {
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
         src2 = TMP_FREG2;
     }
-    else
-        src2 <<= 1;


-    return push_inst(compiler, SELECT_FOP(op, FCMPS, FCMPD) | S1A(src1) | S2A(src2), FCC_IS_SET | MOVABLE_INS);
+    return push_inst(compiler, SELECT_FOP(op, FCMPS, FCMPD) | FS1(src1) | FS2(src2), FCC_IS_SET | MOVABLE_INS);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
@@ -1071,22 +1081,20 @@
     if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32)
         op ^= SLJIT_F32_OP;


-    dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG1;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;


     if (src & SLJIT_MEM) {
         FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, dst, dstw));
         src = dst_r;
     }
-    else
-        src <<= 1;


     switch (GET_OPCODE(op)) {
     case SLJIT_MOV_F64:
         if (src != dst_r) {
             if (dst_r != TMP_FREG1) {
-                FAIL_IF(push_inst(compiler, FMOVS | DA(dst_r) | S2A(src), MOVABLE_INS));
+                FAIL_IF(push_inst(compiler, FMOVS | FD(dst_r) | FS2(src), MOVABLE_INS));
                 if (!(op & SLJIT_F32_OP))
-                    FAIL_IF(push_inst(compiler, FMOVS | DA(dst_r | 1) | S2A(src | 1), MOVABLE_INS));
+                    FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
             }
             else
                 dst_r = src;
@@ -1093,17 +1101,17 @@
         }
         break;
     case SLJIT_NEG_F64:
-        FAIL_IF(push_inst(compiler, FNEGS | DA(dst_r) | S2A(src), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, FNEGS | FD(dst_r) | FS2(src), MOVABLE_INS));
         if (dst_r != src && !(op & SLJIT_F32_OP))
-            FAIL_IF(push_inst(compiler, FMOVS | DA(dst_r | 1) | S2A(src | 1), MOVABLE_INS));
+            FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
         break;
     case SLJIT_ABS_F64:
-        FAIL_IF(push_inst(compiler, FABSS | DA(dst_r) | S2A(src), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, FABSS | FD(dst_r) | FS2(src), MOVABLE_INS));
         if (dst_r != src && !(op & SLJIT_F32_OP))
-            FAIL_IF(push_inst(compiler, FMOVS | DA(dst_r | 1) | S2A(src | 1), MOVABLE_INS));
+            FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
         break;
     case SLJIT_CONV_F64_FROM_F32:
-        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOD, FDTOS) | DA(dst_r) | S2A(src), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOD, FDTOS) | FD(dst_r) | FS2(src), MOVABLE_INS));
         op ^= SLJIT_F32_OP;
         break;
     }
@@ -1129,7 +1137,7 @@
     compiler->cache_arg = 0;
     compiler->cache_argw = 0;


-    dst_r = FAST_IS_REG(dst) ? (dst << 1) : TMP_FREG2;
+    dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG2;


     if (src1 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
@@ -1138,8 +1146,6 @@
         } else
             flags |= SLOW_SRC1;
     }
-    else
-        src1 <<= 1;


     if (src2 & SLJIT_MEM) {
         if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
@@ -1148,8 +1154,6 @@
         } else
             flags |= SLOW_SRC2;
     }
-    else
-        src2 <<= 1;


     if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
         if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
@@ -1173,19 +1177,19 @@


     switch (GET_OPCODE(op)) {
     case SLJIT_ADD_F64:
-        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FADDS, FADDD) | DA(dst_r) | S1A(src1) | S2A(src2), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FADDS, FADDD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
         break;


     case SLJIT_SUB_F64:
-        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSUBS, FSUBD) | DA(dst_r) | S1A(src1) | S2A(src2), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSUBS, FSUBD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
         break;


     case SLJIT_MUL_F64:
-        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FMULS, FMULD) | DA(dst_r) | S1A(src1) | S2A(src2), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FMULS, FMULD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
         break;


     case SLJIT_DIV_F64:
-        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FDIVS, FDIVD) | DA(dst_r) | S1A(src1) | S2A(src2), MOVABLE_INS));
+        FAIL_IF(push_inst(compiler, SELECT_FOP(op, FDIVS, FDIVD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
         break;
     }


@@ -1339,7 +1343,8 @@
 #else
 #error "Implementation required"
 #endif
-    } else {
+    }
+    else {
         if ((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS)
             jump->flags |= IS_MOVABLE;
         if (type >= SLJIT_FAST_CALL)
@@ -1346,8 +1351,8 @@
             jump->flags |= IS_CALL;
     }


-    PTR_FAIL_IF(emit_const(compiler, TMP_REG2, 0));
-    PTR_FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(TMP_REG2) | IMM(0), UNMOVABLE_INS));
+    PTR_FAIL_IF(emit_const(compiler, TMP_REG1, 0));
+    PTR_FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(TMP_REG1) | IMM(0), UNMOVABLE_INS));
     jump->addr = compiler->size;
     PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));


@@ -1354,6 +1359,22 @@
     return jump;
 }


+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+    PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_jump(compiler, type);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     struct sljit_jump *jump = NULL;
@@ -1370,17 +1391,18 @@
         FAIL_IF(!jump);
         set_jump(jump, compiler, JUMP_ADDR);
         jump->u.target = srcw;
+
         if ((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS)
             jump->flags |= IS_MOVABLE;
         if (type >= SLJIT_FAST_CALL)
             jump->flags |= IS_CALL;


-        FAIL_IF(emit_const(compiler, TMP_REG2, 0));
-        src_r = TMP_REG2;
+        FAIL_IF(emit_const(compiler, TMP_REG1, 0));
+        src_r = TMP_REG1;
     }
     else {
-        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw));
-        src_r = TMP_REG2;
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+        src_r = TMP_REG1;
     }


     FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(src_r) | IMM(0), UNMOVABLE_INS));
@@ -1389,6 +1411,29 @@
     return push_inst(compiler, NOP, UNMOVABLE_INS);
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+    if (src & SLJIT_MEM) {
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+        src = TMP_REG1;
+    }
+
+    FAIL_IF(call_with_args(compiler, arg_types, &src));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 type)


Modified: code/trunk/sljit/sljitNativeX86_32.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_32.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeX86_32.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -64,29 +64,28 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 size;
+    sljit_s32 args, size;
     sljit_u8 *inst;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


+    args = get_arg_count(arg_types);
     compiler->args = args;


-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-    /* [esp+0] for saving temporaries and third argument for calls. */
-    compiler->saveds_offset = 1 * sizeof(sljit_sw);
-#else
-    /* [esp+0] for saving temporaries and space for maximum three arguments. */
-    if (scratches <= 1)
-        compiler->saveds_offset = 1 * sizeof(sljit_sw);
-    else
-        compiler->saveds_offset = ((scratches == 2) ? 2 : 3) * sizeof(sljit_sw);
+    /* [esp+0] for saving temporaries and function calls. */
+    compiler->stack_tmp_size = 2 * sizeof(sljit_sw);
+
+#if !(defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    if (scratches > 3)
+        compiler->stack_tmp_size = 3 * sizeof(sljit_sw);
 #endif


+    compiler->saveds_offset = compiler->stack_tmp_size;
     if (scratches > 3)
         compiler->saveds_offset += ((scratches > (3 + 6)) ? 6 : (scratches - 3)) * sizeof(sljit_sw);


@@ -178,10 +177,10 @@
         /* Space for a single argument. This amount is excluded when the stack is allocated below. */
         local_size -= sizeof(sljit_sw);
         FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_R0], local_size));
-        FAIL_IF(emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
+        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
             SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, sizeof(sljit_sw)));
 #endif
-        FAIL_IF(sljit_emit_ijump(compiler, SLJIT_CALL1, SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
+        FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
     }
 #endif


@@ -192,12 +191,12 @@
         EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SP, 0);


         /* Some space might allocated during sljit_grow_stack() above on WIN32. */
-        FAIL_IF(emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
+        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
             SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, local_size + sizeof(sljit_sw)));


 #if defined _WIN32 && !(defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
         if (compiler->local_size > 1024)
-            FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+            FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
                 TMP_REG1, 0, TMP_REG1, 0, SLJIT_IMM, sizeof(sljit_sw)));
 #endif


@@ -213,31 +212,29 @@
         return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), compiler->local_size, TMP_REG1, 0);
     }
 #endif
-    return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
+    return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
         SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, local_size);
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


-    compiler->args = args;
+    compiler->args = get_arg_count(arg_types);


-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-    /* [esp+0] for saving temporaries and third argument for calls. */
-    compiler->saveds_offset = 1 * sizeof(sljit_sw);
-#else
-    /* [esp+0] for saving temporaries and space for maximum three arguments. */
-    if (scratches <= 1)
-        compiler->saveds_offset = 1 * sizeof(sljit_sw);
-    else
-        compiler->saveds_offset = ((scratches == 2) ? 2 : 3) * sizeof(sljit_sw);
+    /* [esp+0] for saving temporaries and function calls. */
+    compiler->stack_tmp_size = 2 * sizeof(sljit_sw);
+
+#if !(defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    if (scratches > 3)
+        compiler->stack_tmp_size = 3 * sizeof(sljit_sw);
 #endif


+    compiler->saveds_offset = compiler->stack_tmp_size;
     if (scratches > 3)
         compiler->saveds_offset += ((scratches > (3 + 6)) ? 6 : (scratches - 3)) * sizeof(sljit_sw);


@@ -278,10 +275,10 @@
     if (compiler->options & SLJIT_F64_ALIGNMENT)
         EMIT_MOV(compiler, SLJIT_SP, 0, SLJIT_MEM1(SLJIT_SP), compiler->local_size)
     else
-        FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+        FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
             SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, compiler->local_size));
 #else
-    FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+    FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
         SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, compiler->local_size));
 #endif


@@ -418,7 +415,7 @@
         if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
             *inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;


-        if ((a & SLJIT_IMM) || (a == 0))
+        if (a & SLJIT_IMM)
             *buf_ptr = 0;
         else if (!(flags & EX86_SSE2_OP1))
             *buf_ptr = reg_map[a] << 3;
@@ -490,42 +487,324 @@
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */


-static SLJIT_INLINE sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 type)
+#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+
+static sljit_s32 c_fast_call_get_stack_size(sljit_s32 arg_types, sljit_s32 *word_arg_count_ptr)
 {
+    sljit_s32 stack_size = 0;
+    sljit_s32 word_arg_count = 0;
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            stack_size += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            stack_size += sizeof(sljit_f64);
+            break;
+        default:
+            word_arg_count++;
+            if (word_arg_count > 2)
+                stack_size += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    if (word_arg_count_ptr)
+        *word_arg_count_ptr = word_arg_count;
+
+    return stack_size;
+}
+
+static sljit_s32 c_fast_call_with_args(struct sljit_compiler *compiler,
+    sljit_s32 arg_types, sljit_s32 stack_size, sljit_s32 word_arg_count, sljit_s32 swap_args)
+{
     sljit_u8 *inst;
+    sljit_s32 float_arg_count;


-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-    inst = (sljit_u8*)ensure_buf(compiler, type >= SLJIT_CALL3 ? 1 + 2 + 1 : 1 + 2);
-    FAIL_IF(!inst);
-    INC_SIZE(type >= SLJIT_CALL3 ? 2 + 1 : 2);
+    if (stack_size == sizeof(sljit_sw) && word_arg_count == 3) {
+        inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+        FAIL_IF(!inst);
+        INC_SIZE(1);
+        PUSH_REG(reg_map[SLJIT_R2]);
+    }
+    else if (stack_size > 0) {
+        if (word_arg_count >= 4)
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), compiler->saveds_offset - sizeof(sljit_sw));


-    if (type >= SLJIT_CALL3)
-        PUSH_REG(reg_map[SLJIT_R2]);
-    *inst++ = MOV_r_rm;
-    *inst++ = MOD_REG | (reg_map[SLJIT_R2] << 3) | reg_map[SLJIT_R0];
+        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+            SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, stack_size));
+
+        stack_size = 0;
+        arg_types >>= SLJIT_DEF_SHIFT;
+        word_arg_count = 0;
+        float_arg_count = 0;
+        while (arg_types) {
+            switch (arg_types & SLJIT_DEF_MASK) {
+            case SLJIT_ARG_TYPE_F32:
+                float_arg_count++;
+                FAIL_IF(emit_sse2_store(compiler, 1, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
+                stack_size += sizeof(sljit_f32);
+                break;
+            case SLJIT_ARG_TYPE_F64:
+                float_arg_count++;
+                FAIL_IF(emit_sse2_store(compiler, 0, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
+                stack_size += sizeof(sljit_f64);
+                break;
+            default:
+                word_arg_count++;
+                if (word_arg_count == 3) {
+                    EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, SLJIT_R2, 0);
+                    stack_size += sizeof(sljit_sw);
+                }
+                else if (word_arg_count == 4) {
+                    EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, TMP_REG1, 0);
+                    stack_size += sizeof(sljit_sw);
+                }
+                break;
+            }
+
+            arg_types >>= SLJIT_DEF_SHIFT;
+        }
+    }
+
+    if (word_arg_count > 0) {
+        if (swap_args) {
+            inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+            FAIL_IF(!inst);
+            INC_SIZE(1);
+
+            *inst++ = XCHG_EAX_r | reg_map[SLJIT_R2];
+        }
+        else {
+            inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
+            FAIL_IF(!inst);
+            INC_SIZE(2);
+
+            *inst++ = MOV_r_rm;
+            *inst++ = MOD_REG | (reg_map[SLJIT_R2] << 3) | reg_map[SLJIT_R0];
+        }
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+#endif
+
+static sljit_s32 cdecl_call_get_stack_size(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *word_arg_count_ptr)
+{
+    sljit_s32 stack_size = 0;
+    sljit_s32 word_arg_count = 0;
+
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            stack_size += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            stack_size += sizeof(sljit_f64);
+            break;
+        default:
+            word_arg_count++;
+            stack_size += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    if (word_arg_count_ptr)
+        *word_arg_count_ptr = word_arg_count;
+
+    if (stack_size <= compiler->stack_tmp_size)
+        return 0;
+
+#if defined(__APPLE__)
+    return ((stack_size - compiler->stack_tmp_size + 15) & ~15);
 #else
-    inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 * (type - SLJIT_CALL0));
+    return stack_size - compiler->stack_tmp_size;
+#endif
+}
+
+static sljit_s32 cdecl_call_with_args(struct sljit_compiler *compiler,
+    sljit_s32 arg_types, sljit_s32 stack_size, sljit_s32 word_arg_count)
+{
+    sljit_s32 float_arg_count = 0;
+
+    if (word_arg_count >= 4)
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), compiler->saveds_offset - sizeof(sljit_sw));
+
+    if (stack_size > 0)
+        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+            SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, stack_size));
+
+    stack_size = 0;
+    word_arg_count = 0;
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            float_arg_count++;
+            FAIL_IF(emit_sse2_store(compiler, 1, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
+            stack_size += sizeof(sljit_f32);
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            float_arg_count++;
+            FAIL_IF(emit_sse2_store(compiler, 0, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
+            stack_size += sizeof(sljit_f64);
+            break;
+        default:
+            word_arg_count++;
+            EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, (word_arg_count >= 4) ? TMP_REG1 : word_arg_count, 0);
+            stack_size += sizeof(sljit_sw);
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
+    }
+
+    return SLJIT_SUCCESS;
+}
+
+static sljit_s32 post_call_with_args(struct sljit_compiler *compiler,
+    sljit_s32 arg_types, sljit_s32 stack_size)
+{
+    sljit_u8 *inst;
+    sljit_s32 single;
+
+    if (stack_size > 0)
+        FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
+            SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, stack_size));
+
+    if ((arg_types & SLJIT_DEF_MASK) < SLJIT_ARG_TYPE_F32)
+        return SLJIT_SUCCESS;
+
+    single = ((arg_types & SLJIT_DEF_MASK) == SLJIT_ARG_TYPE_F32);
+
+    inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
     FAIL_IF(!inst);
-    INC_SIZE(4 * (type - SLJIT_CALL0));
+    INC_SIZE(3);
+    inst[0] = single ? FSTPS : FSTPD;
+    inst[1] = (0x03 << 3) | 0x04;
+    inst[2] = (0x04 << 3) | reg_map[SLJIT_SP];


-    *inst++ = MOV_rm_r;
-    *inst++ = MOD_DISP8 | (reg_map[SLJIT_R0] << 3) | 0x4 /* SIB */;
-    *inst++ = (0x4 /* none*/ << 3) | reg_map[SLJIT_SP];
-    *inst++ = 0;
-    if (type >= SLJIT_CALL2) {
-        *inst++ = MOV_rm_r;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_R1] << 3) | 0x4 /* SIB */;
-        *inst++ = (0x4 /* none*/ << 3) | reg_map[SLJIT_SP];
-        *inst++ = sizeof(sljit_sw);
+    return emit_sse2_load(compiler, single, SLJIT_FR0, SLJIT_MEM1(SLJIT_SP), 0);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    struct sljit_jump *jump;
+    sljit_s32 stack_size = 0;
+    sljit_s32 word_arg_count;
+
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    if ((type & 0xff) == SLJIT_CALL) {
+        stack_size = c_fast_call_get_stack_size(arg_types, &word_arg_count);
+        PTR_FAIL_IF(c_fast_call_with_args(compiler, arg_types, stack_size, word_arg_count, 0));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+        compiler->skip_checks = 1;
+#endif
+
+        jump = sljit_emit_jump(compiler, type);
+        PTR_FAIL_IF(jump == NULL);
+
+        PTR_FAIL_IF(post_call_with_args(compiler, arg_types, 0));
+        return jump;
     }
-    if (type >= SLJIT_CALL3) {
-        *inst++ = MOV_rm_r;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_R2] << 3) | 0x4 /* SIB */;
-        *inst++ = (0x4 /* none*/ << 3) | reg_map[SLJIT_SP];
-        *inst++ = 2 * sizeof(sljit_sw);
+#endif
+
+    stack_size = cdecl_call_get_stack_size(compiler, arg_types, &word_arg_count);
+    PTR_FAIL_IF(cdecl_call_with_args(compiler, arg_types, stack_size, word_arg_count));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    jump = sljit_emit_jump(compiler, type);
+    PTR_FAIL_IF(jump == NULL);
+
+    PTR_FAIL_IF(post_call_with_args(compiler, arg_types, stack_size));
+    return jump;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    sljit_s32 stack_size = 0;
+    sljit_s32 word_arg_count;
+#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    sljit_s32 swap_args;
+#endif
+
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    SLJIT_ASSERT(reg_map[SLJIT_R0] == 0 && reg_map[SLJIT_R2] == 1 && SLJIT_R0 == 1 && SLJIT_R2 == 3);
+
+    if ((type & 0xff) == SLJIT_CALL) {
+        stack_size = c_fast_call_get_stack_size(arg_types, &word_arg_count);
+        swap_args = 0;
+
+        if (word_arg_count > 0) {
+            if ((src & REG_MASK) == SLJIT_R2 || OFFS_REG(src) == SLJIT_R2) {
+                swap_args = 1;
+                if (((src & REG_MASK) | 0x2) == SLJIT_R2)
+                    src ^= 0x2;
+                if ((OFFS_REG(src) | 0x2) == SLJIT_R2)
+                    src ^= TO_OFFS_REG(0x2);
+            }
+        }
+
+        FAIL_IF(c_fast_call_with_args(compiler, arg_types, stack_size, word_arg_count, swap_args));
+
+        compiler->saveds_offset += stack_size;
+        compiler->locals_offset += stack_size;
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+        compiler->skip_checks = 1;
+#endif
+        FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+
+        compiler->saveds_offset -= stack_size;
+        compiler->locals_offset -= stack_size;
+
+        return post_call_with_args(compiler, arg_types, 0);
     }
 #endif
-    return SLJIT_SUCCESS;
+
+    stack_size = cdecl_call_get_stack_size(compiler, arg_types, &word_arg_count);
+    FAIL_IF(cdecl_call_with_args(compiler, arg_types, stack_size, word_arg_count));
+
+    compiler->saveds_offset += stack_size;
+    compiler->locals_offset += stack_size;
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+    FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+
+    compiler->saveds_offset -= stack_size;
+    compiler->locals_offset -= stack_size;
+
+    return post_call_with_args(compiler, arg_types, stack_size);
 }


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)

Modified: code/trunk/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_64.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeX86_64.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -66,15 +66,15 @@
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-    sljit_s32 i, tmp, size, saved_register_size;
+    sljit_s32 args, i, tmp, size, saved_register_size;
     sljit_u8 *inst;


     CHECK_ERROR();
-    CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


 #ifdef _WIN64
     /* Two/four register slots for parameters plus space for xmm6 register if needed. */
@@ -108,6 +108,8 @@
         PUSH_REG(reg_lmap[i]);
     }


+    args = get_arg_count(arg_types);
+
     if (args > 0) {
         size = args * 3;
         inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
@@ -182,7 +184,7 @@
             || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
         compiler->skip_checks = 1;
 #endif
-        FAIL_IF(sljit_emit_ijump(compiler, SLJIT_CALL1, SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
+        FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
     }
 #endif


@@ -223,14 +225,14 @@
}

 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-    sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
+    sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
     sljit_s32 saved_register_size;


     CHECK_ERROR();
-    CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
-    set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+    CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+    set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


 #ifdef _WIN64
     /* Two/four register slots for parameters plus space for xmm6 register if needed. */
@@ -414,7 +416,11 @@
             }
         }
     }
-    else if (!(flags & EX86_SSE2_OP2) && reg_map[b] >= 8)
+    else if (!(flags & EX86_SSE2_OP2)) {
+        if (reg_map[b] >= 8)
+            rex |= REX_B;
+    }
+    else if (freg_map[b] >= 8)
         rex |= REX_B;


     if (a & SLJIT_IMM) {
@@ -441,7 +447,11 @@
     else {
         SLJIT_ASSERT(!(flags & EX86_SHIFT_INS) || a == SLJIT_PREF_SHIFT_REG);
         /* reg_map[SLJIT_PREF_SHIFT_REG] is less than 8. */
-        if (!(flags & EX86_SSE2_OP1) && reg_map[a] >= 8)
+        if (!(flags & EX86_SSE2_OP1)) {
+            if (reg_map[a] >= 8)
+                rex |= REX_R;
+        }
+        else if (freg_map[a] >= 8)
             rex |= REX_R;
     }


@@ -468,12 +478,12 @@
         if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
             *inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;


-        if ((a & SLJIT_IMM) || (a == 0))
+        if (a & SLJIT_IMM)
             *buf_ptr = 0;
         else if (!(flags & EX86_SSE2_OP1))
             *buf_ptr = reg_lmap[a] << 3;
         else
-            *buf_ptr = a << 3;
+            *buf_ptr = freg_lmap[a] << 3;
     }
     else {
         if (a & SLJIT_IMM) {
@@ -487,7 +497,7 @@
     }


     if (!(b & SLJIT_MEM))
-        *buf_ptr++ |= MOD_REG + ((!(flags & EX86_SSE2_OP2)) ? reg_lmap[b] : b);
+        *buf_ptr++ |= MOD_REG + ((!(flags & EX86_SSE2_OP2)) ? reg_lmap[b] : freg_lmap[b]);
     else if ((b & REG_MASK) != SLJIT_UNUSED) {
         if ((b & OFFS_REG_MASK) == SLJIT_UNUSED || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP)) {
             if (immb != 0 || reg_lmap[b & REG_MASK] == 5) {
@@ -545,45 +555,161 @@
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */


-static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 type)
+#ifndef _WIN64
+
+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src_ptr, sljit_sw srcw)
 {
-    sljit_u8 *inst;
+    sljit_s32 src = src_ptr ? (*src_ptr) : 0;
+    sljit_s32 word_arg_count = 0;


-    /* After any change update IS_REG_CHANGED_BY_CALL as well. */
-#ifndef _WIN64
-    SLJIT_ASSERT(reg_map[SLJIT_R1] == 6 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8 && reg_map[TMP_REG1] == 2);
+    SLJIT_ASSERT(reg_map[SLJIT_R1] == 6 && reg_map[SLJIT_R3] == 1 && reg_map[TMP_REG1] == 2);


-    inst = (sljit_u8*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
-    FAIL_IF(!inst);
-    INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
-    if (type >= SLJIT_CALL3) {
-        /* Move third argument to TMP_REG1. */
-        *inst++ = REX_W;
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (0x2 /* rdx */ << 3) | reg_lmap[SLJIT_R2];
+    compiler->mode32 = 0;
+
+    /* Remove return value. */
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        if ((arg_types & SLJIT_DEF_MASK) < SLJIT_ARG_TYPE_F32)
+            word_arg_count++;
+        arg_types >>= SLJIT_DEF_SHIFT;
     }
-    *inst++ = REX_W;
-    *inst++ = MOV_r_rm;
-    *inst++ = MOD_REG | (0x7 /* rdi */ << 3) | reg_lmap[SLJIT_R0];
+
+    if (word_arg_count == 0)
+        return SLJIT_SUCCESS;
+
+    if (src & SLJIT_MEM) {
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
+        *src_ptr = TMP_REG2;
+    }
+    else if (src == SLJIT_R2 && word_arg_count >= SLJIT_R2)
+        *src_ptr = TMP_REG1;
+
+    if (word_arg_count >= 3)
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R2, 0);
+    return emit_mov(compiler, SLJIT_R2, 0, SLJIT_R0, 0);
+}
+
 #else
-    SLJIT_ASSERT(reg_map[SLJIT_R1] == 2 && reg_map[SLJIT_R0] < 8 && reg_map[SLJIT_R2] < 8 && reg_map[TMP_REG1] == 8);


-    inst = (sljit_u8*)ensure_buf(compiler, 1 + ((type < SLJIT_CALL3) ? 3 : 6));
-    FAIL_IF(!inst);
-    INC_SIZE((type < SLJIT_CALL3) ? 3 : 6);
-    if (type >= SLJIT_CALL3) {
-        /* Move third argument to TMP_REG1. */
-        *inst++ = REX_W | REX_R;
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (0x0 /* r8 */ << 3) | reg_lmap[SLJIT_R2];
+static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src_ptr, sljit_sw srcw)
+{
+    sljit_s32 src = src_ptr ? (*src_ptr) : 0;
+    sljit_s32 arg_count = 0;
+    sljit_s32 word_arg_count = 0;
+    sljit_s32 float_arg_count = 0;
+    sljit_s32 types = 0;
+    sljit_s32 data_trandfer = 0;
+    static sljit_u8 word_arg_regs[5] = { 0, SLJIT_R3, SLJIT_R1, SLJIT_R2, TMP_REG1 };
+
+    SLJIT_ASSERT(reg_map[SLJIT_R3] == 1 && reg_map[SLJIT_R1] == 2 && reg_map[SLJIT_R2] == 8 && reg_map[TMP_REG1] == 9);
+
+    compiler->mode32 = 0;
+    arg_types >>= SLJIT_DEF_SHIFT;
+
+    while (arg_types) {
+        types = (types << SLJIT_DEF_SHIFT) | (arg_types & SLJIT_DEF_MASK);
+
+        switch (arg_types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+        case SLJIT_ARG_TYPE_F64:
+            arg_count++;
+            float_arg_count++;
+
+            if (arg_count != float_arg_count)
+                data_trandfer = 1;
+            break;
+        default:
+            arg_count++;
+            word_arg_count++;
+
+            if (arg_count != word_arg_count || arg_count != word_arg_regs[arg_count]) {
+                data_trandfer = 1;
+
+                if (src == word_arg_regs[arg_count]) {
+                    EMIT_MOV(compiler, TMP_REG2, 0, src, 0);
+                    *src_ptr = TMP_REG2;
+                }
+            }
+            break;
+        }
+
+        arg_types >>= SLJIT_DEF_SHIFT;
     }
-    *inst++ = REX_W;
-    *inst++ = MOV_r_rm;
-    *inst++ = MOD_REG | (0x1 /* rcx */ << 3) | reg_lmap[SLJIT_R0];
-#endif
+
+    if (!data_trandfer)
+        return SLJIT_SUCCESS;
+
+    if (src & SLJIT_MEM) {
+        ADJUST_LOCAL_OFFSET(src, srcw);
+        EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
+        *src_ptr = TMP_REG2;
+    }
+
+    while (types) {
+        switch (types & SLJIT_DEF_MASK) {
+        case SLJIT_ARG_TYPE_F32:
+            if (arg_count != float_arg_count)
+                FAIL_IF(emit_sse2_load(compiler, 1, arg_count, float_arg_count, 0));
+            arg_count--;
+            float_arg_count--;
+            break;
+        case SLJIT_ARG_TYPE_F64:
+            if (arg_count != float_arg_count)
+                FAIL_IF(emit_sse2_load(compiler, 0, arg_count, float_arg_count, 0));
+            arg_count--;
+            float_arg_count--;
+            break;
+        default:
+            if (arg_count != word_arg_count || arg_count != word_arg_regs[arg_count])
+                EMIT_MOV(compiler, word_arg_regs[arg_count], 0, word_arg_count, 0);
+            arg_count--;
+            word_arg_count--;
+            break;
+        }
+
+        types >>= SLJIT_DEF_SHIFT;
+    }
+
     return SLJIT_SUCCESS;
 }


+#endif
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types)
+{
+    CHECK_ERROR_PTR();
+    CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+    PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL, 0));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_jump(compiler, type);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+    sljit_s32 arg_types,
+    sljit_s32 src, sljit_sw srcw)
+{
+    CHECK_ERROR();
+    CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+    FAIL_IF(call_with_args(compiler, arg_types, &src, srcw));
+
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
+        || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+    compiler->skip_checks = 1;
+#endif
+
+    return sljit_emit_ijump(compiler, type, src, srcw);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
 {
     sljit_u8 *inst;
@@ -679,7 +805,6 @@
     return SLJIT_SUCCESS;
 }


-
 /* --------------------------------------------------------------------- */
 /*  Extend input                                                         */
 /* --------------------------------------------------------------------- */


Modified: code/trunk/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/sljit/sljitNativeX86_common.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitNativeX86_common.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -26,7 +26,11 @@


 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 {
+#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+    return "x86" SLJIT_CPUINFO " ABI:fastcall";
+#else
     return "x86" SLJIT_CPUINFO;
+#endif
 }


 /*
@@ -92,23 +96,32 @@
 #ifndef _WIN64
 /* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-    0, 0, 6, 1, 7, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
+    0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
 };
 /* low-map. reg_map & 0x7. */
 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-    0, 0, 6, 1, 7, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
+    0, 0, 6, 7, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
 };
 #else
 /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-    0, 0, 2, 1, 10, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 8, 9
+    0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
 };
 /* low-map. reg_map & 0x7. */
 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-    0, 0, 2, 1, 2,  3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 0, 1
+    0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
 };
 #endif


+/* Args: xmm0-xmm3 */
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
+    4, 0, 1, 2, 3, 5, 6
+};
+/* low-map. freg_map & 0x7. */
+static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
+    4, 0, 1, 2, 3, 5, 6
+};
+
 #define REX_W        0x48
 #define REX_R        0x44
 #define REX_X        0x42
@@ -178,6 +191,8 @@
 #define CVTTSD2SI_r_xm    0x2c
 #define DIV        (/* GROUP_F7 */ 6 << 3)
 #define DIVSD_x_xm    0x5e
+#define FSTPS        0xd9
+#define FSTPD        0xdd
 #define INT3        0xcc
 #define IDIV        (/* GROUP_F7 */ 7 << 3)
 #define IMUL        (/* GROUP_F7 */ 5 << 3)
@@ -613,9 +628,6 @@
             get_cpu_features();
         return cpu_has_cmov;


-    case SLJIT_HAS_PREF_SHIFT_REG:
-        return 1;
-
     case SLJIT_HAS_SSE2:
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
         if (cpu_has_sse2 == -1)
@@ -634,14 +646,16 @@
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */


+#define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
+
 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
-    sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
+    sljit_u32 op_types,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w);


 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
-    sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
+    sljit_u32 op_types,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w);
@@ -653,10 +667,16 @@
 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
     FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));


+static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
+    sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
+
+static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
+    sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
+
 #ifdef _WIN32
 #include <malloc.h>


-static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
+static void SLJIT_FUNC sljit_grow_stack(sljit_sw local_size)
 {
     /* Workaround for calling the internal _chkstk() function on Windows.
     This function touches all 4k pages belonging to the requested stack space,
@@ -1275,20 +1295,25 @@
         compiler->mode32 = 0;
 #endif


+        if (FAST_IS_REG(src) && src == dst) {
+            if (!TYPE_CAST_NEEDED(op))
+                return SLJIT_SUCCESS;
+        }
+
         if (op_flags & SLJIT_I32_OP) {
-            if (FAST_IS_REG(src) && src == dst) {
-                if (!TYPE_CAST_NEEDED(op))
-                    return SLJIT_SUCCESS;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+            if (src & SLJIT_MEM) {
+                if (op == SLJIT_MOV_S32)
+                    op = SLJIT_MOV_U32;
+                if (op == SLJIT_MOVU_S32)
+                    op = SLJIT_MOVU_U32;
             }
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-            if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
-                op = SLJIT_MOV_U32;
-            if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
-                op = SLJIT_MOVU_U32;
-            if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
-                op = SLJIT_MOV_S32;
-            if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
-                op = SLJIT_MOVU_S32;
+            else if (src & SLJIT_IMM) {
+                if (op == SLJIT_MOV_U32)
+                    op = SLJIT_MOV_S32;
+                if (op == SLJIT_MOVU_U32)
+                    op = SLJIT_MOVU_S32;
+            }
 #endif
         }


@@ -1372,11 +1397,11 @@

         if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK)) {
             if ((src & OFFS_REG_MASK) != 0) {
-                FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
                         (src & REG_MASK), 0, (src & REG_MASK), 0, OFFS_REG(dst), 0));
             }
             else if (srcw != 0) {
-                FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
                         (src & REG_MASK), 0, (src & REG_MASK), 0, SLJIT_IMM, srcw));
             }
         }
@@ -1383,11 +1408,11 @@


         if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK)) {
             if ((dst & OFFS_REG_MASK) != 0) {
-                FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
                         (dst & REG_MASK), 0, (dst & REG_MASK), 0, OFFS_REG(dst), 0));
             }
             else if (dstw != 0) {
-                FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+                FAIL_IF(emit_cum_binary(compiler, BINARY_OPCODE(ADD),
                         (dst & REG_MASK), 0, (dst & REG_MASK), 0, SLJIT_IMM, dstw));
             }
         }
@@ -1445,12 +1470,16 @@
 #endif


 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
-    sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
+    sljit_u32 op_types,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
 {
     sljit_u8* inst;
+    sljit_u8 op_eax_imm = (op_types >> 24);
+    sljit_u8 op_rm = (op_types >> 16) & 0xff;
+    sljit_u8 op_mr = (op_types >> 8) & 0xff;
+    sljit_u8 op_imm = op_types & 0xff;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
@@ -1561,12 +1590,16 @@
 }


 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
-    sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
+    sljit_u32 op_types,
     sljit_s32 dst, sljit_sw dstw,
     sljit_s32 src1, sljit_sw src1w,
     sljit_s32 src2, sljit_sw src2w)
 {
     sljit_u8* inst;
+    sljit_u8 op_eax_imm = (op_types >> 24);
+    sljit_u8 op_rm = (op_types >> 16) & 0xff;
+    sljit_u8 op_mr = (op_types >> 8) & 0xff;
+    sljit_u8 op_imm = op_types & 0xff;


     if (dst == SLJIT_UNUSED) {
         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
@@ -2044,7 +2077,7 @@
         *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
     }
-    else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
+    else if (SLOW_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
         if (src1 != dst)
             EMIT_MOV(compiler, dst, 0, src1, src1w);
         EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
@@ -2057,8 +2090,8 @@
     else {
         /* This case is complex since ecx itself may be used for
            addressing, and this case must be supported as well. */
+        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
         EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
@@ -2065,19 +2098,16 @@
         FAIL_IF(!inst);
         *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
-        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 #else
-        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-        EMIT_MOV(compiler, TMP_REG2, 0, src2, src2w);
-        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
-        FAIL_IF(!inst);
-        *inst = XCHG_r_rm;
+        EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
+        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
         FAIL_IF(!inst);
         *inst |= mode;
         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
-        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 #endif
+        if (dst != SLJIT_UNUSED)
+            return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
     }


     return SLJIT_SUCCESS;
@@ -2101,7 +2131,7 @@
         if (!set_flags)
             return emit_mov(compiler, dst, dstw, src1, src1w);
         /* OR dst, src, 0 */
-        return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(OR),
             dst, dstw, src1, src1w, SLJIT_IMM, 0);
     }


@@ -2111,10 +2141,10 @@
     if (!FAST_IS_REG(dst))
         FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));


-    FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));
+    FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));


     if (FAST_IS_REG(dst))
-        return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
+        return emit_cmp_binary(compiler, (dst == SLJIT_UNUSED) ? TMP_REG1 : dst, dstw, SLJIT_IMM, 0);
     return SLJIT_SUCCESS;
 }


@@ -2145,10 +2175,10 @@
             if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
                 return compiler->error;
         }
-        return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_ADDC:
-        return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SUB:
         if (!HAS_FLAGS(op)) {
@@ -2158,10 +2188,10 @@


         if (dst == SLJIT_UNUSED)
             return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
-        return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
+        return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SUBC:
-        return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
+        return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_MUL:
         return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
@@ -2168,13 +2198,13 @@
     case SLJIT_AND:
         if (dst == SLJIT_UNUSED)
             return emit_test_binary(compiler, src1, src1w, src2, src2w);
-        return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(AND),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_OR:
-        return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(OR),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_XOR:
-        return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
+        return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
             dst, dstw, src1, src1w, src2, src2w);
     case SLJIT_SHL:
         return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
@@ -2203,7 +2233,11 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
     CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     return reg;
+#else
+    return freg_map[reg];
+#endif
 }


 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -2345,6 +2379,7 @@
         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
         src1 = TMP_FREG;
     }
+
     return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
 }


@@ -2516,9 +2551,6 @@
     set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
     type &= 0xff;


-    if (type >= SLJIT_CALL1)
-        PTR_FAIL_IF(call_with_args(compiler, type));
-
     /* Worst case size. */
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
@@ -2534,14 +2566,6 @@
     return jump;
 }


-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-#ifndef _WIN64
-#define IS_REG_CHANGED_BY_CALL(src, type) ((src) == SLJIT_R3)
-#else
-#define IS_REG_CHANGED_BY_CALL(src, type) ((src) == SLJIT_R2)
-#endif
-#endif
-
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
     sljit_u8 *inst;
@@ -2553,25 +2577,6 @@


     CHECK_EXTRA_REGS(src, srcw, (void)0);


-    if (type >= SLJIT_CALL1) {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-        if (src == SLJIT_R2) {
-            EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
-            src = TMP_REG1;
-        }
-        if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
-            srcw += sizeof(sljit_sw);
-#endif
-#else
-        if ((src & SLJIT_MEM) || IS_REG_CHANGED_BY_CALL(src, type)) {
-            EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
-            src = TMP_REG2;
-        }
-#endif
-        FAIL_IF(call_with_args(compiler, type));
-    }
-
     if (src == SLJIT_IMM) {
         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
         FAIL_IF_NULL(jump);


Modified: code/trunk/sljit/sljitUtils.c
===================================================================
--- code/trunk/sljit/sljitUtils.c    2017-11-23 07:56:49 UTC (rev 1715)
+++ code/trunk/sljit/sljitUtils.c    2017-11-29 13:40:20 UTC (rev 1716)
@@ -48,12 +48,12 @@


#if (defined SLJIT_UTIL_GLOBAL_LOCK && SLJIT_UTIL_GLOBAL_LOCK)

-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_grab_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_grab_lock(void)
 {
     /* Always successful. */
 }


-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_release_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_release_lock(void)
 {
     /* Always successful. */
 }
@@ -88,7 +88,7 @@


static HANDLE global_mutex = 0;

-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_grab_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_grab_lock(void)
 {
     /* No idea what to do if an error occurs. Static mutexes should never fail... */
     if (!global_mutex)
@@ -97,7 +97,7 @@
         WaitForSingleObject(global_mutex, INFINITE);
 }


-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_release_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_release_lock(void)
 {
     ReleaseMutex(global_mutex);
 }
@@ -130,12 +130,12 @@


static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;

-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_grab_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_grab_lock(void)
 {
     pthread_mutex_lock(&global_mutex);
 }


-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_release_lock(void)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_release_lock(void)
 {
     pthread_mutex_unlock(&global_mutex);
 }
@@ -203,7 +203,7 @@
 /* Planning to make it even more clever in the future. */
 static sljit_sw sljit_page_align = 0;


-SLJIT_API_FUNC_ATTRIBUTE struct sljit_stack* SLJIT_CALL sljit_allocate_stack(sljit_uw limit, sljit_uw max_limit, void *allocator_data)
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_stack* SLJIT_FUNC sljit_allocate_stack(sljit_uw limit, sljit_uw max_limit, void *allocator_data)
 {
     struct sljit_stack *stack;
     void *ptr;
@@ -276,7 +276,7 @@


#undef PAGE_ALIGN

-SLJIT_API_FUNC_ATTRIBUTE void SLJIT_CALL sljit_free_stack(struct sljit_stack *stack, void *allocator_data)
+SLJIT_API_FUNC_ATTRIBUTE void SLJIT_FUNC sljit_free_stack(struct sljit_stack *stack, void *allocator_data)
 {
     SLJIT_UNUSED_ARG(allocator_data);
 #ifdef _WIN32
@@ -287,7 +287,7 @@
     SLJIT_FREE(stack, allocator_data);
 }


-SLJIT_API_FUNC_ATTRIBUTE sljit_sw SLJIT_CALL sljit_stack_resize(struct sljit_stack *stack, sljit_u8 *new_limit)
+SLJIT_API_FUNC_ATTRIBUTE sljit_sw SLJIT_FUNC sljit_stack_resize(struct sljit_stack *stack, sljit_u8 *new_limit)
 {
     sljit_uw aligned_old_limit;
     sljit_uw aligned_new_limit;