[Pcre-svn] [928] code/trunk/src/sljit: JIT compiler update.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [928] code/trunk/src/sljit: JIT compiler update.
Revision: 928
          http://www.exim.org/viewvc/pcre2?view=rev&revision=928
Author:   zherczeg
Date:     2018-03-13 12:05:48 +0000 (Tue, 13 Mar 2018)
Log Message:
-----------
JIT compiler update.


Modified Paths:
--------------
    code/trunk/src/sljit/sljitConfigInternal.h
    code/trunk/src/sljit/sljitLir.c
    code/trunk/src/sljit/sljitNativeARM_64.c
    code/trunk/src/sljit/sljitNativeARM_T2_32.c
    code/trunk/src/sljit/sljitNativeX86_32.c
    code/trunk/src/sljit/sljitNativeX86_64.c
    code/trunk/src/sljit/sljitNativeX86_common.c


Modified: code/trunk/src/sljit/sljitConfigInternal.h
===================================================================
--- code/trunk/src/sljit/sljitConfigInternal.h    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitConfigInternal.h    2018-03-13 12:05:48 UTC (rev 928)
@@ -147,17 +147,23 @@
 #define SLJIT_CONFIG_UNSUPPORTED 1
 #endif


-#else /* !_WIN32 */
+#else /* _WIN32 */

#if defined(_M_X64) || defined(__x86_64__)
#define SLJIT_CONFIG_X86_64 1
+#elif (defined(_M_ARM) && _M_ARM >= 7 && defined(_M_ARMT)) || defined(__thumb2__)
+#define SLJIT_CONFIG_ARM_THUMB2 1
+#elif (defined(_M_ARM) && _M_ARM >= 7)
+#define SLJIT_CONFIG_ARM_V7 1
#elif defined(_ARM_)
#define SLJIT_CONFIG_ARM_V5 1
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define SLJIT_CONFIG_ARM_64 1
#else
#define SLJIT_CONFIG_X86_32 1
#endif

-#endif /* !WIN32 */
+#endif /* !_WIN32 */
#endif /* SLJIT_CONFIG_AUTO */

 #if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
@@ -324,6 +330,11 @@
     sparc_cache_flush((from), (to))
 #define SLJIT_CACHE_FLUSH_OWN_IMPL 1


+#elif defined _WIN32
+
+#define SLJIT_CACHE_FLUSH(from, to) \
+    FlushInstructionCache(GetCurrentProcess(), (char*)(from), (char*)(to) - (char*)(from))
+
 #else


/* Calls __ARM_NR_cacheflush on ARM-Linux. */
@@ -371,13 +382,19 @@
#define SLJIT_64BIT_ARCHITECTURE 1
#define SLJIT_WORD_SHIFT 3
#ifdef _WIN32
+#ifdef __GNUC__
+/* These types do not require windows.h */
+typedef unsigned long long sljit_uw;
+typedef long long sljit_sw;
+#else
typedef unsigned __int64 sljit_uw;
typedef __int64 sljit_sw;
-#else
+#endif
+#else /* !_WIN32 */
typedef unsigned long int sljit_uw;
typedef long int sljit_sw;
+#endif /* _WIN32 */
#endif
-#endif

typedef sljit_uw sljit_p;

@@ -590,7 +607,7 @@

#define SLJIT_NUMBER_OF_REGISTERS 26
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
-#define SLJIT_LOCALS_OFFSET_BASE (2 * sizeof(sljit_sw))
+#define SLJIT_LOCALS_OFFSET_BASE 0

#elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)


Modified: code/trunk/src/sljit/sljitLir.c
===================================================================
--- code/trunk/src/sljit/sljitLir.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitLir.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -26,6 +26,13 @@


#include "sljitLir.h"

+#ifdef _WIN32
+
+/* For SLJIT_CACHE_FLUSH, which can expand to FlushInstructionCache. */
+#include <windows.h>
+
+#endif /* _WIN32 */
+
#if !(defined SLJIT_STD_MACROS_DEFINED && SLJIT_STD_MACROS_DEFINED)

/* These libraries are needed for the macros below. */
@@ -2178,7 +2185,8 @@

#endif

-#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
+    && !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)


SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{

Modified: code/trunk/src/sljit/sljitNativeARM_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_64.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitNativeARM_64.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -37,7 +37,7 @@
 #define TMP_REG1    (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2    (SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_LR        (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_SP        (SLJIT_NUMBER_OF_REGISTERS + 5)
+#define TMP_FP        (SLJIT_NUMBER_OF_REGISTERS + 5)


 #define TMP_FREG1    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
 #define TMP_FREG2    (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
@@ -44,7 +44,7 @@


 /* r18 - platform register, currently not used */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 8] = {
-    31, 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 29, 9, 10, 30, 31
+    31, 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 31, 9, 10, 30, 29
 };


static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
@@ -68,6 +68,7 @@

 #define ADC 0x9a000000
 #define ADD 0x8b000000
+#define ADDE 0x8b200000
 #define ADDI 0x91000000
 #define AND 0x8a000000
 #define ANDI 0x92000000
@@ -96,7 +97,8 @@
 #define FSUB 0x1e603800
 #define LDRI 0xf9400000
 #define LDP 0xa9400000
-#define LDP_PST 0xa8c00000
+#define LDP_PRE 0xa9c00000
+#define LDR_PRE 0xf8400c00
 #define LSLV 0x9ac02000
 #define LSRV 0x9ac02400
 #define MADD 0x9b000000
@@ -873,40 +875,32 @@
     CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
     set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


-    saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0);
-    local_size += saved_regs_size + SLJIT_LOCALS_OFFSET;
+    saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+    if (saved_regs_size & 0x8)
+        saved_regs_size += sizeof(sljit_sw);
+
     local_size = (local_size + 15) & ~0xf;
-    compiler->local_size = local_size;
+    compiler->local_size = local_size + saved_regs_size;


-    if (local_size <= (63 * sizeof(sljit_sw))) {
-        FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR)
-            | RN(TMP_SP) | ((-(local_size >> 3) & 0x7f) << 15)));
-        FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_SP) | (0 << 10)));
-        offs = (local_size - saved_regs_size) << (15 - 3);
-    } else {
-        offs = 0 << 15;
-        if (saved_regs_size & 0x8) {
-            offs = 1 << 15;
-            saved_regs_size += sizeof(sljit_sw);
-        }
-        local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
-        if (saved_regs_size > 0)
-            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
-    }
+    FAIL_IF(push_inst(compiler, STP_PRE | RT(TMP_FP) | RT2(TMP_LR)
+        | RN(SLJIT_SP) | ((-(saved_regs_size >> 3) & 0x7f) << 15)));


+#ifdef _WIN32
+    if (local_size >= 4096)
+        FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(SLJIT_SP) | (1 << 10) | (1 << 22)));
+    else if (local_size > 256)
+        FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(SLJIT_SP) | (local_size << 10)));
+#endif
+
     tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
     prev = -1;
+    offs = 2 << 15;
     for (i = SLJIT_S0; i >= tmp; i--) {
         if (prev == -1) {
-            if (!(offs & (1 << 15))) {
-                prev = i;
-                continue;
-            }
-            FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
-            offs += 1 << 15;
+            prev = i;
             continue;
         }
-        FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+        FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
         offs += 2 << 15;
         prev = -1;
     }
@@ -913,34 +907,20 @@


     for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
         if (prev == -1) {
-            if (!(offs & (1 << 15))) {
-                prev = i;
-                continue;
-            }
-            FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
-            offs += 1 << 15;
+            prev = i;
             continue;
         }
-        FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+        FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
         offs += 2 << 15;
         prev = -1;
     }


-    SLJIT_ASSERT(prev == -1);
+    if (prev != -1)
+        FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(SLJIT_SP) | (offs >> 5)));


-    if (compiler->local_size > (63 * sizeof(sljit_sw))) {
-        /* The local_size is already adjusted by the saved registers. */
-        if (local_size > 0xfff) {
-            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
-            local_size &= 0xfff;
-        }
-        if (local_size)
-            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
-        FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR)
-            | RN(TMP_SP) | ((-(16 >> 3) & 0x7f) << 15)));
-        FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_SP) | (0 << 10)));
-    }


+    FAIL_IF(push_inst(compiler, ADDI | RD(TMP_FP) | RN(SLJIT_SP) | (0 << 10)));
+
     args = get_arg_count(arg_types);


     if (args >= 1)
@@ -950,6 +930,64 @@
     if (args >= 3)
         FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_S2) | RN(TMP_ZERO) | RM(SLJIT_R2)));


+#ifdef _WIN32
+    if (local_size >= 4096) {
+        if (local_size < 4 * 4096) {
+            /* No need for a loop. */
+            if (local_size >= 2 * 4096) {
+                FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+                FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+                local_size -= 4096;
+            }
+
+            if (local_size >= 2 * 4096) {
+                FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+                FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+                local_size -= 4096;
+            }
+
+            FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+            local_size -= 4096;
+        }
+        else {
+            FAIL_IF(push_inst(compiler, MOVZ | RD(TMP_REG2) | (((local_size >> 12) - 1) << 5)));
+            FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+            FAIL_IF(push_inst(compiler, SUBI | (1 << 29) | RD(TMP_REG2) | RN(TMP_REG2) | (1 << 10)));
+            FAIL_IF(push_inst(compiler, B_CC | ((((sljit_ins) -3) & 0x7ffff) << 5) | 0x1 /* not-equal */));
+            FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+
+            local_size &= 0xfff;
+        }
+
+        if (local_size > 256) {
+            FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (local_size << 10)));
+            FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+        }
+        else if (local_size > 0)
+            FAIL_IF(push_inst(compiler, LDR_PRE | RT(TMP_ZERO) | RN(TMP_REG1) | ((-local_size & 0x1ff) << 12)));
+
+        FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_REG1) | (0 << 10)));
+    }
+    else if (local_size > 256) {
+        FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+        FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_REG1) | (0 << 10)));
+    }
+    else if (local_size > 0)
+        FAIL_IF(push_inst(compiler, LDR_PRE | RT(TMP_ZERO) | RN(SLJIT_SP) | ((-local_size & 0x1ff) << 12)));
+
+#else /* !_WIN32 */
+
+    /* The local_size does not include saved registers size. */
+    if (local_size > 0xfff) {
+        FAIL_IF(push_inst(compiler, SUBI | RD(SLJIT_SP) | RN(SLJIT_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+        local_size &= 0xfff;
+    }
+    if (local_size != 0)
+        FAIL_IF(push_inst(compiler, SUBI | RD(SLJIT_SP) | RN(SLJIT_SP) | (local_size << 10)));
+
+#endif /* _WIN32 */
+
     return SLJIT_SUCCESS;
 }


@@ -957,13 +995,17 @@
     sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
     sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
+    sljit_s32 saved_regs_size;
+
     CHECK_ERROR();
     CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
     set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


-    local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0) + SLJIT_LOCALS_OFFSET;
-    local_size = (local_size + 15) & ~0xf;
-    compiler->local_size = local_size;
+    saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+    if (saved_regs_size & 0x8)
+        saved_regs_size += sizeof(sljit_sw);
+
+    compiler->local_size = saved_regs_size + ((local_size + 15) & ~0xf);
     return SLJIT_SUCCESS;
 }


@@ -977,41 +1019,39 @@

     FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));


-    local_size = compiler->local_size;
+    saved_regs_size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds, 2);
+    if (saved_regs_size & 0x8)
+        saved_regs_size += sizeof(sljit_sw);


-    saved_regs_size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds, 0);
-    if (local_size <= (63 * sizeof(sljit_sw)))
-        offs = (local_size - saved_regs_size) << (15 - 3);
+    local_size = compiler->local_size - saved_regs_size;
+
+    /* Load LR as early as possible. */
+    if (local_size == 0)
+        FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+    else if (local_size < 63 * sizeof(sljit_sw)) {
+        FAIL_IF(push_inst(compiler, LDP_PRE | RT(TMP_FP) | RT2(TMP_LR)
+            | RN(SLJIT_SP) | (local_size << (15 - 3))));
+    }
     else {
-        FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
-            | RN(TMP_SP) | (((16 >> 3) & 0x7f) << 15)));
-        offs = 0 << 15;
-        if (saved_regs_size & 0x8) {
-            offs = 1 << 15;
-            saved_regs_size += sizeof(sljit_sw);
-        }
-        local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
         if (local_size > 0xfff) {
-            FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+            FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | ((local_size >> 12) << 10) | (1 << 22)));
             local_size &= 0xfff;
         }
         if (local_size)
-            FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
+            FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | (local_size << 10)));
+
+        FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
     }


     tmp = compiler->saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - compiler->saveds) : SLJIT_FIRST_SAVED_REG;
     prev = -1;
+    offs = 2 << 15;
     for (i = SLJIT_S0; i >= tmp; i--) {
         if (prev == -1) {
-            if (!(offs & (1 << 15))) {
-                prev = i;
-                continue;
-            }
-            FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
-            offs += 1 << 15;
+            prev = i;
             continue;
         }
-        FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+        FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
         offs += 2 << 15;
         prev = -1;
     }
@@ -1018,30 +1058,20 @@


     for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
         if (prev == -1) {
-            if (!(offs & (1 << 15))) {
-                prev = i;
-                continue;
-            }
-            FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
-            offs += 1 << 15;
+            prev = i;
             continue;
         }
-        FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+        FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
         offs += 2 << 15;
         prev = -1;
     }


-    SLJIT_ASSERT(prev == -1);
+    if (prev != -1)
+        FAIL_IF(push_inst(compiler, LDRI | RT(prev) | RN(SLJIT_SP) | (offs >> 5)));


-    if (compiler->local_size <= (63 * sizeof(sljit_sw))) {
-        FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
-            | RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15)));
-    } else if (saved_regs_size > 0) {
-        FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
-    }
-
-    FAIL_IF(push_inst(compiler, RET | RN(TMP_LR)));
-    return SLJIT_SUCCESS;
+    /* These two can be executed in parallel. */
+    FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | (saved_regs_size << 10)));
+    return push_inst(compiler, RET | RN(TMP_LR));
 }


 /* --------------------------------------------------------------------- */
@@ -1856,6 +1886,46 @@
     return push_inst(compiler, inst | VT(freg) | RN(mem & REG_MASK) | ((memw & 0x1ff) << 12));
 }


+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
+{
+    sljit_s32 dst_reg;
+    sljit_ins ins;
+
+    CHECK_ERROR();
+    CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
+
+    SLJIT_ASSERT (SLJIT_LOCALS_OFFSET_BASE == 0);
+
+    dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
+
+    if (offset <= 0xffffff && offset >= -0xffffff) {
+        ins = ADDI;
+        if (offset < 0) {
+            offset = -offset;
+            ins = SUBI;
+        }
+
+        if (offset <= 0xfff)
+            FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(SLJIT_SP) | (offset << 10)));
+        else {
+            FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(SLJIT_SP) | ((offset & 0xfff000) >> (12 - 10)) | (1 << 22)));
+
+            offset &= 0xfff;
+            if (offset != 0)
+                FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(dst_reg) | (offset << 10)));
+        }
+    }
+    else {
+        FAIL_IF(load_immediate (compiler, dst_reg, offset));
+        /* Add extended register form. */
+        FAIL_IF(push_inst(compiler, ADDE | (0x3 << 13) | RD(dst_reg) | RN(SLJIT_SP) | RM(dst_reg)));
+    }
+
+    if (SLJIT_UNLIKELY(dst & SLJIT_MEM))
+        return emit_op_mem(compiler, WORD_SIZE | STORE, dst_reg, dst, dstw, TMP_REG1);
+    return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
     struct sljit_const *const_;


Modified: code/trunk/src/sljit/sljitNativeARM_T2_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeARM_T2_32.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitNativeARM_T2_32.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -110,6 +110,7 @@
 #define ASRSI        0x1000
 #define ASR_W        0xfa40f000
 #define ASR_WI        0xea4f0020
+#define BCC        0xd000
 #define BICI        0xf0200000
 #define BKPT        0xbe00
 #define BLX        0x4780
@@ -125,6 +126,7 @@
 #define EORS        0x4040
 #define EOR_W        0xea800000
 #define IT        0xbf00
+#define LDRI        0xf8500800
 #define LSLS        0x4080
 #define LSLSI        0x0000
 #define LSL_W        0xfa00f000
@@ -158,6 +160,7 @@
 #define SBCI        0xf1600000
 #define SBCS        0x4180
 #define SBC_W        0xeb600000
+#define SDIV        0xfb90f0f0
 #define SMULL        0xfb800000
 #define STR_SP        0x9000
 #define SUBS        0x1a00
@@ -172,6 +175,7 @@
 #define SXTH        0xb200
 #define SXTH_W        0xfa0ff080
 #define TST        0x4200
+#define UDIV        0xfbb0f0f0
 #define UMULL        0xfba00000
 #define UXTB        0xb2c0
 #define UXTB_W        0xfa5ff080
@@ -339,8 +343,8 @@


     /* Really complex instruction form for branches. */
     s = (diff >> 23) & 0x1;
-    j1 = (~(diff >> 21) ^ s) & 0x1;
-    j2 = (~(diff >> 22) ^ s) & 0x1;
+    j1 = (~(diff >> 22) ^ s) & 0x1;
+    j2 = (~(diff >> 21) ^ s) & 0x1;
     jump_inst[0] = 0xf000 | (s << 10) | COPY_BITS(diff, 11, 0, 10);
     jump_inst[1] = (j1 << 13) | (j2 << 11) | (diff & 0x7ff);


@@ -520,6 +524,8 @@
 {
     sljit_uw tmp;


+    /* MOVS cannot be used since it destroys flags. */
+
     if (imm >= 0x10000) {
         tmp = get_imm(imm);
         if (tmp != INVALID_IMM)
@@ -1032,6 +1038,9 @@
 {
     sljit_s32 args, size, i, tmp;
     sljit_ins push = 0;
+#ifdef _WIN32
+    sljit_uw imm;
+#endif


     CHECK_ERROR();
     CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
@@ -1052,6 +1061,18 @@
     size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
     local_size = ((size + local_size + 7) & ~7) - size;
     compiler->local_size = local_size;
+
+#ifdef _WIN32
+    if (local_size >= 256) {
+        if (local_size > 4096)
+            imm = get_imm(4096);
+        else
+            imm = get_imm(local_size & ~0xff);
+
+        SLJIT_ASSERT(imm != INVALID_IMM);
+        FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(SLJIT_SP) | imm));
+    }
+#else
     if (local_size > 0) {
         if (local_size <= (127 << 2))
             FAIL_IF(push_inst16(compiler, SUB_SP | (local_size >> 2)));
@@ -1058,6 +1079,7 @@
         else
             FAIL_IF(emit_op_imm(compiler, SLJIT_SUB | ARG2_IMM, SLJIT_SP, SLJIT_SP, local_size));
     }
+#endif


     args = get_arg_count(arg_types);


@@ -1068,6 +1090,61 @@
     if (args >= 3)
         FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_S2, SLJIT_R2)));


+#ifdef _WIN32
+    if (local_size >= 256) {
+        if (local_size > 4096) {
+            imm = get_imm(4096);
+            SLJIT_ASSERT(imm != INVALID_IMM);
+
+            if (local_size < 4 * 4096) {
+                if (local_size > 2 * 4096) {
+                    FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+                    FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+                    local_size -= 4096;
+                }
+
+                if (local_size > 2 * 4096) {
+                    FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+                    FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+                    local_size -= 4096;
+                }
+
+                FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+                local_size -= 4096;
+
+                SLJIT_ASSERT(local_size > 0);
+            }
+            else {
+                FAIL_IF(load_immediate(compiler, SLJIT_R3, (local_size >> 12) - 1));
+                FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+                FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+                SLJIT_ASSERT(reg_map[SLJIT_R3] < 7);
+                FAIL_IF(push_inst16(compiler, SUBSI8 | RDN3(SLJIT_R3) | 1));
+                FAIL_IF(push_inst16(compiler, BCC | (0x1 << 8) /* not-equal */ | (-7 & 0xff)));
+
+                local_size &= 0xfff;
+
+                if (local_size != 0)
+                    FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+            }
+
+            if (local_size >= 256) {
+                imm = get_imm(local_size & ~0xff);
+                SLJIT_ASSERT(imm != INVALID_IMM);
+
+                FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+            }
+        }
+
+        local_size &= 0xff;
+        FAIL_IF(push_inst32(compiler, LDRI | 0x400 | (local_size > 0 ? 0x100 : 0) | RT4(TMP_REG2) | RN4(TMP_REG1) | local_size));
+
+        FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_SP, TMP_REG1)));
+    }
+    else if (local_size > 0)
+        FAIL_IF(push_inst32(compiler, LDRI | 0x500 | RT4(TMP_REG1) | RN4(SLJIT_SP) | local_size));
+#endif
+
     return SLJIT_SUCCESS;
 }


@@ -1119,11 +1196,16 @@
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */


+#if !(defined __ARM_FEATURE_IDIV) && !(defined __ARM_ARCH_EXT_IDIV__)
+
#ifdef __cplusplus
extern "C" {
#endif

-#if defined(__GNUC__)
+#ifdef _WIN32
+extern unsigned long long __rt_udiv(unsigned int denominator, unsigned int numerator);
+extern long long __rt_sdiv(int denominator, int numerator);
+#elif defined(__GNUC__)
extern unsigned int __aeabi_uidivmod(unsigned int numerator, int unsigned denominator);
extern int __aeabi_idivmod(int numerator, int denominator);
#else
@@ -1134,10 +1216,14 @@
}
#endif

+#endif /* !__ARM_FEATURE_IDIV && !__ARM_ARCH_EXT_IDIV__ */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
 {
+#if !(defined __ARM_FEATURE_IDIV) && !(defined __ARM_ARCH_EXT_IDIV__)
     sljit_sw saved_reg_list[3];
     sljit_sw saved_reg_count;
+#endif


     CHECK_ERROR();
     CHECK(check_sljit_emit_op0(compiler, op));
@@ -1155,10 +1241,21 @@
             | (reg_map[SLJIT_R0] << 12)
             | (reg_map[SLJIT_R0] << 16)
             | reg_map[SLJIT_R1]);
+#if (defined __ARM_FEATURE_IDIV) || (defined __ARM_ARCH_EXT_IDIV__)
     case SLJIT_DIVMOD_UW:
     case SLJIT_DIVMOD_SW:
+        FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, SLJIT_R0)));
+        FAIL_IF(push_inst32(compiler, (op == SLJIT_DIVMOD_UW ? UDIV : SDIV) | RD4(SLJIT_R0) | RN4(SLJIT_R0) | RM4(SLJIT_R1)));
+        FAIL_IF(push_inst32(compiler, MUL | RD4(SLJIT_R1) | RN4(SLJIT_R0) | RM4(SLJIT_R1)));
+        return push_inst32(compiler, SUB_W | RD4(SLJIT_R1) | RN4(TMP_REG1) | RM4(SLJIT_R1));
     case SLJIT_DIV_UW:
     case SLJIT_DIV_SW:
+        return push_inst32(compiler, (op == SLJIT_DIV_UW ? UDIV : SDIV) | RD4(SLJIT_R0) | RN4(SLJIT_R0) | RM4(SLJIT_R1));
+#else /* !__ARM_FEATURE_IDIV && !__ARM_ARCH_EXT_IDIV__ */
+    case SLJIT_DIVMOD_UW:
+    case SLJIT_DIVMOD_SW:
+    case SLJIT_DIV_UW:
+    case SLJIT_DIV_SW:
         SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
         SLJIT_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 3);


@@ -1183,8 +1280,14 @@
             }
         }


-#if defined(__GNUC__)
+#ifdef _WIN32
+        FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, SLJIT_R0)));
+        FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_R0, SLJIT_R1)));
+        FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_R1, TMP_REG1)));
         FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
+            ((op | 0x2) == SLJIT_DIV_UW ? SLJIT_FUNC_OFFSET(__rt_udiv) : SLJIT_FUNC_OFFSET(__rt_sdiv))));
+#elif defined(__GNUC__)
+        FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
             ((op | 0x2) == SLJIT_DIV_UW ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
 #else
 #error "Software divmod functions are needed"
@@ -1203,6 +1306,7 @@
                         | (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */);
         }
         return SLJIT_SUCCESS;
+#endif /* __ARM_FEATURE_IDIV || __ARM_ARCH_EXT_IDIV__ */
     }


     return SLJIT_SUCCESS;


Modified: code/trunk/src/sljit/sljitNativeX86_32.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_32.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitNativeX86_32.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -123,34 +123,38 @@


 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
     if (args > 0) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | reg_map[SLJIT_R2];
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_REG | (reg_map[SLJIT_S0] << 3) | reg_map[SLJIT_R2];
+        inst += 2;
     }
     if (args > 1) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_REG | (reg_map[SLJIT_S1] << 3) | reg_map[SLJIT_R1];
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_REG | (reg_map[SLJIT_S1] << 3) | reg_map[SLJIT_R1];
+        inst += 2;
     }
     if (args > 2) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | 0x4 /* esp */;
-        *inst++ = 0x24;
-        *inst++ = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | 0x4 /* esp */;
+        inst[2] = 0x24;
+        inst[3] = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
     }
 #else
     if (args > 0) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_S0] << 3) | reg_map[TMP_REG1];
-        *inst++ = sizeof(sljit_sw) * 2;
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_DISP8 | (reg_map[SLJIT_S0] << 3) | reg_map[TMP_REG1];
+        inst[2] = sizeof(sljit_sw) * 2;
+        inst += 3;
     }
     if (args > 1) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_S1] << 3) | reg_map[TMP_REG1];
-        *inst++ = sizeof(sljit_sw) * 3;
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_DISP8 | (reg_map[SLJIT_S1] << 3) | reg_map[TMP_REG1];
+        inst[2] = sizeof(sljit_sw) * 3;
+        inst += 3;
     }
     if (args > 2) {
-        *inst++ = MOV_r_rm;
-        *inst++ = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | reg_map[TMP_REG1];
-        *inst++ = sizeof(sljit_sw) * 4;
+        inst[0] = MOV_r_rm;
+        inst[1] = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | reg_map[TMP_REG1];
+        inst[2] = sizeof(sljit_sw) * 4;
     }
 #endif


@@ -170,17 +174,36 @@
     compiler->local_size = local_size;


 #ifdef _WIN32
-    if (local_size > 1024) {
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-        FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_R0], local_size));
-#else
-        /* Space for a single argument. This amount is excluded when the stack is allocated below. */
-        local_size -= sizeof(sljit_sw);
-        FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_R0], local_size));
-        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
-            SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, sizeof(sljit_sw)));
-#endif
-        FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
+    if (local_size > 0) {
+        if (local_size <= 4 * 4096) {
+            if (local_size > 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096);
+            if (local_size > 2 * 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 2);
+            if (local_size > 3 * 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 3);
+        }
+        else {
+            EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_SP, 0);
+            EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_IMM, (local_size - 1) >> 12);
+
+            SLJIT_ASSERT (reg_map[SLJIT_R0] == 0);
+
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_R0), -4096);
+            FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+                SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 4096));
+            FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+                SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1));
+
+            inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
+            FAIL_IF(!inst);
+
+            INC_SIZE(2);
+            inst[0] = JNE_i8;
+            inst[1] = (sljit_s8) -16;
+        }
+
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -local_size);
     }
 #endif



Modified: code/trunk/src/sljit/sljitNativeX86_64.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_64.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitNativeX86_64.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -83,6 +83,8 @@
     CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
     set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);


+    compiler->mode32 = 0;
+
 #ifdef _WIN64
     /* Two/four register slots for parameters plus space for xmm6 register if needed. */
     if (fscratches >= 6 || fsaveds >= 1)
@@ -126,35 +128,39 @@


 #ifndef _WIN64
         if (args > 0) {
-            *inst++ = REX_W;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x7 /* rdi */;
+            inst[0] = REX_W;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x7 /* rdi */;
+            inst += 3;
         }
         if (args > 1) {
-            *inst++ = REX_W | REX_R;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_lmap[SLJIT_S1] << 3) | 0x6 /* rsi */;
+            inst[0] = REX_W | REX_R;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_lmap[SLJIT_S1] << 3) | 0x6 /* rsi */;
+            inst += 3;
         }
         if (args > 2) {
-            *inst++ = REX_W | REX_R;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_lmap[SLJIT_S2] << 3) | 0x2 /* rdx */;
+            inst[0] = REX_W | REX_R;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_lmap[SLJIT_S2] << 3) | 0x2 /* rdx */;
         }
 #else
         if (args > 0) {
-            *inst++ = REX_W;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x1 /* rcx */;
+            inst[0] = REX_W;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x1 /* rcx */;
+            inst += 3;
         }
         if (args > 1) {
-            *inst++ = REX_W;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_map[SLJIT_S1] << 3) | 0x2 /* rdx */;
+            inst[0] = REX_W;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_map[SLJIT_S1] << 3) | 0x2 /* rdx */;
+            inst += 3;
         }
         if (args > 2) {
-            *inst++ = REX_W | REX_B;
-            *inst++ = MOV_r_rm;
-            *inst++ = MOD_REG | (reg_map[SLJIT_S2] << 3) | 0x0 /* r8 */;
+            inst[0] = REX_W | REX_B;
+            inst[1] = MOV_r_rm;
+            inst[2] = MOD_REG | (reg_map[SLJIT_S2] << 3) | 0x0 /* r8 */;
         }
 #endif
     }
@@ -163,60 +169,44 @@
     compiler->local_size = local_size;


 #ifdef _WIN64
-    if (local_size > 1024) {
-        /* Allocate stack for the callback, which grows the stack. */
-        inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + (3 + sizeof(sljit_s32)));
-        FAIL_IF(!inst);
-        INC_SIZE(4 + (3 + sizeof(sljit_s32)));
-        *inst++ = REX_W;
-        *inst++ = GROUP_BINARY_83;
-        *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
-        /* Allocated size for registers must be divisible by 8. */
-        SLJIT_ASSERT(!(saved_register_size & 0x7));
-        /* Aligned to 16 byte. */
-        if (saved_register_size & 0x8) {
-            *inst++ = 5 * sizeof(sljit_sw);
-            local_size -= 5 * sizeof(sljit_sw);
-        } else {
-            *inst++ = 4 * sizeof(sljit_sw);
-            local_size -= 4 * sizeof(sljit_sw);
-        }
-        /* Second instruction */
-        SLJIT_ASSERT(reg_map[SLJIT_R0] < 8);
-        *inst++ = REX_W;
-        *inst++ = MOV_rm_i32;
-        *inst++ = MOD_REG | reg_lmap[SLJIT_R0];
-        sljit_unaligned_store_s32(inst, local_size);
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-            || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-        compiler->skip_checks = 1;
-#endif
-        FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
-    }
-#endif
-
     if (local_size > 0) {
-        if (local_size <= 127) {
-            inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
-            FAIL_IF(!inst);
-            INC_SIZE(4);
-            *inst++ = REX_W;
-            *inst++ = GROUP_BINARY_83;
-            *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
-            *inst++ = local_size;
+        if (local_size <= 4 * 4096) {
+            if (local_size > 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096);
+            if (local_size > 2 * 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 2);
+            if (local_size > 3 * 4096)
+                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 3);
         }
         else {
-            inst = (sljit_u8*)ensure_buf(compiler, 1 + 7);
+            EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_SP, 0);
+            EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, (local_size - 1) >> 12);
+
+            SLJIT_ASSERT (reg_map[SLJIT_R0] == 0);
+
+            EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_MEM1(SLJIT_R0), -4096);
+            FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+                SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 4096));
+            FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+                TMP_REG1, 0, TMP_REG1, 0, SLJIT_IMM, 1));
+
+            inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
             FAIL_IF(!inst);
-            INC_SIZE(7);
-            *inst++ = REX_W;
-            *inst++ = GROUP_BINARY_81;
-            *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
-            sljit_unaligned_store_s32(inst, local_size);
-            inst += sizeof(sljit_s32);
+
+            INC_SIZE(2);
+            inst[0] = JNE_i8;
+            inst[1] = (sljit_s8) -19;
         }
+
+        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -local_size);
     }
+#endif


+    if (local_size > 0) {
+        FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+            SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, local_size));
+    }
+
 #ifdef _WIN64
     /* Save xmm6 register: movaps [rsp + 0x20], xmm6 */
     if (fscratches >= 6 || fsaveds >= 1) {


Modified: code/trunk/src/sljit/sljitNativeX86_common.c
===================================================================
--- code/trunk/src/sljit/sljitNativeX86_common.c    2018-03-04 15:30:46 UTC (rev 927)
+++ code/trunk/src/sljit/sljitNativeX86_common.c    2018-03-13 12:05:48 UTC (rev 928)
@@ -669,23 +669,6 @@
 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
     sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);


-#ifdef _WIN32
-#include <malloc.h>
-
-static void SLJIT_FUNC sljit_grow_stack(sljit_sw local_size)
-{
-    /* Workaround for calling the internal _chkstk() function on Windows.
-    This function touches all 4k pages belongs to the requested stack space,
-    which size is passed in local_size. This is necessary on Windows where
-    the stack can only grow in 4k steps. However, this function just burn
-    CPU cycles if the stack is large enough. However, you don't know it in
-    advance, so it must always be called. I think this is a bad design in
-    general even if it has some reasons. */
-    *(volatile sljit_s32*)alloca(local_size) = 0;
-}
-
-#endif
-
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 #include "sljitNativeX86_32.c"
 #else