Compare commits

...

260 Commits

Author SHA1 Message Date
Mike Pall
eec7a8016c Prevent Clang UB 'optimization' which breaks integerness checks.
Thanks to Kacper Michajłow. #1351 #1355
2025-04-10 22:53:50 +02:00
Mike Pall
51d4c26ec7 ARM: Fix soft-float math.min()/math.max().
Reported by Dong Jianqiang. #1356
2025-04-10 22:45:38 +02:00
Mike Pall
c262976486 ARM64: Fix pass-by-value struct calling conventions.
Reported by AnthonyK213. #1357
2025-04-10 22:06:47 +02:00
Mike Pall
e0a7ea8a92 Merge branch 'master' into v2.1 2025-04-07 10:33:15 +02:00
Mike Pall
e76bb50d44 Fix error generation in load*.
Reported by Sergey Kaplun. #1353
2025-04-07 10:27:40 +02:00
Mike Pall
e9e4b6d302 Initialize unused value when specializing to cdata metatable.
Reported by jakitliang. #1354
2025-04-07 09:22:07 +02:00
Mike Pall
538a82133a Change handling of nil value markers in template tables.
Reported by Bernhard M. Wiedemann. #1348 #1155
Fixes from Peter Cawley, Christian Clason, Lewis Russell.
2025-03-11 23:04:30 +01:00
Mike Pall
84cb21ffaf REVERT: Change handling of nil value markers in template tables. 2025-03-10 02:56:07 +01:00
Mike Pall
4f2bb199fe macOS: Fix Apple hardened runtime support and put behind build option.
Reported by vanc. #1334
2025-03-10 02:53:20 +01:00
Mike Pall
e3c70a7d81 macOS: Fix support for Apple hardened runtime.
Reported by Christian Clason. #1334
2025-03-10 00:05:08 +01:00
Mike Pall
7db2d1b12a Fix handling of nil value markers in template tables.
Thanks to Peter Cawley. #1348 #1155
2025-03-09 23:11:05 +01:00
Mike Pall
e0551670c9 Merge branch 'master' into v2.1 2025-03-09 23:09:02 +01:00
Mike Pall
85c3f2fb6f Avoid unpatching bytecode twice after a trace flush.
Reported by Sergey Kaplun. #1345
2025-03-09 23:04:23 +01:00
Mike Pall
eee16efa77 Fix state restore when recording __concat metamethod.
Reported by Sergey Kaplun. #1338 #1298
2025-03-09 21:28:17 +01:00
Mike Pall
4219efae43 Windows: Allow mixed builds with msvcbuild.bat.
Suggested by alex4814. #1341
2025-03-09 21:05:06 +01:00
Mike Pall
0254770582 macOS: Add support for Apple hardened runtime.
Thanks to Peter Cawley. #1334
2025-03-09 20:45:22 +01:00
Mike Pall
f14556234c Merge branch 'master' into v2.1 2025-03-09 16:25:34 +01:00
Mike Pall
d508715ab6 Add compatibility string coercion for fp:seek() argument.
Reported by Magnus Wibeck. #1343
2025-03-09 16:21:29 +01:00
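A minimal sketch of the compatibility behavior added in the entry above, assuming the numeric offset may now also be passed as a string (the file name is purely illustrative):

local f = assert(io.open("seek_demo.txt", "w+"))  -- illustrative scratch file
f:write("hello")
f:seek("set", "1")       -- the string "1" is coerced to the number 1, as in PUC Lua
print(f:read("*a"))      --> ello
f:close()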
Mike Pall
e27ee68817 Windows: Clarify installation directory layout.
Suggested by eabase. #1346
2025-03-09 16:10:22 +01:00
Mike Pall
55a42da36e Remove Cygwin from docs, since it's not a supported target. 2025-03-09 16:09:36 +01:00
Mike Pall
423ac2144b Improve CLI signal handling on POSIX. 2025-03-09 15:50:01 +01:00
Mike Pall
54dc2fa5d7 FFI: Add pre-declared int128_t, uint128_t, __int128 types.
Note: Only declaration and copy (interpreted only) are implemented.
2025-03-09 15:37:35 +01:00
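A sketch of what the pre-declared 128 bit types allow so far, based on the note above (declaration and copying only, interpreter only):

local ffi = require("ffi")
local a = ffi.new("int128_t")      -- declaration: a zero-initialized 16 byte box
local b = ffi.new("uint128_t")     -- unsigned variant, same size
local c = ffi.new("int128_t", a)   -- copying an existing value is supported
print(ffi.sizeof("__int128"))      --> 16
-- Arithmetic on these types is not implemented, and only the interpreter
-- handles them; the JIT compiler does not.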
Mike Pall
b1179ea5f7 Use dylib extension for iOS installs, too.
Reported by Andrey Filipenkov. #1336
2025-03-09 15:00:15 +01:00
Mike Pall
5eb9509468 Change handling of nil value markers in template tables.
Reported by Bernhard M. Wiedemann. #1348 #1155
2025-03-09 14:44:57 +01:00
Mike Pall
a4f56a459a Merge branch 'master' into v2.1 2025-01-13 16:22:22 +01:00
Mike Pall
62e362afbb Fix recording of BC_VARG.
Reported by Bachir Bendrissou.
2025-01-13 16:19:57 +01:00
Mike Pall
9d777346bc Reject negative getfenv()/setfenv() levels to prevent compiler warning.
Thanks to Sergey Kaplun. #1329
2025-01-13 16:16:27 +01:00
Mike Pall
8358eb0cce Merge branch 'master' into v2.1 2025-01-13 16:15:19 +01:00
Mike Pall
e8236561d4 Bump copyright date. 2025-01-13 15:59:10 +01:00
Mike Pall
f73e649a95 Merge branch 'master' into v2.1 2024-12-16 14:32:07 +01:00
Mike Pall
e2e0b1dd2d Force fallback source name for stripped bytecode.
Reported by Lyrth. #1319
2024-12-16 14:30:10 +01:00
Mike Pall
cd8d0a437d Remove dependency on <limits.h>.
Reported by yupengda002. #1318
2024-12-16 14:27:58 +01:00
Mike Pall
19878ec05c Restore state when recording __concat metamethod throws OOM.
Reported by Sergey Kaplun. #1298 #1234
2024-11-28 18:07:58 +01:00
Mike Pall
35a4dd6f79 MIPS64: Fix pcall() error case.
Thanks to Sergey Kaplun. #1308
2024-11-28 16:33:18 +01:00
Mike Pall
4788e6f92a Merge branch 'master' into v2.1 2024-11-28 16:28:51 +01:00
Mike Pall
811e448daa Fix detection of inconsistent renames due to sunk values.
Thanks to Sergey Kaplun. #1295 #584
2024-11-28 16:26:10 +01:00
Mike Pall
fe71d0fb54 Windows: Allow amalgamated static builds with msvcbuild.bat.
Reported by Naman Dixit. #1289
2024-11-14 17:21:00 +01:00
Mike Pall
fca66335d1 Always close profiler output file.
Reported by Guilherme Batalheiro. #1304
2024-11-14 17:13:58 +01:00
Mike Pall
9ce8f1ff8e Fix override of INSTALL_LJLIBD in the presence of DESTDIR.
Reported by faithanalog. #1239 #1303
2024-11-14 17:09:07 +01:00
Mike Pall
69bbf3c1b0 Fix bit op coercion for shifts in DUALNUM builds.
Reported by Junlong Li. Followup to #1273
2024-11-13 09:18:32 +01:00
Mike Pall
97813fb924 macOS: Remove obsolete -single_module flag.
Thanks to dundargoc. #1284
2024-10-02 13:59:42 +02:00
Mike Pall
b2915e9ab5 macOS: Workaround for buggy XCode 15.0 - 15.2 linker.
Thanks to Carlo Cabrera. #1283
2024-10-02 12:12:56 +02:00
Mike Pall
2240d84464 macOS: Fix macOS 15 / Clang 16 build.
Note: The -Wl,-no_deduplicate workaround is NOT needed anymore.
Thanks to fxcoudert, corsix, clason, baconpaul, mvf. #1275 #1266
2024-10-02 02:06:25 +02:00
Mike Pall
f5fd22203e Fix bit op coercion in DUALNUM builds.
Thanks to Sergey Kaplun. #1273
2024-09-29 16:46:29 +02:00
Mike Pall
0ae532c9aa Merge branch 'master' into v2.1 2024-09-29 16:11:15 +02:00
Mike Pall
5141cbc20c Fix compilation of getmetatable() for UDTYPE_IO_FILE.
Reported by Sergey Bronnikov. #1279
2024-09-29 16:03:37 +02:00
Mike Pall
c63a160706 Remove ancient RtlUnwindEx workaround for MinGW64.
Thanks to Kacper Michajłow. #1272
2024-09-29 15:33:32 +02:00
Mike Pall
87ae18af97 Drop unused function wrapper.
Follow-up to #1247.
2024-09-04 14:32:08 +02:00
Mike Pall
f725e44cda Merge branch 'master' into v2.1 2024-08-24 17:14:51 +02:00
Mike Pall
e45fd4cb71 Fix limit check in narrow_conv_backprop().
Thanks to Sergey Kaplun. #1262
2024-08-24 17:11:45 +02:00
Mike Pall
9bb6b35f7f Always use IRT_NIL for IR_TBAR.
Thanks to Peter Cawley. #1258
2024-08-24 17:03:17 +02:00
Mike Pall
c68711cc87 ARM64: Use ldr literal to load FP constants.
Thanks to Peter Cawley. #1255
2024-08-21 11:31:29 +02:00
Mike Pall
304da39cc5 FFI: Add missing coercion when recording 64-bit bit.*().
Thanks to Peter Cawley. #1252
2024-08-20 19:13:59 +02:00
Mike Pall
cdc2db3aea ARM64: Make tobit conversions match JIT backend behavior.
Thanks to Peter Cawley. #1253
2024-08-20 19:01:51 +02:00
Mike Pall
f4fa5646a8 Merge branch 'master' into v2.1 2024-08-20 19:01:38 +02:00
Mike Pall
32a683d226 ARM: Make hard-float tobit conversions match JIT backend behavior.
Reported by Peter Cawley. #1253
2024-08-20 19:00:47 +02:00
Mike Pall
fb22d0f80f FFI: Drop finalizer table rehash after GC cycle.
Reported by Sergey Kaplun. #1247
2024-08-19 20:00:21 +02:00
Mike Pall
fb5e1c9f0d Merge branch 'master' into v2.1 2024-08-19 17:33:23 +02:00
Mike Pall
ab39082fdd Fix another potential file descriptor leak in luaL_loadfile*().
Reported by Peter Cawley. #1249
2024-08-19 17:31:15 +02:00
Mike Pall
fddc9650d8 Merge branch 'master' into v2.1 2024-08-19 16:22:55 +02:00
Mike Pall
bcc6cbb188 MIPS32: Fix little-endian IR_RETF.
Thanks to Peter Cawley. #1250
2024-08-19 16:17:44 +02:00
Mike Pall
5ca25ee83e Correctly close VM state after early OOM during open.
Reported by Assumeru. #1248
2024-08-19 16:14:55 +02:00
Mike Pall
19db4e9b7c Fix potential file descriptor leak in luaL_loadfile*().
Reported by Assumeru. #1249
2024-08-19 16:11:36 +02:00
Mike Pall
ae4735f621 Reflect override of INSTALL_LJLIBD in package.path.
Suggested by GitSparTV. #1239
2024-08-15 00:38:43 +02:00
Mike Pall
6f834087d0 ARM64: Use movi to materialize FP constants.
Thanks to Peter Cawley. #1245
2024-08-15 00:22:47 +02:00
Mike Pall
2d54213e7c Add more FOLD rules for integer conversions.
Thanks to Peter Cawley. #1246
2024-08-15 00:20:54 +02:00
Mike Pall
833600390c Merge branch 'master' into v2.1 2024-08-15 00:19:35 +02:00
Mike Pall
86e7123bb1 Different fix for partial snapshot restore due to stack overflow.
Reported by Junlong Li. Fixed by Peter Cawley. #1196
2024-08-15 00:17:19 +02:00
Mike Pall
7369eff67d Fix IR_ABC hoisting.
Reported by pwnhacker0x18. Fixed by Peter Cawley. #1194
2024-08-15 00:10:01 +02:00
Mike Pall
3bdc6498c4 Limit CSE for IR_CARG to fix loop optimizations.
Thanks to Peter Cawley. #1244
2024-08-15 00:07:34 +02:00
Mike Pall
04dca7911e Call math.randomseed() without arguments to seed from system entropy.
Reminder: the math.random() PRNG is NOT SUITABLE FOR CRYPTOGRAPHIC USE.
2024-07-04 01:26:29 +02:00
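A sketch of the no-argument form described in the entry above:

math.randomseed()       -- no arguments: seed the Tausworthe PRNG from system entropy
print(math.random())    -- uniform double in [0, 1)
-- As the commit stresses, this PRNG is not suitable for cryptographic use.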
Mike Pall
7421a1b33c Restore state when recording __concat metamethod throws an error.
Thanks to Sergey Kaplun. #1234
2024-07-04 00:48:49 +02:00
Mike Pall
510f88d468 Add build flag LUAJIT_DISABLE_TAILCALL to disable tailcall generation.
Only use this for debugging purposes. NEVER set it for regular builds
or distro builds! In Lua, tailcalls are a language guarantee.
Suggested by Steve Vermeulen. #1220
2024-07-04 00:13:58 +02:00
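To illustrate why the flag is debug-only, here is a sketch of code that relies on the tailcall guarantee; without proper tailcalls it would overflow the stack:

local function countdown(n)
  if n == 0 then return "done" end
  return countdown(n - 1)   -- tailcall: reuses the current stack frame
end
print(countdown(1000000))   --> done, in constant stack space on a standard build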
Mike Pall
444c8ff19a Clarify that lj_buf_shrink() does not keep any buffer data.
https://www.freelists.org/post/luajit/lj-buf-shrink-may-truncate-the-data-and-sbw-point-over-the-end-of-the-buffer,1
Thanks to Junlong Li.
2024-07-04 00:03:40 +02:00
Mike Pall
747fc02eb9 OSX: Fix installed luajit.pc.
Reported by leleliu008. #1221
2024-07-03 23:59:59 +02:00
Mike Pall
8038430110 Merge branch 'master' into v2.1 2024-07-03 23:59:03 +02:00
Mike Pall
7a608e4425 FFI: Fix various issues in recff_cdata_arith.
Thanks to Sergey Kaplun. #1224
2024-07-03 23:46:47 +02:00
Mike Pall
f602f0154b Fix predict_next() in parser (for real now).
Reported by Sergey Kaplun. #1226 #1054
2024-07-03 23:45:16 +02:00
Mike Pall
f2a1cd4328 FFI: Fix __tostring metamethod access to enum cdata value.
Thanks to Sergey Kaplun. #1232
2024-07-03 23:43:57 +02:00
Mike Pall
0f8b878e2f Fix typo.
Reported by Sergey Bronnikov. #1223
2024-07-03 23:43:29 +02:00
Mike Pall
6885efb73e Merge branch 'master' into v2.1 2024-07-03 23:42:38 +02:00
Mike Pall
811c5322c8 Handle partial snapshot restore due to stack overflow.
Reported by pwnhacker0x18. Fixed by Peter Cawley. #1196
2024-07-03 21:42:21 +02:00
Mike Pall
93e87998b2 Update Nintendo Switch build script.
Thanks to IoriBranford. #1214
2024-05-25 19:01:18 +02:00
Mike Pall
f5587f5eb3 Merge branch 'master' into v2.1 2024-05-25 16:41:53 +02:00
Mike Pall
4a22050df9 Prevent sanitizer warning in snap_restoredata().
Thanks to Sergey Kaplun. #1193
2024-05-25 16:38:05 +02:00
Mike Pall
80c1c65bce Typo. 2024-05-25 16:25:35 +02:00
Mike Pall
4fc48c50fe Limit number of string format elements to compile.
Reported by pwnhacker0x18. #1203
2024-05-25 16:22:39 +02:00
Mike Pall
a6386bdabe FFI: Clarify scalar boxing behavior.
Prevent misunderstandings like in #1216
2024-05-25 15:48:07 +02:00
Mike Pall
9398123383 Fix internal link in docs.
Thanks to GitSparTV. #1219
2024-05-25 14:56:15 +02:00
Mike Pall
5790d25397 OSX/iOS: Fix SDK incompatibility.
Thanks to Ryan Carsten Schmidt. #1189
2024-04-22 10:06:42 +02:00
Mike Pall
b3e4987389 Windows/MSVC: Cleanup msvcbuild.bat and always generate PDB.
Thanks to Miku AuahDark. #1127
2024-04-19 11:01:13 +02:00
Mike Pall
75e9277798 Merge branch 'master' into v2.1 2024-04-19 01:47:48 +02:00
Mike Pall
9b5e837ac2 Fix segment release check in internal memory allocator.
Thanks to Jinji Zeng. #1179 #1157
2024-04-19 01:44:19 +02:00
Mike Pall
d032c637b1 Fix compiler warning. 2024-04-19 01:41:12 +02:00
Mike Pall
f5affaa6c4 FFI: Turn FFI finalizer table into a proper GC root.
Reported by Sergey Bronnikov. #1168
2024-04-19 01:33:19 +02:00
Mike Pall
7110b93567 OSX/iOS: Always generate 64 bit non-FAT Mach-O object files.
Reported by Sergey Bronnikov. #1181
2024-04-19 00:31:06 +02:00
Mike Pall
d2fe2a6d46 Show name of NYI bytecode in -jv and -jdump.
Suggested by Sergey Kaplun. #1176 #567
2024-04-19 00:12:22 +02:00
Mike Pall
b8b49bf395 Use generic trace error for OOM during trace stitching.
Thanks to Sergey Kaplun. #1166
2024-04-18 23:57:53 +02:00
Mike Pall
243b7682a5 Fix serialization format docs.
Reported by nounwind.
2024-04-18 23:49:43 +02:00
Mike Pall
d06beb0480 Handle all types of errors during trace stitching.
Thanks to Sergey Kaplun and Peter Cawley. #1166 #720
2024-03-10 17:29:48 +01:00
Mike Pall
bcc5125a91 Fix recording of __concat metamethod.
Thanks to Sergey Kaplun. #1164
2024-03-10 17:26:36 +01:00
Mike Pall
913df6a945 Merge branch 'master' into v2.1 2024-03-10 17:26:03 +01:00
Mike Pall
cae361187e Prevent down-recursion for side traces.
Thanks to Sergey Kaplun. #1169
2024-03-10 17:23:21 +01:00
Mike Pall
302366a338 Check frame size limit before returning to a lower frame.
Thanks to Sergey Kaplun. #1173
2024-03-10 17:19:29 +01:00
Mike Pall
dda1ac273a FFI: Treat cdata finalizer table as a GC root.
Thanks to Sergey Bronnikov. #1168
2024-03-10 17:16:41 +01:00
Mike Pall
88ed9fdbbb Handle stack reallocation in debug.setmetatable() and lua_setmetatable().
Thanks to Sergey Kaplun. #1172
2024-03-10 17:13:28 +01:00
Mike Pall
0d313b2431 Merge branch 'master' into v2.1 2024-02-04 16:47:14 +01:00
Mike Pall
defe61a567 Rework stack overflow handling.
Reported by pwnhacker0x18. Fixed by Peter Cawley. #1152
2024-02-04 16:34:30 +01:00
Mike Pall
9cc2e42b17 Merge branch 'master' into v2.1 2024-01-31 14:39:50 +01:00
Mike Pall
9cdd5a9479 Preserve keys with dynamic values in template tables when saving bytecode.
Reported by Lyrthras. Fixed by Peter Cawley. #1155
2024-01-31 14:32:04 +01:00
Mike Pall
5e5d542c99 Merge branch 'master' into v2.1 2024-01-31 14:31:40 +01:00
Mike Pall
14987af80a Prevent include of luajit_rolling.h.
Thanks to Peter Cawley. #1145
2024-01-31 14:29:23 +01:00
Mike Pall
21a46723d8 Merge branch 'master' into v2.1 2024-01-26 23:18:02 +01:00
Mike Pall
e6c0ade97c Fix documentation bug about '\z' string escape. 2024-01-26 23:17:33 +01:00
Mike Pall
343ce0edaf Fix zero stripping in %g number formatting.
Reported by pwnhacker0x18. #1149
2024-01-25 13:23:48 +01:00
Mike Pall
f2336c48fa Merge branch 'master' into v2.1 2024-01-23 19:01:46 +01:00
Mike Pall
85b4fed0b0 Fix unsinking of IR_FSTORE for NULL metatable.
Reported by pwnhacker0x18. #1147
2024-01-23 18:58:52 +01:00
Mike Pall
3ca0a80711 DynASM/x86: Add endbr instruction.
Thanks to Dmitry Stogov. #1143 #1142
2024-01-22 19:17:45 +01:00
Mike Pall
2f35cb45fd MIPS64 R2/R6: Fix FP to integer conversions.
Thanks to Peter Cawley. #1146
2024-01-22 19:12:13 +01:00
Mike Pall
4b90f6c4d7 Add cross-32/64 bit and deterministic bytecode generation.
Contributed by Peter Cawley. #993 #1008
2024-01-22 19:06:36 +01:00
Mike Pall
c525bcb902 DynASM/x86: Allow [&expr] operand.
Thanks to Dmitry Stogov. #1138
2023-12-23 20:06:17 +01:00
Mike Pall
dbd363ca25 Merge branch 'master' into v2.1 2023-12-23 19:49:43 +01:00
Mike Pall
658530562c Check for IR_HREF vs. IR_HREFK aliasing in non-nil store check.
Thanks to Peter Cawley. #1133
2023-12-23 19:43:03 +01:00
Mike Pall
293199c5eb Merge branch 'master' into v2.1 2023-12-23 19:23:12 +01:00
Mike Pall
7dbe545933 Respect jit.off() on pending trace exit.
Thanks to Sergey Kaplun. #1134
2023-12-23 19:22:34 +01:00
Mike Pall
e02a207909 Merge branch 'master' into v2.1 2023-12-23 19:15:57 +01:00
Mike Pall
c42c62e71a Simplify handling of unstable types in TNEW/TDUP load forwarding.
Thanks to Peter Cawley. #994
2023-12-23 19:14:32 +01:00
Mike Pall
29b0b282f5 Merge branch 'master' into v2.1 2023-12-11 13:04:43 +01:00
Mike Pall
9bdfd34dcc Only emit proper parent references in snapshot replay.
Thanks to Peter Cawley. #1132
2023-12-11 13:01:36 +01:00
Mike Pall
ff204d0350 Fix anchoring for string buffer set() method (again).
Thanks to Peter Cawley. #1125
2023-12-10 19:42:22 +01:00
Mike Pall
8d5ea4ceb9 Merge branch 'master' into v2.1 2023-12-10 16:13:34 +01:00
Mike Pall
10cc759f25 ARM: Fix stack restore for FP slots.
Thanks to Peter Cawley. #1131
2023-12-10 16:10:48 +01:00
Mike Pall
420a9afa93 Merge branch 'master' into v2.1 2023-12-10 15:50:14 +01:00
Mike Pall
1b38c73655 Document workaround for multilib vs. cross-compiler conflict.
Reported by igorpupkinable. #1126
2023-12-10 15:45:10 +01:00
Mike Pall
e02cb19b57 Fix anchoring for string buffer set() method.
Thanks to Peter Cawley. #1125
2023-12-10 15:33:47 +01:00
Mike Pall
e4168fae5b Merge branch 'master' into v2.1 2023-12-10 15:02:26 +01:00
Mike Pall
856423f5da Fix runtime library flags for MSVC debug builds.
Reported by igor725. #1127
2023-12-10 15:00:52 +01:00
Mike Pall
487eaaf040 Merge branch 'master' into v2.1 2023-12-10 14:50:29 +01:00
Mike Pall
dcf3627d79 Fix .debug_abbrev section in GDB JIT API.
Thanks to Dmitry Stogov. #1129
2023-12-10 14:48:34 +01:00
Mike Pall
d1236a4caa Optimize table.new() with constant args to (sinkable) IR_TNEW.
Thanks to Peter Cawley. #1128
2023-12-10 14:41:56 +01:00
Mike Pall
7ad68a1fd3 Merge branch 'master' into v2.1 2023-12-10 14:33:48 +01:00
Mike Pall
1761fd2ef7 Emit sunk IR_NEWREF only once per key on snapshot replay.
Thanks to Sergey Kaplun and Peter Cawley. #1128
2023-12-10 14:29:45 +01:00
Mike Pall
43d0a19158 Fix last commit. 2023-11-15 01:41:31 +01:00
Mike Pall
536cf8a271 Merge branch 'master' into v2.1 2023-11-14 22:56:09 +01:00
Mike Pall
644723649e x86/x64: Don't fuse loads across IR_NEWREF.
Reported by Peter Cawley. #1117
2023-11-14 22:50:21 +01:00
Mike Pall
113a168b79 Improve last commit. 2023-11-12 16:11:11 +01:00
Mike Pall
45c88b7963 x86/x64: Don't fuse loads across table.clear.
Reported by Peter Cawley. #1117
2023-11-12 15:41:52 +01:00
Mike Pall
6807e60af1 Merge branch 'master' into v2.1 2023-11-12 15:25:14 +01:00
Mike Pall
d854d00ce9 x86/x64: Add more red zone checks to assembler backend.
Thanks to Peter Cawley. #1116
2023-11-12 15:18:44 +01:00
Mike Pall
7c9671a043 Merge branch 'master' into v2.1 2023-11-12 15:11:29 +01:00
Mike Pall
a4c1640432 Add stack check to pcall/xpcall.
Analyzed by Peter Cawley. #1048
2023-11-12 14:42:24 +01:00
Mike Pall
69bbbf7736 Merge branch 'master' into v2.1 2023-11-09 11:05:27 +01:00
Mike Pall
65c8493907 Invalidate SCEV entry when returning to lower frame.
Thanks to Zhongwei Yao. #1115
2023-11-09 11:02:36 +01:00
Mike Pall
b94fbfbee9 Merge branch 'master' into v2.1 2023-11-07 22:28:53 +01:00
Mike Pall
433d7e8d8d FFI: Fix pragma push stack limit check and throw on overflow.
Reported by Sergey Kaplun. #1114
2023-11-07 22:25:42 +01:00
Mike Pall
ce2cd61739 ARM64: Fix disassembly of ldp/stp offsets.
Thanks to Peter Cawley. #1113
2023-11-06 23:14:22 +01:00
Mike Pall
07b3cd3cf9 Check for upvalue state transition in IR_UREFO.
Thanks to Peter Cawley. #1085
2023-11-05 16:34:46 +01:00
Mike Pall
0afa1676b2 Merge branch 'master' into v2.1 2023-11-05 11:37:57 +01:00
Mike Pall
d133d67c88 x64: Properly fix __call metamethod return dispatch.
Reported by Sergey Kaplun. #1110
2023-11-05 11:31:08 +01:00
Mike Pall
f2e955dae8 Windows/x86: _BitScan*64 are only available on 64 bit archs.
Reported by memcorrupt. #1109
2023-11-05 11:27:35 +01:00
Mike Pall
e826d0c101 Add 'cc' file type for saving bytecode.
Contributed by Sergey Bronnikov. #1105
2023-10-21 13:31:45 +02:00
Mike Pall
4eb47df605 FFI/Windows: Fix type declaration for int64_t and uint64_t.
Thanks to Peter Cawley. #1106
2023-10-21 13:18:51 +02:00
Mike Pall
7269b02130 Merge branch 'master' into v2.1 2023-10-21 13:13:34 +02:00
Mike Pall
db944b2b56 FFI: Fix dangling reference to CType in carith_checkarg().
Reported by Sergey Kaplun. #1108
2023-10-21 13:11:50 +02:00
Mike Pall
656ecbcf8f DynASM/ARM64: Support ldp/stp of q registers.
Thanks to Peter Cawley. #1096
2023-10-08 22:12:01 +02:00
Mike Pall
d2a5487fd7 ARM64: Use ADR and ADRP to form constants.
Thanks to Peter Cawley. #1100
2023-10-08 22:10:02 +02:00
Mike Pall
14866a6828 ARM64: Fix disassembly of U12 loads.
Thanks to Peter Cawley. #1100
2023-10-08 21:57:04 +02:00
Mike Pall
c5b075eb31 ARM64: Unify constant register handling in interpreter.
Plus minor optimizations. Simplifications for out-of-tree ARM64EC.
Thanks to Peter Cawley. #1096
2023-10-08 21:39:40 +02:00
Mike Pall
9cc8bbb7ae ARM: Fix register hint for FFI calls with FP results. 2023-10-08 21:22:50 +02:00
Mike Pall
1e93951b25 ARM64: Fix register hint for FFI calls with FP results.
Thanks to Peter Cawley. #1096
2023-10-08 21:20:10 +02:00
Mike Pall
007e4dce13 ARM64: Restore fp before sp in C stack unwinders.
Thanks to Peter Cawley. #1096
2023-10-08 21:17:43 +02:00
Mike Pall
becf5cc65d FFI: Fix ffi.abi("pauth").
Thanks to Peter Cawley. #1098
2023-09-25 16:56:17 +02:00
Mike Pall
97c75843c6 Merge branch 'master' into v2.1 2023-09-22 21:07:20 +02:00
Mike Pall
f72c19e482 Maintain chain invariant in DCE.
Thanks to Peter Cawley. #1094
2023-09-22 21:04:22 +02:00
Mike Pall
d1a2fef8a8 LJ_FR2: Fix stack checks in vararg calls.
Thanks to Peter Cawley. #1048
2023-09-21 05:19:55 +02:00
Mike Pall
234dbc481e Merge branch 'master' into v2.1 2023-09-21 04:44:37 +02:00
Mike Pall
aa6b15c1a8 Follow-up fix for stack overflow handling cleanup. 2023-09-21 04:43:40 +02:00
Mike Pall
a5d2f70c73 Handle OOM error on stack resize in coroutine.resume and lua_checkstack.
Thanks to Peter Cawley. #1066
2023-09-21 04:40:48 +02:00
Mike Pall
e86990f7f2 Restore cur_L for specific Lua/C API use case.
Thanks to Peter Cawley. #1066
2023-09-21 03:54:08 +02:00
Mike Pall
b8919781d4 Consistently use 64 bit constants for 64 bit IR instructions.
Thanks to Peter Cawley. #1084
2023-09-21 03:46:33 +02:00
Mike Pall
9159289927 ARM64: Fix IR_HREF code generation for constant FP keys.
Reported by swarn. Fix for 435d8c63 by Peter Cawley. #1090
2023-09-21 02:48:12 +02:00
Mike Pall
fca1f51bf8 ARM64: Fuse negative 32 bit constants into arithmetic ops again.
Thanks to Peter Cawley. #1065
2023-09-21 02:38:29 +02:00
Mike Pall
4b605a7da8 Merge branch 'master' into v2.1 2023-09-21 02:23:25 +02:00
Mike Pall
b138ccfa91 Handle all stack layouts in (delayed) TRACE vmevent.
Thanks to Sergey Bronnikov and Peter Cawley. #1087
2023-09-21 02:15:16 +02:00
Mike Pall
92b89d005a Add missing coercion when recording select(string, ...)
Thanks to Peter Cawley. #1083
2023-09-21 02:10:18 +02:00
Mike Pall
d2f6c55b05 Cleanup stack overflow handling.
Reported by Peter Cawley. #962
2023-09-21 01:58:43 +02:00
Mike Pall
e897c5743f Windows/ARM64: Add MSVC cross-build support for x64 to ARM64.
Thanks to invertego. #1081
2023-09-17 10:44:04 +02:00
Mike Pall
7a2b83a0c5 IR_MIN/IR_MAX is non-commutative due to underlying FPU ops.
Thanks to Peter Cawley. #1082
2023-09-17 10:31:00 +02:00
Mike Pall
42ca6e120f ARM64: Set fixed interpreter registers before rethrow.
Thanks to Peter Cawley. #593
2023-09-17 10:09:58 +02:00
Mike Pall
7a77a3cd85 Windows/ARM64: Update install docs. 2023-09-15 06:10:58 +02:00
Mike Pall
bd2d107151 Windows: Call C++ destructors without compiling with /EHa.
Thanks to Peter Cawley. #593
2023-09-15 05:47:29 +02:00
Mike Pall
7a1c139569 Windows: Pass scratch CONTEXT record to RtlUnwindEx.
Thanks to Peter Cawley. #593
2023-09-15 05:31:26 +02:00
Mike Pall
18b8fd8de7 ARM64: External unwinder already restores non-volatile registers.
Thanks to Peter Cawley. #593
2023-09-15 05:27:29 +02:00
Mike Pall
b36f9fad63 Windows/ARM64: Fix exception unwinding (again).
Thanks to Peter Cawley. #593
2023-09-15 05:23:29 +02:00
Mike Pall
8af63f9920 Windows/ARM64: Fix typo in exception unwinding.
Thanks to Peter Cawley. #593
2023-09-11 23:00:36 +02:00
Mike Pall
9e0437240f FFI: Fix 64 bit shift fold rules.
Thanks to Peter Cawley. #1079
2023-09-11 21:06:25 +02:00
Mike Pall
1c33f46314 Windows/ARM64: Support Windows calling conventions.
Dear Microsoft: your butchering of the (perfectly fine) ARM64 ABI is a disgrace.
Thanks to Peter Cawley. #593
2023-09-11 16:35:28 +02:00
Mike Pall
f63bc569fa Windows/ARM64: Fix exception unwinding.
Thanks to Peter Cawley. #593
2023-09-11 13:33:27 +02:00
Mike Pall
836ab4227a ARM64: Remove unneeded IRCALL_* defs for math intrinsics.
Workaround for MSVC issue.
Thanks to Peter Cawley. #593
2023-09-11 13:14:09 +02:00
Mike Pall
b174d5e66d Fix Cygwin build.
Thanks to Christopher Ng. #1077 #1078
2023-09-11 13:10:17 +02:00
Mike Pall
5a18d4582f Merge branch 'master' into v2.1 2023-09-10 05:26:27 +02:00
Mike Pall
9760984638 Allow path overrides in genversion.lua with minilua, too.
Thanks to arch1t3cht. #1067
2023-09-10 05:23:10 +02:00
Mike Pall
cb413bf8f4 Windows/ARM64: Add initial support.
Only builds with native ARM64 Visual Studio for now.
Thanks to vanc and Stephen Just. #593 #964
2023-09-10 05:20:22 +02:00
Mike Pall
566532b807 Merge branch 'master' into v2.1 2023-09-09 23:20:57 +02:00
Mike Pall
4fe2002292 Improve architecture detection error messages. 2023-09-09 23:01:26 +02:00
Mike Pall
4611e25c0f ARM64: Fuse rotates into logical operands.
Thanks to Peter Cawley. #1076
2023-09-09 20:59:18 +02:00
Mike Pall
90742d91c2 ARM64: Don't fuse sign extensions into logical operands.
Thanks to Peter Cawley. #1076
2023-09-09 20:57:46 +02:00
Mike Pall
ba2b34f5e8 ARM64: Disassemble rotates on logical operands.
Thanks to Peter Cawley. #1076
2023-09-09 20:52:02 +02:00
Mike Pall
f442432ecb Merge branch 'master' into v2.1 2023-09-09 18:18:48 +02:00
Mike Pall
44da356e97 ARM: Fix stack check code generation.
Thanks to Peter Cawley. #1068
2023-09-09 18:16:31 +02:00
Mike Pall
b8c6ccd50c ARM64: Fix LDP/STP fusion (again).
Reported and analyzed by Zhongwei Yao. Fix by Peter Cawley. #1075
2023-09-09 18:01:37 +02:00
Mike Pall
0705ef6ce4 ARM64: Ensure branch is in range before emitting TBZ/TBNZ.
Thanks to Peter Cawley. #1074
2023-09-09 17:52:43 +02:00
Mike Pall
59be97edb6 Merge branch 'master' into v2.1 2023-09-09 17:46:10 +02:00
Mike Pall
43eff4aad4 Fix mcode limit check for non-x86 archs.
Thanks to Peter Cawley.
2023-09-09 17:44:54 +02:00
Mike Pall
de2e09f54c ARM64: Improve BC_JLOOP.
Thanks to Peter Cawley. #1070
2023-09-09 17:38:44 +02:00
Mike Pall
6c599960d1 ARM64: Improve integer IR_MUL code generation.
Thanks to Peter Cawley. #1070
2023-09-09 17:36:40 +02:00
Mike Pall
4ed83bd990 ARM64: Simplify code generation for IR_STRTO.
Thanks to Peter Cawley. #1070
2023-09-09 17:34:28 +02:00
Mike Pall
a5ee35867c ARM64: Use RID_TMP instead of scratch register in more places.
Thanks to Peter Cawley. #1070
2023-09-09 17:31:06 +02:00
Mike Pall
c1877e648a ARM64: Improve IR_OBAR code generation.
Thanks to Peter Cawley. #1070
2023-09-09 17:21:32 +02:00
Mike Pall
c2bdce399e ARM64: Improve IR_UREF code generation.
Thanks to Peter Cawley. #1070
2023-09-09 17:19:02 +02:00
Mike Pall
435d8c6301 ARM64: Improve IR_HREF code generation.
Thanks to Peter Cawley. #1070
2023-09-09 17:15:26 +02:00
Mike Pall
315dc3e776 ARM64: Reload BASE via GL instead of spilling it.
Thanks to Peter Cawley. #1068.
2023-09-09 16:56:16 +02:00
Mike Pall
5149b0a3a2 ARM64: Consolidate 32/64-bit constant handling in assembler.
Thanks to Peter Cawley. #1065
2023-09-09 16:30:14 +02:00
Mike Pall
dfc122e45c ARM64: Tune emit_lsptr. Avoid wrong load for asm_prof.
Thanks to Peter Cawley. #1065
2023-09-09 14:20:39 +02:00
Mike Pall
4651ff2fbc ARM64: Inline only use of emit_loada.
Thanks to Peter Cawley. #1065
2023-09-09 14:15:18 +02:00
Mike Pall
9daf9f9003 ARM64: Improve K13 constant rematerialization.
Algorithm by Dougall Johnson: https://dougallj.wordpress.com/2021/10/30/
Thanks to Peter Cawley. #1065
2023-09-09 14:11:25 +02:00
Mike Pall
9bd2404137 Merge branch 'master' into v2.1 2023-09-09 13:42:12 +02:00
Mike Pall
7f9907b4ed Add NaN check to IR_NEWREF.
Thanks to Peter Cawley. #1069
2023-09-09 13:37:31 +02:00
Mike Pall
cc8d88aafc Merge branch 'master' into v2.1 2023-09-09 12:50:13 +02:00
Mike Pall
4d05806ae0 Allow override of paths for genversion.lua.
Thanks to arch1t3cht. #1067
2023-09-09 12:47:27 +02:00
Mike Pall
19707009bf Fix native MinGW build.
Thanks to Victor Bombi. #1071
2023-09-09 12:41:47 +02:00
Mike Pall
41fb94defa Add randomized register allocation for fuzz testing.
This must be explicitly enabled with: -DLUAJIT_RANDOM_RA
Thanks to Peter Cawley. #1062
2023-08-30 01:10:52 +02:00
Mike Pall
2f6c451ce8 ARM64: Improve register allocation for integer IR_MUL/IR_MULOV.
Thanks to Peter Cawley. #1062
2023-08-29 22:38:20 +02:00
Mike Pall
7ff8f26eb8 ARM64: Fix register allocation for IR_*LOAD.
Thanks to Peter Cawley. #1062
2023-08-29 22:35:10 +02:00
Mike Pall
356231edaf Merge branch 'master' into v2.1 2023-08-29 22:30:57 +02:00
Mike Pall
c6ee7e19d1 Update external MSDN URL in code.
Thanks to Kyle Marshall. #1060
2023-08-29 22:27:38 +02:00
Mike Pall
83954100db FFI/ARM64/OSX: Handle non-standard OSX C calling conventions.
Contributed by Peter Cawley. #205
2023-08-29 02:21:51 +02:00
Mike Pall
cf903edb30 FFI: Unify stack setup for C calls in interpreter. 2023-08-29 02:12:13 +02:00
Mike Pall
7cc53f0b85 ARM64: Prevent STP fusion for conditional code emitted by TBAR.
Thanks to Peter Cawley. #1057
2023-08-28 22:39:35 +02:00
Mike Pall
0fa2f1cbcf ARM64: Fix LDP/STP fusing for unaligned accesses.
Thanks to Peter Cawley. #1056
2023-08-28 22:33:54 +02:00
Mike Pall
c0d5240a25 Merge branch 'master' into v2.1 2023-08-28 22:24:36 +02:00
Mike Pall
0ef51b495f Handle table unsinking in the presence of IRFL_TAB_NOMM.
Reported by Sergey Kaplun. #1052
2023-08-28 22:15:42 +02:00
Mike Pall
238a2a80bb Merge branch 'master' into v2.1 2023-08-28 22:02:06 +02:00
Mike Pall
6a3111a57f Use fallback name for install files without valid .git or .relver. 2023-08-28 21:25:51 +02:00
Mike Pall
a0b52aae33 Handle non-.git checkout with .relver in .bat-file builds.
Thanks to Simon Cooke.
2023-08-28 21:59:01 +02:00
Mike Pall
631a45f73b Merge branch 'master' into v2.1 2023-08-28 21:08:00 +02:00
Mike Pall
14e2917e7a Fix external C call stack check when using LUAJIT_MODE_WRAPCFUNC.
Thanks to Peter Cawley. #1047
2023-08-28 21:04:01 +02:00
Mike Pall
309fb42b87 Fix predict_next() in parser (again).
Reported by Sergey Bronnikov. #1054
2023-08-28 21:00:37 +02:00
Mike Pall
03c31124cc Fix typo.
Thanks to Simon Cooke.
2023-08-22 17:06:34 +02:00
Mike Pall
ff192d134d Merge branch 'master' into v2.1 2023-08-22 17:06:14 +02:00
Mike Pall
d0ce82ecdc Handle the case when .git is not a directory.
Thanks to Alexander Shpilkin.
2023-08-22 17:04:22 +02:00
Mike Pall
0b5bf71e37 Merge branch 'master' into v2.1 2023-08-22 15:37:21 +02:00
Mike Pall
6a2163a6b4 Add .gitattributes to dynamically resolve .relver.
Thanks to Alexander Shpilkin.
2023-08-22 15:36:55 +02:00
Mike Pall
33e2a49dbf Add .gitattributes to dynamically resolve .relver.
Thanks to Alexander Shpilkin.
2023-08-22 15:30:27 +02:00
Mike Pall
093759d528 Fix for last commit: also remove symlink on uninstall. 2023-08-22 11:46:12 +02:00
Mike Pall
748ab9d90a Switch to rolling releases: mark v2.1 as production. 2023-08-22 11:13:45 +02:00
Mike Pall
54ef81f864 Merge branch 'master' into v2.1 2023-08-21 13:09:52 +02:00
Mike Pall
ed21acd863 Fix Windows build scripts for rolling releases.
Reported by Miku AuahDark.
2023-08-21 13:08:00 +02:00
Mike Pall
3c290f817f Merge branch 'master' into v2.1 2023-08-21 04:03:25 +02:00
Mike Pall
6351abc78f Switch MSVC and console build scripts to rolling releases. 2023-08-21 03:59:03 +02:00
217 changed files with 2489 additions and 1509 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
/.relver export-subst

1
.relver Normal file
View File

@ -0,0 +1 @@
$Format:%ct$

View File

@ -1,7 +1,7 @@
===============================================================================
LuaJIT -- a Just-In-Time Compiler for Lua. https://luajit.org/
Copyright (C) 2005-2023 Mike Pall. All rights reserved.
Copyright (C) 2005-2025 Mike Pall. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@ -10,7 +10,7 @@
# For MSVC, please follow the instructions given in src/msvcbuild.bat.
# For MinGW and Cygwin, cd to src and run make with the Makefile there.
#
# Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
# Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
##############################################################################
MAJVER= 2
@ -37,12 +37,13 @@ export MULTILIB= lib
DPREFIX= $(DESTDIR)$(PREFIX)
INSTALL_BIN= $(DPREFIX)/bin
INSTALL_LIB= $(DPREFIX)/$(MULTILIB)
INSTALL_SHARE= $(DPREFIX)/share
INSTALL_SHARE_= $(PREFIX)/share
INSTALL_SHARE= $(DESTDIR)$(INSTALL_SHARE_)
INSTALL_DEFINC= $(DPREFIX)/include/luajit-$(MMVERSION)
INSTALL_INC= $(INSTALL_DEFINC)
INSTALL_LJLIBD= $(INSTALL_SHARE)/luajit-$(MMVERSION)
INSTALL_JITLIB= $(INSTALL_LJLIBD)/jit
export INSTALL_LJLIBD= $(INSTALL_SHARE_)/luajit-$(MMVERSION)
INSTALL_JITLIB= $(DESTDIR)$(INSTALL_LJLIBD)/jit
INSTALL_LMODD= $(INSTALL_SHARE)/lua
INSTALL_LMOD= $(INSTALL_LMODD)/$(ABIVER)
INSTALL_CMODD= $(INSTALL_LIB)/lua
@ -71,7 +72,7 @@ INSTALL_PC= $(INSTALL_PKGCONFIG)/$(INSTALL_PCNAME)
INSTALL_DIRS= $(INSTALL_BIN) $(INSTALL_LIB) $(INSTALL_INC) $(INSTALL_MAN) \
$(INSTALL_PKGCONFIG) $(INSTALL_JITLIB) $(INSTALL_LMOD) $(INSTALL_CMOD)
UNINSTALL_DIRS= $(INSTALL_JITLIB) $(INSTALL_LJLIBD) $(INSTALL_INC) \
UNINSTALL_DIRS= $(INSTALL_JITLIB) $(DESTDIR)$(INSTALL_LJLIBD) $(INSTALL_INC) \
$(INSTALL_LMOD) $(INSTALL_LMODD) $(INSTALL_CMOD) $(INSTALL_CMODD)
RM= rm -f
@ -109,11 +110,12 @@ else
endif
TARGET_SYS?= $(HOST_SYS)
ifeq (Darwin,$(TARGET_SYS))
ifneq (,$(filter $(TARGET_SYS),Darwin iOS))
INSTALL_SONAME= $(INSTALL_DYLIBNAME)
INSTALL_SOSHORT1= $(INSTALL_DYLIBSHORT1)
INSTALL_SOSHORT2= $(INSTALL_DYLIBSHORT2)
LDCONFIG= :
SED_PC+= -e "s| -Wl,-E||"
endif
##############################################################################
@ -142,18 +144,12 @@ install: $(INSTALL_DEP)
$(RM) $(FILE_PC).tmp
cd src && $(INSTALL_F) $(FILES_INC) $(INSTALL_INC)
cd src/jit && $(INSTALL_F) $(FILES_JITLIB) $(INSTALL_JITLIB)
$(SYMLINK) $(INSTALL_TNAME) $(INSTALL_TSYM)
@echo "==== Successfully installed LuaJIT $(VERSION) to $(PREFIX) ===="
@echo ""
@echo "Note: the development releases deliberately do NOT install a symlink for luajit"
@echo "You can do this now by running this command (with sudo):"
@echo ""
@echo " $(SYMLINK) $(INSTALL_TNAME) $(INSTALL_TSYM)"
@echo ""
uninstall:
@echo "==== Uninstalling LuaJIT $(VERSION) from $(PREFIX) ===="
$(UNINSTALL) $(INSTALL_T) $(INSTALL_STATIC) $(INSTALL_DYN) $(INSTALL_SHORT1) $(INSTALL_SHORT2) $(INSTALL_MAN)/$(FILE_MAN) $(INSTALL_PC)
$(UNINSTALL) $(INSTALL_TSYM) $(INSTALL_T) $(INSTALL_STATIC) $(INSTALL_DYN) $(INSTALL_SHORT1) $(INSTALL_SHORT2) $(INSTALL_MAN)/$(FILE_MAN) $(INSTALL_PC)
for file in $(FILES_JITLIB); do \
$(UNINSTALL) $(INSTALL_JITLIB)/$$file; \
done

2
README
View File

@ -5,7 +5,7 @@ LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language.
Project Homepage: https://luajit.org/
LuaJIT is Copyright (C) 2005-2023 Mike Pall.
LuaJIT is Copyright (C) 2005-2025 Mike Pall.
LuaJIT is free software, released under the MIT license.
See full Copyright Notice in the COPYRIGHT file or in luajit.h.

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004-2023 Mike Pall.
/* Copyright (C) 2004-2025 Mike Pall.
*
* You are welcome to use the general ideas of this design for your own sites.
* But please do not steal the stylesheet, the layout or the color scheme.

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004-2023 Mike Pall.
/* Copyright (C) 2004-2025 Mike Pall.
*
* You are welcome to use the general ideas of this design for your own sites.
* But please do not steal the stylesheet, the layout or the color scheme.

View File

@ -3,7 +3,7 @@
<head>
<title>Contact</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -94,7 +94,7 @@ don't like that, please complain to Google or Microsoft, not me.
<h2>Copyright</h2>
<p>
All documentation is
Copyright &copy; 2005-2023 Mike Pall.
Copyright &copy; 2005-2025 Mike Pall.
</p>
@ -102,7 +102,7 @@ Copyright &copy; 2005-2023 Mike Pall.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>String Buffer Library</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -85,7 +85,7 @@ operations.
</p>
<p>
The string buffer library also includes a high-performance
<a href="serialize">serializer</a> for Lua objects.
<a href="#serialize">serializer</a> for Lua objects.
</p>
<h2 id="use">Using the String Buffer Library</h2>
@ -588,9 +588,9 @@ num → 0x07 double.L
tab → 0x08 // Empty table
| 0x09 h.U h*{object object} // Key/value hash
| 0x0a a.U a*object // 0-based array
| 0x0b a.U a*object h.U h*{object object} // Mixed
| 0x0b a.U h.U a*object h*{object object} // Mixed
| 0x0c a.U (a-1)*object // 1-based array
| 0x0d a.U (a-1)*object h.U h*{object object} // Mixed
| 0x0d a.U h.U (a-1)*object h*{object object} // Mixed
tab_mt → 0x0e (index-1).U tab // Metatable dict entry
int64 → 0x10 int.L // FFI int64_t
@ -679,7 +679,7 @@ mappings of files are OK, but only if the file does not change.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>
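The serialization format adjusted in the diff above is produced by the string buffer library's encoder; a minimal sketch of its use, assuming a mixed table exercises the 0x0b/0x0d tags whose layout the diff corrects:

local buffer = require("string.buffer")
local blob = buffer.encode({ 1, 2, 3, key = "value" })  -- mixed table: array + hash part
local copy = buffer.decode(blob)                        -- round-trip back to a Lua table
assert(copy[3] == 3 and copy.key == "value")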

View File

@ -3,7 +3,7 @@
<head>
<title>Lua/C API Extensions</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -173,7 +173,7 @@ Also note that this mechanism is not without overhead.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>FFI Library</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -316,7 +316,7 @@ without undue conversion penalties.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>ffi.* API Functions</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -558,7 +558,7 @@ named <tt>i</tt>.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>FFI Semantics</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -440,6 +440,19 @@ If you don't do this, the default Lua number &rarr; <tt>double</tt>
conversion rule applies. A vararg C&nbsp;function expecting an integer
will see a garbled or uninitialized value.
</p>
<p>
Note: this is the only place where creating a boxed scalar number type is
actually useful. <b>Never use <tt>ffi.new("int")</tt>, <tt>ffi.new("float")</tt>
etc. anywhere else!</b>
</p>
<p style="font-size: 8pt;">
Ditto for <tt>ffi.cast()</tt>. Explicitly boxing scalars <b>does not</b>
improve performance or force <tt>int</tt> or <tt>float</tt> arithmetic! It
just adds costly boxing, unboxing and conversions steps. And it may lead
to surprise results, because
<a href="#cdata_arith">cdata arithmetic on scalar numbers</a>
is always performed on 64 bit integers.
</p>
<h2 id="init">Initializers</h2>
<p>
@ -1246,7 +1259,7 @@ compiled.</li>
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>
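A sketch of the vararg rule the added paragraphs describe: box the scalar only for vararg slots, nowhere else (printf is just a convenient libc example):

local ffi = require("ffi")
ffi.cdef[[ int printf(const char *fmt, ...); ]]
-- The vararg slot needs a boxed integer; otherwise the default
-- Lua number -> double conversion applies and %d sees garbage.
ffi.C.printf("%d\n", ffi.new("int", 42))
-- Everywhere else, pass plain Lua numbers; explicit boxing only adds
-- conversion overhead and 64 bit cdata arithmetic surprises.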

View File

@ -3,7 +3,7 @@
<head>
<title>FFI Tutorial</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -587,7 +587,7 @@ it to a local variable in the function scope is unnecessary.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>jit.* Library</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -187,7 +187,7 @@ if you want to know more.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>Profiler</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -349,7 +349,7 @@ use.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>Extensions</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -160,13 +160,33 @@ passes any arguments after the error function to the function
which is called in a protected context.
</p>
<h3 id="load"><tt>loadfile()</tt> etc. handle UTF-8 source code</h3>
<h3 id="load"><tt>load*()</tt> handle UTF-8 source code</h3>
<p>
Non-ASCII characters are handled transparently by the Lua source code parser.
This allows the use of UTF-8 characters in identifiers and strings.
A UTF-8 BOM is skipped at the start of the source code.
</p>
<h3 id="load_mode"><tt>load*()</tt> add a mode parameter</h3>
<p>
As an extension from Lua 5.2, the functions <tt>loadstring()</tt>,
<tt>loadfile()</tt> and (new) <tt>load()</tt> add an optional
<tt>mode</tt> parameter.
</p>
<p>
The default mode string is <tt>"bt"</tt>, which allows loading of both
source code and bytecode. Use <tt>"t"</tt> to allow only source code
or <tt>"b"</tt> to allow only bytecode to be loaded.
</p>
<p>
By default, the <tt>load*</tt> functions generate the native bytecode format.
For cross-compilation purposes, add <tt>W</tt> to the mode string to
force the 32 bit format and <tt>X</tt> to force the 64 bit format.
Add both to force the opposite format. Note that non-native bytecode
generated by <tt>load*</tt> cannot be run, but can still be passed
to <tt>string.dump</tt>.
</p>
<h3 id="tostring"><tt>tostring()</tt> etc. canonicalize NaN and &plusmn;Inf</h3>
<p>
All number-to-string conversions consistently convert non-finite numbers
@ -186,26 +206,33 @@ works independently of the current locale and it supports hex floating-point
numbers (e.g. <tt>0x1.5p-3</tt>).
</p>
<h3 id="string_dump"><tt>string.dump(f [,strip])</tt> generates portable bytecode</h3>
<h3 id="string_dump"><tt>string.dump(f [,mode])</tt> generates portable bytecode</h3>
<p>
An extra argument has been added to <tt>string.dump()</tt>. If set to
<tt>true</tt>, 'stripped' bytecode without debug information is
generated. This speeds up later bytecode loading and reduces memory
usage. See also the
<tt>true</tt> or to a string which contains the character <tt>s</tt>,
'stripped' bytecode without debug information is generated. This speeds
up later bytecode loading and reduces memory usage. See also the
<a href="running.html#opt_b"><tt>-b</tt> command line option</a>.
</p>
<p>
The generated bytecode is portable and can be loaded on any architecture
that LuaJIT supports, independent of word size or endianess. However, the
bytecode compatibility versions must match. Bytecode stays compatible
for dot releases (x.y.0 &rarr; x.y.1), but may change with major or
minor releases (2.0 &rarr; 2.1) or between any beta release. Foreign
bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded.
that LuaJIT supports. However, the bytecode compatibility versions must
match. Bytecode only stays compatible within a major+minor version
(x.y.aaa &rarr; x.y.bbb), except for development branches. Foreign bytecode
(e.g. from Lua 5.1) is incompatible and cannot be loaded.
</p>
<p>
Note: <tt>LJ_GC64</tt> mode requires a different frame layout, which implies
a different, incompatible bytecode format for all 64 bit ports. This may be
rectified in the future.
a different, incompatible bytecode format between 32 bit and 64 bit ports.
This may be rectified in the future. In the meantime, use the <tt>W</tt>
and <tt>X</tt> <a href="#load_mode">modes of the <tt>load*</tt> functions</a>
for cross-compilation purposes.
</p>
<p>
Due to VM hardening, bytecode is not deterministic. Add <tt>d</tt> to the
mode string to dump it in a deterministic manner: identical source code
always gives a byte-for-byte identical bytecode dump. This feature is
mainly useful for reproducible builds.
</p>
<h3 id="table_new"><tt>table.new(narray, nhash)</tt> allocates a pre-sized table</h3>
@ -238,7 +265,7 @@ and let the GC do its work.
LuaJIT uses a Tausworthe PRNG with period 2^223 to implement
<tt>math.random()</tt> and <tt>math.randomseed()</tt>. The quality of
the PRNG results is much superior compared to the standard Lua
implementation, which uses the platform-specific ANSI rand().
implementation, which uses the platform-specific ANSI <tt>rand()</tt>.
</p>
<p>
The PRNG generates the same sequences from the same seeds on all
@ -249,6 +276,10 @@ It's correctly scaled up and rounded for <tt>math.random(n&nbsp;[,m])</tt> to
preserve uniformity.
</p>
<p>
Call <tt>math.randomseed()</tt> without any arguments to seed it from
system entropy.
</p>
<p>
Important: Neither this nor any other PRNG based on the simplistic
<tt>math.random()</tt> API is suitable for cryptographic use.
</p>
@ -286,7 +317,7 @@ enabled:
</p>
<ul>
<li><tt>goto</tt> and <tt>::labels::</tt>.</li>
<li>Hex escapes <tt>'\x3F'</tt> and <tt>'\*'</tt> escape in strings.</li>
<li>Hex escapes <tt>'\x3F'</tt> and <tt>'\z'</tt> escape in strings.</li>
<li><tt>load(string|reader [, chunkname [,mode [,env]]])</tt>.</li>
<li><tt>loadstring()</tt> is an alias for <tt>load()</tt>.</li>
<li><tt>loadfile(filename [,mode [,env]])</tt>.</li>
@ -426,9 +457,7 @@ the toolchain used to compile LuaJIT:
on the C&nbsp;stack. The contents of the C++&nbsp;exception object
pass through unmodified.</li>
<li>Lua errors can be caught on the C++ side with <tt>catch(...)</tt>.
The corresponding Lua error message can be retrieved from the Lua stack.<br>
For MSVC for Windows 64 bit this requires compilation of your C++ code
with <tt>/EHa</tt>.</li>
The corresponding Lua error message can be retrieved from the Lua stack.</li>
<li>Throwing Lua errors across C++ frames is safe. C++ destructors
will be called.</li>
</ul>
@ -463,7 +492,7 @@ C++ destructors.</li>
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>
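A sketch tying together the mode letters documented in the diff above (the chunk source and name are illustrative):

local src = "return 1 + 2"
local f_native = assert(load(src, "=example", "t"))   -- source only, native bytecode format
local f_32 = assert(load(src, "=example", "tW"))      -- force the 32 bit (non-GC64) format
local f_64 = assert(load(src, "=example", "tX"))      -- force the 64 bit (GC64) format
local bc = string.dump(f_native, "sd")                -- stripped and deterministic dump
-- A non-native function cannot be run, but it can still be passed to string.dump:
local bc32 = string.dump(f_32, "sd")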

View File

@ -3,7 +3,7 @@
<head>
<title>Installation</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -117,7 +117,7 @@ hold all user-configurable settings:
<li><tt>Makefile</tt> has settings for <b>installing</b> LuaJIT (POSIX
only).</li>
<li><tt>src/Makefile</tt> has settings for <b>compiling</b> LuaJIT
under POSIX, MinGW or Cygwin.</li>
under POSIX or MinGW.</li>
<li><tt>src/msvcbuild.bat</tt> has settings for compiling LuaJIT with
MSVC (Visual Studio).</li>
</ul>
@ -195,15 +195,13 @@ Obviously the prefixes given during build and installation need to be the same.
<h2 id="windows">Windows Systems</h2>
<h3>Prerequisites</h3>
<p>
Either install one of the open source SDKs
(<a href="http://mingw.org/"><span class="ext">&raquo;</span>&nbsp;MinGW</a> or
<a href="https://www.cygwin.com/"><span class="ext">&raquo;</span>&nbsp;Cygwin</a>), which come with a modified
GCC plus the required development headers.
Either install the open source SDK <a href="http://mingw.org/"><span class="ext">&raquo;</span>&nbsp;MinGW</a>,
which comes with a modified GCC plus the required development headers.
Or install Microsoft's Visual Studio (MSVC).
</p>
<h3>Building with MSVC</h3>
<p>
Open a "Visual Studio Command Prompt" (either x86 or x64), <tt>cd</tt> to the
Open a "Visual Studio Command Prompt" (x86, x64 or ARM64), <tt>cd</tt> to the
directory with the source code and run these commands:
</p>
<pre class="code">
@ -214,9 +212,12 @@ msvcbuild
Check the <tt>msvcbuild.bat</tt> file for more options.
Then follow the installation instructions below.
</p>
<h3>Building with MinGW or Cygwin</h3>
<p>
Open a command prompt window and make sure the MinGW or Cygwin programs
For an x64 to ARM64 cross-build run this first: <tt>vcvarsall.bat x64_arm64</tt>
</p>
<h3>Building with MinGW</h3>
<p>
Open a command prompt window and make sure the MinGW programs
are in your path. Then <tt>cd</tt> to the directory of the git repository.
Then run this command for MinGW:
</p>
@ -224,12 +225,6 @@ Then run this command for MinGW:
mingw32-make
</pre>
<p>
Or this command for Cygwin:
</p>
<pre class="code">
make
</pre>
<p>
Then follow the installation instructions below.
</p>
<h3>Installing LuaJIT</h3>
@ -246,6 +241,19 @@ absolute path names &mdash; all modules are loaded relative to the
directory where <tt>luajit.exe</tt> is installed
(see <tt>src/luaconf.h</tt>).
</p>
<p>
The final directory layout should look like this:
</p>
<pre class="code">
├── luajit.exe
├── lua51.dll
├── <- put your own classic Lua/C API modules (*.dll) here
└── lua
├── <- put your own Lua modules (*.lua) here
└── jit
├── bc.lua
└── (etc …)
</pre>
<h2 id="cross">Cross-compiling LuaJIT</h2>
<p>
@ -266,6 +274,7 @@ for any supported target:
<li>Yes, you need a toolchain for both your host <em>and</em> your target!</li>
<li>Both host and target architectures must have the same pointer size.</li>
<li>E.g. if you want to cross-compile to a 32 bit target on a 64 bit host, you need to install the multilib development package (e.g. <tt>libc6-dev-i386</tt> on Debian/Ubuntu) and build a 32 bit host part (<tt>HOST_CC="gcc -m32"</tt>).</li>
<li>On some distro versions, multilib conflicts with cross-compilers. The workaround is to install the x86 cross-compiler package <tt>gcc-i686-linux-gnu</tt> and use it to build the host part (<tt>HOST_CC=i686-linux-gnu-gcc</tt>).</li>
<li>64 bit targets always require compilation on a 64 bit host.</li>
</ul>
<p>
@ -568,7 +577,7 @@ to me (the upstream) and not you (the package maintainer), anyway.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>LuaJIT</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -122,7 +122,7 @@ Lua is a powerful, dynamic and light-weight programming language.
It may be embedded or used as a general-purpose, stand-alone language.
</p>
<p>
LuaJIT is Copyright &copy; 2005-2023 Mike Pall, released under the
LuaJIT is Copyright &copy; 2005-2025 Mike Pall, released under the
<a href="https://www.opensource.org/licenses/mit-license.php"><span class="ext">&raquo;</span>&nbsp;MIT open source license</a>.
</p>
<p>
@ -193,7 +193,7 @@ Please select a sub-topic in the navigation bar to learn more about LuaJIT.
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -3,7 +3,7 @@
<head>
<title>Running LuaJIT</title>
<meta charset="utf-8">
<meta name="Copyright" content="Copyright (C) 2005-2023">
<meta name="Copyright" content="Copyright (C) 2005-2025">
<meta name="Language" content="en">
<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
@ -106,6 +106,9 @@ are accepted:
<li><tt>-l</tt> &mdash; Only list bytecode.</li>
<li><tt>-s</tt> &mdash; Strip debug info (this is the default).</li>
<li><tt>-g</tt> &mdash; Keep debug info.</li>
<li><tt>-W</tt> &mdash; Generate 32 bit (non-GC64) bytecode.</li>
<li><tt>-X</tt> &mdash; Generate 64 bit (GC64) bytecode.</li>
<li><tt>-d</tt> &mdash; Generate bytecode in deterministic manner.</li>
<li><tt>-n name</tt> &mdash; Set module name (default: auto-detect from input name)</li>
<li><tt>-t type</tt> &mdash; Set output file type (default: auto-detect from output name).</li>
<li><tt>-a arch</tt> &mdash; Override architecture for object files (default: native).</li>
@ -120,7 +123,8 @@ file name:
</p>
<ul>
<li><tt>c</tt> &mdash; C source file, exported bytecode data.</li>
<li><tt>h</tt> &mdash; C header file, static bytecode data.</li>
<li><tt>cc</tt> &mdash; C++ source file, exported bytecode data.</li>
<li><tt>h</tt> &mdash; C/C++ header file, static bytecode data.</li>
<li><tt>obj</tt> or <tt>o</tt> &mdash; Object file, exported bytecode data
(OS- and architecture-specific).</li>
<li><tt>raw</tt> or any other extension &mdash; Raw bytecode file (portable).
@ -303,7 +307,7 @@ Here are the parameters and their default settings:
</div>
<div id="foot">
<hr class="hide">
Copyright &copy; 2005-2023
Copyright &copy; 2005-2025
<span class="noprint">
&middot;
<a href="contact.html">Contact</a>

View File

@ -1,6 +1,6 @@
/*
** DynASM ARM encoding engine.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM ARM module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------

View File

@ -1,6 +1,6 @@
/*
** DynASM ARM64 encoding engine.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM ARM64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------
@ -549,7 +549,7 @@ end
local function parse_load_pair(params, nparams, n, op)
if params[n+2] then werror("too many operands") end
local pn, p2 = params[n], params[n+1]
local scale = shr(op, 30) == 0 and 2 or 3
local scale = 2 + shr(op, 31 - band(shr(op, 26), 1))
local p1, wb = match(pn, "^%[%s*(.-)%s*%](!?)$")
if not p1 then
if not p2 then
@ -806,8 +806,8 @@ map_op = {
["ldrsw_*"] = "98000000DxB|b8800000DxL",
-- NOTE: ldur etc. are handled by ldr et al.
["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP",
["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP",
["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP|ac000000DAqP",
["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP|ac400000DAqP",
["ldpsw_*"] = "68400000DAxP",
-- Branches.
@ -942,7 +942,7 @@ local function parse_template(params, template, nparams, pos)
werror("bad register type")
end
parse_reg_type = false
elseif p == "x" or p == "w" or p == "d" or p == "s" then
elseif p == "x" or p == "w" or p == "d" or p == "s" or p == "q" then
if parse_reg_type ~= p then
werror("register size mismatch")
end

View File

@ -1,6 +1,6 @@
/*
** DynASM MIPS encoding engine.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM MIPS32/MIPS64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM MIPS64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------
-- This module just sets 64 bit mode for the combined MIPS/MIPS64 module.

View File

@ -1,6 +1,6 @@
/*
** DynASM PPC/PPC64 encoding engine.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM PPC/PPC64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
--
-- Support for various extensions contributed by Caio Souza Oliveira.

View File

@ -1,6 +1,6 @@
/*
** DynASM encoding engine prototypes.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM x64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------
-- This module just sets 64 bit mode for the combined x86/x64 module.

View File

@ -1,6 +1,6 @@
/*
** DynASM x86 encoding engine.
** Copyright (C) 2005-2023 Mike Pall. All rights reserved.
** Copyright (C) 2005-2025 Mike Pall. All rights reserved.
** Released under the MIT license. See dynasm.lua for full copyright notice.
*/

View File

@ -1,7 +1,7 @@
------------------------------------------------------------------------------
-- DynASM x86/x64 module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See dynasm.lua for full copyright notice.
------------------------------------------------------------------------------
@ -627,7 +627,11 @@ local function wputmrmsib(t, imark, s, vsreg, psz, sk)
werror("NYI: rip-relative displacement followed by immediate")
end
-- The previous byte in the action buffer cannot be 0xe9 or 0x80-0x8f.
wputlabel("REL_", disp[1], 2)
if disp[2] == "iPJ" then
waction("REL_A", disp[1])
else
wputlabel("REL_", disp[1], 2)
end
else
wputdarg(disp)
end
@ -744,9 +748,9 @@ local function dispexpr(expr)
return imm*map_opsizenum[ops]
end
local mode, iexpr = immexpr(dispt)
if mode == "iJ" then
if mode == "iJ" or mode == "iPJ" then
if c == "-" then werror("cannot invert label reference") end
return { iexpr }
return { iexpr, mode }
end
return expr -- Need to return original signed expression.
end
@ -1147,6 +1151,8 @@ local map_op = {
rep_0 = "F3",
repe_0 = "F3",
repz_0 = "F3",
endbr32_0 = "F30F1EFB",
endbr64_0 = "F30F1EFA",
-- F4: *hlt
cmc_0 = "F5",
-- F6: test... mb,i; div... mb

View File

@ -2,7 +2,7 @@
-- DynASM. A dynamic assembler for code generation engines.
-- Originally designed and implemented for LuaJIT.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- See below for full copyright notice.
------------------------------------------------------------------------------
@ -17,7 +17,7 @@ local _info = {
url = "https://luajit.org/dynasm.html",
license = "MIT",
copyright = [[
Copyright (C) 2005-2023 Mike Pall. All rights reserved.
Copyright (C) 2005-2025 Mike Pall. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@ -75,7 +75,7 @@ local function wline(line, needindent)
g_synclineno = g_synclineno + 1
end
-- Write assembler line as a comment, if requestd.
-- Write assembler line as a comment, if requested.
local function wcomment(aline)
if g_opt.comment then
wline(g_opt.comment..aline..g_opt.endcomment, true)

View File

@ -74,7 +74,7 @@ luajit \-jv \-e "for i=1,10 do for j=1,10 do for k=1,100 do end end end"
Runs some nested loops and shows the resulting traces.
.SH COPYRIGHT
.PP
\fBLuaJIT\fR is Copyright \(co 2005-2023 Mike Pall.
\fBLuaJIT\fR is Copyright \(co 2005-2025 Mike Pall.
.br
\fBLuaJIT\fR is open source software, released under the MIT license.
.SH SEE ALSO

View File

@ -7,7 +7,7 @@
# Also works with MinGW and Cygwin on Windows.
# Please check msvcbuild.bat for building with MSVC on Windows.
#
# Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
# Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
##############################################################################
MAJVER= 2
@ -233,7 +233,7 @@ TARGET_ALDFLAGS= $(LDOPTIONS) $(TARGET_XLDFLAGS) $(TARGET_FLAGS) $(TARGET_LDFLAG
TARGET_ASHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS) $(TARGET_FLAGS) $(TARGET_SHLDFLAGS)
TARGET_ALIBS= $(TARGET_XLIBS) $(LIBS) $(TARGET_LIBS)
TARGET_TESTARCH=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
TARGET_TESTARCH:=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH)))
TARGET_LJARCH= x64
else
@ -299,6 +299,12 @@ endif
ifneq (,$(LMULTILIB))
TARGET_XCFLAGS+= -DLUA_LMULTILIB=\"$(LMULTILIB)\"
endif
ifneq (,$(INSTALL_LJLIBD))
TARGET_XCFLAGS+= -DLUA_LJDIR=\"$(INSTALL_LJLIBD)\"
endif
ifeq (,$(shell $(TARGET_CC) -o /dev/null -c -x c /dev/null -fno-strict-float-cast-overflow 2>/dev/null || echo 1))
TARGET_XCFLAGS+= -fno-strict-float-cast-overflow
endif
##############################################################################
# Target system detection.
@ -320,13 +326,13 @@ ifeq (Darwin,$(TARGET_SYS))
endif
TARGET_STRIP+= -x
TARGET_XCFLAGS+= -DLUAJIT_UNWIND_EXTERNAL
TARGET_XSHLDFLAGS= -dynamiclib -single_module -undefined dynamic_lookup -fPIC
TARGET_XSHLDFLAGS= -dynamiclib -undefined dynamic_lookup -fPIC
TARGET_DYNXLDOPTS=
TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).255
else
ifeq (iOS,$(TARGET_SYS))
TARGET_STRIP+= -x
TARGET_XSHLDFLAGS= -dynamiclib -single_module -undefined dynamic_lookup -fPIC
TARGET_XSHLDFLAGS= -dynamiclib -undefined dynamic_lookup -fPIC
TARGET_DYNXLDOPTS=
TARGET_XSHLDFLAGS+= -install_name $(TARGET_DYLIBPATH) -compatibility_version $(MAJVER).$(MINVER) -current_version $(MAJVER).$(MINVER).255
ifeq (arm64,$(TARGET_LJARCH))
@ -475,7 +481,11 @@ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
DASM_DASC= vm_$(DASM_ARCH).dasc
GIT= git
GIT_RELVER= [ -d ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || :
ifeq (Windows,$(HOST_SYS)$(HOST_MSYS))
GIT_RELVER= if exist ..\.git ( $(GIT) show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
else
GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || :
endif
GIT_DEP= $(wildcard ../.git/HEAD ../.git/refs/heads/*)
BUILDVM_O= host/buildvm.o host/buildvm_asm.o host/buildvm_peobj.o \

View File

@ -25,14 +25,15 @@ lib_ffi.o: lib_ffi.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
lib_init.o: lib_init.c lua.h luaconf.h lauxlib.h lualib.h lj_arch.h
lib_io.o: lib_io.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h lj_state.h \
lj_strfmt.h lj_ff.h lj_ffdef.h lj_lib.h lj_libdef.h
lj_strfmt.h lj_ff.h lj_ffdef.h lj_lib.h lj_strscan.h lj_libdef.h
lib_jit.o: lib_jit.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_str.h lj_tab.h \
lj_state.h lj_bc.h lj_ctype.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h \
lj_target.h lj_target_*.h lj_trace.h lj_dispatch.h lj_traceerr.h \
lj_vm.h lj_vmevent.h lj_lib.h luajit.h lj_libdef.h
lib_math.o: lib_math.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
lj_def.h lj_arch.h lj_lib.h lj_vm.h lj_prng.h lj_libdef.h
lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_vm.h lj_prng.h \
lj_libdef.h
lib_os.o: lib_os.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h lj_lib.h \
lj_libdef.h
@ -55,7 +56,7 @@ lj_asm.o: lj_asm.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
lj_buf.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ctype.h lj_ir.h \
lj_jit.h lj_ircall.h lj_iropt.h lj_mcode.h lj_trace.h lj_dispatch.h \
lj_traceerr.h lj_snap.h lj_asm.h lj_vm.h lj_target.h lj_target_*.h \
lj_emit_*.h lj_asm_*.h
lj_prng.h lj_emit_*.h lj_asm_*.h
lj_assert.o: lj_assert.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h
lj_bc.o: lj_bc.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_bc.h \
lj_bcdef.h
@ -97,7 +98,7 @@ lj_crecord.o: lj_crecord.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
lj_cdata.h lj_cparse.h lj_cconv.h lj_carith.h lj_clib.h lj_ccall.h \
lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h lj_trace.h \
lj_dispatch.h lj_traceerr.h lj_record.h lj_ffrecord.h lj_snap.h \
lj_crecord.h lj_strfmt.h
lj_crecord.h lj_strfmt.h lj_strscan.h
lj_ctype.o: lj_ctype.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_strfmt.h lj_ctype.h \
lj_ccallback.h lj_buf.h

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** This is a tool to build the hand-tuned assembler code required for
** LuaJIT's bytecode interpreter. It supports a variety of output formats

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _BUILDVM_H

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder: Assembler source code emitter.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include "buildvm.h"
@ -339,6 +339,10 @@ void emit_asm(BuildCtx *ctx)
fprintf(ctx->fp, "\t.ident \"%s\"\n", ctx->dasm_ident);
break;
case BUILD_machasm:
#if defined(__apple_build_version__) && __apple_build_version__ >= 15000000 && __apple_build_version__ < 15000300
/* Workaround for XCode 15.0 - 15.2. */
fprintf(ctx->fp, "\t.subsections_via_symbols\n");
#endif
fprintf(ctx->fp,
"\t.cstring\n"
"\t.ascii \"%s\\0\"\n", ctx->dasm_ident);

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder: IR folding hash table generator.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include "buildvm.h"

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder: library definition compiler.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include "buildvm.h"

View File

@ -1,6 +1,6 @@
/*
** LuaJIT VM builder: PE object emitter.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Only used for building on Windows, since we cannot assume the presence
** of a suitable assembler. The host and target byte order must match.
@ -9,7 +9,7 @@
#include "buildvm.h"
#include "lj_bc.h"
#if LJ_TARGET_X86ORX64
#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
/* Context for PE object emitter. */
static char *strtab;
@ -93,6 +93,17 @@ typedef struct PEsymaux {
#define PEOBJ_RELOC_ADDR32NB 0x03
#define PEOBJ_RELOC_OFS 0
#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
#define PEOBJ_PDATA_NRELOC 6
#define PEOBJ_XDATA_SIZE (8*2+4+6*2)
#elif LJ_TARGET_ARM64
#define PEOBJ_ARCH_TARGET 0xaa64
#define PEOBJ_RELOC_REL32 0x03 /* MS: BRANCH26. */
#define PEOBJ_RELOC_DIR32 0x01
#define PEOBJ_RELOC_ADDR32NB 0x02
#define PEOBJ_RELOC_OFS (-4)
#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
#define PEOBJ_PDATA_NRELOC 4
#define PEOBJ_XDATA_SIZE (4+24+4 +4+8)
#endif
/* Section numbers (0-based). */
@ -100,7 +111,7 @@ enum {
PEOBJ_SECT_ABS = -2,
PEOBJ_SECT_UNDEF = -1,
PEOBJ_SECT_TEXT,
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
PEOBJ_SECT_PDATA,
PEOBJ_SECT_XDATA,
#elif LJ_TARGET_X86
@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx)
uint32_t sofs;
int i, nrsym;
union { uint8_t b; uint32_t u; } host_endian;
#ifdef PEOBJ_PDATA_NRELOC
uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
#endif
sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection);
@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx)
/* Flags: 60 = read+execute, 50 = align16, 20 = code. */
pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS;
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1);
pesect[PEOBJ_SECT_PDATA].ofs = sofs;
sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4);
sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4);
pesect[PEOBJ_SECT_PDATA].relocofs = sofs;
sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE;
sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
pesect[PEOBJ_SECT_PDATA].flags = 0x40300040;
memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1);
pesect[PEOBJ_SECT_XDATA].ofs = sofs;
sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2); /* See below. */
sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE); /* See below. */
pesect[PEOBJ_SECT_XDATA].relocofs = sofs;
sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx)
*/
nrsym = ctx->nrelocsym;
pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym;
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */
#endif
@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx)
#if LJ_TARGET_X64
{ /* Write .pdata section. */
uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
uint32_t pdata[3]; /* Start of .text, end of .text and .xdata. */
PEreloc reloc;
pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0;
@ -308,6 +321,87 @@ void emit_peobj(BuildCtx *ctx)
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
#elif LJ_TARGET_ARM64
/* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */
{ /* Write .pdata section. */
uint32_t pdata[4];
PEreloc reloc;
pdata[0] = 0;
pdata[1] = 0;
pdata[2] = fcofs;
pdata[3] = 4+24+4;
owrite(ctx, &pdata, sizeof(pdata));
/* Start of .text and start of .xdata. */
reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
/* Start of vm_ffi_call and start of second part of .xdata. */
reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
{ /* Write .xdata section. */
uint32_t u32;
uint8_t *p, uwc[24];
PEreloc reloc;
#define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2)
#define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */
#define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */
#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r) - 19) << 6) | ((o) >> 3))
#define CSAVE_REGS(r1,r2,o1) do { \
int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \
} while (0)
#define CSAVE_REGPX(r,o) CBE16(0xcc00 | (((r) - 19) << 6) | (~(o) >> 3))
#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3))
#define CSAVE_FREGS(r1,r2,o1) do { \
int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \
} while (0)
#define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */
#define CODE_NOP 0xe3
#define CODE_END 0xe4
#define CEND_ALIGN do { \
*p++ = CODE_END; \
while ((p - uwc) & 3) *p++ = CODE_NOP; \
} while (0)
/* Unwind codes for .text section with handler. */
p = uwc;
CADD_FP(192); /* +2 */
CSAVE_REGS(19, 28, 176); /* +5*2 */
CSAVE_FREGS(8, 15, 96); /* +4*2 */
CSAVE_FPLR(192); /* +1 */
CALLOC_S(208); /* +1 */
CEND_ALIGN; /* +1 +1 -> 24 */
u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2);
owrite(ctx, &u32, 4);
owrite(ctx, &uwc, 24);
u32 = 0; /* Handler RVA to be relocated at 4 + 24. */
owrite(ctx, &u32, 4);
/* Unwind codes for vm_ffi_call without handler. */
p = uwc;
CADD_FP(16); /* +2 */
CSAVE_FPLR(16); /* +1 */
CSAVE_REGPX(19, -32); /* +2 */
CEND_ALIGN; /* +1 +2 -> 8 */
u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2);
owrite(ctx, &u32, 4);
owrite(ctx, &uwc, 8);
reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
#elif LJ_TARGET_X86
/* Write .sxdata section. */
for (i = 0; i < nrsym; i++) {
@ -339,7 +433,7 @@ void emit_peobj(BuildCtx *ctx)
emit_peobj_sym(ctx, ctx->relocsym[i], 0,
PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN);
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA);
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA);
emit_peobj_sym(ctx, "lj_err_unwind_win", 0,

View File

@ -2,7 +2,7 @@
-- Lua script to dump the bytecode of the library functions written in Lua.
-- The resulting 'buildvm_libbc.h' is used for the build process of LuaJIT.
----------------------------------------------------------------------------
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
@ -138,65 +138,73 @@ local function fixup_dump(dump, fixup)
return { dump = ndump, startbc = startbc, sizebc = sizebc }
end
local function find_defs(src)
local function find_defs(src, mode)
local defs = {}
for name, code in string.gmatch(src, "LJLIB_LUA%(([^)]*)%)%s*/%*(.-)%*/") do
local env = {}
local tcode, fixup = transform_lua(code)
local func = assert(load(tcode, "", nil, env))()
defs[name] = fixup_dump(string.dump(func, true), fixup)
local func = assert(load(tcode, "", mode))
defs[name] = fixup_dump(string.dump(func, mode), fixup)
defs[#defs+1] = name
end
return defs
end
local function gen_header(defs)
local function gen_header(defs32, defs64)
local t = {}
local function w(x) t[#t+1] = x end
w("/* This is a generated file. DO NOT EDIT! */\n\n")
w("static const int libbc_endian = ") w(isbe and 1 or 0) w(";\n\n")
local s, sb = "", ""
for i,name in ipairs(defs) do
local d = defs[name]
s = s .. d.dump
sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1)
.. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc)
.. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4)
end
w("static const uint8_t libbc_code[] = {\n")
local n = 0
for i=1,#s do
local x = string.byte(s, i)
local xb = string.byte(sb, i)
if xb == 255 then
local name = BCN[x]
local m = #name + 4
if n + m > 78 then n = 0; w("\n") end
n = n + m
w("BC_"); w(name)
else
local m = x < 10 and 2 or (x < 100 and 3 or 4)
if xb == 0 then
if n + m > 78 then n = 0; w("\n") end
else
local name = defs[xb]:gsub("_", ".")
if n ~= 0 then w("\n") end
w("/* "); w(name); w(" */ ")
n = #name + 7
end
n = n + m
w(x)
for j,defs in ipairs{defs64, defs32} do
local s, sb = "", ""
for i,name in ipairs(defs) do
local d = defs[name]
s = s .. d.dump
sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1)
.. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc)
.. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4)
end
if j == 1 then
w("static const uint8_t libbc_code[] = {\n#if LJ_FR2\n")
else
w("\n#else\n")
end
local n = 0
for i=1,#s do
local x = string.byte(s, i)
local xb = string.byte(sb, i)
if xb == 255 then
local name = BCN[x]
local m = #name + 4
if n + m > 78 then n = 0; w("\n") end
n = n + m
w("BC_"); w(name)
else
local m = x < 10 and 2 or (x < 100 and 3 or 4)
if xb == 0 then
if n + m > 78 then n = 0; w("\n") end
else
local name = defs[xb]:gsub("_", ".")
if n ~= 0 then w("\n") end
w("/* "); w(name); w(" */ ")
n = #name + 7
end
n = n + m
w(x)
end
w(",")
end
w(",")
end
w("\n0\n};\n\n")
w("\n#endif\n0\n};\n\n")
w("static const struct { const char *name; int ofs; } libbc_map[] = {\n")
local m = 0
for _,name in ipairs(defs) do
w('{"'); w(name); w('",'); w(m) w('},\n')
m = m + #defs[name].dump
local m32, m64 = 0, 0
for i,name in ipairs(defs32) do
assert(name == defs64[i])
w('{"'); w(name); w('",'); w(m32) w('},\n')
m32 = m32 + #defs32[name].dump
m64 = m64 + #defs64[name].dump
assert(m32 == m64)
end
w("{NULL,"); w(m); w("}\n};\n\n")
w("{NULL,"); w(m32); w("}\n};\n\n")
return table.concat(t)
end
@ -219,7 +227,8 @@ end
local outfile = parse_arg(arg)
local src = read_files(arg)
local defs = find_defs(src)
local hdr = gen_header(defs)
local defs32 = find_defs(src, "Wdts")
local defs64 = find_defs(src, "Xdts")
local hdr = gen_header(defs32, defs64)
write_file(outfile, hdr)

View File

@ -2,7 +2,7 @@
-- Lua script to generate a customized, minified version of Lua.
-- The resulting 'minilua' is used for the build process of LuaJIT.
----------------------------------------------------------------------------
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------

View File

@ -1,13 +1,14 @@
----------------------------------------------------------------------------
-- Lua script to embed the rolling release version in luajit.h.
----------------------------------------------------------------------------
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
local FILE_INPUT_H = "luajit_rolling.h"
local FILE_INPUT_R = "luajit_relver.txt"
local FILE_OUTPUT_H = "luajit.h"
local arg = {...}
local FILE_ROLLING_H = arg[1] or "luajit_rolling.h"
local FILE_RELVER_TXT = arg[2] or "luajit_relver.txt"
local FILE_LUAJIT_H = arg[3] or "luajit.h"
local function file_read(file)
local fp = assert(io.open(file, "rb"), "run from the wrong directory")
@ -28,8 +29,8 @@ local function file_write_mod(file, data)
assert(fp:close())
end
local text = file_read(FILE_INPUT_H)
local relver = file_read(FILE_INPUT_R):match("(%d+)")
local text = file_read(FILE_ROLLING_H):gsub("#error.-\n", "")
local relver = file_read(FILE_RELVER_TXT):match("(%d+)")
if relver then
text = text:gsub("ROLLING", relver)
@ -38,6 +39,7 @@ else
**** WARNING Cannot determine rolling release version from git log.
**** WARNING The 'git' command must be available during the build.
]])
file_write_mod(FILE_RELVER_TXT, "ROLLING\n") -- Fallback for install target.
end
file_write_mod(FILE_OUTPUT_H, text)
file_write_mod(FILE_LUAJIT_H, text)

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT bytecode listing module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT module to save/list bytecode.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--
@ -29,6 +29,9 @@ Save LuaJIT bytecode: luajit -b[options] input output
-l Only list bytecode.
-s Strip debug info (default).
-g Keep debug info.
-W Generate 32 bit (non-GC64) bytecode.
-X Generate 64 bit (GC64) bytecode.
-d Generate bytecode in deterministic manner.
-n name Set module name (default: auto-detect from input name).
-t type Set output file type (default: auto-detect from output name).
-a arch Override architecture for object files (default: native).
@ -38,7 +41,7 @@ Save LuaJIT bytecode: luajit -b[options] input output
-- Stop handling options.
- Use stdin as input and/or stdout as output.
File types: c h obj o raw (default)
File types: c cc h obj o raw (default)
]]
os.exit(1)
end
@ -51,8 +54,9 @@ local function check(ok, ...)
end
local function readfile(ctx, input)
if type(input) == "function" then return input end
if ctx.filename then
if ctx.string then
return check(loadstring(input, nil, ctx.mode))
elseif ctx.filename then
local data
if input == "-" then
data = io.stdin:read("*a")
@ -61,10 +65,10 @@ local function readfile(ctx, input)
data = assert(fp:read("*a"))
assert(fp:close())
end
return check(load(data, ctx.filename))
return check(load(data, ctx.filename, ctx.mode))
else
if input == "-" then input = nil end
return check(loadfile(input))
return check(loadfile(input, ctx.mode))
end
end
@ -81,7 +85,7 @@ end
------------------------------------------------------------------------------
local map_type = {
raw = "raw", c = "c", h = "h", o = "obj", obj = "obj",
raw = "raw", c = "c", cc = "c", h = "h", o = "obj", obj = "obj",
}
local map_arch = {
@ -435,24 +439,12 @@ typedef struct
{
mach_header; uint32_t reserved;
} mach_header_64;
typedef struct {
uint32_t cmd, cmdsize;
char segname[16];
uint32_t vmaddr, vmsize, fileoff, filesize;
uint32_t maxprot, initprot, nsects, flags;
} mach_segment_command;
typedef struct {
uint32_t cmd, cmdsize;
char segname[16];
uint64_t vmaddr, vmsize, fileoff, filesize;
uint32_t maxprot, initprot, nsects, flags;
} mach_segment_command_64;
typedef struct {
char sectname[16], segname[16];
uint32_t addr, size;
uint32_t offset, align, reloff, nreloc, flags;
uint32_t reserved1, reserved2;
} mach_section;
typedef struct {
char sectname[16], segname[16];
uint64_t addr, size;
@ -462,139 +454,64 @@ typedef struct {
typedef struct {
uint32_t cmd, cmdsize, symoff, nsyms, stroff, strsize;
} mach_symtab_command;
typedef struct {
int32_t strx;
uint8_t type, sect;
int16_t desc;
uint32_t value;
} mach_nlist;
typedef struct {
int32_t strx;
uint8_t type, sect;
uint16_t desc;
uint64_t value;
} mach_nlist_64;
typedef struct
{
int32_t magic, nfat_arch;
} mach_fat_header;
typedef struct
{
int32_t cputype, cpusubtype, offset, size, align;
} mach_fat_arch;
typedef struct {
struct {
mach_header hdr;
mach_segment_command seg;
mach_section sec;
mach_symtab_command sym;
} arch[1];
mach_nlist sym_entry;
uint8_t space[4096];
} mach_obj;
typedef struct {
struct {
mach_header_64 hdr;
mach_segment_command_64 seg;
mach_section_64 sec;
mach_symtab_command sym;
} arch[1];
mach_header_64 hdr;
mach_segment_command_64 seg;
mach_section_64 sec;
mach_symtab_command sym;
mach_nlist_64 sym_entry;
uint8_t space[4096];
} mach_obj_64;
typedef struct {
mach_fat_header fat;
mach_fat_arch fat_arch[2];
struct {
mach_header hdr;
mach_segment_command seg;
mach_section sec;
mach_symtab_command sym;
} arch[2];
mach_nlist sym_entry;
uint8_t space[4096];
} mach_fat_obj;
typedef struct {
mach_fat_header fat;
mach_fat_arch fat_arch[2];
struct {
mach_header_64 hdr;
mach_segment_command_64 seg;
mach_section_64 sec;
mach_symtab_command sym;
} arch[2];
mach_nlist_64 sym_entry;
uint8_t space[4096];
} mach_fat_obj_64;
]]
local symname = '_'..LJBC_PREFIX..ctx.modname
local isfat, is64, align, mobj = false, false, 4, "mach_obj"
if ctx.arch == "x64" then
is64, align, mobj = true, 8, "mach_obj_64"
elseif ctx.arch == "arm" then
isfat, mobj = true, "mach_fat_obj"
elseif ctx.arch == "arm64" then
is64, align, isfat, mobj = true, 8, true, "mach_fat_obj_64"
else
check(ctx.arch == "x86", "unsupported architecture for OSX")
local cputype, cpusubtype = 0x01000007, 3
if ctx.arch ~= "x64" then
check(ctx.arch == "arm64", "unsupported architecture for OSX")
cputype, cpusubtype = 0x0100000c, 0
end
local function aligned(v, a) return bit.band(v+a-1, -a) end
local be32 = bit.bswap -- Mach-O FAT is BE, supported archs are LE.
-- Create Mach-O object and fill in header.
local o = ffi.new(mobj)
local mach_size = aligned(ffi.offsetof(o, "space")+#symname+2, align)
local cputype = ({ x86={7}, x64={0x01000007}, arm={7,12}, arm64={0x01000007,0x0100000c} })[ctx.arch]
local cpusubtype = ({ x86={3}, x64={3}, arm={3,9}, arm64={3,0} })[ctx.arch]
if isfat then
o.fat.magic = be32(0xcafebabe)
o.fat.nfat_arch = be32(#cpusubtype)
end
local o = ffi.new("mach_obj_64")
local mach_size = aligned(ffi.offsetof(o, "space")+#symname+2, 8)
-- Fill in sections and symbols.
for i=0,#cpusubtype-1 do
local ofs = 0
if isfat then
local a = o.fat_arch[i]
a.cputype = be32(cputype[i+1])
a.cpusubtype = be32(cpusubtype[i+1])
-- Subsequent slices overlap each other to share data.
ofs = ffi.offsetof(o, "arch") + i*ffi.sizeof(o.arch[0])
a.offset = be32(ofs)
a.size = be32(mach_size-ofs+#s)
end
local a = o.arch[i]
a.hdr.magic = is64 and 0xfeedfacf or 0xfeedface
a.hdr.cputype = cputype[i+1]
a.hdr.cpusubtype = cpusubtype[i+1]
a.hdr.filetype = 1
a.hdr.ncmds = 2
a.hdr.sizeofcmds = ffi.sizeof(a.seg)+ffi.sizeof(a.sec)+ffi.sizeof(a.sym)
a.seg.cmd = is64 and 0x19 or 0x1
a.seg.cmdsize = ffi.sizeof(a.seg)+ffi.sizeof(a.sec)
a.seg.vmsize = #s
a.seg.fileoff = mach_size-ofs
a.seg.filesize = #s
a.seg.maxprot = 1
a.seg.initprot = 1
a.seg.nsects = 1
ffi.copy(a.sec.sectname, "__data")
ffi.copy(a.sec.segname, "__DATA")
a.sec.size = #s
a.sec.offset = mach_size-ofs
a.sym.cmd = 2
a.sym.cmdsize = ffi.sizeof(a.sym)
a.sym.symoff = ffi.offsetof(o, "sym_entry")-ofs
a.sym.nsyms = 1
a.sym.stroff = ffi.offsetof(o, "sym_entry")+ffi.sizeof(o.sym_entry)-ofs
a.sym.strsize = aligned(#symname+2, align)
end
o.hdr.magic = 0xfeedfacf
o.hdr.cputype = cputype
o.hdr.cpusubtype = cpusubtype
o.hdr.filetype = 1
o.hdr.ncmds = 2
o.hdr.sizeofcmds = ffi.sizeof(o.seg)+ffi.sizeof(o.sec)+ffi.sizeof(o.sym)
o.seg.cmd = 0x19
o.seg.cmdsize = ffi.sizeof(o.seg)+ffi.sizeof(o.sec)
o.seg.vmsize = #s
o.seg.fileoff = mach_size
o.seg.filesize = #s
o.seg.maxprot = 1
o.seg.initprot = 1
o.seg.nsects = 1
ffi.copy(o.sec.sectname, "__data")
ffi.copy(o.sec.segname, "__DATA")
o.sec.size = #s
o.sec.offset = mach_size
o.sym.cmd = 2
o.sym.cmdsize = ffi.sizeof(o.sym)
o.sym.symoff = ffi.offsetof(o, "sym_entry")
o.sym.nsyms = 1
o.sym.stroff = ffi.offsetof(o, "sym_entry")+ffi.sizeof(o.sym_entry)
o.sym.strsize = aligned(#symname+2, 8)
o.sym_entry.type = 0xf
o.sym_entry.sect = 1
o.sym_entry.strx = 1
ffi.copy(o.space+1, symname)
-- Write Macho-O object file.
-- Write Mach-O object file.
local fp = savefile(output, "wb")
fp:write(ffi.string(o, mach_size))
bcsave_tail(fp, output, s)
@ -624,7 +541,7 @@ end
local function bcsave(ctx, input, output)
local f = readfile(ctx, input)
local s = string.dump(f, ctx.strip)
local s = string.dump(f, ctx.mode)
local t = ctx.type
if not t then
t = detecttype(output)
@ -647,9 +564,11 @@ local function docmd(...)
local n = 1
local list = false
local ctx = {
strip = true, arch = jit.arch, os = jit.os:lower(),
type = false, modname = false,
mode = "bt", arch = jit.arch, os = jit.os:lower(),
type = false, modname = false, string = false,
}
local strip = "s"
local gc64 = ""
while n <= #arg do
local a = arg[n]
if type(a) == "string" and a:sub(1, 1) == "-" and a ~= "-" then
@ -660,14 +579,18 @@ local function docmd(...)
if opt == "l" then
list = true
elseif opt == "s" then
ctx.strip = true
strip = "s"
elseif opt == "g" then
ctx.strip = false
strip = ""
elseif opt == "W" or opt == "X" then
gc64 = opt
elseif opt == "d" then
ctx.mode = ctx.mode .. opt
else
if arg[n] == nil or m ~= #a then usage() end
if opt == "e" then
if n ~= 1 then usage() end
arg[1] = check(loadstring(arg[1]))
ctx.string = true
elseif opt == "n" then
ctx.modname = checkmodname(tremove(arg, n))
elseif opt == "t" then
@ -687,6 +610,7 @@ local function docmd(...)
n = n + 1
end
end
ctx.mode = ctx.mode .. strip .. gc64
if list then
if #arg == 0 or #arg > 2 then usage() end
bclist(ctx, arg[1], arg[2] or "-")
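For reference, the option handling above folds the flags into a single mode string that is later handed to load()/loadfile()/loadstring() and string.dump(). A minimal sketch of that composition, using the defaults shown in docmd():

-- how `-d -X` (with default stripping) combines, per the code above
local mode, strip, gc64 = "bt", "s", ""   -- defaults from docmd()
mode = mode .. "d"                        -- -d is appended directly
gc64 = "X"                                -- -X selects GC64 bytecode ("W" for -W)
mode = mode .. strip .. gc64
assert(mode == "btdsX")                   -- passed on to load*() and string.dump()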

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT ARM disassembler module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This is a helper module used by the LuaJIT machine code dumper module.

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT ARM64 disassembler module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
--
-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
@ -107,24 +107,20 @@ local map_logsr = { -- Logical, shifted register.
[0] = {
shift = 29, mask = 3,
[0] = {
shift = 21, mask = 7,
[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
shift = 21, mask = 1,
[0] = "andDNMSg", "bicDNMSg"
},
{
shift = 21, mask = 7,
[0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
"orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
shift = 21, mask = 1,
[0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
},
{
shift = 21, mask = 7,
[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
shift = 21, mask = 1,
[0] = "eorDNMSg", "eonDNMSg"
},
{
shift = 21, mask = 7,
[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
shift = 21, mask = 1,
[0] = "ands|tstD0NMSg", "bicsDNMSg"
}
},
false -- unallocated
@ -132,24 +128,20 @@ local map_logsr = { -- Logical, shifted register.
{
shift = 29, mask = 3,
[0] = {
shift = 21, mask = 7,
[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
shift = 21, mask = 1,
[0] = "andDNMSg", "bicDNMSg"
},
{
shift = 21, mask = 7,
[0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
"orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
shift = 21, mask = 1,
[0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
},
{
shift = 21, mask = 7,
[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
shift = 21, mask = 1,
[0] = "eorDNMSg", "eonDNMSg"
},
{
shift = 21, mask = 7,
[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
shift = 21, mask = 1,
[0] = "ands|tstD0NMSg", "bicsDNMSg"
}
}
}
@ -666,6 +658,10 @@ local map_datafp = { -- Data processing, SIMD and FP.
}
}
}
},
{ -- 010
shift = 0, mask = 0x81f8fc00,
[0x100e400] = "moviDdG"
}
}
@ -735,7 +731,7 @@ local map_cond = {
"hi", "ls", "ge", "lt", "gt", "le", "al",
}
local map_shift = { [0] = "lsl", "lsr", "asr", }
local map_shift = { [0] = "lsl", "lsr", "asr", "ror"}
local map_extend = {
[0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
@ -840,6 +836,20 @@ local function parse_fpimm8(op)
return sign * frac * 2^exp
end
local function decode_fpmovi(op)
local lo = rshift(op, 5)
local hi = rshift(op, 9)
lo = bor(band(lo, 1) * 0xff, band(lo, 2) * 0x7f80, band(lo, 4) * 0x3fc000,
band(lo, 8) * 0x1fe00000)
hi = bor(band(hi, 1) * 0xff, band(hi, 0x80) * 0x1fe,
band(hi, 0x100) * 0xff00, band(hi, 0x200) * 0x7f8000)
if hi ~= 0 then
return fmt_hex32(hi)..tohex(lo)
else
return fmt_hex32(lo)
end
end
local function prefer_bfx(sf, uns, imms, immr)
if imms < immr or imms == 31 or imms == 63 then
return false
@ -956,7 +966,7 @@ local function disass_ins(ctx)
elseif p == "U" then
local rn = map_regs.x[band(rshift(op, 5), 31)]
local sz = band(rshift(op, 30), 3)
local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
local imm12 = lshift(rshift(lshift(op, 10), 20), sz)
if imm12 ~= 0 then
x = "["..rn..", #"..imm12.."]"
else
@ -993,8 +1003,7 @@ local function disass_ins(ctx)
x = x.."]"
end
elseif p == "P" then
local opcv, sh = rshift(op, 26), 2
if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
local sh = 2 + rshift(op, 31 - band(rshift(op, 26), 1))
local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
local rn = map_regs.x[band(rshift(op, 5), 31)]
local ind = band(rshift(op, 23), 3)
@ -1140,6 +1149,8 @@ local function disass_ins(ctx)
x = 0
elseif p == "F" then
x = parse_fpimm8(op)
elseif p == "G" then
x = "#0x"..decode_fpmovi(op)
elseif p == "g" or p == "f" or p == "x" or p == "w" or
p == "d" or p == "s" then
-- These are handled in D/N/M/A.

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT ARM64BE disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- ARM64 instructions are always little-endian. So just forward to the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPS disassembler module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT/X license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This is a helper module used by the LuaJIT machine code dumper module.

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPS64 disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the big-endian functions from the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPS64EL disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the little-endian functions from the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPS64R6 disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the r6 big-endian functions from the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPS64R6EL disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the r6 little-endian functions from the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT MIPSEL disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the little-endian functions from the

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT PPC disassembler module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT/X license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This is a helper module used by the LuaJIT machine code dumper module.

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT x64 disassembler wrapper module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This module just exports the 64 bit functions from the combined

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT x86/x64 disassembler module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-- This is a helper module used by the LuaJIT machine code dumper module.

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT compiler dump module.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--
@ -552,7 +552,12 @@ local recdepth = 0
local function fmterr(err, info)
if type(err) == "number" then
if type(info) == "function" then info = fmtfunc(info) end
err = format(vmdef.traceerr[err], info)
local fmt = vmdef.traceerr[err]
if fmt == "NYI: bytecode %s" then
local oidx = 6 * info
info = sub(vmdef.bcnames, oidx+1, oidx+6)
end
err = format(fmt, info)
end
return err
end
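The fmterr() fix above resolves the "NYI: bytecode %s" trace error against vmdef.bcnames, which packs one fixed-width 6-character name per opcode. A small standalone illustration of that indexing (not part of the diff):

-- vmdef.bcnames stores opcode names as consecutive 6-character fields
local vmdef = require("jit.vmdef")
local function bcname(op)   -- op is a 0-based bytecode opcode number
  return (vmdef.bcnames:sub(6*op + 1, 6*op + 6):gsub("%s+$", ""))
end
print(bcname(0))            -- first opcode name (e.g. "ISLT")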

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT profiler.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--
@ -227,9 +227,7 @@ local function prof_finish()
local samples = prof_samples
if samples == 0 then
if prof_raw ~= true then out:write("[No samples collected]\n") end
return
end
if prof_ann then
elseif prof_ann then
prof_annotate(prof_count1, samples)
else
prof_top(prof_count1, prof_count2, samples, "")

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- Verbose mode of the LuaJIT compiler.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--
@ -62,7 +62,7 @@ local jit = require("jit")
local jutil = require("jit.util")
local vmdef = require("jit.vmdef")
local funcinfo, traceinfo = jutil.funcinfo, jutil.traceinfo
local type, format = type, string.format
local type, sub, format = type, string.sub, string.format
local stdout, stderr = io.stdout, io.stderr
-- Active flag and output file handle.
@ -89,7 +89,12 @@ end
local function fmterr(err, info)
if type(err) == "number" then
if type(info) == "function" then info = fmtfunc(info) end
err = format(vmdef.traceerr[err], info)
local fmt = vmdef.traceerr[err]
if fmt == "NYI: bytecode %s" then
local oidx = 6 * info
info = sub(vmdef.bcnames, oidx+1, oidx+6)
end
err = format(fmt, info)
end
return err
end

View File

@ -1,7 +1,7 @@
----------------------------------------------------------------------------
-- LuaJIT profiler zones.
--
-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
-- Copyright (C) 2005-2025 Mike Pall. All rights reserved.
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
--

View File

@ -1,6 +1,6 @@
/*
** Auxiliary library for the Lua/C API.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major parts taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h

View File

@ -1,6 +1,6 @@
/*
** Base and coroutine library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2011 Lua.org, PUC-Rio. See Copyright Notice in lua.h
@ -146,6 +146,8 @@ LJLIB_CF(getfenv) LJLIB_REC(.)
cTValue *o = L->base;
if (!(o < L->top && tvisfunc(o))) {
int level = lj_lib_optint(L, 1, 1);
if (level < 0)
lj_err_arg(L, 1, LJ_ERR_INVLVL);
o = lj_debug_frame(L, level, &level);
if (o == NULL)
lj_err_arg(L, 1, LJ_ERR_INVLVL);
@ -168,6 +170,8 @@ LJLIB_CF(setfenv)
setgcref(L->env, obj2gco(t));
return 0;
}
if (level < 0)
lj_err_arg(L, 1, LJ_ERR_INVLVL);
o = lj_debug_frame(L, level, &level);
if (o == NULL)
lj_err_arg(L, 1, LJ_ERR_INVLVL);
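Both added checks reject negative levels up front instead of passing them to lj_debug_frame(). The Lua-visible effect is an argument error (a sketch; the exact error message is not shown in this diff):

-- negative getfenv()/setfenv() levels now raise an argument error
assert(not pcall(getfenv, -1))
assert(not pcall(setfenv, -1, {}))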
@ -360,7 +364,11 @@ LJLIB_ASM_(xpcall) LJLIB_REC(.)
static int load_aux(lua_State *L, int status, int envarg)
{
if (status == LUA_OK) {
if (tvistab(L->base+envarg-1)) {
/*
** Set environment table for top-level function.
** Don't do this for non-native bytecode, which returns a prototype.
*/
if (tvistab(L->base+envarg-1) && tvisfunc(L->top-1)) {
GCfunc *fn = funcV(L->top-1);
GCtab *t = tabV(L->base+envarg-1);
setgcref(fn->c.env, obj2gco(t));
@ -616,7 +624,10 @@ static int ffh_resume(lua_State *L, lua_State *co, int wrap)
setstrV(L, L->base-LJ_FR2, lj_err_str(L, em));
return FFH_RES(2);
}
lj_state_growstack(co, (MSize)(L->top - L->base));
if (lj_state_cpgrowstack(co, (MSize)(L->top - L->base)) != LUA_OK) {
cTValue *msg = --co->top;
lj_err_callermsg(L, strVdata(msg));
}
return FFH_RETRY;
}

View File

@ -1,6 +1,6 @@
/*
** Bit manipulation library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lib_bit_c
@ -98,7 +98,7 @@ LJLIB_ASM(bit_lshift) LJLIB_REC(bit_shift IR_BSHL)
x = lj_carith_shift64(x, sh, curr_func(L)->c.ffid - (int)FF_bit_lshift);
return bit_result64(L, id, x);
}
if (id2) setintV(L->base+1, sh);
setintV(L->base+1, sh);
return FFH_RETRY;
#else
lj_lib_checknumber(L, 1);

View File

@ -1,6 +1,6 @@
/*
** Buffer library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lib_buffer_c

View File

@ -1,6 +1,6 @@
/*
** Debug library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h

View File

@ -1,6 +1,6 @@
/*
** FFI library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lib_ffi_c
@ -305,7 +305,7 @@ LJLIB_CF(ffi_meta___tostring)
p = *(void **)p;
} else if (ctype_isenum(ct->info)) {
msg = "cdata<%s>: %d";
p = (void *)(uintptr_t)*(uint32_t **)p;
p = (void *)(uintptr_t)*(uint32_t *)p;
} else {
if (ctype_isptr(ct->info)) {
p = cdata_getptr(p, ct->size);
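The cast fix above affects __tostring on enum cdata, which now reads the 32-bit enum value directly instead of misinterpreting it as a pointer. A quick way to exercise that path (the enum name is illustrative and the exact output format may differ):

local ffi = require("ffi")
ffi.cdef("typedef enum { EXAMPLE_VAL = 42 } example_e;")  -- hypothetical enum for illustration
print(tostring(ffi.new("example_e", 42)))                 -- prints something like "cdata<enum ...>: 42"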
@ -513,7 +513,7 @@ LJLIB_CF(ffi_new) LJLIB_REC(.)
/* Handle ctype __gc metamethod. Use the fast lookup here. */
cTValue *tv = lj_tab_getinth(cts->miscmap, -(int32_t)id);
if (tv && tvistab(tv) && (tv = lj_meta_fast(L, tabV(tv), MM_gc))) {
GCtab *t = cts->finalizer;
GCtab *t = tabref(G(L)->gcroot[GCROOT_FFI_FIN]);
if (gcref(t->metatable)) {
/* Add to finalizer table, if still enabled. */
copyTV(L, lj_tab_set(L, t, o-1), tv);
@ -746,7 +746,7 @@ LJLIB_CF(ffi_abi) LJLIB_REC(.)
"\003win"
#endif
#if LJ_ABI_PAUTH
"\007pauth"
"\005pauth"
#endif
#if LJ_TARGET_UWP
"\003uwp"
@ -765,7 +765,7 @@ LJLIB_CF(ffi_abi) LJLIB_REC(.)
return 1;
}
LJLIB_PUSH(top-8) LJLIB_SET(!) /* Store reference to miscmap table. */
LJLIB_PUSH(top-7) LJLIB_SET(!) /* Store reference to miscmap table. */
LJLIB_CF(ffi_metatype)
{
@ -791,8 +791,6 @@ LJLIB_CF(ffi_metatype)
return 1;
}
LJLIB_PUSH(top-7) LJLIB_SET(!) /* Store reference to finalizer table. */
LJLIB_CF(ffi_gc) LJLIB_REC(.)
{
GCcdata *cd = ffi_checkcdata(L, 1);
@ -825,19 +823,6 @@ LJLIB_PUSH(top-2) LJLIB_SET(arch)
/* ------------------------------------------------------------------------ */
/* Create special weak-keyed finalizer table. */
static GCtab *ffi_finalizer(lua_State *L)
{
/* NOBARRIER: The table is new (marked white). */
GCtab *t = lj_tab_new(L, 0, 1);
settabV(L, L->top++, t);
setgcref(t->metatable, obj2gco(t));
setstrV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "__mode")),
lj_str_newlit(L, "k"));
t->nomm = (uint8_t)(~(1u<<MM_mode));
return t;
}
/* Register FFI module as loaded. */
static void ffi_register_module(lua_State *L)
{
@ -853,7 +838,6 @@ LUALIB_API int luaopen_ffi(lua_State *L)
{
CTState *cts = lj_ctype_init(L);
settabV(L, L->top++, (cts->miscmap = lj_tab_new(L, 0, 1)));
cts->finalizer = ffi_finalizer(L);
LJ_LIB_REG(L, NULL, ffi_meta);
/* NOBARRIER: basemt is a GC root. */
setgcref(basemt_it(G(L), LJ_TCDATA), obj2gco(tabV(L->top-1)));
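Earlier in this file the length prefix of the "pauth" ABI keyword was corrected from \007 to \005 (the string has five characters), which makes the entry matchable by the ABI query. A sketch of the Lua-visible check (true only on builds with pointer authentication enabled):

local ffi = require("ffi")
print(ffi.abi("pauth"))   -- false on most targets, true on arm64e/PAuth builds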

View File

@ -1,6 +1,6 @@
/*
** Library initialization.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major parts taken verbatim from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h

View File

@ -1,6 +1,6 @@
/*
** I/O library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2011 Lua.org, PUC-Rio. See Copyright Notice in lua.h
@ -25,6 +25,7 @@
#include "lj_strfmt.h"
#include "lj_ff.h"
#include "lj_lib.h"
#include "lj_strscan.h"
/* Userdata payload for I/O file. */
typedef struct IOFileUD {
@ -323,13 +324,14 @@ LJLIB_CF(io_method_seek)
FILE *fp = io_tofile(L)->fp;
int opt = lj_lib_checkopt(L, 2, 1, "\3set\3cur\3end");
int64_t ofs = 0;
cTValue *o;
TValue *o;
int res;
if (opt == 0) opt = SEEK_SET;
else if (opt == 1) opt = SEEK_CUR;
else if (opt == 2) opt = SEEK_END;
o = L->base+2;
if (o < L->top) {
if (tvisstr(o)) lj_strscan_num(strV(o), o);
if (tvisint(o))
ofs = (int64_t)intV(o);
else if (tvisnum(o))
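The added lj_strscan_num() call restores string-to-number coercion for the offset argument of fp:seek(), matching the other numeric arguments. A minimal sketch (the file name is illustrative):

local fp = assert(io.open("scratch.bin", "w+b"))  -- hypothetical scratch file
fp:write(("x"):rep(100))
assert(fp:seek("set", "10") == 10)                -- string offset is now coerced
fp:close()
os.remove("scratch.bin")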

View File

@ -1,6 +1,6 @@
/*
** JIT library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lib_jit_c
@ -161,24 +161,6 @@ LJLIB_PUSH(top-2) LJLIB_SET(version)
/* -- Reflection API for Lua functions ------------------------------------ */
/* Return prototype of first argument (Lua function or prototype object) */
static GCproto *check_Lproto(lua_State *L, int nolua)
{
TValue *o = L->base;
if (L->top > o) {
if (tvisproto(o)) {
return protoV(o);
} else if (tvisfunc(o)) {
if (isluafunc(funcV(o)))
return funcproto(funcV(o));
else if (nolua)
return NULL;
}
}
lj_err_argt(L, 1, LUA_TFUNCTION);
return NULL; /* unreachable */
}
static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val)
{
setintV(lj_tab_setstr(L, t, lj_str_newz(L, name)), val);
@ -187,7 +169,7 @@ static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val)
/* local info = jit.util.funcinfo(func [,pc]) */
LJLIB_CF(jit_util_funcinfo)
{
GCproto *pt = check_Lproto(L, 1);
GCproto *pt = lj_lib_checkLproto(L, 1, 1);
if (pt) {
BCPos pc = (BCPos)lj_lib_optint(L, 2, 0);
GCtab *t;
@ -229,7 +211,7 @@ LJLIB_CF(jit_util_funcinfo)
/* local ins, m = jit.util.funcbc(func, pc) */
LJLIB_CF(jit_util_funcbc)
{
GCproto *pt = check_Lproto(L, 0);
GCproto *pt = lj_lib_checkLproto(L, 1, 0);
BCPos pc = (BCPos)lj_lib_checkint(L, 2);
if (pc < pt->sizebc) {
BCIns ins = proto_bc(pt)[pc];
@ -246,7 +228,7 @@ LJLIB_CF(jit_util_funcbc)
/* local k = jit.util.funck(func, idx) */
LJLIB_CF(jit_util_funck)
{
GCproto *pt = check_Lproto(L, 0);
GCproto *pt = lj_lib_checkLproto(L, 1, 0);
ptrdiff_t idx = (ptrdiff_t)lj_lib_checkint(L, 2);
if (idx >= 0) {
if (idx < (ptrdiff_t)pt->sizekn) {
@ -266,7 +248,7 @@ LJLIB_CF(jit_util_funck)
/* local name = jit.util.funcuvname(func, idx) */
LJLIB_CF(jit_util_funcuvname)
{
GCproto *pt = check_Lproto(L, 0);
GCproto *pt = lj_lib_checkLproto(L, 1, 0);
uint32_t idx = (uint32_t)lj_lib_checkint(L, 2);
if (idx < pt->sizeuv) {
setstrV(L, L->top-1, lj_str_newz(L, lj_debug_uvname(pt, idx)));
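This refactor replaces the local check_Lproto() helper with the shared lj_lib_checkLproto() (also used by string.dump below); the Lua-visible jit.util API stays the same. A quick sanity sketch:

local jutil = require("jit.util")
local function f() return 1 end
local info = jutil.funcinfo(f)      -- still accepts a Lua function
print(info.source, info.linedefined)
local ins, m = jutil.funcbc(f, 0)   -- instruction 0 and its operand mode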

View File

@ -1,6 +1,6 @@
/*
** Math library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include <math.h>
@ -13,6 +13,7 @@
#include "lualib.h"
#include "lj_obj.h"
#include "lj_err.h"
#include "lj_lib.h"
#include "lj_vm.h"
#include "lj_prng.h"
@ -183,7 +184,10 @@ LJLIB_PUSH(top-2) /* Upvalue holds userdata with PRNGState. */
LJLIB_CF(math_randomseed)
{
PRNGState *rs = (PRNGState *)(uddata(udataV(lj_lib_upvalue(L, 1))));
random_seed(rs, lj_lib_checknum(L, 1));
if (L->base != L->top)
random_seed(rs, lj_lib_checknum(L, 1));
else if (!lj_prng_seed_secure(rs))
lj_err_caller(L, LJ_ERR_PRNGSD);
return 0;
}
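With this change, math.randomseed() called without an argument re-seeds the PRNG from a secure entropy source (and raises an error if none is available), while the numeric form keeps its previous meaning. A short sketch:

math.randomseed()     -- no argument: seed from a secure source
math.randomseed(42)   -- explicit numeric seed, as before
print(math.random())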

View File

@ -1,6 +1,6 @@
/*
** OS library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h

View File

@ -1,6 +1,6 @@
/*
** Package library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2012 Lua.org, PUC-Rio. See Copyright Notice in lua.h


@ -1,6 +1,6 @@
/*
** String library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
@ -122,11 +122,25 @@ static int writer_buf(lua_State *L, const void *p, size_t size, void *sb)
LJLIB_CF(string_dump)
{
GCfunc *fn = lj_lib_checkfunc(L, 1);
int strip = L->base+1 < L->top && tvistruecond(L->base+1);
SBuf *sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */
GCproto *pt = lj_lib_checkLproto(L, 1, 1);
uint32_t flags = 0;
SBuf *sb;
TValue *o = L->base+1;
if (o < L->top) {
if (tvisstr(o)) {
const char *mode = strVdata(o);
char c;
while ((c = *mode++)) {
if (c == 's') flags |= BCDUMP_F_STRIP;
if (c == 'd') flags |= BCDUMP_F_DETERMINISTIC;
}
} else if (tvistruecond(o)) {
flags |= BCDUMP_F_STRIP;
}
}
sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */
L->top = L->base+1;
if (!isluafunc(fn) || lj_bcwrite(L, funcproto(fn), writer_buf, sb, strip))
if (!pt || lj_bcwrite(L, pt, writer_buf, sb, flags))
lj_err_caller(L, LJ_ERR_STRDUMP);
setstrV(L, L->top-1, lj_buf_str(L, sb));
lj_gc_check(L);
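
A sketch of the Lua-visible behaviour implied by the mode-string parsing above (the function f is just a placeholder): the second argument to string.dump() may now be a string of flag characters, where 's' strips debug info and 'd' requests a deterministic dump; any other truthy value keeps the old meaning and merely strips.

local function f(x) return x + 1 end
local full  = string.dump(f)        -- full dump, debug info kept
local strip = string.dump(f, true)  -- old-style flag: strip debug info
local repro = string.dump(f, "sd")  -- mode string: strip + deterministic dump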


@ -1,6 +1,6 @@
/*
** Table library.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h


@ -1057,7 +1057,7 @@ static size_t release_unused_segments(mstate m)
mchunkptr p = align_as_chunk(base);
size_t psize = chunksize(p);
/* Can unmap if first chunk holds entire segment and not pinned */
if (!cinuse(p) && (char *)p + psize >= base + size - TOP_FOOT_SIZE) {
if (!cinuse(p) && (char *)p + psize == (char *)mem2chunk(sp)) {
tchunkptr tp = (tchunkptr)p;
if (p == m->dv) {
m->dv = 0;


@ -1,6 +1,6 @@
/*
** Public Lua/C API.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
@ -104,7 +104,12 @@ LUA_API int lua_checkstack(lua_State *L, int size)
if (size > LUAI_MAXCSTACK || (L->top - L->base + size) > LUAI_MAXCSTACK) {
return 0; /* Stack overflow. */
} else if (size > 0) {
lj_state_checkstack(L, (MSize)size);
int avail = (int)(mref(L->maxstack, TValue) - L->top);
if (size > avail &&
lj_state_cpgrowstack(L, (MSize)(size - avail)) != LUA_OK) {
L->top--;
return 0; /* Out of memory. */
}
}
return 1;
}
@ -1047,6 +1052,7 @@ LUA_API int lua_setmetatable(lua_State *L, int idx)
/* Flush cache, since traces specialize to basemt. But not during __gc. */
if (lj_trace_flushall(L))
lj_err_caller(L, LJ_ERR_NOGCMM);
o = index2adr(L, idx); /* Stack may have been reallocated. */
if (tvisbool(o)) {
/* NOBARRIER: basemt is a GC root. */
setgcref(basemt_it(g, LJ_TTRUE), obj2gco(mt));


@ -1,6 +1,6 @@
/*
** Target architecture selection.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_ARCH_H
@ -57,7 +57,7 @@
#define LUAJIT_TARGET LUAJIT_ARCH_X64
#elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM64
#elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC)
#define LUAJIT_TARGET LUAJIT_ARCH_PPC
@ -66,7 +66,7 @@
#elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS)
#define LUAJIT_TARGET LUAJIT_ARCH_MIPS32
#else
#error "No support for this architecture (yet)"
#error "Architecture not supported (in this version), see: https://luajit.org/status.html#architectures"
#endif
#endif
@ -124,7 +124,7 @@
#define LJ_TARGET_POSIX (LUAJIT_OS > LUAJIT_OS_WINDOWS)
#define LJ_TARGET_DLOPEN LJ_TARGET_POSIX
#if TARGET_OS_IPHONE
#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
#define LJ_TARGET_IOS 1
#else
#define LJ_TARGET_IOS 0
@ -237,7 +237,7 @@
#define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */
#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL
#if __ARM_ARCH == 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
#if __ARM_ARCH >= 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
#define LJ_ARCH_VERSION 80
#elif __ARM_ARCH == 7 || __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__
#define LJ_ARCH_VERSION 70
@ -331,6 +331,7 @@
#define LJ_ARCH_NOFFI 1
#elif LJ_ARCH_BITS == 64
#error "No support for PPC64"
#undef LJ_TARGET_PPC
#endif
#if _ARCH_PWR7
@ -490,36 +491,45 @@
#elif LJ_TARGET_ARM
#if defined(__ARMEB__)
#error "No support for big-endian ARM"
#undef LJ_TARGET_ARM
#endif
#if __ARM_ARCH_6M__ || __ARM_ARCH_7M__ || __ARM_ARCH_7EM__
#error "No support for Cortex-M CPUs"
#undef LJ_TARGET_ARM
#endif
#if !(__ARM_EABI__ || LJ_TARGET_IOS)
#error "Only ARM EABI or iOS 3.0+ ABI is supported"
#undef LJ_TARGET_ARM
#endif
#elif LJ_TARGET_ARM64
#if defined(_ILP32)
#error "No support for ILP32 model on ARM64"
#undef LJ_TARGET_ARM64
#endif
#elif LJ_TARGET_PPC
#if defined(_LITTLE_ENDIAN) && (!defined(_BYTE_ORDER) || (_BYTE_ORDER == _LITTLE_ENDIAN))
#error "No support for little-endian PPC32"
#undef LJ_TARGET_PPC
#endif
#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT)
#error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
#error "No support for PPC/e500, use LuaJIT 2.0"
#undef LJ_TARGET_PPC
#endif
#elif LJ_TARGET_MIPS32
#if !((defined(_MIPS_SIM_ABI32) && _MIPS_SIM == _MIPS_SIM_ABI32) || (defined(_ABIO32) && _MIPS_SIM == _ABIO32))
#error "Only o32 ABI supported for MIPS32"
#undef LJ_TARGET_MIPS
#endif
#if LJ_TARGET_MIPSR6
/* Not that useful, since most available r6 CPUs are 64 bit. */
#error "No support for MIPS32R6"
#undef LJ_TARGET_MIPS
#endif
#elif LJ_TARGET_MIPS64
#if !((defined(_MIPS_SIM_ABI64) && _MIPS_SIM == _MIPS_SIM_ABI64) || (defined(_ABI64) && _MIPS_SIM == _ABI64))
/* MIPS32ON64 aka n32 ABI support might be desirable, but difficult. */
#error "Only n64 ABI supported for MIPS64"
#undef LJ_TARGET_MIPS
#endif
#endif
#endif


@ -1,6 +1,6 @@
/*
** IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_asm_c
@ -29,6 +29,7 @@
#include "lj_dispatch.h"
#include "lj_vm.h"
#include "lj_target.h"
#include "lj_prng.h"
#ifdef LUA_USE_ASSERT
#include <stdio.h>
@ -93,6 +94,12 @@ typedef struct ASMState {
MCode *flagmcp; /* Pending opportunity to merge flag setting ins. */
MCode *realign; /* Realign loop if not NULL. */
#ifdef LUAJIT_RANDOM_RA
/* Randomize register allocation. OK for fuzz testing, not for production. */
uint64_t prngbits;
PRNGState prngstate;
#endif
#ifdef RID_NUM_KREF
intptr_t krefk[RID_NUM_KREF];
#endif
@ -173,6 +180,41 @@ IRFLDEF(FLOFS)
0
};
#ifdef LUAJIT_RANDOM_RA
/* Return a fixed number of random bits from the local PRNG state. */
static uint32_t ra_random_bits(ASMState *as, uint32_t nbits) {
uint64_t b = as->prngbits;
uint32_t res = (1u << nbits) - 1u;
if (b <= res) b = lj_prng_u64(&as->prngstate) | (1ull << 63);
res &= (uint32_t)b;
as->prngbits = b >> nbits;
return res;
}
/* Pick a random register from a register set. */
static Reg rset_pickrandom(ASMState *as, RegSet rs)
{
Reg r = rset_pickbot_(rs);
rs >>= r;
if (rs > 1) { /* More than one bit set? */
while (1) {
/* We need to sample max. the GPR or FPR half of the set. */
uint32_t d = ra_random_bits(as, RSET_BITS-1);
if ((rs >> d) & 1) {
r += d;
break;
}
}
}
return r;
}
#define rset_picktop(rs) rset_pickrandom(as, rs)
#define rset_pickbot(rs) rset_pickrandom(as, rs)
#else
#define rset_picktop(rs) rset_picktop_(rs)
#define rset_pickbot(rs) rset_pickbot_(rs)
#endif
/* -- Target-specific instruction emitter --------------------------------- */
#if LJ_TARGET_X86ORX64
@ -564,7 +606,11 @@ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow)
IRIns *ir = IR(ref);
if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) ||
#if LJ_GC64
#if LJ_TARGET_ARM64
(ir->o == IR_KINT && (uint64_t)k == (uint32_t)ir->i) ||
#else
(ir->o == IR_KINT && k == ir->i) ||
#endif
(ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) ||
((ir->o == IR_KPTR || ir->o == IR_KKPTR) &&
k == (intptr_t)ir_kptr(ir))
@ -903,11 +949,11 @@ static int asm_sunk_store(ASMState *as, IRIns *ira, IRIns *irs)
static void asm_snap_alloc1(ASMState *as, IRRef ref)
{
IRIns *ir = IR(ref);
if (!irref_isk(ref) && ir->r != RID_SUNK) {
if (!irref_isk(ref)) {
bloomset(as->snapfilt1, ref);
bloomset(as->snapfilt2, hashrot(ref, ref + HASH_BIAS));
if (ra_used(ir)) return;
if (ir->r == RID_SINK) {
if (ir->r == RID_SINK || ir->r == RID_SUNK) {
ir->r = RID_SUNK;
#if LJ_HASFFI
if (ir->o == IR_CNEWI) { /* Allocate CNEWI value. */
@ -2442,6 +2488,9 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
as->realign = NULL;
as->loopinv = 0;
as->parent = J->parent ? traceref(J, J->parent) : NULL;
#ifdef LUAJIT_RANDOM_RA
(void)lj_prng_u64(&J2G(J)->prng); /* Ensure PRNG step between traces. */
#endif
/* Reserve MCode memory. */
as->mctop = as->mctoporig = lj_mcode_reserve(J, &as->mcbot);
@ -2483,6 +2532,10 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
#endif
as->ir = J->curfinal->ir; /* Use the copied IR. */
as->curins = J->cur.nins = as->orignins;
#ifdef LUAJIT_RANDOM_RA
as->prngstate = J2G(J)->prng; /* Must (re)start from identical state. */
as->prngbits = 0;
#endif
RA_DBG_START();
RA_DBGX((as, "===== STOP ====="));


@ -1,6 +1,6 @@
/*
** IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_ASM_H


@ -1,6 +1,6 @@
/*
** ARM IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
/* -- Register allocator extensions --------------------------------------- */
@ -969,24 +969,32 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
if (irref_isk(ir->op1)) {
int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, ARMI_LDR, dest, v);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
if (ir->o == IR_UREFC) {
asm_guardcc(as, CC_NE);
if (guarded) {
asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
emit_n(as, ARMI_CMP|ARMI_K12|1, RID_TMP);
emit_opk(as, ARMI_ADD, dest, uv,
(int32_t)offsetof(GCupval, tv), RSET_GPR);
emit_lso(as, ARMI_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
} else {
emit_lso(as, ARMI_LDR, dest, uv, (int32_t)offsetof(GCupval, v));
}
emit_lso(as, ARMI_LDR, uv, func,
(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
if (ir->o == IR_UREFC)
emit_opk(as, ARMI_ADD, dest, dest,
(int32_t)offsetof(GCupval, tv), RSET_GPR);
else
emit_lso(as, ARMI_LDR, dest, dest, (int32_t)offsetof(GCupval, v));
if (guarded)
emit_lso(as, ARMI_LDRB, RID_TMP, dest,
(int32_t)offsetof(GCupval, closed));
if (irref_isk(ir->op1)) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
emit_loadi(as, dest, k);
} else {
emit_lso(as, ARMI_LDR, dest, ra_alloc1(as, ir->op1, RSET_GPR),
(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
}
}
}
@ -1919,7 +1927,7 @@ static void asm_hiop(ASMState *as, IRIns *ir)
} else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) {
as->curins--; /* Always skip the loword min/max. */
if (uselo || usehi)
asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_PL : CC_LE);
asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_HS : CC_LS);
return;
#elif LJ_HASFFI
} else if ((ir-1)->o == IR_CONV) {
@ -1990,6 +1998,7 @@ static void asm_prof(ASMState *as, IRIns *ir)
static void asm_stack_check(ASMState *as, BCReg topslot,
IRIns *irp, RegSet allow, ExitNo exitno)
{
int savereg = 0;
Reg pbase;
uint32_t k;
if (irp) {
@ -2000,12 +2009,14 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
pbase = rset_pickbot(allow);
} else {
pbase = RID_RET;
emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */
savereg = 1;
}
} else {
pbase = RID_BASE;
}
emit_branch(as, ARMF_CC(ARMI_BL, CC_LS), exitstub_addr(as->J, exitno));
if (savereg)
emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */
k = emit_isk12(0, (int32_t)(8*topslot));
lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
emit_n(as, ARMI_CMP^k, RID_TMP);
@ -2017,7 +2028,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
if (ra_hasspill(irp->s))
emit_lso(as, ARMI_LDR, pbase, RID_SP, sps_scale(irp->s));
emit_lso(as, ARMI_LDR, RID_TMP, RID_TMP, (i & 4095));
if (ra_hasspill(irp->s) && !allow)
if (savereg)
emit_lso(as, ARMI_STR, RID_RET, RID_SP, 0); /* Save temp. register. */
emit_loadi(as, RID_TMP, (i & ~4095));
} else {
@ -2031,11 +2042,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
SnapEntry *map = &as->T->snapmap[snap->mapofs];
SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1];
MSize n, nent = snap->nent;
int32_t bias = 0;
/* Store the value of all modified slots to the Lua stack. */
for (n = 0; n < nent; n++) {
SnapEntry sn = map[n];
BCReg s = snap_slot(sn);
int32_t ofs = 8*((int32_t)s-1);
int32_t ofs = 8*((int32_t)s-1) - bias;
IRRef ref = snap_ref(sn);
IRIns *ir = IR(ref);
if ((sn & SNAP_NORESTORE))
@ -2054,6 +2066,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
emit_lso(as, ARMI_STR, tmp, RID_BASE, ofs+4);
#else
Reg src = ra_alloc1(as, ref, RSET_FPR);
if (LJ_UNLIKELY(ofs < -1020 || ofs > 1020)) {
int32_t adj = ofs & 0xffffff00; /* K12-friendly. */
bias += adj;
ofs -= adj;
emit_addptr(as, RID_BASE, -adj);
}
emit_vlso(as, ARMI_VSTR_D, src, RID_BASE, ofs);
#endif
} else {
@ -2082,6 +2100,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
}
checkmclim(as);
}
emit_addptr(as, RID_BASE, bias);
lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
}
@ -2252,7 +2271,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
}
if (nslots > as->evenspill) /* Leave room for args in stack slots. */
as->evenspill = nslots;
return REGSP_HINT(RID_RET);
return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
}
static void asm_setup_target(ASMState *as)


@ -1,6 +1,6 @@
/*
** ARM64 IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
** Sponsored by Cisco Systems, Inc.
@ -84,18 +84,23 @@ static void asm_guardcc(ASMState *as, A64CC cc)
emit_cond_branch(as, cc, target);
}
/* Emit test and branch instruction to exit for guard. */
static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
/* Emit test and branch instruction to exit for guard, if in range. */
static int asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
{
MCode *target = asm_exitstub_addr(as, as->snapno);
MCode *p = as->mcp;
ptrdiff_t delta = target - p;
if (LJ_UNLIKELY(p == as->invmcp)) {
if (as->orignins > 1023) return 0; /* Delta might end up too large. */
as->loopinv = 1;
*p = A64I_B | A64F_S26(target-p);
emit_tnb(as, ai^0x01000000u, r, bit, p-1);
return;
*p = A64I_B | A64F_S26(delta);
ai ^= 0x01000000u;
target = p-1;
} else if (LJ_UNLIKELY(delta >= 0x1fff)) {
return 0;
}
emit_tnb(as, ai, r, bit, target);
return 1;
}
/* Emit compare and branch instruction to exit for guard. */
@ -211,16 +216,14 @@ static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
{
IRIns *ir = IR(ref);
int logical = (ai & 0x1f000000) == 0x0a000000;
if (ra_hasreg(ir->r)) {
ra_noweak(as, ir->r);
return A64F_M(ir->r);
} else if (irref_isk(ref)) {
uint32_t m;
int64_t k = get_k64val(as, ref);
if ((ai & 0x1f000000) == 0x0a000000)
m = emit_isk13(k, irt_is64(ir->t));
else
m = emit_isk12(k);
uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) :
emit_isk12(irt_is64(ir->t) ? k : (int32_t)k);
if (m)
return m;
} else if (mayfuse(as, ref)) {
@ -232,7 +235,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
(IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
IRIns *irl = IR(ir->op1);
if (sh == A64SH_LSL &&
irl->o == IR_CONV &&
irl->o == IR_CONV && !logical &&
irl->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT) &&
shift <= 4 &&
canfuse(as, irl)) {
@ -242,7 +245,11 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
Reg m = ra_alloc1(as, ir->op1, allow);
return A64F_M(m) | A64F_SH(sh, shift);
}
} else if (ir->o == IR_CONV &&
} else if (ir->o == IR_BROR && logical && irref_isk(ir->op2)) {
Reg m = ra_alloc1(as, ir->op1, allow);
int shift = (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
return A64F_M(m) | A64F_SH(A64SH_ROR, shift);
} else if (ir->o == IR_CONV && !logical &&
ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)) {
Reg m = ra_alloc1(as, ir->op1, allow);
return A64F_M(m) | A64F_EX(A64EX_SXTW);
@ -419,13 +426,18 @@ static int asm_fuseorshift(ASMState *as, IRIns *ir)
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
{
uint32_t n, nargs = CCI_XNARGS(ci);
int32_t ofs = 0;
int32_t spofs = 0, spalign = LJ_HASFFI && LJ_TARGET_OSX ? 0 : 7;
Reg gpr, fpr = REGARG_FIRSTFPR;
if (ci->func)
emit_call(as, ci->func);
for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
as->cost[gpr] = REGCOST(~0u, ASMREF_L);
gpr = REGARG_FIRSTGPR;
#if LJ_HASFFI && LJ_ABI_WIN
if ((ci->flags & CCI_VARARG)) {
fpr = REGARG_LASTFPR+1;
}
#endif
for (n = 0; n < nargs; n++) { /* Setup args. */
IRRef ref = args[n];
IRIns *ir = IR(ref);
@ -436,10 +448,21 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
"reg %d not free", fpr); /* Must have been evicted. */
ra_leftov(as, fpr, ref);
fpr++;
#if LJ_HASFFI && LJ_ABI_WIN
} else if ((ci->flags & CCI_VARARG) && (gpr <= REGARG_LASTGPR)) {
Reg rf = ra_alloc1(as, ref, RSET_FPR);
emit_dn(as, A64I_FMOV_R_D, gpr++, rf & 31);
#endif
} else {
Reg r = ra_alloc1(as, ref, RSET_FPR);
emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_isnum(ir->t)) ? 4 : 0));
ofs += 8;
int32_t al = spalign;
#if LJ_HASFFI && LJ_TARGET_OSX
al |= irt_isnum(ir->t) ? 7 : 3;
#endif
spofs = (spofs + al) & ~al;
if (LJ_BE && al >= 7 && !irt_isnum(ir->t)) spofs += 4, al -= 4;
emit_spstore(as, ir, r, spofs);
spofs += al + 1;
}
} else {
if (gpr <= REGARG_LASTGPR) {
@ -449,10 +472,27 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
gpr++;
} else {
Reg r = ra_alloc1(as, ref, RSET_GPR);
emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_is64(ir->t)) ? 4 : 0));
ofs += 8;
int32_t al = spalign;
#if LJ_HASFFI && LJ_TARGET_OSX
al |= irt_size(ir->t) - 1;
#endif
spofs = (spofs + al) & ~al;
if (al >= 3) {
if (LJ_BE && al >= 7 && !irt_is64(ir->t)) spofs += 4, al -= 4;
emit_spstore(as, ir, r, spofs);
} else {
lj_assertA(al == 0 || al == 1, "size %d unexpected", al + 1);
emit_lso(as, al ? A64I_STRH : A64I_STRB, r, RID_SP, spofs);
}
spofs += al + 1;
}
}
#if LJ_HASFFI && LJ_TARGET_OSX
} else { /* Marker for start of varargs. */
gpr = REGARG_LASTGPR+1;
fpr = REGARG_LASTFPR+1;
spalign = 7;
#endif
}
}
}
@ -518,8 +558,6 @@ static void asm_retf(ASMState *as, IRIns *ir)
as->topslot -= (BCReg)delta;
if ((int32_t)as->topslot < 0) as->topslot = 0;
irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */
/* Need to force a spill on REF_BASE now to update the stack slot. */
emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
emit_setgl(as, base, jit_base);
emit_addptr(as, base, -8*delta);
asm_guardcc(as, CC_NE);
@ -643,25 +681,22 @@ static void asm_strto(ASMState *as, IRIns *ir)
{
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
IRRef args[2];
Reg dest = 0, tmp;
int destused = ra_used(ir);
Reg tmp;
int32_t ofs = 0;
ra_evictset(as, RSET_SCRATCH);
if (destused) {
if (ra_used(ir)) {
if (ra_hasspill(ir->s)) {
ofs = sps_scale(ir->s);
destused = 0;
if (ra_hasreg(ir->r)) {
ra_free(as, ir->r);
ra_modified(as, ir->r);
emit_spload(as, ir, ir->r, ofs);
}
} else {
dest = ra_dest(as, ir, RSET_FPR);
Reg dest = ra_dest(as, ir, RSET_FPR);
emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
}
}
if (destused)
emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
asm_guardcnb(as, A64I_CBZ, RID_RET);
args[0] = ir->op1; /* GCstr *str */
args[1] = ASMREF_TMP1; /* TValue *n */
@ -752,113 +787,75 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
int destused = ra_used(ir);
Reg dest = ra_dest(as, ir, allow);
Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
Reg key = 0, tmp = RID_TMP;
Reg ftmp = RID_NONE, type = RID_NONE, scr = RID_NONE, tisnum = RID_NONE;
Reg tmp = RID_TMP, type = RID_NONE, key = RID_NONE, tkey;
IRRef refkey = ir->op2;
IRIns *irkey = IR(refkey);
int isk = irref_isk(ir->op2);
int isk = irref_isk(refkey);
IRType1 kt = irkey->t;
uint32_t k = 0;
uint32_t khash;
MCLabel l_end, l_loop, l_next;
MCLabel l_end, l_loop;
rset_clear(allow, tab);
if (!isk) {
key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
rset_clear(allow, key);
if (!irt_isstr(kt)) {
tmp = ra_scratch(as, allow);
rset_clear(allow, tmp);
}
} else if (irt_isnum(kt)) {
int64_t val = (int64_t)ir_knum(irkey)->u64;
if (!(k = emit_isk12(val))) {
key = ra_allock(as, val, allow);
rset_clear(allow, key);
}
} else if (!irt_ispri(kt)) {
if (!(k = emit_isk12(irkey->i))) {
key = ra_alloc1(as, refkey, allow);
rset_clear(allow, key);
}
}
/* Allocate constants early. */
if (irt_isnum(kt)) {
if (!isk) {
tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
rset_clear(allow, tisnum);
}
} else if (irt_isaddr(kt)) {
if (isk) {
int64_t kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
scr = ra_allock(as, kk, allow);
/* Allocate register for tkey outside of the loop. */
if (isk) {
int64_t kk;
if (irt_isaddr(kt)) {
kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
} else if (irt_isnum(kt)) {
kk = (int64_t)ir_knum(irkey)->u64;
/* Assumes -0.0 is already canonicalized to +0.0. */
} else {
scr = ra_scratch(as, allow);
lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
kk = ~((int64_t)~irt_toitype(kt) << 47);
}
rset_clear(allow, scr);
k = emit_isk12(kk);
tkey = k ? 0 : ra_allock(as, kk, allow);
} else {
lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
type = ra_allock(as, ~((int64_t)~irt_toitype(kt) << 47), allow);
scr = ra_scratch(as, rset_clear(allow, type));
rset_clear(allow, scr);
tkey = ra_scratch(as, allow);
}
/* Key not found in chain: jump to exit (if merged) or load niltv. */
l_end = emit_label(as);
as->invmcp = NULL;
if (merge == IR_NE)
if (merge == IR_NE) {
asm_guardcc(as, CC_AL);
else if (destused)
emit_loada(as, dest, niltvg(J2G(as->J)));
} else if (destused) {
uint32_t k12 = emit_isk12(offsetof(global_State, nilnode.val));
lj_assertA(k12 != 0, "Cannot k12 encode niltv(L)");
emit_dn(as, A64I_ADDx^k12, dest, RID_GL);
}
/* Follow hash chain until the end. */
l_loop = --as->mcp;
emit_n(as, A64I_CMPx^A64I_K12^0, dest);
emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
l_next = emit_label(as);
if (destused)
emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
/* Type and value comparison. */
if (merge == IR_EQ)
asm_guardcc(as, CC_EQ);
else
emit_cond_branch(as, CC_EQ, l_end);
emit_nm(as, A64I_CMPx^k, tmp, tkey);
if (!destused)
emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key));
*l_loop = A64I_X | A64I_CBNZ | A64F_S19(as->mcp - l_loop) | dest;
if (irt_isnum(kt)) {
if (isk) {
/* Assumes -0.0 is already canonicalized to +0.0. */
if (k)
emit_n(as, A64I_CMPx^k, tmp);
else
emit_nm(as, A64I_CMPx, key, tmp);
emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
/* Construct tkey as canonicalized or tagged key. */
if (!isk) {
if (irt_isnum(kt)) {
key = ra_alloc1(as, refkey, RSET_FPR);
emit_dnm(as, A64I_CSELx | A64F_CC(CC_EQ), tkey, RID_ZERO, tkey);
/* A64I_FMOV_R_D from key to tkey done below. */
} else {
emit_nm(as, A64I_FCMPd, key, ftmp);
emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
emit_cond_branch(as, CC_LO, l_next);
emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
lj_assertA(irt_isaddr(kt), "bad HREF key type");
key = ra_alloc1(as, refkey, allow);
type = ra_allock(as, irt_toitype(kt) << 15, rset_clear(allow, key));
emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 32), tkey, key, type);
}
} else if (irt_isaddr(kt)) {
if (isk) {
emit_nm(as, A64I_CMPx, scr, tmp);
emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
} else {
emit_nm(as, A64I_CMPx, tmp, scr);
emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
}
} else {
emit_nm(as, A64I_CMPx, scr, type);
emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
}
*l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE;
if (!isk && irt_isaddr(kt)) {
type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
rset_clear(allow, type);
}
/* Load main position relative to tab->node into dest. */
khash = isk ? ir_khash(as, irkey) : 1;
if (khash == 0) {
@ -872,7 +869,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
} else if (irt_isstr(kt)) {
/* Fetch of str->sid is cheaper than ra_allock. */
emit_dnm(as, A64I_ANDw, dest, dest, tmp);
emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, sid));
emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
@ -881,23 +877,18 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
emit_dnm(as, A64I_SUBw, dest, dest, tmp);
emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
emit_dnm(as, A64I_EORw, dest, dest, tmp);
emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
emit_dnm(as, A64I_EORw | A64F_SH(A64SH_ROR, 32-HASH_ROT2), dest, tmp, dest);
emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
emit_dnm(as, A64I_EORw, tmp, tmp, dest);
if (irt_isnum(kt)) {
emit_dnm(as, A64I_EORw, tmp, tkey, dest);
emit_dnm(as, A64I_ADDw, dest, dest, dest);
emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
emit_dm(as, A64I_MOVw, tmp, dest);
emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, tkey);
emit_nm(as, A64I_FCMPZd, (key & 31), 0);
emit_dn(as, A64I_FMOV_R_D, tkey, (key & 31));
} else {
checkmclim(as);
emit_dm(as, A64I_MOVw, tmp, key);
emit_dnm(as, A64I_EORw, dest, dest,
ra_allock(as, irt_toitype(kt) << 15, allow));
emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
emit_dm(as, A64I_MOVx, dest, key);
emit_dnm(as, A64I_EORw, tmp, key, dest);
emit_dnm(as, A64I_EORx | A64F_SH(A64SH_LSR, 32), dest, type, key);
}
}
}
@ -912,7 +903,7 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
int bigofs = !emit_checkofs(A64I_LDRx, kofs);
Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
Reg key, idx = node;
Reg idx = node;
RegSet allow = rset_exclude(RSET_GPR, node);
uint64_t k;
lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
@ -931,9 +922,8 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
} else {
k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
}
key = ra_scratch(as, allow);
emit_nm(as, A64I_CMPx, key, ra_allock(as, k, rset_exclude(allow, key)));
emit_lso(as, A64I_LDRx, key, idx, kofs);
emit_nm(as, A64I_CMPx, RID_TMP, ra_allock(as, k, allow));
emit_lso(as, A64I_LDRx, RID_TMP, idx, kofs);
if (bigofs)
emit_opk(as, A64I_ADDx, dest, node, ofs, rset_exclude(RSET_GPR, node));
}
@ -941,24 +931,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
if (irref_isk(ir->op1)) {
int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, A64I_LDRx, dest, v);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
if (ir->o == IR_UREFC) {
asm_guardcc(as, CC_NE);
emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
emit_opk(as, A64I_ADDx, dest, uv,
if (guarded)
asm_guardcnb(as, ir->o == IR_UREFC ? A64I_CBZ : A64I_CBNZ, RID_TMP);
if (ir->o == IR_UREFC)
emit_opk(as, A64I_ADDx, dest, dest,
(int32_t)offsetof(GCupval, tv), RSET_GPR);
emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
else
emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v));
if (guarded)
emit_lso(as, A64I_LDRB, RID_TMP, dest,
(int32_t)offsetof(GCupval, closed));
if (irref_isk(ir->op1)) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
uint64_t k = gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
emit_loadu64(as, dest, k);
} else {
emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR),
(int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
}
emit_lso(as, A64I_LDRx, uv, func,
(int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
}
}
@ -1063,7 +1059,7 @@ static void asm_xstore(ASMState *as, IRIns *ir)
static void asm_ahuvload(ASMState *as, IRIns *ir)
{
Reg idx, tmp, type;
Reg idx, tmp;
int32_t ofs = 0;
RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
@ -1082,8 +1078,9 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
} else {
tmp = ra_scratch(as, gpr);
}
type = ra_scratch(as, rset_clear(gpr, tmp));
idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, tmp), A64I_LDRx);
rset_clear(gpr, idx);
if (ofs & FUSE_REG) rset_clear(gpr, ofs & 31);
if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;
/* Always do the type check, even if the load result is unused. */
asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
@ -1091,10 +1088,10 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
"bad load type %d", irt_type(ir->t));
emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
ra_allock(as, LJ_TISNUM << 15, gpr), tmp);
} else if (irt_isaddr(ir->t)) {
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), RID_TMP);
emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
} else if (irt_isnil(ir->t)) {
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
} else {
@ -1217,9 +1214,8 @@ dotypecheck:
emit_nm(as, A64I_CMPx,
ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
} else {
Reg type = ra_scratch(as, allow);
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), RID_TMP);
emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
}
emit_lso(as, A64I_LDRx, tmp, base, ofs);
return;
@ -1289,8 +1285,9 @@ static void asm_tbar(ASMState *as, IRIns *ir)
Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
Reg mark = RID_TMP;
MCLabel l_end = emit_label(as);
emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
/* Keep STRx in the middle to avoid LDP/STP fusion with surrounding code. */
emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
emit_setgl(as, tab, gc.grayagain);
emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
emit_getgl(as, link, gc.grayagain);
@ -1304,7 +1301,6 @@ static void asm_obar(ASMState *as, IRIns *ir)
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
IRRef args[2];
MCLabel l_end;
RegSet allow = RSET_GPR;
Reg obj, val, tmp;
/* No need for other object barriers (yet). */
lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
@ -1315,14 +1311,13 @@ static void asm_obar(ASMState *as, IRIns *ir)
asm_gencall(as, ci, args);
emit_dm(as, A64I_MOVx, ra_releasetmp(as, ASMREF_TMP1), RID_GL);
obj = IR(ir->op1)->r;
tmp = ra_scratch(as, rset_exclude(allow, obj));
emit_cond_branch(as, CC_EQ, l_end);
emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj));
emit_tnb(as, A64I_TBZ, tmp, lj_ffs(LJ_GC_BLACK), l_end);
emit_cond_branch(as, CC_EQ, l_end);
emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
emit_lso(as, A64I_LDRB, tmp, obj,
(int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
(int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
}
@ -1364,12 +1359,12 @@ static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
if (irref_isk(lref))
return 1; /* But swap constants to the right. */
ir = IR(rref);
if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
(ir->o == IR_ADD && ir->op1 == ir->op2) ||
(ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
return 0; /* Don't swap fusable operands to the left. */
ir = IR(lref);
if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
(ir->o == IR_ADD && ir->op1 == ir->op2) ||
(ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
return 1; /* But swap fusable operands to the right. */
@ -1415,13 +1410,12 @@ static void asm_intneg(ASMState *as, IRIns *ir)
static void asm_intmul(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
if (irt_isguard(ir->t)) { /* IR_MULOV */
asm_guardcc(as, CC_NE);
emit_dm(as, A64I_MOVw, dest, dest); /* Zero-extend. */
emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
emit_nm(as, A64I_CMPx | A64F_EX(A64EX_SXTW), dest, dest);
emit_dnm(as, A64I_SMULL, dest, right, left);
} else {
emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
@ -1681,16 +1675,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir)
if (asm_swapops(as, blref, brref)) {
Reg tmp = blref; blref = brref; brref = tmp;
}
bleft = ra_alloc1(as, blref, RSET_GPR);
if (irref_isk(brref)) {
uint64_t k = get_k64val(as, brref);
if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) {
asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ,
ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k));
if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE) &&
asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, bleft,
emit_ctz64(k)))
return;
}
m2 = emit_isk13(k, irt_is64(irl->t));
}
bleft = ra_alloc1(as, blref, RSET_GPR);
ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
if (!m2)
m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
@ -1765,37 +1758,28 @@ static void asm_prof(ASMState *as, IRIns *ir)
static void asm_stack_check(ASMState *as, BCReg topslot,
IRIns *irp, RegSet allow, ExitNo exitno)
{
Reg pbase;
uint32_t k;
Reg pbase = RID_BASE;
if (irp) {
if (!ra_hasspill(irp->s)) {
pbase = irp->r;
lj_assertA(ra_hasreg(pbase), "base reg lost");
} else if (allow) {
pbase = rset_pickbot(allow);
} else {
pbase = RID_RET;
emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0); /* Restore temp register. */
}
} else {
pbase = RID_BASE;
pbase = irp->r;
if (!ra_hasreg(pbase))
pbase = allow ? (0x40 | rset_pickbot(allow)) : (0xC0 | RID_RET);
}
emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno));
if (pbase & 0x80) /* Restore temp. register. */
emit_lso(as, A64I_LDRx, (pbase & 31), RID_SP, 0);
k = emit_isk12((8*topslot));
lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
emit_n(as, A64I_CMPx^k, RID_TMP);
emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, (pbase & 31));
emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
(int32_t)offsetof(lua_State, maxstack));
if (irp) { /* Must not spill arbitrary registers in head of side trace. */
if (ra_hasspill(irp->s))
emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
if (ra_hasspill(irp->s) && !allow)
emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0); /* Save temp register. */
} else {
emit_getgl(as, RID_TMP, cur_L);
if (pbase & 0x40) {
emit_getgl(as, (pbase & 31), jit_base);
if (pbase & 0x80) /* Save temp register. */
emit_lso(as, A64I_STRx, (pbase & 31), RID_SP, 0);
}
emit_getgl(as, RID_TMP, cur_L);
}
/* Restore Lua stack from on-trace state. */
@ -1837,7 +1821,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
/* Marker to prevent patching the GC check exit. */
#define ARM64_NOPATCH_GC_CHECK \
(A64I_ORRx|A64F_D(RID_TMP)|A64F_M(RID_TMP)|A64F_N(RID_TMP))
(A64I_ORRx|A64F_D(RID_ZERO)|A64F_M(RID_ZERO)|A64F_N(RID_ZERO))
/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as)
@ -1892,46 +1876,40 @@ static void asm_loop_tail_fixup(ASMState *as)
/* -- Head of trace ------------------------------------------------------- */
/* Reload L register from g->cur_L. */
static void asm_head_lreg(ASMState *as)
{
IRIns *ir = IR(ASMREF_L);
if (ra_used(ir)) {
Reg r = ra_dest(as, ir, RSET_GPR);
emit_getgl(as, r, cur_L);
ra_evictk(as);
}
}
/* Coalesce BASE register for a root trace. */
static void asm_head_root_base(ASMState *as)
{
IRIns *ir;
asm_head_lreg(as);
ir = IR(REF_BASE);
if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
ra_spill(as, ir);
ra_destreg(as, ir, RID_BASE);
IRIns *ir = IR(REF_BASE);
Reg r = ir->r;
if (ra_hasreg(r)) {
ra_free(as, r);
if (rset_test(as->modset, r) || irt_ismarked(ir->t))
ir->r = RID_INIT; /* No inheritance for modified BASE register. */
if (r != RID_BASE)
emit_movrr(as, ir, r, RID_BASE);
}
}
/* Coalesce BASE register for a side trace. */
static Reg asm_head_side_base(ASMState *as, IRIns *irp)
{
IRIns *ir;
asm_head_lreg(as);
ir = IR(REF_BASE);
if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
ra_spill(as, ir);
if (ra_hasspill(irp->s)) {
return ra_dest(as, ir, RSET_GPR);
} else {
Reg r = irp->r;
lj_assertA(ra_hasreg(r), "base reg lost");
if (r != ir->r && !rset_test(as->freeset, r))
ra_restore(as, regcost_ref(as->cost[r]));
ra_destreg(as, ir, r);
return r;
IRIns *ir = IR(REF_BASE);
Reg r = ir->r;
if (ra_hasreg(r)) {
ra_free(as, r);
if (rset_test(as->modset, r) || irt_ismarked(ir->t))
ir->r = RID_INIT; /* No inheritance for modified BASE register. */
if (irp->r == r) {
return r; /* Same BASE register already coalesced. */
} else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
/* Move from coalesced parent reg. */
emit_movrr(as, ir, r, irp->r);
return irp->r;
} else {
emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */
}
}
return RID_NONE;
}
/* -- Tail of trace ------------------------------------------------------- */
@ -1975,20 +1953,47 @@ static void asm_tail_prep(ASMState *as)
/* Ensure there are enough stack slots for call arguments. */
static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
IRRef args[CCI_NARGS_MAX*2];
#if LJ_HASFFI
uint32_t i, nargs = CCI_XNARGS(ci);
int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
asm_collectargs(as, ir, ci, args);
for (i = 0; i < nargs; i++) {
if (args[i] && irt_isfp(IR(args[i])->t)) {
if (nfpr > 0) nfpr--; else nslots += 2;
} else {
if (ngpr > 0) ngpr--; else nslots += 2;
if (nargs > (REGARG_NUMGPR < REGARG_NUMFPR ? REGARG_NUMGPR : REGARG_NUMFPR) ||
(LJ_TARGET_OSX && (ci->flags & CCI_VARARG))) {
IRRef args[CCI_NARGS_MAX*2];
int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
int spofs = 0, spalign = LJ_TARGET_OSX ? 0 : 7, nslots;
asm_collectargs(as, ir, ci, args);
#if LJ_ABI_WIN
if ((ci->flags & CCI_VARARG)) nfpr = 0;
#endif
for (i = 0; i < nargs; i++) {
int al = spalign;
if (!args[i]) {
#if LJ_TARGET_OSX
/* Marker for start of varargs. */
nfpr = 0;
ngpr = 0;
spalign = 7;
#endif
} else if (irt_isfp(IR(args[i])->t)) {
if (nfpr > 0) { nfpr--; continue; }
#if LJ_ABI_WIN
if ((ci->flags & CCI_VARARG) && ngpr > 0) { ngpr--; continue; }
#elif LJ_TARGET_OSX
al |= irt_isnum(IR(args[i])->t) ? 7 : 3;
#endif
} else {
if (ngpr > 0) { ngpr--; continue; }
#if LJ_TARGET_OSX
al |= irt_size(IR(args[i])->t) - 1;
#endif
}
spofs = (spofs + 2*al+1) & ~al; /* Align and bump stack pointer. */
}
nslots = (spofs + 3) >> 2;
if (nslots > as->evenspill) /* Leave room for args in stack slots. */
as->evenspill = nslots;
}
if (nslots > as->evenspill) /* Leave room for args in stack slots. */
as->evenspill = nslots;
return REGSP_HINT(RID_RET);
#endif
return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
}
static void asm_setup_target(ASMState *as)
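
The CCI_VARARG paths added above concern how compiled FFI calls to vararg C functions lay out their arguments on Apple and Windows ARM64. A hedged illustration of the kind of call that takes this path once it lands in a trace (snprintf is only an example declaration):

local ffi = require("ffi")
ffi.cdef[[int snprintf(char *buf, size_t n, const char *fmt, ...);]]
local buf = ffi.new("char[?]", 64)
ffi.C.snprintf(buf, 64, "pi=%g", 3.14)  -- vararg number is passed as a double
print(ffi.string(buf))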


@ -1,6 +1,6 @@
/*
** MIPS IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
/* -- Register allocator extensions --------------------------------------- */
@ -456,7 +456,7 @@ static void asm_retf(ASMState *as, IRIns *ir)
emit_addptr(as, base, -8*delta);
asm_guard(as, MIPSI_BNE, RID_TMP,
ra_allock(as, igcptr(pc), rset_exclude(RSET_GPR, base)));
emit_tsi(as, MIPSI_AL, RID_TMP, base, -8);
emit_tsi(as, MIPSI_AL, RID_TMP, base, (LJ_BE || LJ_FR2) ? -8 : -4);
}
/* -- Buffer operations --------------------------------------------------- */
@ -653,11 +653,11 @@ static void asm_conv(ASMState *as, IRIns *ir)
rset_exclude(RSET_GPR, dest));
emit_fg(as, MIPSI_TRUNC_L_D, tmp, left); /* Delay slot. */
#if !LJ_TARGET_MIPSR6
emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp);
emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp);
#else
emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end);
emit_fgh(as, MIPSI_CMP_LT_D, left, left, tmp);
emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end);
emit_fgh(as, MIPSI_CMP_LT_D, tmp, left, tmp);
#endif
emit_lsptr(as, MIPSI_LDC1, (tmp & 31),
(void *)&as->J->k64[LJ_K64_2P63],
@ -670,11 +670,11 @@ static void asm_conv(ASMState *as, IRIns *ir)
rset_exclude(RSET_GPR, dest));
emit_fg(as, MIPSI_TRUNC_L_S, tmp, left); /* Delay slot. */
#if !LJ_TARGET_MIPSR6
emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp);
emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp);
#else
emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end);
emit_fgh(as, MIPSI_CMP_LT_S, left, left, tmp);
emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end);
emit_fgh(as, MIPSI_CMP_LT_S, tmp, left, tmp);
#endif
emit_lsptr(as, MIPSI_LWC1, (tmp & 31),
(void *)&as->J->k32[LJ_K32_2P63],
@ -690,8 +690,8 @@ static void asm_conv(ASMState *as, IRIns *ir)
MIPSIns mi = irt_is64(ir->t) ?
(st == IRT_NUM ? MIPSI_TRUNC_L_D : MIPSI_TRUNC_L_S) :
(st == IRT_NUM ? MIPSI_TRUNC_W_D : MIPSI_TRUNC_W_S);
emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, left);
emit_fg(as, mi, left, left);
emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, tmp);
emit_fg(as, mi, tmp, left);
#endif
}
}
@ -1207,22 +1207,29 @@ nolo:
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
if (irref_isk(ir->op1)) {
int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, MIPSI_AL, dest, v, RSET_GPR);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
if (ir->o == IR_UREFC) {
asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
emit_tsi(as, MIPSI_AADDIU, dest, uv, (int32_t)offsetof(GCupval, tv));
emit_tsi(as, MIPSI_LBU, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
if (guarded)
asm_guard(as, ir->o == IR_UREFC ? MIPSI_BEQ : MIPSI_BNE, RID_TMP, RID_ZERO);
if (ir->o == IR_UREFC)
emit_tsi(as, MIPSI_AADDIU, dest, dest, (int32_t)offsetof(GCupval, tv));
else
emit_tsi(as, MIPSI_AL, dest, dest, (int32_t)offsetof(GCupval, v));
if (guarded)
emit_tsi(as, MIPSI_LBU, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
if (irref_isk(ir->op1)) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
emit_loada(as, dest, o);
} else {
emit_tsi(as, MIPSI_AL, dest, uv, (int32_t)offsetof(GCupval, v));
emit_tsi(as, MIPSI_AL, dest, ra_alloc1(as, ir->op1, RSET_GPR),
(int32_t)offsetof(GCfuncL, uvptr) +
(int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
emit_tsi(as, MIPSI_AL, uv, func, (int32_t)offsetof(GCfuncL, uvptr) +
(int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
}


@ -1,6 +1,6 @@
/*
** PPC IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
/* -- Register allocator extensions --------------------------------------- */
@ -840,23 +840,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
if (irref_isk(ir->op1)) {
int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, PPCI_LWZ, dest, v, RSET_GPR);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
if (ir->o == IR_UREFC) {
asm_guardcc(as, CC_NE);
if (guarded) {
asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
emit_ai(as, PPCI_CMPWI, RID_TMP, 1);
emit_tai(as, PPCI_ADDI, dest, uv, (int32_t)offsetof(GCupval, tv));
emit_tai(as, PPCI_LBZ, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
} else {
emit_tai(as, PPCI_LWZ, dest, uv, (int32_t)offsetof(GCupval, v));
}
emit_tai(as, PPCI_LWZ, uv, func,
(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
if (ir->o == IR_UREFC)
emit_tai(as, PPCI_ADDI, dest, dest, (int32_t)offsetof(GCupval, tv));
else
emit_tai(as, PPCI_LWZ, dest, dest, (int32_t)offsetof(GCupval, v));
if (guarded)
emit_tai(as, PPCI_LBZ, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
if (irref_isk(ir->op1)) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
emit_loadi(as, dest, k);
} else {
emit_tai(as, PPCI_LWZ, dest, ra_alloc1(as, ir->op1, RSET_GPR),
(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
}
}
}


@ -1,6 +1,6 @@
/*
** x86/x64 IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
/* -- Guard handling ------------------------------------------------------ */
@ -109,7 +109,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
/* Check if there's no conflicting instruction between curins and ref.
** Also avoid fusing loads if there are multiple references.
*/
static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check)
{
IRIns *ir = as->ir;
IRRef i = as->curins;
@ -118,7 +118,9 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
while (--i > ref) {
if (ir[i].o == conflict)
return 0; /* Conflict found. */
else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
else if ((check & 1) && (ir[i].o == IR_NEWREF || ir[i].o == IR_CALLS))
return 0;
else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref))
return 0;
}
return 1; /* Ok, no conflict. */
@ -134,13 +136,14 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
/* We can avoid the FLOAD of t->array for colocated arrays. */
if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 0)) {
as->mrm.ofs = (int32_t)sizeof(GCtab); /* Ofs to colocated array. */
return irb->op1; /* Table obj. */
}
} else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
/* Fuse base offset (vararg load). */
as->mrm.ofs = IR(irb->op2)->i;
IRIns *irk = IR(irb->op2);
as->mrm.ofs = irk->o == IR_KINT ? irk->i : (int32_t)ir_kint64(irk)->u64;
return irb->op1;
}
return ref; /* Otherwise use the given array base. */
@ -455,7 +458,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
if (ir->o == IR_SLOAD) {
if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
noconflict(as, ref, IR_RETF, 0) &&
noconflict(as, ref, IR_RETF, 2) &&
!(LJ_GC64 && irt_isaddr(ir->t))) {
as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
@ -466,12 +469,12 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
} else if (ir->o == IR_FLOAD) {
/* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
noconflict(as, ref, IR_FSTORE, 0)) {
noconflict(as, ref, IR_FSTORE, 2)) {
asm_fusefref(as, ir, xallow);
return RID_MRM;
}
} else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
if (noconflict(as, ref, ir->o + IRDELTA_L2S, 2+(ir->o != IR_ULOAD)) &&
!(LJ_GC64 && irt_isaddr(ir->t))) {
asm_fuseahuref(as, ir->op1, xallow);
return RID_MRM;
@ -481,7 +484,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
*/
if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
noconflict(as, ref, IR_XSTORE, 0)) {
noconflict(as, ref, IR_XSTORE, 2)) {
asm_fusexref(as, ir->op1, xallow);
return RID_MRM;
}
@ -814,6 +817,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
emit_rr(as, XO_UCOMISD, left, tmp);
emit_rr(as, XO_CVTSI2SD, tmp, dest);
emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
checkmclim(as);
emit_rr(as, XO_CVTTSD2SI, dest, left);
/* Can't fuse since left is needed twice. */
}
@ -856,6 +860,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
emit_rma(as, XO_MOVSD, bias, k);
checkmclim(as);
emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
return;
} else { /* Integer to FP conversion. */
@ -1172,6 +1177,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
asm_guardcc(as, CC_E);
else
emit_sjcc(as, CC_E, l_end);
checkmclim(as);
if (irt_isnum(kt)) {
if (isk) {
/* Assumes -0.0 is already canonicalized to +0.0. */
@ -1231,7 +1237,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
#endif
}
emit_sfixup(as, l_loop);
checkmclim(as);
#if LJ_GC64
if (!isk && irt_isaddr(kt)) {
emit_rr(as, XO_OR, tmp|REX_64, key);
@ -1258,6 +1263,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
checkmclim(as);
emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
@ -1275,7 +1281,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
} else {
emit_rr(as, XO_MOV, tmp, key);
#if LJ_GC64
checkmclim(as);
emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
if ((as->flags & JIT_F_BMI2)) {
emit_i8(as, 32);
@ -1372,24 +1377,31 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
if (irref_isk(ir->op1)) {
int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_rma(as, XO_MOV, dest|REX_GC64, v);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
if (ir->o == IR_UREFC) {
if (ir->o == IR_UREFC)
emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
asm_guardcc(as, CC_NE);
emit_i8(as, 1);
emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
} else {
else
emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
if (guarded) {
asm_guardcc(as, ir->o == IR_UREFC ? CC_E : CC_NE);
emit_i8(as, 0);
emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
}
if (irref_isk(ir->op1)) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
emit_loada(as, uv, o);
} else {
emit_rmro(as, XO_MOV, uv|REX_GC64, ra_alloc1(as, ir->op1, RSET_GPR),
(int32_t)offsetof(GCfuncL, uvptr) +
(int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
emit_rmro(as, XO_MOV, uv|REX_GC64, func,
(int32_t)offsetof(GCfuncL, uvptr) +
(int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
}
@ -1546,6 +1558,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
if (irt_islightud(ir->t)) {
Reg dest = asm_load_lightud64(as, ir, 1);
if (ra_hasreg(dest)) {
checkmclim(as);
asm_fuseahuref(as, ir->op1, RSET_GPR);
if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
@ -1593,6 +1606,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
"bad load type %d", irt_type(ir->t));
checkmclim(as);
#if LJ_GC64
emit_u32(as, LJ_TISNUM << 15);
#else


@ -1,6 +1,6 @@
/*
** Internal assertions.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_assert_c


@ -1,6 +1,6 @@
/*
** Bytecode instruction modes.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_bc_c


@ -1,6 +1,6 @@
/*
** Bytecode instruction format.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_BC_H


@ -1,6 +1,6 @@
/*
** Bytecode dump definitions.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_BCDUMP_H
@ -46,6 +46,8 @@
#define BCDUMP_F_KNOWN (BCDUMP_F_FR2*2-1)
#define BCDUMP_F_DETERMINISTIC 0x80000000
/* Type codes for the GC constants of a prototype. Plus length for strings. */
enum {
BCDUMP_KGC_CHILD, BCDUMP_KGC_TAB, BCDUMP_KGC_I64, BCDUMP_KGC_U64,
@ -61,7 +63,7 @@ enum {
/* -- Bytecode reader/writer ---------------------------------------------- */
LJ_FUNC int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer,
void *data, int strip);
void *data, uint32_t flags);
LJ_FUNC GCproto *lj_bcread_proto(LexState *ls);
LJ_FUNC GCproto *lj_bcread(LexState *ls);


@ -1,6 +1,6 @@
/*
** Bytecode reader.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_bcread_c
@ -179,7 +179,7 @@ static const void *bcread_varinfo(GCproto *pt)
}
/* Read a single constant key/value of a template table. */
static void bcread_ktabk(LexState *ls, TValue *o)
static void bcread_ktabk(LexState *ls, TValue *o, GCtab *t)
{
MSize tp = bcread_uleb128(ls);
if (tp >= BCDUMP_KTAB_STR) {
@ -191,6 +191,8 @@ static void bcread_ktabk(LexState *ls, TValue *o)
} else if (tp == BCDUMP_KTAB_NUM) {
o->u32.lo = bcread_uleb128(ls);
o->u32.hi = bcread_uleb128(ls);
} else if (t && tp == BCDUMP_KTAB_NIL) { /* Restore nil value marker. */
settabV(ls->L, o, t);
} else {
lj_assertLS(tp <= BCDUMP_KTAB_TRUE, "bad constant type %d", tp);
setpriV(o, ~tp);
@ -207,15 +209,15 @@ static GCtab *bcread_ktab(LexState *ls)
MSize i;
TValue *o = tvref(t->array);
for (i = 0; i < narray; i++, o++)
bcread_ktabk(ls, o);
bcread_ktabk(ls, o, NULL);
}
if (nhash) { /* Read hash entries. */
MSize i;
for (i = 0; i < nhash; i++) {
TValue key;
bcread_ktabk(ls, &key);
bcread_ktabk(ls, &key, NULL);
lj_assertLS(!tvisnil(&key), "nil key");
bcread_ktabk(ls, lj_tab_set(ls->L, t, &key));
bcread_ktabk(ls, lj_tab_set(ls->L, t, &key), t);
}
}
return t;
@ -281,8 +283,11 @@ static void bcread_knum(LexState *ls, GCproto *pt, MSize sizekn)
static void bcread_bytecode(LexState *ls, GCproto *pt, MSize sizebc)
{
BCIns *bc = proto_bc(pt);
bc[0] = BCINS_AD((pt->flags & PROTO_VARARG) ? BC_FUNCV : BC_FUNCF,
pt->framesize, 0);
BCIns op;
if (ls->fr2 != LJ_FR2) op = BC_NOT; /* Mark non-native prototype. */
else if ((pt->flags & PROTO_VARARG)) op = BC_FUNCV;
else op = BC_FUNCF;
bc[0] = BCINS_AD(op, pt->framesize, 0);
bcread_block(ls, bc+1, (sizebc-1)*(MSize)sizeof(BCIns));
/* Swap bytecode instructions if the endianess differs. */
if (bcread_swap(ls)) {
@ -395,7 +400,7 @@ static int bcread_header(LexState *ls)
bcread_byte(ls) != BCDUMP_VERSION) return 0;
bcread_flags(ls) = flags = bcread_uleb128(ls);
if ((flags & ~(BCDUMP_F_KNOWN)) != 0) return 0;
if ((flags & BCDUMP_F_FR2) != LJ_FR2*BCDUMP_F_FR2) return 0;
if ((flags & BCDUMP_F_FR2) != (uint32_t)ls->fr2*BCDUMP_F_FR2) return 0;
if ((flags & BCDUMP_F_FFI)) {
#if LJ_HASFFI
lua_State *L = ls->L;
@ -405,7 +410,7 @@ static int bcread_header(LexState *ls)
#endif
}
if ((flags & BCDUMP_F_STRIP)) {
ls->chunkname = lj_str_newz(ls->L, ls->chunkarg);
ls->chunkname = lj_str_newz(ls->L, *ls->chunkarg == BCDUMP_HEAD1 ? "=?" : ls->chunkarg);
} else {
MSize len = bcread_uleb128(ls);
bcread_need(ls, len);
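As an aside, the reader above leans on bcread_uleb128() for lengths, flags and constants. A standalone approximation of that ULEB128 decoding (demo only, not the LuaJIT implementation) looks like this:

/* Standalone approximation of ULEB128 decoding (demo, not lj_bcread.c). */
#include <stdint.h>
#include <stdio.h>

static uint32_t uleb128_read(const uint8_t **pp)
{
  const uint8_t *p = *pp;
  uint32_t v = *p++;
  if (v >= 0x80) {        /* Multi-byte: 7 payload bits per byte, LSB first. */
    int sh = 0;
    v &= 0x7f;
    do {
      v |= (uint32_t)(*p & 0x7f) << (sh += 7);
    } while (*p++ >= 0x80);
  }
  *pp = p;
  return v;
}

int main(void)
{
  const uint8_t buf[] = { 0xe5, 0x8e, 0x26 };  /* Encodes 624485. */
  const uint8_t *p = buf;
  printf("%u\n", uleb128_read(&p));            /* Prints 624485, consumes 3 bytes. */
  return 0;
}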

src/lj_bcwrite.c

@ -1,6 +1,6 @@
/*
** Bytecode writer.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_bcwrite_c
@ -27,7 +27,9 @@ typedef struct BCWriteCtx {
GCproto *pt; /* Root prototype. */
lua_Writer wfunc; /* Writer callback. */
void *wdata; /* Writer callback data. */
int strip; /* Strip debug info. */
TValue **heap; /* Heap used for deterministic sorting. */
uint32_t heapsz; /* Size of heap. */
uint32_t flags; /* BCDUMP_F_* flags. */
int status; /* Status from writer callback. */
#ifdef LUA_USE_ASSERT
global_State *g;
@ -69,6 +71,8 @@ static void bcwrite_ktabk(BCWriteCtx *ctx, cTValue *o, int narrow)
*p++ = BCDUMP_KTAB_NUM;
p = lj_strfmt_wuleb128(p, o->u32.lo);
p = lj_strfmt_wuleb128(p, o->u32.hi);
} else if (tvistab(o)) { /* Write the nil value marker as a nil. */
*p++ = BCDUMP_KTAB_NIL;
} else {
lj_assertBCW(tvispri(o), "unhandled type %d", itype(o));
*p++ = BCDUMP_KTAB_NIL+~itype(o);
@ -76,6 +80,75 @@ static void bcwrite_ktabk(BCWriteCtx *ctx, cTValue *o, int narrow)
ctx->sb.w = p;
}
/* Compare two template table keys. */
static LJ_AINLINE int bcwrite_ktabk_lt(TValue *a, TValue *b)
{
uint32_t at = itype(a), bt = itype(b);
if (at != bt) { /* This also handles false and true keys. */
return at < bt;
} else if (at == LJ_TSTR) {
return lj_str_cmp(strV(a), strV(b)) < 0;
} else {
return a->u64 < b->u64; /* This works for numbers and integers. */
}
}
/* Insert key into a sorted heap. */
static void bcwrite_ktabk_heap_insert(TValue **heap, MSize idx, MSize end,
TValue *key)
{
MSize child;
while ((child = idx * 2 + 1) < end) {
/* Find lower of the two children. */
TValue *c0 = heap[child];
if (child + 1 < end) {
TValue *c1 = heap[child + 1];
if (bcwrite_ktabk_lt(c1, c0)) {
c0 = c1;
child++;
}
}
if (bcwrite_ktabk_lt(key, c0)) break; /* Key lower? Found our position. */
heap[idx] = c0; /* Move lower child up. */
idx = child; /* Descend. */
}
heap[idx] = key; /* Insert key here. */
}
/* Resize heap, dropping content. */
static void bcwrite_heap_resize(BCWriteCtx *ctx, uint32_t nsz)
{
lua_State *L = sbufL(&ctx->sb);
if (ctx->heapsz) {
lj_mem_freevec(G(L), ctx->heap, ctx->heapsz, TValue *);
ctx->heapsz = 0;
}
if (nsz) {
ctx->heap = lj_mem_newvec(L, nsz, TValue *);
ctx->heapsz = nsz;
}
}
/* Write hash part of template table in sorted order. */
static void bcwrite_ktab_sorted_hash(BCWriteCtx *ctx, Node *node, MSize nhash)
{
TValue **heap = ctx->heap;
MSize i = nhash;
for (;; node--) { /* Build heap. */
if (!tvisnil(&node->val)) {
bcwrite_ktabk_heap_insert(heap, --i, nhash, &node->key);
if (i == 0) break;
}
}
do { /* Drain heap. */
TValue *key = heap[0]; /* Output lowest key from top. */
bcwrite_ktabk(ctx, key, 0);
bcwrite_ktabk(ctx, (TValue *)((char *)key - offsetof(Node, key)), 1);
key = heap[--nhash]; /* Remove last key. */
bcwrite_ktabk_heap_insert(heap, 0, nhash, key); /* Re-insert. */
} while (nhash);
}
/* Write a template table. */
static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t)
{
@ -105,14 +178,20 @@ static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t)
bcwrite_ktabk(ctx, o, 1);
}
if (nhash) { /* Write hash entries. */
MSize i = nhash;
Node *node = noderef(t->node) + t->hmask;
for (;; node--)
if (!tvisnil(&node->val)) {
bcwrite_ktabk(ctx, &node->key, 0);
bcwrite_ktabk(ctx, &node->val, 1);
if (--i == 0) break;
}
if ((ctx->flags & BCDUMP_F_DETERMINISTIC) && nhash > 1) {
if (ctx->heapsz < nhash)
bcwrite_heap_resize(ctx, t->hmask + 1);
bcwrite_ktab_sorted_hash(ctx, node, nhash);
} else {
MSize i = nhash;
for (;; node--)
if (!tvisnil(&node->val)) {
bcwrite_ktabk(ctx, &node->key, 0);
bcwrite_ktabk(ctx, &node->val, 1);
if (--i == 0) break;
}
}
}
}
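To make the deterministic hash ordering concrete: table hash order depends on memory layout, so with BCDUMP_F_DETERMINISTIC the writer sorts the keys before emitting them. Below is a standalone sketch (plain ints, demo only, not part of this diff) of the same sift-down min-heap scheme used by bcwrite_ktabk_heap_insert() and bcwrite_ktab_sorted_hash(): heapify bottom-up, then repeatedly emit the root and re-insert the last element.

/* Standalone sketch of the sift-down min-heap sort, on plain ints. */
#include <stdio.h>

static void heap_insert(int *heap, unsigned idx, unsigned end, int key)
{
  unsigned child;
  while ((child = idx * 2 + 1) < end) {
    if (child + 1 < end && heap[child + 1] < heap[child])
      child++;                      /* Pick the smaller child. */
    if (key < heap[child]) break;   /* Key is lower: this is its slot. */
    heap[idx] = heap[child];        /* Move the smaller child up. */
    idx = child;                    /* Descend. */
  }
  heap[idx] = key;
}

int main(void)
{
  int heap[] = { 42, 7, 19, 3, 26 };
  unsigned n = 5, i;
  for (i = n; i-- > 0; )            /* Build heap bottom-up. */
    heap_insert(heap, i, n, heap[i]);
  while (n) {                       /* Drain: prints 3 7 19 26 42. */
    printf("%d ", heap[0]);
    heap_insert(heap, 0, --n, heap[n]);
  }
  printf("\n");
  return 0;
}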
@ -269,7 +348,7 @@ static void bcwrite_proto(BCWriteCtx *ctx, GCproto *pt)
p = lj_strfmt_wuleb128(p, pt->sizekgc);
p = lj_strfmt_wuleb128(p, pt->sizekn);
p = lj_strfmt_wuleb128(p, pt->sizebc-1);
if (!ctx->strip) {
if (!(ctx->flags & BCDUMP_F_STRIP)) {
if (proto_lineinfo(pt))
sizedbg = pt->sizept - (MSize)((char *)proto_lineinfo(pt) - (char *)pt);
p = lj_strfmt_wuleb128(p, sizedbg);
@ -317,11 +396,10 @@ static void bcwrite_header(BCWriteCtx *ctx)
*p++ = BCDUMP_HEAD2;
*p++ = BCDUMP_HEAD3;
*p++ = BCDUMP_VERSION;
*p++ = (ctx->strip ? BCDUMP_F_STRIP : 0) +
*p++ = (ctx->flags & (BCDUMP_F_STRIP | BCDUMP_F_FR2)) +
LJ_BE*BCDUMP_F_BE +
((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0) +
LJ_FR2*BCDUMP_F_FR2;
if (!ctx->strip) {
((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0);
if (!(ctx->flags & BCDUMP_F_STRIP)) {
p = lj_strfmt_wuleb128(p, len);
p = lj_buf_wmem(p, name, len);
}
@ -352,14 +430,16 @@ static TValue *cpwriter(lua_State *L, lua_CFunction dummy, void *ud)
/* Write bytecode for a prototype. */
int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data,
int strip)
uint32_t flags)
{
BCWriteCtx ctx;
int status;
ctx.pt = pt;
ctx.wfunc = writer;
ctx.wdata = data;
ctx.strip = strip;
ctx.heapsz = 0;
if ((bc_op(proto_bc(pt)[0]) != BC_NOT) == LJ_FR2) flags |= BCDUMP_F_FR2;
ctx.flags = flags;
ctx.status = 0;
#ifdef LUA_USE_ASSERT
ctx.g = G(L);
@ -368,6 +448,7 @@ int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data,
status = lj_vm_cpcall(L, NULL, &ctx, cpwriter);
if (status == 0) status = ctx.status;
lj_buf_free(G(sbufL(&ctx.sb)), &ctx.sb);
bcwrite_heap_resize(&ctx, 0);
return status;
}

src/lj_buf.c

@ -1,6 +1,6 @@
/*
** Buffer handling.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_buf_c
@ -92,10 +92,8 @@ void LJ_FASTCALL lj_buf_shrink(lua_State *L, SBuf *sb)
char *b = sb->b;
MSize osz = (MSize)(sb->e - b);
if (osz > 2*LJ_MIN_SBUF) {
MSize n = (MSize)(sb->w - b);
b = lj_mem_realloc(L, b, osz, (osz >> 1));
sb->b = b;
sb->w = b + n;
sb->w = sb->b = b; /* Not supposed to keep data across shrinks. */
sb->e = b + (osz >> 1);
}
lj_assertG_(G(sbufL(sb)), !sbufisext(sb), "YAGNI shrink SBufExt");

src/lj_buf.h

@ -1,6 +1,6 @@
/*
** Buffer handling.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_BUF_H

src/lj_carith.c

@ -1,6 +1,6 @@
/*
** C data arithmetic.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include "lj_obj.h"
@ -44,9 +44,13 @@ static int carith_checkarg(lua_State *L, CTState *cts, CDArith *ca)
p = (uint8_t *)cdata_getptr(p, ct->size);
if (ctype_isref(ct->info)) ct = ctype_rawchild(cts, ct);
} else if (ctype_isfunc(ct->info)) {
CTypeID id0 = i ? ctype_typeid(cts, ca->ct[0]) : 0;
p = (uint8_t *)*(void **)p;
ct = ctype_get(cts,
lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|id), CTSIZE_PTR));
if (i) { /* cts->tab may have been reallocated. */
ca->ct[0] = ctype_get(cts, id0);
}
}
if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
ca->ct[i] = ct;
@ -345,9 +349,7 @@ uint64_t lj_carith_check64(lua_State *L, int narg, CTypeID *id)
if (LJ_LIKELY(tvisint(o))) {
return (uint32_t)intV(o);
} else {
int32_t i = lj_num2bit(numV(o));
if (LJ_DUALNUM) setintV(o, i);
return (uint32_t)i;
return (uint32_t)lj_num2bit(numV(o));
}
}
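The carith_checkarg() change above guards against a classic invalidation bug: interning a pointer type can grow cts->tab, so a CType pointer captured earlier may dangle. A hypothetical standalone sketch of the pattern and the index-based fix:

/* Hypothetical sketch, not LuaJIT source: growing an interned-type vector
** invalidates raw element pointers, so keep the index and re-derive. */
#include <stdio.h>
#include <stdlib.h>

typedef struct { int info; } Type;
typedef struct { Type *tab; size_t n, cap; } TypeTable;

static size_t intern_type(TypeTable *tt, int info)
{
  if (tt->n == tt->cap) {                 /* Growth may move the array. */
    tt->cap = tt->cap ? 2 * tt->cap : 4;
    tt->tab = realloc(tt->tab, tt->cap * sizeof(Type));
  }
  tt->tab[tt->n].info = info;
  return tt->n++;                         /* Stable handle: the index. */
}

int main(void)
{
  TypeTable tt = { NULL, 0, 0 };
  size_t id0 = intern_type(&tt, 111);
  /* Wrong: Type *t0 = &tt.tab[id0]; ...intern more types... use t0. */
  intern_type(&tt, 222);                  /* tt.tab may have been realloc'd. */
  Type *t0 = &tt.tab[id0];                /* Right: re-derive from id0. */
  printf("%d\n", t0->info);               /* Prints 111. */
  free(tt.tab);
  return 0;
}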

src/lj_carith.h

@ -1,6 +1,6 @@
/*
** C data arithmetic.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_CARITH_H

src/lj_ccall.c

@ -1,6 +1,6 @@
/*
** FFI C call handling.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#include "lj_obj.h"
@ -20,12 +20,15 @@
#if LJ_TARGET_X86
/* -- x86 calling conventions --------------------------------------------- */
#define CCALL_PUSH(arg) \
*(GPRArg *)((uint8_t *)cc->stack + nsp) = (GPRArg)(arg), nsp += CTSIZE_PTR
#if LJ_ABI_WIN
#define CCALL_HANDLE_STRUCTRET \
/* Return structs bigger than 8 by reference (on stack only). */ \
cc->retref = (sz > 8); \
if (cc->retref) cc->stack[nsp++] = (GPRArg)dp;
if (cc->retref) CCALL_PUSH(dp);
#define CCALL_HANDLE_COMPLEXRET CCALL_HANDLE_STRUCTRET
@ -40,7 +43,7 @@
if (ngpr < maxgpr) \
cc->gpr[ngpr++] = (GPRArg)dp; \
else \
cc->stack[nsp++] = (GPRArg)dp; \
CCALL_PUSH(dp); \
} else { /* Struct with single FP field ends up in FPR. */ \
cc->resx87 = ccall_classify_struct(cts, ctr); \
}
@ -56,7 +59,7 @@
if (ngpr < maxgpr) \
cc->gpr[ngpr++] = (GPRArg)dp; \
else \
cc->stack[nsp++] = (GPRArg)dp;
CCALL_PUSH(dp);
#endif
@ -67,7 +70,7 @@
if (ngpr < maxgpr) \
cc->gpr[ngpr++] = (GPRArg)dp; \
else \
cc->stack[nsp++] = (GPRArg)dp; \
CCALL_PUSH(dp); \
}
#endif
@ -278,8 +281,8 @@
if (ngpr < maxgpr) { \
dp = &cc->gpr[ngpr]; \
if (ngpr + n > maxgpr) { \
nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \
nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \
ngpr = maxgpr; \
} else { \
ngpr += n; \
@ -345,7 +348,6 @@
goto done; \
} else { \
nfpr = CCALL_NARG_FPR; /* Prevent reordering. */ \
if (LJ_TARGET_OSX && d->size < 8) goto err_nyi; \
} \
} else { /* Try to pass argument in GPRs. */ \
if (!LJ_TARGET_OSX && (d->info & CTF_ALIGN) > CTALIGN_PTR) \
@ -356,7 +358,6 @@
goto done; \
} else { \
ngpr = maxgpr; /* Prevent reordering. */ \
if (LJ_TARGET_OSX && d->size < 8) goto err_nyi; \
} \
}
@ -471,8 +472,8 @@
if (ngpr < maxgpr) { \
dp = &cc->gpr[ngpr]; \
if (ngpr + n > maxgpr) { \
nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \
nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \
ngpr = maxgpr; \
} else { \
ngpr += n; \
@ -565,8 +566,8 @@
if (ngpr < maxgpr) { \
dp = &cc->gpr[ngpr]; \
if (ngpr + n > maxgpr) { \
nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \
nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \
if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \
ngpr = maxgpr; \
} else { \
ngpr += n; \
@ -698,10 +699,11 @@ static int ccall_struct_arg(CCallState *cc, CTState *cts, CType *d, int *rcl,
lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg));
if (ccall_struct_reg(cc, cts, dp, rcl)) {
/* Register overflow? Pass on stack. */
MSize nsp = cc->nsp, n = rcl[1] ? 2 : 1;
if (nsp + n > CCALL_MAXSTACK) return 1; /* Too many arguments. */
cc->nsp = nsp + n;
memcpy(&cc->stack[nsp], dp, n*CTSIZE_PTR);
MSize nsp = cc->nsp, sz = rcl[1] ? 2*CTSIZE_PTR : CTSIZE_PTR;
if (nsp + sz > CCALL_SIZE_STACK)
return 1; /* Too many arguments. */
cc->nsp = nsp + sz;
memcpy((uint8_t *)cc->stack + nsp, dp, sz);
}
return 0; /* Ok. */
}
@ -779,17 +781,24 @@ static unsigned int ccall_classify_struct(CTState *cts, CType *ct)
{
CTSize sz = ct->size;
unsigned int r = 0, n = 0, isu = (ct->info & CTF_UNION);
while (ct->sib) {
while (ct->sib && n <= 4) {
unsigned int m = 1;
CType *sct;
ct = ctype_get(cts, ct->sib);
if (ctype_isfield(ct->info)) {
sct = ctype_rawchild(cts, ct);
if (ctype_isarray(sct->info)) {
CType *cct = ctype_rawchild(cts, sct);
if (!cct->size) continue;
m = sct->size / cct->size;
sct = cct;
}
if (ctype_isfp(sct->info)) {
r |= sct->size;
if (!isu) n++; else if (n == 0) n = 1;
if (!isu) n += m; else if (n < m) n = m;
} else if (ctype_iscomplex(sct->info)) {
r |= (sct->size >> 1);
if (!isu) n += 2; else if (n < 2) n = 2;
if (!isu) n += 2*m; else if (n < 2*m) n = 2*m;
} else if (ctype_isstruct(sct->info)) {
goto substruct;
} else {
@ -801,10 +810,11 @@ static unsigned int ccall_classify_struct(CTState *cts, CType *ct)
sct = ctype_rawchild(cts, ct);
substruct:
if (sct->size > 0) {
unsigned int s = ccall_classify_struct(cts, sct);
unsigned int s = ccall_classify_struct(cts, sct), sn;
if (s <= 1) goto noth;
r |= (s & 255);
if (!isu) n += (s >> 8); else if (n < (s >>8)) n = (s >> 8);
sn = (s >> 8) * m;
if (!isu) n += sn; else if (n < sn) n = sn;
}
}
}
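The added array handling means FP fields reached through an array now contribute their element count (m = sct->size / cct->size) instead of counting as one field, so e.g. struct { float v[3]; } classifies as a homogeneous FP aggregate with three members. A toy field counter (hypothetical, standalone) mirroring that idea:

/* Toy counter: an array of FP elements contributes its element count. */
#include <stdio.h>

typedef struct { int is_fp; unsigned size, elemsize; } Field; /* elemsize==size for scalars. */

static unsigned count_fp_fields(const Field *f, unsigned nf)
{
  unsigned i, n = 0;
  for (i = 0; i < nf; i++) {
    if (!f[i].is_fp) return 0;            /* Not a homogeneous FP aggregate. */
    n += f[i].size / f[i].elemsize;       /* Arrays count per element. */
  }
  return n;
}

int main(void)
{
  /* struct { float v[3]; }  and  struct { float x, y; float z[2]; } */
  const Field vec3[] = { { 1, 12, 4 } };
  const Field vec4[] = { { 1, 4, 4 }, { 1, 4, 4 }, { 1, 8, 4 } };
  printf("%u %u\n", count_fp_fields(vec3, 1), count_fp_fields(vec4, 3)); /* 3 4 */
  return 0;
}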
@ -983,6 +993,14 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
fid = ctf->sib;
}
#if LJ_TARGET_ARM64 && LJ_ABI_WIN
if ((ct->info & CTF_VARARG)) {
nsp -= maxgpr * CTSIZE_PTR; /* May end up with negative nsp. */
ngpr = maxgpr;
nfpr = CCALL_NARG_FPR;
}
#endif
/* Walk through all passed arguments. */
for (o = L->base+1, narg = 1; o < top; o++, narg++) {
CTypeID did;
@ -1019,25 +1037,31 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
CCALL_HANDLE_STRUCTARG
} else if (ctype_iscomplex(d->info)) {
CCALL_HANDLE_COMPLEXARG
} else {
} else if (!(CCALL_PACK_STACKARG && ctype_isenum(d->info))) {
sz = CTSIZE_PTR;
}
sz = (sz + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1);
n = sz / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */
n = (sz + CTSIZE_PTR-1) / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */
CCALL_HANDLE_REGARG /* Handle register arguments. */
/* Otherwise pass argument on stack. */
if (CCALL_ALIGN_STACKARG && !rp && (d->info & CTF_ALIGN) > CTALIGN_PTR) {
MSize align = (1u << ctype_align(d->info-CTALIGN_PTR)) -1;
nsp = (nsp + align) & ~align; /* Align argument on stack. */
if (CCALL_ALIGN_STACKARG) { /* Align argument on stack. */
MSize align = (1u << ctype_align(d->info)) - 1;
if (rp || (CCALL_PACK_STACKARG && isva && align < CTSIZE_PTR-1))
align = CTSIZE_PTR-1;
nsp = (nsp + align) & ~align;
}
if (nsp + n > CCALL_MAXSTACK) { /* Too many arguments. */
#if LJ_TARGET_ARM64 && LJ_ABI_WIN
/* A negative nsp points into cc->gpr. Blame MS for their messy ABI. */
dp = ((uint8_t *)cc->stack) + (int32_t)nsp;
#else
dp = ((uint8_t *)cc->stack) + nsp;
#endif
nsp += CCALL_PACK_STACKARG ? sz : n * CTSIZE_PTR;
if ((int32_t)nsp > CCALL_SIZE_STACK) { /* Too many arguments. */
err_nyi:
lj_err_caller(L, LJ_ERR_FFI_NYICALL);
}
dp = &cc->stack[nsp];
nsp += n;
isva = 0;
done:
@ -1048,7 +1072,8 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
}
lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg));
/* Extend passed integers to 32 bits at least. */
if (ctype_isinteger_or_bool(d->info) && d->size < 4) {
if (ctype_isinteger_or_bool(d->info) && d->size < 4 &&
(!CCALL_PACK_STACKARG || !((uintptr_t)dp & 3))) { /* Assumes LJ_LE. */
if (d->info & CTF_UNSIGNED)
*(uint32_t *)dp = d->size == 1 ? (uint32_t)*(uint8_t *)dp :
(uint32_t)*(uint16_t *)dp;
@ -1095,14 +1120,17 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
#endif
}
if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */
#if LJ_TARGET_ARM64 && LJ_ABI_WIN
if ((int32_t)nsp < 0) nsp = 0;
#endif
#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP)
cc->nfpr = nfpr; /* Required for vararg functions. */
#endif
cc->nsp = nsp;
cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA)*CTSIZE_PTR;
if (nsp > CCALL_SPS_FREE)
cc->spadj += (((nsp-CCALL_SPS_FREE)*CTSIZE_PTR + 15u) & ~15u);
cc->nsp = (nsp + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1);
cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA) * CTSIZE_PTR;
if (cc->nsp > CCALL_SPS_FREE * CTSIZE_PTR)
cc->spadj += (((cc->nsp - CCALL_SPS_FREE * CTSIZE_PTR) + 15u) & ~15u);
return gcsteps;
}
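Since nsp is now a byte offset rather than a slot count, stack placement reduces to the usual round-up-to-alignment idiom, and CCALL_PACK_STACKARG (Apple arm64) lets small arguments share a slot instead of each taking CTSIZE_PTR bytes. A small standalone sketch with demo values (not LuaJIT code):

/* Standalone sketch: byte-based stack layout with packed small arguments. */
#include <stdint.h>
#include <stdio.h>

/* Round 'off' up to a power-of-two alignment 'al'. */
static uint32_t align_up(uint32_t off, uint32_t al)
{
  return (off + (al - 1)) & ~(al - 1);
}

int main(void)
{
  uint32_t nsp = 0;
  /* Packed layout: a char and a short share the first 8-byte slot. */
  uint32_t a = align_up(nsp, 1); nsp = a + 1;   /* char   at offset 0 */
  uint32_t b = align_up(nsp, 2); nsp = b + 2;   /* short  at offset 2 */
  uint32_t c = align_up(nsp, 8); nsp = c + 8;   /* double at offset 8 */
  printf("offsets: %u %u %u, total %u bytes\n", a, b, c, align_up(nsp, 8));
  return 0;                                     /* offsets: 0 2 8, total 16 bytes */
}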

src/lj_ccall.h

@ -1,6 +1,6 @@
/*
** FFI C call handling.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
*/
#ifndef _LJ_CCALL_H
@ -75,6 +75,9 @@ typedef union FPRArg {
#define CCALL_NARG_FPR 8
#define CCALL_NRET_FPR 4
#define CCALL_SPS_FREE 0
#if LJ_TARGET_OSX
#define CCALL_PACK_STACKARG 1
#endif
typedef intptr_t GPRArg;
typedef union FPRArg {
@ -139,6 +142,9 @@ typedef union FPRArg {
#ifndef CCALL_ALIGN_STACKARG
#define CCALL_ALIGN_STACKARG 1
#endif
#ifndef CCALL_PACK_STACKARG
#define CCALL_PACK_STACKARG 0
#endif
#ifndef CCALL_ALIGN_CALLSTATE
#define CCALL_ALIGN_CALLSTATE 8
#endif
@ -152,14 +158,15 @@ typedef union FPRArg {
LJ_STATIC_ASSERT(CCALL_NUM_GPR <= CCALL_MAX_GPR);
LJ_STATIC_ASSERT(CCALL_NUM_FPR <= CCALL_MAX_FPR);
#define CCALL_MAXSTACK 32
#define CCALL_NUM_STACK 31
#define CCALL_SIZE_STACK (CCALL_NUM_STACK * CTSIZE_PTR)
/* -- C call state -------------------------------------------------------- */
typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState {
void (*func)(void); /* Pointer to called function. */
uint32_t spadj; /* Stack pointer adjustment. */
uint8_t nsp; /* Number of stack slots. */
uint8_t nsp; /* Number of bytes on stack. */
uint8_t retref; /* Return value by reference. */
#if LJ_TARGET_X64
uint8_t ngpr; /* Number of arguments in GPRs. */
@ -178,7 +185,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState {
FPRArg fpr[CCALL_NUM_FPR]; /* Arguments/results in FPRs. */
#endif
GPRArg gpr[CCALL_NUM_GPR]; /* Arguments/results in GPRs. */
GPRArg stack[CCALL_MAXSTACK]; /* Stack slots. */
GPRArg stack[CCALL_NUM_STACK]; /* Stack slots. */
} CCallState;
/* -- C call handling ----------------------------------------------------- */

Some files were not shown because too many files have changed in this diff.